summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/bpf/bpf_design_QA.rst15
-rw-r--r--arch/x86/net/bpf_jit_comp.c5
-rw-r--r--arch/x86/net/bpf_jit_comp32.c198
-rw-r--r--drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c66
-rw-r--r--drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h10
-rw-r--r--drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c40
-rw-r--r--drivers/net/ethernet/pensando/ionic/Makefile1
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic.h6
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_dev.c2
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_dev.h3
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_ethtool.c93
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_if.h214
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_lif.c439
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_lif.h75
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_main.c17
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_phc.c589
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c21
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h1
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_stats.c38
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_txrx.c138
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_txrx.h3
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/Makefile1
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac.h35
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c539
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c40
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.h12
-rw-r--r--drivers/net/veth.c12
-rw-r--r--drivers/nfc/pn533/pn533.c3
-rw-r--r--include/linux/bpf-cgroup.h57
-rw-r--r--include/linux/bpf.h58
-rw-r--r--include/linux/btf.h6
-rw-r--r--include/linux/filter.h13
-rw-r--r--include/linux/skbuff.h1
-rw-r--r--include/linux/skmsg.h77
-rw-r--r--include/net/bpf_sk_storage.h1
-rw-r--r--include/net/mptcp.h18
-rw-r--r--include/net/netns/mib.h30
-rw-r--r--include/net/sock.h3
-rw-r--r--include/net/tcp.h45
-rw-r--r--include/net/udp.h3
-rw-r--r--include/uapi/linux/bpf.h5
-rw-r--r--include/uapi/linux/mptcp.h11
-rw-r--r--kernel/bpf/btf.c219
-rw-r--r--kernel/bpf/core.c47
-rw-r--r--kernel/bpf/disasm.c13
-rw-r--r--kernel/bpf/helpers.c15
-rw-r--r--kernel/bpf/local_storage.c5
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/syscall.c5
-rw-r--r--kernel/bpf/verifier.c390
-rw-r--r--net/bpf/test_run.c34
-rw-r--r--net/core/filter.c1
-rw-r--r--net/core/skbuff.c55
-rw-r--r--net/core/skmsg.c177
-rw-r--r--net/core/sock_map.c118
-rw-r--r--net/ipv4/af_inet.c1
-rw-r--r--net/ipv4/bpf_tcp_ca.c43
-rw-r--r--net/ipv4/tcp_bpf.c130
-rw-r--r--net/ipv4/tcp_cubic.c24
-rw-r--r--net/ipv4/tcp_ipv4.c24
-rw-r--r--net/ipv4/udp.c32
-rw-r--r--net/ipv4/udp_bpf.c79
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/tcp_ipv6.c17
-rw-r--r--net/ipv6/udp.c3
-rw-r--r--net/mptcp/mib.c3
-rw-r--r--net/mptcp/mib.h3
-rw-r--r--net/mptcp/options.c69
-rw-r--r--net/mptcp/pm_netlink.c12
-rw-r--r--net/mptcp/protocol.c20
-rw-r--r--net/mptcp/protocol.h14
-rw-r--r--net/mptcp/subflow.c34
-rw-r--r--net/tls/tls_sw.c4
-rw-r--r--samples/bpf/sampleip_kern.c1
-rw-r--r--samples/bpf/trace_event_kern.c1
-rw-r--r--samples/bpf/xdpsock_user.c55
-rw-r--r--tools/bpf/bpftool/common.c1
-rw-r--r--tools/bpf/bpftool/prog.c1
-rw-r--r--tools/bpf/resolve_btfids/main.c11
-rw-r--r--tools/include/uapi/linux/bpf.h5
-rw-r--r--tools/lib/bpf/libbpf.c403
-rw-r--r--tools/lib/bpf/libbpf.h5
-rw-r--r--tools/lib/bpf/libbpf.map1
-rw-r--r--tools/lib/bpf/linker.c37
-rw-r--r--tools/lib/bpf/xsk.c258
-rw-r--r--tools/testing/selftests/bpf/README.rst14
-rw-r--r--tools/testing/selftests/bpf/bpf_tcp_helpers.h29
-rw-r--r--tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c158
-rw-r--r--tools/testing/selftests/bpf/prog_tests/kfunc_call.c59
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockmap_basic.c40
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockmap_listen.c136
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_ima.c6
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_cubic.c36
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_dctcp.c22
-rw-r--r--tools/testing/selftests/bpf/progs/kfunc_call_test.c47
-rw-r--r--tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c42
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_listen.c22
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c18
-rwxr-xr-xtools/testing/selftests/bpf/test_xsk.sh3
-rw-r--r--tools/testing/selftests/bpf/verifier/calls.c12
-rw-r--r--tools/testing/selftests/bpf/verifier/dead_code.c10
-rwxr-xr-xtools/testing/selftests/bpf/vmtest.sh39
-rw-r--r--tools/testing/selftests/bpf/xdpxceiver.c700
-rw-r--r--tools/testing/selftests/bpf/xdpxceiver.h49
-rwxr-xr-xtools/testing/selftests/net/mptcp/diag.sh55
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh22
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh39
-rwxr-xr-xtools/testing/selftests/net/mptcp/simult_flows.sh13
108 files changed, 5495 insertions, 1394 deletions
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index 0e15f9b05c9d..437de2a7a5de 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -258,3 +258,18 @@ Q: Can BPF functionality such as new program or map types, new
helpers, etc be added out of kernel module code?
A: NO.
+
+Q: Directly calling kernel function is an ABI?
+----------------------------------------------
+Q: Some kernel functions (e.g. tcp_slow_start) can be called
+by BPF programs. Do these kernel functions become an ABI?
+
+A: NO.
+
+The kernel function protos will change and the bpf programs will be
+rejected by the verifier. Also, for example, some of the bpf-callable
+kernel functions have already been used by other kernel tcp
+cc (congestion-control) implementations. If any of these kernel
+functions has changed, both the in-tree and out-of-tree kernel tcp cc
+implementations have to be changed. The same goes for the bpf
+programs and they have to be adjusted accordingly.
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b35fc8023884..9eead60f0301 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2346,3 +2346,8 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index d17b67c69f89..0a7a2870f111 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1390,6 +1390,19 @@ static inline void emit_push_r64(const u8 src[], u8 **pprog)
*pprog = prog;
}
+static void emit_push_r32(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
{
u8 jmp_cond;
@@ -1459,6 +1472,174 @@ static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
return jmp_cond;
}
+/* i386 kernel compiles with "-mregparm=3". From gcc document:
+ *
+ * ==== snippet ====
+ * regparm (number)
+ * On x86-32 targets, the regparm attribute causes the compiler
+ * to pass arguments number one to (number) if they are of integral
+ * type in registers EAX, EDX, and ECX instead of on the stack.
+ * Functions that take a variable number of arguments continue
+ * to be passed all of their arguments on the stack.
+ * ==== snippet ====
+ *
+ * The first three args of a function will be considered for
+ * putting into the 32bit register EAX, EDX, and ECX.
+ *
+ * Two 32bit registers are used to pass a 64bit arg.
+ *
+ * For example,
+ * void foo(u32 a, u32 b, u32 c, u32 d):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u32 c: ECX
+ * u32 d: stack
+ *
+ * void foo(u64 a, u32 b, u32 c):
+ * u64 a: EAX (lo32) EDX (hi32)
+ * u32 b: ECX
+ * u32 c: stack
+ *
+ * void foo(u32 a, u64 b, u32 c):
+ * u32 a: EAX
+ * u64 b: EDX (lo32) ECX (hi32)
+ * u32 c: stack
+ *
+ * void foo(u32 a, u32 b, u64 c):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u64 c: stack
+ *
+ * The return value will be stored in the EAX (and EDX for 64bit value).
+ *
+ * For example,
+ * u32 foo(u32 a, u32 b, u32 c):
+ * return value: EAX
+ *
+ * u64 foo(u32 a, u32 b, u32 c):
+ * return value: EAX (lo32) EDX (hi32)
+ *
+ * Notes:
+ * The verifier only accepts function having integer and pointers
+ * as its args and return value, so it does not have
+ * struct-by-value.
+ *
+ * emit_kfunc_call() finds out the btf_func_model by calling
+ * bpf_jit_find_kfunc_model(). A btf_func_model
+ * has the details about the number of args, size of each arg,
+ * and the size of the return value.
+ *
+ * It first decides how many args can be passed by EAX, EDX, and ECX.
+ * That will decide what args should be pushed to the stack:
+ * [first_stack_regno, last_stack_regno] are the bpf regnos
+ * that should be pushed to the stack.
+ *
+ * It will first push all args to the stack because the push
+ * will need to use ECX. Then, it moves
+ * [BPF_REG_1, first_stack_regno) to EAX, EDX, and ECX.
+ *
+ * When emitting a call (0xE8), it needs to figure out
+ * the jmp_offset relative to the jit-insn address immediately
+ * following the call (0xE8) instruction. At this point, it knows
+ * the end of the jit-insn address after completely translated the
+ * current (BPF_JMP | BPF_CALL) bpf-insn. It is passed as "end_addr"
+ * to the emit_kfunc_call(). Thus, it can learn the "immediate-follow-call"
+ * address by figuring out how many jit-insn is generated between
+ * the call (0xE8) and the end_addr:
+ * - 0-1 jit-insn (3 bytes each) to restore the esp pointer if there
+ * is arg pushed to the stack.
+ * - 0-2 jit-insns (3 bytes each) to handle the return value.
+ */
+static int emit_kfunc_call(const struct bpf_prog *bpf_prog, u8 *end_addr,
+ const struct bpf_insn *insn, u8 **pprog)
+{
+ const u8 arg_regs[] = { IA32_EAX, IA32_EDX, IA32_ECX };
+ int i, cnt = 0, first_stack_regno, last_stack_regno;
+ int free_arg_regs = ARRAY_SIZE(arg_regs);
+ const struct btf_func_model *fm;
+ int bytes_in_stack = 0;
+ const u8 *cur_arg_reg;
+ u8 *prog = *pprog;
+ s64 jmp_offset;
+
+ fm = bpf_jit_find_kfunc_model(bpf_prog, insn);
+ if (!fm)
+ return -EINVAL;
+
+ first_stack_regno = BPF_REG_1;
+ for (i = 0; i < fm->nr_args; i++) {
+ int regs_needed = fm->arg_size[i] > sizeof(u32) ? 2 : 1;
+
+ if (regs_needed > free_arg_regs)
+ break;
+
+ free_arg_regs -= regs_needed;
+ first_stack_regno++;
+ }
+
+ /* Push the args to the stack */
+ last_stack_regno = BPF_REG_0 + fm->nr_args;
+ for (i = last_stack_regno; i >= first_stack_regno; i--) {
+ if (fm->arg_size[i - 1] > sizeof(u32)) {
+ emit_push_r64(bpf2ia32[i], &prog);
+ bytes_in_stack += 8;
+ } else {
+ emit_push_r32(bpf2ia32[i], &prog);
+ bytes_in_stack += 4;
+ }
+ }
+
+ cur_arg_reg = &arg_regs[0];
+ for (i = BPF_REG_1; i < first_stack_regno; i++) {
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][0]));
+ if (fm->arg_size[i - 1] > sizeof(u32))
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][1]));
+ }
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],edx */
+ if (fm->ret_size > sizeof(u32))
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],eax */
+ if (fm->ret_size)
+ end_addr -= 3;
+
+ jmp_offset = (u8 *)__bpf_call_base + insn->imm - end_addr;
+ if (!is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF kernel function jmp_offset:%lld\n",
+ jmp_offset);
+ return -EINVAL;
+ }
+
+ EMIT1_off32(0xE8, jmp_offset);
+
+ if (fm->ret_size)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][0]));
+
+ if (fm->ret_size > sizeof(u32))
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][1]));
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), bytes_in_stack);
+
+ *pprog = prog;
+
+ return 0;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -1888,6 +2069,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (insn->src_reg == BPF_PSEUDO_CALL)
goto notyet;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ int err;
+
+ err = emit_kfunc_call(bpf_prog,
+ image + addrs[i],
+ insn, &prog);
+
+ if (err)
+ return err;
+ break;
+ }
+
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
@@ -2393,3 +2586,8 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index fc0eb82cdd6a..e0c3c58e2ac7 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -223,31 +223,31 @@ static void dpaa2_eth_free_bufs(struct dpaa2_eth_priv *priv, u64 *buf_array,
}
}
-static void dpaa2_eth_xdp_release_buf(struct dpaa2_eth_priv *priv,
- struct dpaa2_eth_channel *ch,
- dma_addr_t addr)
+static void dpaa2_eth_recycle_buf(struct dpaa2_eth_priv *priv,
+ struct dpaa2_eth_channel *ch,
+ dma_addr_t addr)
{
int retries = 0;
int err;
- ch->xdp.drop_bufs[ch->xdp.drop_cnt++] = addr;
- if (ch->xdp.drop_cnt < DPAA2_ETH_BUFS_PER_CMD)
+ ch->recycled_bufs[ch->recycled_bufs_cnt++] = addr;
+ if (ch->recycled_bufs_cnt < DPAA2_ETH_BUFS_PER_CMD)
return;
while ((err = dpaa2_io_service_release(ch->dpio, priv->bpid,
- ch->xdp.drop_bufs,
- ch->xdp.drop_cnt)) == -EBUSY) {
+ ch->recycled_bufs,
+ ch->recycled_bufs_cnt)) == -EBUSY) {
if (retries++ >= DPAA2_ETH_SWP_BUSY_RETRIES)
break;
cpu_relax();
}
if (err) {
- dpaa2_eth_free_bufs(priv, ch->xdp.drop_bufs, ch->xdp.drop_cnt);
- ch->buf_count -= ch->xdp.drop_cnt;
+ dpaa2_eth_free_bufs(priv, ch->recycled_bufs, ch->recycled_bufs_cnt);
+ ch->buf_count -= ch->recycled_bufs_cnt;
}
- ch->xdp.drop_cnt = 0;
+ ch->recycled_bufs_cnt = 0;
}
static int dpaa2_eth_xdp_flush(struct dpaa2_eth_priv *priv,
@@ -300,7 +300,7 @@ static void dpaa2_eth_xdp_tx_flush(struct dpaa2_eth_priv *priv,
ch->stats.xdp_tx++;
}
for (i = enqueued; i < fq->xdp_tx_fds.num; i++) {
- dpaa2_eth_xdp_release_buf(priv, ch, dpaa2_fd_get_addr(&fds[i]));
+ dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(&fds[i]));
percpu_stats->tx_errors++;
ch->stats.xdp_tx_err++;
}
@@ -382,7 +382,7 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv,
trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act);
fallthrough;
case XDP_DROP:
- dpaa2_eth_xdp_release_buf(priv, ch, addr);
+ dpaa2_eth_recycle_buf(priv, ch, addr);
ch->stats.xdp_drop++;
break;
case XDP_REDIRECT:
@@ -403,7 +403,7 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv,
free_pages((unsigned long)vaddr, 0);
} else {
ch->buf_count++;
- dpaa2_eth_xdp_release_buf(priv, ch, addr);
+ dpaa2_eth_recycle_buf(priv, ch, addr);
}
ch->stats.xdp_drop++;
} else {
@@ -418,6 +418,35 @@ out:
return xdp_act;
}
+static struct sk_buff *dpaa2_eth_copybreak(struct dpaa2_eth_channel *ch,
+ const struct dpaa2_fd *fd,
+ void *fd_vaddr)
+{
+ u16 fd_offset = dpaa2_fd_get_offset(fd);
+ struct dpaa2_eth_priv *priv = ch->priv;
+ u32 fd_length = dpaa2_fd_get_len(fd);
+ struct sk_buff *skb = NULL;
+ unsigned int skb_len;
+
+ if (fd_length > priv->rx_copybreak)
+ return NULL;
+
+ skb_len = fd_length + dpaa2_eth_needed_headroom(NULL);
+
+ skb = napi_alloc_skb(&ch->napi, skb_len);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, dpaa2_eth_needed_headroom(NULL));
+ skb_put(skb, fd_length);
+
+ memcpy(skb->data, fd_vaddr + fd_offset, fd_length);
+
+ dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd));
+
+ return skb;
+}
+
/* Main Rx frame processing routine */
static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
struct dpaa2_eth_channel *ch,
@@ -459,9 +488,12 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
return;
}
- dma_unmap_page(dev, addr, priv->rx_buf_size,
- DMA_BIDIRECTIONAL);
- skb = dpaa2_eth_build_linear_skb(ch, fd, vaddr);
+ skb = dpaa2_eth_copybreak(ch, fd, vaddr);
+ if (!skb) {
+ dma_unmap_page(dev, addr, priv->rx_buf_size,
+ DMA_BIDIRECTIONAL);
+ skb = dpaa2_eth_build_linear_skb(ch, fd, vaddr);
+ }
} else if (fd_format == dpaa2_fd_sg) {
WARN_ON(priv->xdp_prog);
@@ -4302,6 +4334,8 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev)
skb_queue_head_init(&priv->tx_skbs);
+ priv->rx_copybreak = DPAA2_ETH_DEFAULT_COPYBREAK;
+
/* Obtain a MC portal */
err = fsl_mc_portal_allocate(dpni_dev, FSL_MC_IO_ATOMIC_CONTEXT_PORTAL,
&priv->mc_io);
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 9b6a89709ce1..cdb623d5f2c1 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -438,8 +438,6 @@ struct dpaa2_eth_fq {
struct dpaa2_eth_ch_xdp {
struct bpf_prog *prog;
- u64 drop_bufs[DPAA2_ETH_BUFS_PER_CMD];
- int drop_cnt;
unsigned int res;
};
@@ -457,6 +455,10 @@ struct dpaa2_eth_channel {
struct dpaa2_eth_ch_xdp xdp;
struct xdp_rxq_info xdp_rxq;
struct list_head *rx_list;
+
+ /* Buffers to be recycled back in the buffer pool */
+ u64 recycled_bufs[DPAA2_ETH_BUFS_PER_CMD];
+ int recycled_bufs_cnt;
};
struct dpaa2_eth_dist_fields {
@@ -487,6 +489,8 @@ struct dpaa2_eth_trap_data {
struct dpaa2_eth_priv *priv;
};
+#define DPAA2_ETH_DEFAULT_COPYBREAK 512
+
/* Driver private data */
struct dpaa2_eth_priv {
struct net_device *net_dev;
@@ -567,6 +571,8 @@ struct dpaa2_eth_priv {
struct devlink *devlink;
struct dpaa2_eth_trap_data *trap_data;
struct devlink_port devlink_port;
+
+ u32 rx_copybreak;
};
struct dpaa2_eth_devlink_priv {
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
index bf59708b869e..ad5e374eeccf 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
@@ -782,6 +782,44 @@ static int dpaa2_eth_get_ts_info(struct net_device *dev,
return 0;
}
+static int dpaa2_eth_get_tunable(struct net_device *net_dev,
+ const struct ethtool_tunable *tuna,
+ void *data)
+{
+ struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
+ int err = 0;
+
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ *(u32 *)data = priv->rx_copybreak;
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static int dpaa2_eth_set_tunable(struct net_device *net_dev,
+ const struct ethtool_tunable *tuna,
+ const void *data)
+{
+ struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
+ int err = 0;
+
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ priv->rx_copybreak = *(u32 *)data;
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
const struct ethtool_ops dpaa2_ethtool_ops = {
.get_drvinfo = dpaa2_eth_get_drvinfo,
.nway_reset = dpaa2_eth_nway_reset,
@@ -796,4 +834,6 @@ const struct ethtool_ops dpaa2_ethtool_ops = {
.get_rxnfc = dpaa2_eth_get_rxnfc,
.set_rxnfc = dpaa2_eth_set_rxnfc,
.get_ts_info = dpaa2_eth_get_ts_info,
+ .get_tunable = dpaa2_eth_get_tunable,
+ .set_tunable = dpaa2_eth_set_tunable,
};
diff --git a/drivers/net/ethernet/pensando/ionic/Makefile b/drivers/net/ethernet/pensando/ionic/Makefile
index 8d3c2d3cb10d..4e7642a2d25f 100644
--- a/drivers/net/ethernet/pensando/ionic/Makefile
+++ b/drivers/net/ethernet/pensando/ionic/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IONIC) := ionic.o
ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o ionic_dev.o \
ionic_debugfs.o ionic_lif.o ionic_rx_filter.o ionic_ethtool.o \
ionic_txrx.o ionic_stats.o ionic_fw.o
+ionic-$(CONFIG_PTP_1588_CLOCK) += ionic_phc.o
diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h
index 084a924431d5..66204106f83e 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic.h
@@ -20,6 +20,10 @@ struct ionic_lif;
#define DEVCMD_TIMEOUT 10
+#define IONIC_PHC_UPDATE_NS 10000000000 /* 10s in nanoseconds */
+#define NORMAL_PPB 1000000000 /* one billion parts per billion */
+#define SCALED_PPM (1000000ull << 16) /* 2^16 million parts per 2^16 million */
+
struct ionic_vf {
u16 index;
u8 macaddr[6];
@@ -64,6 +68,8 @@ struct ionic_admin_ctx {
union ionic_adminq_comp comp;
};
+int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx);
+int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx, int err);
int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx);
int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_wait);
int ionic_set_dma_mask(struct ionic *ionic);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c
index 0e8e88c69e1c..1dfe962e22e0 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c
@@ -79,6 +79,8 @@ int ionic_dev_setup(struct ionic *ionic)
idev->intr_status = bar->vaddr + IONIC_BAR0_INTR_STATUS_OFFSET;
idev->intr_ctrl = bar->vaddr + IONIC_BAR0_INTR_CTRL_OFFSET;
+ idev->hwstamp_regs = &idev->dev_info_regs->hwstamp;
+
sig = ioread32(&idev->dev_info_regs->signature);
if (sig != IONIC_DEV_INFO_SIGNATURE) {
dev_err(dev, "Incompatible firmware signature %x", sig);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
index 0c0533737b2b..c25cf9b744c5 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
@@ -59,6 +59,7 @@ static_assert(sizeof(struct ionic_dev_getattr_cmd) == 64);
static_assert(sizeof(struct ionic_dev_getattr_comp) == 16);
static_assert(sizeof(struct ionic_dev_setattr_cmd) == 64);
static_assert(sizeof(struct ionic_dev_setattr_comp) == 16);
+static_assert(sizeof(struct ionic_lif_setphc_cmd) == 64);
/* Port commands */
static_assert(sizeof(struct ionic_port_identify_cmd) == 64);
@@ -135,6 +136,7 @@ struct ionic_devinfo {
struct ionic_dev {
union ionic_dev_info_regs __iomem *dev_info_regs;
union ionic_dev_cmd_regs __iomem *dev_cmd_regs;
+ struct ionic_hwstamp_regs __iomem *hwstamp_regs;
atomic_long_t last_check_time;
unsigned long last_hb_time;
@@ -218,6 +220,7 @@ struct ionic_queue {
unsigned int index;
unsigned int num_descs;
unsigned int max_sg_elems;
+ u64 features;
u64 dbell_count;
u64 stop;
u64 wake;
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index b1e78b452fad..71db1e2c7d8a 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -849,6 +849,98 @@ static int ionic_get_module_eeprom(struct net_device *netdev,
return 0;
}
+static int ionic_get_ts_info(struct net_device *netdev,
+ struct ethtool_ts_info *info)
+{
+ struct ionic_lif *lif = netdev_priv(netdev);
+ struct ionic *ionic = lif->ionic;
+ __le64 mask;
+
+ if (!lif->phc || !lif->phc->ptp)
+ return ethtool_op_get_ts_info(netdev, info);
+
+ info->phc_index = ptp_clock_index(lif->phc->ptp);
+
+ info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE |
+ SOF_TIMESTAMPING_TX_HARDWARE |
+ SOF_TIMESTAMPING_RX_HARDWARE |
+ SOF_TIMESTAMPING_RAW_HARDWARE;
+
+ /* tx modes */
+
+ info->tx_types = BIT(HWTSTAMP_TX_OFF) |
+ BIT(HWTSTAMP_TX_ON);
+
+ mask = cpu_to_le64(BIT_ULL(IONIC_TXSTAMP_ONESTEP_SYNC));
+ if (ionic->ident.lif.eth.hwstamp_tx_modes & mask)
+ info->tx_types |= BIT(HWTSTAMP_TX_ONESTEP_SYNC);
+
+ mask = cpu_to_le64(BIT_ULL(IONIC_TXSTAMP_ONESTEP_P2P));
+ if (ionic->ident.lif.eth.hwstamp_tx_modes & mask)
+ info->tx_types |= BIT(HWTSTAMP_TX_ONESTEP_P2P);
+
+ /* rx filters */
+
+ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
+ BIT(HWTSTAMP_FILTER_ALL);
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_NTP_ALL);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_NTP_ALL;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP1_SYNC);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V1_L4_SYNC;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP1_DREQ);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP1_ALL);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L4_SYNC);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L4_SYNC;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L4_DREQ);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L4_ALL);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L2_SYNC);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L2_SYNC;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L2_DREQ);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_L2_ALL);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_SYNC);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_SYNC;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_DREQ);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_DELAY_REQ;
+
+ mask = cpu_to_le64(IONIC_PKT_CLS_PTP2_ALL);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) == mask)
+ info->rx_filters |= HWTSTAMP_FILTER_PTP_V2_EVENT;
+
+ return 0;
+}
+
static int ionic_nway_reset(struct net_device *netdev)
{
struct ionic_lif *lif = netdev_priv(netdev);
@@ -906,6 +998,7 @@ static const struct ethtool_ops ionic_ethtool_ops = {
.set_pauseparam = ionic_set_pauseparam,
.get_fecparam = ionic_get_fecparam,
.set_fecparam = ionic_set_fecparam,
+ .get_ts_info = ionic_get_ts_info,
.nway_reset = ionic_nway_reset,
};
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h
index 88210142395d..0478b48d9895 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_if.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h
@@ -34,6 +34,7 @@ enum ionic_cmd_opcode {
IONIC_CMD_LIF_RESET = 22,
IONIC_CMD_LIF_GETATTR = 23,
IONIC_CMD_LIF_SETATTR = 24,
+ IONIC_CMD_LIF_SETPHC = 25,
IONIC_CMD_RX_MODE_SET = 30,
IONIC_CMD_RX_FILTER_ADD = 31,
@@ -269,6 +270,9 @@ union ionic_drv_identity {
* value in usecs to device units using:
* device units = usecs * mult / div
* @eq_count: Number of shared event queues
+ * @hwstamp_mask: Bitmask for subtraction of hardware tick values.
+ * @hwstamp_mult: Hardware tick to nanosecond multiplier.
+ * @hwstamp_shift: Hardware tick to nanosecond divisor (power of two).
*/
union ionic_dev_identity {
struct {
@@ -283,6 +287,9 @@ union ionic_dev_identity {
__le32 intr_coal_mult;
__le32 intr_coal_div;
__le32 eq_count;
+ __le64 hwstamp_mask;
+ __le32 hwstamp_mult;
+ __le32 hwstamp_shift;
};
__le32 words[478];
};
@@ -346,6 +353,68 @@ enum ionic_logical_qtype {
};
/**
+ * enum ionic_q_feature - Common Features for most queue types
+ *
+ * Common features use bits 0-15. Per-queue-type features use higher bits.
+ *
+ * @IONIC_QIDENT_F_CQ: Queue has completion ring
+ * @IONIC_QIDENT_F_SG: Queue has scatter/gather ring
+ * @IONIC_QIDENT_F_EQ: Queue can use event queue
+ * @IONIC_QIDENT_F_CMB: Queue is in cmb bar
+ * @IONIC_Q_F_2X_DESC: Double main descriptor size
+ * @IONIC_Q_F_2X_CQ_DESC: Double cq descriptor size
+ * @IONIC_Q_F_2X_SG_DESC: Double sg descriptor size
+ * @IONIC_Q_F_4X_DESC: Quadruple main descriptor size
+ * @IONIC_Q_F_4X_CQ_DESC: Quadruple cq descriptor size
+ * @IONIC_Q_F_4X_SG_DESC: Quadruple sg descriptor size
+ */
+enum ionic_q_feature {
+ IONIC_QIDENT_F_CQ = BIT_ULL(0),
+ IONIC_QIDENT_F_SG = BIT_ULL(1),
+ IONIC_QIDENT_F_EQ = BIT_ULL(2),
+ IONIC_QIDENT_F_CMB = BIT_ULL(3),
+ IONIC_Q_F_2X_DESC = BIT_ULL(4),
+ IONIC_Q_F_2X_CQ_DESC = BIT_ULL(5),
+ IONIC_Q_F_2X_SG_DESC = BIT_ULL(6),
+ IONIC_Q_F_4X_DESC = BIT_ULL(7),
+ IONIC_Q_F_4X_CQ_DESC = BIT_ULL(8),
+ IONIC_Q_F_4X_SG_DESC = BIT_ULL(9),
+};
+
+/**
+ * enum ionic_rxq_feature - RXQ-specific Features
+ *
+ * Per-queue-type features use bits 16 and higher.
+ *
+ * @IONIC_RXQ_F_HWSTAMP: Queue supports Hardware Timestamping
+ */
+enum ionic_rxq_feature {
+ IONIC_RXQ_F_HWSTAMP = BIT_ULL(16),
+};
+
+/**
+ * enum ionic_txq_feature - TXQ-specific Features
+ *
+ * Per-queue-type features use bits 16 and higher.
+ *
+ * @IONIC_TXQ_F_HWSTAMP: Queue supports Hardware Timestamping
+ */
+enum ionic_txq_feature {
+ IONIC_TXQ_F_HWSTAMP = BIT(16),
+};
+
+/**
+ * struct ionic_hwstamp_bits - Hardware timestamp decoding bits
+ * @IONIC_HWSTAMP_INVALID: Invalid hardware timestamp value
+ * @IONIC_HWSTAMP_CQ_NEGOFFSET: Timestamp field negative offset
+ * from the base cq descriptor.
+ */
+enum ionic_hwstamp_bits {
+ IONIC_HWSTAMP_INVALID = ~0ull,
+ IONIC_HWSTAMP_CQ_NEGOFFSET = 8,
+};
+
+/**
* struct ionic_lif_logical_qtype - Descriptor of logical to HW queue type
* @qtype: Hardware Queue Type
* @qid_count: Number of Queue IDs of the logical type
@@ -405,6 +474,8 @@ union ionic_lif_config {
* @max_mcast_filters: Number of perfect multicast addresses supported
* @min_frame_size: Minimum size of frames to be sent
* @max_frame_size: Maximum size of frames to be sent
+ * @hwstamp_tx_modes: Bitmask of BIT_ULL(enum ionic_txstamp_mode)
+ * @hwstamp_rx_filters: Bitmask of enum ionic_pkt_class
* @config: LIF config struct with features, mtu, mac, q counts
*
* @rdma: RDMA identify structure
@@ -438,7 +509,10 @@ union ionic_lif_identity {
__le16 rss_ind_tbl_sz;
__le32 min_frame_size;
__le32 max_frame_size;
- u8 rsvd2[106];
+ u8 rsvd2[2];
+ __le64 hwstamp_tx_modes;
+ __le64 hwstamp_rx_filters;
+ u8 rsvd3[88];
union ionic_lif_config config;
} __packed eth;
@@ -529,7 +603,7 @@ struct ionic_q_identify_comp {
* union ionic_q_identity - queue identity information
* @version: Queue type version that can be used with FW
* @supported: Bitfield of queue versions, first bit = ver 0
- * @features: Queue features
+ * @features: Queue features (enum ionic_q_feature, etc)
* @desc_sz: Descriptor size
* @comp_sz: Completion descriptor size
* @sg_desc_sz: Scatter/Gather descriptor size
@@ -541,10 +615,6 @@ union ionic_q_identity {
u8 version;
u8 supported;
u8 rsvd[6];
-#define IONIC_QIDENT_F_CQ 0x01 /* queue has completion ring */
-#define IONIC_QIDENT_F_SG 0x02 /* queue has scatter/gather ring */
-#define IONIC_QIDENT_F_EQ 0x04 /* queue can use event queue */
-#define IONIC_QIDENT_F_CMB 0x08 /* queue is in cmb bar */
__le64 features;
__le16 desc_sz;
__le16 comp_sz;
@@ -585,6 +655,7 @@ union ionic_q_identity {
* @ring_base: Queue ring base address
* @cq_ring_base: Completion queue ring base address
* @sg_ring_base: Scatter/Gather ring base address
+ * @features: Mask of queue features to enable, if not in the flags above.
*/
struct ionic_q_init_cmd {
u8 opcode;
@@ -608,7 +679,8 @@ struct ionic_q_init_cmd {
__le64 ring_base;
__le64 cq_ring_base;
__le64 sg_ring_base;
- u8 rsvd2[20];
+ u8 rsvd2[12];
+ __le64 features;
} __packed;
/**
@@ -1019,7 +1091,64 @@ enum ionic_eth_hw_features {
IONIC_ETH_HW_TSO_UDP_CSUM = BIT(16),
IONIC_ETH_HW_RX_CSUM_GENEVE = BIT(17),
IONIC_ETH_HW_TX_CSUM_GENEVE = BIT(18),
- IONIC_ETH_HW_TSO_GENEVE = BIT(19)
+ IONIC_ETH_HW_TSO_GENEVE = BIT(19),
+ IONIC_ETH_HW_TIMESTAMP = BIT(20),
+};
+
+/**
+ * enum ionic_pkt_class - Packet classification mask.
+ *
+ * Used with rx steering filter, packets indicated by the mask can be steered
+ * toward a specific receive queue.
+ *
+ * @IONIC_PKT_CLS_NTP_ALL: All NTP packets.
+ * @IONIC_PKT_CLS_PTP1_SYNC: PTPv1 sync
+ * @IONIC_PKT_CLS_PTP1_DREQ: PTPv1 delay-request
+ * @IONIC_PKT_CLS_PTP1_ALL: PTPv1 all packets
+ * @IONIC_PKT_CLS_PTP2_L4_SYNC: PTPv2-UDP sync
+ * @IONIC_PKT_CLS_PTP2_L4_DREQ: PTPv2-UDP delay-request
+ * @IONIC_PKT_CLS_PTP2_L4_ALL: PTPv2-UDP all packets
+ * @IONIC_PKT_CLS_PTP2_L2_SYNC: PTPv2-ETH sync
+ * @IONIC_PKT_CLS_PTP2_L2_DREQ: PTPv2-ETH delay-request
+ * @IONIC_PKT_CLS_PTP2_L2_ALL: PTPv2-ETH all packets
+ * @IONIC_PKT_CLS_PTP2_SYNC: PTPv2 sync
+ * @IONIC_PKT_CLS_PTP2_DREQ: PTPv2 delay-request
+ * @IONIC_PKT_CLS_PTP2_ALL: PTPv2 all packets
+ * @IONIC_PKT_CLS_PTP_SYNC: PTP sync
+ * @IONIC_PKT_CLS_PTP_DREQ: PTP delay-request
+ * @IONIC_PKT_CLS_PTP_ALL: PTP all packets
+ */
+enum ionic_pkt_class {
+ IONIC_PKT_CLS_NTP_ALL = BIT(0),
+
+ IONIC_PKT_CLS_PTP1_SYNC = BIT(1),
+ IONIC_PKT_CLS_PTP1_DREQ = BIT(2),
+ IONIC_PKT_CLS_PTP1_ALL = BIT(3) |
+ IONIC_PKT_CLS_PTP1_SYNC | IONIC_PKT_CLS_PTP1_DREQ,
+
+ IONIC_PKT_CLS_PTP2_L4_SYNC = BIT(4),
+ IONIC_PKT_CLS_PTP2_L4_DREQ = BIT(5),
+ IONIC_PKT_CLS_PTP2_L4_ALL = BIT(6) |
+ IONIC_PKT_CLS_PTP2_L4_SYNC | IONIC_PKT_CLS_PTP2_L4_DREQ,
+
+ IONIC_PKT_CLS_PTP2_L2_SYNC = BIT(7),
+ IONIC_PKT_CLS_PTP2_L2_DREQ = BIT(8),
+ IONIC_PKT_CLS_PTP2_L2_ALL = BIT(9) |
+ IONIC_PKT_CLS_PTP2_L2_SYNC | IONIC_PKT_CLS_PTP2_L2_DREQ,
+
+ IONIC_PKT_CLS_PTP2_SYNC =
+ IONIC_PKT_CLS_PTP2_L4_SYNC | IONIC_PKT_CLS_PTP2_L2_SYNC,
+ IONIC_PKT_CLS_PTP2_DREQ =
+ IONIC_PKT_CLS_PTP2_L4_DREQ | IONIC_PKT_CLS_PTP2_L2_DREQ,
+ IONIC_PKT_CLS_PTP2_ALL =
+ IONIC_PKT_CLS_PTP2_L4_ALL | IONIC_PKT_CLS_PTP2_L2_ALL,
+
+ IONIC_PKT_CLS_PTP_SYNC =
+ IONIC_PKT_CLS_PTP1_SYNC | IONIC_PKT_CLS_PTP2_SYNC,
+ IONIC_PKT_CLS_PTP_DREQ =
+ IONIC_PKT_CLS_PTP1_DREQ | IONIC_PKT_CLS_PTP2_DREQ,
+ IONIC_PKT_CLS_PTP_ALL =
+ IONIC_PKT_CLS_PTP1_ALL | IONIC_PKT_CLS_PTP2_ALL,
};
/**
@@ -1329,6 +1458,20 @@ enum ionic_stats_ctl_cmd {
};
/**
+ * enum ionic_txstamp_mode - List of TX Timestamping Modes
+ * @IONIC_TXSTAMP_OFF: Disable TX hardware timetamping.
+ * @IONIC_TXSTAMP_ON: Enable local TX hardware timetamping.
+ * @IONIC_TXSTAMP_ONESTEP_SYNC: Modify TX PTP Sync packets.
+ * @IONIC_TXSTAMP_ONESTEP_P2P: Modify TX PTP Sync and PDelayResp.
+ */
+enum ionic_txstamp_mode {
+ IONIC_TXSTAMP_OFF = 0,
+ IONIC_TXSTAMP_ON = 1,
+ IONIC_TXSTAMP_ONESTEP_SYNC = 2,
+ IONIC_TXSTAMP_ONESTEP_P2P = 3,
+};
+
+/**
* enum ionic_port_attr - List of device attributes
* @IONIC_PORT_ATTR_STATE: Port state attribute
* @IONIC_PORT_ATTR_SPEED: Port speed attribute
@@ -1570,6 +1713,7 @@ enum ionic_rss_hash_types {
* @IONIC_LIF_ATTR_FEATURES: LIF features attribute
* @IONIC_LIF_ATTR_RSS: LIF RSS attribute
* @IONIC_LIF_ATTR_STATS_CTRL: LIF statistics control attribute
+ * @IONIC_LIF_ATTR_TXSTAMP: LIF TX timestamping mode
*/
enum ionic_lif_attr {
IONIC_LIF_ATTR_STATE = 0,
@@ -1579,6 +1723,7 @@ enum ionic_lif_attr {
IONIC_LIF_ATTR_FEATURES = 4,
IONIC_LIF_ATTR_RSS = 5,
IONIC_LIF_ATTR_STATS_CTRL = 6,
+ IONIC_LIF_ATTR_TXSTAMP = 7,
};
/**
@@ -1596,6 +1741,7 @@ enum ionic_lif_attr {
* @key: The hash secret key
* @addr: Address for the indirection table shared memory
* @stats_ctl: stats control commands (enum ionic_stats_ctl_cmd)
+ * @txstamp: TX Timestamping Mode (enum ionic_txstamp_mode)
*/
struct ionic_lif_setattr_cmd {
u8 opcode;
@@ -1614,6 +1760,7 @@ struct ionic_lif_setattr_cmd {
__le64 addr;
} rss;
u8 stats_ctl;
+ __le16 txstamp_mode;
u8 rsvd[60];
} __packed;
};
@@ -1658,6 +1805,7 @@ struct ionic_lif_getattr_cmd {
* @mtu: Mtu
* @mac: Station mac
* @features: Features (enum ionic_eth_hw_features)
+ * @txstamp: TX Timestamping Mode (enum ionic_txstamp_mode)
* @color: Color bit
*/
struct ionic_lif_getattr_comp {
@@ -1669,11 +1817,35 @@ struct ionic_lif_getattr_comp {
__le32 mtu;
u8 mac[6];
__le64 features;
+ __le16 txstamp_mode;
u8 rsvd2[11];
} __packed;
u8 color;
};
+/**
+ * struct ionic_lif_setphc_cmd - Set LIF PTP Hardware Clock
+ * @opcode: Opcode
+ * @lif_index: LIF index
+ * @tick: Hardware stamp tick of an instant in time.
+ * @nsec: Nanosecond stamp of the same instant.
+ * @frac: Fractional nanoseconds at the same instant.
+ * @mult: Cycle to nanosecond multiplier.
+ * @shift: Cycle to nanosecond divisor (power of two).
+ */
+struct ionic_lif_setphc_cmd {
+ u8 opcode;
+ u8 rsvd1;
+ __le16 lif_index;
+ u8 rsvd2[4];
+ __le64 tick;
+ __le64 nsec;
+ __le64 frac;
+ __le32 mult;
+ __le32 shift;
+ u8 rsvd3[24];
+};
+
enum ionic_rx_mode {
IONIC_RX_MODE_F_UNICAST = BIT(0),
IONIC_RX_MODE_F_MULTICAST = BIT(1),
@@ -1706,9 +1878,10 @@ struct ionic_rx_mode_set_cmd {
typedef struct ionic_admin_comp ionic_rx_mode_set_comp;
enum ionic_rx_filter_match_type {
- IONIC_RX_FILTER_MATCH_VLAN = 0,
- IONIC_RX_FILTER_MATCH_MAC,
- IONIC_RX_FILTER_MATCH_MAC_VLAN,
+ IONIC_RX_FILTER_MATCH_VLAN = 0x0,
+ IONIC_RX_FILTER_MATCH_MAC = 0x1,
+ IONIC_RX_FILTER_MATCH_MAC_VLAN = 0x2,
+ IONIC_RX_FILTER_STEER_PKTCLASS = 0x10,
};
/**
@@ -1725,6 +1898,7 @@ enum ionic_rx_filter_match_type {
* @mac_vlan: MACVLAN filter
* @vlan: VLAN ID
* @addr: MAC address (network-byte order)
+ * @pkt_class: Packet classification filter
*/
struct ionic_rx_filter_add_cmd {
u8 opcode;
@@ -1743,8 +1917,9 @@ struct ionic_rx_filter_add_cmd {
__le16 vlan;
u8 addr[6];
} mac_vlan;
+ __le64 pkt_class;
u8 rsvd[54];
- };
+ } __packed;
};
/**
@@ -2745,6 +2920,16 @@ union ionic_dev_cmd_comp {
};
/**
+ * struct ionic_hwstamp_regs - Hardware current timestamp registers
+ * @tick_low: Low 32 bits of hardware timestamp
+ * @tick_high: High 32 bits of hardware timestamp
+ */
+struct ionic_hwstamp_regs {
+ u32 tick_low;
+ u32 tick_high;
+};
+
+/**
* union ionic_dev_info_regs - Device info register format (read-only)
* @signature: Signature value of 0x44455649 ('DEVI')
* @version: Current version of info
@@ -2754,6 +2939,7 @@ union ionic_dev_cmd_comp {
* @fw_heartbeat: Firmware heartbeat counter
* @serial_num: Serial number
* @fw_version: Firmware version
+ * @hwstamp_regs: Hardware current timestamp registers
*/
union ionic_dev_info_regs {
#define IONIC_DEVINFO_FWVERS_BUFLEN 32
@@ -2768,6 +2954,8 @@ union ionic_dev_info_regs {
u32 fw_heartbeat;
char fw_version[IONIC_DEVINFO_FWVERS_BUFLEN];
char serial_num[IONIC_DEVINFO_SERIAL_BUFLEN];
+ u8 rsvd_pad1024[948];
+ struct ionic_hwstamp_regs hwstamp;
};
u32 words[512];
};
@@ -2815,6 +3003,7 @@ union ionic_adminq_cmd {
struct ionic_q_control_cmd q_control;
struct ionic_lif_setattr_cmd lif_setattr;
struct ionic_lif_getattr_cmd lif_getattr;
+ struct ionic_lif_setphc_cmd lif_setphc;
struct ionic_rx_mode_set_cmd rx_mode_set;
struct ionic_rx_filter_add_cmd rx_filter_add;
struct ionic_rx_filter_del_cmd rx_filter_del;
@@ -2831,6 +3020,7 @@ union ionic_adminq_comp {
struct ionic_q_init_comp q_init;
struct ionic_lif_setattr_comp lif_setattr;
struct ionic_lif_getattr_comp lif_getattr;
+ struct ionic_admin_comp lif_setphc;
struct ionic_rx_filter_add_comp rx_filter_add;
struct ionic_fw_control_comp fw_control;
};
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
index a51be25723a5..ee56fed12e07 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
@@ -684,11 +684,11 @@ static int ionic_qcqs_alloc(struct ionic_lif *lif)
if (!lif->rxqcqs)
goto err_out;
- lif->txqstats = devm_kcalloc(dev, lif->ionic->ntxqs_per_lif,
+ lif->txqstats = devm_kcalloc(dev, lif->ionic->ntxqs_per_lif + 1,
sizeof(*lif->txqstats), GFP_KERNEL);
if (!lif->txqstats)
goto err_out;
- lif->rxqstats = devm_kcalloc(dev, lif->ionic->nrxqs_per_lif,
+ lif->rxqstats = devm_kcalloc(dev, lif->ionic->nrxqs_per_lif + 1,
sizeof(*lif->rxqstats), GFP_KERNEL);
if (!lif->rxqstats)
goto err_out;
@@ -731,6 +731,7 @@ static int ionic_lif_txq_init(struct ionic_lif *lif, struct ionic_qcq *qcq)
.ring_base = cpu_to_le64(q->base_pa),
.cq_ring_base = cpu_to_le64(cq->base_pa),
.sg_ring_base = cpu_to_le64(q->sg_base_pa),
+ .features = cpu_to_le64(q->features),
},
};
unsigned int intr_index;
@@ -791,6 +792,7 @@ static int ionic_lif_rxq_init(struct ionic_lif *lif, struct ionic_qcq *qcq)
.ring_base = cpu_to_le64(q->base_pa),
.cq_ring_base = cpu_to_le64(cq->base_pa),
.sg_ring_base = cpu_to_le64(q->sg_base_pa),
+ .features = cpu_to_le64(q->features),
},
};
int err;
@@ -828,6 +830,254 @@ static int ionic_lif_rxq_init(struct ionic_lif *lif, struct ionic_qcq *qcq)
return 0;
}
+int ionic_lif_create_hwstamp_txq(struct ionic_lif *lif)
+{
+ unsigned int num_desc, desc_sz, comp_sz, sg_desc_sz;
+ unsigned int txq_i, flags;
+ struct ionic_qcq *txq;
+ u64 features;
+ int err;
+
+ mutex_lock(&lif->queue_lock);
+
+ if (lif->hwstamp_txq)
+ goto out;
+
+ features = IONIC_Q_F_2X_CQ_DESC | IONIC_TXQ_F_HWSTAMP;
+
+ num_desc = IONIC_MIN_TXRX_DESC;
+ desc_sz = sizeof(struct ionic_txq_desc);
+ comp_sz = 2 * sizeof(struct ionic_txq_comp);
+
+ if (lif->qtype_info[IONIC_QTYPE_TXQ].version >= 1 &&
+ lif->qtype_info[IONIC_QTYPE_TXQ].sg_desc_sz == sizeof(struct ionic_txq_sg_desc_v1))
+ sg_desc_sz = sizeof(struct ionic_txq_sg_desc_v1);
+ else
+ sg_desc_sz = sizeof(struct ionic_txq_sg_desc);
+
+ txq_i = lif->ionic->ntxqs_per_lif;
+ flags = IONIC_QCQ_F_TX_STATS | IONIC_QCQ_F_SG;
+
+ err = ionic_qcq_alloc(lif, IONIC_QTYPE_TXQ, txq_i, "hwstamp_tx", flags,
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
+ lif->kern_pid, &txq);
+ if (err)
+ goto err_qcq_alloc;
+
+ txq->q.features = features;
+
+ ionic_link_qcq_interrupts(lif->adminqcq, txq);
+ ionic_debugfs_add_qcq(lif, txq);
+
+ lif->hwstamp_txq = txq;
+
+ if (netif_running(lif->netdev)) {
+ err = ionic_lif_txq_init(lif, txq);
+ if (err)
+ goto err_qcq_init;
+
+ if (test_bit(IONIC_LIF_F_UP, lif->state)) {
+ err = ionic_qcq_enable(txq);
+ if (err)
+ goto err_qcq_enable;
+ }
+ }
+
+out:
+ mutex_unlock(&lif->queue_lock);
+
+ return 0;
+
+err_qcq_enable:
+ ionic_lif_qcq_deinit(lif, txq);
+err_qcq_init:
+ lif->hwstamp_txq = NULL;
+ ionic_debugfs_del_qcq(txq);
+ ionic_qcq_free(lif, txq);
+ devm_kfree(lif->ionic->dev, txq);
+err_qcq_alloc:
+ mutex_unlock(&lif->queue_lock);
+ return err;
+}
+
+int ionic_lif_create_hwstamp_rxq(struct ionic_lif *lif)
+{
+ unsigned int num_desc, desc_sz, comp_sz, sg_desc_sz;
+ unsigned int rxq_i, flags;
+ struct ionic_qcq *rxq;
+ u64 features;
+ int err;
+
+ mutex_lock(&lif->queue_lock);
+
+ if (lif->hwstamp_rxq)
+ goto out;
+
+ features = IONIC_Q_F_2X_CQ_DESC | IONIC_RXQ_F_HWSTAMP;
+
+ num_desc = IONIC_MIN_TXRX_DESC;
+ desc_sz = sizeof(struct ionic_rxq_desc);
+ comp_sz = 2 * sizeof(struct ionic_rxq_comp);
+ sg_desc_sz = sizeof(struct ionic_rxq_sg_desc);
+
+ rxq_i = lif->ionic->nrxqs_per_lif;
+ flags = IONIC_QCQ_F_RX_STATS | IONIC_QCQ_F_SG;
+
+ err = ionic_qcq_alloc(lif, IONIC_QTYPE_RXQ, rxq_i, "hwstamp_rx", flags,
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
+ lif->kern_pid, &rxq);
+ if (err)
+ goto err_qcq_alloc;
+
+ rxq->q.features = features;
+
+ ionic_link_qcq_interrupts(lif->adminqcq, rxq);
+ ionic_debugfs_add_qcq(lif, rxq);
+
+ lif->hwstamp_rxq = rxq;
+
+ if (netif_running(lif->netdev)) {
+ err = ionic_lif_rxq_init(lif, rxq);
+ if (err)
+ goto err_qcq_init;
+
+ if (test_bit(IONIC_LIF_F_UP, lif->state)) {
+ ionic_rx_fill(&rxq->q);
+ err = ionic_qcq_enable(rxq);
+ if (err)
+ goto err_qcq_enable;
+ }
+ }
+
+out:
+ mutex_unlock(&lif->queue_lock);
+
+ return 0;
+
+err_qcq_enable:
+ ionic_lif_qcq_deinit(lif, rxq);
+err_qcq_init:
+ lif->hwstamp_rxq = NULL;
+ ionic_debugfs_del_qcq(rxq);
+ ionic_qcq_free(lif, rxq);
+ devm_kfree(lif->ionic->dev, rxq);
+err_qcq_alloc:
+ mutex_unlock(&lif->queue_lock);
+ return err;
+}
+
+int ionic_lif_config_hwstamp_rxq_all(struct ionic_lif *lif, bool rx_all)
+{
+ struct ionic_queue_params qparam;
+
+ ionic_init_queue_params(lif, &qparam);
+
+ if (rx_all)
+ qparam.rxq_features = IONIC_Q_F_2X_CQ_DESC | IONIC_RXQ_F_HWSTAMP;
+ else
+ qparam.rxq_features = 0;
+
+ /* if we're not running, just set the values and return */
+ if (!netif_running(lif->netdev)) {
+ lif->rxq_features = qparam.rxq_features;
+ return 0;
+ }
+
+ return ionic_reconfigure_queues(lif, &qparam);
+}
+
+int ionic_lif_set_hwstamp_txmode(struct ionic_lif *lif, u16 txstamp_mode)
+{
+ struct ionic_admin_ctx ctx = {
+ .work = COMPLETION_INITIALIZER_ONSTACK(ctx.work),
+ .cmd.lif_setattr = {
+ .opcode = IONIC_CMD_LIF_SETATTR,
+ .index = cpu_to_le16(lif->index),
+ .attr = IONIC_LIF_ATTR_TXSTAMP,
+ .txstamp_mode = cpu_to_le16(txstamp_mode),
+ },
+ };
+
+ return ionic_adminq_post_wait(lif, &ctx);
+}
+
+static void ionic_lif_del_hwstamp_rxfilt(struct ionic_lif *lif)
+{
+ struct ionic_admin_ctx ctx = {
+ .work = COMPLETION_INITIALIZER_ONSTACK(ctx.work),
+ .cmd.rx_filter_del = {
+ .opcode = IONIC_CMD_RX_FILTER_DEL,
+ .lif_index = cpu_to_le16(lif->index),
+ },
+ };
+ struct ionic_rx_filter *f;
+ u32 filter_id;
+ int err;
+
+ spin_lock_bh(&lif->rx_filters.lock);
+
+ f = ionic_rx_filter_rxsteer(lif);
+ if (!f) {
+ spin_unlock_bh(&lif->rx_filters.lock);
+ return;
+ }
+
+ filter_id = f->filter_id;
+ ionic_rx_filter_free(lif, f);
+
+ spin_unlock_bh(&lif->rx_filters.lock);
+
+ netdev_dbg(lif->netdev, "rx_filter del RXSTEER (id %d)\n", filter_id);
+
+ ctx.cmd.rx_filter_del.filter_id = cpu_to_le32(filter_id);
+
+ err = ionic_adminq_post_wait(lif, &ctx);
+ if (err && err != -EEXIST)
+ netdev_dbg(lif->netdev, "failed to delete rx_filter RXSTEER (id %d)\n", filter_id);
+}
+
+static int ionic_lif_add_hwstamp_rxfilt(struct ionic_lif *lif, u64 pkt_class)
+{
+ struct ionic_admin_ctx ctx = {
+ .work = COMPLETION_INITIALIZER_ONSTACK(ctx.work),
+ .cmd.rx_filter_add = {
+ .opcode = IONIC_CMD_RX_FILTER_ADD,
+ .lif_index = cpu_to_le16(lif->index),
+ .match = cpu_to_le16(IONIC_RX_FILTER_STEER_PKTCLASS),
+ .pkt_class = cpu_to_le64(pkt_class),
+ },
+ };
+ u8 qtype;
+ u32 qid;
+ int err;
+
+ if (!lif->hwstamp_rxq)
+ return -EINVAL;
+
+ qtype = lif->hwstamp_rxq->q.type;
+ ctx.cmd.rx_filter_add.qtype = qtype;
+
+ qid = lif->hwstamp_rxq->q.index;
+ ctx.cmd.rx_filter_add.qid = cpu_to_le32(qid);
+
+ netdev_dbg(lif->netdev, "rx_filter add RXSTEER\n");
+ err = ionic_adminq_post_wait(lif, &ctx);
+ if (err && err != -EEXIST)
+ return err;
+
+ return ionic_rx_filter_save(lif, 0, qid, 0, &ctx);
+}
+
+int ionic_lif_set_hwstamp_rxfilt(struct ionic_lif *lif, u64 pkt_class)
+{
+ ionic_lif_del_hwstamp_rxfilt(lif);
+
+ if (!pkt_class)
+ return 0;
+
+ return ionic_lif_add_hwstamp_rxfilt(lif, pkt_class);
+}
+
static bool ionic_notifyq_service(struct ionic_cq *cq,
struct ionic_cq_info *cq_info)
{
@@ -895,9 +1145,12 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget)
struct ionic_dev *idev = &lif->ionic->idev;
unsigned long irqflags;
unsigned int flags = 0;
+ int rx_work = 0;
+ int tx_work = 0;
int n_work = 0;
int a_work = 0;
int work_done;
+ int credits;
if (lif->notifyqcq && lif->notifyqcq->flags & IONIC_QCQ_F_INITED)
n_work = ionic_cq_service(&lif->notifyqcq->cq, budget,
@@ -909,7 +1162,15 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget)
ionic_adminq_service, NULL, NULL);
spin_unlock_irqrestore(&lif->adminq_lock, irqflags);
- work_done = max(n_work, a_work);
+ if (lif->hwstamp_rxq)
+ rx_work = ionic_cq_service(&lif->hwstamp_rxq->cq, budget,
+ ionic_rx_service, NULL, NULL);
+
+ if (lif->hwstamp_txq)
+ tx_work = ionic_cq_service(&lif->hwstamp_txq->cq, budget,
+ ionic_tx_service, NULL, NULL);
+
+ work_done = max(max(n_work, a_work), max(rx_work, tx_work));
if (work_done < budget && napi_complete_done(napi, work_done)) {
flags |= IONIC_INTR_CRED_UNMASK;
intr->rearm_count++;
@@ -917,9 +1178,8 @@ static int ionic_adminq_napi(struct napi_struct *napi, int budget)
if (work_done || flags) {
flags |= IONIC_INTR_CRED_RESET_COALESCE;
- ionic_intr_credits(idev->intr_ctrl,
- intr->index,
- n_work + a_work, flags);
+ credits = n_work + a_work + rx_work + tx_work;
+ ionic_intr_credits(idev->intr_ctrl, intr->index, credits, flags);
}
return work_done;
@@ -1279,6 +1539,10 @@ static int ionic_set_nic_features(struct ionic_lif *lif,
int err;
ctx.cmd.lif_setattr.features = ionic_netdev_features_to_nic(features);
+
+ if (lif->phc)
+ ctx.cmd.lif_setattr.features |= cpu_to_le64(IONIC_ETH_HW_TIMESTAMP);
+
err = ionic_adminq_post_wait(lif, &ctx);
if (err)
return err;
@@ -1326,6 +1590,8 @@ static int ionic_set_nic_features(struct ionic_lif *lif,
dev_dbg(dev, "feature ETH_HW_TSO_UDP\n");
if (lif->hw_features & IONIC_ETH_HW_TSO_UDP_CSUM)
dev_dbg(dev, "feature ETH_HW_TSO_UDP_CSUM\n");
+ if (lif->hw_features & IONIC_ETH_HW_TIMESTAMP)
+ dev_dbg(dev, "feature ETH_HW_TIMESTAMP\n");
return 0;
}
@@ -1668,11 +1934,17 @@ static void ionic_txrx_disable(struct ionic_lif *lif)
err = ionic_qcq_disable(lif->txqcqs[i], (err != -ETIMEDOUT));
}
+ if (lif->hwstamp_txq)
+ err = ionic_qcq_disable(lif->hwstamp_txq, (err != -ETIMEDOUT));
+
if (lif->rxqcqs) {
for (i = 0; i < lif->nxqs; i++)
err = ionic_qcq_disable(lif->rxqcqs[i], (err != -ETIMEDOUT));
}
+ if (lif->hwstamp_rxq)
+ err = ionic_qcq_disable(lif->hwstamp_rxq, (err != -ETIMEDOUT));
+
ionic_lif_quiesce(lif);
}
@@ -1695,6 +1967,17 @@ static void ionic_txrx_deinit(struct ionic_lif *lif)
}
}
lif->rx_mode = 0;
+
+ if (lif->hwstamp_txq) {
+ ionic_lif_qcq_deinit(lif, lif->hwstamp_txq);
+ ionic_tx_flush(&lif->hwstamp_txq->cq);
+ ionic_tx_empty(&lif->hwstamp_txq->q);
+ }
+
+ if (lif->hwstamp_rxq) {
+ ionic_lif_qcq_deinit(lif, lif->hwstamp_rxq);
+ ionic_rx_empty(&lif->hwstamp_rxq->q);
+ }
}
static void ionic_txrx_free(struct ionic_lif *lif)
@@ -1716,15 +1999,31 @@ static void ionic_txrx_free(struct ionic_lif *lif)
lif->rxqcqs[i] = NULL;
}
}
+
+ if (lif->hwstamp_txq) {
+ ionic_qcq_free(lif, lif->hwstamp_txq);
+ devm_kfree(lif->ionic->dev, lif->hwstamp_txq);
+ lif->hwstamp_txq = NULL;
+ }
+
+ if (lif->hwstamp_rxq) {
+ ionic_qcq_free(lif, lif->hwstamp_rxq);
+ devm_kfree(lif->ionic->dev, lif->hwstamp_rxq);
+ lif->hwstamp_rxq = NULL;
+ }
}
static int ionic_txrx_alloc(struct ionic_lif *lif)
{
- unsigned int sg_desc_sz;
+ unsigned int num_desc, desc_sz, comp_sz, sg_desc_sz;
unsigned int flags;
unsigned int i;
int err = 0;
+ num_desc = lif->ntxq_descs;
+ desc_sz = sizeof(struct ionic_txq_desc);
+ comp_sz = sizeof(struct ionic_txq_comp);
+
if (lif->qtype_info[IONIC_QTYPE_TXQ].version >= 1 &&
lif->qtype_info[IONIC_QTYPE_TXQ].sg_desc_sz ==
sizeof(struct ionic_txq_sg_desc_v1))
@@ -1737,10 +2036,7 @@ static int ionic_txrx_alloc(struct ionic_lif *lif)
flags |= IONIC_QCQ_F_INTR;
for (i = 0; i < lif->nxqs; i++) {
err = ionic_qcq_alloc(lif, IONIC_QTYPE_TXQ, i, "tx", flags,
- lif->ntxq_descs,
- sizeof(struct ionic_txq_desc),
- sizeof(struct ionic_txq_comp),
- sg_desc_sz,
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
lif->kern_pid, &lif->txqcqs[i]);
if (err)
goto err_out;
@@ -1757,16 +2053,24 @@ static int ionic_txrx_alloc(struct ionic_lif *lif)
}
flags = IONIC_QCQ_F_RX_STATS | IONIC_QCQ_F_SG | IONIC_QCQ_F_INTR;
+
+ num_desc = lif->nrxq_descs;
+ desc_sz = sizeof(struct ionic_rxq_desc);
+ comp_sz = sizeof(struct ionic_rxq_comp);
+ sg_desc_sz = sizeof(struct ionic_rxq_sg_desc);
+
+ if (lif->rxq_features & IONIC_Q_F_2X_CQ_DESC)
+ comp_sz *= 2;
+
for (i = 0; i < lif->nxqs; i++) {
err = ionic_qcq_alloc(lif, IONIC_QTYPE_RXQ, i, "rx", flags,
- lif->nrxq_descs,
- sizeof(struct ionic_rxq_desc),
- sizeof(struct ionic_rxq_comp),
- sizeof(struct ionic_rxq_sg_desc),
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
lif->kern_pid, &lif->rxqcqs[i]);
if (err)
goto err_out;
+ lif->rxqcqs[i]->q.features = lif->rxq_features;
+
ionic_intr_coal_init(lif->ionic->idev.intr_ctrl,
lif->rxqcqs[i]->intr.index,
lif->rx_coalesce_hw);
@@ -1845,8 +2149,26 @@ static int ionic_txrx_enable(struct ionic_lif *lif)
}
}
+ if (lif->hwstamp_rxq) {
+ ionic_rx_fill(&lif->hwstamp_rxq->q);
+ err = ionic_qcq_enable(lif->hwstamp_rxq);
+ if (err)
+ goto err_out_hwstamp_rx;
+ }
+
+ if (lif->hwstamp_txq) {
+ err = ionic_qcq_enable(lif->hwstamp_txq);
+ if (err)
+ goto err_out_hwstamp_tx;
+ }
+
return 0;
+err_out_hwstamp_tx:
+ if (lif->hwstamp_rxq)
+ derr = ionic_qcq_disable(lif->hwstamp_rxq, (derr != -ETIMEDOUT));
+err_out_hwstamp_rx:
+ i = lif->nxqs;
err_out:
while (i--) {
derr = ionic_qcq_disable(lif->txqcqs[i], (derr != -ETIMEDOUT));
@@ -1943,6 +2265,20 @@ static int ionic_stop(struct net_device *netdev)
return 0;
}
+static int ionic_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+ struct ionic_lif *lif = netdev_priv(netdev);
+
+ switch (cmd) {
+ case SIOCSHWTSTAMP:
+ return ionic_lif_hwstamp_set(lif, ifr);
+ case SIOCGHWTSTAMP:
+ return ionic_lif_hwstamp_get(lif, ifr);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
static int ionic_get_vf_config(struct net_device *netdev,
int vf, struct ifla_vf_info *ivf)
{
@@ -2191,6 +2527,7 @@ static int ionic_set_vf_link_state(struct net_device *netdev, int vf, int set)
static const struct net_device_ops ionic_netdev_ops = {
.ndo_open = ionic_open,
.ndo_stop = ionic_stop,
+ .ndo_do_ioctl = ionic_do_ioctl,
.ndo_start_xmit = ionic_start_xmit,
.ndo_get_stats64 = ionic_get_stats64,
.ndo_set_rx_mode = ionic_ndo_set_rx_mode,
@@ -2214,7 +2551,9 @@ static const struct net_device_ops ionic_netdev_ops = {
static void ionic_swap_queues(struct ionic_qcq *a, struct ionic_qcq *b)
{
/* only swapping the queues, not the napi, flags, or other stuff */
+ swap(a->q.features, b->q.features);
swap(a->q.num_descs, b->q.num_descs);
+ swap(a->q.desc_size, b->q.desc_size);
swap(a->q.base, b->q.base);
swap(a->q.base_pa, b->q.base_pa);
swap(a->q.info, b->q.info);
@@ -2222,6 +2561,7 @@ static void ionic_swap_queues(struct ionic_qcq *a, struct ionic_qcq *b)
swap(a->q_base_pa, b->q_base_pa);
swap(a->q_size, b->q_size);
+ swap(a->q.sg_desc_size, b->q.sg_desc_size);
swap(a->q.sg_base, b->q.sg_base);
swap(a->q.sg_base_pa, b->q.sg_base_pa);
swap(a->sg_base, b->sg_base);
@@ -2229,6 +2569,7 @@ static void ionic_swap_queues(struct ionic_qcq *a, struct ionic_qcq *b)
swap(a->sg_size, b->sg_size);
swap(a->cq.num_descs, b->cq.num_descs);
+ swap(a->cq.desc_size, b->cq.desc_size);
swap(a->cq.base, b->cq.base);
swap(a->cq.base_pa, b->cq.base_pa);
swap(a->cq.info, b->cq.info);
@@ -2243,9 +2584,9 @@ static void ionic_swap_queues(struct ionic_qcq *a, struct ionic_qcq *b)
int ionic_reconfigure_queues(struct ionic_lif *lif,
struct ionic_queue_params *qparam)
{
+ unsigned int num_desc, desc_sz, comp_sz, sg_desc_sz;
struct ionic_qcq **tx_qcqs = NULL;
struct ionic_qcq **rx_qcqs = NULL;
- unsigned int sg_desc_sz;
unsigned int flags;
int err = -ENOMEM;
unsigned int i;
@@ -2257,7 +2598,9 @@ int ionic_reconfigure_queues(struct ionic_lif *lif,
if (!tx_qcqs)
goto err_out;
}
- if (qparam->nxqs != lif->nxqs || qparam->nrxq_descs != lif->nrxq_descs) {
+ if (qparam->nxqs != lif->nxqs ||
+ qparam->nrxq_descs != lif->nrxq_descs ||
+ qparam->rxq_features != lif->rxq_features) {
rx_qcqs = devm_kcalloc(lif->ionic->dev, lif->ionic->nrxqs_per_lif,
sizeof(struct ionic_qcq *), GFP_KERNEL);
if (!rx_qcqs)
@@ -2267,21 +2610,22 @@ int ionic_reconfigure_queues(struct ionic_lif *lif,
/* allocate new desc_info and rings, but leave the interrupt setup
* until later so as to not mess with the still-running queues
*/
- if (lif->qtype_info[IONIC_QTYPE_TXQ].version >= 1 &&
- lif->qtype_info[IONIC_QTYPE_TXQ].sg_desc_sz ==
- sizeof(struct ionic_txq_sg_desc_v1))
- sg_desc_sz = sizeof(struct ionic_txq_sg_desc_v1);
- else
- sg_desc_sz = sizeof(struct ionic_txq_sg_desc);
-
if (tx_qcqs) {
+ num_desc = qparam->ntxq_descs;
+ desc_sz = sizeof(struct ionic_txq_desc);
+ comp_sz = sizeof(struct ionic_txq_comp);
+
+ if (lif->qtype_info[IONIC_QTYPE_TXQ].version >= 1 &&
+ lif->qtype_info[IONIC_QTYPE_TXQ].sg_desc_sz ==
+ sizeof(struct ionic_txq_sg_desc_v1))
+ sg_desc_sz = sizeof(struct ionic_txq_sg_desc_v1);
+ else
+ sg_desc_sz = sizeof(struct ionic_txq_sg_desc);
+
for (i = 0; i < qparam->nxqs; i++) {
flags = lif->txqcqs[i]->flags & ~IONIC_QCQ_F_INTR;
err = ionic_qcq_alloc(lif, IONIC_QTYPE_TXQ, i, "tx", flags,
- qparam->ntxq_descs,
- sizeof(struct ionic_txq_desc),
- sizeof(struct ionic_txq_comp),
- sg_desc_sz,
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
lif->kern_pid, &tx_qcqs[i]);
if (err)
goto err_out;
@@ -2289,16 +2633,23 @@ int ionic_reconfigure_queues(struct ionic_lif *lif,
}
if (rx_qcqs) {
+ num_desc = qparam->nrxq_descs;
+ desc_sz = sizeof(struct ionic_rxq_desc);
+ comp_sz = sizeof(struct ionic_rxq_comp);
+ sg_desc_sz = sizeof(struct ionic_rxq_sg_desc);
+
+ if (qparam->rxq_features & IONIC_Q_F_2X_CQ_DESC)
+ comp_sz *= 2;
+
for (i = 0; i < qparam->nxqs; i++) {
flags = lif->rxqcqs[i]->flags & ~IONIC_QCQ_F_INTR;
err = ionic_qcq_alloc(lif, IONIC_QTYPE_RXQ, i, "rx", flags,
- qparam->nrxq_descs,
- sizeof(struct ionic_rxq_desc),
- sizeof(struct ionic_rxq_comp),
- sizeof(struct ionic_rxq_sg_desc),
+ num_desc, desc_sz, comp_sz, sg_desc_sz,
lif->kern_pid, &rx_qcqs[i]);
if (err)
goto err_out;
+
+ rx_qcqs[i]->q.features = qparam->rxq_features;
}
}
@@ -2385,6 +2736,7 @@ int ionic_reconfigure_queues(struct ionic_lif *lif,
}
swap(lif->nxqs, qparam->nxqs);
+ swap(lif->rxq_features, qparam->rxq_features);
err_out_reinit_unlock:
/* re-init the queues, but don't lose an error code */
@@ -2536,6 +2888,8 @@ int ionic_lif_alloc(struct ionic *ionic)
}
netdev_rss_key_fill(lif->rss_hash_key, IONIC_RSS_HASH_KEY_SIZE);
+ ionic_lif_alloc_phc(lif);
+
return 0;
err_out_free_qcqs:
@@ -2635,6 +2989,9 @@ static void ionic_lif_handle_fw_up(struct ionic_lif *lif)
goto err_txrx_free;
}
+ /* restore the hardware timestamping queues */
+ ionic_lif_hwstamp_set(lif, NULL);
+
clear_bit(IONIC_LIF_F_FW_RESET, lif->state);
ionic_link_status_check_request(lif, CAN_SLEEP);
netif_device_attach(lif->netdev);
@@ -2656,6 +3013,8 @@ void ionic_lif_free(struct ionic_lif *lif)
{
struct device *dev = lif->ionic->dev;
+ ionic_lif_free_phc(lif);
+
/* free rss indirection table */
dma_free_coherent(dev, lif->rss_ind_tbl_sz, lif->rss_ind_tbl,
lif->rss_ind_tbl_pa);
@@ -2992,6 +3351,8 @@ int ionic_lif_register(struct ionic_lif *lif)
{
int err;
+ ionic_lif_register_phc(lif);
+
INIT_WORK(&lif->ionic->nb_work, ionic_lif_notify_work);
lif->ionic->nb.notifier_call = ionic_lif_notify;
@@ -3004,6 +3365,7 @@ int ionic_lif_register(struct ionic_lif *lif)
err = register_netdev(lif->netdev);
if (err) {
dev_err(lif->ionic->dev, "Cannot register net device, aborting\n");
+ ionic_lif_unregister_phc(lif);
return err;
}
@@ -3024,6 +3386,9 @@ void ionic_lif_unregister(struct ionic_lif *lif)
if (lif->netdev->reg_state == NETREG_REGISTERED)
unregister_netdev(lif->netdev);
+
+ ionic_lif_unregister_phc(lif);
+
lif->registered = false;
}
@@ -3163,6 +3528,16 @@ int ionic_lif_size(struct ionic *ionic)
ntxqs_per_lif = le32_to_cpu(lc->queue_count[IONIC_QTYPE_TXQ]);
nrxqs_per_lif = le32_to_cpu(lc->queue_count[IONIC_QTYPE_RXQ]);
+ /* reserve last queue id for hardware timestamping */
+ if (lc->features & cpu_to_le64(IONIC_ETH_HW_TIMESTAMP)) {
+ if (ntxqs_per_lif <= 1 || nrxqs_per_lif <= 1) {
+ lc->features &= cpu_to_le64(~IONIC_ETH_HW_TIMESTAMP);
+ } else {
+ ntxqs_per_lif -= 1;
+ nrxqs_per_lif -= 1;
+ }
+ }
+
nxqs = min(ntxqs_per_lif, nrxqs_per_lif);
nxqs = min(nxqs, num_online_cpus());
neqs = min(neqs_per_lif, num_online_cpus());
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h
index be5cc89b2bd9..ea3b086af179 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h
@@ -4,6 +4,9 @@
#ifndef _IONIC_LIF_H_
#define _IONIC_LIF_H_
+#include <linux/ptp_clock_kernel.h>
+#include <linux/timecounter.h>
+#include <uapi/linux/net_tstamp.h>
#include <linux/dim.h>
#include <linux/pci.h>
#include "ionic_rx_filter.h"
@@ -36,6 +39,8 @@ struct ionic_tx_stats {
u64 crc32_csum;
u64 sg_cntr[IONIC_MAX_NUM_SG_CNTR];
u64 dma_map_err;
+ u64 hwstamp_valid;
+ u64 hwstamp_invalid;
};
struct ionic_rx_stats {
@@ -49,6 +54,8 @@ struct ionic_rx_stats {
u64 csum_error;
u64 dma_map_err;
u64 alloc_err;
+ u64 hwstamp_valid;
+ u64 hwstamp_invalid;
};
#define IONIC_QCQ_F_INITED BIT(0)
@@ -125,6 +132,10 @@ struct ionic_lif_sw_stats {
u64 rx_csum_none;
u64 rx_csum_complete;
u64 rx_csum_error;
+ u64 tx_hwstamp_valid;
+ u64 tx_hwstamp_invalid;
+ u64 rx_hwstamp_valid;
+ u64 rx_hwstamp_invalid;
u64 hw_tx_dropped;
u64 hw_rx_dropped;
u64 hw_rx_over_errors;
@@ -158,6 +169,8 @@ struct ionic_qtype_info {
u16 sg_desc_stride;
};
+struct ionic_phc;
+
#define IONIC_LIF_NAME_MAX_SZ 32
struct ionic_lif {
struct net_device *netdev;
@@ -170,8 +183,10 @@ struct ionic_lif {
struct ionic_qcq *adminqcq;
struct ionic_qcq *notifyqcq;
struct ionic_qcq **txqcqs;
+ struct ionic_qcq *hwstamp_txq;
struct ionic_tx_stats *txqstats;
struct ionic_qcq **rxqcqs;
+ struct ionic_qcq *hwstamp_rxq;
struct ionic_rx_stats *rxqstats;
struct ionic_deferred deferred;
struct work_struct tx_timeout_work;
@@ -183,6 +198,7 @@ struct ionic_lif {
unsigned int ntxq_descs;
unsigned int nrxq_descs;
u32 rx_copybreak;
+ u64 rxq_features;
unsigned int rx_mode;
u64 hw_features;
bool registered;
@@ -213,14 +229,35 @@ struct ionic_lif {
unsigned long *dbid_inuse;
unsigned int dbid_count;
+ struct ionic_phc *phc;
+
struct dentry *dentry;
};
+struct ionic_phc {
+ spinlock_t lock; /* lock for cc and tc */
+ struct cyclecounter cc;
+ struct timecounter tc;
+
+ struct mutex config_lock; /* lock for ts_config */
+ struct hwtstamp_config ts_config;
+ u64 ts_config_rx_filt;
+ u32 ts_config_tx_mode;
+
+ u32 init_cc_mult;
+ long aux_work_delay;
+
+ struct ptp_clock_info ptp_info;
+ struct ptp_clock *ptp;
+ struct ionic_lif *lif;
+};
+
struct ionic_queue_params {
unsigned int nxqs;
unsigned int ntxq_descs;
unsigned int nrxq_descs;
unsigned int intr_split;
+ u64 rxq_features;
};
static inline void ionic_init_queue_params(struct ionic_lif *lif,
@@ -230,6 +267,7 @@ static inline void ionic_init_queue_params(struct ionic_lif *lif,
qparam->ntxq_descs = lif->ntxq_descs;
qparam->nrxq_descs = lif->nrxq_descs;
qparam->intr_split = test_bit(IONIC_LIF_F_SPLIT_INTR, lif->state);
+ qparam->rxq_features = lif->rxq_features;
}
static inline u32 ionic_coal_usec_to_hw(struct ionic *ionic, u32 usecs)
@@ -262,6 +300,43 @@ void ionic_lif_unregister(struct ionic_lif *lif);
int ionic_lif_identify(struct ionic *ionic, u8 lif_type,
union ionic_lif_identity *lif_ident);
int ionic_lif_size(struct ionic *ionic);
+
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
+int ionic_lif_hwstamp_set(struct ionic_lif *lif, struct ifreq *ifr);
+int ionic_lif_hwstamp_get(struct ionic_lif *lif, struct ifreq *ifr);
+ktime_t ionic_lif_phc_ktime(struct ionic_lif *lif, u64 counter);
+void ionic_lif_register_phc(struct ionic_lif *lif);
+void ionic_lif_unregister_phc(struct ionic_lif *lif);
+void ionic_lif_alloc_phc(struct ionic_lif *lif);
+void ionic_lif_free_phc(struct ionic_lif *lif);
+#else
+static inline int ionic_lif_hwstamp_set(struct ionic_lif *lif, struct ifreq *ifr)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int ionic_lif_hwstamp_get(struct ionic_lif *lif, struct ifreq *ifr)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline ktime_t ionic_lif_phc_ktime(struct ionic_lif *lif, u64 counter)
+{
+ return ns_to_ktime(0);
+}
+
+static inline void ionic_lif_register_phc(struct ionic_lif *lif) {}
+static inline void ionic_lif_unregister_phc(struct ionic_lif *lif) {}
+static inline void ionic_lif_alloc_phc(struct ionic_lif *lif) {}
+static inline void ionic_lif_free_phc(struct ionic_lif *lif) {}
+#endif
+
+int ionic_lif_create_hwstamp_txq(struct ionic_lif *lif);
+int ionic_lif_create_hwstamp_rxq(struct ionic_lif *lif);
+int ionic_lif_config_hwstamp_rxq_all(struct ionic_lif *lif, bool rx_all);
+int ionic_lif_set_hwstamp_txmode(struct ionic_lif *lif, u16 txstamp_mode);
+int ionic_lif_set_hwstamp_rxfilt(struct ionic_lif *lif, u64 pkt_class);
+
int ionic_lif_rss_config(struct ionic_lif *lif, u16 types,
const u8 *key, const u32 *indir);
int ionic_reconfigure_queues(struct ionic_lif *lif,
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c
index c4b2906a2ae6..61cfe2120817 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_main.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c
@@ -148,6 +148,8 @@ static const char *ionic_opcode_to_str(enum ionic_cmd_opcode opcode)
return "IONIC_CMD_LIF_SETATTR";
case IONIC_CMD_LIF_GETATTR:
return "IONIC_CMD_LIF_GETATTR";
+ case IONIC_CMD_LIF_SETPHC:
+ return "IONIC_CMD_LIF_SETPHC";
case IONIC_CMD_RX_MODE_SET:
return "IONIC_CMD_RX_MODE_SET";
case IONIC_CMD_RX_FILTER_ADD:
@@ -256,7 +258,7 @@ static void ionic_adminq_cb(struct ionic_queue *q,
complete_all(&ctx->work);
}
-static int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx)
+int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx)
{
struct ionic_desc_info *desc_info;
unsigned long irqflags;
@@ -295,14 +297,12 @@ err_out:
return err;
}
-int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx)
+int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx, int err)
{
struct net_device *netdev = lif->netdev;
unsigned long remaining;
const char *name;
- int err;
- err = ionic_adminq_post(lif, ctx);
if (err) {
if (!test_bit(IONIC_LIF_F_FW_RESET, lif->state)) {
name = ionic_opcode_to_str(ctx->cmd.cmd.opcode);
@@ -317,6 +317,15 @@ int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx)
return ionic_adminq_check_err(lif, ctx, (remaining == 0));
}
+int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx)
+{
+ int err;
+
+ err = ionic_adminq_post(lif, ctx);
+
+ return ionic_adminq_wait(lif, ctx, err);
+}
+
static void ionic_dev_cmd_clean(struct ionic *ionic)
{
union __iomem ionic_dev_cmd_regs *regs = ionic->idev.dev_cmd_regs;
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_phc.c b/drivers/net/ethernet/pensando/ionic/ionic_phc.c
new file mode 100644
index 000000000000..86ae5011ac9b
--- /dev/null
+++ b/drivers/net/ethernet/pensando/ionic/ionic_phc.c
@@ -0,0 +1,589 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 - 2021 Pensando Systems, Inc */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include "ionic.h"
+#include "ionic_bus.h"
+#include "ionic_lif.h"
+#include "ionic_ethtool.h"
+
+static int ionic_hwstamp_tx_mode(int config_tx_type)
+{
+ switch (config_tx_type) {
+ case HWTSTAMP_TX_OFF:
+ return IONIC_TXSTAMP_OFF;
+ case HWTSTAMP_TX_ON:
+ return IONIC_TXSTAMP_ON;
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ return IONIC_TXSTAMP_ONESTEP_SYNC;
+#ifdef HAVE_HWSTAMP_TX_ONESTEP_P2P
+ case HWTSTAMP_TX_ONESTEP_P2P:
+ return IONIC_TXSTAMP_ONESTEP_P2P;
+#endif
+ default:
+ return -ERANGE;
+ }
+}
+
+static u64 ionic_hwstamp_rx_filt(int config_rx_filter)
+{
+ switch (config_rx_filter) {
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ return IONIC_PKT_CLS_PTP1_ALL;
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ return IONIC_PKT_CLS_PTP1_SYNC;
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ return IONIC_PKT_CLS_PTP1_SYNC | IONIC_PKT_CLS_PTP1_DREQ;
+
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ return IONIC_PKT_CLS_PTP2_L4_ALL;
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ return IONIC_PKT_CLS_PTP2_L4_SYNC;
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ return IONIC_PKT_CLS_PTP2_L4_SYNC | IONIC_PKT_CLS_PTP2_L4_DREQ;
+
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ return IONIC_PKT_CLS_PTP2_L2_ALL;
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ return IONIC_PKT_CLS_PTP2_L2_SYNC;
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ return IONIC_PKT_CLS_PTP2_L2_SYNC | IONIC_PKT_CLS_PTP2_L2_DREQ;
+
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ return IONIC_PKT_CLS_PTP2_ALL;
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ return IONIC_PKT_CLS_PTP2_SYNC;
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ return IONIC_PKT_CLS_PTP2_SYNC | IONIC_PKT_CLS_PTP2_DREQ;
+
+ case HWTSTAMP_FILTER_NTP_ALL:
+ return IONIC_PKT_CLS_NTP_ALL;
+
+ default:
+ return 0;
+ }
+}
+
+int ionic_lif_hwstamp_set(struct ionic_lif *lif, struct ifreq *ifr)
+{
+ struct ionic *ionic = lif->ionic;
+ struct hwtstamp_config config;
+ int tx_mode = 0;
+ u64 rx_filt = 0;
+ int err, err2;
+ bool rx_all;
+ __le64 mask;
+
+ if (!lif->phc || !lif->phc->ptp)
+ return -EOPNOTSUPP;
+
+ if (ifr) {
+ if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+ return -EFAULT;
+ } else {
+ /* if called with ifr == NULL, behave as if called with the
+ * current ts_config from the initial cleared state.
+ */
+ memcpy(&config, &lif->phc->ts_config, sizeof(config));
+ memset(&lif->phc->ts_config, 0, sizeof(config));
+ }
+
+ tx_mode = ionic_hwstamp_tx_mode(config.tx_type);
+ if (tx_mode < 0)
+ return tx_mode;
+
+ mask = cpu_to_le64(BIT_ULL(tx_mode));
+ if ((ionic->ident.lif.eth.hwstamp_tx_modes & mask) != mask)
+ return -ERANGE;
+
+ rx_filt = ionic_hwstamp_rx_filt(config.rx_filter);
+ rx_all = config.rx_filter != HWTSTAMP_FILTER_NONE && !rx_filt;
+
+ mask = cpu_to_le64(rx_filt);
+ if ((ionic->ident.lif.eth.hwstamp_rx_filters & mask) != mask) {
+ rx_filt = 0;
+ rx_all = true;
+ config.rx_filter = HWTSTAMP_FILTER_ALL;
+ }
+
+ dev_dbg(ionic->dev, "config_rx_filter %d rx_filt %#llx rx_all %d\n",
+ config.rx_filter, rx_filt, rx_all);
+
+ mutex_lock(&lif->phc->config_lock);
+
+ if (tx_mode) {
+ err = ionic_lif_create_hwstamp_txq(lif);
+ if (err)
+ goto err_queues;
+ }
+
+ if (rx_filt) {
+ err = ionic_lif_create_hwstamp_rxq(lif);
+ if (err)
+ goto err_queues;
+ }
+
+ if (tx_mode != lif->phc->ts_config_tx_mode) {
+ err = ionic_lif_set_hwstamp_txmode(lif, tx_mode);
+ if (err)
+ goto err_txmode;
+ }
+
+ if (rx_filt != lif->phc->ts_config_rx_filt) {
+ err = ionic_lif_set_hwstamp_rxfilt(lif, rx_filt);
+ if (err)
+ goto err_rxfilt;
+ }
+
+ if (rx_all != (lif->phc->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)) {
+ err = ionic_lif_config_hwstamp_rxq_all(lif, rx_all);
+ if (err)
+ goto err_rxall;
+ }
+
+ if (ifr) {
+ err = copy_to_user(ifr->ifr_data, &config, sizeof(config));
+ if (err) {
+ err = -EFAULT;
+ goto err_final;
+ }
+ }
+
+ memcpy(&lif->phc->ts_config, &config, sizeof(config));
+ lif->phc->ts_config_rx_filt = rx_filt;
+ lif->phc->ts_config_tx_mode = tx_mode;
+
+ mutex_unlock(&lif->phc->config_lock);
+
+ return 0;
+
+err_final:
+ if (rx_all != (lif->phc->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)) {
+ rx_all = lif->phc->ts_config.rx_filter == HWTSTAMP_FILTER_ALL;
+ err2 = ionic_lif_config_hwstamp_rxq_all(lif, rx_all);
+ if (err2)
+ dev_err(ionic->dev,
+ "Failed to revert all-rxq timestamp config: %d\n", err2);
+ }
+err_rxall:
+ if (rx_filt != lif->phc->ts_config_rx_filt) {
+ rx_filt = lif->phc->ts_config_rx_filt;
+ err2 = ionic_lif_set_hwstamp_rxfilt(lif, rx_filt);
+ if (err2)
+ dev_err(ionic->dev,
+ "Failed to revert rx timestamp filter: %d\n", err2);
+ }
+err_rxfilt:
+ if (tx_mode != lif->phc->ts_config_tx_mode) {
+ tx_mode = lif->phc->ts_config_tx_mode;
+ err2 = ionic_lif_set_hwstamp_txmode(lif, tx_mode);
+ if (err2)
+ dev_err(ionic->dev,
+ "Failed to revert tx timestamp mode: %d\n", err2);
+ }
+err_txmode:
+ /* special queues remain allocated, just unused */
+err_queues:
+ mutex_unlock(&lif->phc->config_lock);
+ return err;
+}
+
+int ionic_lif_hwstamp_get(struct ionic_lif *lif, struct ifreq *ifr)
+{
+ struct hwtstamp_config config;
+
+ if (!lif->phc || !lif->phc->ptp)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&lif->phc->config_lock);
+ memcpy(&config, &lif->phc->ts_config, sizeof(config));
+ mutex_unlock(&lif->phc->config_lock);
+
+ return copy_to_user(ifr->ifr_data, &config, sizeof(config));
+}
+
+static u64 ionic_hwstamp_read(struct ionic *ionic,
+ struct ptp_system_timestamp *sts)
+{
+ u32 tick_high_before, tick_high, tick_low;
+
+ /* read and discard low part to defeat hw staging of high part */
+ (void)ioread32(&ionic->idev.hwstamp_regs->tick_low);
+
+ tick_high_before = ioread32(&ionic->idev.hwstamp_regs->tick_high);
+
+ ptp_read_system_prets(sts);
+ tick_low = ioread32(&ionic->idev.hwstamp_regs->tick_low);
+ ptp_read_system_postts(sts);
+
+ tick_high = ioread32(&ionic->idev.hwstamp_regs->tick_high);
+
+ /* If tick_high changed, re-read tick_low once more. Assume tick_high
+ * cannot change again so soon as in the span of re-reading tick_low.
+ */
+ if (tick_high != tick_high_before) {
+ ptp_read_system_prets(sts);
+ tick_low = ioread32(&ionic->idev.hwstamp_regs->tick_low);
+ ptp_read_system_postts(sts);
+ }
+
+ return (u64)tick_low | ((u64)tick_high << 32);
+}
+
+static u64 ionic_cc_read(const struct cyclecounter *cc)
+{
+ struct ionic_phc *phc = container_of(cc, struct ionic_phc, cc);
+ struct ionic *ionic = phc->lif->ionic;
+
+ return ionic_hwstamp_read(ionic, NULL);
+}
+
+static int ionic_setphc_cmd(struct ionic_phc *phc, struct ionic_admin_ctx *ctx)
+{
+ ctx->work = COMPLETION_INITIALIZER_ONSTACK(ctx->work);
+
+ ctx->cmd.lif_setphc.opcode = IONIC_CMD_LIF_SETPHC;
+ ctx->cmd.lif_setphc.lif_index = cpu_to_le16(phc->lif->index);
+
+ ctx->cmd.lif_setphc.tick = cpu_to_le64(phc->tc.cycle_last);
+ ctx->cmd.lif_setphc.nsec = cpu_to_le64(phc->tc.nsec);
+ ctx->cmd.lif_setphc.frac = cpu_to_le64(phc->tc.frac);
+ ctx->cmd.lif_setphc.mult = cpu_to_le32(phc->cc.mult);
+ ctx->cmd.lif_setphc.shift = cpu_to_le32(phc->cc.shift);
+
+ return ionic_adminq_post(phc->lif, ctx);
+}
+
+static int ionic_phc_adjfine(struct ptp_clock_info *info, long scaled_ppm)
+{
+ struct ionic_phc *phc = container_of(info, struct ionic_phc, ptp_info);
+ struct ionic_admin_ctx ctx = {};
+ unsigned long irqflags;
+ s64 adj;
+ int err;
+
+ /* Reject phc adjustments during device upgrade */
+ if (test_bit(IONIC_LIF_F_FW_RESET, phc->lif->state))
+ return -EBUSY;
+
+ /* Adjustment value scaled by 2^16 million */
+ adj = (s64)scaled_ppm * phc->init_cc_mult;
+
+ /* Adjustment value to scale */
+ adj /= (s64)SCALED_PPM;
+
+ /* Final adjusted multiplier */
+ adj += phc->init_cc_mult;
+
+ spin_lock_irqsave(&phc->lock, irqflags);
+
+ /* update the point-in-time basis to now, before adjusting the rate */
+ timecounter_read(&phc->tc);
+ phc->cc.mult = adj;
+
+ /* Setphc commands are posted in-order, sequenced by phc->lock. We
+ * need to drop the lock before waiting for the command to complete.
+ */
+ err = ionic_setphc_cmd(phc, &ctx);
+
+ spin_unlock_irqrestore(&phc->lock, irqflags);
+
+ return ionic_adminq_wait(phc->lif, &ctx, err);
+}
+
+static int ionic_phc_adjtime(struct ptp_clock_info *info, s64 delta)
+{
+ struct ionic_phc *phc = container_of(info, struct ionic_phc, ptp_info);
+ struct ionic_admin_ctx ctx = {};
+ unsigned long irqflags;
+ int err;
+
+ /* Reject phc adjustments during device upgrade */
+ if (test_bit(IONIC_LIF_F_FW_RESET, phc->lif->state))
+ return -EBUSY;
+
+ spin_lock_irqsave(&phc->lock, irqflags);
+
+ timecounter_adjtime(&phc->tc, delta);
+
+ /* Setphc commands are posted in-order, sequenced by phc->lock. We
+ * need to drop the lock before waiting for the command to complete.
+ */
+ err = ionic_setphc_cmd(phc, &ctx);
+
+ spin_unlock_irqrestore(&phc->lock, irqflags);
+
+ return ionic_adminq_wait(phc->lif, &ctx, err);
+}
+
+static int ionic_phc_settime64(struct ptp_clock_info *info,
+ const struct timespec64 *ts)
+{
+ struct ionic_phc *phc = container_of(info, struct ionic_phc, ptp_info);
+ struct ionic_admin_ctx ctx = {};
+ unsigned long irqflags;
+ int err;
+ u64 ns;
+
+ /* Reject phc adjustments during device upgrade */
+ if (test_bit(IONIC_LIF_F_FW_RESET, phc->lif->state))
+ return -EBUSY;
+
+ ns = timespec64_to_ns(ts);
+
+ spin_lock_irqsave(&phc->lock, irqflags);
+
+ timecounter_init(&phc->tc, &phc->cc, ns);
+
+ /* Setphc commands are posted in-order, sequenced by phc->lock. We
+ * need to drop the lock before waiting for the command to complete.
+ */
+ err = ionic_setphc_cmd(phc, &ctx);
+
+ spin_unlock_irqrestore(&phc->lock, irqflags);
+
+ return ionic_adminq_wait(phc->lif, &ctx, err);
+}
+
+static int ionic_phc_gettimex64(struct ptp_clock_info *info,
+ struct timespec64 *ts,
+ struct ptp_system_timestamp *sts)
+{
+ struct ionic_phc *phc = container_of(info, struct ionic_phc, ptp_info);
+ struct ionic *ionic = phc->lif->ionic;
+ unsigned long irqflags;
+ u64 tick, ns;
+
+ /* Do not attempt to read device time during upgrade */
+ if (test_bit(IONIC_LIF_F_FW_RESET, phc->lif->state))
+ return -EBUSY;
+
+ spin_lock_irqsave(&phc->lock, irqflags);
+
+ tick = ionic_hwstamp_read(ionic, sts);
+
+ ns = timecounter_cyc2time(&phc->tc, tick);
+
+ spin_unlock_irqrestore(&phc->lock, irqflags);
+
+ *ts = ns_to_timespec64(ns);
+
+ return 0;
+}
+
+static long ionic_phc_aux_work(struct ptp_clock_info *info)
+{
+ struct ionic_phc *phc = container_of(info, struct ionic_phc, ptp_info);
+ struct ionic_admin_ctx ctx = {};
+ unsigned long irqflags;
+ int err;
+
+ /* Do not update phc during device upgrade, but keep polling to resume
+ * after upgrade. Since we don't update the point in time basis, there
+ * is no expectation that we are maintaining the phc time during the
+ * upgrade. After upgrade, it will need to be readjusted back to the
+ * correct time by the ptp daemon.
+ */
+ if (test_bit(IONIC_LIF_F_FW_RESET, phc->lif->state))
+ return phc->aux_work_delay;
+
+ spin_lock_irqsave(&phc->lock, irqflags);
+
+ /* update point-in-time basis to now */
+ timecounter_read(&phc->tc);
+
+ /* Setphc commands are posted in-order, sequenced by phc->lock. We
+ * need to drop the lock before waiting for the command to complete.
+ */
+ err = ionic_setphc_cmd(phc, &ctx);
+
+ spin_unlock_irqrestore(&phc->lock, irqflags);
+
+ ionic_adminq_wait(phc->lif, &ctx, err);
+
+ return phc->aux_work_delay;
+}
+
+ktime_t ionic_lif_phc_ktime(struct ionic_lif *lif, u64 tick)
+{
+ unsigned long irqflags;
+ u64 ns;
+
+ if (!lif->phc)
+ return 0;
+
+ spin_lock_irqsave(&lif->phc->lock, irqflags);
+ ns = timecounter_cyc2time(&lif->phc->tc, tick);
+ spin_unlock_irqrestore(&lif->phc->lock, irqflags);
+
+ return ns_to_ktime(ns);
+}
+
+static const struct ptp_clock_info ionic_ptp_info = {
+ .owner = THIS_MODULE,
+ .name = "ionic_ptp",
+ .adjfine = ionic_phc_adjfine,
+ .adjtime = ionic_phc_adjtime,
+ .gettimex64 = ionic_phc_gettimex64,
+ .settime64 = ionic_phc_settime64,
+ .do_aux_work = ionic_phc_aux_work,
+};
+
+void ionic_lif_register_phc(struct ionic_lif *lif)
+{
+ if (!lif->phc || !(lif->hw_features & IONIC_ETH_HW_TIMESTAMP))
+ return;
+
+ lif->phc->ptp = ptp_clock_register(&lif->phc->ptp_info, lif->ionic->dev);
+
+ if (IS_ERR(lif->phc->ptp)) {
+ dev_warn(lif->ionic->dev, "Cannot register phc device: %ld\n",
+ PTR_ERR(lif->phc->ptp));
+
+ lif->phc->ptp = NULL;
+ }
+
+ if (lif->phc->ptp)
+ ptp_schedule_worker(lif->phc->ptp, lif->phc->aux_work_delay);
+}
+
+void ionic_lif_unregister_phc(struct ionic_lif *lif)
+{
+ if (!lif->phc || !lif->phc->ptp)
+ return;
+
+ ptp_clock_unregister(lif->phc->ptp);
+
+ lif->phc->ptp = NULL;
+}
+
+void ionic_lif_alloc_phc(struct ionic_lif *lif)
+{
+ struct ionic *ionic = lif->ionic;
+ struct ionic_phc *phc;
+ u64 delay, diff, mult;
+ u64 frac = 0;
+ u64 features;
+ u32 shift;
+
+ if (!ionic->idev.hwstamp_regs)
+ return;
+
+ features = le64_to_cpu(ionic->ident.lif.eth.config.features);
+ if (!(features & IONIC_ETH_HW_TIMESTAMP))
+ return;
+
+ phc = devm_kzalloc(ionic->dev, sizeof(*phc), GFP_KERNEL);
+ if (!phc)
+ return;
+
+ phc->lif = lif;
+
+ phc->cc.read = ionic_cc_read;
+ phc->cc.mask = le64_to_cpu(ionic->ident.dev.hwstamp_mask);
+ phc->cc.mult = le32_to_cpu(ionic->ident.dev.hwstamp_mult);
+ phc->cc.shift = le32_to_cpu(ionic->ident.dev.hwstamp_shift);
+
+ if (!phc->cc.mult) {
+ dev_err(lif->ionic->dev,
+ "Invalid device PHC mask multiplier %u, disabling HW timestamp support\n",
+ phc->cc.mult);
+ devm_kfree(lif->ionic->dev, phc);
+ lif->phc = NULL;
+ return;
+ }
+
+ dev_dbg(lif->ionic->dev, "Device PHC mask %#llx mult %u shift %u\n",
+ phc->cc.mask, phc->cc.mult, phc->cc.shift);
+
+ spin_lock_init(&phc->lock);
+ mutex_init(&phc->config_lock);
+
+ /* max ticks is limited by the multiplier, or by the update period. */
+ if (phc->cc.shift + 2 + ilog2(IONIC_PHC_UPDATE_NS) >= 64) {
+ /* max ticks that do not overflow when multiplied by max
+ * adjusted multiplier (twice the initial multiplier)
+ */
+ diff = U64_MAX / phc->cc.mult / 2;
+ } else {
+ /* approx ticks at four times the update period */
+ diff = (u64)IONIC_PHC_UPDATE_NS << (phc->cc.shift + 2);
+ diff = DIV_ROUND_UP(diff, phc->cc.mult);
+ }
+
+ /* transform to bitmask */
+ diff |= diff >> 1;
+ diff |= diff >> 2;
+ diff |= diff >> 4;
+ diff |= diff >> 8;
+ diff |= diff >> 16;
+ diff |= diff >> 32;
+
+ /* constrain to the hardware bitmask, and use this as the bitmask */
+ diff &= phc->cc.mask;
+ phc->cc.mask = diff;
+
+ /* the wrap period is now defined by diff (or phc->cc.mask)
+ *
+ * we will update the time basis at about 1/4 the wrap period, so
+ * should not see a difference of more than +/- diff/4.
+ *
+ * this is sufficient not see a difference of more than +/- diff/2, as
+ * required by timecounter_cyc2time, to detect an old time stamp.
+ *
+ * adjust the initial multiplier, being careful to avoid overflow:
+ * - do not overflow 63 bits: init_cc_mult * SCALED_PPM
+ * - do not overflow 64 bits: max_mult * (diff / 2)
+ *
+ * we want to increase the initial multiplier as much as possible, to
+ * allow for more precise adjustment in ionic_phc_adjfine.
+ *
+ * only adjust the multiplier if we can double it or more.
+ */
+ mult = U64_MAX / 2 / max(diff / 2, SCALED_PPM);
+ shift = mult / phc->cc.mult;
+ if (shift >= 2) {
+ /* initial multiplier will be 2^n of hardware cc.mult */
+ shift = fls(shift);
+ /* increase cc.mult and cc.shift by the same 2^n and n. */
+ phc->cc.mult <<= shift;
+ phc->cc.shift += shift;
+ }
+
+ dev_dbg(lif->ionic->dev, "Initial PHC mask %#llx mult %u shift %u\n",
+ phc->cc.mask, phc->cc.mult, phc->cc.shift);
+
+ /* frequency adjustments are relative to the initial multiplier */
+ phc->init_cc_mult = phc->cc.mult;
+
+ timecounter_init(&phc->tc, &phc->cc, ktime_get_real_ns());
+
+ /* Update cycle_last at 1/4 the wrap period, or IONIC_PHC_UPDATE_NS */
+ delay = min_t(u64, IONIC_PHC_UPDATE_NS,
+ cyclecounter_cyc2ns(&phc->cc, diff / 4, 0, &frac));
+ dev_dbg(lif->ionic->dev, "Work delay %llu ms\n", delay / NSEC_PER_MSEC);
+
+ phc->aux_work_delay = nsecs_to_jiffies(delay);
+
+ phc->ptp_info = ionic_ptp_info;
+
+ /* We have allowed to adjust the multiplier up to +/- 1 part per 1.
+ * Here expressed as NORMAL_PPB (1 billion parts per billion).
+ */
+ phc->ptp_info.max_adj = NORMAL_PPB;
+
+ lif->phc = phc;
+}
+
+void ionic_lif_free_phc(struct ionic_lif *lif)
+{
+ if (!lif->phc)
+ return;
+
+ mutex_destroy(&lif->phc->config_lock);
+
+ devm_kfree(lif->ionic->dev, lif->phc);
+ lif->phc = NULL;
+}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c
index cd0076fc3044..d71316d9ded2 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c
@@ -140,6 +140,9 @@ int ionic_rx_filter_save(struct ionic_lif *lif, u32 flow_id, u16 rxq_index,
case IONIC_RX_FILTER_MATCH_MAC_VLAN:
key = le16_to_cpu(ac->mac_vlan.vlan);
break;
+ case IONIC_RX_FILTER_STEER_PKTCLASS:
+ key = 0;
+ break;
default:
return -EINVAL;
}
@@ -210,3 +213,21 @@ struct ionic_rx_filter *ionic_rx_filter_by_addr(struct ionic_lif *lif,
return NULL;
}
+
+struct ionic_rx_filter *ionic_rx_filter_rxsteer(struct ionic_lif *lif)
+{
+ struct ionic_rx_filter *f;
+ struct hlist_head *head;
+ unsigned int key;
+
+ key = hash_32(0, IONIC_RX_FILTER_HASH_BITS);
+ head = &lif->rx_filters.by_hash[key];
+
+ hlist_for_each_entry(f, head, by_hash) {
+ if (le16_to_cpu(f->cmd.match) != IONIC_RX_FILTER_STEER_PKTCLASS)
+ continue;
+ return f;
+ }
+
+ return NULL;
+}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
index cf8f4c0a961c..1ead48be3c83 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
@@ -31,5 +31,6 @@ int ionic_rx_filter_save(struct ionic_lif *lif, u32 flow_id, u16 rxq_index,
u32 hash, struct ionic_admin_ctx *ctx);
struct ionic_rx_filter *ionic_rx_filter_by_vlan(struct ionic_lif *lif, u16 vid);
struct ionic_rx_filter *ionic_rx_filter_by_addr(struct ionic_lif *lif, const u8 *addr);
+struct ionic_rx_filter *ionic_rx_filter_rxsteer(struct ionic_lif *lif);
#endif /* _IONIC_RX_FILTER_H_ */
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_stats.c b/drivers/net/ethernet/pensando/ionic/ionic_stats.c
index ed9cf93d9acd..58a854666c62 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_stats.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_stats.c
@@ -130,6 +130,8 @@ static const struct ionic_stat_desc ionic_tx_stats_desc[] = {
IONIC_TX_STAT_DESC(frags),
IONIC_TX_STAT_DESC(tso),
IONIC_TX_STAT_DESC(tso_bytes),
+ IONIC_TX_STAT_DESC(hwstamp_valid),
+ IONIC_TX_STAT_DESC(hwstamp_invalid),
IONIC_TX_STAT_DESC(csum_none),
IONIC_TX_STAT_DESC(csum),
IONIC_TX_STAT_DESC(vlan_inserted),
@@ -143,6 +145,8 @@ static const struct ionic_stat_desc ionic_rx_stats_desc[] = {
IONIC_RX_STAT_DESC(csum_none),
IONIC_RX_STAT_DESC(csum_complete),
IONIC_RX_STAT_DESC(csum_error),
+ IONIC_RX_STAT_DESC(hwstamp_valid),
+ IONIC_RX_STAT_DESC(hwstamp_invalid),
IONIC_RX_STAT_DESC(dropped),
IONIC_RX_STAT_DESC(vlan_stripped),
};
@@ -188,6 +192,8 @@ static void ionic_add_lif_txq_stats(struct ionic_lif *lif, int q_num,
stats->tx_tso_bytes += txstats->tso_bytes;
stats->tx_csum_none += txstats->csum_none;
stats->tx_csum += txstats->csum;
+ stats->tx_hwstamp_valid += txstats->hwstamp_valid;
+ stats->tx_hwstamp_invalid += txstats->hwstamp_invalid;
}
static void ionic_add_lif_rxq_stats(struct ionic_lif *lif, int q_num,
@@ -200,6 +206,8 @@ static void ionic_add_lif_rxq_stats(struct ionic_lif *lif, int q_num,
stats->rx_csum_none += rxstats->csum_none;
stats->rx_csum_complete += rxstats->csum_complete;
stats->rx_csum_error += rxstats->csum_error;
+ stats->rx_hwstamp_valid += rxstats->hwstamp_valid;
+ stats->rx_hwstamp_invalid += rxstats->hwstamp_invalid;
}
static void ionic_get_lif_stats(struct ionic_lif *lif,
@@ -215,6 +223,12 @@ static void ionic_get_lif_stats(struct ionic_lif *lif,
ionic_add_lif_rxq_stats(lif, q_num, stats);
}
+ if (lif->hwstamp_txq)
+ ionic_add_lif_txq_stats(lif, lif->hwstamp_txq->q.index, stats);
+
+ if (lif->hwstamp_rxq)
+ ionic_add_lif_rxq_stats(lif, lif->hwstamp_rxq->q.index, stats);
+
ionic_get_stats64(lif->netdev, &ns);
stats->hw_tx_dropped = ns.tx_dropped;
stats->hw_rx_dropped = ns.rx_dropped;
@@ -227,14 +241,18 @@ static u64 ionic_sw_stats_get_count(struct ionic_lif *lif)
{
u64 total = 0, tx_queues = MAX_Q(lif), rx_queues = MAX_Q(lif);
- /* lif stats */
+ if (lif->hwstamp_txq)
+ tx_queues += 1;
+
+ if (lif->hwstamp_rxq)
+ rx_queues += 1;
+
total += IONIC_NUM_LIF_STATS;
+ total += IONIC_NUM_PORT_STATS;
+
total += tx_queues * IONIC_NUM_TX_STATS;
total += rx_queues * IONIC_NUM_RX_STATS;
- /* port stats */
- total += IONIC_NUM_PORT_STATS;
-
if (test_bit(IONIC_LIF_F_UP, lif->state) &&
test_bit(IONIC_LIF_F_SW_DEBUG_STATS, lif->state)) {
/* tx debug stats */
@@ -318,8 +336,14 @@ static void ionic_sw_stats_get_strings(struct ionic_lif *lif, u8 **buf)
for (q_num = 0; q_num < MAX_Q(lif); q_num++)
ionic_sw_stats_get_tx_strings(lif, buf, q_num);
+ if (lif->hwstamp_txq)
+ ionic_sw_stats_get_tx_strings(lif, buf, lif->hwstamp_txq->q.index);
+
for (q_num = 0; q_num < MAX_Q(lif); q_num++)
ionic_sw_stats_get_rx_strings(lif, buf, q_num);
+
+ if (lif->hwstamp_rxq)
+ ionic_sw_stats_get_rx_strings(lif, buf, lif->hwstamp_rxq->q.index);
}
static void ionic_sw_stats_get_txq_values(struct ionic_lif *lif, u64 **buf,
@@ -434,8 +458,14 @@ static void ionic_sw_stats_get_values(struct ionic_lif *lif, u64 **buf)
for (q_num = 0; q_num < MAX_Q(lif); q_num++)
ionic_sw_stats_get_txq_values(lif, buf, q_num);
+ if (lif->hwstamp_txq)
+ ionic_sw_stats_get_txq_values(lif, buf, lif->hwstamp_txq->q.index);
+
for (q_num = 0; q_num < MAX_Q(lif); q_num++)
ionic_sw_stats_get_rxq_values(lif, buf, q_num);
+
+ if (lif->hwstamp_rxq)
+ ionic_sw_stats_get_rxq_values(lif, buf, lif->hwstamp_rxq->q.index);
}
const struct ionic_stats_group_intf ionic_stats_groups[] = {
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
index 42d29cd2ca47..3478b0f2495f 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
@@ -11,8 +11,6 @@
#include "ionic_txrx.h"
-static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
-
static inline void ionic_txq_post(struct ionic_queue *q, bool ring_dbell,
ionic_desc_cb cb_func, void *cb_arg)
{
@@ -229,12 +227,14 @@ static void ionic_rx_clean(struct ionic_queue *q,
struct ionic_cq_info *cq_info,
void *cb_arg)
{
- struct ionic_rxq_comp *comp = cq_info->rxcq;
struct net_device *netdev = q->lif->netdev;
struct ionic_qcq *qcq = q_to_qcq(q);
struct ionic_rx_stats *stats;
+ struct ionic_rxq_comp *comp;
struct sk_buff *skb;
+ comp = cq_info->cq_desc + qcq->cq.desc_size - sizeof(*comp);
+
stats = q_to_rx_stats(q);
if (comp->status) {
@@ -296,17 +296,39 @@ static void ionic_rx_clean(struct ionic_queue *q,
stats->vlan_stripped++;
}
+ if (unlikely(q->features & IONIC_RXQ_F_HWSTAMP)) {
+ __le64 *cq_desc_hwstamp;
+ u64 hwstamp;
+
+ cq_desc_hwstamp =
+ cq_info->cq_desc +
+ qcq->cq.desc_size -
+ sizeof(struct ionic_rxq_comp) -
+ IONIC_HWSTAMP_CQ_NEGOFFSET;
+
+ hwstamp = le64_to_cpu(*cq_desc_hwstamp);
+
+ if (hwstamp != IONIC_HWSTAMP_INVALID) {
+ skb_hwtstamps(skb)->hwtstamp = ionic_lif_phc_ktime(q->lif, hwstamp);
+ stats->hwstamp_valid++;
+ } else {
+ stats->hwstamp_invalid++;
+ }
+ }
+
if (le16_to_cpu(comp->len) <= q->lif->rx_copybreak)
napi_gro_receive(&qcq->napi, skb);
else
napi_gro_frags(&qcq->napi);
}
-static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
+bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
{
- struct ionic_rxq_comp *comp = cq_info->rxcq;
struct ionic_queue *q = cq->bound_q;
struct ionic_desc_info *desc_info;
+ struct ionic_rxq_comp *comp;
+
+ comp = cq_info->cq_desc + cq->desc_size - sizeof(*comp);
if (!color_match(comp->pkt_type_color, cq->done_color))
return false;
@@ -661,9 +683,11 @@ static void ionic_tx_clean(struct ionic_queue *q,
{
struct ionic_buf_info *buf_info = desc_info->bufs;
struct ionic_tx_stats *stats = q_to_tx_stats(q);
+ struct ionic_qcq *qcq = q_to_qcq(q);
+ struct sk_buff *skb = cb_arg;
struct device *dev = q->dev;
- u16 queue_index;
unsigned int i;
+ u16 qi;
if (desc_info->nbufs) {
dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
@@ -674,32 +698,59 @@ static void ionic_tx_clean(struct ionic_queue *q,
buf_info->len, DMA_TO_DEVICE);
}
- if (cb_arg) {
- struct sk_buff *skb = cb_arg;
+ if (!skb)
+ return;
- queue_index = skb_get_queue_mapping(skb);
- if (unlikely(__netif_subqueue_stopped(q->lif->netdev,
- queue_index))) {
- netif_wake_subqueue(q->lif->netdev, queue_index);
- q->wake++;
- }
+ qi = skb_get_queue_mapping(skb);
+
+ if (unlikely(q->features & IONIC_TXQ_F_HWSTAMP)) {
+ if (cq_info) {
+ struct skb_shared_hwtstamps hwts = {};
+ __le64 *cq_desc_hwstamp;
+ u64 hwstamp;
- desc_info->bytes = skb->len;
- stats->clean++;
+ cq_desc_hwstamp =
+ cq_info->cq_desc +
+ qcq->cq.desc_size -
+ sizeof(struct ionic_txq_comp) -
+ IONIC_HWSTAMP_CQ_NEGOFFSET;
- dev_consume_skb_any(skb);
+ hwstamp = le64_to_cpu(*cq_desc_hwstamp);
+
+ if (hwstamp != IONIC_HWSTAMP_INVALID) {
+ hwts.hwtstamp = ionic_lif_phc_ktime(q->lif, hwstamp);
+
+ skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+ skb_tstamp_tx(skb, &hwts);
+
+ stats->hwstamp_valid++;
+ } else {
+ stats->hwstamp_invalid++;
+ }
+ }
+
+ } else if (unlikely(__netif_subqueue_stopped(q->lif->netdev, qi))) {
+ netif_wake_subqueue(q->lif->netdev, qi);
+ q->wake++;
}
+
+ desc_info->bytes = skb->len;
+ stats->clean++;
+
+ dev_consume_skb_any(skb);
}
-static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
+bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
{
- struct ionic_txq_comp *comp = cq_info->txcq;
struct ionic_queue *q = cq->bound_q;
struct ionic_desc_info *desc_info;
+ struct ionic_txq_comp *comp;
int bytes = 0;
int pkts = 0;
u16 index;
+ comp = cq_info->cq_desc + cq->desc_size - sizeof(*comp);
+
if (!color_match(comp->color, cq->done_color))
return false;
@@ -720,7 +771,7 @@ static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
desc_info->cb_arg = NULL;
} while (index != le16_to_cpu(comp->comp_index));
- if (pkts && bytes)
+ if (pkts && bytes && !unlikely(q->features & IONIC_TXQ_F_HWSTAMP))
netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
return true;
@@ -758,7 +809,7 @@ void ionic_tx_empty(struct ionic_queue *q)
desc_info->cb_arg = NULL;
}
- if (pkts && bytes)
+ if (pkts && bytes && !unlikely(q->features & IONIC_TXQ_F_HWSTAMP))
netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
}
@@ -832,7 +883,8 @@ static void ionic_tx_tso_post(struct ionic_queue *q, struct ionic_txq_desc *desc
if (start) {
skb_tx_timestamp(skb);
- netdev_tx_sent_queue(q_to_ndq(q), skb->len);
+ if (!unlikely(q->features & IONIC_TXQ_F_HWSTAMP))
+ netdev_tx_sent_queue(q_to_ndq(q), skb->len);
ionic_txq_post(q, false, ionic_tx_clean, skb);
} else {
ionic_txq_post(q, done, NULL, NULL);
@@ -1079,7 +1131,8 @@ static int ionic_tx(struct ionic_queue *q, struct sk_buff *skb)
stats->pkts++;
stats->bytes += skb->len;
- netdev_tx_sent_queue(q_to_ndq(q), skb->len);
+ if (!unlikely(q->features & IONIC_TXQ_F_HWSTAMP))
+ netdev_tx_sent_queue(q_to_ndq(q), skb->len);
ionic_txq_post(q, !netdev_xmit_more(), ionic_tx_clean, skb);
return 0;
@@ -1131,6 +1184,41 @@ static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs)
return stopped;
}
+static netdev_tx_t ionic_start_hwstamp_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct ionic_lif *lif = netdev_priv(netdev);
+ struct ionic_queue *q = &lif->hwstamp_txq->q;
+ int err, ndescs;
+
+ /* Does not stop/start txq, because we post to a separate tx queue
+ * for timestamping, and if a packet can't be posted immediately to
+ * the timestamping queue, it is dropped.
+ */
+
+ ndescs = ionic_tx_descs_needed(q, skb);
+ if (unlikely(ndescs < 0))
+ goto err_out_drop;
+
+ if (unlikely(!ionic_q_has_space(q, ndescs)))
+ goto err_out_drop;
+
+ if (skb_is_gso(skb))
+ err = ionic_tx_tso(q, skb);
+ else
+ err = ionic_tx(q, skb);
+
+ if (err)
+ goto err_out_drop;
+
+ return NETDEV_TX_OK;
+
+err_out_drop:
+ q->drop++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
netdev_tx_t ionic_start_xmit(struct sk_buff *skb, struct net_device *netdev)
{
u16 queue_index = skb_get_queue_mapping(skb);
@@ -1144,6 +1232,10 @@ netdev_tx_t ionic_start_xmit(struct sk_buff *skb, struct net_device *netdev)
return NETDEV_TX_OK;
}
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
+ if (lif->hwstamp_txq)
+ return ionic_start_hwstamp_xmit(skb, netdev);
+
if (unlikely(queue_index >= lif->nxqs))
queue_index = 0;
q = &lif->txqcqs[queue_index]->q;
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.h b/drivers/net/ethernet/pensando/ionic/ionic_txrx.h
index 7667b72232b8..d7cbaad8a6fb 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.h
@@ -14,4 +14,7 @@ int ionic_tx_napi(struct napi_struct *napi, int budget);
int ionic_txrx_napi(struct napi_struct *napi, int budget);
netdev_tx_t ionic_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
+bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
+
#endif /* _IONIC_TXRX_H_ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile
index 366740ab9c5a..f2e478b884b0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Makefile
+++ b/drivers/net/ethernet/stmicro/stmmac/Makefile
@@ -6,6 +6,7 @@ stmmac-objs:= stmmac_main.o stmmac_ethtool.o stmmac_mdio.o ring_mode.o \
mmc_core.o stmmac_hwtstamp.o stmmac_ptp.o dwmac4_descs.o \
dwmac4_dma.o dwmac4_lib.o dwmac4_core.o dwmac5.o hwif.o \
stmmac_tc.o dwxgmac2_core.o dwxgmac2_dma.o dwxgmac2_descs.o \
+ stmmac_xdp.o \
$(stmmac-y)
stmmac-$(CONFIG_STMMAC_SELFTESTS) += stmmac_selftests.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 9966f6f10905..c49debb62b05 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -36,12 +36,19 @@ struct stmmac_resources {
int tx_irq[MTL_MAX_TX_QUEUES];
};
+enum stmmac_txbuf_type {
+ STMMAC_TXBUF_T_SKB,
+ STMMAC_TXBUF_T_XDP_TX,
+ STMMAC_TXBUF_T_XDP_NDO,
+};
+
struct stmmac_tx_info {
dma_addr_t buf;
bool map_as_page;
unsigned len;
bool last_segment;
bool is_jumbo;
+ enum stmmac_txbuf_type buf_type;
};
#define STMMAC_TBS_AVAIL BIT(0)
@@ -57,7 +64,10 @@ struct stmmac_tx_queue {
struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp;
struct dma_edesc *dma_entx;
struct dma_desc *dma_tx;
- struct sk_buff **tx_skbuff;
+ union {
+ struct sk_buff **tx_skbuff;
+ struct xdp_frame **xdpf;
+ };
struct stmmac_tx_info *tx_skbuff_dma;
unsigned int cur_tx;
unsigned int dirty_tx;
@@ -68,14 +78,16 @@ struct stmmac_tx_queue {
struct stmmac_rx_buffer {
struct page *page;
- struct page *sec_page;
dma_addr_t addr;
+ __u32 page_offset;
+ struct page *sec_page;
dma_addr_t sec_addr;
};
struct stmmac_rx_queue {
u32 rx_count_frames;
u32 queue_index;
+ struct xdp_rxq_info xdp_rxq;
struct page_pool *page_pool;
struct stmmac_rx_buffer *buf_pool;
struct stmmac_priv *priv_data;
@@ -160,6 +172,7 @@ struct stmmac_priv {
bool tx_path_in_lpi_mode;
bool tso;
int sph;
+ int sph_cap;
u32 sarc_type;
unsigned int dma_buf_sz;
@@ -268,6 +281,9 @@ struct stmmac_priv {
/* Receive Side Scaling */
struct stmmac_rss rss;
+
+ /* XDP BPF Program */
+ struct bpf_prog *xdp_prog;
};
enum stmmac_state {
@@ -284,6 +300,8 @@ void stmmac_set_ethtool_ops(struct net_device *netdev);
void stmmac_ptp_register(struct stmmac_priv *priv);
void stmmac_ptp_unregister(struct stmmac_priv *priv);
+int stmmac_open(struct net_device *dev);
+int stmmac_release(struct net_device *dev);
int stmmac_resume(struct device *dev);
int stmmac_suspend(struct device *dev);
int stmmac_dvr_remove(struct device *dev);
@@ -297,6 +315,19 @@ int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size);
int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled);
void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable);
+static inline bool stmmac_xdp_is_enabled(struct stmmac_priv *priv)
+{
+ return !!priv->xdp_prog;
+}
+
+static inline unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
+{
+ if (stmmac_xdp_is_enabled(priv))
+ return XDP_PACKET_HEADROOM;
+
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_STMMAC_SELFTESTS)
void stmmac_selftest_run(struct net_device *dev,
struct ethtool_test *etest, u64 *buf);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d34388b1ffcc..77285646c5fc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -38,9 +38,11 @@
#include <linux/net_tstamp.h>
#include <linux/phylink.h>
#include <linux/udp.h>
+#include <linux/bpf_trace.h>
#include <net/pkt_cls.h>
#include "stmmac_ptp.h"
#include "stmmac.h"
+#include "stmmac_xdp.h"
#include <linux/reset.h>
#include <linux/of_mdio.h>
#include "dwmac1000.h"
@@ -67,6 +69,11 @@ MODULE_PARM_DESC(phyaddr, "Physical device address");
#define STMMAC_TX_THRESH(x) ((x)->dma_tx_size / 4)
#define STMMAC_RX_THRESH(x) ((x)->dma_rx_size / 4)
+#define STMMAC_XDP_PASS 0
+#define STMMAC_XDP_CONSUMED BIT(0)
+#define STMMAC_XDP_TX BIT(1)
+#define STMMAC_XDP_REDIRECT BIT(2)
+
static int flow_ctrl = FLOW_AUTO;
module_param(flow_ctrl, int, 0644);
MODULE_PARM_DESC(flow_ctrl, "Flow control ability [on/off]");
@@ -1384,6 +1391,7 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
buf->page = page_pool_dev_alloc_pages(rx_q->page_pool);
if (!buf->page)
return -ENOMEM;
+ buf->page_offset = stmmac_rx_offset(priv);
if (priv->sph) {
buf->sec_page = page_pool_dev_alloc_pages(rx_q->page_pool);
@@ -1397,7 +1405,8 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
stmmac_set_desc_sec_addr(priv, p, buf->sec_addr, false);
}
- buf->addr = page_pool_get_dma_addr(buf->page);
+ buf->addr = page_pool_get_dma_addr(buf->page) + buf->page_offset;
+
stmmac_set_desc_addr(priv, p, buf->addr);
if (priv->dma_buf_sz == BUF_SIZE_16KiB)
stmmac_init_desc3(priv, p);
@@ -1435,7 +1444,8 @@ static void stmmac_free_tx_buffer(struct stmmac_priv *priv, u32 queue, int i)
{
struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
- if (tx_q->tx_skbuff_dma[i].buf) {
+ if (tx_q->tx_skbuff_dma[i].buf &&
+ tx_q->tx_skbuff_dma[i].buf_type != STMMAC_TXBUF_T_XDP_TX) {
if (tx_q->tx_skbuff_dma[i].map_as_page)
dma_unmap_page(priv->device,
tx_q->tx_skbuff_dma[i].buf,
@@ -1448,12 +1458,21 @@ static void stmmac_free_tx_buffer(struct stmmac_priv *priv, u32 queue, int i)
DMA_TO_DEVICE);
}
- if (tx_q->tx_skbuff[i]) {
+ if (tx_q->xdpf[i] &&
+ (tx_q->tx_skbuff_dma[i].buf_type == STMMAC_TXBUF_T_XDP_TX ||
+ tx_q->tx_skbuff_dma[i].buf_type == STMMAC_TXBUF_T_XDP_NDO)) {
+ xdp_return_frame(tx_q->xdpf[i]);
+ tx_q->xdpf[i] = NULL;
+ }
+
+ if (tx_q->tx_skbuff[i] &&
+ tx_q->tx_skbuff_dma[i].buf_type == STMMAC_TXBUF_T_SKB) {
dev_kfree_skb_any(tx_q->tx_skbuff[i]);
tx_q->tx_skbuff[i] = NULL;
- tx_q->tx_skbuff_dma[i].buf = 0;
- tx_q->tx_skbuff_dma[i].map_as_page = false;
}
+
+ tx_q->tx_skbuff_dma[i].buf = 0;
+ tx_q->tx_skbuff_dma[i].map_as_page = false;
}
/**
@@ -1503,7 +1522,8 @@ static void stmmac_reinit_rx_buffers(struct stmmac_priv *priv)
if (!buf->page)
goto err_reinit_rx_buffers;
- buf->addr = page_pool_get_dma_addr(buf->page);
+ buf->addr = page_pool_get_dma_addr(buf->page) +
+ buf->page_offset;
}
if (priv->sph && !buf->sec_page) {
@@ -1560,6 +1580,7 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
for (queue = 0; queue < rx_count; queue++) {
struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
+ int ret;
netif_dbg(priv, probe, priv->dev,
"(%s) dma_rx_phy=0x%08x\n", __func__,
@@ -1567,6 +1588,14 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
stmmac_clear_rx_descriptors(priv, queue);
+ WARN_ON(xdp_rxq_info_reg_mem_model(&rx_q->xdp_rxq,
+ MEM_TYPE_PAGE_POOL,
+ rx_q->page_pool));
+
+ netdev_info(priv->dev,
+ "Register MEM_TYPE_PAGE_POOL RxQ-%d\n",
+ rx_q->queue_index);
+
for (i = 0; i < priv->dma_rx_size; i++) {
struct dma_desc *p;
@@ -1767,6 +1796,9 @@ static void free_dma_rx_desc_resources(struct stmmac_priv *priv)
sizeof(struct dma_extended_desc),
rx_q->dma_erx, rx_q->dma_rx_phy);
+ if (xdp_rxq_info_is_reg(&rx_q->xdp_rxq))
+ xdp_rxq_info_unreg(&rx_q->xdp_rxq);
+
kfree(rx_q->buf_pool);
if (rx_q->page_pool)
page_pool_destroy(rx_q->page_pool);
@@ -1821,6 +1853,7 @@ static void free_dma_tx_desc_resources(struct stmmac_priv *priv)
*/
static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
{
+ bool xdp_prog = stmmac_xdp_is_enabled(priv);
u32 rx_count = priv->plat->rx_queues_to_use;
int ret = -ENOMEM;
u32 queue;
@@ -1828,19 +1861,23 @@ static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
/* RX queues buffers and DMA */
for (queue = 0; queue < rx_count; queue++) {
struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
+ struct stmmac_channel *ch = &priv->channel[queue];
struct page_pool_params pp_params = { 0 };
unsigned int num_pages;
+ int ret;
rx_q->queue_index = queue;
rx_q->priv_data = priv;
- pp_params.flags = PP_FLAG_DMA_MAP;
+ pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
pp_params.pool_size = priv->dma_rx_size;
num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE);
pp_params.order = ilog2(num_pages);
pp_params.nid = dev_to_node(priv->device);
pp_params.dev = priv->device;
- pp_params.dma_dir = DMA_FROM_DEVICE;
+ pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
+ pp_params.offset = stmmac_rx_offset(priv);
+ pp_params.max_len = STMMAC_MAX_RX_BUF_SIZE(num_pages);
rx_q->page_pool = page_pool_create(&pp_params);
if (IS_ERR(rx_q->page_pool)) {
@@ -1873,6 +1910,14 @@ static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
if (!rx_q->dma_rx)
goto err_dma;
}
+
+ ret = xdp_rxq_info_reg(&rx_q->xdp_rxq, priv->dev,
+ rx_q->queue_index,
+ ch->rx_napi.napi_id);
+ if (ret) {
+ netdev_err(priv->dev, "Failed to register xdp rxq info\n");
+ goto err_dma;
+ }
}
return 0;
@@ -1974,11 +2019,13 @@ static int alloc_dma_desc_resources(struct stmmac_priv *priv)
*/
static void free_dma_desc_resources(struct stmmac_priv *priv)
{
- /* Release the DMA RX socket buffers */
- free_dma_rx_desc_resources(priv);
-
/* Release the DMA TX socket buffers */
free_dma_tx_desc_resources(priv);
+
+ /* Release the DMA RX socket buffers later
+ * to ensure all pending XDP_TX buffers are returned.
+ */
+ free_dma_rx_desc_resources(priv);
}
/**
@@ -2170,10 +2217,23 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue)
entry = tx_q->dirty_tx;
while ((entry != tx_q->cur_tx) && (count < budget)) {
- struct sk_buff *skb = tx_q->tx_skbuff[entry];
+ struct xdp_frame *xdpf;
+ struct sk_buff *skb;
struct dma_desc *p;
int status;
+ if (tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_XDP_TX ||
+ tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_XDP_NDO) {
+ xdpf = tx_q->xdpf[entry];
+ skb = NULL;
+ } else if (tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_SKB) {
+ xdpf = NULL;
+ skb = tx_q->tx_skbuff[entry];
+ } else {
+ xdpf = NULL;
+ skb = NULL;
+ }
+
if (priv->extend_desc)
p = (struct dma_desc *)(tx_q->dma_etx + entry);
else if (tx_q->tbs & STMMAC_TBS_AVAIL)
@@ -2203,10 +2263,12 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue)
priv->dev->stats.tx_packets++;
priv->xstats.tx_pkt_n++;
}
- stmmac_get_tx_hwtstamp(priv, p, skb);
+ if (skb)
+ stmmac_get_tx_hwtstamp(priv, p, skb);
}
- if (likely(tx_q->tx_skbuff_dma[entry].buf)) {
+ if (likely(tx_q->tx_skbuff_dma[entry].buf &&
+ tx_q->tx_skbuff_dma[entry].buf_type != STMMAC_TXBUF_T_XDP_TX)) {
if (tx_q->tx_skbuff_dma[entry].map_as_page)
dma_unmap_page(priv->device,
tx_q->tx_skbuff_dma[entry].buf,
@@ -2227,11 +2289,25 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue)
tx_q->tx_skbuff_dma[entry].last_segment = false;
tx_q->tx_skbuff_dma[entry].is_jumbo = false;
- if (likely(skb != NULL)) {
- pkts_compl++;
- bytes_compl += skb->len;
- dev_consume_skb_any(skb);
- tx_q->tx_skbuff[entry] = NULL;
+ if (xdpf &&
+ tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_XDP_TX) {
+ xdp_return_frame_rx_napi(xdpf);
+ tx_q->xdpf[entry] = NULL;
+ }
+
+ if (xdpf &&
+ tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_XDP_NDO) {
+ xdp_return_frame(xdpf);
+ tx_q->xdpf[entry] = NULL;
+ }
+
+ if (tx_q->tx_skbuff_dma[entry].buf_type == STMMAC_TXBUF_T_SKB) {
+ if (likely(skb)) {
+ pkts_compl++;
+ bytes_compl += skb->len;
+ dev_consume_skb_any(skb);
+ tx_q->tx_skbuff[entry] = NULL;
+ }
}
stmmac_release_tx_desc(priv, p, priv->mode);
@@ -2858,6 +2934,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
struct stmmac_priv *priv = netdev_priv(dev);
u32 rx_cnt = priv->plat->rx_queues_to_use;
u32 tx_cnt = priv->plat->tx_queues_to_use;
+ bool sph_en;
u32 chan;
int ret;
@@ -2952,10 +3029,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
}
/* Enable Split Header */
- if (priv->sph && priv->hw->rx_csum) {
- for (chan = 0; chan < rx_cnt; chan++)
- stmmac_enable_sph(priv, priv->ioaddr, 1, chan);
- }
+ sph_en = (priv->hw->rx_csum > 0) && priv->sph;
+ for (chan = 0; chan < rx_cnt; chan++)
+ stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan);
+
/* VLAN Tag Insertion */
if (priv->dma_cap.vlins)
@@ -3005,15 +3082,19 @@ static void stmmac_free_irq(struct net_device *dev,
fallthrough;
case REQ_IRQ_ERR_TX:
for (j = irq_idx - 1; j >= 0; j--) {
- if (priv->tx_irq[j] > 0)
+ if (priv->tx_irq[j] > 0) {
+ irq_set_affinity_hint(priv->tx_irq[j], NULL);
free_irq(priv->tx_irq[j], &priv->tx_queue[j]);
+ }
}
irq_idx = priv->plat->rx_queues_to_use;
fallthrough;
case REQ_IRQ_ERR_RX:
for (j = irq_idx - 1; j >= 0; j--) {
- if (priv->rx_irq[j] > 0)
+ if (priv->rx_irq[j] > 0) {
+ irq_set_affinity_hint(priv->rx_irq[j], NULL);
free_irq(priv->rx_irq[j], &priv->rx_queue[j]);
+ }
}
if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq)
@@ -3045,6 +3126,7 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
{
enum request_irq_err irq_err = REQ_IRQ_ERR_NO;
struct stmmac_priv *priv = netdev_priv(dev);
+ cpumask_t cpu_mask;
int irq_idx = 0;
char *int_name;
int ret;
@@ -3153,6 +3235,9 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
irq_idx = i;
goto irq_error;
}
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(i % num_online_cpus(), &cpu_mask);
+ irq_set_affinity_hint(priv->rx_irq[i], &cpu_mask);
}
/* Request Tx MSI irq */
@@ -3173,6 +3258,9 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
irq_idx = i;
goto irq_error;
}
+ cpumask_clear(&cpu_mask);
+ cpumask_set_cpu(i % num_online_cpus(), &cpu_mask);
+ irq_set_affinity_hint(priv->tx_irq[i], &cpu_mask);
}
return 0;
@@ -3256,7 +3344,7 @@ static int stmmac_request_irq(struct net_device *dev)
* 0 on success and an appropriate (-)ve integer as defined in errno.h
* file on failure.
*/
-static int stmmac_open(struct net_device *dev)
+int stmmac_open(struct net_device *dev)
{
struct stmmac_priv *priv = netdev_priv(dev);
int bfsize = 0;
@@ -3379,7 +3467,7 @@ static void stmmac_fpe_stop_wq(struct stmmac_priv *priv)
* Description:
* This is the stop entry point of the driver.
*/
-static int stmmac_release(struct net_device *dev)
+int stmmac_release(struct net_device *dev)
{
struct stmmac_priv *priv = netdev_priv(dev);
u32 chan;
@@ -3506,6 +3594,28 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
}
}
+static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
+{
+ struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
+ int desc_size;
+
+ if (likely(priv->extend_desc))
+ desc_size = sizeof(struct dma_extended_desc);
+ else if (tx_q->tbs & STMMAC_TBS_AVAIL)
+ desc_size = sizeof(struct dma_edesc);
+ else
+ desc_size = sizeof(struct dma_desc);
+
+ /* The own bit must be the latest setting done when prepare the
+ * descriptor and then barrier is needed to make sure that
+ * all is coherent before granting the DMA engine.
+ */
+ wmb();
+
+ tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size);
+ stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
+}
+
/**
* stmmac_tso_xmit - Tx entry point of the driver for oversized frames (TSO)
* @skb : the socket buffer
@@ -3537,10 +3647,10 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dma_desc *desc, *first, *mss_desc = NULL;
struct stmmac_priv *priv = netdev_priv(dev);
- int desc_size, tmp_pay_len = 0, first_tx;
int nfrags = skb_shinfo(skb)->nr_frags;
u32 queue = skb_get_queue_mapping(skb);
unsigned int first_entry, tx_packets;
+ int tmp_pay_len = 0, first_tx;
struct stmmac_tx_queue *tx_q;
bool has_vlan, set_ic;
u8 proto_hdr_len, hdr;
@@ -3622,6 +3732,8 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff_dma[first_entry].buf = des;
tx_q->tx_skbuff_dma[first_entry].len = skb_headlen(skb);
+ tx_q->tx_skbuff_dma[first_entry].map_as_page = false;
+ tx_q->tx_skbuff_dma[first_entry].buf_type = STMMAC_TXBUF_T_SKB;
if (priv->dma_cap.addr64 <= 32) {
first->des0 = cpu_to_le32(des);
@@ -3657,12 +3769,14 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = des;
tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag);
tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
+ tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
}
tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;
/* Only the last descriptor gets to point to the skb. */
tx_q->tx_skbuff[tx_q->cur_tx] = skb;
+ tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
/* Manage tx mitigation */
tx_packets = (tx_q->cur_tx + 1) - first_tx;
@@ -3738,12 +3852,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
stmmac_set_tx_owner(priv, mss_desc);
}
- /* The own bit must be the latest setting done when prepare the
- * descriptor and then barrier is needed to make sure that
- * all is coherent before granting the DMA engine.
- */
- wmb();
-
if (netif_msg_pktdata(priv)) {
pr_info("%s: curr=%d dirty=%d f=%d, e=%d, f_p=%p, nfrags %d\n",
__func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry,
@@ -3754,13 +3862,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
- if (tx_q->tbs & STMMAC_TBS_AVAIL)
- desc_size = sizeof(struct dma_edesc);
- else
- desc_size = sizeof(struct dma_desc);
-
- tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size);
- stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
+ stmmac_flush_tx_descriptors(priv, queue);
stmmac_tx_timer_arm(priv, queue);
return NETDEV_TX_OK;
@@ -3790,10 +3892,10 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
int nfrags = skb_shinfo(skb)->nr_frags;
int gso = skb_shinfo(skb)->gso_type;
struct dma_edesc *tbs_desc = NULL;
- int entry, desc_size, first_tx;
struct dma_desc *desc, *first;
struct stmmac_tx_queue *tx_q;
bool has_vlan, set_ic;
+ int entry, first_tx;
dma_addr_t des;
tx_q = &priv->tx_queue[queue];
@@ -3881,6 +3983,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff_dma[entry].map_as_page = true;
tx_q->tx_skbuff_dma[entry].len = len;
tx_q->tx_skbuff_dma[entry].last_segment = last_segment;
+ tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_SKB;
/* Prepare the descriptor and set the own bit too */
stmmac_prepare_tx_desc(priv, desc, 0, len, csum_insertion,
@@ -3889,6 +3992,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
/* Only the last descriptor gets to point to the skb. */
tx_q->tx_skbuff[entry] = skb;
+ tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_SKB;
/* According to the coalesce parameter the IC bit for the latest
* segment is reset and the timer re-started to clean the tx status.
@@ -3967,6 +4071,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
goto dma_map_err;
tx_q->tx_skbuff_dma[first_entry].buf = des;
+ tx_q->tx_skbuff_dma[first_entry].buf_type = STMMAC_TXBUF_T_SKB;
+ tx_q->tx_skbuff_dma[first_entry].map_as_page = false;
stmmac_set_desc_addr(priv, first, des);
@@ -3995,25 +4101,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
stmmac_set_tx_owner(priv, first);
- /* The own bit must be the latest setting done when prepare the
- * descriptor and then barrier is needed to make sure that
- * all is coherent before granting the DMA engine.
- */
- wmb();
-
netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
stmmac_enable_dma_transmission(priv, priv->ioaddr);
- if (likely(priv->extend_desc))
- desc_size = sizeof(struct dma_extended_desc);
- else if (tx_q->tbs & STMMAC_TBS_AVAIL)
- desc_size = sizeof(struct dma_edesc);
- else
- desc_size = sizeof(struct dma_desc);
-
- tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size);
- stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
+ stmmac_flush_tx_descriptors(priv, queue);
stmmac_tx_timer_arm(priv, queue);
return NETDEV_TX_OK;
@@ -4056,11 +4148,9 @@ static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
{
struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
- int len, dirty = stmmac_rx_dirty(priv, queue);
+ int dirty = stmmac_rx_dirty(priv, queue);
unsigned int entry = rx_q->dirty_rx;
- len = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE) * PAGE_SIZE;
-
while (dirty-- > 0) {
struct stmmac_rx_buffer *buf = &rx_q->buf_pool[entry];
struct dma_desc *p;
@@ -4083,18 +4173,9 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
break;
buf->sec_addr = page_pool_get_dma_addr(buf->sec_page);
-
- dma_sync_single_for_device(priv->device, buf->sec_addr,
- len, DMA_FROM_DEVICE);
}
- buf->addr = page_pool_get_dma_addr(buf->page);
-
- /* Sync whole allocation to device. This will invalidate old
- * data.
- */
- dma_sync_single_for_device(priv->device, buf->addr, len,
- DMA_FROM_DEVICE);
+ buf->addr = page_pool_get_dma_addr(buf->page) + buf->page_offset;
stmmac_set_desc_addr(priv, p, buf->addr);
if (priv->sph)
@@ -4173,6 +4254,180 @@ static unsigned int stmmac_rx_buf2_len(struct stmmac_priv *priv,
return plen - len;
}
+static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue,
+ struct xdp_frame *xdpf, bool dma_map)
+{
+ struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
+ unsigned int entry = tx_q->cur_tx;
+ struct dma_desc *tx_desc;
+ dma_addr_t dma_addr;
+ bool set_ic;
+
+ if (stmmac_tx_avail(priv, queue) < STMMAC_TX_THRESH(priv))
+ return STMMAC_XDP_CONSUMED;
+
+ if (likely(priv->extend_desc))
+ tx_desc = (struct dma_desc *)(tx_q->dma_etx + entry);
+ else if (tx_q->tbs & STMMAC_TBS_AVAIL)
+ tx_desc = &tx_q->dma_entx[entry].basic;
+ else
+ tx_desc = tx_q->dma_tx + entry;
+
+ if (dma_map) {
+ dma_addr = dma_map_single(priv->device, xdpf->data,
+ xdpf->len, DMA_TO_DEVICE);
+ if (dma_mapping_error(priv->device, dma_addr))
+ return STMMAC_XDP_CONSUMED;
+
+ tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_XDP_NDO;
+ } else {
+ struct page *page = virt_to_page(xdpf->data);
+
+ dma_addr = page_pool_get_dma_addr(page) + sizeof(*xdpf) +
+ xdpf->headroom;
+ dma_sync_single_for_device(priv->device, dma_addr,
+ xdpf->len, DMA_BIDIRECTIONAL);
+
+ tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_XDP_TX;
+ }
+
+ tx_q->tx_skbuff_dma[entry].buf = dma_addr;
+ tx_q->tx_skbuff_dma[entry].map_as_page = false;
+ tx_q->tx_skbuff_dma[entry].len = xdpf->len;
+ tx_q->tx_skbuff_dma[entry].last_segment = true;
+ tx_q->tx_skbuff_dma[entry].is_jumbo = false;
+
+ tx_q->xdpf[entry] = xdpf;
+
+ stmmac_set_desc_addr(priv, tx_desc, dma_addr);
+
+ stmmac_prepare_tx_desc(priv, tx_desc, 1, xdpf->len,
+ true, priv->mode, true, true,
+ xdpf->len);
+
+ tx_q->tx_count_frames++;
+
+ if (tx_q->tx_count_frames % priv->tx_coal_frames[queue] == 0)
+ set_ic = true;
+ else
+ set_ic = false;
+
+ if (set_ic) {
+ tx_q->tx_count_frames = 0;
+ stmmac_set_tx_ic(priv, tx_desc);
+ priv->xstats.tx_set_ic_bit++;
+ }
+
+ stmmac_enable_dma_transmission(priv, priv->ioaddr);
+
+ entry = STMMAC_GET_ENTRY(entry, priv->dma_tx_size);
+ tx_q->cur_tx = entry;
+
+ return STMMAC_XDP_TX;
+}
+
+static int stmmac_xdp_get_tx_queue(struct stmmac_priv *priv,
+ int cpu)
+{
+ int index = cpu;
+
+ if (unlikely(index < 0))
+ index = 0;
+
+ while (index >= priv->plat->tx_queues_to_use)
+ index -= priv->plat->tx_queues_to_use;
+
+ return index;
+}
+
+static int stmmac_xdp_xmit_back(struct stmmac_priv *priv,
+ struct xdp_buff *xdp)
+{
+ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
+ int cpu = smp_processor_id();
+ struct netdev_queue *nq;
+ int queue;
+ int res;
+
+ if (unlikely(!xdpf))
+ return STMMAC_XDP_CONSUMED;
+
+ queue = stmmac_xdp_get_tx_queue(priv, cpu);
+ nq = netdev_get_tx_queue(priv->dev, queue);
+
+ __netif_tx_lock(nq, cpu);
+ /* Avoids TX time-out as we are sharing with slow path */
+ nq->trans_start = jiffies;
+
+ res = stmmac_xdp_xmit_xdpf(priv, queue, xdpf, false);
+ if (res == STMMAC_XDP_TX)
+ stmmac_flush_tx_descriptors(priv, queue);
+
+ __netif_tx_unlock(nq);
+
+ return res;
+}
+
+static struct sk_buff *stmmac_xdp_run_prog(struct stmmac_priv *priv,
+ struct xdp_buff *xdp)
+{
+ struct bpf_prog *prog;
+ int res;
+ u32 act;
+
+ rcu_read_lock();
+
+ prog = READ_ONCE(priv->xdp_prog);
+ if (!prog) {
+ res = STMMAC_XDP_PASS;
+ goto unlock;
+ }
+
+ act = bpf_prog_run_xdp(prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ res = STMMAC_XDP_PASS;
+ break;
+ case XDP_TX:
+ res = stmmac_xdp_xmit_back(priv, xdp);
+ break;
+ case XDP_REDIRECT:
+ if (xdp_do_redirect(priv->dev, xdp, prog) < 0)
+ res = STMMAC_XDP_CONSUMED;
+ else
+ res = STMMAC_XDP_REDIRECT;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(priv->dev, prog, act);
+ fallthrough;
+ case XDP_DROP:
+ res = STMMAC_XDP_CONSUMED;
+ break;
+ }
+
+unlock:
+ rcu_read_unlock();
+ return ERR_PTR(-res);
+}
+
+static void stmmac_finalize_xdp_rx(struct stmmac_priv *priv,
+ int xdp_status)
+{
+ int cpu = smp_processor_id();
+ int queue;
+
+ queue = stmmac_xdp_get_tx_queue(priv, cpu);
+
+ if (xdp_status & STMMAC_XDP_TX)
+ stmmac_tx_timer_arm(priv, queue);
+
+ if (xdp_status & STMMAC_XDP_REDIRECT)
+ xdp_do_flush();
+}
+
/**
* stmmac_rx - manage the receive process
* @priv: driver private structure
@@ -4188,8 +4443,15 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
unsigned int count = 0, error = 0, len = 0;
int status = 0, coe = priv->hw->rx_csum;
unsigned int next_entry = rx_q->cur_rx;
+ enum dma_data_direction dma_dir;
unsigned int desc_size;
struct sk_buff *skb = NULL;
+ struct xdp_buff xdp;
+ int xdp_status = 0;
+ int buf_sz;
+
+ dma_dir = page_pool_get_dma_dir(rx_q->page_pool);
+ buf_sz = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE) * PAGE_SIZE;
if (netif_msg_rx_status(priv)) {
void *rx_head;
@@ -4307,6 +4569,64 @@ read_again:
}
if (!skb) {
+ unsigned int pre_len, sync_len;
+
+ dma_sync_single_for_cpu(priv->device, buf->addr,
+ buf1_len, dma_dir);
+
+ xdp.data = page_address(buf->page) + buf->page_offset;
+ xdp.data_end = xdp.data + buf1_len;
+ xdp.data_hard_start = page_address(buf->page);
+ xdp_set_data_meta_invalid(&xdp);
+ xdp.frame_sz = buf_sz;
+ xdp.rxq = &rx_q->xdp_rxq;
+
+ pre_len = xdp.data_end - xdp.data_hard_start -
+ buf->page_offset;
+ skb = stmmac_xdp_run_prog(priv, &xdp);
+ /* Due xdp_adjust_tail: DMA sync for_device
+ * cover max len CPU touch
+ */
+ sync_len = xdp.data_end - xdp.data_hard_start -
+ buf->page_offset;
+ sync_len = max(sync_len, pre_len);
+
+ /* For Not XDP_PASS verdict */
+ if (IS_ERR(skb)) {
+ unsigned int xdp_res = -PTR_ERR(skb);
+
+ if (xdp_res & STMMAC_XDP_CONSUMED) {
+ page_pool_put_page(rx_q->page_pool,
+ virt_to_head_page(xdp.data),
+ sync_len, true);
+ buf->page = NULL;
+ priv->dev->stats.rx_dropped++;
+
+ /* Clear skb as it was set as
+ * status by XDP program.
+ */
+ skb = NULL;
+
+ if (unlikely((status & rx_not_ls)))
+ goto read_again;
+
+ count++;
+ continue;
+ } else if (xdp_res & (STMMAC_XDP_TX |
+ STMMAC_XDP_REDIRECT)) {
+ xdp_status |= xdp_res;
+ buf->page = NULL;
+ skb = NULL;
+ count++;
+ continue;
+ }
+ }
+ }
+
+ if (!skb) {
+ /* XDP program may expand or reduce tail */
+ buf1_len = xdp.data_end - xdp.data;
+
skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
if (!skb) {
priv->dev->stats.rx_dropped++;
@@ -4314,10 +4634,8 @@ read_again:
goto drain_data;
}
- dma_sync_single_for_cpu(priv->device, buf->addr,
- buf1_len, DMA_FROM_DEVICE);
- skb_copy_to_linear_data(skb, page_address(buf->page),
- buf1_len);
+ /* XDP program may adjust header */
+ skb_copy_to_linear_data(skb, xdp.data, buf1_len);
skb_put(skb, buf1_len);
/* Data payload copied into SKB, page ready for recycle */
@@ -4325,9 +4643,9 @@ read_again:
buf->page = NULL;
} else if (buf1_len) {
dma_sync_single_for_cpu(priv->device, buf->addr,
- buf1_len, DMA_FROM_DEVICE);
+ buf1_len, dma_dir);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
- buf->page, 0, buf1_len,
+ buf->page, buf->page_offset, buf1_len,
priv->dma_buf_sz);
/* Data payload appended into SKB */
@@ -4337,7 +4655,7 @@ read_again:
if (buf2_len) {
dma_sync_single_for_cpu(priv->device, buf->sec_addr,
- buf2_len, DMA_FROM_DEVICE);
+ buf2_len, dma_dir);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
buf->sec_page, 0, buf2_len,
priv->dma_buf_sz);
@@ -4383,6 +4701,8 @@ drain_data:
rx_q->state.len = len;
}
+ stmmac_finalize_xdp_rx(priv, xdp_status);
+
stmmac_rx_refill(priv, queue);
priv->xstats.rx_pkt_n += count;
@@ -4495,6 +4815,11 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu)
return -EBUSY;
}
+ if (stmmac_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) {
+ netdev_dbg(priv->dev, "Jumbo frames not supported for XDP\n");
+ return -EINVAL;
+ }
+
new_mtu = STMMAC_ALIGN(new_mtu);
/* If condition true, FIFO is too small or MTU too large */
@@ -4556,6 +4881,7 @@ static int stmmac_set_features(struct net_device *netdev,
stmmac_rx_ipc(priv, priv->hw);
sph_en = (priv->hw->rx_csum > 0) && priv->sph;
+
for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++)
stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan);
@@ -5291,6 +5617,60 @@ del_vlan_error:
return ret;
}
+static int stmmac_bpf(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ struct stmmac_priv *priv = netdev_priv(dev);
+
+ switch (bpf->command) {
+ case XDP_SETUP_PROG:
+ return stmmac_xdp_set_prog(priv, bpf->prog, bpf->extack);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int stmmac_xdp_xmit(struct net_device *dev, int num_frames,
+ struct xdp_frame **frames, u32 flags)
+{
+ struct stmmac_priv *priv = netdev_priv(dev);
+ int cpu = smp_processor_id();
+ struct netdev_queue *nq;
+ int i, nxmit = 0;
+ int queue;
+
+ if (unlikely(test_bit(STMMAC_DOWN, &priv->state)))
+ return -ENETDOWN;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ queue = stmmac_xdp_get_tx_queue(priv, cpu);
+ nq = netdev_get_tx_queue(priv->dev, queue);
+
+ __netif_tx_lock(nq, cpu);
+ /* Avoids TX time-out as we are sharing with slow path */
+ nq->trans_start = jiffies;
+
+ for (i = 0; i < num_frames; i++) {
+ int res;
+
+ res = stmmac_xdp_xmit_xdpf(priv, queue, frames[i], true);
+ if (res == STMMAC_XDP_CONSUMED)
+ break;
+
+ nxmit++;
+ }
+
+ if (flags & XDP_XMIT_FLUSH) {
+ stmmac_flush_tx_descriptors(priv, queue);
+ stmmac_tx_timer_arm(priv, queue);
+ }
+
+ __netif_tx_unlock(nq);
+
+ return nxmit;
+}
+
static const struct net_device_ops stmmac_netdev_ops = {
.ndo_open = stmmac_open,
.ndo_start_xmit = stmmac_xmit,
@@ -5309,6 +5689,8 @@ static const struct net_device_ops stmmac_netdev_ops = {
.ndo_set_mac_address = stmmac_set_mac_address,
.ndo_vlan_rx_add_vid = stmmac_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = stmmac_vlan_rx_kill_vid,
+ .ndo_bpf = stmmac_bpf,
+ .ndo_xdp_xmit = stmmac_xdp_xmit,
};
static void stmmac_reset_subtask(struct stmmac_priv *priv)
@@ -5697,7 +6079,8 @@ int stmmac_dvr_probe(struct device *device,
if (priv->dma_cap.sphen) {
ndev->hw_features |= NETIF_F_GRO;
- priv->sph = true;
+ priv->sph_cap = true;
+ priv->sph = priv->sph_cap;
dev_info(priv->device, "SPH feature enabled\n");
}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c
new file mode 100644
index 000000000000..bf38d231860b
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021, Intel Corporation. */
+
+#include "stmmac.h"
+#include "stmmac_xdp.h"
+
+int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = priv->dev;
+ struct bpf_prog *old_prog;
+ bool need_update;
+ bool if_running;
+
+ if_running = netif_running(dev);
+
+ if (prog && dev->mtu > ETH_DATA_LEN) {
+ /* For now, the driver doesn't support XDP functionality with
+ * jumbo frames so we return error.
+ */
+ NL_SET_ERR_MSG_MOD(extack, "Jumbo frames not supported");
+ return -EOPNOTSUPP;
+ }
+
+ need_update = !!priv->xdp_prog != !!prog;
+ if (if_running && need_update)
+ stmmac_release(dev);
+
+ old_prog = xchg(&priv->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ /* Disable RX SPH for XDP operation */
+ priv->sph = priv->sph_cap && !stmmac_xdp_is_enabled(priv);
+
+ if (if_running && need_update)
+ stmmac_open(dev);
+
+ return 0;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.h
new file mode 100644
index 000000000000..93948569d92a
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021, Intel Corporation. */
+
+#ifndef _STMMAC_XDP_H_
+#define _STMMAC_XDP_H_
+
+#define STMMAC_MAX_RX_BUF_SIZE(num) (((num) * PAGE_SIZE) - XDP_PACKET_HEADROOM)
+
+int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack);
+
+#endif /* _STMMAC_XDP_H_ */
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 91b73db37555..9e525646df1d 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -218,6 +218,17 @@ static void veth_get_ethtool_stats(struct net_device *dev,
}
}
+static void veth_get_channels(struct net_device *dev,
+ struct ethtool_channels *channels)
+{
+ channels->tx_count = dev->real_num_tx_queues;
+ channels->rx_count = dev->real_num_rx_queues;
+ channels->max_tx = dev->real_num_tx_queues;
+ channels->max_rx = dev->real_num_rx_queues;
+ channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
+ channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
+}
+
static const struct ethtool_ops veth_ethtool_ops = {
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
@@ -226,6 +237,7 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_ethtool_stats = veth_get_ethtool_stats,
.get_link_ksettings = veth_get_link_ksettings,
.get_ts_info = ethtool_op_get_ts_info,
+ .get_channels = veth_get_channels,
};
/* general routines */
diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index f1469ac8ff42..3fe5b81eda2d 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -706,6 +706,9 @@ static bool pn533_target_type_a_is_valid(struct pn533_target_type_a *type_a,
if (PN533_TYPE_A_SEL_CASCADE(type_a->sel_res) != 0)
return false;
+ if (type_a->nfcid_len > NFC_NFCID1_MAXSIZE)
+ return false;
+
return true;
}
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c42e02b4d84b..6a29fe11485d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,14 +20,25 @@ struct bpf_sock_ops_kern;
struct bpf_cgroup_storage;
struct ctl_table;
struct ctl_table_header;
+struct task_struct;
#ifdef CONFIG_CGROUP_BPF
extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+#define BPF_CGROUP_STORAGE_NEST_MAX 8
+
+struct bpf_cgroup_storage_info {
+ struct task_struct *task;
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+};
+
+/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
+ * to use bpf cgroup storage simultaneously.
+ */
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
#define for_each_cgroup_storage_type(stype) \
for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -161,13 +172,42 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type(
return BPF_CGROUP_STORAGE_SHARED;
}
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
- *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
+ *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
{
enum bpf_cgroup_storage_type stype;
+ int i, err = 0;
+
+ preempt_disable();
+ for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+ if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
+ continue;
+
+ this_cpu_write(bpf_cgroup_storage_info[i].task, current);
+ for_each_cgroup_storage_type(stype)
+ this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
+ storage[stype]);
+ goto out;
+ }
+ err = -EBUSY;
+ WARN_ON_ONCE(1);
+
+out:
+ preempt_enable();
+ return err;
+}
+
+static inline void bpf_cgroup_storage_unset(void)
+{
+ int i;
+
+ for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+ if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ continue;
- for_each_cgroup_storage_type(stype)
- this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
+ this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
+ return;
+ }
}
struct bpf_cgroup_storage *
@@ -448,8 +488,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
return -EINVAL;
}
-static inline void bpf_cgroup_storage_set(
- struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
+static inline int bpf_cgroup_storage_set(
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
+static inline void bpf_cgroup_storage_unset(void) {}
static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
struct bpf_map *map) { return 0; }
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 39dce9d3c3a5..9fdd839b418c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ struct bpf_iter_seq_info {
u32 seq_priv_size;
};
-/* map is generic key/value storage optionally accesible by eBPF programs */
+/* map is generic key/value storage optionally accessible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
int (*map_alloc_check)(union bpf_attr *attr);
@@ -427,6 +427,7 @@ enum bpf_reg_type {
PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
PTR_TO_FUNC, /* reg points to a bpf program function */
PTR_TO_MAP_KEY, /* reg points to a map element key */
+ __BPF_REG_TYPE_MAX,
};
/* The information passed from prog-specific *_is_valid_access
@@ -480,6 +481,7 @@ struct bpf_verifier_ops {
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
+ bool (*check_kfunc_call)(u32 kfunc_btf_id);
};
struct bpf_prog_offload_ops {
@@ -796,6 +798,8 @@ struct btf_mod_pair {
struct module *module;
};
+struct bpf_kfunc_desc_tab;
+
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
@@ -832,6 +836,7 @@ struct bpf_prog_aux {
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
struct bpf_jit_poke_descriptor *poke_tab;
+ struct bpf_kfunc_desc_tab *kfunc_tab;
u32 size_poke_tab;
struct bpf_ksym ksym;
const struct bpf_prog_ops *ops;
@@ -1106,6 +1111,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
/* BPF program asks to set CN on the packet. */
#define BPF_RET_SET_CN (1 << 0)
+/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
+ * if bpf_cgroup_storage_set() failed, the rest of programs
+ * will not execute. This should be a really rare scenario
+ * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
+ * preemptions all between bpf_cgroup_storage_set() and
+ * bpf_cgroup_storage_unset() on the same cpu.
+ */
#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \
({ \
struct bpf_prog_array_item *_item; \
@@ -1118,10 +1130,12 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
_array = rcu_dereference(array); \
_item = &_array->items[0]; \
while ((_prog = READ_ONCE(_item->prog))) { \
- bpf_cgroup_storage_set(_item->cgroup_storage); \
+ if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \
+ break; \
func_ret = func(_prog, ctx); \
_ret &= (func_ret & 1); \
*(ret_flags) |= (func_ret >> 1); \
+ bpf_cgroup_storage_unset(); \
_item++; \
} \
rcu_read_unlock(); \
@@ -1142,9 +1156,14 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
goto _out; \
_item = &_array->items[0]; \
while ((_prog = READ_ONCE(_item->prog))) { \
- if (set_cg_storage) \
- bpf_cgroup_storage_set(_item->cgroup_storage); \
- _ret &= func(_prog, ctx); \
+ if (!set_cg_storage) { \
+ _ret &= func(_prog, ctx); \
+ } else { \
+ if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \
+ break; \
+ _ret &= func(_prog, ctx); \
+ bpf_cgroup_storage_unset(); \
+ } \
_item++; \
} \
_out: \
@@ -1513,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr);
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id);
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info);
@@ -1531,8 +1551,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf_func_model *m);
struct bpf_reg_state;
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs);
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs);
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs);
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *reg);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
@@ -1543,6 +1566,10 @@ struct bpf_link *bpf_link_by_id(u32 id);
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
void bpf_task_storage_free(struct task_struct *task);
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
@@ -1705,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
return -ENOTSUPP;
}
+static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+ return false;
+}
+
static inline void bpf_map_put(struct bpf_map *map)
{
}
@@ -1723,6 +1755,18 @@ bpf_base_func_proto(enum bpf_func_id func_id)
static inline void bpf_task_storage_free(struct task_struct *task)
{
}
+
+static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return false;
+}
+
+static inline const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ return NULL;
+}
#endif /* CONFIG_BPF_SYSCALL */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 9c1b52738bbe..3bac66e0183a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size);
+const char *btf_type_str(const struct btf_type *t);
#define for_each_member(i, struct_type, member) \
for (i = 0, member = btf_type_member(struct_type); \
@@ -141,6 +142,11 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
}
+static inline bool btf_type_is_scalar(const struct btf_type *t)
+{
+ return btf_type_is_int(t) || btf_type_is_enum(t);
+}
+
static inline bool btf_type_is_typedef(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b2b85b2cad8e..9a09547bc7ba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -877,8 +877,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog);
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog);
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
@@ -919,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
+bool bpf_jit_supports_kfunc_call(void);
bool bpf_helper_changes_pkt_data(void *func);
static inline bool bpf_dump_raw_ok(const struct cred *cred)
@@ -1246,15 +1246,6 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
int k, unsigned int size);
-static inline void *bpf_load_pointer(const struct sk_buff *skb, int k,
- unsigned int size, void *buffer)
-{
- if (k >= 0)
- return skb_header_pointer(skb, k, size, buffer);
-
- return bpf_internal_load_pointer_neg_helper(skb, k, size);
-}
-
static inline int bpf_tell_extensions(void)
{
return SKF_AD_MAX;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8def85fcc22..dbf820a50a39 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6c09d94be2e9..f78e90a04a69 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,7 @@ struct sk_psock_progs {
struct bpf_prog *msg_parser;
struct bpf_prog *stream_parser;
struct bpf_prog *stream_verdict;
+ struct bpf_prog *skb_verdict;
};
enum sk_psock_state_bits {
@@ -89,6 +90,7 @@ struct sk_psock {
#endif
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
+ spinlock_t ingress_lock;
unsigned long state;
struct list_head link;
spinlock_t link_lock;
@@ -97,13 +99,12 @@ struct sk_psock {
void (*saved_close)(struct sock *sk, long timeout);
void (*saved_write_space)(struct sock *sk);
void (*saved_data_ready)(struct sock *sk);
+ int (*psock_update_sk_prot)(struct sock *sk, bool restore);
struct proto *sk_proto;
+ struct mutex work_mutex;
struct sk_psock_work_state work_state;
struct work_struct work;
- union {
- struct rcu_head rcu;
- struct work_struct gc;
- };
+ struct rcu_work rwork;
};
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
@@ -124,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+ long timeo, int *err);
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags);
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
{
@@ -284,7 +289,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
+ spin_lock_bh(&psock->ingress_lock);
list_add_tail(&msg->list, &psock->ingress_msg);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
+static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg;
+
+ spin_lock_bh(&psock->ingress_lock);
+ msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+ if (msg)
+ list_del(&msg->list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return msg;
+}
+
+static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg;
+
+ spin_lock_bh(&psock->ingress_lock);
+ msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return msg;
+}
+
+static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ struct sk_msg *ret;
+
+ spin_lock_bh(&psock->ingress_lock);
+ if (list_is_last(&msg->list, &psock->ingress_msg))
+ ret = NULL;
+ else
+ ret = list_next_entry(msg, list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return ret;
}
static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
@@ -292,6 +335,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
return psock ? list_empty(&psock->ingress_msg) : true;
}
+static inline void kfree_sk_msg(struct sk_msg *msg)
+{
+ if (msg->skb)
+ consume_skb(msg->skb);
+ kfree(msg);
+}
+
static inline void sk_psock_report_error(struct sk_psock *psock, int err)
{
struct sock *sk = psock->sk;
@@ -301,6 +351,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
}
struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock, bool wait);
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
@@ -349,25 +400,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock)
}
}
-static inline void sk_psock_update_proto(struct sock *sk,
- struct sk_psock *psock,
- struct proto *ops)
-{
- /* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, ops);
-}
-
static inline void sk_psock_restore_proto(struct sock *sk,
struct sk_psock *psock)
{
sk->sk_prot->unhash = psock->saved_unhash;
- if (inet_csk_has_ulp(sk)) {
- tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
- } else {
- sk->sk_write_space = psock->saved_write_space;
- /* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
- }
+ if (psock->psock_update_sk_prot)
+ psock->psock_update_sk_prot(sk, true);
}
static inline void sk_psock_set_state(struct sk_psock *psock,
@@ -442,6 +480,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
psock_set_prog(&progs->msg_parser, NULL);
psock_set_prog(&progs->stream_parser, NULL);
psock_set_prog(&progs->stream_verdict, NULL);
+ psock_set_prog(&progs->skb_verdict, NULL);
}
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 0e85713f56df..2926f1f00d65 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -27,7 +27,6 @@ struct bpf_local_storage_elem;
struct bpf_sk_storage_diag;
struct sk_buff;
struct nlattr;
-struct sock;
#ifdef CONFIG_BPF_SYSCALL
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index cea69c801595..16fe34d139c3 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -30,8 +30,8 @@ struct mptcp_ext {
ack64:1,
mpc_map:1,
frozen:1,
- __unused:1;
- /* one byte hole */
+ reset_transient:1;
+ u8 reset_reason:4;
};
#define MPTCP_RM_IDS_MAX 8
@@ -58,6 +58,8 @@ struct mptcp_out_options {
struct mptcp_rm_list rm_list;
u8 join_id;
u8 backup;
+ u8 reset_reason:4;
+ u8 reset_transient:1;
u32 nonce;
u64 thmac;
u32 token;
@@ -156,6 +158,16 @@ void mptcp_seq_show(struct seq_file *seq);
int mptcp_subflow_init_cookie_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb);
+
+__be32 mptcp_get_reset_option(const struct sk_buff *skb);
+
+static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
+{
+ if (skb_ext_exist(skb, SKB_EXT_MPTCP))
+ return mptcp_get_reset_option(skb);
+
+ return htonl(0u);
+}
#else
static inline void mptcp_init(void)
@@ -236,6 +248,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
{
return 0; /* TCP fallback */
}
+
+static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); }
#endif /* CONFIG_MPTCP */
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h
index 59b2c3a3db42..7e373664b1e7 100644
--- a/include/net/netns/mib.h
+++ b/include/net/netns/mib.h
@@ -5,22 +5,19 @@
#include <net/snmp.h>
struct netns_mib {
- DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+#if IS_ENABLED(CONFIG_IPV6)
+ DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+#endif
+
+ DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
- DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
- DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics);
- DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
- DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics);
+ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
#if IS_ENABLED(CONFIG_IPV6)
- struct proc_dir_entry *proc_net_devsnmp6;
DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6);
- DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
- DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
- DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
- DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics);
#endif
+
#ifdef CONFIG_XFRM_STATISTICS
DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics);
#endif
@@ -30,6 +27,19 @@ struct netns_mib {
#ifdef CONFIG_MPTCP
DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
#endif
+
+ DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics);
+#if IS_ENABLED(CONFIG_IPV6)
+ DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
+#endif
+
+ DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+ DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics);
+#if IS_ENABLED(CONFIG_IPV6)
+ DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+ DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics);
+ struct proc_dir_entry *proc_net_devsnmp6;
+#endif
};
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 0b6266fd6bf6..8b4155e756c2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1184,6 +1184,9 @@ struct proto {
void (*unhash)(struct sock *sk);
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
+#ifdef CONFIG_BPF_SYSCALL
+ int (*psock_update_sk_prot)(struct sock *sk, bool restore);
+#endif
/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 075de26f449d..eaea43afcc97 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1035,44 +1035,56 @@ struct rate_sample {
};
struct tcp_congestion_ops {
- struct list_head list;
- u32 key;
- u32 flags;
-
- /* initialize private data (optional) */
- void (*init)(struct sock *sk);
- /* cleanup private data (optional) */
- void (*release)(struct sock *sk);
+/* fast path fields are put first to fill one cache line */
/* return slow start threshold (required) */
u32 (*ssthresh)(struct sock *sk);
+
/* do new cwnd calculation (required) */
void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
+
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, u8 new_state);
+
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+
/* call when ack arrives (optional) */
void (*in_ack_event)(struct sock *sk, u32 flags);
- /* new value of cwnd after loss (required) */
- u32 (*undo_cwnd)(struct sock *sk);
+
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
+
/* override sysctl_tcp_min_tso_segs */
u32 (*min_tso_segs)(struct sock *sk);
- /* returns the multiplier used in tcp_sndbuf_expand (optional) */
- u32 (*sndbuf_expand)(struct sock *sk);
+
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+
+
+ /* new value of cwnd after loss (required) */
+ u32 (*undo_cwnd)(struct sock *sk);
+ /* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ u32 (*sndbuf_expand)(struct sock *sk);
+
+/* control/slow paths put last */
/* get info for inet_diag (optional) */
size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
- char name[TCP_CA_NAME_MAX];
- struct module *owner;
-};
+ char name[TCP_CA_NAME_MAX];
+ struct module *owner;
+ struct list_head list;
+ u32 key;
+ u32 flags;
+
+ /* initialize private data (optional) */
+ void (*init)(struct sock *sk);
+ /* cleanup private data (optional) */
+ void (*release)(struct sock *sk);
+} ____cacheline_aligned_in_smp;
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
@@ -2203,13 +2215,12 @@ struct sk_psock;
#ifdef CONFIG_BPF_SYSCALL
struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int tcp_bpf_update_proto(struct sock *sk, bool restore);
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
#endif /* CONFIG_BPF_SYSCALL */
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
int flags);
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
- struct msghdr *msg, int len, int flags);
#endif /* CONFIG_NET_SOCK_MSG */
#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
diff --git a/include/net/udp.h b/include/net/udp.h
index adf2ff8ac87c..f55aaeef7e91 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport);
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor);
/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
* possibly multiple cache miss on dequeue()
@@ -541,6 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int udp_bpf_update_proto(struct sock *sk, bool restore);
#endif
#endif /* _UDP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 008edc1dc8c1..49371eba98ba 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+ BPF_SK_SKB_VERDICT,
__MAX_BPF_ATTACH_TYPE
};
@@ -1117,6 +1118,10 @@ enum bpf_link_type {
* offset to another bpf function
*/
#define BPF_PSEUDO_CALL 1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL 2
/* flags for BPF_MAP_UPDATE_ELEM command */
enum {
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index e1172c1ffdfd..8eb3c0844bff 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -174,10 +174,21 @@ enum mptcp_event_attr {
MPTCP_ATTR_FLAGS, /* u16 */
MPTCP_ATTR_TIMEOUT, /* u32 */
MPTCP_ATTR_IF_IDX, /* s32 */
+ MPTCP_ATTR_RESET_REASON,/* u32 */
+ MPTCP_ATTR_RESET_FLAGS, /* u32 */
__MPTCP_ATTR_AFTER_LAST
};
#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
+/* MPTCP Reset reason codes, rfc8684 */
+#define MPTCP_RST_EUNSPEC 0
+#define MPTCP_RST_EMPTCP 1
+#define MPTCP_RST_ERESOURCE 2
+#define MPTCP_RST_EPROHIBIT 3
+#define MPTCP_RST_EWQ2BIG 4
+#define MPTCP_RST_EBADPERF 5
+#define MPTCP_RST_EMIDDLEBOX 6
+
#endif /* _UAPI_MPTCP_H */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 369faeddf1df..0600ed325fa0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = "FLOAT",
};
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
@@ -789,7 +789,6 @@ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
while (btf_type_is_modifier(t) &&
BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
- id = t->type;
t = btf_type_by_id(btf, t->type);
}
@@ -4377,7 +4376,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_LINK_TYPE
static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
@@ -5362,122 +5361,190 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs,
+ bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- const struct btf_param *args;
+ const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
- u32 i, nargs, btf_id, type_size;
- const char *tname;
- bool is_global;
-
- if (!prog->aux->func_info)
- return -EINVAL;
-
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
-
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+ const struct btf_param *args;
+ u32 i, nargs, ref_id;
- t = btf_type_by_id(btf, btf_id);
+ t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
- * struct bpf_func_info
+ * struct bpf_func_info or in add_kfunc_call().
*/
- bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
- subprog);
+ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+ func_id);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
+ func_name = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", tname);
+ bpf_log(log, "Invalid BTF of func %s\n", func_name);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
- bpf_log(log, "Function %s has %d > %d args\n", tname, nargs,
+ bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
MAX_BPF_FUNC_REG_ARGS);
- goto out;
+ return -EINVAL;
}
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *reg = &regs[i + 1];
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
- t = btf_type_by_id(btf, args[i].type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ if (btf_type_is_scalar(t)) {
if (reg->type == SCALAR_VALUE)
continue;
- bpf_log(log, "R%d is not a scalar\n", i + 1);
- goto out;
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
}
- if (btf_type_is_ptr(t)) {
+
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ if (btf_is_kernel(btf)) {
+ const struct btf_type *reg_ref_t;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (!btf_type_is_struct(ref_t)) {
+ bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ func_name, i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (reg->type == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else if (reg2btf_ids[reg->type]) {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[reg->type];
+ } else {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname, regno);
+ return -EINVAL;
+ }
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+ &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf,
+ reg_ref_t->name_off);
+ if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+ reg->off, btf, ref_id)) {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname,
+ regno, btf_type_str(reg_ref_t),
+ reg_ref_tname);
+ return -EINVAL;
+ }
+ } else if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
- if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
- }
- if (check_ctx_reg(env, reg, i + 1))
- goto out;
- continue;
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
}
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (ptr_to_mem_ok) {
+ const struct btf_type *resolve_ret;
+ u32 type_size;
- if (!is_global)
- goto out;
-
- t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- ref_t = btf_resolve_size(btf, t, &type_size);
- if (IS_ERR(ref_t)) {
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
- PTR_ERR(ref_t));
- goto out;
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname,
+ PTR_ERR(resolve_ret));
+ return -EINVAL;
}
- if (check_mem_reg(env, reg, i + 1, type_size))
- goto out;
-
- continue;
+ if (check_mem_reg(env, reg, regno, type_size))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
}
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
}
+
return 0;
-out:
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
* In such cases mark the function as unreliable from BTF point of view.
*/
- prog->aux->func_info_aux[subprog].unreliable = true;
- return -EINVAL;
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs)
+{
+ return btf_check_func_arg_match(env, btf, func_id, regs, false);
}
/* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75244ecb2389..f5423251c118 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -143,25 +143,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
if (!prog->aux->nr_linfo || !prog->jit_requested)
return 0;
- prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
- sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!prog->aux->jited_linfo)
return -ENOMEM;
return 0;
}
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
- kfree(prog->aux->jited_linfo);
- prog->aux->jited_linfo = NULL;
-}
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
- if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
- bpf_prog_free_jited_linfo(prog);
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
}
/* The jit engine is responsible to provide an array
@@ -217,12 +217,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
- bpf_prog_free_jited_linfo(prog);
- kvfree(prog->aux->linfo);
-}
-
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -1849,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
+ bool jit_needed = false;
+
if (fp->bpf_func)
goto finalize;
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
@@ -1866,14 +1866,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
return fp;
fp = bpf_int_jit_compile(fp);
- if (!fp->jited) {
- bpf_prog_free_jited_linfo(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
return fp;
-#endif
- } else {
- bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
@@ -2354,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..dad821c8ecd0 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
- if (insn->src_reg != BPF_PSEUDO_CALL &&
+ if (!insn->src_reg &&
insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
func_id_str[insn->imm])
return func_id_str[insn->imm];
- if (cbs && cbs->cb_call)
- return cbs->cb_call(cbs->private_data, insn);
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
return buff;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 074800226327..f306611c4ddf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
+ struct bpf_cgroup_storage *storage = NULL;
void *ptr;
+ int i;
- storage = this_cpu_read(bpf_cgroup_storage[stype]);
+ for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+ if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ continue;
+
+ storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+ break;
+ }
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
#include <linux/slab.h>
#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
#ifdef CONFIG_CGROUP_BPF
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index cec792a17e5f..1b7b8a6f34ee 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -726,6 +726,9 @@ const struct bpf_map_ops trie_map_ops = {
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
.map_btf_name = "lpm_trie",
.map_btf_id = &trie_map_btf_id,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 250503482cda..6428634da57e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1694,7 +1694,9 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
- bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->jited_linfo);
+ kvfree(prog->aux->linfo);
+ kfree(prog->aux->kfunc_tab);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
@@ -2946,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_MSG;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_SKB_VERDICT:
return BPF_PROG_TYPE_SK_SKB;
case BPF_LIRC_MODE2:
return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 210169c25ead..852541a435ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_CALL;
}
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
static bool bpf_pseudo_func(const struct bpf_insn *insn)
{
return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
@@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
verbose(env, "too many subprograms\n");
return -E2BIG;
}
+ /* determine subprog starts. The end is one before the next starts */
env->subprog_info[env->subprog_cnt++].start = off;
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
return env->subprog_cnt - 1;
}
-static int check_subprogs(struct bpf_verifier_env *env)
+struct bpf_kfunc_desc {
+ struct btf_func_model func_model;
+ u32 func_id;
+ s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ /* func_id is not greater than BTF_MAX_TYPE */
+ return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+ struct bpf_kfunc_desc desc = {
+ .func_id = func_id,
+ };
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ return bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+ const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_desc_tab *tab;
+ struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_desc *desc;
+ const char *func_name;
+ unsigned long addr;
+ int err;
+
+ prog_aux = env->prog->aux;
+ tab = prog_aux->kfunc_tab;
+ if (!tab) {
+ if (!btf_vmlinux) {
+ verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required for calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!bpf_jit_supports_kfunc_call()) {
+ verbose(env, "JIT does not support calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->gpl_compatible) {
+ verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+ return -EINVAL;
+ }
+
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ prog_aux->kfunc_tab = tab;
+ }
+
+ if (find_kfunc_desc(env->prog, func_id))
+ return 0;
+
+ if (tab->nr_descs == MAX_KFUNC_DESCS) {
+ verbose(env, "too many different kernel function calls\n");
+ return -E2BIG;
+ }
+
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %u is not a function\n",
+ func_id);
+ return -EINVAL;
+ }
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ addr = kallsyms_lookup_name(func_name);
+ if (!addr) {
+ verbose(env, "cannot find address for kernel function %s\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ desc = &tab->descs[tab->nr_descs++];
+ desc->func_id = func_id;
+ desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+ err = btf_distill_func_proto(&env->log, btf_vmlinux,
+ func_proto, func_name,
+ &desc->func_model);
+ if (!err)
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id, NULL);
+ return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm > d1->imm)
+ return 1;
+ else if (d0->imm < d1->imm)
+ return -1;
+ return 0;
+}
+
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ if (!tab)
+ return;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
- int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
+ int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
- if (ret < 0)
+ if (ret)
return ret;
- /* determine subprog starts. The end is one before the next starts */
- for (i = 0; i < insn_cnt; i++) {
- if (bpf_pseudo_func(insn + i)) {
- if (!env->bpf_capable) {
- verbose(env,
- "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
- return -EPERM;
- }
- ret = add_subprog(env, i + insn[i].imm + 1);
- if (ret < 0)
- return ret;
- /* remember subprog */
- insn[i + 1].imm = ret;
- continue;
- }
- if (!bpf_pseudo_call(insn + i))
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+ !bpf_pseudo_kfunc_call(insn))
continue;
+
if (!env->bpf_capable) {
- verbose(env,
- "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
- ret = add_subprog(env, i + insn[i].imm + 1);
+
+ if (bpf_pseudo_func(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ if (ret >= 0)
+ /* remember subprog */
+ insn[1].imm = ret;
+ } else if (bpf_pseudo_call(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ } else {
+ ret = add_kfunc_call(env, insn->imm);
+ }
+
if (ret < 0)
return ret;
}
@@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
@@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
return i;
}
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+ const struct btf_type *func;
+
+ if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+ return NULL;
+
+ func = btf_type_by_id(btf_vmlinux, insn->imm);
+ return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u32 *reg_mask, u64 *stack_mask)
{
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -5365,7 +5551,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_arg_match(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return 0;
}
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+ if (regno == BPF_REG_0) {
+ /* Function return value */
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = reg_size == sizeof(u64) ?
+ DEF_NOT_SUBREG : env->insn_idx + 1;
+ } else {
+ /* Function argument */
+ if (reg_size == sizeof(u64)) {
+ mark_insn_zext(env, reg);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ } else {
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+ }
+ }
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct btf_type *t, *func, *func_proto, *ptr_type;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ u32 i, nargs, func_id, ptr_type_id;
+ const struct btf_param *args;
+ int err;
+
+ func_id = insn->imm;
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+ if (!env->ops->check_kfunc_call ||
+ !env->ops->check_kfunc_call(func_id)) {
+ verbose(env, "calling kernel function %s is not allowed\n",
+ func_name);
+ return -EACCES;
+ }
+
+ /* Check the arguments */
+ err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+ if (err)
+ return err;
+
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ /* Check return type */
+ t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+ if (btf_type_is_scalar(t)) {
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+ } else if (btf_type_is_ptr(t)) {
+ ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+ &ptr_type_id);
+ if (!btf_type_is_struct(ptr_type)) {
+ ptr_type_name = btf_name_by_offset(btf_vmlinux,
+ ptr_type->name_off);
+ verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name, btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+ mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+ nargs = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nargs; i++) {
+ u32 regno = i + 1;
+
+ t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+ if (btf_type_is_ptr(t))
+ mark_btf_func_reg_size(env, regno, sizeof(void *));
+ else
+ /* scalar. ensured by btf_check_kfunc_arg_match() */
+ mark_btf_func_reg_size(env, regno, t->size);
+ }
+
+ return 0;
+}
+
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -6062,19 +6340,6 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
else
*ptr_limit = -off - 1;
return *ptr_limit >= max ? -ERANGE : 0;
- case PTR_TO_MAP_KEY:
- /* Currently, this code is not exercised as the only use
- * is bpf_for_each_map_elem() helper which requires
- * bpf_capble. The code has been tested manually for
- * future use.
- */
- if (mask_to_left) {
- *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->key_size - off;
- }
- return 0;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
if (mask_to_left) {
@@ -6281,7 +6546,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
- case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
@@ -10176,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env)
if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -10323,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10338,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env)
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ err = check_kfunc_call(env, insn);
else
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
@@ -11648,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11755,7 +12024,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
- bpf_prog_free_unused_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++) {
@@ -11778,7 +12047,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
- bpf_prog_free_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return err;
}
@@ -11787,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
int i, depth;
#endif
int err = 0;
@@ -11800,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
/* When JIT fails the progs with bpf2bpf calls and tail_calls
* have to be rejected, since interpreter doesn't support them yet.
@@ -11828,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ /* insn->imm has the btf func_id. Replace it with
+ * an address (relative to __bpf_base_call).
+ */
+ desc = find_kfunc_desc(env->prog, insn->imm);
+ if (!desc) {
+ verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+ insn->imm);
+ return -EFAULT;
+ }
+
+ insn->imm = desc->imm;
+
+ return 0;
+}
+
/* Do various post-verification rewrites in a single program pass.
* These rewrites simplify JIT and interpreter implementations.
*/
@@ -11963,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = fixup_kfunc_call(env, insn);
+ if (ret)
+ return ret;
+ continue;
+ }
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
@@ -12192,6 +12492,8 @@ patch_call_imm:
}
}
+ sort_kfunc_descs_by_imm(env->prog);
+
return 0;
}
@@ -12302,7 +12604,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_func_arg_match(env, subprog, regs);
+ ret = btf_check_subprog_arg_match(env, subprog, regs);
if (ret == -EFAULT)
/* unlikely verifier bug. abort.
* ret == 0 and ret < 0 are sadly acceptable for
@@ -12897,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
+ ret = add_subprog_and_kfunc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0abdd67f44b1..a5d72c48fb66 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2017 Facebook
*/
#include <linux/bpf.h>
+#include <linux/btf_ids.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/etherdevice.h>
@@ -106,12 +107,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
bpf_test_timer_enter(&t);
do {
- bpf_cgroup_storage_set(storage);
+ ret = bpf_cgroup_storage_set(storage);
+ if (ret)
+ break;
if (xdp)
*retval = bpf_prog_run_xdp(prog, ctx);
else
*retval = BPF_PROG_RUN(prog, ctx);
+
+ bpf_cgroup_storage_unset();
} while (bpf_test_timer_continue(&t, repeat, &ret, time));
bpf_test_timer_leave(&t);
@@ -209,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b)
*b += 1;
return a + *b;
}
+
+u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
+{
+ return a + b + c + d;
+}
+
+int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
+{
+ return a + b;
+}
+
+struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
+{
+ return sk;
+}
+
__diag_pop();
ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
+BTF_SET_START(test_sk_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test1)
+BTF_ID(func, bpf_kfunc_call_test2)
+BTF_ID(func, bpf_kfunc_call_test3)
+BTF_SET_END(test_sk_kfunc_ids)
+
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+ return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id);
+}
+
static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
diff --git a/net/core/filter.c b/net/core/filter.c
index 17dc159ec40c..cae56d08a670 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
+ .check_kfunc_call = bpf_prog_test_check_kfunc_call,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e8320b5d651a..3ad9e8425ab2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
- int len)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return kernel_sendmsg(sock, msg, vec, num, size);
+}
+
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+ int len, sendmsg_func sendmsg, sendpage_func sendpage)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -2522,7 +2545,8 @@ do_frag_list:
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT;
- ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+ sendmsg_unlocked, sk, &msg, &kv, 1, slen);
if (ret <= 0)
goto error;
@@ -2553,9 +2577,11 @@ do_frag_list:
slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
- skb_frag_off(frag) + offset,
- slen, MSG_DONTWAIT);
+ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+ sendpage_unlocked, sk,
+ skb_frag_page(frag),
+ skb_frag_off(frag) + offset,
+ slen, MSG_DONTWAIT);
if (ret <= 0)
goto error;
@@ -2587,8 +2613,23 @@ out:
error:
return orig_len == len ? ret : orig_len - len;
}
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
+ kernel_sendpage_locked);
+}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
+ sendpage_unlocked);
+}
+
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 07f54015238a..92a83c02562a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+ long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy)
+ return copied ? copied : -EFAULT;
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb)
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Lets not optimize peek case if copy_page_to_iter
+ * didn't copy the entire length lets just break.
+ */
+ if (copy != sge->length)
+ return copied;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
struct sk_buff *skb)
{
@@ -410,7 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
if (unlikely(!msg))
return NULL;
@@ -497,7 +595,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
- return skb_send_sock_locked(psock->sk, skb, off, len);
+ return skb_send_sock(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
@@ -511,8 +609,7 @@ static void sk_psock_backlog(struct work_struct *work)
u32 len, off;
int ret;
- /* Lock sock to avoid losing sk_socket during loop. */
- lock_sock(psock->sk);
+ mutex_lock(&psock->work_mutex);
if (state->skb) {
skb = state->skb;
len = state->len;
@@ -529,7 +626,7 @@ start:
skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
- if (likely(psock->sk->sk_socket))
+ if (!sock_flag(psock->sk, SOCK_DEAD))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
@@ -553,7 +650,7 @@ start:
kfree_skb(skb);
}
end:
- release_sock(psock->sk);
+ mutex_unlock(&psock->work_mutex);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -563,11 +660,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
- if (inet_csk_has_ulp(sk)) {
- psock = ERR_PTR(-EINVAL);
- goto out;
- }
-
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
@@ -591,7 +683,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -630,11 +724,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
}
}
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
skb_bpf_redirect_clear(skb);
kfree_skb(skb);
}
@@ -651,23 +745,35 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
+void sk_psock_stop(struct sk_psock *psock, bool wait)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ __sk_psock_zap_ingress(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+
+ if (wait)
+ cancel_work_sync(&psock->work);
+}
+
static void sk_psock_done_strp(struct sk_psock *psock);
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+static void sk_psock_destroy(struct work_struct *work)
{
- struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
-
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
/* No sk_callback_lock since already detached. */
sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
+ mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
@@ -675,30 +781,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
- struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
- INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
- schedule_work(&psock->gc);
-}
-
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
+ sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
- else if (psock->progs.stream_verdict)
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu(&psock->rcu, sk_psock_destroy);
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -767,14 +864,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ kfree_skb(skb);
+ return;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
kfree_skb(skb);
return;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
+ spin_unlock_bh(&psock_other->ingress_lock);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -842,8 +945,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
err = sk_psock_skb_ingress_self(psock, skb);
}
if (err < 0) {
- skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
break;
case __SK_REDIRECT:
@@ -1010,6 +1117,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dd53a7771d7e..3d190d22b0d8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -26,6 +26,7 @@ struct bpf_stab {
static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
struct bpf_prog *old, u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
@@ -155,6 +156,8 @@ static void sock_map_del_link(struct sock *sk,
strp_stop = true;
if (psock->saved_data_ready && stab->progs.stream_verdict)
verdict_stop = true;
+ if (psock->saved_data_ready && stab->progs.skb_verdict)
+ verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
@@ -182,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
- struct proto *prot;
-
- switch (sk->sk_type) {
- case SOCK_STREAM:
- prot = tcp_bpf_get_proto(sk, psock);
- break;
-
- case SOCK_DGRAM:
- prot = udp_bpf_get_proto(sk, psock);
- break;
-
- default:
+ if (!sk->sk_prot->psock_update_sk_prot)
return -EINVAL;
- }
-
- if (IS_ERR(prot))
- return PTR_ERR(prot);
-
- sk_psock_update_proto(sk, psock, prot);
- return 0;
+ psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+ return sk->sk_prot->psock_update_sk_prot(sk, false);
}
static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
@@ -224,13 +211,25 @@ out:
return psock;
}
-static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
- struct sock *sk)
+static bool sock_map_redirect_allowed(const struct sock *sk);
+
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
{
- struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog *stream_verdict = NULL;
+ struct bpf_prog *stream_parser = NULL;
+ struct bpf_prog *skb_verdict = NULL;
+ struct bpf_prog *msg_parser = NULL;
struct sk_psock *psock;
int ret;
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (!sock_map_redirect_allowed(sk))
+ goto no_progs;
+
stream_verdict = READ_ONCE(progs->stream_verdict);
if (stream_verdict) {
stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
@@ -256,6 +255,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
}
}
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ if (skb_verdict) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict)) {
+ ret = PTR_ERR(skb_verdict);
+ goto out_put_msg_parser;
+ }
+ }
+
+no_progs:
psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
@@ -265,6 +274,9 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
(stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
(stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
@@ -296,6 +308,9 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
sk_psock_start_verdict(sk,psock);
+ } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ sk_psock_start_verdict(sk, psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
@@ -304,6 +319,9 @@ out_unlock_drop:
out_drop:
sk_psock_put(sk, psock);
out_progs:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
+out_put_msg_parser:
if (msg_parser)
bpf_prog_put(msg_parser);
out_put_stream_parser:
@@ -315,27 +333,6 @@ out_put_stream_verdict:
return ret;
}
-static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
-{
- struct sk_psock *psock;
- int ret;
-
- psock = sock_map_psock_get_checked(sk);
- if (IS_ERR(psock))
- return PTR_ERR(psock);
-
- if (!psock) {
- psock = sk_psock_init(sk, map->numa_node);
- if (IS_ERR(psock))
- return PTR_ERR(psock);
- }
-
- ret = sock_map_init_proto(sk, psock);
- if (ret < 0)
- sk_psock_put(sk, psock);
- return ret;
-}
-
static void sock_map_free(struct bpf_map *map)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
@@ -466,8 +463,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
return 0;
}
-static bool sock_map_redirect_allowed(const struct sock *sk);
-
static int sock_map_update_common(struct bpf_map *map, u32 idx,
struct sock *sk, u64 flags)
{
@@ -487,14 +482,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
if (!link)
return -ENOMEM;
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (sock_map_redirect_allowed(sk))
- ret = sock_map_link(map, &stab->progs, sk);
- else
- ret = sock_map_link_no_progs(map, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -547,12 +535,15 @@ static bool sk_is_udp(const struct sock *sk)
static bool sock_map_redirect_allowed(const struct sock *sk)
{
- return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN;
+ if (sk_is_tcp(sk))
+ return sk->sk_state != TCP_LISTEN;
+ else
+ return sk->sk_state == TCP_ESTABLISHED;
}
static bool sock_map_sk_is_suitable(const struct sock *sk)
{
- return sk_is_tcp(sk) || sk_is_udp(sk);
+ return !!sk->sk_prot->psock_update_sk_prot;
}
static bool sock_map_sk_state_allowed(const struct sock *sk)
@@ -999,14 +990,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
if (!link)
return -ENOMEM;
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (sock_map_redirect_allowed(sk))
- ret = sock_map_link(map, &htab->progs, sk);
- else
- ret = sock_map_link_no_progs(map, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -1466,8 +1450,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
+ if (progs->skb_verdict)
+ return -EBUSY;
pprog = &progs->stream_verdict;
break;
+ case BPF_SK_SKB_VERDICT:
+ if (progs->stream_verdict)
+ return -EBUSY;
+ pprog = &progs->skb_verdict;
+ break;
default:
return -EOPNOTSUPP;
}
@@ -1540,6 +1531,7 @@ void sock_map_close(struct sock *sk, long timeout)
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
+ sk_psock_stop(psock, true);
release_sock(sk);
saved_close(sk, timeout);
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1355e6c0d567..f17870ee558b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
+ .read_sock = udp_read_sock,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index d520e61649c8..dff4f0eb96b0 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -5,6 +5,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <net/tcp.h>
#include <net/bpf_sk_storage.h>
@@ -178,10 +179,52 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
}
}
+BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_ID(func, tcp_reno_ssthresh)
+BTF_ID(func, tcp_reno_cong_avoid)
+BTF_ID(func, tcp_reno_undo_cwnd)
+BTF_ID(func, tcp_slow_start)
+BTF_ID(func, tcp_cong_avoid_ai)
+#ifdef CONFIG_DYNAMIC_FTRACE
+#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
+BTF_ID(func, cubictcp_init)
+BTF_ID(func, cubictcp_recalc_ssthresh)
+BTF_ID(func, cubictcp_cong_avoid)
+BTF_ID(func, cubictcp_state)
+BTF_ID(func, cubictcp_cwnd_event)
+BTF_ID(func, cubictcp_acked)
+#endif
+#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP)
+BTF_ID(func, dctcp_init)
+BTF_ID(func, dctcp_update_alpha)
+BTF_ID(func, dctcp_cwnd_event)
+BTF_ID(func, dctcp_ssthresh)
+BTF_ID(func, dctcp_cwnd_undo)
+BTF_ID(func, dctcp_state)
+#endif
+#if IS_BUILTIN(CONFIG_TCP_CONG_BBR)
+BTF_ID(func, bbr_init)
+BTF_ID(func, bbr_main)
+BTF_ID(func, bbr_sndbuf_expand)
+BTF_ID(func, bbr_undo_cwnd)
+BTF_ID(func, bbr_cwnd_event)
+BTF_ID(func, bbr_ssthresh)
+BTF_ID(func, bbr_min_tso_segs)
+BTF_ID(func, bbr_set_state)
+#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+
+static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)
+{
+ return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id);
+}
+
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
.get_func_proto = bpf_tcp_ca_get_func_proto,
.is_valid_access = bpf_tcp_ca_is_valid_access,
.btf_struct_access = bpf_tcp_ca_btf_struct_access,
+ .check_kfunc_call = bpf_tcp_ca_check_kfunc_call,
};
static int bpf_tcp_ca_init_member(const struct btf_type *t,
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 17c322b875fd..3d622a0d0753 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,86 +10,6 @@
#include <net/inet_common.h>
#include <net/tls.h>
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
- struct msghdr *msg, int len, int flags)
-{
- struct iov_iter *iter = &msg->msg_iter;
- int peek = flags & MSG_PEEK;
- struct sk_msg *msg_rx;
- int i, copied = 0;
-
- msg_rx = list_first_entry_or_null(&psock->ingress_msg,
- struct sk_msg, list);
-
- while (copied != len) {
- struct scatterlist *sge;
-
- if (unlikely(!msg_rx))
- break;
-
- i = msg_rx->sg.start;
- do {
- struct page *page;
- int copy;
-
- sge = sk_msg_elem(msg_rx, i);
- copy = sge->length;
- page = sg_page(sge);
- if (copied + copy > len)
- copy = len - copied;
- copy = copy_page_to_iter(page, sge->offset, copy, iter);
- if (!copy)
- return copied ? copied : -EFAULT;
-
- copied += copy;
- if (likely(!peek)) {
- sge->offset += copy;
- sge->length -= copy;
- if (!msg_rx->skb)
- sk_mem_uncharge(sk, copy);
- msg_rx->sg.size -= copy;
-
- if (!sge->length) {
- sk_msg_iter_var_next(i);
- if (!msg_rx->skb)
- put_page(page);
- }
- } else {
- /* Lets not optimize peek case if copy_page_to_iter
- * didn't copy the entire length lets just break.
- */
- if (copy != sge->length)
- return copied;
- sk_msg_iter_var_next(i);
- }
-
- if (copied == len)
- break;
- } while (i != msg_rx->sg.end);
-
- if (unlikely(peek)) {
- if (msg_rx == list_last_entry(&psock->ingress_msg,
- struct sk_msg, list))
- break;
- msg_rx = list_next_entry(msg_rx, list);
- continue;
- }
-
- msg_rx->sg.start = i;
- if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
- list_del(&msg_rx->list);
- if (msg_rx->skb)
- consume_skb(msg_rx->skb);
- kfree(msg_rx);
- }
- msg_rx = list_first_entry_or_null(&psock->ingress_msg,
- struct sk_msg, list);
- }
-
- return copied;
-}
-EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
return !empty;
}
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
- int flags, long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- return 1;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
lock_sock(sk);
msg_bytes_ready:
- copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
if (!copied) {
int data, err = 0;
long timeo;
timeo = sock_rcvtimeo(sk, nonblock);
- data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
if (data) {
if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
@@ -601,20 +499,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
}
-struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int tcp_bpf_update_proto(struct sock *sk, bool restore)
{
+ struct sk_psock *psock = sk_psock(sk);
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+ if (restore) {
+ if (inet_csk_has_ulp(sk)) {
+ tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+ } else {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ }
+ return 0;
+ }
+
+ if (inet_csk_has_ulp(sk))
+ return -EINVAL;
+
if (sk->sk_family == AF_INET6) {
if (tcp_bpf_assert_proto_ops(psock->sk_proto))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
}
- return &tcp_bpf_prots[family][config];
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+ return 0;
}
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
/* If a child got cloned from a listening socket that had tcp_bpf
* protocol callbacks installed, we need to restore the callbacks to
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ffcbe46dacdb..4a30deaa9a37 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -124,7 +124,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
ca->sample_cnt = 0;
}
-static void bictcp_init(struct sock *sk)
+static void cubictcp_init(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
@@ -137,7 +137,7 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_TX_START) {
struct bictcp *ca = inet_csk_ca(sk);
@@ -319,7 +319,7 @@ tcp_friendliness:
ca->cnt = max(ca->cnt, 2U);
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -338,7 +338,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
-static u32 bictcp_recalc_ssthresh(struct sock *sk)
+static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -355,7 +355,7 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static void bictcp_state(struct sock *sk, u8 new_state)
+static void cubictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss) {
bictcp_reset(inet_csk_ca(sk));
@@ -442,7 +442,7 @@ static void hystart_update(struct sock *sk, u32 delay)
}
}
-static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
+static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -471,13 +471,13 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
}
static struct tcp_congestion_ops cubictcp __read_mostly = {
- .init = bictcp_init,
- .ssthresh = bictcp_recalc_ssthresh,
- .cong_avoid = bictcp_cong_avoid,
- .set_state = bictcp_state,
+ .init = cubictcp_init,
+ .ssthresh = cubictcp_recalc_ssthresh,
+ .cong_avoid = cubictcp_cong_avoid,
+ .set_state = cubictcp_state,
.undo_cwnd = tcp_reno_undo_cwnd,
- .cwnd_event = bictcp_cwnd_event,
- .pkts_acked = bictcp_acked,
+ .cwnd_event = cubictcp_cwnd_event,
+ .pkts_acked = cubictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index daad4f99db32..312184cead57 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
+#ifdef CONFIG_TCP_MD5SIG
+#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
+#else
+#define OPTION_BYTES sizeof(__be32)
+#endif
+
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
- __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+ __be32 opt[OPTION_BYTES / sizeof(__be32)];
} rep;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
@@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+ /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
+ if (rep.opt[0] == 0) {
+ __be32 mrst = mptcp_reset_option(skb);
+
+ if (mrst) {
+ rep.opt[0] = mrst;
+ arg.iov[0].iov_len += sizeof(mrst);
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ }
+ }
+
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
@@ -2806,6 +2821,9 @@ struct proto tcp_prot = {
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c0695ce42dc5..bfcc7f1a8a7f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1782,6 +1782,35 @@ busy_check:
}
EXPORT_SYMBOL(__skb_recv_udp);
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ int copied = 0;
+
+ while (1) {
+ struct sk_buff *skb;
+ int err, used;
+
+ skb = skb_recv_udp(sk, 0, 1, &err);
+ if (!skb)
+ return err;
+ used = recv_actor(desc, skb, 0, skb->len);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
+ } else if (used <= skb->len) {
+ copied += used;
+ }
+
+ if (!desc->count)
+ break;
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL(udp_read_sock);
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -2854,6 +2883,9 @@ struct proto udp_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a94791efc1a..7d5c4ebf42fe 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -4,6 +4,68 @@
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags,
+ addr_len);
+#endif
+ return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ lock_sock(sk);
+ if (sk_psock_queue_empty(psock)) {
+ ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ goto out;
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ goto out;
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
enum {
UDP_BPF_IPV4,
@@ -11,7 +73,6 @@ enum {
UDP_BPF_NUM_PROTS,
};
-static struct proto *udpv6_prot_saved __read_mostly;
static DEFINE_SPINLOCK(udpv6_prot_lock);
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
@@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
*prot = *base;
prot->unhash = sock_map_unhash;
prot->close = sock_map_close;
+ prot->recvmsg = udp_bpf_recvmsg;
}
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
@@ -41,12 +103,23 @@ static int __init udp_bpf_v4_build_proto(void)
}
core_initcall(udp_bpf_v4_build_proto);
-struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int udp_bpf_update_proto(struct sock *sk, bool restore)
{
int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+ struct sk_psock *psock = sk_psock(sk);
+
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ return 0;
+ }
if (sk->sk_family == AF_INET6)
udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
- return &udp_bpf_prots[family];
+ /* Pairs with lockless read in sk_clone_lock() */
+ WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+ return 0;
}
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4f7ca5807046..2389ff702f51 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet6_sendmsg, /* retpoline's sake */
.recvmsg = inet6_recvmsg, /* retpoline's sake */
+ .read_sock = udp_read_sock,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = sk_set_peek_off,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d0f007741e8e..5f47c0b6e3de 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -879,8 +879,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
unsigned int tot_len = sizeof(struct tcphdr);
+ __be32 mrst = 0, *topt;
struct dst_entry *dst;
- __be32 *topt;
__u32 mark = 0;
if (tsecr)
@@ -890,6 +890,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
tot_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
+#ifdef CONFIG_MPTCP
+ if (rst && !key) {
+ mrst = mptcp_reset_option(skb);
+
+ if (mrst)
+ tot_len += sizeof(__be32);
+ }
+#endif
+
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
GFP_ATOMIC);
if (!buff)
@@ -920,6 +929,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
*topt++ = htonl(tsecr);
}
+ if (mrst)
+ *topt++ = mrst;
+
#ifdef CONFIG_TCP_MD5SIG
if (key) {
*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -2139,6 +2151,9 @@ struct proto tcpv6_prot = {
.hash = inet6_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index fa2f54738392..199b080d418a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1714,6 +1714,9 @@ struct proto udpv6_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 3780c29c321d..eb2dc6dbe212 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -10,9 +10,12 @@
static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+ SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
+ SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK),
SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 72afbc135f8e..f0da4f060fe1 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -3,9 +3,12 @@
enum linux_mptcp_mib_field {
MPTCP_MIB_NUM = 0,
MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */
MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 69cafaacc31b..4b7119eb2c31 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -305,6 +305,18 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->fastclose = 1;
break;
+ case MPTCPOPT_RST:
+ if (opsize != TCPOLEN_MPTCP_RST)
+ break;
+
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+ break;
+ mp_opt->reset = 1;
+ flags = *ptr++;
+ mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
+ mp_opt->reset_reason = *ptr;
+ break;
+
default:
break;
}
@@ -327,6 +339,7 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->rm_addr = 0;
mp_opt->dss = 0;
mp_opt->mp_prio = 0;
+ mp_opt->reset = 0;
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
@@ -726,6 +739,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
return true;
}
+static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ if (remaining < TCPOLEN_MPTCP_RST)
+ return;
+
+ *size = TCPOLEN_MPTCP_RST;
+ opts->suboptions |= OPTION_MPTCP_RST;
+ opts->reset_transient = subflow->reset_transient;
+ opts->reset_reason = subflow->reset_reason;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -741,11 +770,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
if (unlikely(__mptcp_check_fallback(msk)))
return false;
- /* prevent adding of any MPTCP related options on reset packet
- * until we support MP_TCPRST/MP_FASTCLOSE
- */
- if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
- return false;
+ if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
+ mptcp_established_options_rst(sk, skb, size, remaining, opts);
+ return true;
+ }
snd_data_fin = mptcp_data_fin_enabled(msk);
if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
@@ -952,7 +980,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us
* should match. If they mismatch, the peer is misbehaving and
* we will prefer the most recent information.
*/
- if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
+ if (READ_ONCE(msk->rcv_data_fin))
return false;
WRITE_ONCE(msk->rcv_data_fin_seq,
@@ -1062,6 +1090,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mp_opt.mp_prio = 0;
}
+ if (mp_opt.reset) {
+ subflow->reset_seen = 1;
+ subflow->reset_reason = mp_opt.reset_reason;
+ subflow->reset_transient = mp_opt.reset_transient;
+ }
+
if (!mp_opt.dss)
return;
@@ -1289,6 +1323,12 @@ mp_capable_done:
ptr += 5;
}
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ *ptr++ = mptcp_option(MPTCPOPT_RST,
+ TCPOLEN_MPTCP_RST,
+ opts->reset_transient,
+ opts->reset_reason);
+
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
@@ -1340,3 +1380,20 @@ mp_capable_done:
if (tp)
mptcp_set_rwin(tp);
}
+
+__be32 mptcp_get_reset_option(const struct sk_buff *skb)
+{
+ const struct mptcp_ext *ext = mptcp_get_ext(skb);
+ u8 flags, reason;
+
+ if (ext) {
+ flags = ext->reset_transient;
+ reason = ext->reset_reason;
+
+ return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST,
+ flags, reason);
+ }
+
+ return htonl(0u);
+}
+EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index cadafafa1049..51be6c34b339 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1687,9 +1687,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
+ const struct mptcp_subflow_context *sf;
+
if (mptcp_event_put_token_and_ssk(skb, msk, ssk))
return -EMSGSIZE;
+ sf = mptcp_subflow_ctx(ssk);
+ if (!sf->reset_seen)
+ return 0;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient))
+ return -EMSGSIZE;
+
return 0;
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 171b77537dcb..e894345d10c1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -493,7 +493,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
u64 rcv_data_fin_seq;
bool ret = false;
- if (__mptcp_check_fallback(msk) || !msk->first)
+ if (__mptcp_check_fallback(msk))
return ret;
/* Need to ack a DATA_FIN received from a peer while this side
@@ -3090,14 +3090,18 @@ bool mptcp_finish_join(struct sock *ssk)
pr_debug("msk=%p, subflow=%p", msk, subflow);
/* mptcp socket already closing? */
- if (!mptcp_is_fully_established(parent))
+ if (!mptcp_is_fully_established(parent)) {
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
return false;
+ }
if (!msk->pm.server_side)
goto out;
- if (!mptcp_pm_allow_new_subflow(msk))
+ if (!mptcp_pm_allow_new_subflow(msk)) {
+ subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
+ }
/* active connections are already on conn_list, and we can't acquire
* msk lock here.
@@ -3111,8 +3115,10 @@ bool mptcp_finish_join(struct sock *ssk)
sock_hold(ssk);
}
spin_unlock_bh(&msk->join_list_lock);
- if (!ret)
+ if (!ret) {
+ subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
+ }
/* attach to msk socket only after we are sure he will deal with us
* at close time
@@ -3224,8 +3230,12 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
mptcp_subflow_early_fallback(msk, subflow);
#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
+ MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE);
do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e8c5ff2b8ace..40e9b05856cd 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -26,6 +26,7 @@
#define OPTION_MPTCP_RM_ADDR BIT(8)
#define OPTION_MPTCP_FASTCLOSE BIT(9)
#define OPTION_MPTCP_PRIO BIT(10)
+#define OPTION_MPTCP_RST BIT(11)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -36,6 +37,7 @@
#define MPTCPOPT_MP_PRIO 5
#define MPTCPOPT_MP_FAIL 6
#define MPTCPOPT_MP_FASTCLOSE 7
+#define MPTCPOPT_RST 8
/* MPTCP suboption lengths */
#define TCPOLEN_MPTCP_MPC_SYN 4
@@ -65,6 +67,7 @@
#define TCPOLEN_MPTCP_PRIO 3
#define TCPOLEN_MPTCP_PRIO_ALIGN 4
#define TCPOLEN_MPTCP_FASTCLOSE 12
+#define TCPOLEN_MPTCP_RST 4
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
@@ -94,6 +97,9 @@
/* MPTCP MP_PRIO flags */
#define MPTCP_PRIO_BKUP BIT(0)
+/* MPTCP TCPRST flags */
+#define MPTCP_RST_TRANSIENT BIT(0)
+
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
#define MPTCP_NOSPACE 1
@@ -123,6 +129,7 @@ struct mptcp_options_received {
u16 mp_capable : 1,
mp_join : 1,
fastclose : 1,
+ reset : 1,
dss : 1,
add_addr : 1,
rm_addr : 1,
@@ -152,6 +159,8 @@ struct mptcp_options_received {
};
u64 ahmac;
u16 port;
+ u8 reset_reason:4;
+ u8 reset_transient:1;
};
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
@@ -422,6 +431,9 @@ struct mptcp_subflow_context {
u8 hmac[MPTCPOPT_HMAC_LEN];
u8 local_id;
u8 remote_id;
+ u8 reset_seen:1;
+ u8 reset_transient:1;
+ u8 reset_reason:4;
long delegated_status;
struct list_head delegated_node; /* link into delegated_action, protected by local BH */
@@ -742,7 +754,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
-static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
+static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{
return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6c074d3db0ed..223d6be5fc3b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -115,6 +115,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc
return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}
+static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
+{
+ struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+
+ if (mpext) {
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->reset_reason = reason;
+ }
+}
+
/* Init mptcp request socket.
*
* Returns an error code if a JOIN has failed and a TCP reset
@@ -165,6 +175,7 @@ again:
if (mptcp_token_exists(subflow_req->token)) {
if (retries-- > 0)
goto again;
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
} else {
subflow_req->mp_capable = 1;
}
@@ -176,6 +187,8 @@ again:
subflow_req->mp_capable = 1;
else if (retries-- > 0)
goto again;
+ else
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
} else if (mp_opt.mp_join && listener->request_mptcp) {
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -187,8 +200,10 @@ again:
subflow_req->msk = subflow_token_join_request(req);
/* Can't fall back to TCP in this case. */
- if (!subflow_req->msk)
+ if (!subflow_req->msk) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
return -EPERM;
+ }
if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
pr_debug("syn inet_sport=%d %d",
@@ -392,12 +407,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->remote_key = mp_opt.sndr_key;
pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
mptcp_finish_connect(sk);
} else if (subflow->request_join) {
u8 hmac[SHA256_DIGEST_SIZE];
- if (!mp_opt.mp_join)
+ if (!mp_opt.mp_join) {
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
goto do_reset;
+ }
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
@@ -406,6 +424,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (!subflow_thmac_valid(subflow)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
goto do_reset;
}
@@ -434,6 +453,7 @@ fallback:
return;
do_reset:
+ subflow->reset_transient = 0;
mptcp_subflow_reset(sk);
}
@@ -650,8 +670,10 @@ create_child:
* to reset the context to non MPTCP status.
*/
if (!ctx || fallback) {
- if (fallback_is_fatal)
+ if (fallback_is_fatal) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
goto dispose_child;
+ }
subflow_drop_ctx(child);
goto out;
@@ -686,8 +708,10 @@ create_child:
struct mptcp_sock *owner;
owner = subflow_req->msk;
- if (!owner)
+ if (!owner) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
+ }
/* move the msk reference ownership to the subflow */
subflow_req->msk = NULL;
@@ -1052,6 +1076,8 @@ fatal:
smp_wmb();
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
tcp_send_active_reset(ssk, GFP_ATOMIC);
subflow->data_avail = 0;
return false;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 01d933ae5f16..1dcb34dfd56b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk,
skb = tls_wait_data(sk, psock, flags, timeo, &err);
if (!skb) {
if (psock) {
- int ret = __tcp_bpf_recvmsg(sk, psock,
- msg, len, flags);
+ int ret = sk_msg_recvmsg(sk, psock, msg, len,
+ flags);
if (ret > 0) {
decrypted += ret;
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
index f24806ac24e7..a3f8a3998e0a 100644
--- a/samples/bpf/sampleip_kern.c
+++ b/samples/bpf/sampleip_kern.c
@@ -4,7 +4,6 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
-#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index 7d3c66fb3f88..0bba5fcd7d24 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -5,7 +5,6 @@
* License as published by the Free Software Foundation.
*/
#include <linux/ptrace.h>
-#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/perf_event.h>
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 1e2a1105d0e6..aa696854be78 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -96,7 +96,6 @@ static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static int opt_timeout = 1000;
static bool opt_need_wakeup = true;
static u32 opt_num_xsks = 1;
-static u32 prog_id;
static bool opt_busy_poll;
static bool opt_reduced_cap;
@@ -462,59 +461,37 @@ static void *poller(void *arg)
return NULL;
}
-static void remove_xdp_program(void)
+static void int_exit(int sig)
{
- u32 curr_prog_id = 0;
- int cmd = CLOSE_CONN;
-
- if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
- printf("bpf_get_link_xdp_id failed\n");
- exit(EXIT_FAILURE);
- }
- if (prog_id == curr_prog_id)
- bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
- else if (!curr_prog_id)
- printf("couldn't find a prog id on a given interface\n");
- else
- printf("program on interface changed, not removing\n");
-
- if (opt_reduced_cap) {
- if (write(sock, &cmd, sizeof(int)) < 0) {
- fprintf(stderr, "Error writing into stream socket: %s", strerror(errno));
- exit(EXIT_FAILURE);
- }
- }
+ benchmark_done = true;
}
-static void int_exit(int sig)
+static void __exit_with_error(int error, const char *file, const char *func,
+ int line)
{
- benchmark_done = true;
+ fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
+ line, error, strerror(error));
+ exit(EXIT_FAILURE);
}
+#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
+
static void xdpsock_cleanup(void)
{
struct xsk_umem *umem = xsks[0]->umem->umem;
- int i;
+ int i, cmd = CLOSE_CONN;
dump_stats();
for (i = 0; i < num_socks; i++)
xsk_socket__delete(xsks[i]->xsk);
(void)xsk_umem__delete(umem);
- remove_xdp_program();
-}
-static void __exit_with_error(int error, const char *file, const char *func,
- int line)
-{
- fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
- line, error, strerror(error));
- dump_stats();
- remove_xdp_program();
- exit(EXIT_FAILURE);
+ if (opt_reduced_cap) {
+ if (write(sock, &cmd, sizeof(int)) < 0)
+ exit_with_error(errno);
+ }
}
-#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
- __LINE__)
static void swap_mac_addresses(void *data)
{
struct ether_header *eth = (struct ether_header *)data;
@@ -880,10 +857,6 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
if (ret)
exit_with_error(-ret);
- ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
- if (ret)
- exit_with_error(-ret);
-
xsk->app_stats.rx_empty_polls = 0;
xsk->app_stats.fill_fail_polls = 0;
xsk->app_stats.copy_tx_sendtos = 0;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 65303664417e..1828bba19020 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
[BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser",
[BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict",
+ [BPF_SK_SKB_VERDICT] = "sk_skb_verdict",
[BPF_SK_MSG_VERDICT] = "sk_msg_verdict",
[BPF_LIRC_MODE2] = "lirc_mode2",
[BPF_FLOW_DISSECTOR] = "flow_dissector",
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f2b915b20546..3f067d2d7584 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -76,6 +76,7 @@ enum dump_mode {
static const char * const attach_type_strings[] = {
[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
+ [BPF_SK_SKB_VERDICT] = "skb_verdict",
[BPF_SK_MSG_VERDICT] = "msg_verdict",
[BPF_FLOW_DISSECTOR] = "flow_dissector",
[__MAX_BPF_ATTACH_TYPE] = NULL,
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 80d966cfcaa1..7550fd9c3188 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -115,10 +115,10 @@ struct object {
static int verbose;
-int eprintf(int level, int var, const char *fmt, ...)
+static int eprintf(int level, int var, const char *fmt, ...)
{
va_list args;
- int ret;
+ int ret = 0;
if (var >= level) {
va_start(args, fmt);
@@ -385,7 +385,7 @@ static int elf_collect(struct object *obj)
static int symbols_collect(struct object *obj)
{
Elf_Scn *scn = NULL;
- int n, i, err = 0;
+ int n, i;
GElf_Shdr sh;
char *name;
@@ -402,11 +402,10 @@ static int symbols_collect(struct object *obj)
* Scan symbols and look for the ones starting with
* __BTF_ID__* over .BTF_ids section.
*/
- for (i = 0; !err && i < n; i++) {
- char *tmp, *prefix;
+ for (i = 0; i < n; i++) {
+ char *prefix;
struct btf_id *id;
GElf_Sym sym;
- int err = -1;
if (!gelf_getsym(obj->efile.symbols, i, &sym))
return -1;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d3036e292a9..69902603012c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+ BPF_SK_SKB_VERDICT,
__MAX_BPF_ATTACH_TYPE
};
@@ -1117,6 +1118,10 @@ enum bpf_link_type {
* offset to another bpf function
*/
#define BPF_PSEUDO_CALL 1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL 2
/* flags for BPF_MAP_UPDATE_ELEM command */
enum {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c0fd2c718a7d..7aad78dbb4b4 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -185,7 +185,8 @@ enum reloc_type {
RELO_LD64,
RELO_CALL,
RELO_DATA,
- RELO_EXTERN,
+ RELO_EXTERN_VAR,
+ RELO_EXTERN_FUNC,
RELO_SUBPROG_ADDR,
};
@@ -573,14 +574,19 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn)
insn->off == 0;
}
-static bool is_ldimm64(struct bpf_insn *insn)
+static bool is_ldimm64_insn(struct bpf_insn *insn)
{
return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
}
+static bool is_call_insn(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL);
+}
+
static bool insn_is_pseudo_func(struct bpf_insn *insn)
{
- return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
+ return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
}
static int
@@ -1921,9 +1927,9 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id)
return btf_is_func_proto(t) ? t : NULL;
}
-static const char *btf_kind_str(const struct btf_type *t)
+static const char *__btf_kind_str(__u16 kind)
{
- switch (btf_kind(t)) {
+ switch (kind) {
case BTF_KIND_UNKN: return "void";
case BTF_KIND_INT: return "int";
case BTF_KIND_PTR: return "ptr";
@@ -1945,6 +1951,16 @@ static const char *btf_kind_str(const struct btf_type *t)
}
}
+static const char *btf_kind_str(const struct btf_type *t)
+{
+ return __btf_kind_str(btf_kind(t));
+}
+
+static enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
+{
+ return (enum btf_func_linkage)BTF_INFO_VLEN(t->info);
+}
+
/*
* Fetch integer attribute of BTF map definition. Such attributes are
* represented using a pointer to an array, in which dimensionality of array
@@ -3009,7 +3025,7 @@ static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
{
const struct btf_type *t;
- const char *var_name;
+ const char *tname;
int i, n;
if (!btf)
@@ -3019,14 +3035,18 @@ static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
for (i = 1; i <= n; i++) {
t = btf__type_by_id(btf, i);
- if (!btf_is_var(t))
+ if (!btf_is_var(t) && !btf_is_func(t))
continue;
- var_name = btf__name_by_offset(btf, t->name_off);
- if (strcmp(var_name, ext_name))
+ tname = btf__name_by_offset(btf, t->name_off);
+ if (strcmp(tname, ext_name))
continue;
- if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN)
+ if (btf_is_var(t) &&
+ btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN)
+ return -EINVAL;
+
+ if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN)
return -EINVAL;
return i;
@@ -3139,12 +3159,48 @@ static int find_int_btf_id(const struct btf *btf)
return 0;
}
+static int add_dummy_ksym_var(struct btf *btf)
+{
+ int i, int_btf_id, sec_btf_id, dummy_var_btf_id;
+ const struct btf_var_secinfo *vs;
+ const struct btf_type *sec;
+
+ sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC,
+ BTF_KIND_DATASEC);
+ if (sec_btf_id < 0)
+ return 0;
+
+ sec = btf__type_by_id(btf, sec_btf_id);
+ vs = btf_var_secinfos(sec);
+ for (i = 0; i < btf_vlen(sec); i++, vs++) {
+ const struct btf_type *vt;
+
+ vt = btf__type_by_id(btf, vs->type);
+ if (btf_is_func(vt))
+ break;
+ }
+
+ /* No func in ksyms sec. No need to add dummy var. */
+ if (i == btf_vlen(sec))
+ return 0;
+
+ int_btf_id = find_int_btf_id(btf);
+ dummy_var_btf_id = btf__add_var(btf,
+ "dummy_ksym",
+ BTF_VAR_GLOBAL_ALLOCATED,
+ int_btf_id);
+ if (dummy_var_btf_id < 0)
+ pr_warn("cannot create a dummy_ksym var\n");
+
+ return dummy_var_btf_id;
+}
+
static int bpf_object__collect_externs(struct bpf_object *obj)
{
struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL;
const struct btf_type *t;
struct extern_desc *ext;
- int i, n, off;
+ int i, n, off, dummy_var_btf_id;
const char *ext_name, *sec_name;
Elf_Scn *scn;
GElf_Shdr sh;
@@ -3156,6 +3212,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
if (elf_sec_hdr(obj, scn, &sh))
return -LIBBPF_ERRNO__FORMAT;
+ dummy_var_btf_id = add_dummy_ksym_var(obj->btf);
+ if (dummy_var_btf_id < 0)
+ return dummy_var_btf_id;
+
n = sh.sh_size / sh.sh_entsize;
pr_debug("looking for externs among %d symbols...\n", n);
@@ -3200,6 +3260,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
sec_name = btf__name_by_offset(obj->btf, sec->name_off);
if (strcmp(sec_name, KCONFIG_SEC) == 0) {
+ if (btf_is_func(t)) {
+ pr_warn("extern function %s is unsupported under %s section\n",
+ ext->name, KCONFIG_SEC);
+ return -ENOTSUP;
+ }
kcfg_sec = sec;
ext->type = EXT_KCFG;
ext->kcfg.sz = btf__resolve_size(obj->btf, t->type);
@@ -3221,6 +3286,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
return -ENOTSUP;
}
} else if (strcmp(sec_name, KSYMS_SEC) == 0) {
+ if (btf_is_func(t) && ext->is_weak) {
+ pr_warn("extern weak function %s is unsupported\n",
+ ext->name);
+ return -ENOTSUP;
+ }
ksym_sec = sec;
ext->type = EXT_KSYM;
skip_mods_and_typedefs(obj->btf, t->type,
@@ -3247,7 +3317,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
* extern variables in DATASEC
*/
int int_btf_id = find_int_btf_id(obj->btf);
+ /* For extern function, a dummy_var added earlier
+ * will be used to replace the vs->type and
+ * its name string will be used to refill
+ * the missing param's name.
+ */
+ const struct btf_type *dummy_var;
+ dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id);
for (i = 0; i < obj->nr_extern; i++) {
ext = &obj->externs[i];
if (ext->type != EXT_KSYM)
@@ -3266,12 +3343,32 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
ext_name = btf__name_by_offset(obj->btf, vt->name_off);
ext = find_extern_by_name(obj, ext_name);
if (!ext) {
- pr_warn("failed to find extern definition for BTF var '%s'\n",
- ext_name);
+ pr_warn("failed to find extern definition for BTF %s '%s'\n",
+ btf_kind_str(vt), ext_name);
return -ESRCH;
}
- btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED;
- vt->type = int_btf_id;
+ if (btf_is_func(vt)) {
+ const struct btf_type *func_proto;
+ struct btf_param *param;
+ int j;
+
+ func_proto = btf__type_by_id(obj->btf,
+ vt->type);
+ param = btf_params(func_proto);
+ /* Reuse the dummy_var string if the
+ * func proto does not have param name.
+ */
+ for (j = 0; j < btf_vlen(func_proto); j++)
+ if (param[j].type && !param[j].name_off)
+ param[j].name_off =
+ dummy_var->name_off;
+ vs->type = dummy_var_btf_id;
+ vt->info &= ~0xffff;
+ vt->info |= BTF_FUNC_GLOBAL;
+ } else {
+ btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED;
+ vt->type = int_btf_id;
+ }
vs->offset = off;
vs->size = sizeof(int);
}
@@ -3403,31 +3500,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
reloc_desc->processed = false;
- /* sub-program call relocation */
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- if (insn->src_reg != BPF_PSEUDO_CALL) {
- pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name);
- return -LIBBPF_ERRNO__RELOC;
- }
- /* text_shndx can be 0, if no default "main" program exists */
- if (!shdr_idx || shdr_idx != obj->efile.text_shndx) {
- sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
- pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n",
- prog->name, sym_name, sym_sec_name);
- return -LIBBPF_ERRNO__RELOC;
- }
- if (sym->st_value % BPF_INSN_SZ) {
- pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n",
- prog->name, sym_name, (size_t)sym->st_value);
- return -LIBBPF_ERRNO__RELOC;
- }
- reloc_desc->type = RELO_CALL;
- reloc_desc->insn_idx = insn_idx;
- reloc_desc->sym_off = sym->st_value;
- return 0;
- }
-
- if (!is_ldimm64(insn)) {
+ if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) {
pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
prog->name, sym_name, insn_idx, insn->code);
return -LIBBPF_ERRNO__RELOC;
@@ -3450,12 +3523,39 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
}
pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n",
prog->name, i, ext->name, ext->sym_idx, insn_idx);
- reloc_desc->type = RELO_EXTERN;
+ if (insn->code == (BPF_JMP | BPF_CALL))
+ reloc_desc->type = RELO_EXTERN_FUNC;
+ else
+ reloc_desc->type = RELO_EXTERN_VAR;
reloc_desc->insn_idx = insn_idx;
reloc_desc->sym_off = i; /* sym_off stores extern index */
return 0;
}
+ /* sub-program call relocation */
+ if (is_call_insn(insn)) {
+ if (insn->src_reg != BPF_PSEUDO_CALL) {
+ pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+ /* text_shndx can be 0, if no default "main" program exists */
+ if (!shdr_idx || shdr_idx != obj->efile.text_shndx) {
+ sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
+ pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n",
+ prog->name, sym_name, sym_sec_name);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+ if (sym->st_value % BPF_INSN_SZ) {
+ pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n",
+ prog->name, sym_name, (size_t)sym->st_value);
+ return -LIBBPF_ERRNO__RELOC;
+ }
+ reloc_desc->type = RELO_CALL;
+ reloc_desc->insn_idx = insn_idx;
+ reloc_desc->sym_off = sym->st_value;
+ return 0;
+ }
+
if (!shdr_idx || shdr_idx >= SHN_LORESERVE) {
pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n",
prog->name, sym_name, shdr_idx);
@@ -5695,7 +5795,7 @@ poison:
/* poison second part of ldimm64 to avoid confusing error from
* verifier about "unknown opcode 00"
*/
- if (is_ldimm64(insn))
+ if (is_ldimm64_insn(insn))
bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1);
bpf_core_poison_insn(prog, relo_idx, insn_idx, insn);
return 0;
@@ -5771,7 +5871,7 @@ poison:
case BPF_LD: {
__u64 imm;
- if (!is_ldimm64(insn) ||
+ if (!is_ldimm64_insn(insn) ||
insn[0].src_reg != 0 || insn[0].off != 0 ||
insn_idx + 1 >= prog->insns_cnt ||
insn[1].code != 0 || insn[1].dst_reg != 0 ||
@@ -6213,7 +6313,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
insn[0].imm = obj->maps[relo->map_idx].fd;
relo->processed = true;
break;
- case RELO_EXTERN:
+ case RELO_EXTERN_VAR:
ext = &obj->externs[relo->sym_off];
if (ext->type == EXT_KCFG) {
insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;
@@ -6231,6 +6331,12 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
}
relo->processed = true;
break;
+ case RELO_EXTERN_FUNC:
+ ext = &obj->externs[relo->sym_off];
+ insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL;
+ insn[0].imm = ext->ksym.kernel_btf_id;
+ relo->processed = true;
+ break;
case RELO_SUBPROG_ADDR:
insn[0].src_reg = BPF_PSEUDO_FUNC;
/* will be handled as a follow up pass */
@@ -7351,6 +7457,7 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
{
char sym_type, sym_name[500];
unsigned long long sym_addr;
+ const struct btf_type *t;
struct extern_desc *ext;
int ret, err = 0;
FILE *f;
@@ -7377,6 +7484,10 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
if (!ext || ext->type != EXT_KSYM)
continue;
+ t = btf__type_by_id(obj->btf, ext->btf_id);
+ if (!btf_is_var(t))
+ continue;
+
if (ext->is_set && ext->ksym.addr != sym_addr) {
pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n",
sym_name, ext->ksym.addr, sym_addr);
@@ -7395,75 +7506,151 @@ out:
return err;
}
-static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
+static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name,
+ __u16 kind, struct btf **res_btf,
+ int *res_btf_fd)
{
- struct extern_desc *ext;
+ int i, id, btf_fd, err;
struct btf *btf;
- int i, j, id, btf_fd, err;
- for (i = 0; i < obj->nr_extern; i++) {
- const struct btf_type *targ_var, *targ_type;
- __u32 targ_type_id, local_type_id;
- const char *targ_var_name;
- int ret;
+ btf = obj->btf_vmlinux;
+ btf_fd = 0;
+ id = btf__find_by_name_kind(btf, ksym_name, kind);
- ext = &obj->externs[i];
- if (ext->type != EXT_KSYM || !ext->ksym.type_id)
- continue;
-
- btf = obj->btf_vmlinux;
- btf_fd = 0;
- id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
- if (id == -ENOENT) {
- err = load_module_btfs(obj);
- if (err)
- return err;
+ if (id == -ENOENT) {
+ err = load_module_btfs(obj);
+ if (err)
+ return err;
- for (j = 0; j < obj->btf_module_cnt; j++) {
- btf = obj->btf_modules[j].btf;
- /* we assume module BTF FD is always >0 */
- btf_fd = obj->btf_modules[j].fd;
- id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
- if (id != -ENOENT)
- break;
- }
- }
- if (id <= 0) {
- pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
- ext->name);
- return -ESRCH;
+ for (i = 0; i < obj->btf_module_cnt; i++) {
+ btf = obj->btf_modules[i].btf;
+ /* we assume module BTF FD is always >0 */
+ btf_fd = obj->btf_modules[i].fd;
+ id = btf__find_by_name_kind(btf, ksym_name, kind);
+ if (id != -ENOENT)
+ break;
}
+ }
+ if (id <= 0) {
+ pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
+ __btf_kind_str(kind), ksym_name);
+ return -ESRCH;
+ }
- /* find local type_id */
- local_type_id = ext->ksym.type_id;
+ *res_btf = btf;
+ *res_btf_fd = btf_fd;
+ return id;
+}
- /* find target type_id */
- targ_var = btf__type_by_id(btf, id);
- targ_var_name = btf__name_by_offset(btf, targ_var->name_off);
- targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id);
+static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
+ struct extern_desc *ext)
+{
+ const struct btf_type *targ_var, *targ_type;
+ __u32 targ_type_id, local_type_id;
+ const char *targ_var_name;
+ int id, btf_fd = 0, err;
+ struct btf *btf = NULL;
- ret = bpf_core_types_are_compat(obj->btf, local_type_id,
- btf, targ_type_id);
- if (ret <= 0) {
- const struct btf_type *local_type;
- const char *targ_name, *local_name;
+ id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd);
+ if (id < 0)
+ return id;
- local_type = btf__type_by_id(obj->btf, local_type_id);
- local_name = btf__name_by_offset(obj->btf, local_type->name_off);
- targ_name = btf__name_by_offset(btf, targ_type->name_off);
+ /* find local type_id */
+ local_type_id = ext->ksym.type_id;
- pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
- ext->name, local_type_id,
- btf_kind_str(local_type), local_name, targ_type_id,
- btf_kind_str(targ_type), targ_name);
- return -EINVAL;
- }
+ /* find target type_id */
+ targ_var = btf__type_by_id(btf, id);
+ targ_var_name = btf__name_by_offset(btf, targ_var->name_off);
+ targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id);
+
+ err = bpf_core_types_are_compat(obj->btf, local_type_id,
+ btf, targ_type_id);
+ if (err <= 0) {
+ const struct btf_type *local_type;
+ const char *targ_name, *local_name;
- ext->is_set = true;
- ext->ksym.kernel_btf_obj_fd = btf_fd;
- ext->ksym.kernel_btf_id = id;
- pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n",
- ext->name, id, btf_kind_str(targ_var), targ_var_name);
+ local_type = btf__type_by_id(obj->btf, local_type_id);
+ local_name = btf__name_by_offset(obj->btf, local_type->name_off);
+ targ_name = btf__name_by_offset(btf, targ_type->name_off);
+
+ pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
+ ext->name, local_type_id,
+ btf_kind_str(local_type), local_name, targ_type_id,
+ btf_kind_str(targ_type), targ_name);
+ return -EINVAL;
+ }
+
+ ext->is_set = true;
+ ext->ksym.kernel_btf_obj_fd = btf_fd;
+ ext->ksym.kernel_btf_id = id;
+ pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n",
+ ext->name, id, btf_kind_str(targ_var), targ_var_name);
+
+ return 0;
+}
+
+static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj,
+ struct extern_desc *ext)
+{
+ int local_func_proto_id, kfunc_proto_id, kfunc_id;
+ const struct btf_type *kern_func;
+ struct btf *kern_btf = NULL;
+ int ret, kern_btf_fd = 0;
+
+ local_func_proto_id = ext->ksym.type_id;
+
+ kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC,
+ &kern_btf, &kern_btf_fd);
+ if (kfunc_id < 0) {
+ pr_warn("extern (func ksym) '%s': not found in kernel BTF\n",
+ ext->name);
+ return kfunc_id;
+ }
+
+ if (kern_btf != obj->btf_vmlinux) {
+ pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n",
+ ext->name);
+ return -ENOTSUP;
+ }
+
+ kern_func = btf__type_by_id(kern_btf, kfunc_id);
+ kfunc_proto_id = kern_func->type;
+
+ ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id,
+ kern_btf, kfunc_proto_id);
+ if (ret <= 0) {
+ pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n",
+ ext->name, local_func_proto_id, kfunc_proto_id);
+ return -EINVAL;
+ }
+
+ ext->is_set = true;
+ ext->ksym.kernel_btf_obj_fd = kern_btf_fd;
+ ext->ksym.kernel_btf_id = kfunc_id;
+ pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n",
+ ext->name, kfunc_id);
+
+ return 0;
+}
+
+static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
+{
+ const struct btf_type *t;
+ struct extern_desc *ext;
+ int i, err;
+
+ for (i = 0; i < obj->nr_extern; i++) {
+ ext = &obj->externs[i];
+ if (ext->type != EXT_KSYM || !ext->ksym.type_id)
+ continue;
+
+ t = btf__type_by_id(obj->btf, ext->btf_id);
+ if (btf_is_var(t))
+ err = bpf_object__resolve_ksym_var_btf_id(obj, ext);
+ else
+ err = bpf_object__resolve_ksym_func_btf_id(obj, ext);
+ if (err)
+ return err;
}
return 0;
}
@@ -8270,6 +8457,16 @@ int bpf_object__btf_fd(const struct bpf_object *obj)
return obj->btf ? btf__fd(obj->btf) : -1;
}
+int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version)
+{
+ if (obj->loaded)
+ return -EINVAL;
+
+ obj->kern_version = kern_version;
+
+ return 0;
+}
+
int bpf_object__set_priv(struct bpf_object *obj, void *priv,
bpf_object_clear_priv_t clear_priv)
{
@@ -8458,7 +8655,7 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n)
return fd;
}
-enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog)
+enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog)
{
return prog->type;
}
@@ -8503,7 +8700,7 @@ BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT);
BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP);
enum bpf_attach_type
-bpf_program__get_expected_attach_type(struct bpf_program *prog)
+bpf_program__get_expected_attach_type(const struct bpf_program *prog)
{
return prog->expected_attach_type;
}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a1a424b9b8ff..f500621d28e5 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -143,6 +143,7 @@ LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj);
LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj);
+LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version);
struct btf;
LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj);
@@ -361,12 +362,12 @@ LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog);
-LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog);
+LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog);
LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
enum bpf_prog_type type);
LIBBPF_API enum bpf_attach_type
-bpf_program__get_expected_attach_type(struct bpf_program *prog);
+bpf_program__get_expected_attach_type(const struct bpf_program *prog);
LIBBPF_API void
bpf_program__set_expected_attach_type(struct bpf_program *prog,
enum bpf_attach_type type);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 279ae861f568..f5990f7208ce 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -359,4 +359,5 @@ LIBBPF_0.4.0 {
bpf_linker__finalize;
bpf_linker__free;
bpf_linker__new;
+ bpf_object__set_kversion;
} LIBBPF_0.3.0;
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 5e0aa2f2c0ca..46b16cbdcda3 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -94,6 +94,7 @@ struct dst_sec {
int sec_sym_idx;
/* section's DATASEC variable info, emitted on BTF finalization */
+ bool has_btf;
int sec_var_cnt;
struct btf_var_secinfo *sec_vars;
@@ -1436,6 +1437,16 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj)
continue;
dst_sec = &linker->secs[src_sec->dst_id];
+ /* Mark section as having BTF regardless of the presence of
+ * variables. In some cases compiler might generate empty BTF
+ * with no variables information. E.g., when promoting local
+ * array/structure variable initial values and BPF object
+ * file otherwise has no read-only static variables in
+ * .rodata. We need to preserve such empty BTF and just set
+ * correct section size.
+ */
+ dst_sec->has_btf = true;
+
t = btf__type_by_id(obj->btf, src_sec->sec_type_id);
src_var = btf_var_secinfos(t);
n = btf_vlen(t);
@@ -1717,7 +1728,7 @@ static int finalize_btf(struct bpf_linker *linker)
for (i = 1; i < linker->sec_cnt; i++) {
struct dst_sec *sec = &linker->secs[i];
- if (!sec->sec_var_cnt)
+ if (!sec->has_btf)
continue;
id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz);
@@ -1895,8 +1906,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
struct dst_sec *sec = &linker->secs[i];
sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info);
- if (sz < 0)
- return sz;
+ if (sz < 0) {
+ err = sz;
+ goto out;
+ }
cur += sz;
}
@@ -1910,8 +1923,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
struct dst_sec *sec = &linker->secs[i];
sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info);
- if (sz < 0)
- return sz;
+ if (sz < 0) {
+ err = sz;
+ goto out;
+ }
cur += sz;
}
@@ -1925,8 +1940,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
struct dst_sec *sec = &linker->secs[i];
sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info);
- if (sz < 0)
- return sz;
+ if (sz < 0) {
+ err = sz;
+ goto out;
+ }
cur += sz;
}
@@ -1937,8 +1954,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
if (err) {
linker->btf_ext = NULL;
pr_warn("failed to parse final .BTF.ext data: %d\n", err);
- return err;
+ goto out;
}
- return 0;
+out:
+ free(data);
+ return err;
}
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 526fc35c0b23..95da0e19f4a5 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -28,6 +28,7 @@
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
+#include <linux/if_link.h>
#include "bpf.h"
#include "libbpf.h"
@@ -70,8 +71,10 @@ struct xsk_ctx {
int ifindex;
struct list_head list;
int prog_fd;
+ int link_fd;
int xsks_map_fd;
char ifname[IFNAMSIZ];
+ bool has_bpf_link;
};
struct xsk_socket {
@@ -409,7 +412,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
static const int log_buf_size = 16 * 1024;
struct xsk_ctx *ctx = xsk->ctx;
char log_buf[log_buf_size];
- int err, prog_fd;
+ int prog_fd;
/* This is the fallback C-program:
* SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
@@ -499,14 +502,41 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
return prog_fd;
}
- err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
- xsk->config.xdp_flags);
+ ctx->prog_fd = prog_fd;
+ return 0;
+}
+
+static int xsk_create_bpf_link(struct xsk_socket *xsk)
+{
+ DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
+ struct xsk_ctx *ctx = xsk->ctx;
+ __u32 prog_id = 0;
+ int link_fd;
+ int err;
+
+ err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags);
if (err) {
- close(prog_fd);
+ pr_warn("getting XDP prog id failed\n");
return err;
}
- ctx->prog_fd = prog_fd;
+ /* if there's a netlink-based XDP prog loaded on interface, bail out
+ * and ask user to do the removal by himself
+ */
+ if (prog_id) {
+ pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
+ return -EINVAL;
+ }
+
+ opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
+
+ link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
+ if (link_fd < 0) {
+ pr_warn("bpf_link_create failed: %s\n", strerror(errno));
+ return link_fd;
+ }
+
+ ctx->link_fd = link_fd;
return 0;
}
@@ -625,7 +655,6 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
close(fd);
}
- err = 0;
if (ctx->xsks_map_fd == -1)
err = -ENOENT;
@@ -642,6 +671,98 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk)
&xsk->fd, 0);
}
+static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
+{
+ struct bpf_link_info link_info;
+ __u32 link_len;
+ __u32 id = 0;
+ int err;
+ int fd;
+
+ while (true) {
+ err = bpf_link_get_next_id(id, &id);
+ if (err) {
+ if (errno == ENOENT) {
+ err = 0;
+ break;
+ }
+ pr_warn("can't get next link: %s\n", strerror(errno));
+ break;
+ }
+
+ fd = bpf_link_get_fd_by_id(id);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
+ pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
+ err = -errno;
+ break;
+ }
+
+ link_len = sizeof(struct bpf_link_info);
+ memset(&link_info, 0, link_len);
+ err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
+ if (err) {
+ pr_warn("can't get link info: %s\n", strerror(errno));
+ close(fd);
+ break;
+ }
+ if (link_info.type == BPF_LINK_TYPE_XDP) {
+ if (link_info.xdp.ifindex == ifindex) {
+ *link_fd = fd;
+ if (prog_id)
+ *prog_id = link_info.prog_id;
+ break;
+ }
+ }
+ close(fd);
+ }
+
+ return err;
+}
+
+static bool xsk_probe_bpf_link(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
+ .flags = XDP_FLAGS_SKB_MODE);
+ struct bpf_load_program_attr prog_attr;
+ struct bpf_insn insns[2] = {
+ BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
+ BPF_EXIT_INSN()
+ };
+ int prog_fd, link_fd = -1;
+ int ifindex_lo = 1;
+ bool ret = false;
+ int err;
+
+ err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
+ if (err)
+ return ret;
+
+ if (link_fd >= 0)
+ return true;
+
+ memset(&prog_attr, 0, sizeof(prog_attr));
+ prog_attr.prog_type = BPF_PROG_TYPE_XDP;
+ prog_attr.insns = insns;
+ prog_attr.insns_cnt = ARRAY_SIZE(insns);
+ prog_attr.license = "GPL";
+
+ prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0);
+ if (prog_fd < 0)
+ return ret;
+
+ link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
+ close(prog_fd);
+
+ if (link_fd >= 0) {
+ ret = true;
+ close(link_fd);
+ }
+
+ return ret;
+}
+
static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
{
char ifname[IFNAMSIZ];
@@ -663,64 +784,108 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
ctx->ifname[IFNAMSIZ - 1] = 0;
xsk->ctx = ctx;
+ xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
return 0;
}
-static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp,
- int *xsks_map_fd)
+static int xsk_init_xdp_res(struct xsk_socket *xsk,
+ int *xsks_map_fd)
{
- struct xsk_socket *xsk = _xdp;
struct xsk_ctx *ctx = xsk->ctx;
- __u32 prog_id = 0;
int err;
- err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
- xsk->config.xdp_flags);
+ err = xsk_create_bpf_maps(xsk);
if (err)
return err;
- if (!prog_id) {
- err = xsk_create_bpf_maps(xsk);
- if (err)
- return err;
+ err = xsk_load_xdp_prog(xsk);
+ if (err)
+ goto err_load_xdp_prog;
- err = xsk_load_xdp_prog(xsk);
- if (err) {
- goto err_load_xdp_prog;
- }
- } else {
- ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
- if (ctx->prog_fd < 0)
- return -errno;
- err = xsk_lookup_bpf_maps(xsk);
- if (err) {
- close(ctx->prog_fd);
- return err;
- }
- }
+ if (ctx->has_bpf_link)
+ err = xsk_create_bpf_link(xsk);
+ else
+ err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd,
+ xsk->config.xdp_flags);
- if (xsk->rx) {
- err = xsk_set_bpf_maps(xsk);
- if (err) {
- if (!prog_id) {
- goto err_set_bpf_maps;
- } else {
- close(ctx->prog_fd);
- return err;
- }
- }
- }
- if (xsks_map_fd)
- *xsks_map_fd = ctx->xsks_map_fd;
+ if (err)
+ goto err_attach_xdp_prog;
- return 0;
+ if (!xsk->rx)
+ return err;
+
+ err = xsk_set_bpf_maps(xsk);
+ if (err)
+ goto err_set_bpf_maps;
+
+ return err;
err_set_bpf_maps:
+ if (ctx->has_bpf_link)
+ close(ctx->link_fd);
+ else
+ bpf_set_link_xdp_fd(ctx->ifindex, -1, 0);
+err_attach_xdp_prog:
close(ctx->prog_fd);
- bpf_set_link_xdp_fd(ctx->ifindex, -1, 0);
err_load_xdp_prog:
xsk_delete_bpf_maps(xsk);
+ return err;
+}
+
+static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
+{
+ struct xsk_ctx *ctx = xsk->ctx;
+ int err;
+
+ ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
+ if (ctx->prog_fd < 0) {
+ err = -errno;
+ goto err_prog_fd;
+ }
+ err = xsk_lookup_bpf_maps(xsk);
+ if (err)
+ goto err_lookup_maps;
+
+ if (!xsk->rx)
+ return err;
+
+ err = xsk_set_bpf_maps(xsk);
+ if (err)
+ goto err_set_maps;
+
+ return err;
+
+err_set_maps:
+ close(ctx->xsks_map_fd);
+err_lookup_maps:
+ close(ctx->prog_fd);
+err_prog_fd:
+ if (ctx->has_bpf_link)
+ close(ctx->link_fd);
+ return err;
+}
+
+static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
+{
+ struct xsk_socket *xsk = _xdp;
+ struct xsk_ctx *ctx = xsk->ctx;
+ __u32 prog_id = 0;
+ int err;
+
+ if (ctx->has_bpf_link)
+ err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
+ else
+ err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags);
+
+ if (err)
+ return err;
+
+ err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
+ xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
+
+ if (!err && xsks_map_fd)
+ *xsks_map_fd = ctx->xsks_map_fd;
return err;
}
@@ -898,6 +1063,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
}
}
xsk->ctx = ctx;
+ xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
if (rx) {
err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
@@ -1054,6 +1220,8 @@ void xsk_socket__delete(struct xsk_socket *xsk)
if (ctx->prog_fd != -1) {
xsk_delete_bpf_maps(xsk);
close(ctx->prog_fd);
+ if (ctx->has_bpf_link)
+ close(ctx->link_fd);
}
err = xsk_get_mmap_offsets(xsk->fd, &off);
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 3464161c8eea..65fe318d1e71 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -179,3 +179,17 @@ types, which was introduced in `Clang 13`__. The older Clang versions will
either crash when compiling these tests, or generate an incorrect BTF.
__ https://reviews.llvm.org/D83289
+
+Kernel function call test and Clang version
+===========================================
+
+Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support
+to generate extern function in BTF. It was introduced in `Clang 13`__.
+
+Without it, the error from compiling bpf selftests looks like:
+
+.. code-block:: console
+
+ libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2
+
+__ https://reviews.llvm.org/D93563
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index 91f0fac632f4..029589c008c9 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -187,16 +187,6 @@ struct tcp_congestion_ops {
typeof(y) __y = (y); \
__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
-static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
-{
- __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
-
- acked -= cwnd - tp->snd_cwnd;
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
-
- return acked;
-}
-
static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
{
return tp->snd_cwnd < tp->snd_ssthresh;
@@ -213,22 +203,7 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
}
-static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
-{
- /* If credits accumulated at a higher w, apply them gently now. */
- if (tp->snd_cwnd_cnt >= w) {
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd++;
- }
-
- tp->snd_cwnd_cnt += acked;
- if (tp->snd_cwnd_cnt >= w) {
- __u32 delta = tp->snd_cwnd_cnt / w;
-
- tp->snd_cwnd_cnt -= delta * w;
- tp->snd_cwnd += delta;
- }
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
-}
+extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
+extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
#endif
diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
new file mode 100644
index 000000000000..2e986e5e4cac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+struct test_lpm_key {
+ __u32 prefix;
+ struct in_addr ipv4;
+};
+
+static void map_batch_update(int map_fd, __u32 max_entries,
+ struct test_lpm_key *keys, int *values)
+{
+ __u32 i;
+ int err;
+ char buff[16] = { 0 };
+ DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+ .elem_flags = 0,
+ .flags = 0,
+ );
+
+ for (i = 0; i < max_entries; i++) {
+ keys[i].prefix = 32;
+ snprintf(buff, 16, "192.168.1.%d", i + 1);
+ inet_pton(AF_INET, buff, &keys[i].ipv4);
+ values[i] = i + 1;
+ }
+
+ err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts);
+ CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+ struct test_lpm_key *keys, int *values)
+{
+ char buff[16] = { 0 };
+ int lower_byte = 0;
+ __u32 i;
+
+ memset(visited, 0, max_entries * sizeof(*visited));
+ for (i = 0; i < max_entries; i++) {
+ inet_ntop(AF_INET, &keys[i].ipv4, buff, 32);
+ CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF,
+ "sscanf()", "error: i %d\n", i);
+ CHECK(lower_byte != values[i], "key/value checking",
+ "error: i %d key %s value %d\n", i, buff, values[i]);
+ visited[i] = 1;
+ }
+ for (i = 0; i < max_entries; i++) {
+ CHECK(visited[i] != 1, "visited checking",
+ "error: keys array at index %d missing\n", i);
+ }
+}
+
+void test_lpm_trie_map_batch_ops(void)
+{
+ struct bpf_create_map_attr xattr = {
+ .name = "lpm_trie_map",
+ .map_type = BPF_MAP_TYPE_LPM_TRIE,
+ .key_size = sizeof(struct test_lpm_key),
+ .value_size = sizeof(int),
+ .map_flags = BPF_F_NO_PREALLOC,
+ };
+ struct test_lpm_key *keys, key;
+ int map_fd, *values, *visited;
+ __u32 step, count, total, total_success;
+ const __u32 max_entries = 10;
+ __u64 batch = 0;
+ int err;
+ DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+ .elem_flags = 0,
+ .flags = 0,
+ );
+
+ xattr.max_entries = max_entries;
+ map_fd = bpf_create_map_xattr(&xattr);
+ CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n",
+ strerror(errno));
+
+ keys = malloc(max_entries * sizeof(struct test_lpm_key));
+ values = malloc(max_entries * sizeof(int));
+ visited = malloc(max_entries * sizeof(int));
+ CHECK(!keys || !values || !visited, "malloc()", "error:%s\n",
+ strerror(errno));
+
+ total_success = 0;
+ for (step = 1; step < max_entries; step++) {
+ map_batch_update(map_fd, max_entries, keys, values);
+ map_batch_verify(visited, max_entries, keys, values);
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * sizeof(*values));
+ batch = 0;
+ total = 0;
+ /* iteratively lookup/delete elements with 'step'
+ * elements each.
+ */
+ count = step;
+ while (true) {
+ err = bpf_map_lookup_batch(map_fd,
+ total ? &batch : NULL, &batch,
+ keys + total, values + total, &count, &opts);
+
+ CHECK((err && errno != ENOENT), "lookup with steps",
+ "error: %s\n", strerror(errno));
+
+ total += count;
+ if (err)
+ break;
+ }
+
+ CHECK(total != max_entries, "lookup with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+
+ map_batch_verify(visited, max_entries, keys, values);
+
+ total = 0;
+ count = step;
+ while (total < max_entries) {
+ if (max_entries - total < step)
+ count = max_entries - total;
+ err = bpf_map_delete_batch(map_fd, keys + total, &count,
+ &opts);
+ CHECK((err && errno != ENOENT), "delete batch",
+ "error: %s\n", strerror(errno));
+ total += count;
+ if (err)
+ break;
+ }
+ CHECK(total != max_entries, "delete with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+
+ /* check map is empty, errono == ENOENT */
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()",
+ "error: %s\n", strerror(errno));
+
+ total_success++;
+ }
+
+ CHECK(total_success == 0, "check total_success",
+ "unexpected failure\n");
+
+ printf("%s:PASS\n", __func__);
+
+ free(keys);
+ free(values);
+ free(visited);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
new file mode 100644
index 000000000000..7fc0951ee75f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "kfunc_call_test.skel.h"
+#include "kfunc_call_test_subprog.skel.h"
+
+static void test_main(void)
+{
+ struct kfunc_call_test *skel;
+ int prog_fd, retval, err;
+
+ skel = kfunc_call_test__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel"))
+ return;
+
+ prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, (__u32 *)&retval, NULL);
+ ASSERT_OK(err, "bpf_prog_test_run(test1)");
+ ASSERT_EQ(retval, 12, "test1-retval");
+
+ prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2);
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, (__u32 *)&retval, NULL);
+ ASSERT_OK(err, "bpf_prog_test_run(test2)");
+ ASSERT_EQ(retval, 3, "test2-retval");
+
+ kfunc_call_test__destroy(skel);
+}
+
+static void test_subprog(void)
+{
+ struct kfunc_call_test_subprog *skel;
+ int prog_fd, retval, err;
+
+ skel = kfunc_call_test_subprog__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel"))
+ return;
+
+ prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, (__u32 *)&retval, NULL);
+ ASSERT_OK(err, "bpf_prog_test_run(test1)");
+ ASSERT_EQ(retval, 10, "test1-retval");
+ ASSERT_NEQ(skel->data->active_res, -1, "active_res");
+ ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state");
+
+ kfunc_call_test_subprog__destroy(skel);
+}
+
+void test_kfunc_call(void)
+{
+ if (test__start_subtest("main"))
+ test_main();
+
+ if (test__start_subtest("subprog"))
+ test_subprog();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index b8b48cac2ac3..ab77596b64e3 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -7,6 +7,7 @@
#include "test_skmsg_load_helpers.skel.h"
#include "test_sockmap_update.skel.h"
#include "test_sockmap_invalid_update.skel.h"
+#include "test_sockmap_skb_verdict_attach.skel.h"
#include "bpf_iter_sockmap.skel.h"
#define TCP_REPAIR 19 /* TCP sock is under repair right now */
@@ -281,6 +282,39 @@ out:
bpf_iter_sockmap__destroy(skel);
}
+static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first,
+ enum bpf_attach_type second)
+{
+ struct test_sockmap_skb_verdict_attach *skel;
+ int err, map, verdict;
+
+ skel = test_sockmap_skb_verdict_attach__open_and_load();
+ if (CHECK_FAIL(!skel)) {
+ perror("test_sockmap_skb_verdict_attach__open_and_load");
+ return;
+ }
+
+ verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
+ map = bpf_map__fd(skel->maps.sock_map);
+
+ err = bpf_prog_attach(verdict, map, first, 0);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_prog_attach");
+ goto out;
+ }
+
+ err = bpf_prog_attach(verdict, map, second, 0);
+ assert(err == -1 && errno == EBUSY);
+
+ err = bpf_prog_detach2(verdict, map, first);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_prog_detach2");
+ goto out;
+ }
+out:
+ test_sockmap_skb_verdict_attach__destroy(skel);
+}
+
void test_sockmap_basic(void)
{
if (test__start_subtest("sockmap create_update_free"))
@@ -301,4 +335,10 @@ void test_sockmap_basic(void)
test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP);
if (test__start_subtest("sockhash copy"))
test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH);
+ if (test__start_subtest("sockmap skb_verdict attach")) {
+ test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT,
+ BPF_SK_SKB_STREAM_VERDICT);
+ test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT,
+ BPF_SK_SKB_VERDICT);
+ }
}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index c26e6bf05e49..648d9ae898d2 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1603,6 +1603,141 @@ static void test_reuseport(struct test_sockmap_listen *skel,
}
}
+static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
+ int verd_mapfd, enum redir_mode mode)
+{
+ const char *log_prefix = redir_mode_str(mode);
+ struct sockaddr_storage addr;
+ int c0, c1, p0, p1;
+ unsigned int pass;
+ socklen_t len;
+ int err, n;
+ u64 value;
+ u32 key;
+ char b;
+
+ zero_verdict_count(verd_mapfd);
+
+ p0 = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (p0 < 0)
+ return;
+ len = sizeof(addr);
+ err = xgetsockname(p0, sockaddr(&addr), &len);
+ if (err)
+ goto close_peer0;
+
+ c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
+ if (c0 < 0)
+ goto close_peer0;
+ err = xconnect(c0, sockaddr(&addr), len);
+ if (err)
+ goto close_cli0;
+ err = xgetsockname(c0, sockaddr(&addr), &len);
+ if (err)
+ goto close_cli0;
+ err = xconnect(p0, sockaddr(&addr), len);
+ if (err)
+ goto close_cli0;
+
+ p1 = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (p1 < 0)
+ goto close_cli0;
+ err = xgetsockname(p1, sockaddr(&addr), &len);
+ if (err)
+ goto close_cli0;
+
+ c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
+ if (c1 < 0)
+ goto close_peer1;
+ err = xconnect(c1, sockaddr(&addr), len);
+ if (err)
+ goto close_cli1;
+ err = xgetsockname(c1, sockaddr(&addr), &len);
+ if (err)
+ goto close_cli1;
+ err = xconnect(p1, sockaddr(&addr), len);
+ if (err)
+ goto close_cli1;
+
+ key = 0;
+ value = p0;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_cli1;
+
+ key = 1;
+ value = p1;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_cli1;
+
+ n = write(c1, "a", 1);
+ if (n < 0)
+ FAIL_ERRNO("%s: write", log_prefix);
+ if (n == 0)
+ FAIL("%s: incomplete write", log_prefix);
+ if (n < 1)
+ goto close_cli1;
+
+ key = SK_PASS;
+ err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
+ if (err)
+ goto close_cli1;
+ if (pass != 1)
+ FAIL("%s: want pass count 1, have %d", log_prefix, pass);
+
+ n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
+ if (n < 0)
+ FAIL_ERRNO("%s: read", log_prefix);
+ if (n == 0)
+ FAIL("%s: incomplete read", log_prefix);
+
+close_cli1:
+ xclose(c1);
+close_peer1:
+ xclose(p1);
+close_cli0:
+ xclose(c0);
+close_peer0:
+ xclose(p0);
+}
+
+static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
+ struct bpf_map *inner_map, int family)
+{
+ int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
+ int verdict_map = bpf_map__fd(skel->maps.verdict_map);
+ int sock_map = bpf_map__fd(inner_map);
+ int err;
+
+ err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
+ if (err)
+ return;
+
+ skel->bss->test_ingress = false;
+ udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+ REDIR_EGRESS);
+ skel->bss->test_ingress = true;
+ udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+ REDIR_INGRESS);
+
+ xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
+}
+
+static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
+ int family)
+{
+ const char *family_name, *map_name;
+ char s[MAX_TEST_NAME];
+
+ family_name = family_str(family);
+ map_name = map_type_str(map);
+ snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
+ if (!test__start_subtest(s))
+ return;
+ udp_skb_redir_to_connected(skel, map, family);
+}
+
static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
int family)
{
@@ -1611,6 +1746,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
test_redir(skel, map, family, SOCK_STREAM);
test_reuseport(skel, map, family, SOCK_STREAM);
test_reuseport(skel, map, family, SOCK_DGRAM);
+ test_udp_redir(skel, map, family);
}
void test_sockmap_listen(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c
index b54bc0c351b7..0252f61d611a 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_ima.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c
@@ -68,7 +68,8 @@ void test_test_ima(void)
goto close_prog;
snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir);
- if (CHECK_FAIL(system(cmd)))
+ err = system(cmd);
+ if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno))
goto close_clean;
err = run_measured_process(measured_dir, &skel->bss->monitored_pid);
@@ -81,7 +82,8 @@ void test_test_ima(void)
close_clean:
snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir);
- CHECK_FAIL(system(cmd));
+ err = system(cmd);
+ CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno);
close_prog:
ima__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
index 6939bfd8690f..f62df4d023f9 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -174,8 +174,8 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk)
* as long as it is used in one of the func ptr
* under SEC(".struct_ops").
*/
-SEC("struct_ops/bictcp_init")
-void BPF_PROG(bictcp_init, struct sock *sk)
+SEC("struct_ops/bpf_cubic_init")
+void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
@@ -192,7 +192,7 @@ void BPF_PROG(bictcp_init, struct sock *sk)
* The remaining tcp-cubic functions have an easier way.
*/
SEC("no-sec-prefix-bictcp_cwnd_event")
-void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event)
+void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_TX_START) {
struct bictcp *ca = inet_csk_ca(sk);
@@ -384,7 +384,7 @@ tcp_friendliness:
}
/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */
-void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -403,7 +403,7 @@ void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
-__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
+__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -420,7 +420,7 @@ __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state)
+void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
if (new_state == TCP_CA_Loss) {
bictcp_reset(inet_csk_ca(sk));
@@ -496,7 +496,7 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay)
}
}
-void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
+void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -525,21 +525,21 @@ void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
hystart_update(sk, delay);
}
-__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
+extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
- return max(tp->snd_cwnd, tp->prior_cwnd);
+__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
+{
+ return tcp_reno_undo_cwnd(sk);
}
SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
- .init = (void *)bictcp_init,
- .ssthresh = (void *)bictcp_recalc_ssthresh,
- .cong_avoid = (void *)bictcp_cong_avoid,
- .set_state = (void *)bictcp_state,
- .undo_cwnd = (void *)tcp_reno_undo_cwnd,
- .cwnd_event = (void *)bictcp_cwnd_event,
- .pkts_acked = (void *)bictcp_acked,
+ .init = (void *)bpf_cubic_init,
+ .ssthresh = (void *)bpf_cubic_recalc_ssthresh,
+ .cong_avoid = (void *)bpf_cubic_cong_avoid,
+ .set_state = (void *)bpf_cubic_state,
+ .undo_cwnd = (void *)bpf_cubic_undo_cwnd,
+ .cwnd_event = (void *)bpf_cubic_cwnd_event,
+ .pkts_acked = (void *)bpf_cubic_acked,
.name = "bpf_cubic",
};
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 4dc1a967776a..fd42247da8b4 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -194,22 +194,12 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
-SEC("struct_ops/tcp_reno_cong_avoid")
-void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!tcp_is_cwnd_limited(sk))
- return;
+extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
- /* In "safe" area, increase. */
- if (tcp_in_slow_start(tp)) {
- acked = tcp_slow_start(tp, acked);
- if (!acked)
- return;
- }
- /* In dangerous area, increase slowly. */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+SEC("struct_ops/dctcp_reno_cong_avoid")
+void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+{
+ tcp_reno_cong_avoid(sk, ack, acked);
}
SEC(".struct_ops")
@@ -226,7 +216,7 @@ struct tcp_congestion_ops dctcp = {
.in_ack_event = (void *)dctcp_update_alpha,
.cwnd_event = (void *)dctcp_cwnd_event,
.ssthresh = (void *)dctcp_ssthresh,
- .cong_avoid = (void *)tcp_reno_cong_avoid,
+ .cong_avoid = (void *)dctcp_cong_avoid,
.undo_cwnd = (void *)dctcp_cwnd_undo,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
new file mode 100644
index 000000000000..470f8723e463
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+ __u32 c, __u64 d) __ksym;
+
+SEC("classifier")
+int kfunc_call_test2(struct __sk_buff *skb)
+{
+ struct bpf_sock *sk = skb->sk;
+
+ if (!sk)
+ return -1;
+
+ sk = bpf_sk_fullsock(sk);
+ if (!sk)
+ return -1;
+
+ return bpf_kfunc_call_test2((struct sock *)sk, 1, 2);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+ struct bpf_sock *sk = skb->sk;
+ __u64 a = 1ULL << 32;
+ __u32 ret;
+
+ if (!sk)
+ return -1;
+
+ sk = bpf_sk_fullsock(sk);
+ if (!sk)
+ return -1;
+
+ a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4);
+ ret = a >> 32; /* ret should be 2 */
+ ret += (__u32)a; /* ret should be 12 */
+
+ return ret;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
new file mode 100644
index 000000000000..b2dcb7d9cb03
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern const int bpf_prog_active __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+ __u32 c, __u64 d) __ksym;
+extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
+int active_res = -1;
+int sk_state = -1;
+
+int __noinline f1(struct __sk_buff *skb)
+{
+ struct bpf_sock *sk = skb->sk;
+ int *active;
+
+ if (!sk)
+ return -1;
+
+ sk = bpf_sk_fullsock(sk);
+ if (!sk)
+ return -1;
+
+ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active,
+ bpf_get_smp_processor_id());
+ if (active)
+ active_res = *active;
+
+ sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state;
+
+ return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+ return f1(skb);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
index fa221141e9c1..a39eba9f5201 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
@@ -29,6 +29,7 @@ struct {
} verdict_map SEC(".maps");
static volatile bool test_sockmap; /* toggled by user-space */
+static volatile bool test_ingress; /* toggled by user-space */
SEC("sk_skb/stream_parser")
int prog_stream_parser(struct __sk_buff *skb)
@@ -55,6 +56,27 @@ int prog_stream_verdict(struct __sk_buff *skb)
return verdict;
}
+SEC("sk_skb/skb_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+ unsigned int *count;
+ __u32 zero = 0;
+ int verdict;
+
+ if (test_sockmap)
+ verdict = bpf_sk_redirect_map(skb, &sock_map, zero,
+ test_ingress ? BPF_F_INGRESS : 0);
+ else
+ verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero,
+ test_ingress ? BPF_F_INGRESS : 0);
+
+ count = bpf_map_lookup_elem(&verdict_map, &verdict);
+ if (count)
+ (*count)++;
+
+ return verdict;
+}
+
SEC("sk_msg")
int prog_msg_verdict(struct sk_msg_md *msg)
{
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
new file mode 100644
index 000000000000..2d31f66e4f23
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, __u64);
+} sock_map SEC(".maps");
+
+SEC("sk_skb/skb_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+ return SK_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index 56d4474e2c83..46633a3bfb0b 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -107,7 +107,7 @@ setup_vethPairs() {
echo "setting up ${VETH0}: namespace: ${NS0}"
fi
ip netns add ${NS1}
- ip link add ${VETH0} type veth peer name ${VETH1}
+ ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4
if [ -f /proc/net/if_inet6 ]; then
echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6
fi
@@ -118,6 +118,7 @@ setup_vethPairs() {
ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU}
ip link set ${VETH0} mtu ${MTU}
ip netns exec ${NS1} ip link set ${VETH1} up
+ ip netns exec ${NS1} ip link set dev lo up
ip link set ${VETH0} up
}
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index eb888c8479c3..336a749673d1 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -19,7 +19,7 @@
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 1,
@@ -136,7 +136,7 @@
{
"calls: wrong src reg",
.insns = {
- BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
},
@@ -397,7 +397,7 @@
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.fixup_map_hash_48b = { 3 },
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -1977,7 +1977,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
},
@@ -2003,7 +2003,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.errstr = "!read_ok",
.result = REJECT,
},
@@ -2028,7 +2028,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.errstr = "!read_ok",
.result = REJECT,
},
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 5cf361d8eb1c..17fe33a75034 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -85,7 +85,7 @@
BPF_MOV64_IMM(BPF_REG_0, 12),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -103,7 +103,7 @@
BPF_MOV64_IMM(BPF_REG_0, 12),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -121,7 +121,7 @@
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -137,7 +137,7 @@
BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 2,
@@ -152,7 +152,7 @@
BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 2,
diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 22554894db99..8889b3f55236 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -24,15 +24,15 @@ EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status"
usage()
{
cat <<EOF
-Usage: $0 [-i] [-d <output_dir>] -- [<command>]
+Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>]
<command> is the command you would normally run when you are in
tools/testing/selftests/bpf. e.g:
$0 -- ./test_progs -t test_lsm
-If no command is specified, "${DEFAULT_COMMAND}" will be run by
-default.
+If no command is specified and a debug shell (-s) is not requested,
+"${DEFAULT_COMMAND}" will be run by default.
If you build your kernel using KBUILD_OUTPUT= or O= options, these
can be passed as environment variables to the script:
@@ -49,6 +49,9 @@ Options:
-d) Update the output directory (default: ${OUTPUT_DIR})
-j) Number of jobs for compilation, similar to -j in make
(default: ${NUM_COMPILE_JOBS})
+ -s) Instead of powering off the VM, start an interactive
+ shell. If <command> is specified, the shell runs after
+ the command finishes executing
EOF
}
@@ -149,6 +152,7 @@ update_init_script()
local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d"
local init_script="${init_script_dir}/S50-startup"
local command="$1"
+ local exit_command="$2"
mount_image
@@ -162,9 +166,10 @@ EOF
fi
- sudo bash -c "cat >${init_script}" <<EOF
-#!/bin/bash
+ sudo bash -c "echo '#!/bin/bash' > ${init_script}"
+ if [[ "${command}" != "" ]]; then
+ sudo bash -c "cat >>${init_script}" <<EOF
# Have a default value in the exit status file
# incase the VM is forcefully stopped.
echo "130" > "/root/${EXIT_STATUS_FILE}"
@@ -175,9 +180,12 @@ echo "130" > "/root/${EXIT_STATUS_FILE}"
stdbuf -oL -eL ${command}
echo "\$?" > "/root/${EXIT_STATUS_FILE}"
} 2>&1 | tee "/root/${LOG_FILE}"
-poweroff -f
+# Ensure that the logs are written to disk
+sync
EOF
+ fi
+ sudo bash -c "echo ${exit_command} >> ${init_script}"
sudo chmod a+x "${init_script}"
unmount_image
}
@@ -277,8 +285,10 @@ main()
local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}"
local command="${DEFAULT_COMMAND}"
local update_image="no"
+ local exit_command="poweroff -f"
+ local debug_shell="no"
- while getopts 'hkid:j:' opt; do
+ while getopts 'hskid:j:' opt; do
case ${opt} in
i)
update_image="yes"
@@ -289,6 +299,11 @@ main()
j)
NUM_COMPILE_JOBS="$OPTARG"
;;
+ s)
+ command=""
+ debug_shell="yes"
+ exit_command="bash"
+ ;;
h)
usage
exit 0
@@ -307,7 +322,7 @@ main()
done
shift $((OPTIND -1))
- if [[ $# -eq 0 ]]; then
+ if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then
echo "No command specified, will run ${DEFAULT_COMMAND} in the vm"
else
command="$@"
@@ -355,10 +370,12 @@ main()
fi
update_selftests "${kernel_checkout}" "${make_command}"
- update_init_script "${command}"
+ update_init_script "${command}" "${exit_command}"
run_vm "${kernel_bzimage}"
- copy_logs
- echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
+ if [[ "${command}" != "" ]]; then
+ copy_logs
+ echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
+ fi
}
catch()
diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 1e21a3172687..1135fb980814 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -41,8 +41,12 @@
* Reduce the size of the RX ring to a fraction of the fill ring size.
* iv. fill queue empty
* Do not populate the fill queue and then try to receive pkts.
+ * f. bpf_link resource persistence
+ * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0,
+ * then remove xsk sockets from queue 0 on both veth interfaces and
+ * finally run a traffic on queues ids 1
*
- * Total tests: 10
+ * Total tests: 12
*
* Flow:
* -----
@@ -93,6 +97,13 @@ typedef __u16 __sum16;
#include "xdpxceiver.h"
#include "../kselftest.h"
+static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
+static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
+static const char *IP1 = "192.168.100.162";
+static const char *IP2 = "192.168.100.161";
+static const u16 UDP_PORT1 = 2020;
+static const u16 UDP_PORT2 = 2121;
+
static void __exit_with_error(int error, const char *file, const char *func, int line)
{
if (configured_mode == TEST_MODE_UNCONFIGURED) {
@@ -108,27 +119,12 @@ static void __exit_with_error(int error, const char *file, const char *func, int
#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
#define print_ksft_result(void)\
- (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\
+ (ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\
test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\
test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\
test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\
- test_type == TEST_TYPE_STATS ? "Stats" : ""))
-
-static void pthread_init_mutex(void)
-{
- pthread_mutex_init(&sync_mutex, NULL);
- pthread_mutex_init(&sync_mutex_tx, NULL);
- pthread_cond_init(&signal_rx_condition, NULL);
- pthread_cond_init(&signal_tx_condition, NULL);
-}
-
-static void pthread_destroy_mutex(void)
-{
- pthread_mutex_destroy(&sync_mutex);
- pthread_mutex_destroy(&sync_mutex_tx);
- pthread_cond_destroy(&signal_rx_condition);
- pthread_cond_destroy(&signal_tx_condition);
-}
+ test_type == TEST_TYPE_STATS ? "Stats" : "",\
+ test_type == TEST_TYPE_BPF_RES ? "BPF RES" : ""))
static void *memset32_htonl(void *dest, u32 val, u32 size)
{
@@ -147,24 +143,11 @@ static void *memset32_htonl(void *dest, u32 val, u32 size)
}
/*
- * This function code has been taken from
- * Linux kernel lib/checksum.c
- */
-static inline unsigned short from32to16(unsigned int x)
-{
- /* add up 16-bit and 16-bit for 16+c bit */
- x = (x & 0xffff) + (x >> 16);
- /* add up carry.. */
- x = (x & 0xffff) + (x >> 16);
- return x;
-}
-
-/*
* Fold a partial checksum
* This function code has been taken from
* Linux kernel include/asm-generic/checksum.h
*/
-static inline __u16 csum_fold(__u32 csum)
+static __u16 csum_fold(__u32 csum)
{
u32 sum = (__force u32)csum;
@@ -177,7 +160,7 @@ static inline __u16 csum_fold(__u32 csum)
* This function code has been taken from
* Linux kernel lib/checksum.c
*/
-static inline u32 from64to32(u64 x)
+static u32 from64to32(u64 x)
{
/* add up 32-bit and 32-bit for 32+c bit */
x = (x & 0xffffffff) + (x >> 32);
@@ -186,13 +169,11 @@ static inline u32 from64to32(u64 x)
return (u32)x;
}
-__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum);
-
/*
* This function code has been taken from
* Linux kernel lib/checksum.c
*/
-__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
+static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
{
unsigned long long s = (__force u32)sum;
@@ -210,13 +191,12 @@ __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u3
* This function has been taken from
* Linux kernel include/asm-generic/checksum.h
*/
-static inline __u16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
+static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
-static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt)
+static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt)
{
u32 csum = 0;
u32 cnt = 0;
@@ -271,9 +251,8 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE);
}
-static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size)
+static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx)
{
- int ret;
struct xsk_umem_config cfg = {
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
@@ -281,17 +260,22 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size)
.frame_headroom = frame_headroom,
.flags = XSK_UMEM__DEFAULT_FLAGS
};
+ int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ struct xsk_umem_info *umem;
+ int ret;
- data->umem = calloc(1, sizeof(struct xsk_umem_info));
- if (!data->umem)
+ umem = calloc(1, sizeof(struct xsk_umem_info));
+ if (!umem)
exit_with_error(errno);
- ret = xsk_umem__create(&data->umem->umem, buffer, size,
- &data->umem->fq, &data->umem->cq, &cfg);
+ ret = xsk_umem__create(&umem->umem, buffer, size,
+ &umem->fq, &umem->cq, &cfg);
if (ret)
exit_with_error(ret);
- data->umem->buffer = buffer;
+ umem->buffer = buffer;
+
+ data->umem_arr[idx] = umem;
}
static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
@@ -307,18 +291,19 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
}
-static int xsk_configure_socket(struct ifobject *ifobject)
+static int xsk_configure_socket(struct ifobject *ifobject, int idx)
{
struct xsk_socket_config cfg;
+ struct xsk_socket_info *xsk;
struct xsk_ring_cons *rxr;
struct xsk_ring_prod *txr;
int ret;
- ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info));
- if (!ifobject->xsk)
+ xsk = calloc(1, sizeof(struct xsk_socket_info));
+ if (!xsk)
exit_with_error(errno);
- ifobject->xsk->umem = ifobject->umem;
+ xsk->umem = ifobject->umem;
cfg.rx_size = rxqsize;
cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
cfg.libbpf_flags = 0;
@@ -326,19 +311,20 @@ static int xsk_configure_socket(struct ifobject *ifobject)
cfg.bind_flags = xdp_bind_flags;
if (test_type != TEST_TYPE_BIDI) {
- rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL;
- txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL;
+ rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL;
+ txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL;
} else {
- rxr = &ifobject->xsk->rx;
- txr = &ifobject->xsk->tx;
+ rxr = &xsk->rx;
+ txr = &xsk->tx;
}
- ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname,
- opt_queue, ifobject->umem->umem, rxr, txr, &cfg);
-
+ ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx,
+ ifobject->umem->umem, rxr, txr, &cfg);
if (ret)
return 1;
+ ifobject->xsk_arr[idx] = xsk;
+
return 0;
}
@@ -364,12 +350,15 @@ static void usage(const char *prog)
ksft_print_msg(str, prog);
}
-static bool switch_namespace(int idx)
+static int switch_namespace(const char *nsname)
{
char fqns[26] = "/var/run/netns/";
int nsfd;
- strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1);
+ if (!nsname || strlen(nsname) == 0)
+ return -1;
+
+ strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1);
nsfd = open(fqns, O_RDONLY);
if (nsfd == -1)
@@ -378,26 +367,9 @@ static bool switch_namespace(int idx)
if (setns(nsfd, 0) == -1)
exit_with_error(errno);
- return true;
-}
+ print_verbose("NS switched: %s\n", nsname);
-static void *nsswitchthread(void *args)
-{
- struct targs *targs = args;
-
- targs->retptr = false;
-
- if (switch_namespace(targs->idx)) {
- ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname);
- if (!ifdict[targs->idx]->ifindex) {
- ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n",
- __func__, ifdict[targs->idx]->ifname);
- } else {
- print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname);
- targs->retptr = true;
- }
- }
- pthread_exit(NULL);
+ return nsfd;
}
static int validate_interfaces(void)
@@ -409,33 +381,6 @@ static int validate_interfaces(void)
ret = false;
ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>.");
}
- if (strcmp(ifdict[i]->nsname, "")) {
- struct targs *targs;
-
- targs = malloc(sizeof(*targs));
- if (!targs)
- exit_with_error(errno);
-
- targs->idx = i;
- if (pthread_create(&ns_thread, NULL, nsswitchthread, targs))
- exit_with_error(errno);
-
- pthread_join(ns_thread, NULL);
-
- if (targs->retptr)
- print_verbose("NS switched: %s\n", ifdict[i]->nsname);
-
- free(targs);
- } else {
- ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname);
- if (!ifdict[i]->ifindex) {
- ksft_test_result_fail
- ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname);
- ret = false;
- } else {
- print_verbose("Interface found: %s\n", ifdict[i]->ifname);
- }
- }
}
return ret;
}
@@ -447,7 +392,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index);
+ c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index);
if (c == -1)
break;
@@ -467,9 +412,6 @@ static void parse_command_line(int argc, char **argv)
MAX_INTERFACES_NAMESPACE_CHARS);
interface_index++;
break;
- case 'q':
- opt_queue = atoi(optarg);
- break;
case 'D':
debug_pkt_dump = 1;
break;
@@ -506,7 +448,7 @@ static void kick_tx(struct xsk_socket_info *xsk)
exit_with_error(errno);
}
-static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
+static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
{
unsigned int rcvd;
u32 idx;
@@ -514,7 +456,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
if (!xsk->outstanding_tx)
return;
- if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx))
+ if (xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
@@ -602,16 +544,15 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
xsk_ring_prod__submit(&xsk->tx, batch_size);
if (!tx_invalid_test) {
xsk->outstanding_tx += batch_size;
- } else {
- if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx))
- kick_tx(xsk);
+ } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+ kick_tx(xsk);
}
*frameptr += batch_size;
*frameptr %= num_frames;
complete_tx_only(xsk, batch_size);
}
-static inline int get_batch_size(int pkt_cnt)
+static int get_batch_size(int pkt_cnt)
{
if (!opt_pkt_count)
return BATCH_SIZE;
@@ -667,45 +608,40 @@ static void tx_only_all(struct ifobject *ifobject)
static void worker_pkt_dump(void)
{
- struct in_addr ipaddr;
+ struct ethhdr *ethhdr;
+ struct iphdr *iphdr;
+ struct udphdr *udphdr;
+ char s[128];
+ int payload;
+ void *ptr;
fprintf(stdout, "---------------------------------------\n");
for (int iter = 0; iter < num_frames - 1; iter++) {
+ ptr = pkt_buf[iter]->payload;
+ ethhdr = ptr;
+ iphdr = ptr + sizeof(*ethhdr);
+ udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr);
+
/*extract L2 frame */
fprintf(stdout, "DEBUG>> L2: dst mac: ");
for (int i = 0; i < ETH_ALEN; i++)
- fprintf(stdout, "%02X", ((struct ethhdr *)
- pkt_buf[iter]->payload)->h_dest[i]);
+ fprintf(stdout, "%02X", ethhdr->h_dest[i]);
fprintf(stdout, "\nDEBUG>> L2: src mac: ");
for (int i = 0; i < ETH_ALEN; i++)
- fprintf(stdout, "%02X", ((struct ethhdr *)
- pkt_buf[iter]->payload)->h_source[i]);
+ fprintf(stdout, "%02X", ethhdr->h_source[i]);
/*extract L3 frame */
- fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n",
- ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl);
-
- ipaddr.s_addr =
- ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr;
- fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr));
-
- ipaddr.s_addr =
- ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr;
- fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr));
-
+ fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl);
+ fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n",
+ inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s)));
+ fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n",
+ inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s)));
/*extract L4 frame */
- fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n",
- ntohs(((struct udphdr *)(pkt_buf[iter]->payload +
- sizeof(struct ethhdr) +
- sizeof(struct iphdr)))->source));
-
- fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n",
- ntohs(((struct udphdr *)(pkt_buf[iter]->payload +
- sizeof(struct ethhdr) +
- sizeof(struct iphdr)))->dest));
+ fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source));
+ fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest));
/*extract L5 frame */
- int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE));
+ payload = *((uint32_t *)(ptr + PKT_HDR_SIZE));
if (payload == EOT) {
print_verbose("End-of-transmission frame received\n");
@@ -809,37 +745,69 @@ static void worker_pkt_validate(void)
}
}
-static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr,
- atomic_int *spinningptr)
+static void thread_common_ops(struct ifobject *ifobject, void *bufs)
{
+ int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
int ctr = 0;
int ret;
- xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
- ret = xsk_configure_socket(ifobject);
+ ifobject->ns_fd = switch_namespace(ifobject->nsname);
+
+ if (test_type == TEST_TYPE_BPF_RES)
+ umem_sz *= 2;
+
+ bufs = mmap(NULL, umem_sz,
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (bufs == MAP_FAILED)
+ exit_with_error(errno);
+
+ xsk_configure_umem(ifobject, bufs, 0);
+ ifobject->umem = ifobject->umem_arr[0];
+ ret = xsk_configure_socket(ifobject, 0);
/* Retry Create Socket if it fails as xsk_socket__create()
* is asynchronous
- *
- * Essential to lock Mutex here to prevent Tx thread from
- * entering before Rx and causing a deadlock
*/
- pthread_mutex_lock(mutexptr);
while (ret && ctr < SOCK_RECONF_CTR) {
- atomic_store(spinningptr, 1);
- xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
- ret = xsk_configure_socket(ifobject);
+ xsk_configure_umem(ifobject, bufs, 0);
+ ifobject->umem = ifobject->umem_arr[0];
+ ret = xsk_configure_socket(ifobject, 0);
usleep(USLEEP_MAX);
ctr++;
}
- atomic_store(spinningptr, 0);
- pthread_mutex_unlock(mutexptr);
if (ctr >= SOCK_RECONF_CTR)
exit_with_error(ret);
+
+ ifobject->umem = ifobject->umem_arr[0];
+ ifobject->xsk = ifobject->xsk_arr[0];
+
+ if (test_type == TEST_TYPE_BPF_RES) {
+ xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1);
+ ifobject->umem = ifobject->umem_arr[1];
+ ret = xsk_configure_socket(ifobject, 1);
+ }
+
+ ifobject->umem = ifobject->umem_arr[0];
+ ifobject->xsk = ifobject->xsk_arr[0];
+ print_verbose("Interface [%s] vector [%s]\n",
+ ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx");
+}
+
+static bool testapp_is_test_two_stepped(void)
+{
+ return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step;
+}
+
+static void testapp_cleanup_xsk_res(struct ifobject *ifobj)
+{
+ if (testapp_is_test_two_stepped()) {
+ xsk_socket__delete(ifobj->xsk->xsk);
+ (void)xsk_umem__delete(ifobj->umem->umem);
+ }
}
-static void *worker_testapp_validate(void *arg)
+static void *worker_testapp_validate_tx(void *arg)
{
struct udphdr *udp_hdr =
(struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr));
@@ -849,157 +817,97 @@ static void *worker_testapp_validate(void *arg)
struct generic_data data;
void *bufs = NULL;
- pthread_attr_setstacksize(&attr, THREAD_STACK);
-
- if (!bidi_pass) {
- bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE,
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (bufs == MAP_FAILED)
- exit_with_error(errno);
-
- if (strcmp(ifobject->nsname, ""))
- switch_namespace(ifobject->ifdict_index);
+ if (!second_step)
+ thread_common_ops(ifobject, bufs);
+
+ for (int i = 0; i < num_frames; i++) {
+ /*send EOT frame */
+ if (i == (num_frames - 1))
+ data.seqnum = -1;
+ else
+ data.seqnum = i;
+ gen_udp_hdr(&data, ifobject, udp_hdr);
+ gen_ip_hdr(ifobject, ip_hdr);
+ gen_udp_csum(udp_hdr, ip_hdr);
+ gen_eth_hdr(ifobject, eth_hdr);
+ gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
}
- if (ifobject->fv.vector == tx) {
- int spinningrxctr = 0;
+ print_verbose("Sending %d packets on interface %s\n",
+ (opt_pkt_count - 1), ifobject->ifname);
+ tx_only_all(ifobject);
- if (!bidi_pass)
- thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx);
-
- while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) {
- spinningrxctr++;
- usleep(USLEEP_MAX);
- }
+ testapp_cleanup_xsk_res(ifobject);
+ pthread_exit(NULL);
+}
- print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname);
- for (int i = 0; i < num_frames; i++) {
- /*send EOT frame */
- if (i == (num_frames - 1))
- data.seqnum = -1;
- else
- data.seqnum = i;
- gen_udp_hdr(&data, ifobject, udp_hdr);
- gen_ip_hdr(ifobject, ip_hdr);
- gen_udp_csum(udp_hdr, ip_hdr);
- gen_eth_hdr(ifobject, eth_hdr);
- gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
- }
+static void *worker_testapp_validate_rx(void *arg)
+{
+ struct ifobject *ifobject = (struct ifobject *)arg;
+ struct pollfd fds[MAX_SOCKS] = { };
+ void *bufs = NULL;
- print_verbose("Sending %d packets on interface %s\n",
- (opt_pkt_count - 1), ifobject->ifname);
- tx_only_all(ifobject);
- } else if (ifobject->fv.vector == rx) {
- struct pollfd fds[MAX_SOCKS] = { };
- int ret;
-
- if (!bidi_pass)
- thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx);
-
- print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname);
- if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
- xsk_populate_fill_ring(ifobject->umem);
-
- TAILQ_INIT(&head);
- if (debug_pkt_dump) {
- pkt_buf = calloc(num_frames, sizeof(*pkt_buf));
- if (!pkt_buf)
- exit_with_error(errno);
- }
+ if (!second_step)
+ thread_common_ops(ifobject, bufs);
- fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
- fds[0].events = POLLIN;
+ if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
+ xsk_populate_fill_ring(ifobject->umem);
- pthread_mutex_lock(&sync_mutex);
- pthread_cond_signal(&signal_rx_condition);
- pthread_mutex_unlock(&sync_mutex);
+ TAILQ_INIT(&head);
+ if (debug_pkt_dump) {
+ pkt_buf = calloc(num_frames, sizeof(*pkt_buf));
+ if (!pkt_buf)
+ exit_with_error(errno);
+ }
- while (1) {
- if (test_type == TEST_TYPE_POLL) {
- ret = poll(fds, 1, POLL_TMOUT);
- if (ret <= 0)
- continue;
- }
+ fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
+ fds[0].events = POLLIN;
- if (test_type != TEST_TYPE_STATS) {
- rx_pkt(ifobject->xsk, fds);
- worker_pkt_validate();
- } else {
- worker_stats_validate(ifobject);
- }
+ pthread_barrier_wait(&barr);
- if (sigvar)
- break;
+ while (1) {
+ if (test_type != TEST_TYPE_STATS) {
+ rx_pkt(ifobject->xsk, fds);
+ worker_pkt_validate();
+ } else {
+ worker_stats_validate(ifobject);
}
+ if (sigvar)
+ break;
+ }
- if (test_type != TEST_TYPE_STATS)
- print_verbose("Received %d packets on interface %s\n",
- pkt_counter, ifobject->ifname);
+ print_verbose("Received %d packets on interface %s\n",
+ pkt_counter, ifobject->ifname);
- if (test_type == TEST_TYPE_TEARDOWN)
- print_verbose("Destroying socket\n");
- }
+ if (test_type == TEST_TYPE_TEARDOWN)
+ print_verbose("Destroying socket\n");
- if ((test_type != TEST_TYPE_BIDI) || bidi_pass) {
- xsk_socket__delete(ifobject->xsk->xsk);
- (void)xsk_umem__delete(ifobject->umem->umem);
- }
+ testapp_cleanup_xsk_res(ifobject);
pthread_exit(NULL);
}
static void testapp_validate(void)
{
- struct timespec max_wait = { 0, 0 };
bool bidi = test_type == TEST_TYPE_BIDI;
+ bool bpf = test_type == TEST_TYPE_BPF_RES;
- pthread_attr_init(&attr);
- pthread_attr_setstacksize(&attr, THREAD_STACK);
-
- if ((test_type == TEST_TYPE_BIDI) && bidi_pass) {
- pthread_init_mutex();
- if (!switching_notify) {
- print_verbose("Switching Tx/Rx vectors\n");
- switching_notify++;
- }
- }
-
- pthread_mutex_lock(&sync_mutex);
+ if (pthread_barrier_init(&barr, NULL, 2))
+ exit_with_error(errno);
/*Spawn RX thread */
- if (!bidi || !bidi_pass) {
- if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1]))
- exit_with_error(errno);
- } else if (bidi && bidi_pass) {
- /*switch Tx/Rx vectors */
- ifdict[0]->fv.vector = rx;
- if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0]))
- exit_with_error(errno);
- }
-
- if (clock_gettime(CLOCK_REALTIME, &max_wait))
- exit_with_error(errno);
- max_wait.tv_sec += TMOUT_SEC;
+ pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx);
- if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT)
+ pthread_barrier_wait(&barr);
+ if (pthread_barrier_destroy(&barr))
exit_with_error(errno);
- pthread_mutex_unlock(&sync_mutex);
-
/*Spawn TX thread */
- if (!bidi || !bidi_pass) {
- if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0]))
- exit_with_error(errno);
- } else if (bidi && bidi_pass) {
- /*switch Tx/Rx vectors */
- ifdict[1]->fv.vector = tx;
- if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1]))
- exit_with_error(errno);
- }
+ pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx);
pthread_join(t1, NULL);
pthread_join(t0, NULL);
- if (debug_pkt_dump) {
+ if (debug_pkt_dump && test_type != TEST_TYPE_STATS) {
worker_pkt_dump();
for (int iter = 0; iter < num_frames - 1; iter++) {
free(pkt_buf[iter]->payload);
@@ -1008,20 +916,85 @@ static void testapp_validate(void)
free(pkt_buf);
}
- if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS))
+ if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS))
print_ksft_result();
}
-static void testapp_sockets(void)
+static void testapp_teardown(void)
{
- for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER);
- i++) {
+ int i;
+
+ for (i = 0; i < MAX_TEARDOWN_ITER; i++) {
pkt_counter = 0;
prev_pkt = -1;
sigvar = 0;
print_verbose("Creating socket\n");
testapp_validate();
- test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass;
+ }
+
+ print_ksft_result();
+}
+
+static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2)
+{
+ void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr;
+ enum fvector tmp_vector = ifobj1->fv.vector;
+
+ ifobj1->func_ptr = ifobj2->func_ptr;
+ ifobj1->fv.vector = ifobj2->fv.vector;
+
+ ifobj2->func_ptr = tmp_func_ptr;
+ ifobj2->fv.vector = tmp_vector;
+
+ ifdict_tx = ifobj1;
+ ifdict_rx = ifobj2;
+}
+
+static void testapp_bidi(void)
+{
+ for (int i = 0; i < MAX_BIDI_ITER; i++) {
+ pkt_counter = 0;
+ prev_pkt = -1;
+ sigvar = 0;
+ print_verbose("Creating socket\n");
+ testapp_validate();
+ if (!second_step) {
+ print_verbose("Switching Tx/Rx vectors\n");
+ swap_vectors(ifdict[1], ifdict[0]);
+ }
+ second_step = true;
+ }
+
+ swap_vectors(ifdict[0], ifdict[1]);
+
+ print_ksft_result();
+}
+
+static void swap_xsk_res(void)
+{
+ xsk_socket__delete(ifdict_tx->xsk->xsk);
+ xsk_umem__delete(ifdict_tx->umem->umem);
+ xsk_socket__delete(ifdict_rx->xsk->xsk);
+ xsk_umem__delete(ifdict_rx->umem->umem);
+ ifdict_tx->umem = ifdict_tx->umem_arr[1];
+ ifdict_tx->xsk = ifdict_tx->xsk_arr[1];
+ ifdict_rx->umem = ifdict_rx->umem_arr[1];
+ ifdict_rx->xsk = ifdict_rx->xsk_arr[1];
+}
+
+static void testapp_bpf_res(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_ITER; i++) {
+ pkt_counter = 0;
+ prev_pkt = -1;
+ sigvar = 0;
+ print_verbose("Creating socket\n");
+ testapp_validate();
+ if (!second_step)
+ swap_xsk_res();
+ second_step = true;
}
print_ksft_result();
@@ -1053,78 +1026,33 @@ static void testapp_stats(void)
print_ksft_result();
}
-static void init_iface_config(struct ifaceconfigobj *ifaceconfig)
-{
- /*Init interface0 */
- ifdict[0]->fv.vector = tx;
- memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN);
- memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN);
- ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr;
- ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr;
- ifdict[0]->dst_port = ifaceconfig->dst_port;
- ifdict[0]->src_port = ifaceconfig->src_port;
-
- /*Init interface1 */
- ifdict[1]->fv.vector = rx;
- memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN);
- memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN);
- ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr;
- ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr;
- ifdict[1]->dst_port = ifaceconfig->src_port;
- ifdict[1]->src_port = ifaceconfig->dst_port;
-}
-
-static void *nsdisablemodethread(void *args)
+static void init_iface(struct ifobject *ifobj, const char *dst_mac,
+ const char *src_mac, const char *dst_ip,
+ const char *src_ip, const u16 dst_port,
+ const u16 src_port, enum fvector vector)
{
- struct targs *targs = args;
+ struct in_addr ip;
- targs->retptr = false;
+ memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN);
+ memcpy(ifobj->src_mac, src_mac, ETH_ALEN);
- if (switch_namespace(targs->idx)) {
- targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags);
- } else {
- targs->retptr = errno;
- print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname);
- }
-
- pthread_exit(NULL);
-}
+ inet_aton(dst_ip, &ip);
+ ifobj->dst_ip = ip.s_addr;
-static void disable_xdp_mode(int mode)
-{
- int err = 0;
- __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode;
- char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv";
+ inet_aton(src_ip, &ip);
+ ifobj->src_ip = ip.s_addr;
- for (int i = 0; i < MAX_INTERFACES; i++) {
- if (strcmp(ifdict[i]->nsname, "")) {
- struct targs *targs;
-
- targs = malloc(sizeof(*targs));
- memset(targs, 0, sizeof(*targs));
- if (!targs)
- exit_with_error(errno);
-
- targs->idx = i;
- targs->flags = flags;
- if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs))
- exit_with_error(errno);
-
- pthread_join(ns_thread, NULL);
- err = targs->retptr;
- free(targs);
- } else {
- err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags);
- }
-
- if (err) {
- print_verbose("Failed to disable %s mode on interface %s\n",
- mode_str, ifdict[i]->ifname);
- exit_with_error(err);
- }
+ ifobj->dst_port = dst_port;
+ ifobj->src_port = src_port;
- print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname);
- configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB;
+ if (vector == tx) {
+ ifobj->fv.vector = tx;
+ ifobj->func_ptr = worker_testapp_validate_tx;
+ ifdict_tx = ifobj;
+ } else {
+ ifobj->fv.vector = rx;
+ ifobj->func_ptr = worker_testapp_validate_rx;
+ ifdict_rx = ifobj;
}
}
@@ -1135,72 +1063,70 @@ static void run_pkt_test(int mode, int type)
/* reset defaults after potential previous test */
xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
pkt_counter = 0;
- switching_notify = 0;
- bidi_pass = 0;
+ second_step = 0;
prev_pkt = -1;
- ifdict[0]->fv.vector = tx;
- ifdict[1]->fv.vector = rx;
sigvar = 0;
stat_test_type = -1;
rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+ configured_mode = mode;
+
switch (mode) {
case (TEST_MODE_SKB):
- if (configured_mode == TEST_MODE_DRV)
- disable_xdp_mode(XDP_FLAGS_DRV_MODE);
xdp_flags |= XDP_FLAGS_SKB_MODE;
break;
case (TEST_MODE_DRV):
- if (configured_mode == TEST_MODE_SKB)
- disable_xdp_mode(XDP_FLAGS_SKB_MODE);
xdp_flags |= XDP_FLAGS_DRV_MODE;
break;
default:
break;
}
- pthread_init_mutex();
-
- if (test_type == TEST_TYPE_STATS)
+ switch (test_type) {
+ case TEST_TYPE_STATS:
testapp_stats();
- else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI))
+ break;
+ case TEST_TYPE_TEARDOWN:
+ testapp_teardown();
+ break;
+ case TEST_TYPE_BIDI:
+ testapp_bidi();
+ break;
+ case TEST_TYPE_BPF_RES:
+ testapp_bpf_res();
+ break;
+ default:
testapp_validate();
- else
- testapp_sockets();
-
- pthread_destroy_mutex();
+ break;
+ }
}
int main(int argc, char **argv)
{
struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
+ bool failure = false;
+ int i, j;
if (setrlimit(RLIMIT_MEMLOCK, &_rlim))
exit_with_error(errno);
- const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
- const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
- const char *IP1 = "192.168.100.162";
- const char *IP2 = "192.168.100.161";
- u16 UDP_DST_PORT = 2020;
- u16 UDP_SRC_PORT = 2121;
- int i, j;
-
- ifaceconfig = malloc(sizeof(struct ifaceconfigobj));
- memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN);
- memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN);
- inet_aton(IP1, &ifaceconfig->dst_ip);
- inet_aton(IP2, &ifaceconfig->src_ip);
- ifaceconfig->dst_port = UDP_DST_PORT;
- ifaceconfig->src_port = UDP_SRC_PORT;
-
for (int i = 0; i < MAX_INTERFACES; i++) {
ifdict[i] = malloc(sizeof(struct ifobject));
if (!ifdict[i])
exit_with_error(errno);
ifdict[i]->ifdict_index = i;
+ ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *));
+ if (!ifdict[i]->xsk_arr) {
+ failure = true;
+ goto cleanup;
+ }
+ ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *));
+ if (!ifdict[i]->umem_arr) {
+ failure = true;
+ goto cleanup;
+ }
}
setlocale(LC_ALL, "");
@@ -1209,9 +1135,8 @@ int main(int argc, char **argv)
num_frames = ++opt_pkt_count;
- init_iface_config(ifaceconfig);
-
- disable_xdp_mode(XDP_FLAGS_DRV_MODE);
+ init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx);
+ init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx);
ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
@@ -1220,8 +1145,17 @@ int main(int argc, char **argv)
run_pkt_test(i, j);
}
- for (int i = 0; i < MAX_INTERFACES; i++)
+cleanup:
+ for (int i = 0; i < MAX_INTERFACES; i++) {
+ if (ifdict[i]->ns_fd != -1)
+ close(ifdict[i]->ns_fd);
+ free(ifdict[i]->xsk_arr);
+ free(ifdict[i]->umem_arr);
free(ifdict[i]);
+ }
+
+ if (failure)
+ exit_with_error(errno);
ksft_exit_pass();
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 30314ef305c2..6c428b276ab6 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -23,6 +23,7 @@
#define MAX_SOCKS 1
#define MAX_TEARDOWN_ITER 10
#define MAX_BIDI_ITER 2
+#define MAX_BPF_ITER 2
#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
sizeof(struct udphdr))
#define MIN_PKT_SIZE 64
@@ -33,14 +34,11 @@
#define IP_PKT_TOS 0x9
#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr))
#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
-#define TMOUT_SEC (3)
#define EOT (-1)
#define USLEEP_MAX 200000
-#define THREAD_STACK 60000000
#define SOCK_RECONF_CTR 10
#define BATCH_SIZE 64
#define POLL_TMOUT 1000
-#define NEED_WAKEUP true
#define DEFAULT_PKT_CNT 10000
#define RX_FULL_RXQSIZE 32
@@ -63,6 +61,7 @@ enum TEST_TYPES {
TEST_TYPE_TEARDOWN,
TEST_TYPE_BIDI,
TEST_TYPE_STATS,
+ TEST_TYPE_BPF_RES,
TEST_TYPE_MAX
};
@@ -77,11 +76,9 @@ enum STAT_TEST_TYPES {
static int configured_mode = TEST_MODE_UNCONFIGURED;
static u8 debug_pkt_dump;
static u32 num_frames;
-static u8 switching_notify;
-static u8 bidi_pass;
+static bool second_step;
static int test_type;
-static int opt_queue;
static int opt_pkt_count;
static u8 opt_verbose;
@@ -125,48 +122,32 @@ struct generic_data {
u32 seqnum;
};
-struct ifaceconfigobj {
- u8 dst_mac[ETH_ALEN];
- u8 src_mac[ETH_ALEN];
- struct in_addr dst_ip;
- struct in_addr src_ip;
- u16 src_port;
- u16 dst_port;
-} *ifaceconfig;
-
struct ifobject {
- int ifindex;
- int ifdict_index;
char ifname[MAX_INTERFACE_NAME_CHARS];
char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
- struct flow_vector fv;
struct xsk_socket_info *xsk;
+ struct xsk_socket_info **xsk_arr;
+ struct xsk_umem_info **umem_arr;
struct xsk_umem_info *umem;
- u8 dst_mac[ETH_ALEN];
- u8 src_mac[ETH_ALEN];
+ void *(*func_ptr)(void *arg);
+ struct flow_vector fv;
+ int ns_fd;
+ int ifdict_index;
u32 dst_ip;
u32 src_ip;
u16 src_port;
u16 dst_port;
+ u8 dst_mac[ETH_ALEN];
+ u8 src_mac[ETH_ALEN];
};
static struct ifobject *ifdict[MAX_INTERFACES];
+static struct ifobject *ifdict_rx;
+static struct ifobject *ifdict_tx;
/*threads*/
-atomic_int spinning_tx;
-atomic_int spinning_rx;
-pthread_mutex_t sync_mutex;
-pthread_mutex_t sync_mutex_tx;
-pthread_cond_t signal_rx_condition;
-pthread_cond_t signal_tx_condition;
-pthread_t t0, t1, ns_thread;
-pthread_attr_t attr;
-
-struct targs {
- u8 retptr;
- int idx;
- u32 flags;
-};
+pthread_barrier_t barr;
+pthread_t t0, t1;
TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
struct head_s *head_p;
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
index 39edce4f541c..2674ba20d524 100755
--- a/tools/testing/selftests/net/mptcp/diag.sh
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -5,8 +5,9 @@ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
ns="ns1-$rndh"
ksft_skip=4
test_cnt=1
+timeout_poll=100
+timeout_test=$((timeout_poll * 2 + 1))
ret=0
-pids=()
flush_pids()
{
@@ -14,18 +15,14 @@ flush_pids()
# give it some time
sleep 1.1
- for pid in ${pids[@]}; do
- [ -d /proc/$pid ] && kill -SIGUSR1 $pid >/dev/null 2>&1
- done
- pids=()
+ ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGUSR1 &>/dev/null
}
cleanup()
{
+ ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null
+
ip netns del $ns
- for pid in ${pids[@]}; do
- [ -d /proc/$pid ] && kill -9 $pid >/dev/null 2>&1
- done
}
ip -Version > /dev/null 2>&1
@@ -79,39 +76,57 @@ trap cleanup EXIT
ip netns add $ns
ip -n $ns link set dev lo up
-echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null &
+echo "a" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p 10000 -l -t ${timeout_poll} \
+ 0.0.0.0 >/dev/null &
sleep 0.1
-pids[0]=$!
chk_msk_nr 0 "no msk on netns creation"
-echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null &
+echo "b" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p 10000 -j -t ${timeout_poll} \
+ 127.0.0.1 >/dev/null &
sleep 0.1
-pids[1]=$!
chk_msk_nr 2 "after MPC handshake "
chk_msk_remote_key_nr 2 "....chk remote_key"
chk_msk_fallback_nr 0 "....chk no fallback"
flush_pids
-echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null &
-pids[0]=$!
+echo "a" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \
+ 0.0.0.0 >/dev/null &
sleep 0.1
-echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null &
-pids[1]=$!
+echo "b" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p 10001 -j -t ${timeout_poll} \
+ 127.0.0.1 >/dev/null &
sleep 0.1
chk_msk_fallback_nr 1 "check fallback"
flush_pids
NR_CLIENTS=100
for I in `seq 1 $NR_CLIENTS`; do
- echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null &
- pids[$((I*2))]=$!
+ echo "a" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p $((I+10001)) -l -w 10 \
+ -t ${timeout_poll} 0.0.0.0 >/dev/null &
done
sleep 0.1
for I in `seq 1 $NR_CLIENTS`; do
- echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null &
- pids[$((I*2 + 1))]=$!
+ echo "b" | \
+ timeout ${timeout_test} \
+ ip netns exec $ns \
+ ./mptcp_connect -p $((I+10001)) -w 10 \
+ -t ${timeout_poll} 127.0.0.1 >/dev/null &
done
sleep 1.5
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 10a030b53b23..385cdc98aed8 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -11,7 +11,8 @@ cin=""
cout=""
ksft_skip=4
capture=false
-timeout=30
+timeout_poll=30
+timeout_test=$((timeout_poll * 2 + 1))
ipv6=true
ethtool_random_on=true
tc_delay="$((RANDOM%50))"
@@ -273,7 +274,7 @@ check_mptcp_disabled()
ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0
local err=0
- LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
+ LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
grep -q "^socket: Protocol not available$" && err=1
ip netns delete ${disabled_ns}
@@ -425,19 +426,32 @@ do_transfer()
sleep 1
fi
+ NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
+ nstat -n
+ if [ ${listener_ns} != ${connector_ns} ]; then
+ NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \
+ nstat -n
+ fi
+
local stat_synrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX")
local stat_ackrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableACKRX")
local stat_cookietx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent")
local stat_cookierx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesRecv")
- ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" &
+ timeout ${timeout_test} \
+ ip netns exec ${listener_ns} \
+ ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
+ $extra_args $local_addr < "$sin" > "$sout" &
local spid=$!
wait_local_port_listen "${listener_ns}" "${port}"
local start
start=$(date +%s%3N)
- ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" &
+ timeout ${timeout_test} \
+ ip netns exec ${connector_ns} \
+ ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
+ $extra_args $connect_addr < "$cin" > "$cout" &
local cpid=$!
wait $cpid
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index d2273b88e72c..abeb24b7f8ec 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -8,7 +8,8 @@ cin=""
cinsent=""
cout=""
ksft_skip=4
-timeout=30
+timeout_poll=30
+timeout_test=$((timeout_poll * 2 + 1))
mptcp_connect=""
capture=0
do_all_tests=1
@@ -77,6 +78,7 @@ cleanup_partial()
for netns in "$ns1" "$ns2"; do
ip netns del $netns
+ rm -f /tmp/$netns.{nstat,out}
done
}
@@ -232,6 +234,11 @@ do_transfer()
sleep 1
fi
+ NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
+ nstat -n
+ NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \
+ nstat -n
+
if [ $speed = "fast" ]; then
mptcp_connect="./mptcp_connect -j"
elif [ $speed = "slow" ]; then
@@ -247,17 +254,26 @@ do_transfer()
local_addr="0.0.0.0"
fi
- ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port \
- -s ${srv_proto} ${local_addr} < "$sin" > "$sout" &
+ timeout ${timeout_test} \
+ ip netns exec ${listener_ns} \
+ $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
+ ${local_addr} < "$sin" > "$sout" &
spid=$!
sleep 1
if [ "$test_link_fail" -eq 0 ];then
- ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" &
+ timeout ${timeout_test} \
+ ip netns exec ${connector_ns} \
+ $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
+ $connect_addr < "$cin" > "$cout" &
else
- ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | tee "$cinsent" | \
- ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr > "$cout" &
+ ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \
+ tee "$cinsent" | \
+ timeout ${timeout_test} \
+ ip netns exec ${connector_ns} \
+ $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
+ $connect_addr > "$cout" &
fi
cpid=$!
@@ -373,12 +389,19 @@ do_transfer()
kill $cappid
fi
+ NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
+ nstat | grep Tcp > /tmp/${listener_ns}.out
+ NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \
+ nstat | grep Tcp > /tmp/${connector_ns}.out
+
if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
echo " client exit code $retc, server $rets" 1>&2
echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
- ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
+ ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port"
+ cat /tmp/${listener_ns}.out
echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
- ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
+ ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
+ cat /tmp/${connector_ns}.out
cat "$capout"
ret=1
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index f039ee57eb3c..3aeef3bcb101 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -7,7 +7,8 @@ ns2="ns2-$rndh"
ns3="ns3-$rndh"
capture=false
ksft_skip=4
-timeout=30
+timeout_poll=30
+timeout_test=$((timeout_poll * 2 + 1))
test_cnt=1
ret=0
bail=0
@@ -157,14 +158,20 @@ do_transfer()
sleep 1
fi
- ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" &
+ timeout ${timeout_test} \
+ ip netns exec ${ns3} \
+ ./mptcp_connect -jt ${timeout_poll} -l -p $port \
+ 0.0.0.0 < "$sin" > "$sout" &
local spid=$!
wait_local_port_listen "${ns3}" "${port}"
local start
start=$(date +%s%3N)
- ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" &
+ timeout ${timeout_test} \
+ ip netns exec ${ns1} \
+ ./mptcp_connect -jt ${timeout_poll} -p $port \
+ 10.0.3.3 < "$cin" > "$cout" &
local cpid=$!
wait $cpid