376 files changed, 41547 insertions, 6517 deletions
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 2f9d378edec3..532bb732bc6d 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -2,9 +2,10 @@
 bind_bhash
 bind_timewait
 bind_wildcard
-csum
+busy_poller
 cmsg_sender
 diag_uid
+epoll_busy_poll
 fin_ack_lat
 gro
 hwtstamp_config
@@ -16,8 +17,11 @@ ipsec
 ipv6_flowlabel
 ipv6_flowlabel_mgr
 log.txt
+msg_oob
 msg_zerocopy
+netlink-dumps
 nettest
+proc_net_pktgen
 psock_fanout
 psock_snd
 psock_tpacket
@@ -31,18 +35,21 @@ reuseport_dualstack
 rxtimestamp
 sctp_hello
 scm_pidfd
+scm_rights
 sk_bind_sendto_listen
 sk_connect_zero_addr
+sk_so_peek_off
+skf_net_off
 socket
 so_incoming_cpu
 so_netns_cookie
 so_txtime
+so_rcv_listener
 stress_reuseport_listen
 tap
 tcp_fastopen_backup_key
 tcp_inq
 tcp_mmap
-test_unix_oob
 timestamping
 tls
 toeplitz
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 7b6918d5f4af..ea84b88bcb30 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for net selftests
 
-CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
+CFLAGS +=  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES)
 # Additional include paths needed by kselftest.h
 CFLAGS += -I../
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
-	      rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh
+	      rtnetlink.sh xfrm_policy.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
 TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh
@@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh
 TEST_PROGS += txtimestamp.sh
 TEST_PROGS += vrf-xfrm-tests.sh
 TEST_PROGS += rxtimestamp.sh
-TEST_PROGS += devlink_port_split.py
 TEST_PROGS += drop_monitor_tests.sh
 TEST_PROGS += vrf_route_leaking.sh
 TEST_PROGS += bareudp.sh
@@ -32,9 +31,15 @@ TEST_PROGS += veth.sh
 TEST_PROGS += ioam6.sh
 TEST_PROGS += gro.sh
 TEST_PROGS += gre_gso.sh
+TEST_PROGS += gre_ipv6_lladdr.sh
 TEST_PROGS += cmsg_so_mark.sh
-TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh
+TEST_PROGS += cmsg_so_priority.sh
+TEST_PROGS += test_so_rcv.sh
+TEST_PROGS += cmsg_time.sh cmsg_ip.sh
 TEST_PROGS += netns-name.sh
+TEST_PROGS += link_netns.py
+TEST_PROGS += nl_netdev.py
+TEST_PROGS += rtnetlink.py
 TEST_PROGS += srv6_end_dt46_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt4_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
@@ -43,6 +48,8 @@ TEST_PROGS += srv6_hl2encap_red_l2vpn_test.sh
 TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh
 TEST_PROGS += srv6_end_x_next_csid_l3vpn_test.sh
 TEST_PROGS += srv6_end_flavors_test.sh
+TEST_PROGS += srv6_end_dx4_netfilter_test.sh
+TEST_PROGS += srv6_end_dx6_netfilter_test.sh
 TEST_PROGS += vrf_strict_mode_test.sh
 TEST_PROGS += arp_ndisc_evict_nocarrier.sh
 TEST_PROGS += ndisc_unsolicited_na_test.sh
@@ -53,7 +60,8 @@ TEST_PROGS += bind_bhash.sh
 TEST_PROGS += ip_local_port_range.sh
 TEST_PROGS += rps_default_mask.sh
 TEST_PROGS += big_tcp.sh
-TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh
+TEST_PROGS += netns-sysctl.sh
+TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh xfrm_policy_add_speed.sh
 TEST_GEN_FILES =  socket nettest
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
 TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite
@@ -67,89 +75,63 @@ TEST_GEN_FILES += ipsec
 TEST_GEN_FILES += ioam6_parser
 TEST_GEN_FILES += gro
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
-TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap
+TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll
 TEST_GEN_FILES += toeplitz
 TEST_GEN_FILES += cmsg_sender
 TEST_GEN_FILES += stress_reuseport_listen
+TEST_GEN_FILES += so_rcv_listener
 TEST_PROGS += test_vxlan_vnifiltering.sh
 TEST_GEN_FILES += io_uring_zerocopy_tx
 TEST_PROGS += io_uring_zerocopy_tx.sh
 TEST_GEN_FILES += bind_bhash
 TEST_GEN_PROGS += sk_bind_sendto_listen
 TEST_GEN_PROGS += sk_connect_zero_addr
+TEST_GEN_PROGS += sk_so_peek_off
 TEST_PROGS += test_ingress_egress_chaining.sh
 TEST_GEN_PROGS += so_incoming_cpu
 TEST_PROGS += sctp_vrf.sh
 TEST_GEN_FILES += sctp_hello
-TEST_GEN_FILES += csum
-TEST_GEN_FILES += nat6to4.o
-TEST_GEN_FILES += xdp_dummy.o
 TEST_GEN_FILES += ip_local_port_range
-TEST_GEN_FILES += bind_wildcard
+TEST_GEN_PROGS += bind_wildcard
+TEST_GEN_PROGS += bind_timewait
 TEST_PROGS += test_vxlan_mdb.sh
 TEST_PROGS += test_bridge_neigh_suppress.sh
 TEST_PROGS += test_vxlan_nolocalbypass.sh
 TEST_PROGS += test_bridge_backup_port.sh
-TEST_PROGS += fdb_flush.sh
+TEST_PROGS += fdb_flush.sh fdb_notify.sh
 TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
+TEST_PROGS += vlan_bridge_binding.sh
+TEST_PROGS += bpf_offload.py
+TEST_PROGS += ipv6_route_update_soft_lockup.sh
+TEST_PROGS += busy_poll_test.sh
+TEST_GEN_PROGS += proc_net_pktgen
+TEST_PROGS += lwt_dst_cache_ref_loop.sh
+TEST_PROGS += skf_net_off.sh
+TEST_GEN_FILES += skf_net_off
+
+# YNL files, must be before "include ..lib.mk"
+YNL_GEN_FILES := busy_poller netlink-dumps
+TEST_GEN_FILES += $(YNL_GEN_FILES)
 
 TEST_FILES := settings
-TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
+TEST_FILES += in_netns.sh lib.sh setup_loopback.sh setup_veth.sh
+
+TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c))
 
 TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
+# YNL build
+YNL_GENS := netdev
+include ynl.mk
+
+$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
 $(OUTPUT)/bind_bhash: LDLIBS += -lpthread
 $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/
 
-# Rules to generate bpf objs
-CLANG ?= clang
-SCRATCH_DIR := $(OUTPUT)/tools
-BUILD_DIR := $(SCRATCH_DIR)/build
-BPFDIR := $(abspath ../../../lib/bpf)
-APIDIR := $(abspath ../../../include/uapi)
-
-CCINCLUDE += -I../bpf
-CCINCLUDE += -I../../../../usr/include/
-CCINCLUDE += -I$(SCRATCH_DIR)/include
-
-BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
-
-MAKE_DIRS := $(BUILD_DIR)/libbpf
-$(MAKE_DIRS):
-	mkdir -p $@
-
-# Get Clang's default includes on this system, as opposed to those seen by
-# '--target=bpf'. This fixes "missing" files on some architectures/distros,
-# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
-#
-# Use '-idirafter': Don't interfere with include mechanics except where the
-# build would have failed anyways.
-define get_sys_includes
-$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
-	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
-$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
-endef
-
-ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
-endif
-
-CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
-
-$(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o: $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS)
-	$(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
-
-$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-	   $(APIDIR)/linux/bpf.h					       \
-	   | $(BUILD_DIR)/libbpf
-	$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/     \
-		    EXTRA_CFLAGS='-g -O0'				       \
-		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
-
-EXTRA_CLEAN := $(SCRATCH_DIR)
+include bpf.mk
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index 221c387a7d7f..50584479540b 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -1,4 +1,4 @@
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd
+TEST_GEN_PROGS := diag_uid msg_oob scm_pidfd scm_rights unix_connect
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/config b/tools/testing/selftests/net/af_unix/config
new file mode 100644
index 000000000000..37368567768c
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/config
@@ -0,0 +1,3 @@
+CONFIG_UNIX=y
+CONFIG_AF_UNIX_OOB=y
+CONFIG_UNIX_DIAG=m
diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c
new file mode 100644
index 000000000000..3ed3882a93b8
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/msg_oob.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <netinet/in.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/signalfd.h>
+#include <sys/socket.h>
+
+#include "../../kselftest_harness.h"
+
+#define BUF_SZ	32
+
+FIXTURE(msg_oob)
+{
+	int fd[4];		/* 0: AF_UNIX sender
+				 * 1: AF_UNIX receiver
+				 * 2: TCP sender
+				 * 3: TCP receiver
+				 */
+	int signal_fd;
+	int epoll_fd[2];	/* 0: AF_UNIX receiver
+				 * 1: TCP receiver
+				 */
+	bool tcp_compliant;
+};
+
+FIXTURE_VARIANT(msg_oob)
+{
+	bool peek;
+};
+
+FIXTURE_VARIANT_ADD(msg_oob, no_peek)
+{
+	.peek = false,
+};
+
+FIXTURE_VARIANT_ADD(msg_oob, peek)
+{
+	.peek = true
+};
+
+static void create_unix_socketpair(struct __test_metadata *_metadata,
+				   FIXTURE_DATA(msg_oob) *self)
+{
+	int ret;
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, self->fd);
+	ASSERT_EQ(ret, 0);
+}
+
+static void create_tcp_socketpair(struct __test_metadata *_metadata,
+				  FIXTURE_DATA(msg_oob) *self)
+{
+	struct sockaddr_in addr;
+	socklen_t addrlen;
+	int listen_fd;
+	int ret;
+
+	listen_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(listen_fd, 0);
+
+	ret = listen(listen_fd, -1);
+	ASSERT_EQ(ret, 0);
+
+	addrlen = sizeof(addr);
+	ret = getsockname(listen_fd, (struct sockaddr *)&addr, &addrlen);
+	ASSERT_EQ(ret, 0);
+
+	self->fd[2] = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(self->fd[2], 0);
+
+	ret = connect(self->fd[2], (struct sockaddr *)&addr, addrlen);
+	ASSERT_EQ(ret, 0);
+
+	self->fd[3] = accept(listen_fd, (struct sockaddr *)&addr, &addrlen);
+	ASSERT_GE(self->fd[3], 0);
+
+	ret = fcntl(self->fd[3], F_SETFL, O_NONBLOCK);
+	ASSERT_EQ(ret, 0);
+}
+
+static void setup_sigurg(struct __test_metadata *_metadata,
+			 FIXTURE_DATA(msg_oob) *self)
+{
+	struct signalfd_siginfo siginfo;
+	int pid = getpid();
+	sigset_t mask;
+	int i, ret;
+
+	for (i = 0; i < 2; i++) {
+		ret = ioctl(self->fd[i * 2 + 1], FIOSETOWN, &pid);
+		ASSERT_EQ(ret, 0);
+	}
+
+	ret = sigemptyset(&mask);
+	ASSERT_EQ(ret, 0);
+
+	ret = sigaddset(&mask, SIGURG);
+	ASSERT_EQ(ret, 0);
+
+	ret = sigprocmask(SIG_BLOCK, &mask, NULL);
+	ASSERT_EQ(ret, 0);
+
+	self->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK);
+	ASSERT_GE(self->signal_fd, 0);
+
+	ret = read(self->signal_fd, &siginfo, sizeof(siginfo));
+	ASSERT_EQ(ret, -1);
+}
+
+static void setup_epollpri(struct __test_metadata *_metadata,
+			   FIXTURE_DATA(msg_oob) *self)
+{
+	struct epoll_event event = {
+		.events = EPOLLPRI,
+	};
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		int ret;
+
+		self->epoll_fd[i] = epoll_create1(0);
+		ASSERT_GE(self->epoll_fd[i], 0);
+
+		ret = epoll_ctl(self->epoll_fd[i], EPOLL_CTL_ADD, self->fd[i * 2 + 1], &event);
+		ASSERT_EQ(ret, 0);
+	}
+}
+
+static void close_sockets(FIXTURE_DATA(msg_oob) *self)
+{
+	int i;
+
+	for (i = 0; i < 4; i++)
+		close(self->fd[i]);
+}
+
+FIXTURE_SETUP(msg_oob)
+{
+	create_unix_socketpair(_metadata, self);
+	create_tcp_socketpair(_metadata, self);
+
+	setup_sigurg(_metadata, self);
+	setup_epollpri(_metadata, self);
+
+	self->tcp_compliant = true;
+}
+
+FIXTURE_TEARDOWN(msg_oob)
+{
+	close_sockets(self);
+}
+
+static void __epollpair(struct __test_metadata *_metadata,
+			FIXTURE_DATA(msg_oob) *self,
+			bool oob_remaining)
+{
+	struct epoll_event event[2] = {};
+	int i, ret[2];
+
+	for (i = 0; i < 2; i++)
+		ret[i] = epoll_wait(self->epoll_fd[i], &event[i], 1, 0);
+
+	ASSERT_EQ(ret[0], oob_remaining);
+
+	if (self->tcp_compliant)
+		ASSERT_EQ(ret[0], ret[1]);
+
+	if (oob_remaining) {
+		ASSERT_EQ(event[0].events, EPOLLPRI);
+
+		if (self->tcp_compliant)
+			ASSERT_EQ(event[0].events, event[1].events);
+	}
+}
+
+static void __sendpair(struct __test_metadata *_metadata,
+		       FIXTURE_DATA(msg_oob) *self,
+		       const void *buf, size_t len, int flags)
+{
+	int i, ret[2];
+
+	for (i = 0; i < 2; i++) {
+		struct signalfd_siginfo siginfo = {};
+		int bytes;
+
+		ret[i] = send(self->fd[i * 2], buf, len, flags);
+
+		bytes = read(self->signal_fd, &siginfo, sizeof(siginfo));
+
+		if (flags & MSG_OOB) {
+			ASSERT_EQ(bytes, sizeof(siginfo));
+			ASSERT_EQ(siginfo.ssi_signo, SIGURG);
+
+			bytes = read(self->signal_fd, &siginfo, sizeof(siginfo));
+		}
+
+		ASSERT_EQ(bytes, -1);
+	}
+
+	ASSERT_EQ(ret[0], len);
+	ASSERT_EQ(ret[0], ret[1]);
+}
+
+static void __recvpair(struct __test_metadata *_metadata,
+		       FIXTURE_DATA(msg_oob) *self,
+		       const char *expected_buf, int expected_len,
+		       int buf_len, int flags)
+{
+	int i, ret[2], recv_errno[2], expected_errno = 0;
+	char recv_buf[2][BUF_SZ] = {};
+	bool printed = false;
+
+	ASSERT_GE(BUF_SZ, buf_len);
+
+	errno = 0;
+
+	for (i = 0; i < 2; i++) {
+		ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags);
+		recv_errno[i] = errno;
+	}
+
+	if (expected_len < 0) {
+		expected_errno = -expected_len;
+		expected_len = -1;
+	}
+
+	if (ret[0] != expected_len || recv_errno[0] != expected_errno) {
+		TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]);
+		TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf);
+
+		ASSERT_EQ(ret[0], expected_len);
+		ASSERT_EQ(recv_errno[0], expected_errno);
+	}
+
+	if (ret[0] != ret[1] || recv_errno[0] != recv_errno[1]) {
+		TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]);
+		TH_LOG("TCP     :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]);
+
+		printed = true;
+
+		if (self->tcp_compliant) {
+			ASSERT_EQ(ret[0], ret[1]);
+			ASSERT_EQ(recv_errno[0], recv_errno[1]);
+		}
+	}
+
+	if (expected_len >= 0) {
+		int cmp;
+
+		cmp = strncmp(expected_buf, recv_buf[0], expected_len);
+		if (cmp) {
+			TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]);
+			TH_LOG("Expected:%s", expected_errno ? strerror(expected_errno) : expected_buf);
+
+			ASSERT_EQ(cmp, 0);
+		}
+
+		cmp = strncmp(recv_buf[0], recv_buf[1], expected_len);
+		if (cmp) {
+			if (!printed) {
+				TH_LOG("AF_UNIX :%s", ret[0] < 0 ? strerror(recv_errno[0]) : recv_buf[0]);
+				TH_LOG("TCP     :%s", ret[1] < 0 ? strerror(recv_errno[1]) : recv_buf[1]);
+			}
+
+			if (self->tcp_compliant)
+				ASSERT_EQ(cmp, 0);
+		}
+	}
+}
+
+static void __setinlinepair(struct __test_metadata *_metadata,
+			    FIXTURE_DATA(msg_oob) *self)
+{
+	int i, oob_inline = 1;
+
+	for (i = 0; i < 2; i++) {
+		int ret;
+
+		ret = setsockopt(self->fd[i * 2 + 1], SOL_SOCKET, SO_OOBINLINE,
+				 &oob_inline, sizeof(oob_inline));
+		ASSERT_EQ(ret, 0);
+	}
+}
+
+static void __siocatmarkpair(struct __test_metadata *_metadata,
+			     FIXTURE_DATA(msg_oob) *self,
+			     bool oob_head)
+{
+	int answ[2] = {};
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		int ret;
+
+		ret = ioctl(self->fd[i * 2 + 1], SIOCATMARK, &answ[i]);
+		ASSERT_EQ(ret, 0);
+	}
+
+	ASSERT_EQ(answ[0], oob_head);
+
+	if (self->tcp_compliant)
+		ASSERT_EQ(answ[0], answ[1]);
+}
+
+#define sendpair(buf, len, flags)					\
+	__sendpair(_metadata, self, buf, len, flags)
+
+#define recvpair(expected_buf, expected_len, buf_len, flags)		\
+	do {								\
+		if (variant->peek)					\
+			__recvpair(_metadata, self,			\
+				   expected_buf, expected_len,		\
+				   buf_len, (flags) | MSG_PEEK);	\
+		__recvpair(_metadata, self,				\
+			   expected_buf, expected_len, buf_len, flags);	\
+	} while (0)
+
+#define epollpair(oob_remaining)					\
+	__epollpair(_metadata, self, oob_remaining)
+
+#define siocatmarkpair(oob_head)					\
+	__siocatmarkpair(_metadata, self, oob_head)
+
+#define setinlinepair()							\
+	__setinlinepair(_metadata, self)
+
+#define tcp_incompliant							\
+	for (self->tcp_compliant = false;				\
+	     self->tcp_compliant == false;				\
+	     self->tcp_compliant = true)
+
+TEST_F(msg_oob, non_oob)
+{
+	sendpair("x", 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, oob)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("x", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+}
+
+TEST_F(msg_oob, oob_drop)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("", -EAGAIN, 1, 0);		/* Drop OOB. */
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, oob_ahead)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 4, 0);
+	epollpair(false);
+	siocatmarkpair(true);
+}
+
+TEST_F(msg_oob, oob_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 5, 0);		/* Break at OOB even with enough buffer. */
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	recvpair("", -EAGAIN, 1, 0);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, oob_ahead_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("world", 5, 0);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 9, 0);		/* Break at OOB even after it's recv()ed. */
+	epollpair(false);
+	siocatmarkpair(true);
+
+	recvpair("world", 5, 5, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, oob_break_drop)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("world", 5, 0);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 10, 0);		/* Break at OOB even with enough buffer. */
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("world", 5, 10, 0);		/* Drop OOB and recv() the next skb. */
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, ex_oob_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("wor", 3, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("ld", 2, 0);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hellowo", 7, 10, 0);		/* Break at OOB but not at ex-OOB. */
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("r", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	recvpair("ld", 2, 2, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, ex_oob_drop)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);		/* TCP drops "x" at this moment. */
+	epollpair(true);
+
+	tcp_incompliant {
+		siocatmarkpair(false);
+
+		recvpair("x", 1, 1, 0);		/* TCP drops "y" by passing through it. */
+		epollpair(true);
+		siocatmarkpair(true);
+
+		recvpair("y", 1, 1, MSG_OOB);	/* TCP returns -EINVAL. */
+		epollpair(false);
+		siocatmarkpair(true);
+	}
+}
+
+TEST_F(msg_oob, ex_oob_drop_2)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);		/* TCP drops "x" at this moment. */
+	epollpair(true);
+
+	tcp_incompliant {
+		siocatmarkpair(false);
+	}
+
+	recvpair("y", 1, 1, MSG_OOB);
+	epollpair(false);
+
+	tcp_incompliant {
+		siocatmarkpair(false);
+
+		recvpair("x", 1, 1, 0);		/* TCP returns -EAGAIN. */
+		epollpair(false);
+		siocatmarkpair(true);
+	}
+}
+
+TEST_F(msg_oob, ex_oob_oob)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("x", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("", -EAGAIN, 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, ex_oob_ahead_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("wor", 3, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("r", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	sendpair("ld", 2, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	tcp_incompliant {
+		recvpair("hellowol", 8, 10, 0);	/* TCP recv()s "helloworl", why "r" ?? */
+	}
+
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("d", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(true);
+}
+
+TEST_F(msg_oob, ex_oob_siocatmark)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	sendpair("world", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 4, 0);		/* Intentionally stop at ex-OOB. */
+	epollpair(true);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_oob)
+{
+	setinlinepair();
+
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("x", 1, 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_oob_break)
+{
+	setinlinepair();
+
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("", -EINVAL, 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 5, 0);		/* Break at OOB but not at ex-OOB. */
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("o", 1, 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_oob_ahead_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("world", 5, 0);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	setinlinepair();
+
+	recvpair("hell", 4, 9, 0);		/* Break at OOB even with enough buffer. */
+	epollpair(false);
+	siocatmarkpair(true);
+
+	tcp_incompliant {
+		recvpair("world", 5, 6, 0);	/* TCP recv()s "oworld", ... "o" ??? */
+	}
+
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_ex_oob_break)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("wor", 3, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	sendpair("ld", 2, 0);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	setinlinepair();
+
+	recvpair("hellowo", 7, 10, 0);		/* Break at OOB but not at ex-OOB. */
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("rld", 3, 3, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_ex_oob_no_drop)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	setinlinepair();
+
+	sendpair("y", 1, MSG_OOB);		/* TCP does NOT drops "x" at this moment. */
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("x", 1, 1, 0);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	recvpair("y", 1, 1, 0);
+	epollpair(false);
+	siocatmarkpair(false);
+}
+
+TEST_F(msg_oob, inline_ex_oob_drop)
+{
+	sendpair("x", 1, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(true);
+
+	sendpair("y", 1, MSG_OOB);		/* TCP drops "x" at this moment. */
+	epollpair(true);
+
+	setinlinepair();
+
+	tcp_incompliant {
+		siocatmarkpair(false);
+
+		recvpair("x", 1, 1, 0);		/* TCP recv()s "y". */
+		epollpair(true);
+		siocatmarkpair(true);
+
+		recvpair("y", 1, 1, 0);		/* TCP returns -EAGAIN. */
+		epollpair(false);
+		siocatmarkpair(false);
+	}
+}
+
+TEST_F(msg_oob, inline_ex_oob_siocatmark)
+{
+	sendpair("hello", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("o", 1, 1, MSG_OOB);
+	epollpair(false);
+	siocatmarkpair(false);
+
+	setinlinepair();
+
+	sendpair("world", 5, MSG_OOB);
+	epollpair(true);
+	siocatmarkpair(false);
+
+	recvpair("hell", 4, 4, 0);		/* Intentionally stop at ex-OOB. */
+	epollpair(true);
+	siocatmarkpair(false);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c
new file mode 100644
index 000000000000..8b015f16c03d
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(scm_rights)
+{
+	int fd[32];
+};
+
+FIXTURE_VARIANT(scm_rights)
+{
+	char name[32];
+	int type;
+	int flags;
+	bool test_listener;
+	bool disabled;
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, dgram)
+{
+	.name = "UNIX ",
+	.type = SOCK_DGRAM,
+	.flags = 0,
+	.test_listener = false,
+	.disabled = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, dgram_disabled)
+{
+	.name = "UNIX ",
+	.type = SOCK_DGRAM,
+	.flags = 0,
+	.test_listener = false,
+	.disabled = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = false,
+	.disabled = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_disabled)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = false,
+	.disabled = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_oob)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = false,
+	.disabled = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_oob_disabled)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = false,
+	.disabled = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = true,
+	.disabled = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener_disabled)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = true,
+	.disabled = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = true,
+	.disabled = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob_disabled)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = true,
+	.disabled = true,
+};
+
+static int count_sockets(struct __test_metadata *_metadata,
+			 const FIXTURE_VARIANT(scm_rights) *variant)
+{
+	int sockets = -1, len, ret;
+	char *line = NULL;
+	size_t unused;
+	FILE *f;
+
+	f = fopen("/proc/net/protocols", "r");
+	ASSERT_NE(NULL, f);
+
+	len = strlen(variant->name);
+
+	while (getline(&line, &unused, f) != -1) {
+		int unused2;
+
+		if (strncmp(line, variant->name, len))
+			continue;
+
+		ret = sscanf(line + len, "%d %d", &unused2, &sockets);
+		ASSERT_EQ(2, ret);
+
+		break;
+	}
+
+	free(line);
+
+	ret = fclose(f);
+	ASSERT_EQ(0, ret);
+
+	return sockets;
+}
+
+FIXTURE_SETUP(scm_rights)
+{
+	int ret;
+
+	ret = unshare(CLONE_NEWNET);
+	ASSERT_EQ(0, ret);
+
+	if (variant->disabled)
+		return;
+
+	ret = count_sockets(_metadata, variant);
+	ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(scm_rights)
+{
+	int ret;
+
+	if (variant->disabled)
+		return;
+
+	sleep(1);
+
+	ret = count_sockets(_metadata, variant);
+	ASSERT_EQ(0, ret);
+}
+
+static void create_listeners(struct __test_metadata *_metadata,
+			     FIXTURE_DATA(scm_rights) *self,
+			     const FIXTURE_VARIANT(scm_rights) *variant,
+			     int n)
+{
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	socklen_t addrlen;
+	int i, ret;
+
+	for (i = 0; i < n * 2; i += 2) {
+		self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0);
+		ASSERT_LE(0, self->fd[i]);
+
+		addrlen = sizeof(addr.sun_family);
+		ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen);
+		ASSERT_EQ(0, ret);
+
+		ret = listen(self->fd[i], -1);
+		ASSERT_EQ(0, ret);
+
+		if (variant->disabled) {
+			ret = setsockopt(self->fd[i], SOL_SOCKET, SO_PASSRIGHTS,
+					 &(int){0}, sizeof(int));
+			ASSERT_EQ(0, ret);
+		}
+
+		addrlen = sizeof(addr);
+		ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen);
+		ASSERT_EQ(0, ret);
+
+		self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0);
+		ASSERT_LE(0, self->fd[i + 1]);
+
+		ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen);
+		ASSERT_EQ(0, ret);
+	}
+}
+
+static void create_socketpairs(struct __test_metadata *_metadata,
+			       FIXTURE_DATA(scm_rights) *self,
+			       const FIXTURE_VARIANT(scm_rights) *variant,
+			       int n)
+{
+	int i, ret;
+
+	ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+	for (i = 0; i < n * 2; i += 2) {
+		ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i);
+		ASSERT_EQ(0, ret);
+
+		if (variant->disabled) {
+			ret = setsockopt(self->fd[i], SOL_SOCKET, SO_PASSRIGHTS,
+					 &(int){0}, sizeof(int));
+			ASSERT_EQ(0, ret);
+		}
+	}
+}
+
+static void __create_sockets(struct __test_metadata *_metadata,
+			     FIXTURE_DATA(scm_rights) *self,
+			     const FIXTURE_VARIANT(scm_rights) *variant,
+			     int n)
+{
+	ASSERT_LE(n * 2, sizeof(self->fd) / sizeof(self->fd[0]));
+
+	if (variant->test_listener)
+		create_listeners(_metadata, self, variant, n);
+	else
+		create_socketpairs(_metadata, self, variant, n);
+}
+
+static void __close_sockets(struct __test_metadata *_metadata,
+			    FIXTURE_DATA(scm_rights) *self,
+			    int n)
+{
+	int i, ret;
+
+	ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+	for (i = 0; i < n * 2; i++) {
+		ret = close(self->fd[i]);
+		ASSERT_EQ(0, ret);
+	}
+}
+
+void __send_fd(struct __test_metadata *_metadata,
+	       const FIXTURE_DATA(scm_rights) *self,
+	       const FIXTURE_VARIANT(scm_rights) *variant,
+	       int inflight, int receiver)
+{
+#define MSG "x"
+#define MSGLEN 1
+	struct {
+		struct cmsghdr cmsghdr;
+		int fd[2];
+	} cmsg = {
+		.cmsghdr = {
+			.cmsg_len = CMSG_LEN(sizeof(cmsg.fd)),
+			.cmsg_level = SOL_SOCKET,
+			.cmsg_type = SCM_RIGHTS,
+		},
+		.fd = {
+			self->fd[inflight * 2],
+			self->fd[inflight * 2],
+		},
+	};
+	struct iovec iov = {
+		.iov_base = MSG,
+		.iov_len = MSGLEN,
+	};
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = &cmsg,
+		.msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)),
+	};
+	int ret;
+
+	ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags);
+
+	if (variant->disabled) {
+		ASSERT_EQ(-1, ret);
+		ASSERT_EQ(-EPERM, -errno);
+	} else {
+		ASSERT_EQ(MSGLEN, ret);
+	}
+}
+
+#define create_sockets(n)					\
+	__create_sockets(_metadata, self, variant, n)
+#define close_sockets(n)					\
+	__close_sockets(_metadata, self, n)
+#define send_fd(inflight, receiver)				\
+	__send_fd(_metadata, self, variant, inflight, receiver)
+
+TEST_F(scm_rights, self_ref)
+{
+	create_sockets(2);
+
+	send_fd(0, 0);
+
+	send_fd(1, 1);
+
+	close_sockets(2);
+}
+
+TEST_F(scm_rights, triangle)
+{
+	create_sockets(6);
+
+	send_fd(0, 1);
+	send_fd(1, 2);
+	send_fd(2, 0);
+
+	send_fd(3, 4);
+	send_fd(4, 5);
+	send_fd(5, 3);
+
+	close_sockets(6);
+}
+
+TEST_F(scm_rights, cross_edge)
+{
+	create_sockets(8);
+
+	send_fd(0, 1);
+	send_fd(1, 2);
+	send_fd(2, 0);
+	send_fd(1, 3);
+	send_fd(3, 2);
+
+	send_fd(4, 5);
+	send_fd(5, 6);
+	send_fd(6, 4);
+	send_fd(5, 7);
+	send_fd(7, 6);
+
+	close_sockets(8);
+}
+
+TEST_F(scm_rights, backtrack_from_scc)
+{
+	create_sockets(10);
+
+	send_fd(0, 1);
+	send_fd(0, 4);
+	send_fd(1, 2);
+	send_fd(2, 3);
+	send_fd(3, 1);
+
+	send_fd(5, 6);
+	send_fd(5, 9);
+	send_fd(6, 7);
+	send_fd(7, 8);
+	send_fd(8, 6);
+
+	close_sockets(10);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c
deleted file mode 100644
index a7c51889acd5..000000000000
--- a/tools/testing/selftests/net/af_unix/test_unix_oob.c
+++ /dev/null
@@ -1,436 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <arpa/inet.h>
-#include <unistd.h>
-#include <string.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <errno.h>
-#include <netinet/tcp.h>
-#include <sys/un.h>
-#include <sys/signal.h>
-#include <sys/poll.h>
-
-static int pipefd[2];
-static int signal_recvd;
-static pid_t producer_id;
-static char sock_name[32];
-
-static void sig_hand(int sn, siginfo_t *si, void *p)
-{
-	signal_recvd = sn;
-}
-
-static int set_sig_handler(int signal)
-{
-	struct sigaction sa;
-
-	sa.sa_sigaction = sig_hand;
-	sigemptyset(&sa.sa_mask);
-	sa.sa_flags = SA_SIGINFO | SA_RESTART;
-
-	return sigaction(signal, &sa, NULL);
-}
-
-static void set_filemode(int fd, int set)
-{
-	int flags = fcntl(fd, F_GETFL, 0);
-
-	if (set)
-		flags &= ~O_NONBLOCK;
-	else
-		flags |= O_NONBLOCK;
-	fcntl(fd, F_SETFL, flags);
-}
-
-static void signal_producer(int fd)
-{
-	char cmd;
-
-	cmd = 'S';
-	write(fd, &cmd, sizeof(cmd));
-}
-
-static void wait_for_signal(int fd)
-{
-	char buf[5];
-
-	read(fd, buf, 5);
-}
-
-static void die(int status)
-{
-	fflush(NULL);
-	unlink(sock_name);
-	kill(producer_id, SIGTERM);
-	exit(status);
-}
-
-int is_sioctatmark(int fd)
-{
-	int ans = -1;
-
-	if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) {
-#ifdef DEBUG
-		perror("SIOCATMARK Failed");
-#endif
-	}
-	return ans;
-}
-
-void read_oob(int fd, char *c)
-{
-
-	*c = ' ';
-	if (recv(fd, c, sizeof(*c), MSG_OOB) < 0) {
-#ifdef DEBUG
-		perror("Reading MSG_OOB Failed");
-#endif
-	}
-}
-
-int read_data(int pfd, char *buf, int size)
-{
-	int len = 0;
-
-	memset(buf, size, '0');
-	len = read(pfd, buf, size);
-#ifdef DEBUG
-	if (len < 0)
-		perror("read failed");
-#endif
-	return len;
-}
-
-static void wait_for_data(int pfd, int event)
-{
-	struct pollfd pfds[1];
-
-	pfds[0].fd = pfd;
-	pfds[0].events = event;
-	poll(pfds, 1, -1);
-}
-
-void producer(struct sockaddr_un *consumer_addr)
-{
-	int cfd;
-	char buf[64];
-	int i;
-
-	memset(buf, 'x', sizeof(buf));
-	cfd = socket(AF_UNIX, SOCK_STREAM, 0);
-
-	wait_for_signal(pipefd[0]);
-	if (connect(cfd, (struct sockaddr *)consumer_addr,
-		     sizeof(*consumer_addr)) != 0) {
-		perror("Connect failed");
-		kill(0, SIGTERM);
-		exit(1);
-	}
-
-	for (i = 0; i < 2; i++) {
-		/* Test 1: Test for SIGURG and OOB */
-		wait_for_signal(pipefd[0]);
-		memset(buf, 'x', sizeof(buf));
-		buf[63] = '@';
-		send(cfd, buf, sizeof(buf), MSG_OOB);
-
-		wait_for_signal(pipefd[0]);
-
-		/* Test 2: Test for OOB being overwitten */
-		memset(buf, 'x', sizeof(buf));
-		buf[63] = '%';
-		send(cfd, buf, sizeof(buf), MSG_OOB);
-
-		memset(buf, 'x', sizeof(buf));
-		buf[63] = '#';
-		send(cfd, buf, sizeof(buf), MSG_OOB);
-
-		wait_for_signal(pipefd[0]);
-
-		/* Test 3: Test for SIOCATMARK */
-		memset(buf, 'x', sizeof(buf));
-		buf[63] = '@';
-		send(cfd, buf, sizeof(buf), MSG_OOB);
-
-		memset(buf, 'x', sizeof(buf));
-		buf[63] = '%';
-		send(cfd, buf, sizeof(buf), MSG_OOB);
-
-		memset(buf, 'x', sizeof(buf));
-		send(cfd, buf, sizeof(buf), 0);
-
-		wait_for_signal(pipefd[0]);
-
-		/* Test 4: Test for 1byte OOB msg */
-		memset(buf, 'x', sizeof(buf));
-		buf[0] = '@';
-		send(cfd, buf, 1, MSG_OOB);
-	}
-}
-
-int
-main(int argc, char **argv)
-{
-	int lfd, pfd;
-	struct sockaddr_un consumer_addr, paddr;
-	socklen_t len = sizeof(consumer_addr);
-	char buf[1024];
-	int on = 0;
-	char oob;
-	int atmark;
-
-	lfd = socket(AF_UNIX, SOCK_STREAM, 0);
-	memset(&consumer_addr, 0, sizeof(consumer_addr));
-	consumer_addr.sun_family = AF_UNIX;
-	sprintf(sock_name, "unix_oob_%d", getpid());
-	unlink(sock_name);
-	strcpy(consumer_addr.sun_path, sock_name);
-
-	if ((bind(lfd, (struct sockaddr *)&consumer_addr,
-		  sizeof(consumer_addr))) != 0) {
-		perror("socket bind failed");
-		exit(1);
-	}
-
-	pipe(pipefd);
-
-	listen(lfd, 1);
-
-	producer_id = fork();
-	if (producer_id == 0) {
-		producer(&consumer_addr);
-		exit(0);
-	}
-
-	set_sig_handler(SIGURG);
-	signal_producer(pipefd[1]);
-
-	pfd = accept(lfd, (struct sockaddr *) &paddr, &len);
-	fcntl(pfd, F_SETOWN, getpid());
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 1:
-	 * veriyf that SIGURG is
-	 * delivered, 63 bytes are
-	 * read, oob is '@', and POLLPRI works.
-	 */
-	wait_for_data(pfd, POLLPRI);
-	read_oob(pfd, &oob);
-	len = read_data(pfd, buf, 1024);
-	if (!signal_recvd || len != 63 || oob != '@') {
-		fprintf(stderr, "Test 1 failed sigurg %d len %d %c\n",
-			 signal_recvd, len, oob);
-			die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 2:
-	 * Verify that the first OOB is over written by
-	 * the 2nd one and the first OOB is returned as
-	 * part of the read, and sigurg is received.
-	 */
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	len = 0;
-	while (len < 70)
-		len = recv(pfd, buf, 1024, MSG_PEEK);
-	len = read_data(pfd, buf, 1024);
-	read_oob(pfd, &oob);
-	if (!signal_recvd || len != 127 || oob != '#') {
-		fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n",
-		signal_recvd, len, oob);
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 3:
-	 * verify that 2nd oob over writes
-	 * the first one and read breaks at
-	 * oob boundary returning 127 bytes
-	 * and sigurg is received and atmark
-	 * is set.
-	 * oob is '%' and second read returns
-	 * 64 bytes.
-	 */
-	len = 0;
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	while (len < 150)
-		len = recv(pfd, buf, 1024, MSG_PEEK);
-	len = read_data(pfd, buf, 1024);
-	atmark = is_sioctatmark(pfd);
-	read_oob(pfd, &oob);
-
-	if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) {
-		fprintf(stderr,
-			"Test 3 failed, sigurg %d len %d OOB %c atmark %d\n",
-			signal_recvd, len, oob, atmark);
-		die(1);
-	}
-
-	signal_recvd = 0;
-
-	len = read_data(pfd, buf, 1024);
-	if (len != 64) {
-		fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n",
-			signal_recvd, len, oob);
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 4:
-	 * verify that a single byte
-	 * oob message is delivered.
-	 * set non blocking mode and
-	 * check proper error is
-	 * returned and sigurg is
-	 * received and correct
-	 * oob is read.
-	 */
-
-	set_filemode(pfd, 0);
-
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	len = read_data(pfd, buf, 1024);
-	if ((len == -1) && (errno == 11))
-		len = 0;
-
-	read_oob(pfd, &oob);
-
-	if (!signal_recvd || len != 0 || oob != '@') {
-		fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n",
-			 signal_recvd, len, oob);
-		die(1);
-	}
-
-	set_filemode(pfd, 1);
-
-	/* Inline Testing */
-
-	on = 1;
-	if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) {
-		perror("SO_OOBINLINE");
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 1 -- Inline:
-	 * Check that SIGURG is
-	 * delivered and 63 bytes are
-	 * read and oob is '@'
-	 */
-
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	len = read_data(pfd, buf, 1024);
-
-	if (!signal_recvd || len != 63) {
-		fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n",
-			signal_recvd, len);
-		die(1);
-	}
-
-	len = read_data(pfd, buf, 1024);
-
-	if (len != 1) {
-		fprintf(stderr,
-			 "Test 1.1 Inline failed, sigurg %d len %d oob %c\n",
-			 signal_recvd, len, oob);
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 2 -- Inline:
-	 * Verify that the first OOB is over written by
-	 * the 2nd one and read breaks correctly on
-	 * 2nd OOB boundary with the first OOB returned as
-	 * part of the read, and sigurg is delivered and
-	 * siocatmark returns true.
-	 * next read returns one byte, the oob byte
-	 * and siocatmark returns false.
-	 */
-	len = 0;
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	while (len < 70)
-		len = recv(pfd, buf, 1024, MSG_PEEK);
-	len = read_data(pfd, buf, 1024);
-	atmark = is_sioctatmark(pfd);
-	if (len != 127 || atmark != 1 || !signal_recvd) {
-		fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n",
-			 len, atmark);
-		die(1);
-	}
-
-	len = read_data(pfd, buf, 1024);
-	atmark = is_sioctatmark(pfd);
-	if (len != 1 || buf[0] != '#' || atmark == 1) {
-		fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n",
-			len, buf[0], atmark);
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 3 -- Inline:
-	 * verify that 2nd oob over writes
-	 * the first one and read breaks at
-	 * oob boundary returning 127 bytes
-	 * and sigurg is received and siocatmark
-	 * is true after the read.
-	 * subsequent read returns 65 bytes
-	 * because of oob which should be '%'.
-	 */
-	len = 0;
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	while (len < 126)
-		len = recv(pfd, buf, 1024, MSG_PEEK);
-	len = read_data(pfd, buf, 1024);
-	atmark = is_sioctatmark(pfd);
-	if (!signal_recvd || len != 127 || !atmark) {
-		fprintf(stderr,
-			 "Test 3 Inline failed, sigurg %d len %d data %c\n",
-			 signal_recvd, len, buf[0]);
-		die(1);
-	}
-
-	len = read_data(pfd, buf, 1024);
-	atmark = is_sioctatmark(pfd);
-	if (len != 65 || buf[0] != '%' || atmark != 0) {
-		fprintf(stderr,
-			 "Test 3.1 Inline failed, len %d oob %c atmark %d\n",
-			 len, buf[0], atmark);
-		die(1);
-	}
-
-	signal_recvd = 0;
-	signal_producer(pipefd[1]);
-
-	/* Test 4 -- Inline:
-	 * verify that a single
-	 * byte oob message is delivered
-	 * and read returns one byte, the oob
-	 * byte and sigurg is received
-	 */
-	wait_for_data(pfd, POLLIN | POLLPRI);
-	len = read_data(pfd, buf, 1024);
-	if (!signal_recvd || len != 1 || buf[0] != '@') {
-		fprintf(stderr,
-			"Test 4 Inline failed, signal %d len %d data %c\n",
-		signal_recvd, len, buf[0]);
-		die(1);
-	}
-	die(0);
-}
diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh
index 75528788cb95..3ef209cacb8e 100755
--- a/tools/testing/selftests/net/amt.sh
+++ b/tools/testing/selftests/net/amt.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
 # Author: Taehee Yoo <ap420073@gmail.com>
@@ -77,6 +77,7 @@ readonly LISTENER=$(mktemp -u listener-XXXXXXXX)
 readonly GATEWAY=$(mktemp -u gateway-XXXXXXXX)
 readonly RELAY=$(mktemp -u relay-XXXXXXXX)
 readonly SOURCE=$(mktemp -u source-XXXXXXXX)
+readonly SMCROUTEDIR="$(mktemp -d)"
 ERR=4
 err=0
 
@@ -85,6 +86,11 @@ exit_cleanup()
 	for ns in "$@"; do
 		ip netns delete "${ns}" 2>/dev/null || true
 	done
+	if [ -f "$SMCROUTEDIR/amt.pid" ]; then
+		smcpid=$(< $SMCROUTEDIR/amt.pid)
+		kill $smcpid
+	fi
+	rm -rf $SMCROUTEDIR
 
 	exit $ERR
 }
@@ -167,7 +173,7 @@ setup_iptables()
 
 setup_mcast_routing()
 {
-	ip netns exec "${RELAY}" smcrouted
+	ip netns exec "${RELAY}" smcrouted -P $SMCROUTEDIR/amt.pid
 	ip netns exec "${RELAY}" smcroutectl a relay_src \
 		172.17.0.2 239.0.0.1 amtr
 	ip netns exec "${RELAY}" smcroutectl a relay_src \
@@ -188,15 +194,21 @@ test_remote_ip()
 
 send_mcast_torture4()
 {
-	ip netns exec "${SOURCE}" bash -c \
-		'cat /dev/urandom | head -c 1G | nc -w 1 -u 239.0.0.1 4001'
+	for i in `seq 10`; do
+		ip netns exec "${SOURCE}" bash -c \
+		   'cat /dev/urandom | head -c 100M | nc -w 1 -u 239.0.0.1 4001'
+		echo -n "."
+	done
 }
 
 
 send_mcast_torture6()
 {
-	ip netns exec "${SOURCE}" bash -c \
-		'cat /dev/urandom | head -c 1G | nc -w 1 -u ff0e::5:6 6001'
+	for i in `seq 10`; do
+		ip netns exec "${SOURCE}" bash -c \
+		   'cat /dev/urandom | head -c 100M | nc -w 1 -u ff0e::5:6 6001'
+		echo -n "."
+	done
 }
 
 check_features()
@@ -210,8 +222,8 @@ check_features()
 
 test_ipv4_forward()
 {
-	RESULT4=$(ip netns exec "${LISTENER}" nc -w 1 -l -u 239.0.0.1 4000)
-	if [ "$RESULT4" == "172.17.0.2" ]; then
+	RESULT4=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP4-LISTEN:4000,readbytes=128 || true)
+	if echo "$RESULT4" | grep -q "172.17.0.2"; then
 		printf "TEST: %-60s  [ OK ]\n" "IPv4 amt multicast forwarding"
 		exit 0
 	else
@@ -222,8 +234,8 @@ test_ipv4_forward()
 
 test_ipv6_forward()
 {
-	RESULT6=$(ip netns exec "${LISTENER}" nc -w 1 -l -u ff0e::5:6 6000)
-	if [ "$RESULT6" == "2001:db8:3::2" ]; then
+	RESULT6=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP6-LISTEN:6000,readbytes=128 || true)
+	if echo "$RESULT6" | grep -q "2001:db8:3::2"; then
 		printf "TEST: %-60s  [ OK ]\n" "IPv6 amt multicast forwarding"
 		exit 0
 	else
@@ -236,14 +248,14 @@ send_mcast4()
 {
 	sleep 2
 	ip netns exec "${SOURCE}" bash -c \
-		'echo 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' &
+		'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' &
 }
 
 send_mcast6()
 {
 	sleep 2
 	ip netns exec "${SOURCE}" bash -c \
-		'echo 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' &
+		'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' &
 }
 
 check_features
@@ -272,10 +284,12 @@ wait $pid || err=$?
 if [ $err -eq 1 ]; then
 	ERR=1
 fi
+printf "TEST: %-50s" "IPv4 amt traffic forwarding torture"
 send_mcast_torture4
-printf "TEST: %-60s  [ OK ]\n" "IPv4 amt traffic forwarding torture"
+printf "  [ OK ]\n"
+printf "TEST: %-50s" "IPv6 amt traffic forwarding torture"
 send_mcast_torture6
-printf "TEST: %-60s  [ OK ]\n" "IPv6 amt traffic forwarding torture"
+printf "  [ OK ]\n"
 sleep 5
 if [ "${ERR}" -eq 1 ]; then
         echo "Some tests failed." >&2
diff --git a/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh
index a40c0e9bd023..eef5cbf6eecc 100755
--- a/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh
+++ b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh
@@ -73,25 +73,19 @@ setup_v6() {
 	# namespaces. veth0 is veth-router, veth1 is veth-host.
 	# first, set up the inteface's link to the namespace
 	# then, set the interface "up"
-	ip -6 -netns ${ROUTER_NS_V6} link add name ${ROUTER_INTF} \
-		type veth peer name ${HOST_INTF}
-
-	ip -6 -netns ${ROUTER_NS_V6} link set dev ${ROUTER_INTF} up
-	ip -6 -netns ${ROUTER_NS_V6} link set dev ${HOST_INTF} netns \
-		${HOST_NS_V6}
+	ip -n ${ROUTER_NS_V6} link add name ${ROUTER_INTF} \
+		type veth peer name ${HOST_INTF} netns ${HOST_NS_V6}
 
-	ip -6 -netns ${HOST_NS_V6} link set dev ${HOST_INTF} up
-	ip -6 -netns ${ROUTER_NS_V6} addr add \
-		${ROUTER_ADDR_V6}/${PREFIX_WIDTH_V6} dev ${ROUTER_INTF} nodad
+	# Add tc rule to filter out host na message
+	tc -n ${ROUTER_NS_V6} qdisc add dev ${ROUTER_INTF} clsact
+	tc -n ${ROUTER_NS_V6} filter add dev ${ROUTER_INTF} \
+		ingress protocol ipv6 pref 1 handle 101 \
+		flower src_ip ${HOST_ADDR_V6} ip_proto icmpv6 type 136 skip_hw action pass
 
 	HOST_CONF=net.ipv6.conf.${HOST_INTF}
 	ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.ndisc_notify=1
 	ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.disable_ipv6=0
-	ip -6 -netns ${HOST_NS_V6} addr add ${HOST_ADDR_V6}/${PREFIX_WIDTH_V6} \
-		dev ${HOST_INTF}
-
 	ROUTER_CONF=net.ipv6.conf.${ROUTER_INTF}
-
 	ip netns exec ${ROUTER_NS_V6} sysctl -w \
 		${ROUTER_CONF}.forwarding=1 >/dev/null 2>&1
 	ip netns exec ${ROUTER_NS_V6} sysctl -w \
@@ -99,6 +93,13 @@ setup_v6() {
 	ip netns exec ${ROUTER_NS_V6} sysctl -w \
 		${ROUTER_CONF}.accept_untracked_na=${accept_untracked_na} \
 		>/dev/null 2>&1
+
+	ip -n ${ROUTER_NS_V6} link set dev ${ROUTER_INTF} up
+	ip -n ${HOST_NS_V6} link set dev ${HOST_INTF} up
+	ip -n ${ROUTER_NS_V6} addr add ${ROUTER_ADDR_V6}/${PREFIX_WIDTH_V6} \
+		dev ${ROUTER_INTF} nodad
+	ip -n ${HOST_NS_V6} addr add ${HOST_ADDR_V6}/${PREFIX_WIDTH_V6} \
+		dev ${HOST_INTF}
 	set +e
 }
 
@@ -162,26 +163,6 @@ arp_test_gratuitous_combinations() {
 	arp_test_gratuitous 2 1
 }
 
-cleanup_tcpdump() {
-	set -e
-	[[ ! -z  ${tcpdump_stdout} ]] && rm -f ${tcpdump_stdout}
-	[[ ! -z  ${tcpdump_stderr} ]] && rm -f ${tcpdump_stderr}
-	tcpdump_stdout=
-	tcpdump_stderr=
-	set +e
-}
-
-start_tcpdump() {
-	set -e
-	tcpdump_stdout=`mktemp`
-	tcpdump_stderr=`mktemp`
-	ip netns exec ${ROUTER_NS_V6} timeout 15s \
-		tcpdump --immediate-mode -tpni ${ROUTER_INTF} -c 1 \
-		"icmp6 && icmp6[0] == 136 && src ${HOST_ADDR_V6}" \
-		> ${tcpdump_stdout} 2> /dev/null
-	set +e
-}
-
 verify_ndisc() {
 	local accept_untracked_na=$1
 	local same_subnet=$2
@@ -222,8 +203,9 @@ ndisc_test_untracked_advertisements() {
 			HOST_ADDR_V6=2001:db8:abcd:0012::3
 		fi
 	fi
-	setup_v6 $1 $2
-	start_tcpdump
+	setup_v6 $1
+	slowwait_for_counter 15 1 \
+		tc_rule_handle_stats_get "dev ${ROUTER_INTF} ingress" 101 ".packets" "-n ${ROUTER_NS_V6}"
 
 	if verify_ndisc $1 $2; then
 		printf "    TEST: %-60s  [ OK ]\n" "${test_msg[*]}"
@@ -231,7 +213,6 @@ ndisc_test_untracked_advertisements() {
 		printf "    TEST: %-60s  [FAIL]\n" "${test_msg[*]}"
 	fi
 
-	cleanup_tcpdump
 	cleanup_v6
 	set +e
 }
diff --git a/tools/testing/selftests/net/bareudp.sh b/tools/testing/selftests/net/bareudp.sh
index f366cadbc5e8..4046131e7888 100755
--- a/tools/testing/selftests/net/bareudp.sh
+++ b/tools/testing/selftests/net/bareudp.sh
@@ -106,26 +106,16 @@
 # |                                                                       |
 # +-----------------------------------------------------------------------+
 
+. ./lib.sh
+
 ERR=4 # Return 4 by default, which is the SKIP code for kselftest
 PING6="ping"
 PAUSE_ON_FAIL="no"
 
-readonly NS0=$(mktemp -u ns0-XXXXXXXX)
-readonly NS1=$(mktemp -u ns1-XXXXXXXX)
-readonly NS2=$(mktemp -u ns2-XXXXXXXX)
-readonly NS3=$(mktemp -u ns3-XXXXXXXX)
-
 # Exit the script after having removed the network namespaces it created
-#
-# Parameters:
-#
-#   * The list of network namespaces to delete before exiting.
-#
 exit_cleanup()
 {
-	for ns in "$@"; do
-		ip netns delete "${ns}" 2>/dev/null || true
-	done
+	cleanup_all_ns
 
 	if [ "${ERR}" -eq 4 ]; then
 		echo "Error: Setting up the testing environment failed." >&2
@@ -140,17 +130,7 @@ exit_cleanup()
 # namespaces created by this script are deleted.
 create_namespaces()
 {
-	ip netns add "${NS0}" || exit_cleanup
-	ip netns add "${NS1}" || exit_cleanup "${NS0}"
-	ip netns add "${NS2}" || exit_cleanup "${NS0}" "${NS1}"
-	ip netns add "${NS3}" || exit_cleanup "${NS0}" "${NS1}" "${NS2}"
-}
-
-# The trap function handler
-#
-exit_cleanup_all()
-{
-	exit_cleanup "${NS0}" "${NS1}" "${NS2}" "${NS3}"
+	setup_ns NS0 NS1 NS2 NS3 || exit_cleanup
 }
 
 # Configure a network interface using a host route
@@ -188,10 +168,6 @@ iface_config()
 #
 setup_underlay()
 {
-	for ns in "${NS0}" "${NS1}" "${NS2}" "${NS3}"; do
-		ip -netns "${ns}" link set dev lo up
-	done;
-
 	ip link add name veth01 netns "${NS0}" type veth peer name veth10 netns "${NS1}"
 	ip link add name veth12 netns "${NS1}" type veth peer name veth21 netns "${NS2}"
 	ip link add name veth23 netns "${NS2}" type veth peer name veth32 netns "${NS3}"
@@ -234,14 +210,6 @@ setup_overlay_ipv4()
 	ip netns exec "${NS2}" sysctl -qw net.ipv4.ip_forward=1
 	ip -netns "${NS1}" route add 192.0.2.100/32 via 192.0.2.10
 	ip -netns "${NS2}" route add 192.0.2.103/32 via 192.0.2.33
-
-	# The intermediate namespaces don't have routes for the reverse path,
-	# as it will be handled by tc. So we need to ensure that rp_filter is
-	# not going to block the traffic.
-	ip netns exec "${NS1}" sysctl -qw net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${NS2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${NS1}" sysctl -qw net.ipv4.conf.default.rp_filter=0
-	ip netns exec "${NS2}" sysctl -qw net.ipv4.conf.default.rp_filter=0
 }
 
 setup_overlay_ipv6()
@@ -521,13 +489,10 @@ done
 
 check_features
 
-# Create namespaces before setting up the exit trap.
-# Otherwise, exit_cleanup_all() could delete namespaces that were not created
-# by this script.
-create_namespaces
-
 set -e
-trap exit_cleanup_all EXIT
+trap exit_cleanup EXIT
+
+create_namespaces
 
 setup_underlay
 setup_overlay_ipv4
diff --git a/tools/testing/selftests/net/bpf.mk b/tools/testing/selftests/net/bpf.mk
new file mode 100644
index 000000000000..a4f6755dd894
--- /dev/null
+++ b/tools/testing/selftests/net/bpf.mk
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+# Rules to generate bpf objs
+CLANG ?= clang
+SCRATCH_DIR := $(OUTPUT)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+BPFDIR := $(top_srcdir)/tools/lib/bpf
+APIDIR := $(top_srcdir)/tools/include/uapi
+
+CCINCLUDE += -I$(selfdir)/bpf
+CCINCLUDE += -I$(top_srcdir)/usr/include/
+CCINCLUDE += -I$(SCRATCH_DIR)/include
+
+BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
+
+MAKE_DIRS := $(BUILD_DIR)/libbpf
+$(MAKE_DIRS):
+	$(call msg,MKDIR,,$@)
+	$(Q)mkdir -p $@
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '--target=bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+
+BPF_PROG_OBJS := $(patsubst %.c,$(OUTPUT)/%.o,$(wildcard *.bpf.c))
+
+$(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS)
+	$(call msg,BPF_PROG,,$@)
+	$(Q)$(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \
+	-c $< -o $@
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+	   $(APIDIR)/linux/bpf.h					       \
+	   | $(BUILD_DIR)/libbpf
+	$(call msg,MAKE,,$@)
+	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
+		    EXTRA_CFLAGS='-g -O0'				       \
+		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+EXTRA_CLEAN += $(SCRATCH_DIR)
diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py
new file mode 100755
index 000000000000..b2c271b79240
--- /dev/null
+++ b/tools/testing/selftests/net/bpf_offload.py
@@ -0,0 +1,1357 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2017 Netronome Systems, Inc.
+# Copyright (c) 2019 Mellanox Technologies. All rights reserved
+#
+# This software is licensed under the GNU General License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+from datetime import datetime
+import argparse
+import errno
+import json
+import os
+import pprint
+import random
+import re
+import stat
+import string
+import struct
+import subprocess
+import time
+import traceback
+
+from lib.py import NetdevSim, NetdevSimDev
+
+
+logfile = None
+log_level = 1
+skip_extack = False
+bpf_test_dir = os.path.dirname(os.path.realpath(__file__))
+pp = pprint.PrettyPrinter()
+devs = [] # devices we created for clean up
+files = [] # files to be removed
+netns = [] # net namespaces to be removed
+
+def log_get_sec(level=0):
+    return "*" * (log_level + level)
+
+def log_level_inc(add=1):
+    global log_level
+    log_level += add
+
+def log_level_dec(sub=1):
+    global log_level
+    log_level -= sub
+
+def log_level_set(level):
+    global log_level
+    log_level = level
+
+def log(header, data, level=None):
+    """
+    Output to an optional log.
+    """
+    if logfile is None:
+        return
+    if level is not None:
+        log_level_set(level)
+
+    if not isinstance(data, str):
+        data = pp.pformat(data)
+
+    if len(header):
+        logfile.write("\n" + log_get_sec() + " ")
+        logfile.write(header)
+    if len(header) and len(data.strip()):
+        logfile.write("\n")
+    logfile.write(data)
+
+def skip(cond, msg):
+    if not cond:
+        return
+    print("SKIP: " + msg)
+    log("SKIP: " + msg, "", level=1)
+    os.sys.exit(0)
+
+def fail(cond, msg):
+    if not cond:
+        return
+    print("FAIL: " + msg)
+    tb = "".join(traceback.extract_stack().format())
+    print(tb)
+    log("FAIL: " + msg, tb, level=1)
+    os.sys.exit(1)
+
+def start_test(msg):
+    log(msg, "", level=1)
+    log_level_inc()
+    print(msg)
+
+def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True):
+    """
+    Run a command in subprocess and return tuple of (retval, stdout);
+    optionally return stderr as well as third value.
+    """
+    proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    if background:
+        msg = "%s START: %s" % (log_get_sec(1),
+                                datetime.now().strftime("%H:%M:%S.%f"))
+        log("BKG " + proc.args, msg)
+        return proc
+
+    return cmd_result(proc, include_stderr=include_stderr, fail=fail)
+
+def cmd_result(proc, include_stderr=False, fail=False):
+    stdout, stderr = proc.communicate()
+    stdout = stdout.decode("utf-8")
+    stderr = stderr.decode("utf-8")
+    proc.stdout.close()
+    proc.stderr.close()
+
+    stderr = "\n" + stderr
+    if stderr[-1] == "\n":
+        stderr = stderr[:-1]
+
+    sec = log_get_sec(1)
+    log("CMD " + proc.args,
+        "RETCODE: %d\n%s STDOUT:\n%s%s STDERR:%s\n%s END: %s" %
+        (proc.returncode, sec, stdout, sec, stderr,
+         sec, datetime.now().strftime("%H:%M:%S.%f")))
+
+    if proc.returncode != 0 and fail:
+        if len(stderr) > 0 and stderr[-1] == "\n":
+            stderr = stderr[:-1]
+        raise Exception("Command failed: %s\n%s" % (proc.args, stderr))
+
+    if include_stderr:
+        return proc.returncode, stdout, stderr
+    else:
+        return proc.returncode, stdout
+
+def rm(f):
+    cmd("rm -f %s" % (f))
+    if f in files:
+        files.remove(f)
+
+def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False):
+    params = ""
+    if JSON:
+        params += "%s " % (flags["json"])
+
+    if ns:
+        ns = "ip netns exec %s " % (ns)
+    elif ns is None:
+        ns = ""
+
+    if include_stderr:
+        ret, stdout, stderr = cmd(ns + name + " " + params + args,
+                                  fail=fail, include_stderr=True)
+    else:
+        ret, stdout = cmd(ns + name + " " + params + args,
+                          fail=fail, include_stderr=False)
+
+    if JSON and len(stdout.strip()) != 0:
+        out = json.loads(stdout)
+    else:
+        out = stdout
+
+    if include_stderr:
+        return ret, out, stderr
+    else:
+        return ret, out
+
+def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False):
+    return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns,
+                fail=fail, include_stderr=include_stderr)
+
+def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True):
+    _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True)
+    # Remove the base progs
+    for p in base_progs:
+        if p in progs:
+            progs.remove(p)
+    if exclude_orphaned:
+        progs = [ p for p in progs if not p['orphaned'] ]
+    if expected is not None:
+        if len(progs) != expected:
+            fail(True, "%d BPF programs loaded, expected %d" %
+                 (len(progs), expected))
+    return progs
+
+def bpftool_map_list(expected=None, ns=""):
+    _, maps = bpftool("map show", JSON=True, ns=ns, fail=True)
+    # Remove the base maps
+    maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names]
+    if expected is not None:
+        if len(maps) != expected:
+            fail(True, "%d BPF maps loaded, expected %d" %
+                 (len(maps), expected))
+    return maps
+
+def bpftool_prog_list_wait(expected=0, n_retry=20):
+    for i in range(n_retry):
+        nprogs = len(bpftool_prog_list())
+        if nprogs == expected:
+            return
+        time.sleep(0.05)
+    raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
+
+def bpftool_map_list_wait(expected=0, n_retry=20, ns=""):
+    nmaps = None
+    for i in range(n_retry):
+        maps = bpftool_map_list(ns=ns)
+        nmaps = len(maps)
+        if nmaps == expected:
+            return maps
+        time.sleep(0.05)
+    raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps))
+
+def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None,
+                      fail=True, include_stderr=False, dev_bind=None):
+    args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name)
+    if prog_type is not None:
+        args += " type " + prog_type
+    if dev is not None:
+        args += " dev " + dev
+    elif dev_bind is not None:
+        args += " xdpmeta_dev " + dev_bind
+    if len(maps):
+        args += " map " + " map ".join(maps)
+
+    res = bpftool(args, fail=fail, include_stderr=include_stderr)
+    if res[0] == 0:
+        files.append(file_name)
+    return res
+
+def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False):
+    if force:
+        args = "-force " + args
+    return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns,
+                fail=fail, include_stderr=include_stderr)
+
+def tc(args, JSON=True, ns="", fail=True, include_stderr=False):
+    return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns,
+                fail=fail, include_stderr=include_stderr)
+
+def ethtool(dev, opt, args, fail=True):
+    return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail)
+
+def bpf_obj(name, sec="xdp", path=bpf_test_dir,):
+    return "obj %s sec %s" % (os.path.join(path, name), sec)
+
+def bpf_pinned(name):
+    return "pinned %s" % (name)
+
+def bpf_bytecode(bytecode):
+    return "bytecode \"%s\"" % (bytecode)
+
+def mknetns(n_retry=10):
+    for i in range(n_retry):
+        name = ''.join([random.choice(string.ascii_letters) for i in range(8)])
+        ret, _ = ip("netns add %s" % (name), fail=False)
+        if ret == 0:
+            netns.append(name)
+            return name
+    return None
+
+def int2str(fmt, val):
+    ret = []
+    for b in struct.pack(fmt, val):
+        ret.append(int(b))
+    return " ".join(map(lambda x: str(x), ret))
+
+def str2int(strtab):
+    inttab = []
+    for i in strtab:
+        inttab.append(int(i, 16))
+    ba = bytearray(inttab)
+    if len(strtab) == 4:
+        fmt = "I"
+    elif len(strtab) == 8:
+        fmt = "Q"
+    else:
+        raise Exception("String array of len %d can't be unpacked to an int" %
+                        (len(strtab)))
+    return struct.unpack(fmt, ba)[0]
+
+class DebugfsDir:
+    """
+    Class for accessing DebugFS directories as a dictionary.
+    """
+
+    def __init__(self, path):
+        self.path = path
+        self._dict = self._debugfs_dir_read(path)
+
+    def __len__(self):
+        return len(self._dict.keys())
+
+    def __getitem__(self, key):
+        if type(key) is int:
+            key = list(self._dict.keys())[key]
+        return self._dict[key]
+
+    def __setitem__(self, key, value):
+        log("DebugFS set %s = %s" % (key, value), "")
+        log_level_inc()
+
+        cmd("echo '%s' > %s/%s" % (value, self.path, key))
+        log_level_dec()
+
+        _, out = cmd('cat %s/%s' % (self.path, key))
+        self._dict[key] = out.strip()
+
+    def _debugfs_dir_read(self, path):
+        dfs = {}
+
+        log("DebugFS state for %s" % (path), "")
+        log_level_inc(add=2)
+
+        _, out = cmd('ls ' + path)
+        for f in out.split():
+            if f == "ports":
+                continue
+
+            p = os.path.join(path, f)
+            if not os.stat(p).st_mode & stat.S_IRUSR:
+                continue
+
+            if os.path.isfile(p):
+                # We need to init trap_flow_action_cookie before read it
+                if f == "trap_flow_action_cookie":
+                    cmd('echo deadbeef > %s/%s' % (path, f))
+                _, out = cmd('cat %s/%s' % (path, f))
+                dfs[f] = out.strip()
+            elif os.path.isdir(p):
+                dfs[f] = DebugfsDir(p)
+            else:
+                raise Exception("%s is neither file nor directory" % (p))
+
+        log_level_dec()
+        log("DebugFS state", dfs)
+        log_level_dec()
+
+        return dfs
+
+class BpfNetdevSimDev(NetdevSimDev):
+    """
+    Class for netdevsim bus device and its attributes.
+    """
+    def __init__(self, port_count=1, ns=None):
+        super().__init__(port_count, ns=ns)
+        devs.append(self)
+
+    def _make_port(self, port_index, ifname):
+        return BpfNetdevSim(self, port_index, ifname, self.ns)
+
+    def dfs_num_bound_progs(self):
+        path = os.path.join(self.dfs_dir, "bpf_bound_progs")
+        _, progs = cmd('ls %s' % (path))
+        return len(progs.split())
+
+    def dfs_get_bound_progs(self, expected):
+        progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs"))
+        if expected is not None:
+            if len(progs) != expected:
+                fail(True, "%d BPF programs bound, expected %d" %
+                     (len(progs), expected))
+        return progs
+
+    def remove(self):
+        super().remove()
+        devs.remove(self)
+
+
+class BpfNetdevSim(NetdevSim):
+    """
+    Class for netdevsim netdevice and its attributes.
+    """
+
+    def __init__(self, nsimdev, port_index, ifname, ns=None):
+        super().__init__(nsimdev, port_index, ifname, ns=ns)
+
+        self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index)
+        self.dfs_refresh()
+
+    def __getitem__(self, key):
+        return self.dev[key]
+
+    def remove(self):
+        self.nsimdev.remove_nsim(self)
+
+    def dfs_refresh(self):
+        self.dfs = DebugfsDir(self.dfs_dir)
+        return self.dfs
+
+    def dfs_read(self, f):
+        path = os.path.join(self.dfs_dir, f)
+        _, data = cmd('cat %s' % (path))
+        return data.strip()
+
+    def wait_for_flush(self, bound=0, total=0, n_retry=20):
+        for i in range(n_retry):
+            nbound = self.nsimdev.dfs_num_bound_progs()
+            nprogs = len(bpftool_prog_list())
+            if nbound == bound and nprogs == total:
+                return
+            time.sleep(0.05)
+        raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs))
+
+    def set_ns(self, ns):
+        name = ns if ns else "1"
+        ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns)
+        self.ns = ns
+
+    def set_mtu(self, mtu, fail=True):
+        return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu),
+                  fail=fail)
+
+    def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False,
+                fail=True, include_stderr=False):
+        if verbose:
+            bpf += " verbose"
+        return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf),
+                  force=force, JSON=JSON,
+                  fail=fail, include_stderr=include_stderr)
+
+    def unset_xdp(self, mode, force=False, JSON=True,
+                  fail=True, include_stderr=False):
+        return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode),
+                  force=force, JSON=JSON,
+                  fail=fail, include_stderr=include_stderr)
+
+    def ip_link_show(self, xdp):
+        _, link = ip("link show dev %s" % (self['ifname']))
+        if len(link) > 1:
+            raise Exception("Multiple objects on ip link show")
+        if len(link) < 1:
+            return {}
+        fail(xdp != "xdp" in link,
+             "XDP program not reporting in iplink (reported %s, expected %s)" %
+             ("xdp" in link, xdp))
+        return link[0]
+
+    def tc_add_ingress(self):
+        tc("qdisc add dev %s ingress" % (self['ifname']))
+
+    def tc_del_ingress(self):
+        tc("qdisc del dev %s ingress" % (self['ifname']))
+
+    def tc_flush_filters(self, bound=0, total=0):
+        self.tc_del_ingress()
+        self.tc_add_ingress()
+        self.wait_for_flush(bound=bound, total=total)
+
+    def tc_show_ingress(self, expected=None):
+        # No JSON support, oh well...
+        flags = ["skip_sw", "skip_hw", "in_hw"]
+        named = ["protocol", "pref", "chain", "handle", "id", "tag"]
+
+        args = "-s filter show dev %s ingress" % (self['ifname'])
+        _, out = tc(args, JSON=False)
+
+        filters = []
+        lines = out.split('\n')
+        for line in lines:
+            words = line.split()
+            if "handle" not in words:
+                continue
+            fltr = {}
+            for flag in flags:
+                fltr[flag] = flag in words
+            for name in named:
+                try:
+                    idx = words.index(name)
+                    fltr[name] = words[idx + 1]
+                except ValueError:
+                    pass
+            filters.append(fltr)
+
+        if expected is not None:
+            fail(len(filters) != expected,
+                 "%d ingress filters loaded, expected %d" %
+                 (len(filters), expected))
+        return filters
+
+    def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None,
+                      chain=None, cls="", params="",
+                      fail=True, include_stderr=False):
+        spec = ""
+        if prio is not None:
+            spec += " prio %d" % (prio)
+        if handle:
+            spec += " handle %s" % (handle)
+        if chain is not None:
+            spec += " chain %d" % (chain)
+
+        return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\
+                  .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec,
+                          cls=cls, params=params),
+                  fail=fail, include_stderr=include_stderr)
+
+    def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None,
+                           chain=None, da=False, verbose=False,
+                           skip_sw=False, skip_hw=False,
+                           fail=True, include_stderr=False):
+        cls = "bpf " + bpf
+
+        params = ""
+        if da:
+            params += " da"
+        if verbose:
+            params += " verbose"
+        if skip_sw:
+            params += " skip_sw"
+        if skip_hw:
+            params += " skip_hw"
+
+        return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls,
+                                  chain=chain, params=params,
+                                  fail=fail, include_stderr=include_stderr)
+
+    def set_ethtool_tc_offloads(self, enable, fail=True):
+        args = "hw-tc-offload %s" % ("on" if enable else "off")
+        return ethtool(self, "-K", args, fail=fail)
+
+################################################################################
+def clean_up():
+    global files, netns, devs
+
+    for dev in devs:
+        dev.remove()
+    for f in files:
+        cmd("rm -f %s" % (f))
+    for ns in netns:
+        cmd("ip netns delete %s" % (ns))
+    files = []
+    netns = []
+
+def pin_prog(file_name, idx=0):
+    progs = bpftool_prog_list(expected=(idx + 1))
+    prog = progs[idx]
+    bpftool("prog pin id %d %s" % (prog["id"], file_name))
+    files.append(file_name)
+
+    return file_name, bpf_pinned(file_name)
+
+def pin_map(file_name, idx=0, expected=1):
+    maps = bpftool_map_list_wait(expected=expected)
+    m = maps[idx]
+    bpftool("map pin id %d %s" % (m["id"], file_name))
+    files.append(file_name)
+
+    return file_name, bpf_pinned(file_name)
+
+def check_dev_info_removed(prog_file=None, map_file=None):
+    bpftool_prog_list(expected=0)
+    bpftool_prog_list(expected=1, exclude_orphaned=False)
+    ret, err = bpftool("prog show pin %s" % (prog_file), fail=False)
+    fail(ret != 0, "failed to show prog with removed device")
+
+    bpftool_map_list_wait(expected=0)
+    ret, err = bpftool("map show pin %s" % (map_file), fail=False)
+    fail(ret == 0, "Showing map with removed device did not fail")
+    fail(err["error"].find("No such device") == -1,
+         "Showing map with removed device expected ENODEV, error is %s" %
+         (err["error"]))
+
+def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False):
+    progs = bpftool_prog_list(expected=1, ns=ns)
+    prog = progs[0]
+
+    fail("dev" not in prog.keys(), "Device parameters not reported")
+    dev = prog["dev"]
+    fail("ifindex" not in dev.keys(), "Device parameters not reported")
+    fail("ns_dev" not in dev.keys(), "Device parameters not reported")
+    fail("ns_inode" not in dev.keys(), "Device parameters not reported")
+
+    if not other_ns:
+        fail("ifname" not in dev.keys(), "Ifname not reported")
+        fail(dev["ifname"] != sim["ifname"],
+             "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"]))
+    else:
+        fail("ifname" in dev.keys(), "Ifname is reported for other ns")
+
+    maps = bpftool_map_list_wait(expected=2, ns=ns)
+    for m in maps:
+        fail("dev" not in m.keys(), "Device parameters not reported")
+        fail(dev != m["dev"], "Map's device different than program's")
+
+def check_extack(output, reference, args):
+    if skip_extack:
+        return
+    lines = output.split("\n")
+    comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference
+    fail(not comp, "Missing or incorrect netlink extack message")
+
+def check_extack_nsim(output, reference, args):
+    check_extack(output, "netdevsim: " + reference, args)
+
+def check_no_extack(res, needle):
+    haystack = (res[1] + res[2]).strip()
+    fail(haystack.count(needle) or haystack.count("Warning:"),
+         "Unexpected command output, leaky extack? ('%s', '%s')" % (needle, haystack))
+
+def check_verifier_log(output, reference):
+    lines = output.split("\n")
+    for l in reversed(lines):
+        if l == reference:
+            return
+    fail(True, "Missing or incorrect message from netdevsim in verifier log")
+
+def check_multi_basic(two_xdps):
+    fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs")
+    fail("prog" in two_xdps, "Base program reported in multi program mode")
+    fail(len(two_xdps["attached"]) != 2,
+         "Wrong attached program count with two programs")
+    fail(two_xdps["attached"][0]["prog"]["id"] ==
+         two_xdps["attached"][1]["prog"]["id"],
+         "Offloaded and other programs have the same id")
+
+def test_spurios_extack(sim, obj, skip_hw, needle):
+    res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw,
+                                 include_stderr=True)
+    check_no_extack(res, needle)
+    res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1,
+                                 skip_hw=skip_hw, include_stderr=True)
+    check_no_extack(res, needle)
+    res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf",
+                            include_stderr=True)
+    check_no_extack(res, needle)
+
+def test_multi_prog(simdev, sim, obj, modename, modeid):
+    start_test("Test multi-attachment XDP - %s + offload..." %
+               (modename or "default", ))
+    sim.set_xdp(obj, "offload")
+    xdp = sim.ip_link_show(xdp=True)["xdp"]
+    offloaded = sim.dfs_read("bpf_offloaded_id")
+    fail("prog" not in xdp, "Base program not reported in single program mode")
+    fail(len(xdp["attached"]) != 1,
+         "Wrong attached program count with one program")
+
+    sim.set_xdp(obj, modename)
+    two_xdps = sim.ip_link_show(xdp=True)["xdp"]
+
+    fail(xdp["attached"][0] not in two_xdps["attached"],
+         "Offload program not reported after other activated")
+    check_multi_basic(two_xdps)
+
+    offloaded2 = sim.dfs_read("bpf_offloaded_id")
+    fail(offloaded != offloaded2,
+         "Offload ID changed after loading other program")
+
+    start_test("Test multi-attachment XDP - replace...")
+    ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True)
+    fail(ret == 0, "Replaced one of programs without -force")
+    check_extack(err, "XDP program already attached.", args)
+
+    start_test("Test multi-attachment XDP - remove without mode...")
+    ret, _, err = sim.unset_xdp("", force=True,
+                                fail=False, include_stderr=True)
+    fail(ret == 0, "Removed program without a mode flag")
+    check_extack(err, "More than one program loaded, unset mode is ambiguous.", args)
+
+    sim.unset_xdp("offload")
+    xdp = sim.ip_link_show(xdp=True)["xdp"]
+    offloaded = sim.dfs_read("bpf_offloaded_id")
+
+    fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs")
+    fail("prog" not in xdp,
+         "Base program not reported after multi program mode")
+    fail(xdp["attached"][0] not in two_xdps["attached"],
+         "Offload program not reported after other activated")
+    fail(len(xdp["attached"]) != 1,
+         "Wrong attached program count with remaining programs")
+    fail(offloaded != "0", "Offload ID reported with only other program left")
+
+    start_test("Test multi-attachment XDP - reattach...")
+    sim.set_xdp(obj, "offload")
+    two_xdps = sim.ip_link_show(xdp=True)["xdp"]
+
+    fail(xdp["attached"][0] not in two_xdps["attached"],
+         "Other program not reported after offload activated")
+    check_multi_basic(two_xdps)
+
+    start_test("Test multi-attachment XDP - device remove...")
+    simdev.remove()
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.set_ethtool_tc_offloads(True)
+    return [simdev, sim]
+
+# Parse command line
+parser = argparse.ArgumentParser()
+parser.add_argument("--log", help="output verbose log to given file")
+args = parser.parse_args()
+if args.log:
+    logfile = open(args.log, 'w+')
+    logfile.write("# -*-Org-*-")
+
+log("Prepare...", "", level=1)
+log_level_inc()
+
+# Check permissions
+skip(os.getuid() != 0, "test must be run as root")
+
+# Check tools
+ret, progs = bpftool("prog", fail=False)
+skip(ret != 0, "bpftool not installed")
+base_progs = progs
+_, base_maps = bpftool("map")
+base_map_names = [
+    'pid_iter.rodata', # created on each bpftool invocation
+    'libbpf_det_bind', # created on each bpftool invocation
+    'libbpf_global',
+]
+
+# Check netdevsim
+if not os.path.isdir("/sys/bus/netdevsim/"):
+    ret, out = cmd("modprobe netdevsim", fail=False)
+    skip(ret != 0, "netdevsim module could not be loaded")
+
+# Check debugfs
+_, out = cmd("mount")
+if out.find("/sys/kernel/debug type debugfs") == -1:
+    cmd("mount -t debugfs none /sys/kernel/debug")
+
+# Check samples are compiled
+samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"]
+for s in samples:
+    ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False)
+    skip(ret != 0, "sample %s/%s not found, please compile it" %
+         (bpf_test_dir, s))
+
+# Check if iproute2 is built with libmnl (needed by extack support)
+_, _, err = cmd("tc qdisc delete dev lo handle 0",
+                fail=False, include_stderr=True)
+if err.find("Error: Failed to find qdisc with specified handle.") == -1:
+    print("Warning: no extack message in iproute2 output, libmnl missing?")
+    log("Warning: no extack message in iproute2 output, libmnl missing?", "")
+    skip_extack = True
+
+# Check if net namespaces seem to work
+ns = mknetns()
+skip(ns is None, "Could not create a net namespace")
+cmd("ip netns delete %s" % (ns))
+netns = []
+
+try:
+    obj = bpf_obj("sample_ret0.bpf.o")
+    bytecode = bpf_bytecode("1,6 0 0 4294967295,")
+
+    start_test("Test destruction of generic XDP...")
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.set_xdp(obj, "generic")
+    simdev.remove()
+    bpftool_prog_list_wait(expected=0)
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.tc_add_ingress()
+
+    start_test("Test TC non-offloaded...")
+    ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False)
+    fail(ret != 0, "Software TC filter did not load")
+
+    start_test("Test TC non-offloaded isn't getting bound...")
+    ret, _ = sim.cls_bpf_add_filter(obj, fail=False)
+    fail(ret != 0, "Software TC filter did not load")
+    simdev.dfs_get_bound_progs(expected=0)
+
+    sim.tc_flush_filters()
+
+    start_test("Test TC offloads are off by default...")
+    ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "TC filter loaded without enabling TC offloads")
+    check_extack(err, "TC offload is disabled on net device.", args)
+    sim.wait_for_flush()
+
+    sim.set_ethtool_tc_offloads(True)
+    sim.dfs["bpf_tc_non_bound_accept"] = "Y"
+
+    start_test("Test TC offload by default...")
+    ret, _ = sim.cls_bpf_add_filter(obj, fail=False)
+    fail(ret != 0, "Software TC filter did not load")
+    simdev.dfs_get_bound_progs(expected=0)
+    ingress = sim.tc_show_ingress(expected=1)
+    fltr = ingress[0]
+    fail(not fltr["in_hw"], "Filter not offloaded by default")
+
+    sim.tc_flush_filters()
+
+    start_test("Test TC cBPF bytcode tries offload by default...")
+    ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False)
+    fail(ret != 0, "Software TC filter did not load")
+    simdev.dfs_get_bound_progs(expected=0)
+    ingress = sim.tc_show_ingress(expected=1)
+    fltr = ingress[0]
+    fail(not fltr["in_hw"], "Bytecode not offloaded by default")
+
+    sim.tc_flush_filters()
+    sim.dfs["bpf_tc_non_bound_accept"] = "N"
+
+    start_test("Test TC cBPF unbound bytecode doesn't offload...")
+    ret, _, err = sim.cls_bpf_add_filter(bytecode, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "TC bytecode loaded for offload")
+    check_extack_nsim(err, "netdevsim configured to reject unbound programs.",
+                      args)
+    sim.wait_for_flush()
+
+    start_test("Test non-0 chain offload...")
+    ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1,
+                                         skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "Offloaded a filter to chain other than 0")
+    check_extack(err, "Driver supports only offload of chain 0.", args)
+    sim.tc_flush_filters()
+
+    start_test("Test TC replace...")
+    sim.cls_bpf_add_filter(obj, prio=1, handle=1)
+    sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1)
+    sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+    sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True)
+    sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True)
+    sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+    sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True)
+    sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True)
+    sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+    start_test("Test TC replace bad flags...")
+    for i in range(3):
+        for j in range(3):
+            ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1,
+                                            skip_sw=(j == 1), skip_hw=(j == 2),
+                                            fail=False)
+            fail(bool(ret) != bool(j),
+                 "Software TC incorrect load in replace test, iteration %d" %
+                 (j))
+        sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+    start_test("Test spurious extack from the driver...")
+    test_spurios_extack(sim, obj, False, "netdevsim")
+    test_spurios_extack(sim, obj, True, "netdevsim")
+
+    sim.set_ethtool_tc_offloads(False)
+
+    test_spurios_extack(sim, obj, False, "TC offload is disabled")
+    test_spurios_extack(sim, obj, True, "TC offload is disabled")
+
+    sim.set_ethtool_tc_offloads(True)
+
+    sim.tc_flush_filters()
+
+    start_test("Test TC offloads failure...")
+    sim.dfs["dev/bpf_bind_verifier_accept"] = 0
+    ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "TC filter did not reject with TC offloads enabled")
+    check_verifier_log(err, "[netdevsim] Hello from netdevsim!")
+    sim.dfs["dev/bpf_bind_verifier_accept"] = 1
+
+    start_test("Test TC offloads work...")
+    ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret != 0, "TC filter did not load with TC offloads enabled")
+
+    start_test("Test TC offload basics...")
+    dfs = simdev.dfs_get_bound_progs(expected=1)
+    progs = bpftool_prog_list(expected=1)
+    ingress = sim.tc_show_ingress(expected=1)
+
+    dprog = dfs[0]
+    prog = progs[0]
+    fltr = ingress[0]
+    fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter")
+    fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter")
+    fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back")
+
+    start_test("Test TC offload is device-bound...")
+    fail(str(prog["id"]) != fltr["id"], "Program IDs don't match")
+    fail(prog["tag"] != fltr["tag"], "Program tags don't match")
+    fail(fltr["id"] != dprog["id"], "Program IDs don't match")
+    fail(dprog["state"] != "xlated", "Offloaded program state not translated")
+    fail(dprog["loaded"] != "Y", "Offloaded program is not loaded")
+
+    start_test("Test disabling TC offloads is rejected while filters installed...")
+    ret, _ = sim.set_ethtool_tc_offloads(False, fail=False)
+    fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...")
+    sim.set_ethtool_tc_offloads(True)
+
+    start_test("Test qdisc removal frees things...")
+    sim.tc_flush_filters()
+    sim.tc_show_ingress(expected=0)
+
+    start_test("Test disabling TC offloads is OK without filters...")
+    ret, _ = sim.set_ethtool_tc_offloads(False, fail=False)
+    fail(ret != 0,
+         "Driver refused to disable TC offloads without filters installed...")
+
+    sim.set_ethtool_tc_offloads(True)
+
+    start_test("Test destroying device gets rid of TC filters...")
+    sim.cls_bpf_add_filter(obj, skip_sw=True)
+    simdev.remove()
+    bpftool_prog_list_wait(expected=0)
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.set_ethtool_tc_offloads(True)
+
+    start_test("Test destroying device gets rid of XDP...")
+    sim.set_xdp(obj, "offload")
+    simdev.remove()
+    bpftool_prog_list_wait(expected=0)
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.set_ethtool_tc_offloads(True)
+
+    start_test("Test XDP prog reporting...")
+    sim.set_xdp(obj, "drv")
+    ipl = sim.ip_link_show(xdp=True)
+    progs = bpftool_prog_list(expected=1)
+    fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
+         "Loaded program has wrong ID")
+
+    start_test("Test XDP prog replace without force...")
+    ret, _ = sim.set_xdp(obj, "drv", fail=False)
+    fail(ret == 0, "Replaced XDP program without -force")
+    sim.wait_for_flush(total=1)
+
+    start_test("Test XDP prog replace with force...")
+    ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False)
+    fail(ret != 0, "Could not replace XDP program with -force")
+    bpftool_prog_list_wait(expected=1)
+    ipl = sim.ip_link_show(xdp=True)
+    progs = bpftool_prog_list(expected=1)
+    fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
+         "Loaded program has wrong ID")
+    fail("dev" in progs[0].keys(),
+         "Device parameters reported for non-offloaded program")
+
+    start_test("Test XDP prog replace with bad flags...")
+    ret, _, err = sim.set_xdp(obj, "generic", force=True,
+                              fail=False, include_stderr=True)
+    fail(ret == 0, "Replaced XDP program with a program in different mode")
+    check_extack(err,
+                 "Native and generic XDP can't be active at the same time.",
+                 args)
+
+    start_test("Test MTU restrictions...")
+    ret, _ = sim.set_mtu(9000, fail=False)
+    fail(ret == 0,
+         "Driver should refuse to increase MTU to 9000 with XDP loaded...")
+    sim.unset_xdp("drv")
+    bpftool_prog_list_wait(expected=0)
+    sim.set_mtu(9000)
+    ret, _, err = sim.set_xdp(obj, "drv", fail=False, include_stderr=True)
+    fail(ret == 0, "Driver should refuse to load program with MTU of 9000...")
+    check_extack_nsim(err, "MTU too large w/ XDP enabled.", args)
+    sim.set_mtu(1500)
+
+    sim.wait_for_flush()
+    start_test("Test non-offload XDP attaching to HW...")
+    bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload")
+    nooffload = bpf_pinned("/sys/fs/bpf/nooffload")
+    ret, _, err = sim.set_xdp(nooffload, "offload",
+                              fail=False, include_stderr=True)
+    fail(ret == 0, "attached non-offloaded XDP program to HW")
+    check_extack_nsim(err, "xdpoffload of non-bound program.", args)
+    rm("/sys/fs/bpf/nooffload")
+
+    start_test("Test offload XDP attaching to drv...")
+    bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload",
+                      dev=sim['ifname'])
+    offload = bpf_pinned("/sys/fs/bpf/offload")
+    ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True)
+    fail(ret == 0, "attached offloaded XDP program to drv")
+    check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args)
+    rm("/sys/fs/bpf/offload")
+    sim.wait_for_flush()
+
+    bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/devbound",
+                      dev_bind=sim['ifname'])
+    devbound = bpf_pinned("/sys/fs/bpf/devbound")
+    start_test("Test dev-bound program in generic mode...")
+    ret, _, err = sim.set_xdp(devbound, "generic", fail=False, include_stderr=True)
+    fail(ret == 0, "devbound program in generic mode allowed")
+    check_extack(err, "Can't attach device-bound programs in generic mode.", args)
+    rm("/sys/fs/bpf/devbound")
+    sim.wait_for_flush()
+
+    start_test("Test XDP load failure...")
+    sim.dfs["dev/bpf_bind_verifier_accept"] = 0
+    ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload",
+                                 dev=sim['ifname'], fail=False, include_stderr=True)
+    fail(ret == 0, "verifier should fail on load")
+    check_verifier_log(err, "[netdevsim] Hello from netdevsim!")
+    sim.dfs["dev/bpf_bind_verifier_accept"] = 1
+    sim.wait_for_flush()
+
+    start_test("Test XDP offload...")
+    _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True)
+    ipl = sim.ip_link_show(xdp=True)
+    link_xdp = ipl["xdp"]["prog"]
+    progs = bpftool_prog_list(expected=1)
+    prog = progs[0]
+    fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID")
+
+    start_test("Test XDP offload is device bound...")
+    dfs = simdev.dfs_get_bound_progs(expected=1)
+    dprog = dfs[0]
+
+    fail(prog["id"] != link_xdp["id"], "Program IDs don't match")
+    fail(prog["tag"] != link_xdp["tag"], "Program tags don't match")
+    fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match")
+    fail(dprog["state"] != "xlated", "Offloaded program state not translated")
+    fail(dprog["loaded"] != "Y", "Offloaded program is not loaded")
+
+    start_test("Test removing XDP program many times...")
+    sim.unset_xdp("offload")
+    sim.unset_xdp("offload")
+    sim.unset_xdp("drv")
+    sim.unset_xdp("drv")
+    sim.unset_xdp("")
+    sim.unset_xdp("")
+    bpftool_prog_list_wait(expected=0)
+
+    start_test("Test attempt to use a program for a wrong device...")
+    simdev2 = BpfNetdevSimDev()
+    sim2, = simdev2.nsims
+    sim2.set_xdp(obj, "offload")
+    pin_file, pinned = pin_prog("/sys/fs/bpf/tmp")
+
+    ret, _, err = sim.set_xdp(pinned, "offload",
+                              fail=False, include_stderr=True)
+    fail(ret == 0, "Pinned program loaded for a different device accepted")
+    check_extack(err, "Program bound to different device.", args)
+    simdev2.remove()
+    ret, _, err = sim.set_xdp(pinned, "offload",
+                              fail=False, include_stderr=True)
+    fail(ret == 0, "Pinned program loaded for a removed device accepted")
+    check_extack(err, "Program bound to different device.", args)
+    rm(pin_file)
+    bpftool_prog_list_wait(expected=0)
+
+    simdev, sim = test_multi_prog(simdev, sim, obj, "", 1)
+    simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1)
+    simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2)
+
+    start_test("Test mixing of TC and XDP...")
+    sim.tc_add_ingress()
+    sim.set_xdp(obj, "offload")
+    ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "Loading TC when XDP active should fail")
+    check_extack_nsim(err, "driver and netdev offload states mismatch.", args)
+    sim.unset_xdp("offload")
+    sim.wait_for_flush()
+
+    sim.cls_bpf_add_filter(obj, skip_sw=True)
+    ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True)
+    fail(ret == 0, "Loading XDP when TC active should fail")
+    check_extack_nsim(err, "TC program is already loaded.", args)
+
+    start_test("Test binding TC from pinned...")
+    pin_file, pinned = pin_prog("/sys/fs/bpf/tmp")
+    sim.tc_flush_filters(bound=1, total=1)
+    sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True)
+    sim.tc_flush_filters(bound=1, total=1)
+
+    start_test("Test binding XDP from pinned...")
+    sim.set_xdp(obj, "offload")
+    pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1)
+
+    sim.set_xdp(pinned, "offload", force=True)
+    sim.unset_xdp("offload")
+    sim.set_xdp(pinned, "offload", force=True)
+    sim.unset_xdp("offload")
+
+    start_test("Test offload of wrong type fails...")
+    ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False)
+    fail(ret == 0, "Managed to attach XDP program to TC")
+
+    start_test("Test asking for TC offload of two filters...")
+    sim.cls_bpf_add_filter(obj, da=True, skip_sw=True)
+    ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True,
+                                         fail=False, include_stderr=True)
+    fail(ret == 0, "Managed to offload two TC filters at the same time")
+    check_extack_nsim(err, "driver and netdev offload states mismatch.", args)
+
+    sim.tc_flush_filters(bound=2, total=2)
+
+    start_test("Test if netdev removal waits for translation...")
+    delay_msec = 500
+    sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec
+    start = time.time()
+    cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \
+               (sim['ifname'], obj)
+    tc_proc = cmd(cmd_line, background=True, fail=False)
+    # Wait for the verifier to start
+    while simdev.dfs_num_bound_progs() <= 2:
+        pass
+    simdev.remove()
+    end = time.time()
+    ret, _ = cmd_result(tc_proc, fail=False)
+    time_diff = end - start
+    log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff))
+
+    fail(ret == 0, "Managed to load TC filter on a unregistering device")
+    delay_sec = delay_msec * 0.001
+    fail(time_diff < delay_sec, "Removal process took %s, expected %s" %
+         (time_diff, delay_sec))
+
+    # Remove all pinned files and reinstantiate the netdev
+    clean_up()
+    bpftool_prog_list_wait(expected=0)
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    map_obj = bpf_obj("sample_map_ret0.bpf.o")
+    start_test("Test loading program with maps...")
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+
+    start_test("Test bpftool bound info reporting (own ns)...")
+    check_dev_info(False, "")
+
+    start_test("Test bpftool bound info reporting (other ns)...")
+    ns = mknetns()
+    sim.set_ns(ns)
+    check_dev_info(True, "")
+
+    start_test("Test bpftool bound info reporting (remote ns)...")
+    check_dev_info(False, ns)
+
+    start_test("Test bpftool bound info reporting (back to own ns)...")
+    sim.set_ns("")
+    check_dev_info(False, "")
+
+    prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog")
+    map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2)
+    simdev.remove()
+
+    start_test("Test bpftool bound info reporting (removed dev)...")
+    check_dev_info_removed(prog_file=prog_file, map_file=map_file)
+
+    # Remove all pinned files and reinstantiate the netdev
+    clean_up()
+    bpftool_prog_list_wait(expected=0)
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+
+    start_test("Test map update (no flags)...")
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+    maps = bpftool_map_list_wait(expected=2)
+    array = maps[0] if maps[0]["type"] == "array" else maps[1]
+    htab = maps[0] if maps[0]["type"] == "hash" else maps[1]
+    for m in maps:
+        for i in range(2):
+            bpftool("map update id %d key %s value %s" %
+                    (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+    for m in maps:
+        ret, _ = bpftool("map update id %d key %s value %s" %
+                         (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+                         fail=False)
+        fail(ret == 0, "added too many entries")
+
+    start_test("Test map update (exists)...")
+    for m in maps:
+        for i in range(2):
+            bpftool("map update id %d key %s value %s exist" %
+                    (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+    for m in maps:
+        ret, err = bpftool("map update id %d key %s value %s exist" %
+                           (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+                           fail=False)
+        fail(ret == 0, "updated non-existing key")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map update (noexist)...")
+    for m in maps:
+        for i in range(2):
+            ret, err = bpftool("map update id %d key %s value %s noexist" %
+                               (m["id"], int2str("I", i), int2str("Q", i * 3)),
+                               fail=False)
+        fail(ret == 0, "updated existing key")
+        fail(err["error"].find("File exists") == -1,
+             "expected EEXIST, error is '%s'" % (err["error"]))
+
+    start_test("Test map dump...")
+    for m in maps:
+        _, entries = bpftool("map dump id %d" % (m["id"]))
+        for i in range(2):
+            key = str2int(entries[i]["key"])
+            fail(key != i, "expected key %d, got %d" % (key, i))
+            val = str2int(entries[i]["value"])
+            fail(val != i * 3, "expected value %d, got %d" % (val, i * 3))
+
+    start_test("Test map getnext...")
+    for m in maps:
+        _, entry = bpftool("map getnext id %d" % (m["id"]))
+        key = str2int(entry["next_key"])
+        fail(key != 0, "next key %d, expected %d" % (key, 0))
+        _, entry = bpftool("map getnext id %d key %s" %
+                           (m["id"], int2str("I", 0)))
+        key = str2int(entry["next_key"])
+        fail(key != 1, "next key %d, expected %d" % (key, 1))
+        ret, err = bpftool("map getnext id %d key %s" %
+                           (m["id"], int2str("I", 1)), fail=False)
+        fail(ret == 0, "got next key past the end of map")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map delete (htab)...")
+    for i in range(2):
+        bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i)))
+
+    start_test("Test map delete (array)...")
+    for i in range(2):
+        ret, err = bpftool("map delete id %d key %s" %
+                           (htab["id"], int2str("I", i)), fail=False)
+        fail(ret == 0, "removed entry from an array")
+        fail(err["error"].find("No such file or directory") == -1,
+             "expected ENOENT, error is '%s'" % (err["error"]))
+
+    start_test("Test map remove...")
+    sim.unset_xdp("offload")
+    bpftool_map_list_wait(expected=0)
+    simdev.remove()
+
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+    simdev.remove()
+    bpftool_map_list_wait(expected=0)
+
+    start_test("Test map creation fail path...")
+    simdev = BpfNetdevSimDev()
+    sim, = simdev.nsims
+    sim.dfs["bpf_map_accept"] = "N"
+    ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False)
+    fail(ret == 0,
+         "netdevsim didn't refuse to create a map with offload disabled")
+
+    simdev.remove()
+
+    start_test("Test multi-dev ASIC program reuse...")
+    simdevA = BpfNetdevSimDev()
+    simA, = simdevA.nsims
+    simdevB = BpfNetdevSimDev(3)
+    simB1, simB2, simB3 = simdevB.nsims
+    sims = (simA, simB1, simB2, simB3)
+    simB = (simB1, simB2, simB3)
+
+    bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA",
+                      dev=simA['ifname'])
+    progA = bpf_pinned("/sys/fs/bpf/nsimA")
+    bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB",
+                      dev=simB1['ifname'])
+    progB = bpf_pinned("/sys/fs/bpf/nsimB")
+
+    simA.set_xdp(progA, "offload", JSON=False)
+    for d in simdevB.nsims:
+        d.set_xdp(progB, "offload", JSON=False)
+
+    start_test("Test multi-dev ASIC cross-dev replace...")
+    ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False)
+    fail(ret == 0, "cross-ASIC program allowed")
+    for d in simdevB.nsims:
+        ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False)
+        fail(ret == 0, "cross-ASIC program allowed")
+
+    start_test("Test multi-dev ASIC cross-dev install...")
+    for d in sims:
+        d.unset_xdp("offload")
+
+    ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False,
+                               fail=False, include_stderr=True)
+    fail(ret == 0, "cross-ASIC program allowed")
+    check_extack(err, "Program bound to different device.", args)
+    for d in simdevB.nsims:
+        ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False,
+                                fail=False, include_stderr=True)
+        fail(ret == 0, "cross-ASIC program allowed")
+        check_extack(err, "Program bound to different device.", args)
+
+    start_test("Test multi-dev ASIC cross-dev map reuse...")
+
+    mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0]
+    mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0]
+
+    ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_",
+                               dev=simB3['ifname'],
+                               maps=["idx 0 id %d" % (mapB)],
+                               fail=False)
+    fail(ret != 0, "couldn't reuse a map on the same ASIC")
+    rm("/sys/fs/bpf/nsimB_")
+
+    ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_",
+                                    dev=simA['ifname'],
+                                    maps=["idx 0 id %d" % (mapB)],
+                                    fail=False, include_stderr=True)
+    fail(ret == 0, "could reuse a map on a different ASIC")
+    fail(err.count("offload device mismatch between prog and map") == 0,
+         "error message missing for cross-ASIC map")
+
+    ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_",
+                                    dev=simB1['ifname'],
+                                    maps=["idx 0 id %d" % (mapA)],
+                                    fail=False, include_stderr=True)
+    fail(ret == 0, "could reuse a map on a different ASIC")
+    fail(err.count("offload device mismatch between prog and map") == 0,
+         "error message missing for cross-ASIC map")
+
+    start_test("Test multi-dev ASIC cross-dev destruction...")
+    bpftool_prog_list_wait(expected=2)
+
+    simdevA.remove()
+    bpftool_prog_list_wait(expected=1)
+
+    ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+    fail(ifnameB != simB1['ifname'], "program not bound to original device")
+    simB1.remove()
+    bpftool_prog_list_wait(expected=1)
+
+    start_test("Test multi-dev ASIC cross-dev destruction - move...")
+    ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+    fail(ifnameB not in (simB2['ifname'], simB3['ifname']),
+         "program not bound to remaining devices")
+
+    simB2.remove()
+    ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+    fail(ifnameB != simB3['ifname'], "program not bound to remaining device")
+
+    simB3.remove()
+    simdevB.remove()
+    bpftool_prog_list_wait(expected=0)
+
+    start_test("Test multi-dev ASIC cross-dev destruction - orphaned...")
+    ret, out = bpftool("prog show %s" % (progB), fail=False)
+    fail(ret != 0, "couldn't get information about orphaned program")
+
+    print("%s: OK" % (os.path.basename(__file__)))
+
+finally:
+    log("Clean up...", "", level=1)
+    log_level_inc()
+    clean_up()
diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh
new file mode 100755
index 000000000000..7d2d40812074
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poll_test.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+source lib.sh
+
+NSIM_SV_ID=$((256 + RANDOM % 256))
+NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID
+NSIM_CL_ID=$((512 + RANDOM % 256))
+NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
+NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device
+
+SERVER_IP=192.168.1.1
+CLIENT_IP=192.168.1.2
+SERVER_PORT=48675
+
+# busy poll config
+MAX_EVENTS=8
+BUSY_POLL_USECS=0
+BUSY_POLL_BUDGET=16
+PREFER_BUSY_POLL=1
+
+# IRQ deferral config
+NAPI_DEFER_HARD_IRQS=100
+GRO_FLUSH_TIMEOUT=50000
+SUSPEND_TIMEOUT=20000000
+
+setup_ns()
+{
+	set -e
+	ip netns add nssv
+	ip netns add nscl
+
+	NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_SV_SYS/net -exec basename {} \;)
+	NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \
+		-path $NSIM_CL_SYS/net -exec basename {} \;)
+
+	# ensure the server has 1 queue
+	ethtool -L $NSIM_SV_NAME combined 1 2>/dev/null
+
+	ip link set $NSIM_SV_NAME netns nssv
+	ip link set $NSIM_CL_NAME netns nscl
+
+	ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME
+	ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME
+
+	ip netns exec nssv ip link set dev $NSIM_SV_NAME up
+	ip netns exec nscl ip link set dev $NSIM_CL_NAME up
+
+	set +e
+}
+
+cleanup_ns()
+{
+	ip netns del nscl
+	ip netns del nssv
+}
+
+test_busypoll()
+{
+	suspend_value=${1:-0}
+	tmp_file=$(mktemp)
+	out_file=$(mktemp)
+
+	# fill a test file with random data
+	dd if=/dev/urandom of=${tmp_file} bs=1M count=1 2> /dev/null
+
+	timeout -k 1s 30s ip netns exec nssv ./busy_poller         \
+					     -p${SERVER_PORT}      \
+					     -b${SERVER_IP}        \
+					     -m${MAX_EVENTS}       \
+					     -u${BUSY_POLL_USECS}  \
+					     -P${PREFER_BUSY_POLL} \
+					     -g${BUSY_POLL_BUDGET} \
+					     -i${NSIM_SV_IFIDX}    \
+					     -s${suspend_value}    \
+					     -o${out_file}&
+
+	wait_local_port_listen nssv ${SERVER_PORT} tcp
+
+	ip netns exec nscl socat -u $tmp_file TCP:${SERVER_IP}:${SERVER_PORT}
+
+	wait
+
+	tmp_file_md5sum=$(md5sum $tmp_file | cut -f1 -d' ')
+	out_file_md5sum=$(md5sum $out_file | cut -f1 -d' ')
+
+	if [ "$tmp_file_md5sum" = "$out_file_md5sum" ]; then
+		res=0
+	else
+		echo "md5sum mismatch"
+		echo "input file md5sum: ${tmp_file_md5sum}";
+		echo "output file md5sum: ${out_file_md5sum}";
+		res=1
+	fi
+
+	rm $out_file $tmp_file
+
+	return $res
+}
+
+test_busypoll_with_suspend()
+{
+	test_busypoll ${SUSPEND_TIMEOUT}
+
+	return $?
+}
+
+###
+### Code start
+###
+
+modprobe netdevsim
+
+# linking
+
+echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+
+setup_ns
+
+NSIM_SV_FD=$((256 + RANDOM % 256))
+exec {NSIM_SV_FD}</var/run/netns/nssv
+NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)
+
+NSIM_CL_FD=$((256 + RANDOM % 256))
+exec {NSIM_CL_FD}</var/run/netns/nscl
+NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
+     $NSIM_DEV_SYS_LINK
+
+if [ $? -ne 0 ]; then
+	echo "linking netdevsim1 with netdevsim2 should succeed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll
+if [ $? -ne 0 ]; then
+	echo "test_busypoll failed"
+	cleanup_ns
+	exit 1
+fi
+
+test_busypoll_with_suspend
+if [ $? -ne 0 ]; then
+	echo "test_busypoll_with_suspend failed"
+	cleanup_ns
+	exit 1
+fi
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
+
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
+
+cleanup_ns
+
+modprobe -r netdevsim
+
+exit 0
diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c
new file mode 100644
index 000000000000..04c7ff577bb8
--- /dev/null
+++ b/tools/testing/selftests/net/busy_poller.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <ynl.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+
+#include "netdev-user.h"
+
+/* The below ifdef blob is required because:
+ *
+ * - sys/epoll.h does not (yet) have the ioctl definitions included. So,
+ *   systems with older glibcs will not have them available. However,
+ *   sys/epoll.h does include the type definition for epoll_data, which is
+ *   needed by the user program (e.g. epoll_event.data.fd)
+ *
+ * - linux/eventpoll.h does not define the epoll_data type, it is simply an
+ *   opaque __u64. It does, however, include the ioctl definition.
+ *
+ * Including both headers is impossible (types would be redefined), so I've
+ * opted instead to take sys/epoll.h, and include the blob below.
+ *
+ * Someday, when glibc is globally up to date, the blob below can be removed.
+ */
+#if !defined(EPOLL_IOC_TYPE)
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+#endif
+
+static uint16_t cfg_port = 8000;
+static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY };
+static char *cfg_outfile;
+static int cfg_max_events = 8;
+static uint32_t cfg_ifindex;
+
+/* busy poll params */
+static uint32_t cfg_busy_poll_usecs;
+static uint16_t cfg_busy_poll_budget;
+static uint8_t cfg_prefer_busy_poll;
+
+/* IRQ params */
+static uint32_t cfg_defer_hard_irqs;
+static uint64_t cfg_gro_flush_timeout;
+static uint64_t cfg_irq_suspend_timeout;
+
+static void usage(const char *filepath)
+{
+	error(1, 0,
+	      "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -i<ifindex>",
+	      filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	unsigned long long tmp;
+	int ret;
+	int c;
+
+	if (argc <= 1)
+		usage(argv[0]);
+
+	while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) {
+		/* most options take integer values, except o and b, so reduce
+		 * code duplication a bit for the common case by calling
+		 * strtoull here and leave bounds checking and casting per
+		 * option below.
+		 */
+		if (c != 'o' && c != 'b')
+			tmp = strtoull(optarg, NULL, 0);
+
+		switch (c) {
+		case 'u':
+			if (tmp == ULLONG_MAX || tmp > UINT32_MAX)
+				error(1, ERANGE, "busy_poll_usecs too large");
+
+			cfg_busy_poll_usecs = (uint32_t)tmp;
+			break;
+		case 'P':
+			if (tmp == ULLONG_MAX || tmp > 1)
+				error(1, ERANGE,
+				      "prefer busy poll should be 0 or 1");
+
+			cfg_prefer_busy_poll = (uint8_t)tmp;
+			break;
+		case 'g':
+			if (tmp == ULLONG_MAX || tmp > UINT16_MAX)
+				error(1, ERANGE,
+				      "busy poll budget must be [0, UINT16_MAX]");
+
+			cfg_busy_poll_budget = (uint16_t)tmp;
+			break;
+		case 'p':
+			if (tmp == ULLONG_MAX || tmp > UINT16_MAX)
+				error(1, ERANGE, "port must be <= 65535");
+
+			cfg_port = (uint16_t)tmp;
+			break;
+		case 'b':
+			ret = inet_aton(optarg, &cfg_bind_addr);
+			if (ret == 0)
+				error(1, errno,
+				      "bind address %s invalid", optarg);
+			break;
+		case 'o':
+			cfg_outfile = strdup(optarg);
+			if (!cfg_outfile)
+				error(1, 0, "outfile invalid");
+			break;
+		case 'm':
+			if (tmp == ULLONG_MAX || tmp > INT_MAX)
+				error(1, ERANGE,
+				      "max events must be > 0 and <= INT_MAX");
+
+			cfg_max_events = (int)tmp;
+			break;
+		case 'd':
+			if (tmp == ULLONG_MAX || tmp > INT32_MAX)
+				error(1, ERANGE,
+				      "defer_hard_irqs must be <= INT32_MAX");
+
+			cfg_defer_hard_irqs = (uint32_t)tmp;
+			break;
+		case 'r':
+			if (tmp == ULLONG_MAX || tmp > UINT64_MAX)
+				error(1, ERANGE,
+				      "gro_flush_timeout must be < UINT64_MAX");
+
+			cfg_gro_flush_timeout = (uint64_t)tmp;
+			break;
+		case 's':
+			if (tmp == ULLONG_MAX || tmp > UINT64_MAX)
+				error(1, ERANGE,
+				      "irq_suspend_timeout must be < ULLONG_MAX");
+
+			cfg_irq_suspend_timeout = (uint64_t)tmp;
+			break;
+		case 'i':
+			if (tmp == ULLONG_MAX || tmp > INT_MAX)
+				error(1, ERANGE,
+				      "ifindex must be <= INT_MAX");
+
+			cfg_ifindex = (int)tmp;
+			break;
+		}
+	}
+
+	if (!cfg_ifindex)
+		usage(argv[0]);
+
+	if (optind != argc)
+		usage(argv[0]);
+}
+
+static void epoll_ctl_add(int epfd, int fd, uint32_t events)
+{
+	struct epoll_event ev;
+
+	ev.events = events;
+	ev.data.fd = fd;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == -1)
+		error(1, errno, "epoll_ctl add fd: %d", fd);
+}
+
+static void setnonblock(int sockfd)
+{
+	int flags;
+
+	flags = fcntl(sockfd, F_GETFL, 0);
+
+	if (fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) == -1)
+		error(1, errno, "unable to set socket to nonblocking mode");
+}
+
+static void write_chunk(int fd, char *buf, ssize_t buflen)
+{
+	ssize_t remaining = buflen;
+	char *buf_offset = buf;
+	ssize_t writelen = 0;
+	ssize_t write_result;
+
+	while (writelen < buflen) {
+		write_result = write(fd, buf_offset, remaining);
+		if (write_result == -1)
+			error(1, errno, "unable to write data to outfile");
+
+		writelen += write_result;
+		remaining -= write_result;
+		buf_offset += write_result;
+	}
+}
+
+static void setup_queue(void)
+{
+	struct netdev_napi_get_list *napi_list = NULL;
+	struct netdev_napi_get_req_dump *req = NULL;
+	struct netdev_napi_set_req *set_req = NULL;
+	struct ynl_sock *ys;
+	struct ynl_error yerr;
+	uint32_t napi_id = 0;
+
+	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!ys)
+		error(1, 0, "YNL: %s", yerr.msg);
+
+	req = netdev_napi_get_req_dump_alloc();
+	netdev_napi_get_req_dump_set_ifindex(req, cfg_ifindex);
+	napi_list = netdev_napi_get_dump(ys, req);
+
+	/* assume there is 1 NAPI configured and take the first */
+	if (napi_list->obj._present.id)
+		napi_id = napi_list->obj.id;
+	else
+		error(1, 0, "napi ID not present?");
+
+	set_req = netdev_napi_set_req_alloc();
+	netdev_napi_set_req_set_id(set_req, napi_id);
+	netdev_napi_set_req_set_defer_hard_irqs(set_req, cfg_defer_hard_irqs);
+	netdev_napi_set_req_set_gro_flush_timeout(set_req,
+						  cfg_gro_flush_timeout);
+	netdev_napi_set_req_set_irq_suspend_timeout(set_req,
+						    cfg_irq_suspend_timeout);
+
+	if (netdev_napi_set(ys, set_req))
+		error(1, 0, "can't set NAPI params: %s\n", yerr.msg);
+
+	netdev_napi_get_list_free(napi_list);
+	netdev_napi_get_req_dump_free(req);
+	netdev_napi_set_req_free(set_req);
+	ynl_sock_destroy(ys);
+}
+
+static void run_poller(void)
+{
+	struct epoll_event events[cfg_max_events];
+	struct epoll_params epoll_params = {0};
+	struct sockaddr_in server_addr;
+	int i, epfd, nfds;
+	ssize_t readlen;
+	int outfile_fd;
+	char buf[1024];
+	int sockfd;
+	int conn;
+	int val;
+
+	outfile_fd = open(cfg_outfile, O_WRONLY | O_CREAT, 0644);
+	if (outfile_fd == -1)
+		error(1, errno, "unable to open outfile: %s", cfg_outfile);
+
+	sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sockfd == -1)
+		error(1, errno, "unable to create listen socket");
+
+	server_addr.sin_family = AF_INET;
+	server_addr.sin_port = htons(cfg_port);
+	server_addr.sin_addr = cfg_bind_addr;
+
+	/* these values are range checked during parse_opts, so casting is safe
+	 * here
+	 */
+	epoll_params.busy_poll_usecs = cfg_busy_poll_usecs;
+	epoll_params.busy_poll_budget = cfg_busy_poll_budget;
+	epoll_params.prefer_busy_poll = cfg_prefer_busy_poll;
+	epoll_params.__pad = 0;
+
+	val = 1;
+	if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)))
+		error(1, errno, "poller setsockopt reuseaddr");
+
+	setnonblock(sockfd);
+
+	if (bind(sockfd, (struct sockaddr *)&server_addr,
+		 sizeof(struct sockaddr_in)))
+		error(0, errno, "poller bind to port: %d\n", cfg_port);
+
+	if (listen(sockfd, 1))
+		error(1, errno, "poller listen");
+
+	epfd = epoll_create1(0);
+	if (ioctl(epfd, EPIOCSPARAMS, &epoll_params) == -1)
+		error(1, errno, "unable to set busy poll params");
+
+	epoll_ctl_add(epfd, sockfd, EPOLLIN | EPOLLOUT | EPOLLET);
+
+	for (;;) {
+		nfds = epoll_wait(epfd, events, cfg_max_events, -1);
+		for (i = 0; i < nfds; i++) {
+			if (events[i].data.fd == sockfd) {
+				conn = accept(sockfd, NULL, NULL);
+				if (conn == -1)
+					error(1, errno,
+					      "accepting incoming connection failed");
+
+				setnonblock(conn);
+				epoll_ctl_add(epfd, conn,
+					      EPOLLIN | EPOLLET | EPOLLRDHUP |
+					      EPOLLHUP);
+			} else if (events[i].events & EPOLLIN) {
+				for (;;) {
+					readlen = read(events[i].data.fd, buf,
+						       sizeof(buf));
+					if (readlen > 0)
+						write_chunk(outfile_fd, buf,
+							    readlen);
+					else
+						break;
+				}
+			} else {
+				/* spurious event ? */
+			}
+			if (events[i].events & (EPOLLRDHUP | EPOLLHUP)) {
+				epoll_ctl(epfd, EPOLL_CTL_DEL,
+					  events[i].data.fd, NULL);
+				close(events[i].data.fd);
+				close(outfile_fd);
+				return;
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	parse_opts(argc, argv);
+	setup_queue();
+	run_poller();
+
+	if (cfg_outfile)
+		free(cfg_outfile);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/can/.gitignore b/tools/testing/selftests/net/can/.gitignore
new file mode 100644
index 000000000000..764a53fc837f
--- /dev/null
+++ b/tools/testing/selftests/net/can/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_raw_filter
diff --git a/tools/testing/selftests/net/can/Makefile b/tools/testing/selftests/net/can/Makefile
new file mode 100644
index 000000000000..5b82e60a03e7
--- /dev/null
+++ b/tools/testing/selftests/net/can/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+top_srcdir = ../../../../..
+
+CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
+
+TEST_PROGS := test_raw_filter.sh
+
+TEST_GEN_FILES := test_raw_filter
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/can/test_raw_filter.c b/tools/testing/selftests/net/can/test_raw_filter.c
new file mode 100644
index 000000000000..4101c36390fd
--- /dev/null
+++ b/tools/testing/selftests/net/can/test_raw_filter.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+/*
+ * Copyright (c) 2011 Volkswagen Group Electronic Research
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <linux/if.h>
+
+#include <linux/can.h>
+#include <linux/can/raw.h>
+
+#include "../../kselftest_harness.h"
+
+#define ID 0x123
+
+char CANIF[IFNAMSIZ];
+
+static int send_can_frames(int sock, int testcase)
+{
+	struct can_frame frame;
+
+	frame.can_dlc = 1;
+	frame.data[0] = testcase;
+
+	frame.can_id = ID;
+	if (write(sock, &frame, sizeof(frame)) < 0)
+		goto write_err;
+
+	frame.can_id = (ID | CAN_RTR_FLAG);
+	if (write(sock, &frame, sizeof(frame)) < 0)
+		goto write_err;
+
+	frame.can_id = (ID | CAN_EFF_FLAG);
+	if (write(sock, &frame, sizeof(frame)) < 0)
+		goto write_err;
+
+	frame.can_id = (ID | CAN_EFF_FLAG | CAN_RTR_FLAG);
+	if (write(sock, &frame, sizeof(frame)) < 0)
+		goto write_err;
+
+	return 0;
+
+write_err:
+	perror("write");
+	return 1;
+}
+
+FIXTURE(can_filters) {
+	int sock;
+};
+
+FIXTURE_SETUP(can_filters)
+{
+	struct sockaddr_can addr;
+	struct ifreq ifr;
+	int recv_own_msgs = 1;
+	int s, ret;
+
+	s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+	ASSERT_GE(s, 0)
+		TH_LOG("failed to create CAN_RAW socket: %d", errno);
+
+	strncpy(ifr.ifr_name, CANIF, sizeof(ifr.ifr_name));
+	ret = ioctl(s, SIOCGIFINDEX, &ifr);
+	ASSERT_GE(ret, 0)
+		TH_LOG("failed SIOCGIFINDEX: %d", errno);
+
+	addr.can_family = AF_CAN;
+	addr.can_ifindex = ifr.ifr_ifindex;
+
+	setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
+		   &recv_own_msgs, sizeof(recv_own_msgs));
+
+	ret = bind(s, (struct sockaddr *)&addr, sizeof(addr));
+	ASSERT_EQ(ret, 0)
+		TH_LOG("failed bind socket: %d", errno);
+
+	self->sock = s;
+}
+
+FIXTURE_TEARDOWN(can_filters)
+{
+	close(self->sock);
+}
+
+FIXTURE_VARIANT(can_filters) {
+	int testcase;
+	canid_t id;
+	canid_t mask;
+	int exp_num_rx;
+	canid_t exp_flags[];
+};
+
+/* Receive all frames when filtering for the ID in standard frame format */
+FIXTURE_VARIANT_ADD(can_filters, base) {
+	.testcase = 1,
+	.id = ID,
+	.mask = CAN_SFF_MASK,
+	.exp_num_rx = 4,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Ignore EFF flag in filter ID if not covered by filter mask */
+FIXTURE_VARIANT_ADD(can_filters, base_eff) {
+	.testcase = 2,
+	.id = ID | CAN_EFF_FLAG,
+	.mask = CAN_SFF_MASK,
+	.exp_num_rx = 4,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Ignore RTR flag in filter ID if not covered by filter mask */
+FIXTURE_VARIANT_ADD(can_filters, base_rtr) {
+	.testcase = 3,
+	.id = ID | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK,
+	.exp_num_rx = 4,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Ignore EFF and RTR flags in filter ID if not covered by filter mask */
+FIXTURE_VARIANT_ADD(can_filters, base_effrtr) {
+	.testcase = 4,
+	.id = ID | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK,
+	.exp_num_rx = 4,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only SFF frames when expecting no EFF flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_eff) {
+	.testcase = 5,
+	.id = ID,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only EFF frames when filter id and filter mask include EFF flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_eff_eff) {
+	.testcase = 6,
+	.id = ID | CAN_EFF_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only SFF frames when expecting no EFF flag, ignoring RTR flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_eff_rtr) {
+	.testcase = 7,
+	.id = ID | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		0,
+		CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only EFF frames when filter id and filter mask include EFF flag,
+ * ignoring RTR flag
+ */
+FIXTURE_VARIANT_ADD(can_filters, filter_eff_effrtr) {
+	.testcase = 8,
+	.id = ID | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		CAN_EFF_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive no remote frames when filtering for no RTR flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_rtr) {
+	.testcase = 9,
+	.id = ID,
+	.mask = CAN_SFF_MASK | CAN_RTR_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		0,
+		CAN_EFF_FLAG,
+	},
+};
+
+/* Receive no remote frames when filtering for no RTR flag, ignoring EFF flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_rtr_eff) {
+	.testcase = 10,
+	.id = ID | CAN_EFF_FLAG,
+	.mask = CAN_SFF_MASK | CAN_RTR_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		0,
+		CAN_EFF_FLAG,
+	},
+};
+
+/* Receive only remote frames when filter includes RTR flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_rtr_rtr) {
+	.testcase = 11,
+	.id = ID | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_RTR_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only remote frames when filter includes RTR flag, ignoring EFF
+ * flag
+ */
+FIXTURE_VARIANT_ADD(can_filters, filter_rtr_effrtr) {
+	.testcase = 12,
+	.id = ID | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_RTR_FLAG,
+	.exp_num_rx = 2,
+	.exp_flags = {
+		CAN_RTR_FLAG,
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only SFF data frame when filtering for no flags */
+FIXTURE_VARIANT_ADD(can_filters, filter_effrtr) {
+	.testcase = 13,
+	.id = ID,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		0,
+	},
+};
+
+/* Receive only EFF data frame when filtering for EFF but no RTR flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_effrtr_eff) {
+	.testcase = 14,
+	.id = ID | CAN_EFF_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		CAN_EFF_FLAG,
+	},
+};
+
+/* Receive only SFF remote frame when filtering for RTR but no EFF flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_effrtr_rtr) {
+	.testcase = 15,
+	.id = ID | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only EFF remote frame when filtering for EFF and RTR flag */
+FIXTURE_VARIANT_ADD(can_filters, filter_effrtr_effrtr) {
+	.testcase = 16,
+	.id = ID | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.mask = CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		CAN_EFF_FLAG | CAN_RTR_FLAG,
+	},
+};
+
+/* Receive only SFF data frame when filtering for no EFF flag and no RTR flag
+ * but based on EFF mask
+ */
+FIXTURE_VARIANT_ADD(can_filters, eff) {
+	.testcase = 17,
+	.id = ID,
+	.mask = CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		0,
+	},
+};
+
+/* Receive only EFF data frame when filtering for EFF flag and no RTR flag but
+ * based on EFF mask
+ */
+FIXTURE_VARIANT_ADD(can_filters, eff_eff) {
+	.testcase = 18,
+	.id = ID | CAN_EFF_FLAG,
+	.mask = CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG,
+	.exp_num_rx = 1,
+	.exp_flags = {
+		CAN_EFF_FLAG,
+	},
+};
+
+/* This test verifies that the raw CAN filters work, by checking if only frames
+ * with the expected set of flags are received. For each test case, the given
+ * filter (id and mask) is added and four CAN frames are sent with every
+ * combination of set/unset EFF/RTR flags.
+ */
+TEST_F(can_filters, test_filter)
+{
+	struct can_filter rfilter;
+	int ret;
+
+	rfilter.can_id = variant->id;
+	rfilter.can_mask = variant->mask;
+	setsockopt(self->sock, SOL_CAN_RAW, CAN_RAW_FILTER,
+		   &rfilter, sizeof(rfilter));
+
+	TH_LOG("filters: can_id = 0x%08X can_mask = 0x%08X",
+		rfilter.can_id, rfilter.can_mask);
+
+	ret = send_can_frames(self->sock, variant->testcase);
+	ASSERT_EQ(ret, 0)
+		TH_LOG("failed to send CAN frames");
+
+	for (int i = 0; i <= variant->exp_num_rx; i++) {
+		struct can_frame frame;
+		struct timeval tv = {
+			.tv_sec = 0,
+			.tv_usec = 50000, /* 50ms timeout */
+		};
+		fd_set rdfs;
+
+		FD_ZERO(&rdfs);
+		FD_SET(self->sock, &rdfs);
+
+		ret = select(self->sock + 1, &rdfs, NULL, NULL, &tv);
+		ASSERT_GE(ret, 0)
+			TH_LOG("failed select for frame %d, err: %d)", i, errno);
+
+		ret = FD_ISSET(self->sock, &rdfs);
+		if (i == variant->exp_num_rx) {
+			ASSERT_EQ(ret, 0)
+				TH_LOG("too many frames received");
+		} else {
+			ASSERT_NE(ret, 0)
+				TH_LOG("too few frames received");
+
+			ret = read(self->sock, &frame, sizeof(frame));
+			ASSERT_GE(ret, 0)
+				TH_LOG("failed to read frame %d, err: %d", i, errno);
+
+			TH_LOG("rx: can_id = 0x%08X rx = %d", frame.can_id, i);
+
+			ASSERT_EQ(ID, frame.can_id & CAN_SFF_MASK)
+				TH_LOG("received wrong can_id");
+			ASSERT_EQ(variant->testcase, frame.data[0])
+				TH_LOG("received wrong test case");
+
+			ASSERT_EQ(frame.can_id & ~CAN_ERR_MASK,
+				  variant->exp_flags[i])
+				TH_LOG("received unexpected flags");
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	char *ifname = getenv("CANIF");
+
+	if (!ifname) {
+		printf("CANIF environment variable must contain the test interface\n");
+		return KSFT_FAIL;
+	}
+
+	strncpy(CANIF, ifname, sizeof(CANIF) - 1);
+
+	return test_harness_run(argc, argv);
+}
diff --git a/tools/testing/selftests/net/can/test_raw_filter.sh b/tools/testing/selftests/net/can/test_raw_filter.sh
new file mode 100755
index 000000000000..276d6c06ac95
--- /dev/null
+++ b/tools/testing/selftests/net/can/test_raw_filter.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+	test_raw_filter
+"
+
+net_dir=$(dirname $0)/..
+source $net_dir/lib.sh
+
+export CANIF=${CANIF:-"vcan0"}
+BITRATE=${BITRATE:-500000}
+
+setup()
+{
+	if [[ $CANIF == vcan* ]]; then
+		ip link add name $CANIF type vcan || exit $ksft_skip
+	else
+		ip link set dev $CANIF type can bitrate $BITRATE || exit $ksft_skip
+	fi
+	ip link set dev $CANIF up
+	pwd
+}
+
+cleanup()
+{
+	ip link set dev $CANIF down
+	if [[ $CANIF == vcan* ]]; then
+		ip link delete $CANIF
+	fi
+}
+
+test_raw_filter()
+{
+	./test_raw_filter
+	check_err $?
+	log_test "test_raw_filter"
+}
+
+trap cleanup EXIT
+setup
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/cmsg_ip.sh b/tools/testing/selftests/net/cmsg_ip.sh
new file mode 100755
index 000000000000..b55680e081ad
--- /dev/null
+++ b/tools/testing/selftests/net/cmsg_ip.sh
@@ -0,0 +1,187 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+IP4=172.16.0.1/24
+TGT4=172.16.0.2
+IP6=2001:db8:1::1/64
+TGT6=2001:db8:1::2
+TMPF=$(mktemp --suffix ".pcap")
+
+cleanup()
+{
+    rm -f $TMPF
+    cleanup_ns $NS
+}
+
+trap cleanup EXIT
+
+tcpdump -h | grep immediate-mode >> /dev/null
+if [ $? -ne 0 ]; then
+    echo "SKIP - tcpdump with --immediate-mode option required"
+    exit $ksft_skip
+fi
+
+# Namespaces
+setup_ns NS
+NSEXE="ip netns exec $NS"
+
+$NSEXE sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null
+
+# Connectivity
+ip -netns $NS link add type dummy
+ip -netns $NS link set dev dummy0 up
+ip -netns $NS addr add $IP4 dev dummy0
+ip -netns $NS addr add $IP6 dev dummy0
+
+# Test
+BAD=0
+TOTAL=0
+
+check_result() {
+    ((TOTAL++))
+    if [ $1 -ne $2 ]; then
+	echo "  Case $3 returned $1, expected $2"
+	((BAD++))
+    fi
+}
+
+# IPV6_DONTFRAG
+for ovr in setsock cmsg both diff; do
+    for df in 0 1; do
+	for p in u U i r; do
+	    [ $p == "u" ] && prot=UDP
+	    [ $p == "U" ] && prot=UDP
+	    [ $p == "i" ] && prot=ICMP
+	    [ $p == "r" ] && prot=RAW
+
+	    [ $ovr == "setsock" ] && m="-F $df"
+	    [ $ovr == "cmsg" ]    && m="-f $df"
+	    [ $ovr == "both" ]    && m="-F $df -f $df"
+	    [ $ovr == "diff" ]    && m="-F $((1 - df)) -f $df"
+
+	    $NSEXE ./cmsg_sender -s -S 2000 -6 -p $p $m $TGT6 1234
+	    check_result $? $df "DONTFRAG $prot $ovr"
+	done
+    done
+done
+
+# IP_TOS + IPV6_TCLASS
+
+test_dscp() {
+    local -r IPVER=$1
+    local -r TGT=$2
+    local -r MATCH=$3
+
+    local -r TOS=0x10
+    local -r TOS2=0x20
+    local -r ECN=0x3
+
+    ip $IPVER -netns $NS rule add tos $TOS lookup 300
+    ip $IPVER -netns $NS route add table 300 prohibit any
+
+    for ovr in setsock cmsg both diff; do
+	for p in u U i r; do
+	    [ $p == "u" ] && prot=UDP
+	    [ $p == "U" ] && prot=UDP
+	    [ $p == "i" ] && prot=ICMP
+	    [ $p == "r" ] && prot=RAW
+
+	    [ $ovr == "setsock" ] && m="-C"
+	    [ $ovr == "cmsg" ]    && m="-c"
+	    [ $ovr == "both" ]    && m="-C $((TOS2)) -c"
+	    [ $ovr == "diff" ]    && m="-C $((TOS )) -c"
+
+	    $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null &
+	    BG=$!
+	    sleep 0.05
+
+	    $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234
+	    check_result $? 0 "$MATCH $prot $ovr - pass"
+
+	    while [ -d /proc/$BG ]; do
+	        $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234
+	    done
+
+	    tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $TOS2" >> /dev/null
+	    check_result $? 0 "$MATCH $prot $ovr - packet data"
+	    rm $TMPF
+
+	    [ $ovr == "both" ]    && m="-C $((TOS )) -c"
+	    [ $ovr == "diff" ]    && m="-C $((TOS2)) -c"
+
+	    # Match prohibit rule: expect failure
+	    $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS)) -s $TGT 1234
+	    check_result $? 1 "$MATCH $prot $ovr - rejection"
+
+	    # Match prohibit rule: IPv4 masks ECN: expect failure
+	    if [[ "$IPVER" == "-4" ]]; then
+		$NSEXE ./cmsg_sender $IPVER -p $p $m "$((TOS | ECN))" -s $TGT 1234
+		check_result $? 1 "$MATCH $prot $ovr - rejection (ECN)"
+	    fi
+	done
+    done
+}
+
+test_dscp -4 $TGT4 tos
+test_dscp -6 $TGT6 class
+
+# IP_TTL + IPV6_HOPLIMIT
+test_ttl_hoplimit() {
+    local -r IPVER=$1
+    local -r TGT=$2
+    local -r MATCH=$3
+
+    local -r LIM=4
+
+    for ovr in setsock cmsg both diff; do
+	for p in u U i r; do
+	    [ $p == "u" ] && prot=UDP
+	    [ $p == "U" ] && prot=UDP
+	    [ $p == "i" ] && prot=ICMP
+	    [ $p == "r" ] && prot=RAW
+
+	    [ $ovr == "setsock" ] && m="-L"
+	    [ $ovr == "cmsg" ]    && m="-l"
+	    [ $ovr == "both" ]    && m="-L $LIM -l"
+	    [ $ovr == "diff" ]    && m="-L $((LIM + 1)) -l"
+
+	    $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null &
+	    BG=$!
+	    sleep 0.05
+
+	    $NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234
+	    check_result $? 0 "$MATCH $prot $ovr - pass"
+
+	    while [ -d /proc/$BG ]; do
+		$NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234
+	    done
+
+	    tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $LIM[^0-9]" >> /dev/null
+	    check_result $? 0 "$MATCH $prot $ovr - packet data"
+	    rm $TMPF
+	done
+    done
+}
+
+test_ttl_hoplimit -4 $TGT4 ttl
+test_ttl_hoplimit -6 $TGT6 hlim
+
+# IPV6 exthdr
+for p in u U i r; do
+    # Very basic "does it crash" test
+    for h in h d r; do
+	$NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234
+	check_result $? 0 "ExtHdr $prot $ovr - pass"
+    done
+done
+
+# Summary
+if [ $BAD -ne 0 ]; then
+    echo "FAIL - $BAD/$TOTAL cases failed"
+    exit 1
+else
+    echo "OK"
+    exit 0
+fi
diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh
deleted file mode 100755
index 8bc23fb4c82b..000000000000
--- a/tools/testing/selftests/net/cmsg_ipv6.sh
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-source lib.sh
-
-IP6=2001:db8:1::1/64
-TGT6=2001:db8:1::2
-TMPF=$(mktemp --suffix ".pcap")
-
-cleanup()
-{
-    rm -f $TMPF
-    cleanup_ns $NS
-}
-
-trap cleanup EXIT
-
-tcpdump -h | grep immediate-mode >> /dev/null
-if [ $? -ne 0 ]; then
-    echo "SKIP - tcpdump with --immediate-mode option required"
-    exit $ksft_skip
-fi
-
-# Namespaces
-setup_ns NS
-NSEXE="ip netns exec $NS"
-
-$NSEXE sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null
-
-# Connectivity
-ip -netns $NS link add type dummy
-ip -netns $NS link set dev dummy0 up
-ip -netns $NS addr add $IP6 dev dummy0
-
-# Test
-BAD=0
-TOTAL=0
-
-check_result() {
-    ((TOTAL++))
-    if [ $1 -ne $2 ]; then
-	echo "  Case $3 returned $1, expected $2"
-	((BAD++))
-    fi
-}
-
-# IPV6_DONTFRAG
-for ovr in setsock cmsg both diff; do
-    for df in 0 1; do
-	for p in u i r; do
-	    [ $p == "u" ] && prot=UDP
-	    [ $p == "i" ] && prot=ICMP
-	    [ $p == "r" ] && prot=RAW
-
-	    [ $ovr == "setsock" ] && m="-F $df"
-	    [ $ovr == "cmsg" ]    && m="-f $df"
-	    [ $ovr == "both" ]    && m="-F $df -f $df"
-	    [ $ovr == "diff" ]    && m="-F $((1 - df)) -f $df"
-
-	    $NSEXE ./cmsg_sender -s -S 2000 -6 -p $p $m $TGT6 1234
-	    check_result $? $df "DONTFRAG $prot $ovr"
-	done
-    done
-done
-
-# IPV6_TCLASS
-TOS=0x10
-TOS2=0x20
-
-ip -6 -netns $NS rule add tos $TOS lookup 300
-ip -6 -netns $NS route add table 300 prohibit any
-
-for ovr in setsock cmsg both diff; do
-    for p in u i r; do
-	[ $p == "u" ] && prot=UDP
-	[ $p == "i" ] && prot=ICMP
-	[ $p == "r" ] && prot=RAW
-
-	[ $ovr == "setsock" ] && m="-C"
-	[ $ovr == "cmsg" ]    && m="-c"
-	[ $ovr == "both" ]    && m="-C $((TOS2)) -c"
-	[ $ovr == "diff" ]    && m="-C $((TOS )) -c"
-
-	$NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null &
-	BG=$!
-	sleep 0.05
-
-	$NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234
-	check_result $? 0 "TCLASS $prot $ovr - pass"
-
-	while [ -d /proc/$BG ]; do
-	    $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234
-	done
-
-	tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null
-	check_result $? 0 "TCLASS $prot $ovr - packet data"
-	rm $TMPF
-
-	[ $ovr == "both" ]    && m="-C $((TOS )) -c"
-	[ $ovr == "diff" ]    && m="-C $((TOS2)) -c"
-
-	$NSEXE ./cmsg_sender -6 -p $p $m $((TOS)) -s $TGT6 1234
-	check_result $? 1 "TCLASS $prot $ovr - rejection"
-    done
-done
-
-# IPV6_HOPLIMIT
-LIM=4
-
-for ovr in setsock cmsg both diff; do
-    for p in u i r; do
-	[ $p == "u" ] && prot=UDP
-	[ $p == "i" ] && prot=ICMP
-	[ $p == "r" ] && prot=RAW
-
-	[ $ovr == "setsock" ] && m="-L"
-	[ $ovr == "cmsg" ]    && m="-l"
-	[ $ovr == "both" ]    && m="-L $LIM -l"
-	[ $ovr == "diff" ]    && m="-L $((LIM + 1)) -l"
-
-	$NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null &
-	BG=$!
-	sleep 0.05
-
-	$NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234
-	check_result $? 0 "HOPLIMIT $prot $ovr - pass"
-
-	while [ -d /proc/$BG ]; do
-	    $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234
-	done
-
-	tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null
-	check_result $? 0 "HOPLIMIT $prot $ovr - packet data"
-	rm $TMPF
-    done
-done
-
-# IPV6 exthdr
-for p in u i r; do
-    # Very basic "does it crash" test
-    for h in h d r; do
-	$NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234
-	check_result $? 0 "ExtHdr $prot $ovr - pass"
-    done
-done
-
-# Summary
-if [ $BAD -ne 0 ]; then
-    echo "FAIL - $BAD/$TOTAL cases failed"
-    exit 1
-else
-    echo "OK"
-    exit 0
-fi
diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c
index c79e65581dc3..a825e628aee7 100644
--- a/tools/testing/selftests/net/cmsg_sender.c
+++ b/tools/testing/selftests/net/cmsg_sender.c
@@ -33,6 +33,7 @@ enum {
 	ERN_RECVERR,
 	ERN_CMSG_RD,
 	ERN_CMSG_RCV,
+	ERN_SEND_MORE,
 };
 
 struct option_cmsg_u32 {
@@ -46,6 +47,7 @@ struct options {
 	const char *service;
 	unsigned int size;
 	unsigned int num_pkt;
+	bool msg_more;
 	struct {
 		unsigned int mark;
 		unsigned int dontfrag;
@@ -59,6 +61,7 @@ struct options {
 		unsigned int proto;
 	} sock;
 	struct option_cmsg_u32 mark;
+	struct option_cmsg_u32 priority;
 	struct {
 		bool ena;
 		unsigned int delay;
@@ -71,7 +74,7 @@ struct options {
 		struct option_cmsg_u32 tclass;
 		struct option_cmsg_u32 hlimit;
 		struct option_cmsg_u32 exthdr;
-	} v6;
+	} cmsg;
 } opt = {
 	.size = 13,
 	.num_pkt = 1,
@@ -93,21 +96,24 @@ static void __attribute__((noreturn)) cs_usage(const char *bin)
 	       "\t\t-S      send() size\n"
 	       "\t\t-4/-6   Force IPv4 / IPv6 only\n"
 	       "\t\t-p prot Socket protocol\n"
-	       "\t\t        (u = UDP (default); i = ICMP; r = RAW)\n"
+	       "\t\t        (u = UDP (default); i = ICMP; r = RAW;\n"
+	       "\t\t         U = UDP with MSG_MORE)\n"
 	       "\n"
 	       "\t\t-m val  Set SO_MARK with given value\n"
 	       "\t\t-M val  Set SO_MARK via setsockopt\n"
+	       "\t\t-P val  Set SO_PRIORITY via setsockopt\n"
+	       "\t\t-Q val  Set SO_PRIORITY via cmsg\n"
 	       "\t\t-d val  Set SO_TXTIME with given delay (usec)\n"
 	       "\t\t-t      Enable time stamp reporting\n"
 	       "\t\t-f val  Set don't fragment via cmsg\n"
 	       "\t\t-F val  Set don't fragment via setsockopt\n"
-	       "\t\t-c val  Set TCLASS via cmsg\n"
-	       "\t\t-C val  Set TCLASS via setsockopt\n"
-	       "\t\t-l val  Set HOPLIMIT via cmsg\n"
-	       "\t\t-L val  Set HOPLIMIT via setsockopt\n"
+	       "\t\t-c val  Set TOS/TCLASS via cmsg\n"
+	       "\t\t-C val  Set TOS/TCLASS via setsockopt\n"
+	       "\t\t-l val  Set TTL/HOPLIMIT via cmsg\n"
+	       "\t\t-L val  Set TTL/HOPLIMIT via setsockopt\n"
 	       "\t\t-H type Add an IPv6 header option\n"
-	       "\t\t        (h = HOP; d = DST; r = RTDST)"
-	       "");
+	       "\t\t        (h = HOP; d = DST; r = RTDST)\n"
+	       "\n");
 	exit(ERN_HELP);
 }
 
@@ -115,7 +121,7 @@ static void cs_parse_args(int argc, char *argv[])
 {
 	int o;
 
-	while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:")) != -1) {
+	while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:Q:")) != -1) {
 		switch (o) {
 		case 's':
 			opt.silent_send = true;
@@ -130,8 +136,11 @@ static void cs_parse_args(int argc, char *argv[])
 			opt.sock.family = AF_INET6;
 			break;
 		case 'p':
-			if (*optarg == 'u' || *optarg == 'U') {
+			if (*optarg == 'u') {
 				opt.sock.proto = IPPROTO_UDP;
+			} else if (*optarg == 'U') {
+				opt.sock.proto = IPPROTO_UDP;
+				opt.msg_more = true;
 			} else if (*optarg == 'i' || *optarg == 'I') {
 				opt.sock.proto = IPPROTO_ICMP;
 			} else if (*optarg == 'r') {
@@ -148,6 +157,10 @@ static void cs_parse_args(int argc, char *argv[])
 			opt.mark.ena = true;
 			opt.mark.val = atoi(optarg);
 			break;
+		case 'Q':
+			opt.priority.ena = true;
+			opt.priority.val = atoi(optarg);
+			break;
 		case 'M':
 			opt.sockopt.mark = atoi(optarg);
 			break;
@@ -162,37 +175,37 @@ static void cs_parse_args(int argc, char *argv[])
 			opt.ts.ena = true;
 			break;
 		case 'f':
-			opt.v6.dontfrag.ena = true;
-			opt.v6.dontfrag.val = atoi(optarg);
+			opt.cmsg.dontfrag.ena = true;
+			opt.cmsg.dontfrag.val = atoi(optarg);
 			break;
 		case 'F':
 			opt.sockopt.dontfrag = atoi(optarg);
 			break;
 		case 'c':
-			opt.v6.tclass.ena = true;
-			opt.v6.tclass.val = atoi(optarg);
+			opt.cmsg.tclass.ena = true;
+			opt.cmsg.tclass.val = atoi(optarg);
 			break;
 		case 'C':
 			opt.sockopt.tclass = atoi(optarg);
 			break;
 		case 'l':
-			opt.v6.hlimit.ena = true;
-			opt.v6.hlimit.val = atoi(optarg);
+			opt.cmsg.hlimit.ena = true;
+			opt.cmsg.hlimit.val = atoi(optarg);
 			break;
 		case 'L':
 			opt.sockopt.hlimit = atoi(optarg);
 			break;
 		case 'H':
-			opt.v6.exthdr.ena = true;
+			opt.cmsg.exthdr.ena = true;
 			switch (optarg[0]) {
 			case 'h':
-				opt.v6.exthdr.val = IPV6_HOPOPTS;
+				opt.cmsg.exthdr.val = IPV6_HOPOPTS;
 				break;
 			case 'd':
-				opt.v6.exthdr.val = IPV6_DSTOPTS;
+				opt.cmsg.exthdr.val = IPV6_DSTOPTS;
 				break;
 			case 'r':
-				opt.v6.exthdr.val = IPV6_RTHDRDSTOPTS;
+				opt.cmsg.exthdr.val = IPV6_RTHDRDSTOPTS;
 				break;
 			default:
 				printf("Error: hdr type: %s\n", optarg);
@@ -253,22 +266,25 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 	ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
 			  SOL_SOCKET, SO_MARK, &opt.mark);
 	ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
-			  SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag);
-	ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
-			  SOL_IPV6, IPV6_TCLASS, &opt.v6.tclass);
-	ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
-			  SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit);
+			  SOL_SOCKET, SO_PRIORITY, &opt.priority);
+
+	if (opt.sock.family == AF_INET) {
+		ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
+				  SOL_IP, IP_TOS, &opt.cmsg.tclass);
+		ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
+				  SOL_IP, IP_TTL, &opt.cmsg.hlimit);
+	} else {
+		ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
+				  SOL_IPV6, IPV6_DONTFRAG, &opt.cmsg.dontfrag);
+		ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
+				  SOL_IPV6, IPV6_TCLASS, &opt.cmsg.tclass);
+		ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len,
+				  SOL_IPV6, IPV6_HOPLIMIT, &opt.cmsg.hlimit);
+	}
 
 	if (opt.txtime.ena) {
-		struct sock_txtime so_txtime = {
-			.clockid = CLOCK_MONOTONIC,
-		};
 		__u64 txtime;
 
-		if (setsockopt(fd, SOL_SOCKET, SO_TXTIME,
-			       &so_txtime, sizeof(so_txtime)))
-			error(ERN_SOCKOPT, errno, "setsockopt TXTIME");
-
 		txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) +
 			 time_start_mono.tv_nsec +
 			 opt.txtime.delay * 1000;
@@ -284,13 +300,6 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 		memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime));
 	}
 	if (opt.ts.ena) {
-		__u32 val = SOF_TIMESTAMPING_SOFTWARE |
-			    SOF_TIMESTAMPING_OPT_TSONLY;
-
-		if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
-			       &val, sizeof(val)))
-			error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING");
-
 		cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
 		cmsg_len += CMSG_SPACE(sizeof(__u32));
 		if (cbuf_sz < cmsg_len)
@@ -302,14 +311,14 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 		*(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SCHED |
 					    SOF_TIMESTAMPING_TX_SOFTWARE;
 	}
-	if (opt.v6.exthdr.ena) {
+	if (opt.cmsg.exthdr.ena) {
 		cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
 		cmsg_len += CMSG_SPACE(8);
 		if (cbuf_sz < cmsg_len)
 			error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small");
 
 		cmsg->cmsg_level = SOL_IPV6;
-		cmsg->cmsg_type = opt.v6.exthdr.val;
+		cmsg->cmsg_type = opt.cmsg.exthdr.val;
 		cmsg->cmsg_len = CMSG_LEN(8);
 		*(__u64 *)CMSG_DATA(cmsg) = 0;
 	}
@@ -333,16 +342,17 @@ static const char *cs_ts_info2str(unsigned int info)
 	return "unknown";
 }
 
-static void
+static unsigned long
 cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 {
 	struct sock_extended_err *see;
 	struct scm_timestamping *ts;
+	unsigned long ts_seen = 0;
 	struct cmsghdr *cmsg;
 	int i, err;
 
 	if (!opt.ts.ena)
-		return;
+		return 0;
 	msg->msg_control = cbuf;
 	msg->msg_controllen = cbuf_sz;
 
@@ -396,8 +406,11 @@ cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 			printf(" %5s ts%d %lluus\n",
 			       cs_ts_info2str(see->ee_info),
 			       i, rel_time);
+			ts_seen |= 1 << see->ee_info;
 		}
 	}
+
+	return ts_seen;
 }
 
 static void ca_set_sockopts(int fd)
@@ -406,22 +419,52 @@ static void ca_set_sockopts(int fd)
 	    setsockopt(fd, SOL_SOCKET, SO_MARK,
 		       &opt.sockopt.mark, sizeof(opt.sockopt.mark)))
 		error(ERN_SOCKOPT, errno, "setsockopt SO_MARK");
-	if (opt.sockopt.dontfrag &&
-	    setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG,
-		       &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag)))
-		error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG");
-	if (opt.sockopt.tclass &&
-	    setsockopt(fd, SOL_IPV6, IPV6_TCLASS,
-		       &opt.sockopt.tclass, sizeof(opt.sockopt.tclass)))
-		error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS");
-	if (opt.sockopt.hlimit &&
-	    setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS,
-		       &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit)))
-		error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT");
 	if (opt.sockopt.priority &&
 	    setsockopt(fd, SOL_SOCKET, SO_PRIORITY,
 		       &opt.sockopt.priority, sizeof(opt.sockopt.priority)))
 		error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY");
+
+	if (opt.sock.family == AF_INET) {
+		if (opt.sockopt.tclass &&
+		    setsockopt(fd, SOL_IP, IP_TOS,
+			       &opt.sockopt.tclass, sizeof(opt.sockopt.tclass)))
+			error(ERN_SOCKOPT, errno, "setsockopt IP_TOS");
+		if (opt.sockopt.hlimit &&
+		    setsockopt(fd, SOL_IP, IP_TTL,
+			       &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit)))
+			error(ERN_SOCKOPT, errno, "setsockopt IP_TTL");
+	} else {
+		if (opt.sockopt.dontfrag &&
+		    setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG,
+			       &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag)))
+			error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG");
+		if (opt.sockopt.tclass &&
+		    setsockopt(fd, SOL_IPV6, IPV6_TCLASS,
+			       &opt.sockopt.tclass, sizeof(opt.sockopt.tclass)))
+			error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS");
+		if (opt.sockopt.hlimit &&
+		    setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS,
+			       &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit)))
+			error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT");
+	}
+
+	if (opt.txtime.ena) {
+		struct sock_txtime so_txtime = {
+			.clockid = CLOCK_MONOTONIC,
+		};
+
+		if (setsockopt(fd, SOL_SOCKET, SO_TXTIME,
+			       &so_txtime, sizeof(so_txtime)))
+			error(ERN_SOCKOPT, errno, "setsockopt TXTIME");
+	}
+	if (opt.ts.ena) {
+		__u32 val = SOF_TIMESTAMPING_SOFTWARE |
+			SOF_TIMESTAMPING_OPT_TSONLY;
+
+		if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+			       &val, sizeof(val)))
+			error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING");
+	}
 }
 
 int main(int argc, char *argv[])
@@ -494,7 +537,7 @@ int main(int argc, char *argv[])
 	cs_write_cmsg(fd, &msg, cbuf, sizeof(cbuf));
 
 	for (i = 0; i < opt.num_pkt; i++) {
-		err = sendmsg(fd, &msg, 0);
+		err = sendmsg(fd, &msg, opt.msg_more ? MSG_MORE : 0);
 		if (err < 0) {
 			if (!opt.silent_send)
 				fprintf(stderr, "send failed: %s\n", strerror(errno));
@@ -505,14 +548,28 @@ int main(int argc, char *argv[])
 			err = ERN_SEND_SHORT;
 			goto err_out;
 		}
+		if (opt.msg_more) {
+			err = write(fd, NULL, 0);
+			if (err < 0) {
+				fprintf(stderr, "send more: %s\n", strerror(errno));
+				err = ERN_SEND_MORE;
+				goto err_out;
+			}
+		}
 	}
 	err = ERN_SUCCESS;
 
 	if (opt.ts.ena) {
-		/* Make sure all timestamps have time to loop back */
-		usleep(opt.txtime.delay);
+		unsigned long seen;
+		int i;
 
-		cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+		/* Make sure all timestamps have time to loop back */
+		for (i = 0; i < 40; i++) {
+			seen = cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+			if (seen & (1 << SCM_TSTAMP_SND))
+				break;
+			usleep(opt.txtime.delay / 20);
+		}
 	}
 
 err_out:
diff --git a/tools/testing/selftests/net/cmsg_so_priority.sh b/tools/testing/selftests/net/cmsg_so_priority.sh
new file mode 100755
index 000000000000..ee07d8653262
--- /dev/null
+++ b/tools/testing/selftests/net/cmsg_so_priority.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+readonly KSFT_SKIP=4
+
+IP4=192.0.2.1/24
+TGT4=192.0.2.2
+TGT4_RAW=192.0.2.3
+IP6=2001:db8::1/64
+TGT6=2001:db8::2
+TGT6_RAW=2001:db8::3
+PORT=1234
+TOTAL_TESTS=0
+FAILED_TESTS=0
+
+if ! command -v jq &> /dev/null; then
+    echo "SKIP cmsg_so_priroity.sh test: jq is not installed." >&2
+    exit "$KSFT_SKIP"
+fi
+
+check_result() {
+    ((TOTAL_TESTS++))
+    if [ "$1" -ne 0 ]; then
+        ((FAILED_TESTS++))
+    fi
+}
+
+cleanup()
+{
+    cleanup_ns $NS
+}
+
+trap cleanup EXIT
+
+setup_ns NS
+
+create_filter() {
+    local handle=$1
+    local vlan_prio=$2
+    local ip_type=$3
+    local proto=$4
+    local dst_ip=$5
+    local ip_proto
+
+    if [[ "$proto" == "u" ]]; then
+        ip_proto="udp"
+    elif [[ "$ip_type" == "ipv4" && "$proto" == "i" ]]; then
+        ip_proto="icmp"
+    elif [[ "$ip_type" == "ipv6" && "$proto" == "i" ]]; then
+        ip_proto="icmpv6"
+    fi
+
+    tc -n $NS filter add dev dummy1 \
+        egress pref 1 handle "$handle" proto 802.1q \
+        flower vlan_prio "$vlan_prio" vlan_ethtype "$ip_type" \
+        dst_ip "$dst_ip" ${ip_proto:+ip_proto $ip_proto} \
+        action pass
+}
+
+ip -n $NS link set dev lo up
+ip -n $NS link add name dummy1 up type dummy
+
+ip -n $NS link add link dummy1 name dummy1.10 up type vlan id 10 \
+    egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
+
+ip -n $NS address add $IP4 dev dummy1.10
+ip -n $NS address add $IP6 dev dummy1.10 nodad
+
+ip netns exec $NS sysctl -wq net.ipv4.ping_group_range='0 2147483647'
+
+ip -n $NS neigh add $TGT4 lladdr 00:11:22:33:44:55 nud permanent \
+    dev dummy1.10
+ip -n $NS neigh add $TGT6 lladdr 00:11:22:33:44:55 nud permanent \
+    dev dummy1.10
+ip -n $NS neigh add $TGT4_RAW lladdr 00:11:22:33:44:66 nud permanent \
+    dev dummy1.10
+ip -n $NS neigh add $TGT6_RAW lladdr 00:11:22:33:44:66 nud permanent \
+    dev dummy1.10
+
+tc -n $NS qdisc add dev dummy1 clsact
+
+FILTER_COUNTER=10
+
+for i in 4 6; do
+    for proto in u i r; do
+        echo "Test IPV$i, prot: $proto"
+        for priority in {0..7}; do
+            if [[ $i == 4 && $proto == "r" ]]; then
+                TGT=$TGT4_RAW
+            elif [[ $i == 6 && $proto == "r" ]]; then
+                TGT=$TGT6_RAW
+            elif [ $i == 4 ]; then
+                TGT=$TGT4
+            else
+                TGT=$TGT6
+            fi
+
+            handle="${FILTER_COUNTER}${priority}"
+
+            create_filter $handle $priority ipv$i $proto $TGT
+
+            pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \
+                | jq ".[] | select(.options.handle == ${handle}) | \
+                .options.actions[0].stats.packets")
+
+            if [[ $pkts == 0 ]]; then
+                check_result 0
+            else
+                echo "prio $priority: expected 0, got $pkts"
+                check_result 1
+            fi
+
+            ip netns exec $NS ./cmsg_sender -$i -Q $priority \
+	            -p $proto $TGT $PORT
+
+            pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \
+                | jq ".[] | select(.options.handle == ${handle}) | \
+                .options.actions[0].stats.packets")
+            if [[ $pkts == 1 ]]; then
+                check_result 0
+            else
+                echo "prio $priority -Q: expected 1, got $pkts"
+                check_result 1
+            fi
+
+            ip netns exec $NS ./cmsg_sender -$i -P $priority \
+	            -p $proto $TGT $PORT
+
+            pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \
+                | jq ".[] | select(.options.handle == ${handle}) | \
+                .options.actions[0].stats.packets")
+            if [[ $pkts == 2 ]]; then
+                check_result 0
+            else
+                echo "prio $priority -P: expected 2, got $pkts"
+                check_result 1
+            fi
+        done
+        FILTER_COUNTER=$((FILTER_COUNTER + 10))
+    done
+done
+
+if [ $FAILED_TESTS -ne 0 ]; then
+    echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed"
+    exit 1
+else
+    echo "OK - All $TOTAL_TESTS tests passed"
+    exit 0
+fi
diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh
index af85267ad1e3..478af0aefa97 100755
--- a/tools/testing/selftests/net/cmsg_time.sh
+++ b/tools/testing/selftests/net/cmsg_time.sh
@@ -34,13 +34,28 @@ BAD=0
 TOTAL=0
 
 check_result() {
+    local ret=$1
+    local got=$2
+    local exp=$3
+    local case=$4
+    local xfail=$5
+    local xf=
+    local inc=
+
+    if [ "$xfail" == "xfail" ]; then
+	xf="(XFAIL)"
+	inc=0
+    else
+	inc=1
+    fi
+
     ((TOTAL++))
-    if [ $1 -ne 0 ]; then
-	echo "  Case $4 returned $1, expected 0"
-	((BAD++))
+    if [ $ret -ne 0 ]; then
+	echo "  Case $case returned $ret, expected 0 $xf"
+	((BAD+=inc))
     elif [ "$2" != "$3" ]; then
-	echo "  Case $4 returned '$2', expected '$3'"
-	((BAD++))
+	echo "  Case $case returned '$got', expected '$exp' $xf"
+	((BAD+=inc))
     fi
 }
 
@@ -66,11 +81,14 @@ for i in "-4 $TGT4" "-6 $TGT6"; do
 		 awk '/SND/ { if ($3 > 1000) print "OK"; }')
 	check_result $? "$ts" "OK" "$prot - TXTIME abs"
 
+	[ "$KSFT_MACHINE_SLOW" = yes ] && xfail=xfail
+
 	ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 |
 		 awk '/SND/ {snd=$3}
 		      /SCHED/ {sch=$3}
-		      END { if (snd - sch > 500) print "OK"; }')
-	check_result $? "$ts" "OK" "$prot - TXTIME rel"
+		      END { if (snd - sch > 500) print "OK";
+			    else print snd, "-", sch, "<", 500; }')
+	check_result $? "$ts" "OK" "$prot - TXTIME rel" $xfail
     done
 done
 
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 5e4390cac17e..3cfef5153823 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -18,6 +18,8 @@ CONFIG_DUMMY=y
 CONFIG_BRIDGE_VLAN_FILTERING=y
 CONFIG_BRIDGE=y
 CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_DEBUG_INFO_BTF=y
+CONFIG_DEBUG_INFO_BTF_MODULES=n
 CONFIG_VLAN_8021Q=y
 CONFIG_GENEVE=m
 CONFIG_IFB=y
@@ -26,12 +28,11 @@ CONFIG_INET_ESP=y
 CONFIG_INET_ESP_OFFLOAD=y
 CONFIG_NET_FOU=y
 CONFIG_NET_FOU_IP_TUNNELS=y
-CONFIG_IP_GRE=m
 CONFIG_NETFILTER=y
 CONFIG_NETFILTER_ADVANCED=y
 CONFIG_NF_CONNTRACK=m
+CONFIG_IPV6_MROUTE=y
 CONFIG_IPV6_SIT=y
-CONFIG_IP_DCCP=m
 CONFIG_NF_NAT=m
 CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP_NF_IPTABLES=m
@@ -74,7 +75,12 @@ CONFIG_NET_SCH_ETF=m
 CONFIG_NET_SCH_NETEM=y
 CONFIG_NET_SCH_PRIO=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NF_CONNTRACK_OVS=y
 CONFIG_NF_FLOW_TABLE=m
+CONFIG_OPENVSWITCH=m
+CONFIG_OPENVSWITCH_GENEVE=m
+CONFIG_OPENVSWITCH_GRE=m
+CONFIG_OPENVSWITCH_VXLAN=m
 CONFIG_PSAMPLE=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_TEST_BLACKHOLE_DEV=m
@@ -100,3 +106,13 @@ CONFIG_NETFILTER_XT_MATCH_POLICY=m
 CONFIG_CRYPTO_ARIA=y
 CONFIG_XFRM_INTERFACE=m
 CONFIG_XFRM_USER=m
+CONFIG_IP_NF_MATCH_RPFILTER=m
+CONFIG_IP6_NF_MATCH_RPFILTER=m
+CONFIG_IPVLAN=m
+CONFIG_CAN=m
+CONFIG_CAN_DEV=m
+CONFIG_CAN_VXCAN=m
+CONFIG_NETKIT=y
+CONFIG_NET_PKTGEN=m
+CONFIG_IPV6_ILA=m
+CONFIG_IPV6_RPL_LWTUNNEL=y
diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py
deleted file mode 100755
index 2d84c7a0be6b..000000000000
--- a/tools/testing/selftests/net/devlink_port_split.py
+++ /dev/null
@@ -1,309 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-
-from subprocess import PIPE, Popen
-import json
-import time
-import argparse
-import collections
-import sys
-
-#
-# Test port split configuration using devlink-port lanes attribute.
-# The test is skipped in case the attribute is not available.
-#
-# First, check that all the ports with 1 lane fail to split.
-# Second, check that all the ports with more than 1 lane can be split
-# to all valid configurations (e.g., split to 2, split to 4 etc.)
-#
-
-
-# Kselftest framework requirement - SKIP code is 4
-KSFT_SKIP=4
-Port = collections.namedtuple('Port', 'bus_info name')
-
-
-def run_command(cmd, should_fail=False):
-    """
-    Run a command in subprocess.
-    Return: Tuple of (stdout, stderr).
-    """
-
-    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
-    stdout, stderr = p.communicate()
-    stdout, stderr = stdout.decode(), stderr.decode()
-
-    if stderr != "" and not should_fail:
-        print("Error sending command: %s" % cmd)
-        print(stdout)
-        print(stderr)
-    return stdout, stderr
-
-
-class devlink_ports(object):
-    """
-    Class that holds information on the devlink ports, required to the tests;
-    if_names: A list of interfaces in the devlink ports.
-    """
-
-    def get_if_names(dev):
-        """
-        Get a list of physical devlink ports.
-        Return: Array of tuples (bus_info/port, if_name).
-        """
-
-        arr = []
-
-        cmd = "devlink -j port show"
-        stdout, stderr = run_command(cmd)
-        assert stderr == ""
-        ports = json.loads(stdout)['port']
-
-        validate_devlink_output(ports, 'flavour')
-
-        for port in ports:
-            if dev in port:
-                if ports[port]['flavour'] == 'physical':
-                    arr.append(Port(bus_info=port, name=ports[port]['netdev']))
-
-        return arr
-
-    def __init__(self, dev):
-        self.if_names = devlink_ports.get_if_names(dev)
-
-
-def get_max_lanes(port):
-    """
-    Get the $port's maximum number of lanes.
-    Return: number of lanes, e.g. 1, 2, 4 and 8.
-    """
-
-    cmd = "devlink -j port show %s" % port
-    stdout, stderr = run_command(cmd)
-    assert stderr == ""
-    values = list(json.loads(stdout)['port'].values())[0]
-
-    if 'lanes' in values:
-        lanes = values['lanes']
-    else:
-        lanes = 0
-    return lanes
-
-
-def get_split_ability(port):
-    """
-    Get the $port split ability.
-    Return: split ability, true or false.
-    """
-
-    cmd = "devlink -j port show %s" % port.name
-    stdout, stderr = run_command(cmd)
-    assert stderr == ""
-    values = list(json.loads(stdout)['port'].values())[0]
-
-    return values['splittable']
-
-
-def split(k, port, should_fail=False):
-    """
-    Split $port into $k ports.
-    If should_fail == True, the split should fail. Otherwise, should pass.
-    Return: Array of sub ports after splitting.
-            If the $port wasn't split, the array will be empty.
-    """
-
-    cmd = "devlink port split %s count %s" % (port.bus_info, k)
-    stdout, stderr = run_command(cmd, should_fail=should_fail)
-
-    if should_fail:
-        if not test(stderr != "", "%s is unsplittable" % port.name):
-            print("split an unsplittable port %s" % port.name)
-            return create_split_group(port, k)
-    else:
-        if stderr == "":
-            return create_split_group(port, k)
-        print("didn't split a splittable port %s" % port.name)
-
-    return []
-
-
-def unsplit(port):
-    """
-    Unsplit $port.
-    """
-
-    cmd = "devlink port unsplit %s" % port
-    stdout, stderr = run_command(cmd)
-    test(stderr == "", "Unsplit port %s" % port)
-
-
-def exists(port, dev):
-    """
-    Check if $port exists in the devlink ports.
-    Return: True is so, False otherwise.
-    """
-
-    return any(dev_port.name == port
-               for dev_port in devlink_ports.get_if_names(dev))
-
-
-def exists_and_lanes(ports, lanes, dev):
-    """
-    Check if every port in the list $ports exists in the devlink ports and has
-    $lanes number of lanes after splitting.
-    Return: True if both are True, False otherwise.
-    """
-
-    for port in ports:
-        max_lanes = get_max_lanes(port)
-        if not exists(port, dev):
-            print("port %s doesn't exist in devlink ports" % port)
-            return False
-        if max_lanes != lanes:
-            print("port %s has %d lanes, but %s were expected"
-                  % (port, lanes, max_lanes))
-            return False
-    return True
-
-
-def test(cond, msg):
-    """
-    Check $cond and print a message accordingly.
-    Return: True is pass, False otherwise.
-    """
-
-    if cond:
-        print("TEST: %-60s [ OK ]" % msg)
-    else:
-        print("TEST: %-60s [FAIL]" % msg)
-
-    return cond
-
-
-def create_split_group(port, k):
-    """
-    Create the split group for $port.
-    Return: Array with $k elements, which are the split port group.
-    """
-
-    return list(port.name + "s" + str(i) for i in range(k))
-
-
-def split_unsplittable_port(port, k):
-    """
-    Test that splitting of unsplittable port fails.
-    """
-
-    # split to max
-    new_split_group = split(k, port, should_fail=True)
-
-    if new_split_group != []:
-        unsplit(port.bus_info)
-
-
-def split_splittable_port(port, k, lanes, dev):
-    """
-    Test that splitting of splittable port passes correctly.
-    """
-
-    new_split_group = split(k, port)
-
-    # Once the split command ends, it takes some time to the sub ifaces'
-    # to get their names. Use udevadm to continue only when all current udev
-    # events are handled.
-    cmd = "udevadm settle"
-    stdout, stderr = run_command(cmd)
-    assert stderr == ""
-
-    if new_split_group != []:
-        test(exists_and_lanes(new_split_group, lanes/k, dev),
-             "split port %s into %s" % (port.name, k))
-
-    unsplit(port.bus_info)
-
-
-def validate_devlink_output(devlink_data, target_property=None):
-    """
-    Determine if test should be skipped by checking:
-      1. devlink_data contains values
-      2. The target_property exist in devlink_data
-    """
-    skip_reason = None
-    if any(devlink_data.values()):
-        if target_property:
-            skip_reason = "{} not found in devlink output, test skipped".format(target_property)
-            for key in devlink_data:
-                if target_property in devlink_data[key]:
-                    skip_reason = None
-    else:
-        skip_reason = 'devlink output is empty, test skipped'
-
-    if skip_reason:
-        print(skip_reason)
-        sys.exit(KSFT_SKIP)
-
-
-def make_parser():
-    parser = argparse.ArgumentParser(description='A test for port splitting.')
-    parser.add_argument('--dev',
-                        help='The devlink handle of the device under test. ' +
-                             'The default is the first registered devlink ' +
-                             'handle.')
-
-    return parser
-
-
-def main(cmdline=None):
-    parser = make_parser()
-    args = parser.parse_args(cmdline)
-
-    dev = args.dev
-    if not dev:
-        cmd = "devlink -j dev show"
-        stdout, stderr = run_command(cmd)
-        assert stderr == ""
-
-        validate_devlink_output(json.loads(stdout))
-        devs = json.loads(stdout)['dev']
-        dev = list(devs.keys())[0]
-
-    cmd = "devlink dev show %s" % dev
-    stdout, stderr = run_command(cmd)
-    if stderr != "":
-        print("devlink device %s can not be found" % dev)
-        sys.exit(1)
-
-    ports = devlink_ports(dev)
-
-    found_max_lanes = False
-    for port in ports.if_names:
-        max_lanes = get_max_lanes(port.name)
-
-        # If max lanes is 0, do not test port splitting at all
-        if max_lanes == 0:
-            continue
-
-        # If 1 lane, shouldn't be able to split
-        elif max_lanes == 1:
-            test(not get_split_ability(port),
-                 "%s should not be able to split" % port.name)
-            split_unsplittable_port(port, max_lanes)
-
-        # Else, splitting should pass and all the split ports should exist.
-        else:
-            lane = max_lanes
-            test(get_split_ability(port),
-                 "%s should be able to split" % port.name)
-            while lane > 1:
-                split_splittable_port(port, lane, max_lanes, dev)
-
-                lane //= 2
-        found_max_lanes = True
-
-    if not found_max_lanes:
-        print(f"Test not started, no port of device {dev} reports max_lanes")
-        sys.exit(KSFT_SKIP)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh
index 7c4818c971fc..507d0a82f5f0 100755
--- a/tools/testing/selftests/net/drop_monitor_tests.sh
+++ b/tools/testing/selftests/net/drop_monitor_tests.sh
@@ -77,7 +77,7 @@ sw_drops_test()
 
 	rm ${dir}/packets.pcap
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	timeout 5 dwdump -o sw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0))
diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c
new file mode 100644
index 000000000000..16e457c2f877
--- /dev/null
+++ b/tools/testing/selftests/net/epoll_busy_poll.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* Basic per-epoll context busy poll test.
+ *
+ * Only tests the ioctls, but should be expanded to test two connected hosts in
+ * the future
+ */
+
+#define _GNU_SOURCE
+
+#include <error.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/capability.h>
+
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "../kselftest_harness.h"
+
+/* if the headers haven't been updated, we need to define some things */
+#if !defined(EPOLL_IOC_TYPE)
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+#endif
+
+FIXTURE(invalid_fd)
+{
+	int invalid_fd;
+	struct epoll_params params;
+};
+
+FIXTURE_SETUP(invalid_fd)
+{
+	int ret;
+
+	ret = socket(AF_UNIX, SOCK_DGRAM, 0);
+	EXPECT_NE(-1, ret)
+		TH_LOG("error creating unix socket");
+
+	self->invalid_fd = ret;
+}
+
+FIXTURE_TEARDOWN(invalid_fd)
+{
+	int ret;
+
+	ret = close(self->invalid_fd);
+	EXPECT_EQ(0, ret);
+}
+
+TEST_F(invalid_fd, test_invalid_fd)
+{
+	int ret;
+
+	ret = ioctl(self->invalid_fd, EPIOCGPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCGPARAMS on invalid epoll FD should error");
+
+	EXPECT_EQ(ENOTTY, errno)
+		TH_LOG("EPIOCGPARAMS on invalid epoll FD should set errno to ENOTTY");
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	ret = ioctl(self->invalid_fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS on invalid epoll FD should error");
+
+	EXPECT_EQ(ENOTTY, errno)
+		TH_LOG("EPIOCSPARAMS on invalid epoll FD should set errno to ENOTTY");
+}
+
+FIXTURE(epoll_busy_poll)
+{
+	int fd;
+	struct epoll_params params;
+	struct epoll_params *invalid_params;
+	cap_t caps;
+};
+
+FIXTURE_SETUP(epoll_busy_poll)
+{
+	int ret;
+
+	ret = epoll_create1(0);
+	EXPECT_NE(-1, ret)
+		TH_LOG("epoll_create1 failed?");
+
+	self->fd = ret;
+
+	self->caps = cap_get_proc();
+	EXPECT_NE(NULL, self->caps);
+}
+
+FIXTURE_TEARDOWN(epoll_busy_poll)
+{
+	int ret;
+
+	ret = close(self->fd);
+	EXPECT_EQ(0, ret);
+
+	ret = cap_free(self->caps);
+	EXPECT_NE(-1, ret)
+		TH_LOG("unable to free capabilities");
+}
+
+TEST_F(epoll_busy_poll, test_get_params)
+{
+	/* begin by getting the epoll params from the kernel
+	 *
+	 * the default should be default and all fields should be zero'd by the
+	 * kernel, so set params fields to garbage to test this.
+	 */
+	int ret = 0;
+
+	self->params.busy_poll_usecs = 0xff;
+	self->params.busy_poll_budget = 0xff;
+	self->params.prefer_busy_poll = 1;
+	self->params.__pad = 0xf;
+
+	ret = ioctl(self->fd, EPIOCGPARAMS, &self->params);
+	EXPECT_EQ(0, ret)
+		TH_LOG("ioctl EPIOCGPARAMS should succeed");
+
+	EXPECT_EQ(0, self->params.busy_poll_usecs)
+		TH_LOG("EPIOCGPARAMS busy_poll_usecs should have been 0");
+
+	EXPECT_EQ(0, self->params.busy_poll_budget)
+		TH_LOG("EPIOCGPARAMS busy_poll_budget should have been 0");
+
+	EXPECT_EQ(0, self->params.prefer_busy_poll)
+		TH_LOG("EPIOCGPARAMS prefer_busy_poll should have been 0");
+
+	EXPECT_EQ(0, self->params.__pad)
+		TH_LOG("EPIOCGPARAMS __pad should have been 0");
+
+	self->invalid_params = (struct epoll_params *)0xdeadbeef;
+	ret = ioctl(self->fd, EPIOCGPARAMS, self->invalid_params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCGPARAMS should error with invalid params");
+
+	EXPECT_EQ(EFAULT, errno)
+		TH_LOG("EPIOCGPARAMS with invalid params should set errno to EFAULT");
+}
+
+TEST_F(epoll_busy_poll, test_set_invalid)
+{
+	int ret;
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	self->params.__pad = 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS non-zero __pad should error");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS non-zero __pad errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = (uint32_t)INT_MAX + 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error busy_poll_usecs > S32_MAX");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS busy_poll_usecs > S32_MAX errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = 32;
+	self->params.prefer_busy_poll = 2;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error prefer_busy_poll > 1");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS prefer_busy_poll > 1 errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = 32;
+	self->params.prefer_busy_poll = 1;
+
+	/* set budget well above kernel's NAPI_POLL_WEIGHT of 64 */
+	self->params.busy_poll_budget = UINT16_MAX;
+
+	/* test harness should run with CAP_NET_ADMIN, but let's make sure */
+	cap_flag_value_t tmp;
+
+	ret = cap_get_flag(self->caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &tmp);
+	EXPECT_EQ(0, ret)
+		TH_LOG("unable to get CAP_NET_ADMIN cap flag");
+
+	EXPECT_EQ(CAP_SET, tmp)
+		TH_LOG("expecting CAP_NET_ADMIN to be set for the test harness");
+
+	/* at this point we know CAP_NET_ADMIN is available, so setting the
+	 * params with a busy_poll_budget > NAPI_POLL_WEIGHT should succeed
+	 */
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCSPARAMS should allow busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	/* remove CAP_NET_ADMIN from our effective set */
+	cap_value_t net_admin[] = { CAP_NET_ADMIN };
+
+	ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR);
+	EXPECT_EQ(0, ret)
+		TH_LOG("couldn't clear CAP_NET_ADMIN");
+
+	ret = cap_set_proc(self->caps);
+	EXPECT_EQ(0, ret)
+		TH_LOG("cap_set_proc should drop CAP_NET_ADMIN");
+
+	/* this is now expected to fail */
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	EXPECT_EQ(EPERM, errno)
+		TH_LOG("EPIOCSPARAMS errno should be EPERM busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	/* restore CAP_NET_ADMIN to our effective set */
+	ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_SET);
+	EXPECT_EQ(0, ret)
+		TH_LOG("couldn't restore CAP_NET_ADMIN");
+
+	ret = cap_set_proc(self->caps);
+	EXPECT_EQ(0, ret)
+		TH_LOG("cap_set_proc should set  CAP_NET_ADMIN");
+
+	self->invalid_params = (struct epoll_params *)0xdeadbeef;
+	ret = ioctl(self->fd, EPIOCSPARAMS, self->invalid_params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error when epoll_params is invalid");
+
+	EXPECT_EQ(EFAULT, errno)
+		TH_LOG("EPIOCSPARAMS should set errno to EFAULT when epoll_params is invalid");
+}
+
+TEST_F(epoll_busy_poll, test_set_and_get_valid)
+{
+	int ret;
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	self->params.busy_poll_usecs = 25;
+	self->params.busy_poll_budget = 16;
+	self->params.prefer_busy_poll = 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCSPARAMS with valid params should not error");
+
+	/* check that the kernel returns the same values back */
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	ret = ioctl(self->fd, EPIOCGPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCGPARAMS should not error");
+
+	EXPECT_EQ(25, self->params.busy_poll_usecs)
+		TH_LOG("params.busy_poll_usecs incorrect");
+
+	EXPECT_EQ(16, self->params.busy_poll_budget)
+		TH_LOG("params.busy_poll_budget incorrect");
+
+	EXPECT_EQ(1, self->params.prefer_busy_poll)
+		TH_LOG("params.prefer_busy_poll incorrect");
+
+	EXPECT_EQ(0, self->params.__pad)
+		TH_LOG("params.__pad was not 0");
+}
+
+TEST_F(epoll_busy_poll, test_invalid_ioctl)
+{
+	int invalid_ioctl = EPIOCGPARAMS + 10;
+	int ret;
+
+	ret = ioctl(self->fd, invalid_ioctl, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("invalid ioctl should return error");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("invalid ioctl should set errno to EINVAL");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 386ebd829df5..4fcc38907e48 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -3667,7 +3667,7 @@ ipv6_addr_bind_novrf()
 	# when it really should not
 	a=${NSA_LO_IP6}
 	log_start
-	show_hint "Tecnically should fail since address is not on device but kernel allows"
+	show_hint "Technically should fail since address is not on device but kernel allows"
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
 	log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address"
 }
@@ -3724,7 +3724,7 @@ ipv6_addr_bind_vrf()
 	# passes when it really should not
 	a=${VRF_IP6}
 	log_start
-	show_hint "Tecnically should fail since address is not on device but kernel allows"
+	show_hint "Technically should fail since address is not on device but kernel allows"
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
 	log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind"
 
@@ -4304,14 +4304,7 @@ elif [ "$TESTS" = "ipv6" ]; then
 	TESTS="$TESTS_IPV6"
 fi
 
-# nettest can be run from PATH or from same directory as this selftest
-if ! which nettest >/dev/null; then
-	PATH=$PWD:$PATH
-	if ! which nettest >/dev/null; then
-		echo "'nettest' command not found; skipping tests"
-		exit $ksft_skip
-	fi
-fi
+check_gen_prog "nettest"
 
 declare -i nfail=0
 declare -i nsuccess=0
diff --git a/tools/testing/selftests/net/fdb_flush.sh b/tools/testing/selftests/net/fdb_flush.sh
index d5e3abb8658c..9931a1e36e3d 100755
--- a/tools/testing/selftests/net/fdb_flush.sh
+++ b/tools/testing/selftests/net/fdb_flush.sh
@@ -583,7 +583,7 @@ vxlan_test_flush_by_remote_attributes()
 	$IP link del dev vx10
 	$IP link add name vx10 type vxlan dstport "$VXPORT" external
 
-	# For multicat FDB entries, the VXLAN driver stores a linked list of
+	# For multicast FDB entries, the VXLAN driver stores a linked list of
 	# remotes for a given key. Verify that only the expected remotes are
 	# flushed.
 	multicast_fdb_entries_add
diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh
new file mode 100755
index 000000000000..c159230c9b62
--- /dev/null
+++ b/tools/testing/selftests/net/fdb_notify.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+ALL_TESTS="
+	test_dup_bridge
+	test_dup_vxlan_self
+	test_dup_vxlan_master
+	test_dup_macvlan_self
+	test_dup_macvlan_master
+"
+
+do_test_dup()
+{
+	local op=$1; shift
+	local what=$1; shift
+	local tmpf
+
+	RET=0
+
+	tmpf=$(mktemp)
+	defer rm "$tmpf"
+
+	defer_scope_push
+		bridge monitor fdb &> "$tmpf" &
+		defer kill_process $!
+
+		sleep 0.5
+		bridge fdb "$op" 00:11:22:33:44:55 vlan 1 "$@"
+		sleep 0.5
+	defer_scope_pop
+
+	local count=$(grep -c -e 00:11:22:33:44:55 $tmpf)
+	((count == 1))
+	check_err $? "Got $count notifications, expected 1"
+
+	log_test "$what $op: Duplicate notifications"
+}
+
+test_dup_bridge()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	do_test_dup add "bridge" dev br self
+	do_test_dup del "bridge" dev br self
+}
+
+test_dup_vxlan_self()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add vx up type vxlan id 2000 dstport 4789
+	ip_link_set_master vx br
+
+	do_test_dup add "vxlan" dev vx self dst 192.0.2.1
+	do_test_dup del "vxlan" dev vx self dst 192.0.2.1
+}
+
+test_dup_vxlan_master()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add vx up type vxlan id 2000 dstport 4789
+	ip_link_set_master vx br
+
+	do_test_dup add "vxlan master" dev vx master
+	do_test_dup del "vxlan master" dev vx master
+}
+
+test_dup_macvlan_self()
+{
+	ip_link_add dd up type dummy
+	ip_link_add mv up link dd type macvlan mode passthru
+
+	do_test_dup add "macvlan self" dev mv self
+	do_test_dup del "macvlan self" dev mv self
+}
+
+test_dup_macvlan_master()
+{
+	ip_link_add br up type bridge vlan_filtering 1
+	ip_link_add dd up type dummy
+	ip_link_add mv up link dd type macvlan mode passthru
+	ip_link_set_master mv br
+
+	do_test_dup add "macvlan master" dev mv self
+	do_test_dup del "macvlan master" dev mv self
+}
+
+cleanup()
+{
+	defer_scopes_cleanup
+}
+
+trap cleanup EXIT
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index ac0b2c6a5761..b39f748c2572 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -76,9 +76,16 @@ log_test()
 		printf "TEST: %-60s  [ OK ]\n" "${msg}"
 		nsuccess=$((nsuccess+1))
 	else
-		ret=1
-		nfail=$((nfail+1))
-		printf "TEST: %-60s  [FAIL]\n" "${msg}"
+		if [[ $rc -eq $ksft_skip ]]; then
+			[[ $ret -eq 0 ]] && ret=$ksft_skip
+			nskip=$((nskip+1))
+			printf "TEST: %-60s  [SKIP]\n" "${msg}"
+		else
+			ret=1
+			nfail=$((nfail+1))
+			printf "TEST: %-60s  [FAIL]\n" "${msg}"
+		fi
+
 		if [ "$VERBOSE" = "1" ]; then
 			echo "    rc=$rc, expected $expected"
 		fi
@@ -736,7 +743,7 @@ ipv6_fcnal()
 	run_cmd "$IP nexthop add id 52 via 2001:db8:92::3"
 	log_test $? 2 "Create nexthop - gw only"
 
-	# gw is not reachable throught given dev
+	# gw is not reachable through given dev
 	run_cmd "$IP nexthop add id 53 via 2001:db8:3::3 dev veth1"
 	log_test $? 2 "Create nexthop - invalid gw+dev combination"
 
@@ -923,6 +930,29 @@ ipv6_grp_fcnal()
 
 	ipv6_grp_refs
 	log_test $? 0 "Nexthop group replace refcounts"
+
+	#
+	# 16-bit weights.
+	#
+	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
+	run_cmd "$IP nexthop add id 66 dev veth1"
+
+	run_cmd "$IP nexthop add id 103 group 62,1000"
+	if [[ $? == 0 ]]; then
+		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535"
+		run_cmd "$IP nexthop replace $GRP"
+		check_nexthop "id 103" "$GRP"
+		rc=$?
+	else
+		rc=$ksft_skip
+	fi
+
+	$IP nexthop flush >/dev/null 2>&1
+
+	log_test $rc 0 "16-bit weights"
 }
 
 ipv6_res_grp_fcnal()
@@ -987,6 +1017,31 @@ ipv6_res_grp_fcnal()
 	check_nexthop_bucket "list id 102" \
 		"id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62"
 	log_test $? 0 "Nexthop buckets updated after replace - nECMP"
+
+	#
+	# 16-bit weights.
+	#
+	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
+	run_cmd "$IP nexthop add id 66 dev veth1"
+
+	run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32"
+	if [[ $? == 0 ]]; then
+		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(:
+			  )type resilient buckets 32 idle_timer 0 $(:
+			  )unbalanced_timer 0"
+		run_cmd "$IP nexthop replace $GRP"
+		check_nexthop "id 103" "$GRP unbalanced_time 0"
+		rc=$?
+	else
+		rc=$ksft_skip
+	fi
+
+	$IP nexthop flush >/dev/null 2>&1
+
+	log_test $rc 0 "16-bit weights"
 }
 
 ipv6_fcnal_runtime()
@@ -2475,6 +2530,7 @@ done
 if [ "$TESTS" != "none" ]; then
 	printf "\nTests passed: %3d\n" ${nsuccess}
 	printf "Tests failed: %3d\n"   ${nfail}
+	printf "Tests skipped: %2d\n"  ${nskip}
 fi
 
 exit $ret
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
index 51157a5559b7..5fbdd2a0b537 100755
--- a/tools/testing/selftests/net/fib_rule_tests.sh
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
 
 RTABLE=100
 RTABLE_PEER=101
+RTABLE_VRF=102
 GW_IP4=192.51.100.2
 SRC_IP=192.51.100.3
 GW_IP6=2001:db8:1::2
@@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3
 DEV_ADDR=192.51.100.1
 DEV_ADDR6=2001:db8:1::1
 DEV=dummy0
-TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect"
+TESTS="
+	fib_rule6
+	fib_rule4
+	fib_rule6_connect
+	fib_rule4_connect
+	fib_rule6_vrf
+	fib_rule4_vrf
+"
 
 SELFTEST_PATH=""
 
@@ -29,11 +37,11 @@ log_test()
 
 	if [ ${rc} -eq ${expected} ]; then
 		nsuccess=$((nsuccess+1))
-		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+		printf "    TEST: %-60s  [ OK ]\n" "${msg}"
 	else
 		ret=1
 		nfail=$((nfail+1))
-		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		printf "    TEST: %-60s  [FAIL]\n" "${msg}"
 		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
 			echo
 			echo "hit enter to continue, 'q' to quit"
@@ -43,39 +51,6 @@ log_test()
 	fi
 }
 
-log_section()
-{
-	echo
-	echo "######################################################################"
-	echo "TEST SECTION: $*"
-	echo "######################################################################"
-}
-
-check_nettest()
-{
-	if which nettest > /dev/null 2>&1; then
-		return 0
-	fi
-
-	# Add the selftest directory to PATH if not already done
-	if [ "${SELFTEST_PATH}" = "" ]; then
-		SELFTEST_PATH="$(dirname $0)"
-		PATH="${PATH}:${SELFTEST_PATH}"
-
-		# Now retry with the new path
-		if which nettest > /dev/null 2>&1; then
-			return 0
-		fi
-
-		if [ "${ret}" -eq 0 ]; then
-			ret="${ksft_skip}"
-		fi
-		echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')"
-	fi
-
-	return 1
-}
-
 setup()
 {
 	set -e
@@ -130,6 +105,17 @@ cleanup_peer()
 	ip netns del $peerns
 }
 
+setup_vrf()
+{
+	$IP link add name vrf0 up type vrf table $RTABLE_VRF
+	$IP link set dev $DEV master vrf0
+}
+
+cleanup_vrf()
+{
+	$IP link del dev vrf0
+}
+
 fib_check_iproute_support()
 {
 	ip rule help 2>&1 | grep -q $1
@@ -163,12 +149,17 @@ fib_rule6_test_match_n_redirect()
 {
 	local match="$1"
 	local getmatch="$2"
-	local description="$3"
+	local getnomatch="$3"
+	local description="$4"
+	local nomatch_description="$5"
 
 	$IP -6 rule add $match table $RTABLE
 	$IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
 	log_test $? 0 "rule6 check: $description"
 
+	$IP -6 route get $GW_IP6 $getnomatch 2>&1 | grep -q "table $RTABLE"
+	log_test $? 1 "rule6 check: $nomatch_description"
+
 	fib_rule6_del_by_pref "$match"
 	log_test $? 0 "rule6 del by pref: $description"
 }
@@ -189,18 +180,27 @@ fib_rule6_test_reject()
 
 fib_rule6_test()
 {
+	local ext_name=$1; shift
+	local getnomatch
 	local getmatch
 	local match
 	local cnt
 
+	echo
+	echo "IPv6 FIB rule tests $ext_name"
+
 	# setup the fib rule redirect route
 	$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
 
 	match="oif $DEV"
-	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+	getnomatch="oif lo"
+	fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \
+		"oif redirect to table" "oif no redirect to table"
 
 	match="from $SRC_IP6 iif $DEV"
-	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+	getnomatch="from $SRC_IP6 iif lo"
+	fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \
+		"iif redirect to table" "iif no redirect to table"
 
 	# Reject dsfield (tos) options which have ECN bits set
 	for cnt in $(seq 1 3); do
@@ -214,38 +214,175 @@ fib_rule6_test()
 		# Using option 'tos' instead of 'dsfield' as old iproute2
 		# versions don't support 'dsfield' in ip rule show.
 		getmatch="tos $cnt"
+		getnomatch="tos 0x20"
 		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
-						"$getmatch redirect to table"
+			"$getnomatch" "$getmatch redirect to table" \
+			"$getnomatch no redirect to table"
+	done
+
+	# Re-test TOS matching, but with input routes since they are handled
+	# differently from output routes.
+	match="tos 0x10"
+	for cnt in "0x10" "0x11" "0x12" "0x13"; do
+		getmatch="tos $cnt"
+		getnomatch="tos 0x20"
+		fib_rule6_test_match_n_redirect "$match" \
+			"from $SRC_IP6 iif $DEV $getmatch" \
+			"from $SRC_IP6 iif $DEV $getnomatch" \
+			"iif $getmatch redirect to table" \
+			"iif $getnomatch no redirect to table"
 	done
 
 	match="fwmark 0x64"
 	getmatch="mark 0x64"
-	fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+	getnomatch="mark 0x63"
+	fib_rule6_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \
+		"fwmark redirect to table" "fwmark no redirect to table"
 
 	fib_check_iproute_support "uidrange" "uid"
 	if [ $? -eq 0 ]; then
 		match="uidrange 100-100"
 		getmatch="uid 100"
-		fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+		getnomatch="uid 101"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "uid redirect to table" \
+			"uid no redirect to table"
 	fi
 
 	fib_check_iproute_support "sport" "sport"
 	if [ $? -eq 0 ]; then
 		match="sport 666 dport 777"
-		fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+		getnomatch="sport 667 dport 778"
+		fib_rule6_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "sport and dport redirect to table" \
+			"sport and dport no redirect to table"
+
+		match="sport 100-200 dport 300-400"
+		getmatch="sport 100 dport 400"
+		getnomatch="sport 100 dport 401"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" \
+			"sport and dport range redirect to table" \
+			"sport and dport range no redirect to table"
+	fi
+
+	ip rule help 2>&1 | grep sport | grep -q MASK
+	if [ $? -eq 0 ]; then
+		match="sport 0x0f00/0xff00 dport 0x000f/0x00ff"
+		getmatch="sport 0x0f11 dport 0x220f"
+		getnomatch="sport 0x1f11 dport 0x221f"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "sport and dport masked redirect to table" \
+			"sport and dport masked no redirect to table"
 	fi
 
 	fib_check_iproute_support "ipproto" "ipproto"
 	if [ $? -eq 0 ]; then
 		match="ipproto tcp"
-		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+		getnomatch="ipproto udp"
+		fib_rule6_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "ipproto tcp match" "ipproto udp no match"
 	fi
 
 	fib_check_iproute_support "ipproto" "ipproto"
 	if [ $? -eq 0 ]; then
 		match="ipproto ipv6-icmp"
-		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match"
+		getnomatch="ipproto tcp"
+		fib_rule6_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "ipproto ipv6-icmp match" \
+			"ipproto ipv6-tcp no match"
 	fi
+
+	fib_check_iproute_support "dscp" "tos"
+	if [ $? -eq 0 ]; then
+		match="dscp 0x3f"
+		getmatch="tos 0xfc"
+		getnomatch="tos 0xf4"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "dscp redirect to table" \
+			"dscp no redirect to table"
+
+		match="dscp 0x3f"
+		getmatch="from $SRC_IP6 iif $DEV tos 0xfc"
+		getnomatch="from $SRC_IP6 iif $DEV tos 0xf4"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif dscp redirect to table" \
+			"iif dscp no redirect to table"
+	fi
+
+	ip rule help 2>&1 | grep -q "DSCP\[/MASK\]"
+	if [ $? -eq 0 ]; then
+		match="dscp 0x0f/0x0f"
+		tosmatch=$(printf 0x"%x" $((0x1f << 2)))
+		tosnomatch=$(printf 0x"%x" $((0x1e << 2)))
+		getmatch="tos $tosmatch"
+		getnomatch="tos $tosnomatch"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "dscp masked redirect to table" \
+			"dscp masked no redirect to table"
+
+		match="dscp 0x0f/0x0f"
+		getmatch="from $SRC_IP6 iif $DEV tos $tosmatch"
+		getnomatch="from $SRC_IP6 iif $DEV tos $tosnomatch"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif dscp masked redirect to table" \
+			"iif dscp masked no redirect to table"
+	fi
+
+	fib_check_iproute_support "flowlabel" "flowlabel"
+	if [ $? -eq 0 ]; then
+		match="flowlabel 0xfffff"
+		getmatch="flowlabel 0xfffff"
+		getnomatch="flowlabel 0xf"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "flowlabel redirect to table" \
+			"flowlabel no redirect to table"
+
+		match="flowlabel 0xfffff"
+		getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff"
+		getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif flowlabel redirect to table" \
+			"iif flowlabel no redirect to table"
+
+		match="flowlabel 0x08000/0x08000"
+		getmatch="flowlabel 0xfffff"
+		getnomatch="flowlabel 0xf7fff"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "flowlabel masked redirect to table" \
+			"flowlabel masked no redirect to table"
+
+		match="flowlabel 0x08000/0x08000"
+		getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff"
+		getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf7fff"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif flowlabel masked redirect to table" \
+			"iif flowlabel masked no redirect to table"
+	fi
+
+	$IP link show dev $DEV | grep -q vrf0
+	if [ $? -eq 0 ]; then
+		match="oif vrf0"
+		getmatch="oif $DEV"
+		getnomatch="oif lo"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "VRF oif redirect to table" \
+			"VRF oif no redirect to table"
+
+		match="from $SRC_IP6 iif vrf0"
+		getmatch="from $SRC_IP6 iif $DEV"
+		getnomatch="from $SRC_IP6 iif lo"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "VRF iif redirect to table" \
+			"VRF iif no redirect to table"
+	fi
+}
+
+fib_rule6_vrf_test()
+{
+	setup_vrf
+	fib_rule6_test "- with VRF"
+	cleanup_vrf
 }
 
 # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly
@@ -254,10 +391,8 @@ fib_rule6_connect_test()
 {
 	local dsfield
 
-	if ! check_nettest; then
-		echo "SKIP: Could not run test without nettest tool"
-		return
-	fi
+	echo
+	echo "IPv6 FIB rule connect tests"
 
 	setup_peer
 	$IP -6 rule add dsfield 0x04 table $RTABLE_PEER
@@ -275,7 +410,45 @@ fib_rule6_connect_test()
 		log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})"
 	done
 
+	# Check that UDP and TCP connections fail when using a DS Field that
+	# does not match the previously configured FIB rule.
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D \
+		-Q 0x20 -l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 1 "rule6 dsfield udp no connect (dsfield 0x20)"
+
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0x20 \
+		-l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 1 "rule6 dsfield tcp no connect (dsfield 0x20)"
+
 	$IP -6 rule del dsfield 0x04 table $RTABLE_PEER
+
+	ip rule help 2>&1 | grep -q dscp
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing dscp match"
+		cleanup_peer
+		return
+	fi
+
+	$IP -6 rule add dscp 0x3f table $RTABLE_PEER
+
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xfc \
+		-l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 0 "rule6 dscp udp connect"
+
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xfc \
+		-l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 0 "rule6 dscp tcp connect"
+
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xf4 \
+		-l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 1 "rule6 dscp udp no connect"
+
+	nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xf4 \
+		-l 2001:db8::1:11 -r 2001:db8::1:11
+	log_test $? 1 "rule6 dscp tcp no connect"
+
+	$IP -6 rule del dscp 0x3f table $RTABLE_PEER
+
 	cleanup_peer
 }
 
@@ -295,12 +468,17 @@ fib_rule4_test_match_n_redirect()
 {
 	local match="$1"
 	local getmatch="$2"
-	local description="$3"
+	local getnomatch="$3"
+	local description="$4"
+	local nomatch_description="$5"
 
 	$IP rule add $match table $RTABLE
 	$IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
 	log_test $? 0 "rule4 check: $description"
 
+	$IP route get $GW_IP4 $getnomatch 2>&1 | grep -q "table $RTABLE"
+	log_test $? 1 "rule4 check: $nomatch_description"
+
 	fib_rule4_del_by_pref "$match"
 	log_test $? 0 "rule4 del by pref: $description"
 }
@@ -321,23 +499,28 @@ fib_rule4_test_reject()
 
 fib_rule4_test()
 {
+	local ext_name=$1; shift
+	local getnomatch
 	local getmatch
 	local match
 	local cnt
 
+	echo
+	echo "IPv4 FIB rule tests $ext_name"
+
 	# setup the fib rule redirect route
 	$IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
 
 	match="oif $DEV"
-	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+	getnomatch="oif lo"
+	fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \
+		"oif redirect to table" "oif no redirect to table"
 
-	# need enable forwarding and disable rp_filter temporarily as all the
-	# addresses are in the same subnet and egress device == ingress device.
 	ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1
-	ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0
 	match="from $SRC_IP iif $DEV"
-	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
-	ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0
+	getnomatch="from $SRC_IP iif lo"
+	fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \
+		"iif redirect to table" "iif no redirect to table"
 
 	# Reject dsfield (tos) options which have ECN bits set
 	for cnt in $(seq 1 3); do
@@ -351,38 +534,145 @@ fib_rule4_test()
 		# Using option 'tos' instead of 'dsfield' as old iproute2
 		# versions don't support 'dsfield' in ip rule show.
 		getmatch="tos $cnt"
+		getnomatch="tos 0x20"
 		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
-						"$getmatch redirect to table"
+			"$getnomatch" "$getmatch redirect to table" \
+			"$getnomatch no redirect to table"
+	done
+
+	# Re-test TOS matching, but with input routes since they are handled
+	# differently from output routes.
+	match="tos 0x10"
+	for cnt in "0x10" "0x11" "0x12" "0x13"; do
+		getmatch="tos $cnt"
+		getnomatch="tos 0x20"
+		fib_rule4_test_match_n_redirect "$match" \
+			"from $SRC_IP iif $DEV $getmatch" \
+			"from $SRC_IP iif $DEV $getnomatch" \
+			"iif $getmatch redirect to table" \
+			"iif $getnomatch no redirect to table"
 	done
 
 	match="fwmark 0x64"
 	getmatch="mark 0x64"
-	fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+	getnomatch="mark 0x63"
+	fib_rule4_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \
+		"fwmark redirect to table" "fwmark no redirect to table"
 
 	fib_check_iproute_support "uidrange" "uid"
 	if [ $? -eq 0 ]; then
 		match="uidrange 100-100"
 		getmatch="uid 100"
-		fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+		getnomatch="uid 101"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "uid redirect to table" \
+			"uid no redirect to table"
 	fi
 
 	fib_check_iproute_support "sport" "sport"
 	if [ $? -eq 0 ]; then
 		match="sport 666 dport 777"
-		fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+		getnomatch="sport 667 dport 778"
+		fib_rule4_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "sport and dport redirect to table" \
+			"sport and dport no redirect to table"
+
+		match="sport 100-200 dport 300-400"
+		getmatch="sport 100 dport 400"
+		getnomatch="sport 100 dport 401"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" \
+			"sport and dport range redirect to table" \
+			"sport and dport range no redirect to table"
+	fi
+
+	ip rule help 2>&1 | grep sport | grep -q MASK
+	if [ $? -eq 0 ]; then
+		match="sport 0x0f00/0xff00 dport 0x000f/0x00ff"
+		getmatch="sport 0x0f11 dport 0x220f"
+		getnomatch="sport 0x1f11 dport 0x221f"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "sport and dport masked redirect to table" \
+			"sport and dport masked no redirect to table"
 	fi
 
 	fib_check_iproute_support "ipproto" "ipproto"
 	if [ $? -eq 0 ]; then
 		match="ipproto tcp"
-		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+		getnomatch="ipproto udp"
+		fib_rule4_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "ipproto tcp match" \
+			"ipproto udp no match"
 	fi
 
 	fib_check_iproute_support "ipproto" "ipproto"
 	if [ $? -eq 0 ]; then
 		match="ipproto icmp"
-		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+		getnomatch="ipproto tcp"
+		fib_rule4_test_match_n_redirect "$match" "$match" \
+			"$getnomatch" "ipproto icmp match" \
+			"ipproto tcp no match"
+	fi
+
+	fib_check_iproute_support "dscp" "tos"
+	if [ $? -eq 0 ]; then
+		match="dscp 0x3f"
+		getmatch="tos 0xfc"
+		getnomatch="tos 0xf4"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "dscp redirect to table" \
+			"dscp no redirect to table"
+
+		match="dscp 0x3f"
+		getmatch="from $SRC_IP iif $DEV tos 0xfc"
+		getnomatch="from $SRC_IP iif $DEV tos 0xf4"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif dscp redirect to table" \
+			"iif dscp no redirect to table"
+	fi
+
+	ip rule help 2>&1 | grep -q "DSCP\[/MASK\]"
+	if [ $? -eq 0 ]; then
+		match="dscp 0x0f/0x0f"
+		tosmatch=$(printf 0x"%x" $((0x1f << 2)))
+		tosnomatch=$(printf 0x"%x" $((0x1e << 2)))
+		getmatch="tos $tosmatch"
+		getnomatch="tos $tosnomatch"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "dscp masked redirect to table" \
+			"dscp masked no redirect to table"
+
+		match="dscp 0x0f/0x0f"
+		getmatch="from $SRC_IP iif $DEV tos $tosmatch"
+		getnomatch="from $SRC_IP iif $DEV tos $tosnomatch"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "iif dscp masked redirect to table" \
+			"iif dscp masked no redirect to table"
 	fi
+
+	$IP link show dev $DEV | grep -q vrf0
+	if [ $? -eq 0 ]; then
+		match="oif vrf0"
+		getmatch="oif $DEV"
+		getnomatch="oif lo"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "VRF oif redirect to table" \
+			"VRF oif no redirect to table"
+
+		match="from $SRC_IP iif vrf0"
+		getmatch="from $SRC_IP iif $DEV"
+		getnomatch="from $SRC_IP iif lo"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" \
+			"$getnomatch" "VRF iif redirect to table" \
+			"VRF iif no redirect to table"
+	fi
+}
+
+fib_rule4_vrf_test()
+{
+	setup_vrf
+	fib_rule4_test "- with VRF"
+	cleanup_vrf
 }
 
 # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken
@@ -391,10 +681,8 @@ fib_rule4_connect_test()
 {
 	local dsfield
 
-	if ! check_nettest; then
-		echo "SKIP: Could not run test without nettest tool"
-		return
-	fi
+	echo
+	echo "IPv4 FIB rule connect tests"
 
 	setup_peer
 	$IP -4 rule add dsfield 0x04 table $RTABLE_PEER
@@ -412,16 +700,46 @@ fib_rule4_connect_test()
 		log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})"
 	done
 
+	# Check that UDP and TCP connections fail when using a DS Field that
+	# does not match the previously configured FIB rule.
+	nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0x20 \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 1 "rule4 dsfield udp no connect (dsfield 0x20)"
+
+	nettest -q -B -t 5 -N $testns -O $peerns -Q 0x20 \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 1 "rule4 dsfield tcp no connect (dsfield 0x20)"
+
 	$IP -4 rule del dsfield 0x04 table $RTABLE_PEER
-	cleanup_peer
-}
 
-run_fibrule_tests()
-{
-	log_section "IPv4 fib rule"
-	fib_rule4_test
-	log_section "IPv6 fib rule"
-	fib_rule6_test
+	ip rule help 2>&1 | grep -q dscp
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing dscp match"
+		cleanup_peer
+		return
+	fi
+
+	$IP -4 rule add dscp 0x3f table $RTABLE_PEER
+
+	nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xfc \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 0 "rule4 dscp udp connect"
+
+	nettest -q -B -t 5 -N $testns -O $peerns -Q 0xfc \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 0 "rule4 dscp tcp connect"
+
+	nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xf4 \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 1 "rule4 dscp udp no connect"
+
+	nettest -q -B -t 5 -N $testns -O $peerns -Q 0xf4 \
+		-l 198.51.100.11 -r 198.51.100.11
+	log_test $? 1 "rule4 dscp tcp no connect"
+
+	$IP -4 rule del dscp 0x3f table $RTABLE_PEER
+
+	cleanup_peer
 }
 ################################################################################
 # usage
@@ -457,6 +775,8 @@ if [ ! -x "$(command -v ip)" ]; then
 	exit $ksft_skip
 fi
 
+check_gen_prog "nettest"
+
 # start clean
 cleanup &> /dev/null
 setup
@@ -467,6 +787,8 @@ do
 	fib_rule4_test|fib_rule4)		fib_rule4_test;;
 	fib_rule6_connect_test|fib_rule6_connect)	fib_rule6_connect_test;;
 	fib_rule4_connect_test|fib_rule4_connect)	fib_rule4_connect_test;;
+	fib_rule6_vrf_test|fib_rule6_vrf)	fib_rule6_vrf_test;;
+	fib_rule4_vrf_test|fib_rule4_vrf)	fib_rule4_vrf_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 73895711cdf4..a94b73a53f72 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
        ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \
        ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
        ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
-       ipv4_mpath_list ipv6_mpath_list"
+       ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance"
 
 VERBOSE=0
 PAUSE_ON_FAIL=no
@@ -689,7 +689,7 @@ fib6_notify_test()
 
 	log_test $ret 0 "ipv6 route add notify"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 
 	#rm errors.txt
 
@@ -736,7 +736,7 @@ fib_notify_test()
 
 	log_test $ret 0 "ipv4 route add notify"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 
 	rm  errors.txt
 
@@ -1085,6 +1085,35 @@ route_setup()
 	set +e
 }
 
+forwarding_cleanup()
+{
+	cleanup_ns $ns3
+
+	route_cleanup
+}
+
+# extend route_setup with an ns3 reachable through ns2 over both devices
+forwarding_setup()
+{
+	forwarding_cleanup
+
+	route_setup
+
+	setup_ns ns3
+
+	ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2
+	ip -netns $ns3 link set veth5 up
+	ip -netns $ns2 link set veth6 up
+
+	ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24
+	ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24
+	ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2
+
+	ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad
+	ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad
+	ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2
+}
+
 # assumption is that basic add of a single path route works
 # otherwise just adding an address on an interface is broken
 ipv6_rt_add()
@@ -1737,53 +1766,53 @@ ipv4_rt_dsfield()
 
 	# DSCP 0x10 should match the specific route, no matter the ECN bits
 	$IP route get fibmatch 172.16.102.1 dsfield 0x10 | \
-		grep -q "via 172.16.103.2"
+		grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2"
 	log_test $? 0 "IPv4 route with DSCP and ECN:Not-ECT"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x11 | \
-		grep -q "via 172.16.103.2"
+		grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2"
 	log_test $? 0 "IPv4 route with DSCP and ECN:ECT(1)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x12 | \
-		grep -q "via 172.16.103.2"
+		grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2"
 	log_test $? 0 "IPv4 route with DSCP and ECN:ECT(0)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x13 | \
-		grep -q "via 172.16.103.2"
+		grep -q "172.16.102.0/24 tos 0x10 via 172.16.103.2"
 	log_test $? 0 "IPv4 route with DSCP and ECN:CE"
 
 	# Unknown DSCP should match the generic route, no matter the ECN bits
 	$IP route get fibmatch 172.16.102.1 dsfield 0x14 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with unknown DSCP and ECN:Not-ECT"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x15 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(1)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x16 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with unknown DSCP and ECN:ECT(0)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x17 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with unknown DSCP and ECN:CE"
 
 	# Null DSCP should match the generic route, no matter the ECN bits
 	$IP route get fibmatch 172.16.102.1 dsfield 0x00 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with no DSCP and ECN:Not-ECT"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x01 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(1)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x02 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with no DSCP and ECN:ECT(0)"
 
 	$IP route get fibmatch 172.16.102.1 dsfield 0x03 | \
-		grep -q "via 172.16.101.2"
+		grep -q "172.16.102.0/24 via 172.16.101.2"
 	log_test $? 0 "IPv4 route with no DSCP and ECN:CE"
 }
 
@@ -2328,7 +2357,7 @@ ipv4_mangle_test()
 	$IP route del table 123 172.16.101.0/24 dev veth1
 	$IP rule del pref 100
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	rm $tmp_file
 
 	route_cleanup
@@ -2386,7 +2415,7 @@ ipv6_mangle_test()
 	$IP -6 route del table 123 2001:db8:101::/64 dev veth1
 	$IP -6 rule del pref 100
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	rm $tmp_file
 
 	route_cleanup
@@ -2531,9 +2560,6 @@ ipv4_mpath_list_test()
 	run_cmd "ip -n $ns2 route add 203.0.113.0/24
 		nexthop via 172.16.201.2 nexthop via 172.16.202.2"
 	run_cmd "ip netns exec $ns2 sysctl -qw net.ipv4.fib_multipath_hash_policy=1"
-	run_cmd "ip netns exec $ns2 sysctl -qw net.ipv4.conf.veth2.rp_filter=0"
-	run_cmd "ip netns exec $ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0"
-	run_cmd "ip netns exec $ns2 sysctl -qw net.ipv4.conf.default.rp_filter=0"
 	set +e
 
 	local dmac=$(ip -n $ns2 -j link show dev veth2 | jq -r '.[]["address"]')
@@ -2600,6 +2626,93 @@ ipv6_mpath_list_test()
 	route_cleanup
 }
 
+tc_set_flower_counter__saddr_syn() {
+	tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2"
+}
+
+ip_mpath_balance_dep_check()
+{
+	if [ ! -x "$(command -v socat)" ]; then
+		echo "socat command not found. Skipping test"
+		return 1
+	fi
+
+	if [ ! -x "$(command -v jq)" ]; then
+		echo "jq command not found. Skipping test"
+		return 1
+	fi
+}
+
+ip_mpath_balance() {
+	local -r ipver=$1
+	local -r daddr=$2
+	local -r num_conn=20
+
+	for i in $(seq 1 $num_conn); do
+		ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null &
+		sleep 0.02
+		echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000
+	done
+
+	local -r syn0="$(tc_get_flower_counter $ns1 veth1)"
+	local -r syn1="$(tc_get_flower_counter $ns1 veth3)"
+	local -r syns=$((syn0+syn1))
+
+	[ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)"
+
+	[[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]]
+}
+
+ipv4_mpath_balance_test()
+{
+	echo
+	echo "IPv4 multipath load balance test"
+
+	ip_mpath_balance_dep_check || return 1
+	forwarding_setup
+
+	$IP route add 172.16.105.1 \
+		nexthop via 172.16.101.2 \
+		nexthop via 172.16.103.2
+
+	ip netns exec $ns1 \
+		sysctl -q -w net.ipv4.fib_multipath_hash_policy=1
+
+	tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1
+	tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1
+
+	ip_mpath_balance -4 172.16.105.1
+
+	log_test $? 0 "IPv4 multipath loadbalance"
+
+	forwarding_cleanup
+}
+
+ipv6_mpath_balance_test()
+{
+	echo
+	echo "IPv6 multipath load balance test"
+
+	ip_mpath_balance_dep_check || return 1
+	forwarding_setup
+
+	$IP route add 2001:db8:105::1\
+		nexthop via 2001:db8:101::2 \
+		nexthop via 2001:db8:103::2
+
+	ip netns exec $ns1 \
+		sysctl -q -w net.ipv6.fib_multipath_hash_policy=1
+
+	tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1
+	tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1
+
+	ip_mpath_balance -6 "[2001:db8:105::1]"
+
+	log_test $? 0 "IPv6 multipath loadbalance"
+
+	forwarding_cleanup
+}
+
 ################################################################################
 # usage
 
@@ -2683,6 +2796,8 @@ do
 	fib6_gc_test|ipv6_gc)		fib6_gc_test;;
 	ipv4_mpath_list)		ipv4_mpath_list_test;;
 	ipv6_mpath_list)		ipv6_mpath_list_test;;
+	ipv4_mpath_balance)		ipv4_mpath_balance_test;;
+	ipv6_mpath_balance)		ipv6_mpath_balance_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 	esac
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 535865b3d1d6..00bde7b6f39e 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	bridge_vlan_unaware.sh \
 	custom_multipath_hash.sh \
 	dual_vxlan_bridge.sh \
-	ethtool_extended_state.sh \
-	ethtool_mm.sh \
-	ethtool_rmon.sh \
-	ethtool.sh \
 	gre_custom_multipath_hash.sh \
 	gre_inner_v4_multipath.sh \
 	gre_inner_v6_multipath.sh \
 	gre_multipath_nh_res.sh \
 	gre_multipath_nh.sh \
 	gre_multipath.sh \
-	hw_stats_l3.sh \
-	hw_stats_l3_gre.sh \
 	ip6_forward_instats_vrf.sh \
 	ip6gre_custom_multipath_hash.sh \
 	ip6gre_flat_key.sh \
@@ -43,8 +37,9 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	ipip_hier_gre_key.sh \
 	ipip_hier_gre_keys.sh \
 	ipip_hier_gre.sh \
+	lib_sh_test.sh \
 	local_termination.sh \
-	loopback.sh \
+	min_max_mtu.sh \
 	mirror_gre_bound.sh \
 	mirror_gre_bridge_1d.sh \
 	mirror_gre_bridge_1d_vlan.sh \
@@ -76,6 +71,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	router_broadcast.sh \
 	router_mpath_nh_res.sh \
 	router_mpath_nh.sh \
+	router_mpath_seed.sh \
 	router_multicast.sh \
 	router_multipath.sh \
 	router_nh.sh \
@@ -109,11 +105,11 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	vxlan_bridge_1q_port_8472_ipv6.sh \
 	vxlan_bridge_1q_port_8472.sh \
 	vxlan_bridge_1q.sh \
+	vxlan_reserved.sh \
 	vxlan_symmetric_ipv6.sh \
 	vxlan_symmetric.sh
 
 TEST_FILES := devlink_lib.sh \
-	ethtool_lib.sh \
 	fib_offload_lib.sh \
 	forwarding.config.sample \
 	ip6gre_lib.sh \
@@ -131,6 +127,7 @@ TEST_FILES := devlink_lib.sh \
 	tc_common.sh
 
 TEST_INCLUDES := \
-	../lib.sh
+	../lib.sh \
+	$(wildcard ../lib/sh/*.sh)
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README
index b8a2af8fcfb7..7b41cff993ad 100644
--- a/tools/testing/selftests/net/forwarding/README
+++ b/tools/testing/selftests/net/forwarding/README
@@ -6,7 +6,7 @@ to easily create and test complex environments.
 
 Unfortunately, these namespaces can not be used with actual switching
 ASICs, as their ports can not be migrated to other network namespaces
-(NETIF_F_NETNS_LOCAL) and most of them probably do not support the
+(dev->netns_immutable) and most of them probably do not support the
 L1-separation provided by namespaces.
 
 However, a similar kind of flexibility can be achieved by using VRFs and
@@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies.
 o Code shall be checked using ShellCheck [1] prior to submission.
 
 1. https://www.shellcheck.net/
+
+Customization
+=============
+
+The forwarding selftests framework uses a number of variables that
+influence its behavior and tools it invokes, and how it invokes them, in
+various ways. A number of these variables can be overridden. The way these
+overridable variables are specified is typically one of the following two
+syntaxes:
+
+	: "${VARIABLE:=default_value}"
+	VARIABLE=${VARIABLE:=default_value}
+
+Any of these variables can be overridden. Notably net/forwarding/lib.sh and
+net/lib.sh contain a number of overridable variables.
+
+One way of overriding these variables is through the environment:
+
+	PAUSE_ON_FAIL=yes ./some_test.sh
+
+The variable NETIFS is special. Since it is an array variable, there is no
+way to pass it through the environment. Its value can instead be given as
+consecutive arguments to the selftest:
+
+	./some_test.sh swp{1..8}
+
+A way to customize variables in a persistent fashion is to create a file
+named forwarding.config in this directory. lib.sh sources the file if
+present, so it can contain any shell code. Typically it will contain
+assignments of variables whose value should be overridden.
+
+forwarding.config.sample is available in the directory as an example of
+how forwarding.config might look.
diff --git a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
index 0760a34b7114..a21b7085da2e 100755
--- a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
@@ -178,6 +178,22 @@ fdb_del()
 	check_err $? "Failed to remove a FDB entry of type ${type}"
 }
 
+check_fdb_n_learned_support()
+{
+	if ! ip link help bridge 2>&1 | grep -q "fdb_max_learned"; then
+		echo "SKIP: iproute2 too old, missing bridge max learned support"
+		exit $ksft_skip
+	fi
+
+	ip link add dev br0 type bridge
+	local learned=$(fdb_get_n_learned)
+	ip link del dev br0
+	if [ "$learned" == "null" ]; then
+		echo "SKIP: kernel too old; bridge fdb_n_learned feature not supported."
+		exit $ksft_skip
+	fi
+}
+
 check_accounting_one_type()
 {
 	local type=$1 is_counted=$2 overrides_learned=$3
@@ -274,6 +290,8 @@ check_limit()
 	done
 }
 
+check_fdb_n_learned_support
+
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh
index 2aa66d2a1702..d4e7dd659354 100755
--- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh
@@ -1,10 +1,24 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="v2reportleave_test v3include_test v3inc_allow_test v3inc_is_include_test \
-	   v3inc_is_exclude_test v3inc_to_exclude_test v3exc_allow_test v3exc_is_include_test \
-	   v3exc_is_exclude_test v3exc_to_exclude_test v3inc_block_test v3exc_block_test \
-	   v3exc_timeout_test v3star_ex_auto_add_test"
+ALL_TESTS="
+	v2reportleave_test
+	v3include_test
+	v3inc_allow_test
+	v3inc_is_include_test
+	v3inc_is_exclude_test
+	v3inc_to_exclude_test
+	v3exc_allow_test
+	v3exc_is_include_test
+	v3exc_is_exclude_test
+	v3exc_to_exclude_test
+	v3inc_block_test
+	v3exc_block_test
+	v3exc_timeout_test
+	v3star_ex_auto_add_test
+	v2per_vlan_snooping_port_stp_test
+	v2per_vlan_snooping_vlan_stp_test
+"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -478,10 +492,10 @@ v3exc_timeout_test()
 	RET=0
 	local X=("192.0.2.20" "192.0.2.30")
 
-	# GMI should be 3 seconds
+	# GMI should be 5 seconds
 	ip link set dev br0 type bridge mcast_query_interval 100 \
 					mcast_query_response_interval 100 \
-					mcast_membership_interval 300
+					mcast_membership_interval 500
 
 	v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP
 	ip link set dev br0 type bridge mcast_query_interval 500 \
@@ -489,7 +503,7 @@ v3exc_timeout_test()
 					mcast_membership_interval 1500
 
 	$MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q
-	sleep 3
+	sleep 5
 	bridge -j -d -s mdb show dev br0 \
 		| jq -e ".[].mdb[] | \
 			 select(.grp == \"$TEST_GROUP\" and \
@@ -554,6 +568,64 @@ v3star_ex_auto_add_test()
 	v3cleanup $swp2 $TEST_GROUP
 }
 
+v2per_vlan_snooping_stp_test()
+{
+	local is_port=$1
+
+	local msg="port"
+	[[ $is_port -ne 1 ]] && msg="vlan"
+
+	ip link set br0 up type bridge vlan_filtering 1 \
+					mcast_igmp_version 2 \
+					mcast_snooping 1 \
+					mcast_vlan_snooping 1 \
+					mcast_querier 1 \
+					mcast_stats_enabled 1
+	bridge vlan global set vid 1 dev br0 \
+					mcast_snooping 1 \
+					mcast_querier 1 \
+					mcast_query_interval 100 \
+					mcast_startup_query_count 0
+	[[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0
+	[[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4
+	sleep 5
+	local tx_s=$(ip -j -p stats show dev $swp1 \
+			group xstats_slave subgroup bridge suite mcast \
+			| jq '.[]["multicast"]["igmp_queries"]["tx_v2"]')
+
+	[[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3
+	[[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3
+	sleep 5
+	local tx_e=$(ip -j -p stats show dev $swp1 \
+			group xstats_slave subgroup bridge suite mcast \
+			| jq '.[]["multicast"]["igmp_queries"]["tx_v2"]')
+
+	RET=0
+	local tx=$(expr $tx_e - $tx_s)
+	test $tx -gt 0
+	check_err $? "No IGMP queries after STP state becomes forwarding"
+	log_test "per vlan snooping with $msg stp state change"
+
+	# restore settings
+	bridge vlan global set vid 1 dev br0 \
+					mcast_querier 0 \
+					mcast_query_interval 12500 \
+					mcast_startup_query_count 2
+	ip link set br0 up type bridge vlan_filtering 0 \
+					mcast_vlan_snooping 0 \
+					mcast_stats_enabled 0
+}
+
+v2per_vlan_snooping_port_stp_test()
+{
+	v2per_vlan_snooping_stp_test 1
+}
+
+v2per_vlan_snooping_vlan_stp_test()
+{
+	v2per_vlan_snooping_stp_test 0
+}
+
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
index d9d587454d20..8c1597ebc2d3 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
@@ -149,7 +149,7 @@ cfg_test_host_common()
 	check_err $? "Failed to add $name host entry"
 
 	bridge mdb replace dev br0 port br0 grp $grp $state vid 10 &> /dev/null
-	check_fail $? "Managed to replace $name host entry"
+	check_err $? "Failed to replace $name host entry"
 
 	bridge mdb del dev br0 port br0 grp $grp $state vid 10
 	bridge mdb get dev br0 grp $grp vid 10 &> /dev/null
diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh
index e2b9ff773c6b..4cacef5a813a 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mld.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh
@@ -1,10 +1,23 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="mldv2include_test mldv2inc_allow_test mldv2inc_is_include_test mldv2inc_is_exclude_test \
-	   mldv2inc_to_exclude_test mldv2exc_allow_test mldv2exc_is_include_test \
-	   mldv2exc_is_exclude_test mldv2exc_to_exclude_test mldv2inc_block_test \
-	   mldv2exc_block_test mldv2exc_timeout_test mldv2star_ex_auto_add_test"
+ALL_TESTS="
+	mldv2include_test
+	mldv2inc_allow_test
+	mldv2inc_is_include_test
+	mldv2inc_is_exclude_test
+	mldv2inc_to_exclude_test
+	mldv2exc_allow_test
+	mldv2exc_is_include_test
+	mldv2exc_is_exclude_test
+	mldv2exc_to_exclude_test
+	mldv2inc_block_test
+	mldv2exc_block_test
+	mldv2exc_timeout_test
+	mldv2star_ex_auto_add_test
+	mldv2per_vlan_snooping_port_stp_test
+	mldv2per_vlan_snooping_vlan_stp_test
+"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="ff02::cc"
@@ -478,10 +491,10 @@ mldv2exc_timeout_test()
 	RET=0
 	local X=("2001:db8:1::20" "2001:db8:1::30")
 
-	# GMI should be 3 seconds
+	# GMI should be 5 seconds
 	ip link set dev br0 type bridge mcast_query_interval 100 \
 					mcast_query_response_interval 100 \
-					mcast_membership_interval 300
+					mcast_membership_interval 500
 
 	mldv2exclude_prepare $h1
 	ip link set dev br0 type bridge mcast_query_interval 500 \
@@ -489,7 +502,7 @@ mldv2exc_timeout_test()
 					mcast_membership_interval 1500
 
 	$MZ $h1 -c 1 $MZPKT_ALLOW2 -q
-	sleep 3
+	sleep 5
 	bridge -j -d -s mdb show dev br0 \
 		| jq -e ".[].mdb[] | \
 			 select(.grp == \"$TEST_GROUP\" and \
@@ -554,6 +567,66 @@ mldv2star_ex_auto_add_test()
 	mldv2cleanup $swp2
 }
 
+mldv2per_vlan_snooping_stp_test()
+{
+	local is_port=$1
+
+	local msg="port"
+	[[ $is_port -ne 1 ]] && msg="vlan"
+
+	ip link set br0 up type bridge vlan_filtering 1 \
+					mcast_mld_version 2 \
+					mcast_snooping 1 \
+					mcast_vlan_snooping 1 \
+					mcast_querier 1 \
+					mcast_stats_enabled 1
+	bridge vlan global set vid 1 dev br0 \
+					mcast_mld_version 2 \
+					mcast_snooping 1 \
+					mcast_querier 1 \
+					mcast_query_interval 100 \
+					mcast_startup_query_count 0
+
+	[[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0
+	[[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4
+	sleep 5
+	local tx_s=$(ip -j -p stats show dev $swp1 \
+			group xstats_slave subgroup bridge suite mcast \
+			| jq '.[]["multicast"]["mld_queries"]["tx_v2"]')
+	[[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3
+	[[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3
+	sleep 5
+	local tx_e=$(ip -j -p stats show dev $swp1 \
+			group xstats_slave subgroup bridge suite mcast \
+			| jq '.[]["multicast"]["mld_queries"]["tx_v2"]')
+
+	RET=0
+	local tx=$(expr $tx_e - $tx_s)
+	test $tx -gt 0
+	check_err $? "No MLD queries after STP state becomes forwarding"
+	log_test "per vlan snooping with $msg stp state change"
+
+	# restore settings
+	bridge vlan global set vid 1 dev br0 \
+					mcast_querier 0 \
+					mcast_query_interval 12500 \
+					mcast_startup_query_count 2 \
+					mcast_mld_version 1
+	ip link set br0 up type bridge vlan_filtering 0 \
+					mcast_vlan_snooping 0 \
+					mcast_stats_enabled 0
+}
+
+mldv2per_vlan_snooping_port_stp_test()
+{
+	mldv2per_vlan_snooping_stp_test 1
+}
+
+mldv2per_vlan_snooping_vlan_stp_test()
+{
+	mldv2per_vlan_snooping_stp_test 0
+}
+
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
index 64bd00fe9a4f..e59fba366a0a 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn"
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid 8021p drop_untagged"
 NUM_NETIFS=4
 CHECK_TC="yes"
 source lib.sh
@@ -142,6 +142,152 @@ extern_learn()
 	bridge fdb del de:ad:be:ef:13:37 dev $swp1 master vlan 1 &> /dev/null
 }
 
+other_tpid()
+{
+	local mac=de:ad:be:ef:13:37
+
+	# Test that packets with TPID 802.1ad VID 3 + TPID 802.1Q VID 5 are
+	# classified as untagged by a bridge with vlan_protocol 802.1Q, and
+	# are processed in the PVID of the ingress port (here 1). Not VID 3,
+	# and not VID 5.
+	RET=0
+
+	tc qdisc add dev $h2 clsact
+	tc filter add dev $h2 ingress protocol all pref 1 handle 101 \
+		flower dst_mac $mac action drop
+	ip link set $h2 promisc on
+	ethtool -K $h2 rx-vlan-filter off rx-vlan-stag-filter off
+
+	$MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa"
+	sleep 1
+
+	# Match on 'self' addresses as well, for those drivers which
+	# do not push their learned addresses to the bridge software
+	# database
+	bridge -j fdb show $swp1 | \
+		jq -e ".[] | select(.mac == \"$(mac_get $h1)\") | select(.vlan == 1)" &> /dev/null
+	check_err $? "FDB entry was not learned when it should"
+
+	log_test "FDB entry in PVID for VLAN-tagged with other TPID"
+
+	RET=0
+	tc -j -s filter show dev $h2 ingress \
+		| jq -e ".[] | select(.options.handle == 101) \
+		| select(.options.actions[0].stats.packets == 1)" &> /dev/null
+	check_err $? "Packet was not forwarded when it should"
+	log_test "Reception of VLAN with other TPID as untagged"
+
+	bridge vlan del dev $swp1 vid 1
+
+	$MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa"
+	sleep 1
+
+	RET=0
+	tc -j -s filter show dev $h2 ingress \
+		| jq -e ".[] | select(.options.handle == 101) \
+		| select(.options.actions[0].stats.packets == 1)" &> /dev/null
+	check_err $? "Packet was forwarded when should not"
+	log_test "Reception of VLAN with other TPID as untagged (no PVID)"
+
+	bridge vlan add dev $swp1 vid 1 pvid untagged
+	ip link set $h2 promisc off
+	tc qdisc del dev $h2 clsact
+}
+
+8021p_do()
+{
+	local should_fail=$1; shift
+	local mac=de:ad:be:ef:13:37
+
+	tc filter add dev $h2 ingress protocol all pref 1 handle 101 \
+		flower dst_mac $mac action drop
+
+	$MZ -q $h1 -c 1 -b $mac -a own "81:00 00:00 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa"
+	sleep 1
+
+	tc -j -s filter show dev $h2 ingress \
+		| jq -e ".[] | select(.options.handle == 101) \
+		| select(.options.actions[0].stats.packets == 1)" &> /dev/null
+	check_err_fail $should_fail $? "802.1p-tagged reception"
+
+	tc filter del dev $h2 ingress pref 1
+}
+
+8021p()
+{
+	RET=0
+
+	tc qdisc add dev $h2 clsact
+	ip link set $h2 promisc on
+
+	# Test that with the default_pvid, 1, packets tagged with VID 0 are
+	# accepted.
+	8021p_do 0
+
+	# Test that packets tagged with VID 0 are still accepted after changing
+	# the default_pvid.
+	ip link set br0 type bridge vlan_default_pvid 10
+	8021p_do 0
+
+	log_test "Reception of 802.1p-tagged traffic"
+
+	ip link set $h2 promisc off
+	tc qdisc del dev $h2 clsact
+}
+
+send_untagged_and_8021p()
+{
+	ping_do $h1 192.0.2.2
+	check_fail $?
+
+	8021p_do 1
+}
+
+drop_untagged()
+{
+	RET=0
+
+	tc qdisc add dev $h2 clsact
+	ip link set $h2 promisc on
+
+	# Test that with no PVID, untagged and 802.1p-tagged traffic is
+	# dropped.
+	ip link set br0 type bridge vlan_default_pvid 1
+
+	# First we reconfigure the default_pvid, 1, as a non-PVID VLAN.
+	bridge vlan add dev $swp1 vid 1 untagged
+	send_untagged_and_8021p
+	bridge vlan add dev $swp1 vid 1 pvid untagged
+
+	# Next we try to delete VID 1 altogether
+	bridge vlan del dev $swp1 vid 1
+	send_untagged_and_8021p
+	bridge vlan add dev $swp1 vid 1 pvid untagged
+
+	# Set up the bridge without a default_pvid, then check that the 8021q
+	# module, when the bridge port goes down and then up again, does not
+	# accidentally re-enable untagged packet reception.
+	ip link set br0 type bridge vlan_default_pvid 0
+	ip link set $swp1 down
+	ip link set $swp1 up
+	setup_wait
+	send_untagged_and_8021p
+
+	# Remove swp1 as a bridge port and let it rejoin the bridge while it
+	# has no default_pvid.
+	ip link set $swp1 nomaster
+	ip link set $swp1 master br0
+	send_untagged_and_8021p
+
+	# Restore settings
+	ip link set br0 type bridge vlan_default_pvid 1
+
+	log_test "Dropping of untagged and 802.1p-tagged traffic with no PVID"
+
+	ip link set $h2 promisc off
+	tc qdisc del dev $h2 clsact
+}
+
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
index 1c8a26046589..2b5700b61ffa 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding"
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding pvid_change"
 NUM_NETIFS=4
 source lib.sh
 
@@ -77,12 +77,16 @@ cleanup()
 
 ping_ipv4()
 {
-	ping_test $h1 192.0.2.2
+	local msg=$1
+
+	ping_test $h1 192.0.2.2 "$msg"
 }
 
 ping_ipv6()
 {
-	ping6_test $h1 2001:db8:1::2
+	local msg=$1
+
+	ping6_test $h1 2001:db8:1::2 "$msg"
 }
 
 learning()
@@ -95,6 +99,21 @@ flooding()
 	flood_test $swp2 $h1 $h2
 }
 
+pvid_change()
+{
+	# Test that the changing of the VLAN-aware PVID does not affect
+	# VLAN-unaware forwarding
+	bridge vlan add vid 3 dev $swp1 pvid untagged
+
+	ping_ipv4 " with bridge port $swp1 PVID changed"
+	ping_ipv6 " with bridge port $swp1 PVID changed"
+
+	bridge vlan del vid 3 dev $swp1
+
+	ping_ipv4 " with bridge port $swp1 PVID deleted"
+	ping_ipv6 " with bridge port $swp1 PVID deleted"
+}
+
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config
index 8d7a1a004b7c..18fd69d8d937 100644
--- a/tools/testing/selftests/net/forwarding/config
+++ b/tools/testing/selftests/net/forwarding/config
@@ -1,6 +1,7 @@
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
 CONFIG_BRIDGE_VLAN_FILTERING=y
+CONFIG_BRIDGE_IGMP_SNOOPING=y
 CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_NET_VRF=m
diff --git a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh
index 1783c10215e5..7d531f7091e6 100755
--- a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh
+++ b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh
@@ -224,10 +224,10 @@ send_dst_ipv6()
 send_flowlabel()
 {
 	# Generate 16384 echo requests, each with a random flow label.
-	for _ in $(seq 1 16384); do
-		ip vrf exec v$h1 \
-			$PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1
-	done
+	ip vrf exec v$h1 sh -c \
+		"for _ in {1..16384}; do \
+			$PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1; \
+		done"
 }
 
 send_src_udp6()
diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh
index f1de525cfa55..18afa89ebbcc 100644
--- a/tools/testing/selftests/net/forwarding/devlink_lib.sh
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -122,6 +122,8 @@ devlink_reload()
 	still_pending=$(devlink resource show "$DEVLINK_DEV" | \
 			grep -c "size_new")
 	check_err $still_pending "Failed reload - There are still unset sizes"
+
+	udevadm settle
 }
 
 declare -A DEVLINK_ORIG
@@ -499,7 +501,7 @@ devlink_trap_drop_cleanup()
 	local pref=$1; shift
 	local handle=$1; shift
 
-	kill $mz_pid && wait $mz_pid &> /dev/null
+	kill_process $mz_pid
 	tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower
 }
 
diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh
deleted file mode 100755
index aa2eafb7b243..000000000000
--- a/tools/testing/selftests/net/forwarding/ethtool.sh
+++ /dev/null
@@ -1,301 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-ALL_TESTS="
-	same_speeds_autoneg_off
-	different_speeds_autoneg_off
-	combination_of_neg_on_and_off
-	advertise_subset_of_speeds
-	check_highest_speed_is_chosen
-	different_speeds_autoneg_on
-"
-NUM_NETIFS=2
-source lib.sh
-source ethtool_lib.sh
-
-h1_create()
-{
-	simple_if_init $h1 192.0.2.1/24
-}
-
-h1_destroy()
-{
-	simple_if_fini $h1 192.0.2.1/24
-}
-
-h2_create()
-{
-	simple_if_init $h2 192.0.2.2/24
-}
-
-h2_destroy()
-{
-	simple_if_fini $h2 192.0.2.2/24
-}
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	h2=${NETIFS[p2]}
-
-	h1_create
-	h2_create
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	h2_destroy
-	h1_destroy
-}
-
-same_speeds_autoneg_off()
-{
-	# Check that when each of the reported speeds is forced, the links come
-	# up and are operational.
-	local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0))
-
-	for speed in "${speeds_arr[@]}"; do
-		RET=0
-		ethtool_set $h1 speed $speed autoneg off
-		ethtool_set $h2 speed $speed autoneg off
-
-		setup_wait_dev_with_timeout $h1
-		setup_wait_dev_with_timeout $h2
-		ping_do $h1 192.0.2.2
-		check_err $? "speed $speed autoneg off"
-		log_test "force of same speed autoneg off"
-		log_info "speed = $speed"
-	done
-
-	ethtool -s $h2 autoneg on
-	ethtool -s $h1 autoneg on
-}
-
-different_speeds_autoneg_off()
-{
-	# Test that when we force different speeds, links are not up and ping
-	# fails.
-	RET=0
-
-	local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0))
-	local speed1=${speeds_arr[0]}
-	local speed2=${speeds_arr[1]}
-
-	ethtool_set $h1 speed $speed1 autoneg off
-	ethtool_set $h2 speed $speed2 autoneg off
-
-	setup_wait_dev_with_timeout $h1
-	setup_wait_dev_with_timeout $h2
-	ping_do $h1 192.0.2.2
-	check_fail $? "ping with different speeds"
-
-	log_test "force of different speeds autoneg off"
-
-	ethtool -s $h2 autoneg on
-	ethtool -s $h1 autoneg on
-}
-
-combination_of_neg_on_and_off()
-{
-	# Test that when one device is forced to a speed supported by both
-	# endpoints and the other device is configured to autoneg on, the links
-	# are up and ping passes.
-	local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
-
-	for speed in "${speeds_arr[@]}"; do
-		RET=0
-		ethtool_set $h1 speed $speed autoneg off
-
-		setup_wait_dev_with_timeout $h1
-		setup_wait_dev_with_timeout $h2
-		ping_do $h1 192.0.2.2
-		check_err $? "h1-speed=$speed autoneg off, h2 autoneg on"
-		log_test "one side with autoneg off and another with autoneg on"
-		log_info "force speed = $speed"
-	done
-
-	ethtool -s $h1 autoneg on
-}
-
-hex_speed_value_get()
-{
-	local speed=$1; shift
-
-	local shift_size=${speed_values[$speed]}
-	speed=$((0x1 << $"shift_size"))
-	printf "%#x" "$speed"
-}
-
-subset_of_common_speeds_get()
-{
-	local dev1=$1; shift
-	local dev2=$1; shift
-	local adver=$1; shift
-
-	local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver))
-	local speed_to_advertise=0
-	local speed_to_remove=${speeds_arr[0]}
-	speed_to_remove+='base'
-
-	local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver))
-
-	for speed in ${speeds_mode_arr[@]}; do
-		if [[ $speed != $speed_to_remove* ]]; then
-			speed=$(hex_speed_value_get $speed)
-			speed_to_advertise=$(($speed_to_advertise | \
-						$speed))
-		fi
-
-	done
-
-	# Convert to hex.
-	printf "%#x" "$speed_to_advertise"
-}
-
-speed_to_advertise_get()
-{
-	# The function returns the hex number that is composed by OR-ing all
-	# the modes corresponding to the provided speed.
-	local speed_without_mode=$1; shift
-	local supported_speeds=("$@"); shift
-	local speed_to_advertise=0
-
-	speed_without_mode+='base'
-
-	for speed in ${supported_speeds[@]}; do
-		if [[ $speed == $speed_without_mode* ]]; then
-			speed=$(hex_speed_value_get $speed)
-			speed_to_advertise=$(($speed_to_advertise | \
-						$speed))
-		fi
-
-	done
-
-	# Convert to hex.
-	printf "%#x" "$speed_to_advertise"
-}
-
-advertise_subset_of_speeds()
-{
-	# Test that when one device advertises a subset of speeds and another
-	# advertises a specific speed (but all modes of this speed), the links
-	# are up and ping passes.
-	RET=0
-
-	local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
-	ethtool_set $h1 advertise $speed_1_to_advertise
-
-	if [ $RET != 0 ]; then
-		log_test "advertise subset of speeds"
-		return
-	fi
-
-	local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1))
-	# Check only speeds that h1 advertised. Remove the first speed.
-	unset speeds_arr_without_mode[0]
-	local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1))
-
-	for speed_value in ${speeds_arr_without_mode[@]}; do
-		RET=0
-		local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \
-			"${speeds_arr_with_mode[@]}")
-		ethtool_set $h2 advertise $speed_2_to_advertise
-
-		setup_wait_dev_with_timeout $h1
-		setup_wait_dev_with_timeout $h2
-		ping_do $h1 192.0.2.2
-		check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)"
-
-		log_test "advertise subset of speeds"
-		log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise"
-	done
-
-	ethtool -s $h2 autoneg on
-	ethtool -s $h1 autoneg on
-}
-
-check_highest_speed_is_chosen()
-{
-	# Test that when one device advertises a subset of speeds, the other
-	# chooses the highest speed. This test checks configuration without
-	# traffic.
-	RET=0
-
-	local max_speed
-	local chosen_speed
-	local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
-
-	ethtool_set $h1 advertise $speed_to_advertise
-
-	if [ $RET != 0 ]; then
-		log_test "check highest speed"
-		return
-	fi
-
-	local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
-
-	max_speed=${speeds_arr[0]}
-	for current in ${speeds_arr[@]}; do
-		if [[ $current -gt $max_speed ]]; then
-			max_speed=$current
-		fi
-	done
-
-	setup_wait_dev_with_timeout $h1
-	setup_wait_dev_with_timeout $h2
-	chosen_speed=$(ethtool $h1 | grep 'Speed:')
-	chosen_speed=${chosen_speed%"Mb/s"*}
-	chosen_speed=${chosen_speed#*"Speed: "}
-	((chosen_speed == max_speed))
-	check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed"
-
-	log_test "check highest speed"
-
-	ethtool -s $h2 autoneg on
-	ethtool -s $h1 autoneg on
-}
-
-different_speeds_autoneg_on()
-{
-	# Test that when we configure links to advertise different speeds,
-	# links are not up and ping fails.
-	RET=0
-
-	local -a speeds=($(different_speeds_get $h1 $h2 1 1))
-	local speed1=${speeds[0]}
-	local speed2=${speeds[1]}
-
-	speed1=$(hex_speed_value_get $speed1)
-	speed2=$(hex_speed_value_get $speed2)
-
-	ethtool_set $h1 advertise $speed1
-	ethtool_set $h2 advertise $speed2
-
-	if (($RET)); then
-		setup_wait_dev_with_timeout $h1
-		setup_wait_dev_with_timeout $h2
-		ping_do $h1 192.0.2.2
-		check_fail $? "ping with different speeds autoneg on"
-	fi
-
-	log_test "advertise different speeds autoneg on"
-
-	ethtool -s $h2 autoneg on
-	ethtool -s $h1 autoneg on
-}
-
-skip_on_veth
-
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-declare -gA speed_values
-eval "speed_values=($(speeds_arr_get))"
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
deleted file mode 100755
index 17f89c3b7c02..000000000000
--- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-ALL_TESTS="
-	autoneg
-	autoneg_force_mode
-	no_cable
-"
-
-NUM_NETIFS=2
-source lib.sh
-source ethtool_lib.sh
-
-TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms
-
-setup_prepare()
-{
-	swp1=${NETIFS[p1]}
-	swp2=${NETIFS[p2]}
-	swp3=$NETIF_NO_CABLE
-}
-
-ethtool_ext_state()
-{
-	local dev=$1; shift
-	local expected_ext_state=$1; shift
-	local expected_ext_substate=${1:-""}; shift
-
-	local ext_state=$(ethtool $dev | grep "Link detected" \
-		| cut -d "(" -f2 | cut -d ")" -f1)
-	local ext_substate=$(echo $ext_state | cut -sd "," -f2 \
-		| sed -e 's/^[[:space:]]*//')
-	ext_state=$(echo $ext_state | cut -d "," -f1)
-
-	if [[ $ext_state != $expected_ext_state ]]; then
-		echo "Expected \"$expected_ext_state\", got \"$ext_state\""
-		return 1
-	fi
-	if [[ $ext_substate != $expected_ext_substate ]]; then
-		echo "Expected \"$expected_ext_substate\", got \"$ext_substate\""
-		return 1
-	fi
-}
-
-autoneg()
-{
-	local msg
-
-	RET=0
-
-	ip link set dev $swp1 up
-
-	msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \
-			"Autoneg" "No partner detected")
-	check_err $? "$msg"
-
-	log_test "Autoneg, No partner detected"
-
-	ip link set dev $swp1 down
-}
-
-autoneg_force_mode()
-{
-	local msg
-
-	RET=0
-
-	ip link set dev $swp1 up
-	ip link set dev $swp2 up
-
-	local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0))
-	local speed1=${speeds_arr[0]}
-	local speed2=${speeds_arr[1]}
-
-	ethtool_set $swp1 speed $speed1 autoneg off
-	ethtool_set $swp2 speed $speed2 autoneg off
-
-	msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \
-			"Autoneg" "No partner detected during force mode")
-	check_err $? "$msg"
-
-	msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \
-			"Autoneg" "No partner detected during force mode")
-	check_err $? "$msg"
-
-	log_test "Autoneg, No partner detected during force mode"
-
-	ethtool -s $swp2 autoneg on
-	ethtool -s $swp1 autoneg on
-
-	ip link set dev $swp2 down
-	ip link set dev $swp1 down
-}
-
-no_cable()
-{
-	local msg
-
-	RET=0
-
-	ip link set dev $swp3 up
-
-	msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable")
-	check_err $? "$msg"
-
-	log_test "No cable"
-
-	ip link set dev $swp3 down
-}
-
-skip_on_veth
-
-setup_prepare
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh
deleted file mode 100644
index b9bfb45085af..000000000000
--- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-speeds_arr_get()
-{
-	cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \
-		{sub(/,$/, "") \
-		sub(/ETHTOOL_LINK_MODE_/,"") \
-		sub(/_BIT/,"") \
-		sub(/_Full/,"/Full") \
-		sub(/_Half/,"/Half");\
-		print "["$1"]="$3}'
-
-	awk "${cmd}" /usr/include/linux/ethtool.h
-}
-
-ethtool_set()
-{
-	local cmd="$@"
-	local out=$(ethtool -s $cmd 2>&1 | wc -l)
-
-	check_err $out "error in configuration. $cmd"
-}
-
-dev_linkmodes_params_get()
-{
-	local dev=$1; shift
-	local adver=$1; shift
-	local -a linkmodes_params
-	local param_count
-	local arr
-
-	if (($adver)); then
-		mode="Advertised link modes"
-	else
-		mode="Supported link modes"
-	fi
-
-	local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver))
-	for ((i=0; i<${#dev_linkmodes[@]}; i++)); do
-		linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \
-			# Replaces all non numbers with spaces
-			sed -e 's/[^0-9]/ /g' | \
-			# Squeeze spaces in sequence to 1 space
-			tr -s ' ')
-		# Count how many numbers were found in the linkmode
-		param_count=$(echo "${linkmodes_params[$i]}" | wc -w)
-		if [[ $param_count -eq 1 ]]; then
-			linkmodes_params[$i]="${linkmodes_params[$i]} 1"
-		elif [[ $param_count -ge 3 ]]; then
-			arr=(${linkmodes_params[$i]})
-			# Take only first two params
-			linkmodes_params[$i]=$(echo "${arr[@]:0:2}")
-		fi
-	done
-	echo ${linkmodes_params[@]}
-}
-
-dev_speeds_get()
-{
-	local dev=$1; shift
-	local with_mode=$1; shift
-	local adver=$1; shift
-	local speeds_str
-
-	if (($adver)); then
-		mode="Advertised link modes"
-	else
-		mode="Supported link modes"
-	fi
-
-	speeds_str=$(ethtool "$dev" | \
-		# Snip everything before the link modes section.
-		sed -n '/'"$mode"':/,$p' | \
-		# Quit processing the rest at the start of the next section.
-		# When checking, skip the header of this section (hence the 2,).
-		sed -n '2,${/^[\t][^ \t]/q};p' | \
-		# Drop the section header of the current section.
-		cut -d':' -f2)
-
-	local -a speeds_arr=($speeds_str)
-	if [[ $with_mode -eq 0 ]]; then
-		for ((i=0; i<${#speeds_arr[@]}; i++)); do
-			speeds_arr[$i]=${speeds_arr[$i]%base*}
-		done
-	fi
-	echo ${speeds_arr[@]}
-}
-
-common_speeds_get()
-{
-	dev1=$1; shift
-	dev2=$1; shift
-	with_mode=$1; shift
-	adver=$1; shift
-
-	local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver))
-	local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver))
-
-	comm -12 \
-		<(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \
-		<(printf '%s\n' "${dev2_speeds[@]}" | sort -u)
-}
-
-different_speeds_get()
-{
-	local dev1=$1; shift
-	local dev2=$1; shift
-	local with_mode=$1; shift
-	local adver=$1; shift
-
-	local -a speeds_arr
-
-	speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver))
-	if [[ ${#speeds_arr[@]} < 2 ]]; then
-		check_err 1 "cannot check different speeds. There are not enough speeds"
-	fi
-
-	echo ${speeds_arr[0]} ${speeds_arr[1]}
-}
diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/net/forwarding/ethtool_mm.sh
deleted file mode 100755
index 50d5bfb17ef1..000000000000
--- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-ALL_TESTS="
-	manual_with_verification_h1_to_h2
-	manual_with_verification_h2_to_h1
-	manual_without_verification_h1_to_h2
-	manual_without_verification_h2_to_h1
-	manual_failed_verification_h1_to_h2
-	manual_failed_verification_h2_to_h1
-	lldp
-"
-
-NUM_NETIFS=2
-REQUIRE_MZ=no
-PREEMPTIBLE_PRIO=0
-source lib.sh
-
-traffic_test()
-{
-	local if=$1; shift
-	local src=$1; shift
-	local num_pkts=10000
-	local before=
-	local after=
-	local delta=
-
-	if [ ${has_pmac_stats[$if]} = false ]; then
-		src="aggregate"
-	fi
-
-	before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src)
-
-	$MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO
-
-	after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src)
-
-	delta=$((after - before))
-
-	# Allow an extra 1% tolerance for random packets sent by the stack
-	[ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ]
-}
-
-manual_with_verification()
-{
-	local tx=$1; shift
-	local rx=$1; shift
-
-	RET=0
-
-	# It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit
-	# Processing state diagram whether the "send_r" variable (send response
-	# to verification frame) should be taken into consideration while the
-	# MAC Merge TX direction is disabled. That being said, at least the
-	# NXP ENETC does not, and requires tx-enabled on in order to respond to
-	# the link partner's verification frames.
-	ethtool --set-mm $rx tx-enabled on
-	ethtool --set-mm $tx verify-enabled on tx-enabled on
-
-	# Wait for verification to finish
-	sleep 1
-
-	ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \
-		grep -q 'SUCCEEDED'
-	check_err "$?" "Verification did not succeed"
-
-	ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true'
-	check_err "$?" "pMAC TX is not active"
-
-	traffic_test $tx "pmac"
-	check_err "$?" "Traffic did not get sent through $tx's pMAC"
-
-	ethtool --set-mm $tx verify-enabled off tx-enabled off
-	ethtool --set-mm $rx tx-enabled off
-
-	log_test "Manual configuration with verification: $tx to $rx"
-}
-
-manual_with_verification_h1_to_h2()
-{
-	manual_with_verification $h1 $h2
-}
-
-manual_with_verification_h2_to_h1()
-{
-	manual_with_verification $h2 $h1
-}
-
-manual_without_verification()
-{
-	local tx=$1; shift
-	local rx=$1; shift
-
-	RET=0
-
-	ethtool --set-mm $tx verify-enabled off tx-enabled on
-
-	ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \
-		grep -q 'DISABLED'
-	check_err "$?" "Verification is not disabled"
-
-	ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true'
-	check_err "$?" "pMAC TX is not active"
-
-	traffic_test $tx "pmac"
-	check_err "$?" "Traffic did not get sent through $tx's pMAC"
-
-	ethtool --set-mm $tx verify-enabled off tx-enabled off
-
-	log_test "Manual configuration without verification: $tx to $rx"
-}
-
-manual_without_verification_h1_to_h2()
-{
-	manual_without_verification $h1 $h2
-}
-
-manual_without_verification_h2_to_h1()
-{
-	manual_without_verification $h2 $h1
-}
-
-manual_failed_verification()
-{
-	local tx=$1; shift
-	local rx=$1; shift
-
-	RET=0
-
-	ethtool --set-mm $rx pmac-enabled off
-	ethtool --set-mm $tx verify-enabled on tx-enabled on
-
-	# Wait for verification to time out
-	sleep 1
-
-	ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \
-		grep -q 'SUCCEEDED'
-	check_fail "$?" "Verification succeeded when it shouldn't have"
-
-	ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true'
-	check_fail "$?" "pMAC TX is active when it shouldn't have"
-
-	traffic_test $tx "emac"
-	check_err "$?" "Traffic did not get sent through $tx's eMAC"
-
-	ethtool --set-mm $tx verify-enabled off tx-enabled off
-	ethtool --set-mm $rx pmac-enabled on
-
-	log_test "Manual configuration with failed verification: $tx to $rx"
-}
-
-manual_failed_verification_h1_to_h2()
-{
-	manual_failed_verification $h1 $h2
-}
-
-manual_failed_verification_h2_to_h1()
-{
-	manual_failed_verification $h2 $h1
-}
-
-smallest_supported_add_frag_size()
-{
-	local iface=$1
-	local rx_min_frag_size=
-
-	rx_min_frag_size=$(ethtool --json --show-mm $iface | \
-		jq '.[]."rx-min-frag-size"')
-
-	if [ $rx_min_frag_size -le 60 ]; then
-		echo 0
-	elif [ $rx_min_frag_size -le 124 ]; then
-		echo 1
-	elif [ $rx_min_frag_size -le 188 ]; then
-		echo 2
-	elif [ $rx_min_frag_size -le 252 ]; then
-		echo 3
-	else
-		echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP"
-		exit 1
-	fi
-}
-
-expected_add_frag_size()
-{
-	local iface=$1
-	local requested=$2
-	local min=$(smallest_supported_add_frag_size $iface)
-
-	[ $requested -le $min ] && echo $min || echo $requested
-}
-
-lldp_change_add_frag_size()
-{
-	local add_frag_size=$1
-	local pattern=
-
-	lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null
-	# Wait for TLVs to be received
-	sleep 2
-	pattern=$(printf "Additional fragment size: %d" \
-			 $(expected_add_frag_size $h1 $add_frag_size))
-	lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern"
-}
-
-lldp()
-{
-	RET=0
-
-	systemctl start lldpad
-
-	# Configure the interfaces to receive and transmit LLDPDUs
-	lldptool -L -i $h1 adminStatus=rxtx >/dev/null
-	lldptool -L -i $h2 adminStatus=rxtx >/dev/null
-
-	# Enable the transmission of Additional Ethernet Capabilities TLV
-	lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null
-	lldptool -T -i $h2 -V addEthCaps enableTx=yes >/dev/null
-
-	# Wait for TLVs to be received
-	sleep 2
-
-	lldptool -i $h1 -t -n -V addEthCaps | \
-		grep -q "Preemption capability active"
-	check_err "$?" "$h1 pMAC TX is not active"
-
-	lldptool -i $h2 -t -n -V addEthCaps | \
-		grep -q "Preemption capability active"
-	check_err "$?" "$h2 pMAC TX is not active"
-
-	lldp_change_add_frag_size 3
-	check_err "$?" "addFragSize 3"
-
-	lldp_change_add_frag_size 2
-	check_err "$?" "addFragSize 2"
-
-	lldp_change_add_frag_size 1
-	check_err "$?" "addFragSize 1"
-
-	lldp_change_add_frag_size 0
-	check_err "$?" "addFragSize 0"
-
-	traffic_test $h1 "pmac"
-	check_err "$?" "Traffic did not get sent through $h1's pMAC"
-
-	traffic_test $h2 "pmac"
-	check_err "$?" "Traffic did not get sent through $h2's pMAC"
-
-	systemctl stop lldpad
-
-	log_test "LLDP"
-}
-
-h1_create()
-{
-	ip link set dev $h1 up
-
-	tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \
-		queues 1@0 1@1 1@2 1@3 \
-		fp P E E E \
-		hw 1
-
-	ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off
-}
-
-h2_create()
-{
-	ip link set dev $h2 up
-
-	ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off
-
-	tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \
-		queues 1@0 1@1 1@2 1@3 \
-		fp P E E E \
-		hw 1
-}
-
-h1_destroy()
-{
-	ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off
-
-	tc qdisc del dev $h1 root
-
-	ip link set dev $h1 down
-}
-
-h2_destroy()
-{
-	tc qdisc del dev $h2 root
-
-	ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off
-
-	ip link set dev $h2 down
-}
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	h2=${NETIFS[p2]}
-
-	h1_create
-	h2_create
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	h2_destroy
-	h1_destroy
-}
-
-check_ethtool_mm_support
-check_tc_fp_support
-require_command lldptool
-bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually"
-
-for netif in ${NETIFS[@]}; do
-	ethtool --show-mm $netif 2>&1 &> /dev/null
-	if [[ $? -ne 0 ]]; then
-		echo "SKIP: $netif does not support MAC Merge"
-		exit $ksft_skip
-	fi
-
-	if check_ethtool_pmac_std_stats_support $netif eth-mac; then
-		has_pmac_stats[$netif]=true
-	else
-		has_pmac_stats[$netif]=false
-		echo "$netif does not report pMAC statistics, falling back to aggregate"
-	fi
-done
-
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh b/tools/testing/selftests/net/forwarding/ethtool_rmon.sh
deleted file mode 100755
index 41a34a61f763..000000000000
--- a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-ALL_TESTS="
-	rmon_rx_histogram
-	rmon_tx_histogram
-"
-
-NUM_NETIFS=2
-source lib.sh
-
-ETH_FCS_LEN=4
-ETH_HLEN=$((6+6+2))
-
-declare -A netif_mtu
-
-ensure_mtu()
-{
-	local iface=$1; shift
-	local len=$1; shift
-	local current=$(ip -j link show dev $iface | jq -r '.[0].mtu')
-	local required=$((len - ETH_HLEN - ETH_FCS_LEN))
-
-	if [ $current -lt $required ]; then
-		ip link set dev $iface mtu $required || return 1
-	fi
-}
-
-bucket_test()
-{
-	local iface=$1; shift
-	local neigh=$1; shift
-	local set=$1; shift
-	local bucket=$1; shift
-	local len=$1; shift
-	local num_rx=10000
-	local num_tx=20000
-	local expected=
-	local before=
-	local after=
-	local delta=
-
-	# Mausezahn does not include FCS bytes in its length - but the
-	# histogram counters do
-	len=$((len - ETH_FCS_LEN))
-
-	before=$(ethtool --json -S $iface --groups rmon | \
-		jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val")
-
-	# Send 10k one way and 20k in the other, to detect counters
-	# mapped to the wrong direction
-	$MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us
-	$MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us
-
-	after=$(ethtool --json -S $iface --groups rmon | \
-		jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val")
-
-	delta=$((after - before))
-
-	expected=$([ $set = rx ] && echo $num_rx || echo $num_tx)
-
-	# Allow some extra tolerance for other packets sent by the stack
-	[ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ]
-}
-
-rmon_histogram()
-{
-	local iface=$1; shift
-	local neigh=$1; shift
-	local set=$1; shift
-	local nbuckets=0
-	local step=
-
-	RET=0
-
-	while read -r -a bucket; do
-		step="$set-pkts${bucket[0]}to${bucket[1]} on $iface"
-
-		for if in $iface $neigh; do
-			if ! ensure_mtu $if ${bucket[0]}; then
-				log_test_skip "$if does not support the required MTU for $step"
-				return
-			fi
-		done
-
-		if ! bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then
-			check_err 1 "$step failed"
-			return 1
-		fi
-		log_test "$step"
-		nbuckets=$((nbuckets + 1))
-	done < <(ethtool --json -S $iface --groups rmon | \
-		jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null)
-
-	if [ $nbuckets -eq 0 ]; then
-		log_test_skip "$iface does not support $set histogram counters"
-		return
-	fi
-}
-
-rmon_rx_histogram()
-{
-	rmon_histogram $h1 $h2 rx
-	rmon_histogram $h2 $h1 rx
-}
-
-rmon_tx_histogram()
-{
-	rmon_histogram $h1 $h2 tx
-	rmon_histogram $h2 $h1 tx
-}
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	h2=${NETIFS[p2]}
-
-	for iface in $h1 $h2; do
-		netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu')
-		ip link set dev $iface up
-	done
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	for iface in $h2 $h1; do
-		ip link set dev $iface \
-			mtu ${netif_mtu[$iface]} \
-			down
-	done
-}
-
-check_ethtool_counter_group_support
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample
index 1fc4f0242fc5..f1ca95e79a65 100644
--- a/tools/testing/selftests/net/forwarding/forwarding.config.sample
+++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample
@@ -3,51 +3,28 @@
 
 ##############################################################################
 # Topology description. p1 looped back to p2, p3 to p4 and so on.
-declare -A NETIFS
 
-NETIFS[p1]=veth0
-NETIFS[p2]=veth1
-NETIFS[p3]=veth2
-NETIFS[p4]=veth3
-NETIFS[p5]=veth4
-NETIFS[p6]=veth5
-NETIFS[p7]=veth6
-NETIFS[p8]=veth7
-NETIFS[p9]=veth8
-NETIFS[p10]=veth9
+NETIFS=(
+	[p1]=veth0
+	[p2]=veth1
+	[p3]=veth2
+	[p4]=veth3
+	[p5]=veth4
+	[p6]=veth5
+	[p7]=veth6
+	[p8]=veth7
+	[p9]=veth8
+	[p10]=veth9
+)
 
 # Port that does not have a cable connected.
 NETIF_NO_CABLE=eth8
 
 ##############################################################################
-# Defines
+# In addition to the topology-related variables, it is also possible to override
+# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other
+# libraries or selftests use. E.g.:
 
-# IPv4 ping utility name
-PING=ping
-# IPv6 ping utility name. Some distributions use 'ping' for IPv6.
 PING6=ping6
-# Packet generator. Some distributions use 'mz'.
 MZ=mausezahn
-# mausezahn delay between transmissions in microseconds.
-MZ_DELAY=0
-# Time to wait after interfaces participating in the test are all UP
 WAIT_TIME=5
-# Whether to pause on failure or not.
-PAUSE_ON_FAIL=no
-# Whether to pause on cleanup or not.
-PAUSE_ON_CLEANUP=no
-# Type of network interface to create
-NETIF_TYPE=veth
-# Whether to create virtual interfaces (veth) or not
-NETIF_CREATE=yes
-# Timeout (in seconds) before ping exits regardless of how many packets have
-# been sent or received
-PING_TIMEOUT=5
-# Minimum ageing_time (in centiseconds) supported by hardware
-LOW_AGEING_TIME=1000
-# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process
-# filter by software/hardware
-TC_FLAG=skip_hw
-# IPv6 traceroute utility name.
-TROUTE6=traceroute6
-
diff --git a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh
index 9788bd0f6e8b..dda11a4a9450 100755
--- a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh
+++ b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh
@@ -319,10 +319,10 @@ send_dst_ipv6()
 send_flowlabel()
 {
 	# Generate 16384 echo requests, each with a random flow label.
-	for _ in $(seq 1 16384); do
-		ip vrf exec v$h1 \
-			$PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1
-	done
+	ip vrf exec v$h1 sh -c \
+		"for _ in {1..16384}; do \
+			$PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \
+		done"
 }
 
 send_src_udp6()
diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3.sh
deleted file mode 100755
index 48584a51388f..000000000000
--- a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# +--------------------+                     +----------------------+
-# | H1                 |                     |                   H2 |
-# |                    |                     |                      |
-# |          $h1.200 + |                     | + $h2.200            |
-# |     192.0.2.1/28 | |                     | | 192.0.2.18/28      |
-# | 2001:db8:1::1/64 | |                     | | 2001:db8:2::1/64   |
-# |                  | |                     | |                    |
-# |              $h1 + |                     | + $h2                |
-# |                  | |                     | |                    |
-# +------------------|-+                     +-|--------------------+
-#                    |                         |
-# +------------------|-------------------------|--------------------+
-# | SW               |                         |                    |
-# |                  |                         |                    |
-# |             $rp1 +                         + $rp2               |
-# |                  |                         |                    |
-# |         $rp1.200 +                         + $rp2.200           |
-# |     192.0.2.2/28                             192.0.2.17/28      |
-# | 2001:db8:1::2/64                             2001:db8:2::2/64   |
-# |                                                                 |
-# +-----------------------------------------------------------------+
-
-ALL_TESTS="
-	ping_ipv4
-	ping_ipv6
-	test_stats_rx_ipv4
-	test_stats_tx_ipv4
-	test_stats_rx_ipv6
-	test_stats_tx_ipv6
-	respin_enablement
-	test_stats_rx_ipv4
-	test_stats_tx_ipv4
-	test_stats_rx_ipv6
-	test_stats_tx_ipv6
-	reapply_config
-	ping_ipv4
-	ping_ipv6
-	test_stats_rx_ipv4
-	test_stats_tx_ipv4
-	test_stats_rx_ipv6
-	test_stats_tx_ipv6
-	test_stats_report_rx
-	test_stats_report_tx
-	test_destroy_enabled
-	test_double_enable
-"
-NUM_NETIFS=4
-source lib.sh
-
-h1_create()
-{
-	simple_if_init $h1
-	vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64
-	ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2
-	ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2
-}
-
-h1_destroy()
-{
-	ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2
-	ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2
-	vlan_destroy $h1 200
-	simple_if_fini $h1
-}
-
-h2_create()
-{
-	simple_if_init $h2
-	vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64
-	ip route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17
-	ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2
-}
-
-h2_destroy()
-{
-	ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2
-	ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17
-	vlan_destroy $h2 200
-	simple_if_fini $h2
-}
-
-router_rp1_200_create()
-{
-	ip link add name $rp1.200 link $rp1 type vlan id 200
-	ip link set dev $rp1.200 addrgenmode eui64
-	ip link set dev $rp1.200 up
-	ip address add dev $rp1.200 192.0.2.2/28
-	ip address add dev $rp1.200 2001:db8:1::2/64
-	ip stats set dev $rp1.200 l3_stats on
-}
-
-router_rp1_200_destroy()
-{
-	ip stats set dev $rp1.200 l3_stats off
-	ip address del dev $rp1.200 2001:db8:1::2/64
-	ip address del dev $rp1.200 192.0.2.2/28
-	ip link del dev $rp1.200
-}
-
-router_create()
-{
-	ip link set dev $rp1 up
-	router_rp1_200_create
-
-	ip link set dev $rp2 up
-	vlan_create $rp2 200 "" 192.0.2.17/28 2001:db8:2::2/64
-}
-
-router_destroy()
-{
-	vlan_destroy $rp2 200
-	ip link set dev $rp2 down
-
-	router_rp1_200_destroy
-	ip link set dev $rp1 down
-}
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	rp1=${NETIFS[p2]}
-
-	rp2=${NETIFS[p3]}
-	h2=${NETIFS[p4]}
-
-	rp1mac=$(mac_get $rp1)
-	rp2mac=$(mac_get $rp2)
-
-	vrf_prepare
-
-	h1_create
-	h2_create
-
-	router_create
-
-	forwarding_enable
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	forwarding_restore
-
-	router_destroy
-
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
-ping_ipv4()
-{
-	ping_test $h1.200 192.0.2.18 " IPv4"
-}
-
-ping_ipv6()
-{
-	ping_test $h1.200 2001:db8:2::1 " IPv6"
-}
-
-send_packets_rx_ipv4()
-{
-	# Send 21 packets instead of 20, because the first one might trap and go
-	# through the SW datapath, which might not bump the HW counter.
-	$MZ $h1.200 -c 21 -d 20msec -p 100 \
-	    -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \
-	    -q -t udp sp=54321,dp=12345
-}
-
-send_packets_rx_ipv6()
-{
-	$MZ $h1.200 -6 -c 21 -d 20msec -p 100 \
-	    -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \
-	    -q -t udp sp=54321,dp=12345
-}
-
-send_packets_tx_ipv4()
-{
-	$MZ $h2.200 -c 21 -d 20msec -p 100 \
-	    -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \
-	    -q -t udp sp=54321,dp=12345
-}
-
-send_packets_tx_ipv6()
-{
-	$MZ $h2.200 -6 -c 21 -d 20msec -p 100 \
-	    -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \
-	    -q -t udp sp=54321,dp=12345
-}
-
-___test_stats()
-{
-	local dir=$1; shift
-	local prot=$1; shift
-
-	local a
-	local b
-
-	a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets)
-	send_packets_${dir}_${prot}
-	"$@"
-	b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \
-		       hw_stats_get l3_stats $rp1.200 ${dir} packets)
-	check_err $? "Traffic not reflected in the counter: $a -> $b"
-}
-
-__test_stats()
-{
-	local dir=$1; shift
-	local prot=$1; shift
-
-	RET=0
-	___test_stats "$dir" "$prot"
-	log_test "Test $dir packets: $prot"
-}
-
-test_stats_rx_ipv4()
-{
-	__test_stats rx ipv4
-}
-
-test_stats_tx_ipv4()
-{
-	__test_stats tx ipv4
-}
-
-test_stats_rx_ipv6()
-{
-	__test_stats rx ipv6
-}
-
-test_stats_tx_ipv6()
-{
-	__test_stats tx ipv6
-}
-
-# Make sure everything works well even after stats have been disabled and
-# reenabled on the same device without touching the L3 configuration.
-respin_enablement()
-{
-	log_info "Turning stats off and on again"
-	ip stats set dev $rp1.200 l3_stats off
-	ip stats set dev $rp1.200 l3_stats on
-}
-
-# For the initial run, l3_stats is enabled on a completely set up netdevice. Now
-# do it the other way around: enabling the L3 stats on an L2 netdevice, and only
-# then apply the L3 configuration.
-reapply_config()
-{
-	log_info "Reapplying configuration"
-
-	router_rp1_200_destroy
-
-	ip link add name $rp1.200 link $rp1 type vlan id 200
-	ip link set dev $rp1.200 addrgenmode none
-	ip stats set dev $rp1.200 l3_stats on
-	ip link set dev $rp1.200 addrgenmode eui64
-	ip link set dev $rp1.200 up
-	ip address add dev $rp1.200 192.0.2.2/28
-	ip address add dev $rp1.200 2001:db8:1::2/64
-}
-
-__test_stats_report()
-{
-	local dir=$1; shift
-	local prot=$1; shift
-
-	local a
-	local b
-
-	RET=0
-
-	a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets)
-	send_packets_${dir}_${prot}
-	ip address flush dev $rp1.200
-	b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \
-		       hw_stats_get l3_stats $rp1.200 ${dir} packets)
-	check_err $? "Traffic not reflected in the counter: $a -> $b"
-	log_test "Test ${dir} packets: stats pushed on loss of L3"
-
-	ip stats set dev $rp1.200 l3_stats off
-	ip link del dev $rp1.200
-	router_rp1_200_create
-}
-
-test_stats_report_rx()
-{
-	__test_stats_report rx ipv4
-}
-
-test_stats_report_tx()
-{
-	__test_stats_report tx ipv4
-}
-
-test_destroy_enabled()
-{
-	RET=0
-
-	ip link del dev $rp1.200
-	router_rp1_200_create
-
-	log_test "Destroy l3_stats-enabled netdev"
-}
-
-test_double_enable()
-{
-	RET=0
-	___test_stats rx ipv4 \
-		ip stats set dev $rp1.200 l3_stats on
-	log_test "Test stat retention across a spurious enablement"
-}
-
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info |
-	   jq '.[].info.l3_stats.used')
-kind=$(ip -j -d link show dev $rp1 |
-	   jq -r '.[].linkinfo.info_kind')
-if [[ $used != true ]]; then
-	if [[ $kind == veth ]]; then
-		log_test_skip "l3_stats not offloaded on veth interface"
-		EXIT_STATUS=$ksft_skip
-	else
-		RET=1 log_test "l3_stats not offloaded"
-	fi
-else
-	tests_run
-fi
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh
deleted file mode 100755
index 7594bbb49029..000000000000
--- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Test L3 stats on IP-in-IP GRE tunnel without key.
-
-# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more
-# details.
-
-ALL_TESTS="
-	ping_ipv4
-	test_stats_rx
-	test_stats_tx
-"
-NUM_NETIFS=6
-source lib.sh
-source ipip_lib.sh
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	ol1=${NETIFS[p2]}
-
-	ul1=${NETIFS[p3]}
-	ul2=${NETIFS[p4]}
-
-	ol2=${NETIFS[p5]}
-	h2=${NETIFS[p6]}
-
-	ol1mac=$(mac_get $ol1)
-
-	forwarding_enable
-	vrf_prepare
-	h1_create
-	h2_create
-	sw1_flat_create gre $ol1 $ul1
-	sw2_flat_create gre $ol2 $ul2
-	ip stats set dev g1a l3_stats on
-	ip stats set dev g2a l3_stats on
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	ip stats set dev g1a l3_stats off
-	ip stats set dev g2a l3_stats off
-
-	sw2_flat_destroy $ol2 $ul2
-	sw1_flat_destroy $ol1 $ul1
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-	forwarding_restore
-}
-
-ping_ipv4()
-{
-	RET=0
-
-	ping_test $h1 192.0.2.18 " gre flat"
-}
-
-send_packets_ipv4()
-{
-	# Send 21 packets instead of 20, because the first one might trap and go
-	# through the SW datapath, which might not bump the HW counter.
-	$MZ $h1 -c 21 -d 20msec -p 100 \
-	    -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \
-	    -q -t udp sp=54321,dp=12345
-}
-
-test_stats()
-{
-	local dev=$1; shift
-	local dir=$1; shift
-
-	local a
-	local b
-
-	RET=0
-
-	a=$(hw_stats_get l3_stats $dev $dir packets)
-	send_packets_ipv4
-	b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \
-		     hw_stats_get l3_stats $dev $dir packets)
-	check_err $? "Traffic not reflected in the counter: $a -> $b"
-
-	log_test "Test $dir packets: $prot"
-}
-
-test_stats_tx()
-{
-	test_stats g1a tx
-}
-
-test_stats_rx()
-{
-	test_stats g2a rx
-}
-
-skip_on_veth
-
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh
index 2ab9eaaa5532..e28b4a079e52 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh
@@ -321,10 +321,10 @@ send_dst_ipv6()
 send_flowlabel()
 {
 	# Generate 16384 echo requests, each with a random flow label.
-	for _ in $(seq 1 16384); do
-		ip vrf exec v$h1 \
-			$PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1
-	done
+	ip vrf exec v$h1 sh -c \
+		"for _ in {1..16384}; do \
+			$PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \
+		done"
 }
 
 send_src_udp6()
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_flat.sh b/tools/testing/selftests/net/forwarding/ip6gre_flat.sh
index 96c97064f2d3..becc7c3fc809 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_flat.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_flat.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_flat
 	gre_mtu_change
+	gre_flat_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change
 }
 
+gre_flat_remote_change()
+{
+	flat_remote_change
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 (new remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 (new remote)"
+
+	flat_remote_restore
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 (old remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_flat_key.sh b/tools/testing/selftests/net/forwarding/ip6gre_flat_key.sh
index ff9fb0db9bd1..e5335116a2fd 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_flat_key.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_flat_key.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_flat
 	gre_mtu_change
+	gre_flat_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change
 }
 
+gre_flat_remote_change()
+{
+	flat_remote_change
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 with key (new remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 with key (new remote)"
+
+	flat_remote_restore
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 with key (old remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 with key (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_flat_keys.sh b/tools/testing/selftests/net/forwarding/ip6gre_flat_keys.sh
index 12c138785242..7e0cbfdefab0 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_flat_keys.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_flat_keys.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_flat
 	gre_mtu_change
+	gre_flat_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change	gre
 }
 
+gre_flat_remote_change()
+{
+	flat_remote_change
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 with ikey/okey (new remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 with ikey/okey (new remote)"
+
+	flat_remote_restore
+
+	test_traffic_ip4ip6 "GRE flat IPv4-in-IPv6 with ikey/okey (old remote)"
+	test_traffic_ip6ip6 "GRE flat IPv6-in-IPv6 with ikey/okey (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_hier.sh b/tools/testing/selftests/net/forwarding/ip6gre_hier.sh
index 83b55c30a5c3..e0844495f3d1 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_hier.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_hier.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_hier
 	gre_mtu_change
+	gre_hier_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change gre
 }
 
+gre_hier_remote_change()
+{
+	hier_remote_change
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 (new remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 (new remote)"
+
+	hier_remote_restore
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 (old remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_hier_key.sh b/tools/testing/selftests/net/forwarding/ip6gre_hier_key.sh
index 256607916d92..741bc9c928eb 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_hier_key.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_hier_key.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_hier
 	gre_mtu_change
+	gre_hier_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change gre
 }
 
+gre_hier_remote_change()
+{
+	hier_remote_change
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 with key (new remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 with key (new remote)"
+
+	hier_remote_restore
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 with key (old remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 with key (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_hier_keys.sh b/tools/testing/selftests/net/forwarding/ip6gre_hier_keys.sh
index ad1bcd6334a8..ad9eab4b1367 100755
--- a/tools/testing/selftests/net/forwarding/ip6gre_hier_keys.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_hier_keys.sh
@@ -8,6 +8,7 @@
 ALL_TESTS="
 	gre_hier
 	gre_mtu_change
+	gre_hier_remote_change
 "
 
 NUM_NETIFS=6
@@ -44,6 +45,19 @@ gre_mtu_change()
 	test_mtu_change gre
 }
 
+gre_hier_remote_change()
+{
+	hier_remote_change
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 with ikey/okey (new remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 with ikey/okey (new remote)"
+
+	hier_remote_restore
+
+	test_traffic_ip4ip6 "GRE hierarchical IPv4-in-IPv6 with ikey/okey (old remote)"
+	test_traffic_ip6ip6 "GRE hierarchical IPv6-in-IPv6 with ikey/okey (old remote)"
+}
+
 cleanup()
 {
 	pre_cleanup
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_lib.sh b/tools/testing/selftests/net/forwarding/ip6gre_lib.sh
index 24f4ab328bd2..2d91281dc5b7 100644
--- a/tools/testing/selftests/net/forwarding/ip6gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/ip6gre_lib.sh
@@ -436,3 +436,83 @@ test_mtu_change()
 	check_err $?
 	log_test "ping GRE IPv6, packet size 1800 after MTU change"
 }
+
+topo_flat_remote_change()
+{
+	local old1=$1; shift
+	local new1=$1; shift
+	local old2=$1; shift
+	local new2=$1; shift
+
+	ip link set dev g1a type ip6gre local $new1 remote $new2
+        __addr_add_del g1a add "$new1/128"
+        __addr_add_del g1a del "$old1/128"
+	ip -6 route add $new2/128 via 2001:db8:10::2
+	ip -6 route del $old2/128
+
+	ip link set dev g2a type ip6gre local $new2 remote $new1
+        __addr_add_del g2a add "$new2/128"
+        __addr_add_del g2a del "$old2/128"
+	ip -6 route add vrf v$ol2 $new1/128 via 2001:db8:10::1
+	ip -6 route del vrf v$ol2 $old1/128
+}
+
+flat_remote_change()
+{
+	local old1=2001:db8:3::1
+	local new1=2001:db8:3::10
+	local old2=2001:db8:3::2
+	local new2=2001:db8:3::20
+
+	topo_flat_remote_change $old1 $new1 $old2 $new2
+}
+
+flat_remote_restore()
+{
+	local old1=2001:db8:3::10
+	local new1=2001:db8:3::1
+	local old2=2001:db8:3::20
+	local new2=2001:db8:3::2
+
+	topo_flat_remote_change $old1 $new1 $old2 $new2
+}
+
+topo_hier_remote_change()
+{
+	local old1=$1; shift
+	local new1=$1; shift
+	local old2=$1; shift
+	local new2=$1; shift
+
+        __addr_add_del dummy1 del "$old1/64"
+        __addr_add_del dummy1 add "$new1/64"
+	ip link set dev g1a type ip6gre local $new1 remote $new2
+	ip -6 route add vrf v$ul1 $new2/128 via 2001:db8:10::2
+	ip -6 route del vrf v$ul1 $old2/128
+
+        __addr_add_del dummy2 del "$old2/64"
+        __addr_add_del dummy2 add "$new2/64"
+	ip link set dev g2a type ip6gre local $new2 remote $new1
+	ip -6 route add vrf v$ul2 $new1/128 via 2001:db8:10::1
+	ip -6 route del vrf v$ul2 $old1/128
+}
+
+hier_remote_change()
+{
+	local old1=2001:db8:3::1
+	local new1=2001:db8:3::10
+	local old2=2001:db8:3::2
+	local new2=2001:db8:3::20
+
+	topo_hier_remote_change $old1 $new1 $old2 $new2
+}
+
+hier_remote_restore()
+{
+	local old1=2001:db8:3::10
+	local new1=2001:db8:3::1
+	local old2=2001:db8:3::20
+	local new2=2001:db8:3::2
+
+	topo_hier_remote_change $old1 $new1 $old2 $new2
+}
diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh
index 30f36a57bae6..01e62c4ac94d 100644
--- a/tools/testing/selftests/net/forwarding/ipip_lib.sh
+++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh
@@ -141,7 +141,6 @@
 # |               $h2 +       |
 # |     192.0.2.18/28         |
 # +---------------------------+
-source lib.sh
 
 h1_create()
 {
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index e579c2e0c462..508f3c700d71 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -2,68 +2,133 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ##############################################################################
+# Topology description. p1 looped back to p2, p3 to p4 and so on.
+
+declare -A NETIFS=(
+    [p1]=veth0
+    [p2]=veth1
+    [p3]=veth2
+    [p4]=veth3
+    [p5]=veth4
+    [p6]=veth5
+    [p7]=veth6
+    [p8]=veth7
+    [p9]=veth8
+    [p10]=veth9
+)
+
+# Port that does not have a cable connected.
+: "${NETIF_NO_CABLE:=eth8}"
+
+##############################################################################
 # Defines
 
-# Can be overridden by the configuration file.
-PING=${PING:=ping}
-PING6=${PING6:=ping6}
-MZ=${MZ:=mausezahn}
-MZ_DELAY=${MZ_DELAY:=0}
-ARPING=${ARPING:=arping}
-TEAMD=${TEAMD:=teamd}
-WAIT_TIME=${WAIT_TIME:=5}
-PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
-PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no}
-NETIF_TYPE=${NETIF_TYPE:=veth}
-NETIF_CREATE=${NETIF_CREATE:=yes}
-MCD=${MCD:=smcrouted}
-MC_CLI=${MC_CLI:=smcroutectl}
-PING_COUNT=${PING_COUNT:=10}
-PING_TIMEOUT=${PING_TIMEOUT:=5}
-WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
-INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
-LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
-REQUIRE_JQ=${REQUIRE_JQ:=yes}
-REQUIRE_MZ=${REQUIRE_MZ:=yes}
-REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no}
-STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
-TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
-TROUTE6=${TROUTE6:=traceroute6}
+# Networking utilities.
+: "${PING:=ping}"
+: "${PING6:=ping6}"	# Some distros just use ping.
+: "${ARPING:=arping}"
+: "${TROUTE6:=traceroute6}"
+
+# Packet generator.
+: "${MZ:=mausezahn}"	# Some distributions use 'mz'.
+: "${MZ_DELAY:=0}"
+
+# Host configuration tools.
+: "${TEAMD:=teamd}"
+: "${MCD:=smcrouted}"
+: "${MC_CLI:=smcroutectl}"
+
+# Constants for netdevice bring-up:
+# Default time in seconds to wait for an interface to come up before giving up
+# and bailing out. Used during initial setup.
+: "${INTERFACE_TIMEOUT:=600}"
+# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts.
+: "${WAIT_TIMEOUT:=20}"
+# Time to wait after interfaces participating in the test are all UP.
+: "${WAIT_TIME:=5}"
+
+# Whether to pause on, respectively, after a failure and before cleanup.
+: "${PAUSE_ON_CLEANUP:=no}"
+
+# Whether to create virtual interfaces, and what netdevice type they should be.
+: "${NETIF_CREATE:=yes}"
+: "${NETIF_TYPE:=veth}"
+
+# Constants for ping tests:
+# How many packets should be sent.
+: "${PING_COUNT:=10}"
+# Timeout (in seconds) before ping exits regardless of how many packets have
+# been sent or received
+: "${PING_TIMEOUT:=5}"
+
+# Minimum ageing_time (in centiseconds) supported by hardware
+: "${LOW_AGEING_TIME:=1000}"
+
+# Whether to check for availability of certain tools.
+: "${REQUIRE_JQ:=yes}"
+: "${REQUIRE_MZ:=yes}"
+: "${REQUIRE_MTOOLS:=no}"
+: "${REQUIRE_TEAMD:=no}"
+
+# Whether to override MAC addresses on interfaces participating in the test.
+: "${STABLE_MAC_ADDRS:=no}"
+
+# Flags for tcpdump
+: "${TCPDUMP_EXTRA_FLAGS:=}"
+
+# Flags for TC filters.
+: "${TC_FLAG:=skip_hw}"
+
+# Whether the machine is "slow" -- i.e. might be incapable of running tests
+# involving heavy traffic. This might be the case on a debug kernel, a VM, or
+# e.g. a low-power board.
+: "${KSFT_MACHINE_SLOW:=no}"
 
-net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+##############################################################################
+# Find netifs by test-specified driver name
 
-if [[ -f $net_forwarding_dir/forwarding.config ]]; then
-	source "$net_forwarding_dir/forwarding.config"
-fi
+driver_name_get()
+{
+	local dev=$1; shift
+	local driver_path="/sys/class/net/$dev/device/driver"
 
-source "$net_forwarding_dir/../lib.sh"
+	if [[ -L $driver_path ]]; then
+		basename `realpath $driver_path`
+	fi
+}
 
-# timeout in seconds
-slowwait()
+netif_find_driver()
 {
-	local timeout=$1; shift
+	local ifnames=`ip -j link show | jq -r ".[].ifname"`
+	local count=0
 
-	local start_time="$(date -u +%s)"
-	while true
+	for ifname in $ifnames
 	do
-		local out
-		out=$("$@")
-		local ret=$?
-		if ((!ret)); then
-			echo -n "$out"
-			return 0
+		local driver_name=`driver_name_get $ifname`
+		if [[ ! -z $driver_name && $driver_name == $NETIF_FIND_DRIVER ]]; then
+			count=$((count + 1))
+			NETIFS[p$count]="$ifname"
 		fi
-
-		local current_time="$(date -u +%s)"
-		if ((current_time - start_time > timeout)); then
-			echo -n "$out"
-			return 1
-		fi
-
-		sleep 0.1
 	done
 }
 
+# Whether to find netdevice according to the driver speficied by the importer
+: "${NETIF_FIND_DRIVER:=}"
+
+if [[ $NETIF_FIND_DRIVER ]]; then
+	unset NETIFS
+	declare -A NETIFS
+	netif_find_driver
+fi
+
+net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+if [[ -f $net_forwarding_dir/forwarding.config ]]; then
+	source "$net_forwarding_dir/forwarding.config"
+fi
+
+source "$net_forwarding_dir/../lib.sh"
+
 ##############################################################################
 # Sanity checks
 
@@ -205,32 +270,38 @@ check_port_mab_support()
 	fi
 }
 
-skip_on_veth()
+if [[ "$(id -u)" -ne 0 ]]; then
+	echo "SKIP: need root privileges"
+	exit $ksft_skip
+fi
+
+check_driver()
 {
-	local kind=$(ip -j -d link show dev ${NETIFS[p1]} |
-		jq -r '.[].linkinfo.info_kind')
+	local dev=$1; shift
+	local expected=$1; shift
+	local driver_name=`driver_name_get $dev`
 
-	if [[ $kind == veth ]]; then
-		echo "SKIP: Test cannot be run with veth pairs"
+	if [[ $driver_name != $expected ]]; then
+		echo "SKIP: expected driver $expected for $dev, got $driver_name instead"
 		exit $ksft_skip
 	fi
 }
 
-if [[ "$(id -u)" -ne 0 ]]; then
-	echo "SKIP: need root privileges"
-	exit $ksft_skip
-fi
-
 if [[ "$CHECK_TC" = "yes" ]]; then
 	check_tc_version
 fi
 
-require_command()
+# IPv6 support was added in v3.0
+check_mtools_version()
 {
-	local cmd=$1; shift
+	local version="$(msend -v)"
+	local major
+
+	version=${version##msend version }
+	major=$(echo $version | cut -d. -f1)
 
-	if [[ ! -x "$(command -v "$cmd")" ]]; then
-		echo "SKIP: $cmd not installed"
+	if [ $major -lt 3 ]; then
+		echo "SKIP: expected mtools version 3.0, got $version"
 		exit $ksft_skip
 	fi
 }
@@ -241,16 +312,14 @@ fi
 if [[ "$REQUIRE_MZ" = "yes" ]]; then
 	require_command $MZ
 fi
+if [[ "$REQUIRE_TEAMD" = "yes" ]]; then
+	require_command $TEAMD
+fi
 if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then
-	# https://github.com/vladimiroltean/mtools/
-	# patched for IPv6 support
+	# https://github.com/troglobit/mtools
 	require_command msend
 	require_command mreceive
-fi
-
-if [[ ! -v NUM_NETIFS ]]; then
-	echo "SKIP: importer does not define \"NUM_NETIFS\""
-	exit $ksft_skip
+	check_mtools_version
 fi
 
 ##############################################################################
@@ -271,6 +340,23 @@ done
 ##############################################################################
 # Network interfaces configuration
 
+if [[ ! -v NUM_NETIFS ]]; then
+	echo "SKIP: importer does not define \"NUM_NETIFS\""
+	exit $ksft_skip
+fi
+
+if (( NUM_NETIFS > ${#NETIFS[@]} )); then
+	echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})"
+	exit $ksft_skip
+fi
+
+for i in $(seq ${#NETIFS[@]}); do
+	if [[ ! ${NETIFS[p$i]} ]]; then
+		echo "SKIP: NETIFS[p$i] not given"
+		exit $ksft_skip
+	fi
+done
+
 create_netif_veth()
 {
 	local i
@@ -353,89 +439,6 @@ done
 ##############################################################################
 # Helpers
 
-# Exit status to return at the end. Set in case one of the tests fails.
-EXIT_STATUS=0
-# Per-test return value. Clear at the beginning of each test.
-RET=0
-
-check_err()
-{
-	local err=$1
-	local msg=$2
-
-	if [[ $RET -eq 0 && $err -ne 0 ]]; then
-		RET=$err
-		retmsg=$msg
-	fi
-}
-
-check_fail()
-{
-	local err=$1
-	local msg=$2
-
-	if [[ $RET -eq 0 && $err -eq 0 ]]; then
-		RET=1
-		retmsg=$msg
-	fi
-}
-
-check_err_fail()
-{
-	local should_fail=$1; shift
-	local err=$1; shift
-	local what=$1; shift
-
-	if ((should_fail)); then
-		check_fail $err "$what succeeded, but should have failed"
-	else
-		check_err $err "$what failed"
-	fi
-}
-
-log_test()
-{
-	local test_name=$1
-	local opt_str=$2
-
-	if [[ $# -eq 2 ]]; then
-		opt_str="($opt_str)"
-	fi
-
-	if [[ $RET -ne 0 ]]; then
-		EXIT_STATUS=1
-		printf "TEST: %-60s  [FAIL]\n" "$test_name $opt_str"
-		if [[ ! -z "$retmsg" ]]; then
-			printf "\t%s\n" "$retmsg"
-		fi
-		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
-			echo "Hit enter to continue, 'q' to quit"
-			read a
-			[ "$a" = "q" ] && exit 1
-		fi
-		return 1
-	fi
-
-	printf "TEST: %-60s  [ OK ]\n" "$test_name $opt_str"
-	return 0
-}
-
-log_test_skip()
-{
-	local test_name=$1
-	local opt_str=$2
-
-	printf "TEST: %-60s  [SKIP]\n" "$test_name $opt_str"
-	return 0
-}
-
-log_info()
-{
-	local msg=$1
-
-	echo "INFO: $msg"
-}
-
 not()
 {
 	"$@"
@@ -487,33 +490,6 @@ wait_for_trap()
 	"$@" | grep -q trap
 }
 
-until_counter_is()
-{
-	local expr=$1; shift
-	local current=$("$@")
-
-	echo $((current))
-	((current $expr))
-}
-
-busywait_for_counter()
-{
-	local timeout=$1; shift
-	local delta=$1; shift
-
-	local base=$("$@")
-	busywait "$timeout" until_counter_is ">= $((base + delta))" "$@"
-}
-
-slowwait_for_counter()
-{
-	local timeout=$1; shift
-	local delta=$1; shift
-
-	local base=$("$@")
-	slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@"
-}
-
 setup_wait_dev()
 {
 	local dev=$1; shift
@@ -562,6 +538,19 @@ setup_wait()
 	sleep $WAIT_TIME
 }
 
+wait_for_dev()
+{
+        local dev=$1; shift
+        local timeout=${1:-$WAIT_TIMEOUT}; shift
+
+        slowwait $timeout ip link show dev $dev &> /dev/null
+        if (( $? )); then
+                check_err 1
+                log_test wait_for_dev "Interface $dev did not appear."
+                exit $EXIT_STATUS
+        fi
+}
+
 cmd_jq()
 {
 	local cmd=$1
@@ -819,29 +808,6 @@ link_stats_rx_errors_get()
 	link_stats_get $1 rx errors
 }
 
-tc_rule_stats_get()
-{
-	local dev=$1; shift
-	local pref=$1; shift
-	local dir=$1; shift
-	local selector=${1:-.packets}; shift
-
-	tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \
-	    | jq ".[1].options.actions[].stats$selector"
-}
-
-tc_rule_handle_stats_get()
-{
-	local id=$1; shift
-	local handle=$1; shift
-	local selector=${1:-.packets}; shift
-	local netns=${1:-""}; shift
-
-	tc $netns -j -s filter show $id \
-	    | jq ".[] | select(.options.handle == $handle) | \
-		  .options.actions[0].stats$selector"
-}
-
 ethtool_stats_get()
 {
 	local dev=$1; shift
@@ -960,11 +926,37 @@ packets_rate()
 	echo $(((t1 - t0) / interval))
 }
 
-mac_get()
+ether_addr_to_u64()
 {
-	local if_name=$1
+	local addr="$1"
+	local order="$((1 << 40))"
+	local val=0
+	local byte
+
+	addr="${addr//:/ }"
+
+	for byte in $addr; do
+		byte="0x$byte"
+		val=$((val + order * byte))
+		order=$((order >> 8))
+	done
+
+	printf "0x%x" $val
+}
+
+u64_to_ether_addr()
+{
+	local val=$1
+	local byte
+	local i
 
-	ip -j link show dev $if_name | jq -r '.[]["address"]'
+	for ((i = 40; i >= 0; i -= 8)); do
+		byte=$(((val & (0xff << i)) >> i))
+		printf "%02x" $byte
+		if [ $i -ne 0 ]; then
+			printf ":"
+		fi
+	done
 }
 
 ipv6_lladdr_get()
@@ -988,12 +980,19 @@ bridge_ageing_time_get()
 }
 
 declare -A SYSCTL_ORIG
+sysctl_save()
+{
+	local key=$1; shift
+
+	SYSCTL_ORIG[$key]=$(sysctl -n $key)
+}
+
 sysctl_set()
 {
 	local key=$1; shift
 	local value=$1; shift
 
-	SYSCTL_ORIG[$key]=$(sysctl -n $key)
+	sysctl_save "$key"
 	sysctl -qw $key="$value"
 }
 
@@ -1072,22 +1071,6 @@ trap_uninstall()
 	tc filter del dev $dev $direction pref 1 flower
 }
 
-slow_path_trap_install()
-{
-	# For slow-path testing, we need to install a trap to get to
-	# slow path the packets that would otherwise be switched in HW.
-	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
-		trap_install "$@"
-	fi
-}
-
-slow_path_trap_uninstall()
-{
-	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
-		trap_uninstall "$@"
-	fi
-}
-
 __icmp_capture_add_del()
 {
 	local add_del=$1; shift
@@ -1104,22 +1087,34 @@ __icmp_capture_add_del()
 
 icmp_capture_install()
 {
-	__icmp_capture_add_del add 100 "" "$@"
+	local tundev=$1; shift
+	local filter=$1; shift
+
+	__icmp_capture_add_del add 100 "" "$tundev" "$filter"
 }
 
 icmp_capture_uninstall()
 {
-	__icmp_capture_add_del del 100 "" "$@"
+	local tundev=$1; shift
+	local filter=$1; shift
+
+	__icmp_capture_add_del del 100 "" "$tundev" "$filter"
 }
 
 icmp6_capture_install()
 {
-	__icmp_capture_add_del add 100 v6 "$@"
+	local tundev=$1; shift
+	local filter=$1; shift
+
+	__icmp_capture_add_del add 100 v6 "$tundev" "$filter"
 }
 
 icmp6_capture_uninstall()
 {
-	__icmp_capture_add_del del 100 v6 "$@"
+	local tundev=$1; shift
+	local filter=$1; shift
+
+	__icmp_capture_add_del del 100 v6 "$tundev" "$filter"
 }
 
 __vlan_capture_add_del()
@@ -1137,12 +1132,18 @@ __vlan_capture_add_del()
 
 vlan_capture_install()
 {
-	__vlan_capture_add_del add 100 "$@"
+	local dev=$1; shift
+	local filter=$1; shift
+
+	__vlan_capture_add_del add 100 "$dev" "$filter"
 }
 
 vlan_capture_uninstall()
 {
-	__vlan_capture_add_del del 100 "$@"
+	local dev=$1; shift
+	local filter=$1; shift
+
+	__vlan_capture_add_del del 100 "$dev" "$filter"
 }
 
 __dscp_capture_add_del()
@@ -1198,13 +1199,10 @@ matchall_sink_create()
 	   action drop
 }
 
-tests_run()
+cleanup()
 {
-	local current_test
-
-	for current_test in ${TESTS:-$ALL_TESTS}; do
-		$current_test
-	done
+	pre_cleanup
+	defer_scopes_cleanup
 }
 
 multipath_eval()
@@ -1502,40 +1500,68 @@ __start_traffic()
 	local sip=$1; shift
 	local dip=$1; shift
 	local dmac=$1; shift
+	local -a mz_args=("$@")
 
 	$MZ $h_in -p $pktsize -A $sip -B $dip -c 0 \
-		-a own -b $dmac -t "$proto" -q "$@" &
+		-a own -b $dmac -t "$proto" -q "${mz_args[@]}" &
 	sleep 1
 }
 
 start_traffic_pktsize()
 {
 	local pktsize=$1; shift
+	local h_in=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local dmac=$1; shift
+	local -a mz_args=("$@")
 
-	__start_traffic $pktsize udp "$@"
+	__start_traffic $pktsize udp "$h_in" "$sip" "$dip" "$dmac" \
+			"${mz_args[@]}"
 }
 
 start_tcp_traffic_pktsize()
 {
 	local pktsize=$1; shift
+	local h_in=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local dmac=$1; shift
+	local -a mz_args=("$@")
 
-	__start_traffic $pktsize tcp "$@"
+	__start_traffic $pktsize tcp "$h_in" "$sip" "$dip" "$dmac" \
+			"${mz_args[@]}"
 }
 
 start_traffic()
 {
-	start_traffic_pktsize 8000 "$@"
+	local h_in=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local dmac=$1; shift
+	local -a mz_args=("$@")
+
+	start_traffic_pktsize 8000 "$h_in" "$sip" "$dip" "$dmac" \
+			      "${mz_args[@]}"
 }
 
 start_tcp_traffic()
 {
-	start_tcp_traffic_pktsize 8000 "$@"
+	local h_in=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local dmac=$1; shift
+	local -a mz_args=("$@")
+
+	start_tcp_traffic_pktsize 8000 "$h_in" "$sip" "$dip" "$dmac" \
+				  "${mz_args[@]}"
 }
 
 stop_traffic()
 {
-	# Suppress noise from killing mausezahn.
-	{ kill %% && wait %%; } 2>/dev/null
+	local pid=${1-%%}; shift
+
+	kill_process "$pid"
 }
 
 declare -A cappid
@@ -2011,6 +2037,8 @@ bail_on_lldpad()
 {
 	local reason1="$1"; shift
 	local reason2="$1"; shift
+	local caller=${FUNCNAME[1]}
+	local src=${BASH_SOURCE[1]}
 
 	if systemctl is-active --quiet lldpad; then
 
@@ -2031,7 +2059,8 @@ bail_on_lldpad()
 				an environment variable ALLOW_LLDPAD to a
 				non-empty string.
 			EOF
-			exit 1
+			log_test_skip $src:$caller
+			exit $EXIT_STATUS
 		else
 			return
 		fi
@@ -2044,3 +2073,22 @@ absval()
 
 	echo $((v > 0 ? v : -v))
 }
+
+has_unicast_flt()
+{
+	local dev=$1; shift
+	local mac_addr=$(mac_get $dev)
+	local tmp=$(ether_addr_to_u64 $mac_addr)
+	local promisc
+
+	ip link set $dev up
+	ip link add link $dev name macvlan-tmp type macvlan mode private
+	ip link set macvlan-tmp address $(u64_to_ether_addr $((tmp + 1)))
+	ip link set macvlan-tmp up
+
+	promisc=$(ip -j -d link show dev $dev | jq -r '.[].promiscuity')
+
+	ip link del macvlan-tmp
+
+	[[ $promisc == 1 ]] && echo "no" || echo "yes"
+}
diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
new file mode 100755
index 000000000000..ff2accccaf4d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This tests the operation of lib.sh itself.
+
+ALL_TESTS="
+	test_ret
+	test_exit_status
+"
+NUM_NETIFS=0
+source lib.sh
+
+# Simulated checks.
+
+do_test()
+{
+	local msg=$1; shift
+
+	"$@"
+	check_err $? "$msg"
+}
+
+tpass()
+{
+	do_test "tpass" true
+}
+
+tfail()
+{
+	do_test "tfail" false
+}
+
+txfail()
+{
+	FAIL_TO_XFAIL=yes do_test "txfail" false
+}
+
+# Simulated tests.
+
+pass()
+{
+	RET=0
+	do_test "true" true
+	log_test "true"
+}
+
+fail()
+{
+	RET=0
+	do_test "false" false
+	log_test "false"
+}
+
+xfail()
+{
+	RET=0
+	FAIL_TO_XFAIL=yes do_test "xfalse" false
+	log_test "xfalse"
+}
+
+skip()
+{
+	RET=0
+	log_test_skip "skip"
+}
+
+slow_xfail()
+{
+	RET=0
+	xfail_on_slow do_test "slow_false" false
+	log_test "slow_false"
+}
+
+# lib.sh tests.
+
+ret_tests_run()
+{
+	local t
+
+	RET=0
+	retmsg=
+	for t in "$@"; do
+		$t
+	done
+	echo "$retmsg"
+	return $RET
+}
+
+ret_subtest()
+{
+	local expect_ret=$1; shift
+	local expect_retmsg=$1; shift
+	local -a tests=( "$@" )
+
+	local status_names=(pass fail xfail xpass skip)
+	local ret
+	local out
+
+	RET=0
+
+	# Run this in a subshell, so that our environment is intact.
+	out=$(ret_tests_run "${tests[@]}")
+	ret=$?
+
+	(( ret == expect_ret ))
+	check_err $? "RET=$ret expected $expect_ret"
+
+	[[ $out == $expect_retmsg ]]
+	check_err $? "retmsg=$out expected $expect_retmsg"
+
+	log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}"
+}
+
+test_ret()
+{
+	ret_subtest $ksft_pass ""
+
+	ret_subtest $ksft_pass "" tpass
+	ret_subtest $ksft_fail "tfail" tfail
+	ret_subtest $ksft_xfail "txfail" txfail
+
+	ret_subtest $ksft_pass "" tpass tpass
+	ret_subtest $ksft_fail "tfail" tpass tfail
+	ret_subtest $ksft_xfail "txfail" tpass txfail
+
+	ret_subtest $ksft_fail "tfail" tfail tpass
+	ret_subtest $ksft_xfail "txfail" txfail tpass
+
+	ret_subtest $ksft_fail "tfail" tfail tfail
+	ret_subtest $ksft_fail "tfail" tfail txfail
+
+	ret_subtest $ksft_fail "tfail" txfail tfail
+
+	ret_subtest $ksft_xfail "txfail" txfail txfail
+}
+
+exit_status_tests_run()
+{
+	EXIT_STATUS=0
+	tests_run > /dev/null
+	return $EXIT_STATUS
+}
+
+exit_status_subtest()
+{
+	local expect_exit_status=$1; shift
+	local tests=$1; shift
+	local what=$1; shift
+
+	local status_names=(pass fail xfail xpass skip)
+	local exit_status
+	local out
+
+	RET=0
+
+	# Run this in a subshell, so that our environment is intact.
+	out=$(TESTS="$tests" exit_status_tests_run)
+	exit_status=$?
+
+	(( exit_status == expect_exit_status ))
+	check_err $? "EXIT_STATUS=$exit_status, expected $expect_exit_status"
+
+	log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}"
+}
+
+test_exit_status()
+{
+	exit_status_subtest $ksft_pass ":"
+
+	exit_status_subtest $ksft_pass "pass"
+	exit_status_subtest $ksft_fail "fail"
+	exit_status_subtest $ksft_pass "xfail"
+	exit_status_subtest $ksft_skip "skip"
+
+	exit_status_subtest $ksft_pass "pass pass"
+	exit_status_subtest $ksft_fail "pass fail"
+	exit_status_subtest $ksft_pass "pass xfail"
+	exit_status_subtest $ksft_skip "pass skip"
+
+	exit_status_subtest $ksft_fail "fail pass"
+	exit_status_subtest $ksft_pass "xfail pass"
+	exit_status_subtest $ksft_skip "skip pass"
+
+	exit_status_subtest $ksft_fail "fail fail"
+	exit_status_subtest $ksft_fail "fail xfail"
+	exit_status_subtest $ksft_fail "fail skip"
+
+	exit_status_subtest $ksft_fail "xfail fail"
+	exit_status_subtest $ksft_fail "skip fail"
+
+	exit_status_subtest $ksft_pass "xfail xfail"
+	exit_status_subtest $ksft_skip "xfail skip"
+	exit_status_subtest $ksft_skip "skip xfail"
+
+	exit_status_subtest $ksft_skip "skip skip"
+
+	KSFT_MACHINE_SLOW=yes \
+		exit_status_subtest $ksft_pass "slow_xfail" ": slow"
+
+	KSFT_MACHINE_SLOW=no \
+		exit_status_subtest $ksft_fail "slow_xfail" ": fast"
+}
+
+trap pre_cleanup EXIT
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
index c5b0cbc85b3e..ecd34f364125 100755
--- a/tools/testing/selftests/net/forwarding/local_termination.sh
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -1,11 +1,12 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="standalone bridge"
+ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \
+	   vlan_over_vlan_unaware_bridged_port vlan_over_vlan_aware_bridged_port \
+	   vlan_over_vlan_unaware_bridge vlan_over_vlan_aware_bridge"
 NUM_NETIFS=2
 PING_COUNT=1
 REQUIRE_MTOOLS=yes
-REQUIRE_MZ=no
 
 source lib.sh
 
@@ -37,9 +38,68 @@ UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05"
 UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06"
 UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07"
 
-NON_IP_MC="01:02:03:04:05:06"
-NON_IP_PKT="00:04 48:45:4c:4f"
-BC="ff:ff:ff:ff:ff:ff"
+PTP_1588_L2_SYNC=" \
+01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 00 02 \
+00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00"
+PTP_1588_L2_FOLLOW_UP=" \
+01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 08 02 \
+00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \
+00 00 66 83 c5 f1 17 97 ed f0"
+PTP_1588_L2_PDELAY_REQ=" \
+01:80:c2:00:00:0e 00:00:de:ad:be:ef 88:f7 02 02 \
+00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 06 05 7f \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 00 00"
+PTP_1588_IPV4_SYNC=" \
+01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \
+00 48 0a 9a 40 00 01 11 cb 88 c0 00 02 01 e0 00 \
+01 81 01 3f 01 3f 00 34 a3 c8 00 02 00 2c 00 00 \
+02 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 00"
+PTP_1588_IPV4_FOLLOW_UP="
+01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \
+00 48 0a 9b 40 00 01 11 cb 87 c0 00 02 01 e0 00 \
+01 81 01 40 01 40 00 34 a3 c8 08 02 00 2c 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 00 02 00 00 00 66 83 \
+c6 0f 1d 9a 61 87"
+PTP_1588_IPV4_PDELAY_REQ=" \
+01:00:5e:00:00:6b 00:00:de:ad:be:ef 08:00 45 00 \
+00 52 35 a9 40 00 01 11 a1 85 c0 00 02 01 e0 00 \
+00 6b 01 3f 01 3f 00 3e a2 bc 02 02 00 36 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 01 05 7f 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00"
+PTP_1588_IPV6_SYNC=" \
+33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 06 \
+7c 2f 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \
+00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \
+00 00 00 00 01 81 01 3f 01 3f 00 36 2e 92 00 02 \
+00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00"
+PTP_1588_IPV6_FOLLOW_UP=" \
+33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 0a \
+00 bc 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \
+00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \
+00 00 00 00 01 81 01 40 01 40 00 36 2e 92 08 02 \
+00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \
+00 00 66 83 c6 2a 32 09 bd 74 00 00"
+PTP_1588_IPV6_PDELAY_REQ=" \
+33:33:00:00:00:6b 00:00:de:ad:be:ef 86:dd 60 0c \
+5c fd 00 40 11 01 fe 80 00 00 00 00 00 00 3c 37 \
+63 ff fe cf 17 0e ff 02 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 6b 01 3f 01 3f 00 40 b4 54 02 02 \
+00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 01 05 7f \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 00"
 
 # Disable promisc to ensure we don't receive unknown MAC DA packets
 export TCPDUMP_EXTRA_FLAGS="-pl"
@@ -47,13 +107,15 @@ export TCPDUMP_EXTRA_FLAGS="-pl"
 h1=${NETIFS[p1]}
 h2=${NETIFS[p2]}
 
-send_non_ip()
+send_raw()
 {
-	local if_name=$1
-	local smac=$2
-	local dmac=$3
+	local if_name=$1; shift
+	local pkt="$1"; shift
+	local smac=$(mac_get $if_name)
+
+	pkt="${pkt/00:00:de:ad:be:ef/$smac}"
 
-	$MZ -q $if_name "$dmac $smac $NON_IP_PKT"
+	$MZ -q $if_name "$pkt"
 }
 
 send_uc_ipv4()
@@ -68,10 +130,11 @@ send_uc_ipv4()
 
 check_rcv()
 {
-	local if_name=$1
-	local type=$2
-	local pattern=$3
-	local should_receive=$4
+	local if_name=$1; shift
+	local type=$1; shift
+	local pattern=$1; shift
+	local should_receive=$1; shift
+	local test_name="$1"; shift
 	local should_fail=
 
 	[ $should_receive = true ] && should_fail=0 || should_fail=1
@@ -81,7 +144,7 @@ check_rcv()
 
 	check_err_fail "$should_fail" "$?" "reception"
 
-	log_test "$if_name: $type"
+	log_test "$test_name: $type"
 }
 
 mc_route_prepare()
@@ -104,44 +167,78 @@ mc_route_destroy()
 
 run_test()
 {
-	local rcv_if_name=$1
-	local smac=$(mac_get $h1)
+	local send_if_name=$1; shift
+	local rcv_if_name=$1; shift
+	local skip_ptp=$1; shift
+	local no_unicast_flt=$1; shift
+	local test_name="$1"; shift
+	local smac=$(mac_get $send_if_name)
 	local rcv_dmac=$(mac_get $rcv_if_name)
+	local should_receive
 
 	tcpdump_start $rcv_if_name
 
-	mc_route_prepare $h1
+	mc_route_prepare $send_if_name
 	mc_route_prepare $rcv_if_name
 
-	send_uc_ipv4 $h1 $rcv_dmac
-	send_uc_ipv4 $h1 $MACVLAN_ADDR
-	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1
+	send_uc_ipv4 $send_if_name $rcv_dmac
+	send_uc_ipv4 $send_if_name $MACVLAN_ADDR
+	send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR1
 
 	ip link set dev $rcv_if_name promisc on
-	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2
-	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2
-	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2
+	send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR2
+	mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR2
+	mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR2
 	ip link set dev $rcv_if_name promisc off
 
 	mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR
-	mc_send $h1 $JOINED_IPV4_MC_ADDR
+	mc_send $send_if_name $JOINED_IPV4_MC_ADDR
 	mc_leave
 
 	mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR
-	mc_send $h1 $JOINED_IPV6_MC_ADDR
+	mc_send $send_if_name $JOINED_IPV6_MC_ADDR
 	mc_leave
 
-	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1
-	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1
+	mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR1
+	mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR1
 
 	ip link set dev $rcv_if_name allmulticast on
-	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3
-	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3
-	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3
+	send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR3
+	mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR3
+	mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR3
 	ip link set dev $rcv_if_name allmulticast off
 
 	mc_route_destroy $rcv_if_name
-	mc_route_destroy $h1
+	mc_route_destroy $send_if_name
+
+	if [ $skip_ptp = false ]; then
+		ip maddress add 01:1b:19:00:00:00 dev $rcv_if_name
+		send_raw $send_if_name "$PTP_1588_L2_SYNC"
+		send_raw $send_if_name "$PTP_1588_L2_FOLLOW_UP"
+		ip maddress del 01:1b:19:00:00:00 dev $rcv_if_name
+
+		ip maddress add 01:80:c2:00:00:0e dev $rcv_if_name
+		send_raw $send_if_name "$PTP_1588_L2_PDELAY_REQ"
+		ip maddress del 01:80:c2:00:00:0e dev $rcv_if_name
+
+		mc_join $rcv_if_name 224.0.1.129
+		send_raw $send_if_name "$PTP_1588_IPV4_SYNC"
+		send_raw $send_if_name "$PTP_1588_IPV4_FOLLOW_UP"
+		mc_leave
+
+		mc_join $rcv_if_name 224.0.0.107
+		send_raw $send_if_name "$PTP_1588_IPV4_PDELAY_REQ"
+		mc_leave
+
+		mc_join $rcv_if_name ff0e::181
+		send_raw $send_if_name "$PTP_1588_IPV6_SYNC"
+		send_raw $send_if_name "$PTP_1588_IPV6_FOLLOW_UP"
+		mc_leave
+
+		mc_join $rcv_if_name ff02::6b
+		send_raw $send_if_name "$PTP_1588_IPV6_PDELAY_REQ"
+		mc_leave
+	fi
 
 	sleep 1
 
@@ -149,55 +246,99 @@ run_test()
 
 	check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \
 		"$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
 	check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \
 		"$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
+	[ $no_unicast_flt = true ] && should_receive=true || should_receive=false
 	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \
 		"$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \
-		false
+		$should_receive "$test_name"
 
 	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \
 		"$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
-	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \
+	[ $no_unicast_flt = true ] && should_receive=true || should_receive=false
+	check_rcv $rcv_if_name \
+		"Unicast IPv4 to unknown MAC address, allmulti" \
 		"$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \
-		false
+		$should_receive "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv4 to joined group" \
 		"$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
-	check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \
-		"$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \
-		false
+	xfail \
+		check_rcv $rcv_if_name \
+			"Multicast IPv4 to unknown group" \
+			"$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \
+			false "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \
 		"$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \
 		"$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \
-		true
+		true "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv6 to joined group" \
 		"$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \
-		true
+		true "$test_name"
 
-	check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \
-		"$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \
-		false
+	xfail \
+		check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \
+			"$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \
+			false "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \
 		"$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \
-		true
+		true "$test_name"
 
 	check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \
 		"$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \
-		true
+		true "$test_name"
+
+	if [ $skip_ptp = false ]; then
+		check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \
+			true "$test_name"
+
+		check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \
+			true "$test_name"
+	fi
 
 	tcpdump_cleanup $rcv_if_name
 }
@@ -222,62 +363,217 @@ h2_destroy()
 	simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
 }
 
+h1_vlan_create()
+{
+	simple_if_init $h1
+	vlan_create $h1 100 v$h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_vlan_destroy()
+{
+	vlan_destroy $h1 100
+	simple_if_fini $h1
+}
+
+h2_vlan_create()
+{
+	simple_if_init $h2
+	vlan_create $h2 100 v$h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_vlan_destroy()
+{
+	vlan_destroy $h2 100
+	simple_if_fini $h2
+}
+
 bridge_create()
 {
-	ip link add br0 type bridge
+	local vlan_filtering=$1
+
+	ip link add br0 type bridge vlan_filtering $vlan_filtering
 	ip link set br0 address $BRIDGE_ADDR
 	ip link set br0 up
 
 	ip link set $h2 master br0
 	ip link set $h2 up
-
-	simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64
 }
 
 bridge_destroy()
 {
-	simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64
-
 	ip link del br0
 }
 
-standalone()
+macvlan_create()
 {
-	h1_create
-	h2_create
+	local lower=$1
 
-	ip link add link $h2 name macvlan0 type macvlan mode private
+	ip link add link $lower name macvlan0 type macvlan mode private
 	ip link set macvlan0 address $MACVLAN_ADDR
 	ip link set macvlan0 up
+}
 
-	run_test $h2
-
+macvlan_destroy()
+{
 	ip link del macvlan0
+}
+
+standalone()
+{
+	local no_unicast_flt=true
+	local skip_ptp=false
+
+	if [ $(has_unicast_flt $h2) = yes ]; then
+		no_unicast_flt=false
+	fi
 
+	h1_create
+	h2_create
+	macvlan_create $h2
+
+	run_test $h1 $h2 $skip_ptp $no_unicast_flt "$h2"
+
+	macvlan_destroy
 	h2_destroy
 	h1_destroy
 }
 
-bridge()
+test_bridge()
 {
+	local no_unicast_flt=true
+	local vlan_filtering=$1
+	local skip_ptp=true
+
 	h1_create
-	bridge_create
+	bridge_create $vlan_filtering
+	simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64
+	macvlan_create br0
 
-	ip link add link br0 name macvlan0 type macvlan mode private
-	ip link set macvlan0 address $MACVLAN_ADDR
-	ip link set macvlan0 up
+	run_test $h1 br0 $skip_ptp $no_unicast_flt \
+		"vlan_filtering=$vlan_filtering bridge"
 
-	run_test br0
+	macvlan_destroy
+	simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64
+	bridge_destroy
+	h1_destroy
+}
 
-	ip link del macvlan0
+vlan_unaware_bridge()
+{
+	test_bridge 0
+}
+
+vlan_aware_bridge()
+{
+	test_bridge 1
+}
+
+test_vlan()
+{
+	local no_unicast_flt=true
+	local skip_ptp=false
+
+	if [ $(has_unicast_flt $h2) = yes ]; then
+		no_unicast_flt=false
+	fi
+
+	h1_vlan_create
+	h2_vlan_create
+	macvlan_create $h2.100
+
+	run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt "VLAN upper"
 
+	macvlan_destroy
+	h2_vlan_destroy
+	h1_vlan_destroy
+}
+
+vlan_over_bridged_port()
+{
+	local no_unicast_flt=true
+	local vlan_filtering=$1
+	local skip_ptp=false
+
+	# br_manage_promisc() will not force a single vlan_filtering port to
+	# promiscuous mode, so we should still expect unicast filtering to take
+	# place if the device can do it.
+	if [ $(has_unicast_flt $h2) = yes ] && [ $vlan_filtering = 1 ]; then
+		no_unicast_flt=false
+	fi
+
+	h1_vlan_create
+	h2_vlan_create
+	bridge_create $vlan_filtering
+	macvlan_create $h2.100
+
+	run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt \
+		"VLAN over vlan_filtering=$vlan_filtering bridged port"
+
+	macvlan_destroy
 	bridge_destroy
-	h1_destroy
+	h2_vlan_destroy
+	h1_vlan_destroy
+}
+
+vlan_over_vlan_unaware_bridged_port()
+{
+	vlan_over_bridged_port 0
+}
+
+vlan_over_vlan_aware_bridged_port()
+{
+	vlan_over_bridged_port 1
+}
+
+vlan_over_bridge()
+{
+	local no_unicast_flt=true
+	local vlan_filtering=$1
+	local skip_ptp=true
+
+	h1_vlan_create
+	bridge_create $vlan_filtering
+	simple_if_init br0
+	vlan_create br0 100 vbr0 $H2_IPV4/24 $H2_IPV6/64
+	macvlan_create br0.100
+
+	if [ $vlan_filtering = 1 ]; then
+		bridge vlan add dev $h2 vid 100 master
+		bridge vlan add dev br0 vid 100 self
+	fi
+
+	run_test $h1.100 br0.100 $skip_ptp $no_unicast_flt \
+		"VLAN over vlan_filtering=$vlan_filtering bridge"
+
+	if [ $vlan_filtering = 1 ]; then
+		bridge vlan del dev br0 vid 100 self
+		bridge vlan del dev $h2 vid 100 master
+	fi
+
+	macvlan_destroy
+	vlan_destroy br0 100
+	simple_if_fini br0
+	bridge_destroy
+	h1_vlan_destroy
+}
+
+vlan_over_vlan_unaware_bridge()
+{
+	vlan_over_bridge 0
+}
+
+vlan_over_vlan_aware_bridge()
+{
+	vlan_over_bridge 1
 }
 
 cleanup()
 {
 	pre_cleanup
+
+	ip link set $h2 down
+	ip link set $h1 down
+
 	vrf_cleanup
 }
 
diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/net/forwarding/loopback.sh
deleted file mode 100755
index 8f4057310b5b..000000000000
--- a/tools/testing/selftests/net/forwarding/loopback.sh
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-ALL_TESTS="loopback_test"
-NUM_NETIFS=2
-source tc_common.sh
-source lib.sh
-
-h1_create()
-{
-	simple_if_init $h1 192.0.2.1/24
-	tc qdisc add dev $h1 clsact
-}
-
-h1_destroy()
-{
-	tc qdisc del dev $h1 clsact
-	simple_if_fini $h1 192.0.2.1/24
-}
-
-h2_create()
-{
-	simple_if_init $h2
-}
-
-h2_destroy()
-{
-	simple_if_fini $h2
-}
-
-loopback_test()
-{
-	RET=0
-
-	tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \
-		skip_hw arp_op reply arp_tip 192.0.2.1 action drop
-
-	$MZ $h1 -c 1 -t arp -q
-
-	tc_check_packets "dev $h1 ingress" 101 1
-	check_fail $? "Matched on a filter without loopback setup"
-
-	ethtool -K $h1 loopback on
-	check_err $? "Failed to enable loopback"
-
-	setup_wait_dev $h1
-
-	$MZ $h1 -c 1 -t arp -q
-
-	tc_check_packets "dev $h1 ingress" 101 1
-	check_err $? "Did not match on filter with loopback"
-
-	ethtool -K $h1 loopback off
-	check_err $? "Failed to disable loopback"
-
-	$MZ $h1 -c 1 -t arp -q
-
-	tc_check_packets "dev $h1 ingress" 101 2
-	check_fail $? "Matched on a filter after loopback was removed"
-
-	tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower
-
-	log_test "loopback"
-}
-
-setup_prepare()
-{
-	h1=${NETIFS[p1]}
-	h2=${NETIFS[p2]}
-
-	vrf_prepare
-
-	h1_create
-	h2_create
-
-	if ethtool -k $h1 | grep loopback | grep -q fixed; then
-		log_test "SKIP: dev $h1 does not support loopback feature"
-		exit $ksft_skip
-	fi
-}
-
-cleanup()
-{
-	pre_cleanup
-
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
-trap cleanup EXIT
-
-setup_prepare
-setup_wait
-
-tests_run
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/min_max_mtu.sh b/tools/testing/selftests/net/forwarding/min_max_mtu.sh
new file mode 100755
index 000000000000..97bb8b221bed
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/min_max_mtu.sh
@@ -0,0 +1,283 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------+
+# | H1                 |
+# |                    |
+# |           $h1.10 + |
+# |     192.0.2.2/24 | |
+# | 2001:db8:1::2/64 | |
+# |                  | |
+# |              $h1 + |
+# |                  | |
+# +------------------|-+
+#                    |
+# +------------------|-+
+# | SW               | |
+# |            $swp1 + |
+# |                  | |
+# |         $swp1.10 + |
+# |     192.0.2.1/24   |
+# | 2001:db8:1::1/64   |
+# |                    |
+# +--------------------+
+
+ALL_TESTS="
+	ping_ipv4
+	ping_ipv6
+	max_mtu_config_test
+	max_mtu_traffic_test
+	min_mtu_config_test
+	min_mtu_traffic_test
+"
+
+NUM_NETIFS=2
+source lib.sh
+
+h1_create()
+{
+	simple_if_init $h1
+	vlan_create $h1 10 v$h1 192.0.2.2/24 2001:db8:1::2/64
+}
+
+h1_destroy()
+{
+	vlan_destroy $h1 10 192.0.2.2/24 2001:db8:1::2/64
+	simple_if_fini $h1
+}
+
+switch_create()
+{
+	ip li set dev $swp1 up
+	vlan_create $swp1 10 "" 192.0.2.1/24 2001:db8:1::1/64
+}
+
+switch_destroy()
+{
+	ip li set dev $swp1 down
+	vlan_destroy $swp1 10
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	vrf_prepare
+
+	h1_create
+
+	switch_create
+
+	forwarding_enable
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	forwarding_restore
+
+	switch_destroy
+
+	h1_destroy
+
+	vrf_cleanup
+}
+
+ping_ipv4()
+{
+	ping_test $h1.10 192.0.2.1
+}
+
+ping_ipv6()
+{
+	ping6_test $h1.10 2001:db8:1::1
+}
+
+min_max_mtu_get_if()
+{
+	local dev=$1; shift
+	local min_max=$1; shift
+
+	ip -d -j link show $dev | jq ".[].$min_max"
+}
+
+ensure_compatible_min_max_mtu()
+{
+	local min_max=$1; shift
+
+	local mtu=$(min_max_mtu_get_if ${NETIFS[p1]} $min_max)
+	local i
+
+	for ((i = 2; i <= NUM_NETIFS; ++i)); do
+		local current_mtu=$(min_max_mtu_get_if ${NETIFS[p$i]} $min_max)
+
+		if [ $current_mtu -ne $mtu ]; then
+			return 1
+		fi
+	done
+}
+
+mtu_set_if()
+{
+	local dev=$1; shift
+	local mtu=$1; shift
+	local should_fail=${1:-0}; shift
+
+	mtu_set $dev $mtu 2>/dev/null
+	check_err_fail $should_fail $? "Set MTU $mtu for $dev"
+}
+
+mtu_set_all_if()
+{
+	local mtu=$1; shift
+	local i
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		mtu_set_if ${NETIFS[p$i]} $mtu
+		mtu_set_if ${NETIFS[p$i]}.10 $mtu
+	done
+}
+
+mtu_restore_all_if()
+{
+	local i
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		mtu_restore ${NETIFS[p$i]}.10
+		mtu_restore ${NETIFS[p$i]}
+	done
+}
+
+mtu_test_ping4()
+{
+	local mtu=$1; shift
+	local should_fail=$1; shift
+
+	# Ping adds 8 bytes for ICMP header and 20 bytes for IP header
+	local ping_headers_len=$((20 + 8))
+	local pkt_size=$((mtu - ping_headers_len))
+
+	ping_do $h1.10 192.0.2.1 "-s $pkt_size -M do"
+	check_err_fail $should_fail $? "Ping, packet size: $pkt_size"
+}
+
+mtu_test_ping6()
+{
+	local mtu=$1; shift
+	local should_fail=$1; shift
+
+	# Ping adds 8 bytes for ICMP header and 40 bytes for IPv6 header
+	local ping6_headers_len=$((40 + 8))
+	local pkt_size=$((mtu - ping6_headers_len))
+
+	ping6_do $h1.10 2001:db8:1::1 "-s $pkt_size -M do"
+	check_err_fail $should_fail $? "Ping6, packet size: $pkt_size"
+}
+
+max_mtu_config_test()
+{
+	local i
+
+	RET=0
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		local dev=${NETIFS[p$i]}
+		local max_mtu=$(min_max_mtu_get_if $dev "max_mtu")
+		local should_fail
+
+		should_fail=0
+		mtu_set_if $dev $max_mtu $should_fail
+		mtu_restore $dev
+
+		should_fail=1
+		mtu_set_if $dev $((max_mtu + 1)) $should_fail
+		mtu_restore $dev
+	done
+
+	log_test "Test maximum MTU configuration"
+}
+
+max_mtu_traffic_test()
+{
+	local should_fail
+	local max_mtu
+
+	RET=0
+
+	if ! ensure_compatible_min_max_mtu "max_mtu"; then
+		log_test_xfail "Topology has incompatible maximum MTU values"
+		return
+	fi
+
+	max_mtu=$(min_max_mtu_get_if ${NETIFS[p1]} "max_mtu")
+
+	should_fail=0
+	mtu_set_all_if $max_mtu
+	mtu_test_ping4 $max_mtu $should_fail
+	mtu_test_ping6 $max_mtu $should_fail
+	mtu_restore_all_if
+
+	should_fail=1
+	mtu_set_all_if $((max_mtu - 1))
+	mtu_test_ping4 $max_mtu $should_fail
+	mtu_test_ping6 $max_mtu $should_fail
+	mtu_restore_all_if
+
+	log_test "Test traffic, packet size is maximum MTU"
+}
+
+min_mtu_config_test()
+{
+	local i
+
+	RET=0
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		local dev=${NETIFS[p$i]}
+		local min_mtu=$(min_max_mtu_get_if $dev "min_mtu")
+		local should_fail
+
+		should_fail=0
+		mtu_set_if $dev $min_mtu $should_fail
+		mtu_restore $dev
+
+		should_fail=1
+		mtu_set_if $dev $((min_mtu - 1)) $should_fail
+		mtu_restore $dev
+	done
+
+	log_test "Test minimum MTU configuration"
+}
+
+min_mtu_traffic_test()
+{
+	local should_fail=0
+	local min_mtu
+
+	RET=0
+
+	if ! ensure_compatible_min_max_mtu "min_mtu"; then
+		log_test_xfail "Topology has incompatible minimum MTU values"
+		return
+	fi
+
+	min_mtu=$(min_max_mtu_get_if ${NETIFS[p1]} "min_mtu")
+	mtu_set_all_if $min_mtu
+	mtu_test_ping4 $min_mtu $should_fail
+	# Do not test minimum MTU with IPv6, as IPv6 requires higher MTU.
+
+	mtu_restore_all_if
+
+	log_test "Test traffic, packet size is minimum MTU"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh
index 0266443601bc..921c733ee04f 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh
@@ -74,7 +74,7 @@ test_span_gre_mac()
 
 	RET=0
 
-	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+	mirror_install $swp1 $direction $tundev "matchall"
 	icmp_capture_install h3-${tundev} "src_mac $src_mac dst_mac $dst_mac"
 
 	mirror_test v$h1 192.0.2.1 192.0.2.2 h3-${tundev} 100 10
@@ -82,29 +82,29 @@ test_span_gre_mac()
 	icmp_capture_uninstall h3-${tundev}
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction $what: envelope MAC ($tcflags)"
+	log_test "$direction $what: envelope MAC"
 }
 
 test_two_spans()
 {
 	RET=0
 
-	mirror_install $swp1 ingress gt4 "matchall $tcflags"
-	mirror_install $swp1 egress gt6 "matchall $tcflags"
-	quick_test_span_gre_dir gt4 ingress
-	quick_test_span_gre_dir gt6 egress
+	mirror_install $swp1 ingress gt4 "matchall"
+	mirror_install $swp1 egress gt6 "matchall"
+	quick_test_span_gre_dir gt4 8 0
+	quick_test_span_gre_dir gt6 0 8
 
 	mirror_uninstall $swp1 ingress
-	fail_test_span_gre_dir gt4 ingress
-	quick_test_span_gre_dir gt6 egress
+	fail_test_span_gre_dir gt4 8 0
+	quick_test_span_gre_dir gt6 0 8
 
-	mirror_install $swp1 ingress gt4 "matchall $tcflags"
+	mirror_install $swp1 ingress gt4 "matchall"
 	mirror_uninstall $swp1 egress
-	quick_test_span_gre_dir gt4 ingress
-	fail_test_span_gre_dir gt6 egress
+	quick_test_span_gre_dir gt4 8 0
+	fail_test_span_gre_dir gt6 0 8
 
 	mirror_uninstall $swp1 ingress
-	log_test "two simultaneously configured mirrors ($tcflags)"
+	log_test "two simultaneously configured mirrors"
 }
 
 test_gretap()
@@ -131,30 +131,11 @@ test_ip6gretap_mac()
 	test_span_gre_mac gt6 egress "mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
index 6c257ec03756..e3cd48e18eeb 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
@@ -196,32 +196,11 @@ test_ip6gretap()
 	full_test_span_gre_dir gt6 egress  0 8 "mirror to ip6gretap w/ UL"
 }
 
-test_all()
-{
-	RET=0
-
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh
index 04fd14b0a9b7..6c7bd33332c2 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh
@@ -108,30 +108,11 @@ test_ip6gretap()
 	full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
index f35313c76fac..909ec956a5e5 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
@@ -104,30 +104,11 @@ test_ip6gretap_stp()
 	full_test_span_gre_stp gt6 $swp3.555 "mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh
index 0cf4c47a46f9..40ac9dd3aff1 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh
@@ -104,30 +104,11 @@ test_ip6gretap()
 	full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
 }
 
-tests()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-tests
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	tests
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh
index c53148b1dc63..a20d22d1df36 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh
@@ -49,6 +49,7 @@ ALL_TESTS="
 	test_mirror_gretap_second
 "
 
+REQUIRE_TEAMD="yes"
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -227,10 +228,10 @@ test_lag_slave()
 	RET=0
 
 	tc filter add dev $swp1 ingress pref 999 \
-		proto 802.1q flower vlan_ethtype arp $tcflags \
+		proto 802.1q flower vlan_ethtype arp \
 		action pass
 	mirror_install $swp1 ingress gt4 \
-		"proto 802.1q flower vlan_id 333 $tcflags"
+		"proto 802.1q flower vlan_id 333"
 
 	# Test connectivity through $up_dev when $down_dev is set down.
 	ip link set dev $down_dev down
@@ -239,7 +240,7 @@ test_lag_slave()
 	setup_wait_dev $host_dev
 	$ARPING -I br1 192.0.2.130 -qfc 1
 	sleep 2
-	mirror_test vrf-h1 192.0.2.1 192.0.2.18 $host_dev 1 10
+	mirror_test vrf-h1 192.0.2.1 192.0.2.18 $host_dev 1 ">= 10"
 
 	# Test lack of connectivity when both slaves are down.
 	ip link set dev $up_dev down
@@ -252,7 +253,7 @@ test_lag_slave()
 	mirror_uninstall $swp1 ingress
 	tc filter del dev $swp1 ingress pref 999
 
-	log_test "$what ($tcflags)"
+	log_test "$what"
 }
 
 test_mirror_gretap_first()
@@ -265,30 +266,11 @@ test_mirror_gretap_second()
 	test_lag_slave $h4 $swp4 $swp3 "mirror to gretap: LAG second slave"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index 5ea9d63915f7..65ae9d960c18 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -73,7 +73,7 @@ test_span_gre_ttl()
 	RET=0
 
 	mirror_install $swp1 ingress $tundev \
-		"prot ip flower $tcflags ip_prot icmp"
+		"prot ip flower ip_prot icmp"
 	tc filter add dev $h3 ingress pref 77 prot $prot \
 		flower skip_hw ip_ttl 50 action pass
 
@@ -81,13 +81,13 @@ test_span_gre_ttl()
 
 	ip link set dev $tundev type $type ttl 50
 	sleep 2
-	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10
+	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 ">= 10"
 
 	ip link set dev $tundev type $type ttl 100
 	tc filter del dev $h3 ingress pref 77
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: TTL change ($tcflags)"
+	log_test "$what: TTL change"
 }
 
 test_span_gre_tun_up()
@@ -98,15 +98,15 @@ test_span_gre_tun_up()
 	RET=0
 
 	ip link set dev $tundev down
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	fail_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	fail_test_span_gre_dir $tundev
 
 	ip link set dev $tundev up
 
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: tunnel down/up ($tcflags)"
+	log_test "$what: tunnel down/up"
 }
 
 test_span_gre_egress_up()
@@ -118,8 +118,8 @@ test_span_gre_egress_up()
 	RET=0
 
 	ip link set dev $swp3 down
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	fail_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	fail_test_span_gre_dir $tundev
 
 	# After setting the device up, wait for neighbor to get resolved so that
 	# we can expect mirroring to work.
@@ -127,10 +127,10 @@ test_span_gre_egress_up()
 	setup_wait_dev $swp3
 	ping -c 1 -I $swp3 $remote_ip &>/dev/null
 
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: egress down/up ($tcflags)"
+	log_test "$what: egress down/up"
 }
 
 test_span_gre_remote_ip()
@@ -144,14 +144,14 @@ test_span_gre_remote_ip()
 	RET=0
 
 	ip link set dev $tundev type $type remote $wrong_ip
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	fail_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	fail_test_span_gre_dir $tundev
 
 	ip link set dev $tundev type $type remote $correct_ip
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: remote address change ($tcflags)"
+	log_test "$what: remote address change"
 }
 
 test_span_gre_tun_del()
@@ -165,10 +165,10 @@ test_span_gre_tun_del()
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 	ip link del dev $tundev
-	fail_test_span_gre_dir $tundev ingress
+	fail_test_span_gre_dir $tundev
 
 	tunnel_create $tundev $type $local_ip $remote_ip \
 		      ttl 100 tos inherit $flags
@@ -176,11 +176,11 @@ test_span_gre_tun_del()
 	# Recreating the tunnel doesn't reestablish mirroring, so reinstall it
 	# and verify it works for the follow-up tests.
 	mirror_uninstall $swp1 ingress
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: tunnel deleted ($tcflags)"
+	log_test "$what: tunnel deleted"
 }
 
 test_span_gre_route_del()
@@ -192,18 +192,18 @@ test_span_gre_route_del()
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 
 	ip route del $route dev $edev
-	fail_test_span_gre_dir $tundev ingress
+	fail_test_span_gre_dir $tundev
 
 	ip route add $route dev $edev
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: underlay route removal ($tcflags)"
+	log_test "$what: underlay route removal"
 }
 
 test_ttl()
@@ -244,30 +244,11 @@ test_route_del()
 	test_span_gre_route_del gt6 $swp3 2001:db8:2::/64 "mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
index 09389f3b9369..3a84f3ab5856 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
@@ -64,12 +64,19 @@ cleanup()
 
 test_span_gre_dir_acl()
 {
-	test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+	local tundev=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+
+	test_span_gre_dir_ips "$tundev" "$forward_type" \
+			      "$backward_type" 192.0.2.3 192.0.2.4
 }
 
 fail_test_span_gre_dir_acl()
 {
-	fail_test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+	local tundev=$1; shift
+
+	fail_test_span_gre_dir_ips "$tundev" 192.0.2.3 192.0.2.4
 }
 
 full_test_span_gre_dir_acl()
@@ -84,16 +91,15 @@ full_test_span_gre_dir_acl()
 	RET=0
 
 	mirror_install $swp1 $direction $tundev \
-		       "protocol ip flower $tcflags dst_ip $match_dip"
-	fail_test_span_gre_dir $tundev $direction
-	test_span_gre_dir_acl "$tundev" "$direction" \
-			  "$forward_type" "$backward_type"
+		       "protocol ip flower dst_ip $match_dip"
+	fail_test_span_gre_dir $tundev
+	test_span_gre_dir_acl "$tundev" "$forward_type" "$backward_type"
 	mirror_uninstall $swp1 $direction
 
 	# Test lack of mirroring after ACL mirror is uninstalled.
-	fail_test_span_gre_dir_acl "$tundev" "$direction"
+	fail_test_span_gre_dir_acl "$tundev"
 
-	log_test "$direction $what ($tcflags)"
+	log_test "$direction $what"
 }
 
 test_gretap()
@@ -108,30 +114,11 @@ test_ip6gretap()
 	full_test_span_gre_dir_acl gt6 egress 0 8 192.0.2.3 "ACL mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh
index 9edf4cb104a8..ff7049582d35 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh
@@ -37,8 +37,14 @@
 # |    \                                              /                       |
 # |     \____________________________________________/                        |
 # |                           |                                               |
-# |                           + lag2 (team)                                   |
-# |                             192.0.2.130/28                                |
+# |                           + lag2 (team)  ------>   + gt4-dst (gretap)     |
+# |                             192.0.2.130/28           loc=192.0.2.130      |
+# |                                                      rem=192.0.2.129      |
+# |                                                      ttl=100              |
+# |                                                      tos=inherit          |
+# |                                                                           |
+# |                                                                           |
+# |                                                                           |
 # |                                                                           |
 # +---------------------------------------------------------------------------+
 
@@ -47,12 +53,10 @@ ALL_TESTS="
 	test_mirror_gretap_second
 "
 
+REQUIRE_TEAMD="yes"
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
-source mirror_gre_lib.sh
-
-require_command $ARPING
 
 vlan_host_create()
 {
@@ -122,16 +126,21 @@ h3_create()
 {
 	vrf_create vrf-h3
 	ip link set dev vrf-h3 up
-	tc qdisc add dev $h3 clsact
-	tc qdisc add dev $h4 clsact
 	h3_create_team
+
+	tunnel_create gt4-dst gretap 192.0.2.130 192.0.2.129 \
+		      ttl 100 tos inherit
+	ip link set dev gt4-dst master vrf-h3
+	tc qdisc add dev gt4-dst clsact
 }
 
 h3_destroy()
 {
+	tc qdisc del dev gt4-dst clsact
+	ip link set dev gt4-dst nomaster
+	tunnel_destroy gt4-dst
+
 	h3_destroy_team
-	tc qdisc del dev $h4 clsact
-	tc qdisc del dev $h3 clsact
 	ip link set dev vrf-h3 down
 	vrf_destroy vrf-h3
 }
@@ -188,18 +197,12 @@ setup_prepare()
 	h2_create
 	h3_create
 	switch_create
-
-	trap_install $h3 ingress
-	trap_install $h4 ingress
 }
 
 cleanup()
 {
 	pre_cleanup
 
-	trap_uninstall $h4 ingress
-	trap_uninstall $h3 ingress
-
 	switch_destroy
 	h3_destroy
 	h2_destroy
@@ -218,7 +221,8 @@ test_lag_slave()
 	RET=0
 
 	mirror_install $swp1 ingress gt4 \
-		       "proto 802.1q flower vlan_id 333 $tcflags"
+		       "proto 802.1q flower vlan_id 333"
+	vlan_capture_install gt4-dst "vlan_ethtype ipv4 ip_proto icmp type 8"
 
 	# Move $down_dev away from the team. That will prompt change in
 	# txability of the connected device, without changing its upness. The
@@ -226,13 +230,14 @@ test_lag_slave()
 	# other slave.
 	ip link set dev $down_dev nomaster
 	sleep 2
-	mirror_test vrf-h1 192.0.2.1 192.0.2.18 $up_dev 1 10
+	mirror_test vrf-h1 192.0.2.1 192.0.2.18 gt4-dst 100 10
 
 	# Test lack of connectivity when neither slave is txable.
 	ip link set dev $up_dev nomaster
 	sleep 2
-	mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h3 1 0
-	mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h4 1 0
+	mirror_test vrf-h1 192.0.2.1 192.0.2.18 gt4-dst 100 0
+
+	vlan_capture_uninstall gt4-dst
 	mirror_uninstall $swp1 ingress
 
 	# Recreate H3's team device, because mlxsw, which this test is
@@ -243,7 +248,7 @@ test_lag_slave()
 	# Wait for ${h,swp}{3,4}.
 	setup_wait
 
-	log_test "$what ($tcflags)"
+	log_test "$what"
 }
 
 test_mirror_gretap_first()
@@ -256,30 +261,11 @@ test_mirror_gretap_second()
 	test_lag_slave $h4 $h3 "mirror to gretap: LAG second slave"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
index 0c36546e131e..20078cc55f24 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -5,22 +5,34 @@ source "$net_forwarding_dir/mirror_lib.sh"
 quick_test_span_gre_dir_ips()
 {
 	local tundev=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
 
-	do_test_span_dir_ips 10 h3-$tundev "$@"
+	do_test_span_dir_ips 10 h3-$tundev "$ip1" "$ip2" \
+			     "$forward_type" "$backward_type"
 }
 
 fail_test_span_gre_dir_ips()
 {
 	local tundev=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
 
-	do_test_span_dir_ips 0 h3-$tundev "$@"
+	do_test_span_dir_ips 0 h3-$tundev "$ip1" "$ip2"
 }
 
 test_span_gre_dir_ips()
 {
 	local tundev=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
 
-	test_span_dir_ips h3-$tundev "$@"
+	test_span_dir_ips h3-$tundev "$forward_type" \
+			  "$backward_type" "$ip1" "$ip2"
 }
 
 full_test_span_gre_dir_ips()
@@ -35,12 +47,12 @@ full_test_span_gre_dir_ips()
 
 	RET=0
 
-	mirror_install $swp1 $direction $tundev "matchall $tcflags"
-	test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \
+	mirror_install $swp1 $direction $tundev "matchall"
+	test_span_dir_ips "h3-$tundev" "$forward_type" \
 			  "$backward_type" "$ip1" "$ip2"
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction $what ($tcflags)"
+	log_test "$direction $what"
 }
 
 full_test_span_gre_dir_vlan_ips()
@@ -56,45 +68,63 @@ full_test_span_gre_dir_vlan_ips()
 
 	RET=0
 
-	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+	mirror_install $swp1 $direction $tundev "matchall"
 
-	test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \
+	test_span_dir_ips "h3-$tundev" "$forward_type" \
 			  "$backward_type" "$ip1" "$ip2"
 
 	tc filter add dev $h3 ingress pref 77 prot 802.1q \
 		flower $vlan_match \
 		action pass
-	mirror_test v$h1 $ip1 $ip2 $h3 77 10
+	mirror_test v$h1 $ip1 $ip2 $h3 77 '>= 10'
 	tc filter del dev $h3 ingress pref 77
 
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction $what ($tcflags)"
+	log_test "$direction $what"
 }
 
 quick_test_span_gre_dir()
 {
-	quick_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+	local tundev=$1; shift
+	local forward_type=${1-8}; shift
+	local backward_type=${1-0}; shift
+
+	quick_test_span_gre_dir_ips "$tundev" 192.0.2.1 192.0.2.2 \
+				    "$forward_type" "$backward_type"
 }
 
 fail_test_span_gre_dir()
 {
-	fail_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
-}
+	local tundev=$1; shift
 
-test_span_gre_dir()
-{
-	test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+	fail_test_span_gre_dir_ips "$tundev" 192.0.2.1 192.0.2.2
 }
 
 full_test_span_gre_dir()
 {
-	full_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+	local tundev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+
+	full_test_span_gre_dir_ips "$tundev" "$direction" "$forward_type" \
+				   "$backward_type" "$what" 192.0.2.1 192.0.2.2
 }
 
 full_test_span_gre_dir_vlan()
 {
-	full_test_span_gre_dir_vlan_ips "$@" 192.0.2.1 192.0.2.2
+	local tundev=$1; shift
+	local direction=$1; shift
+	local vlan_match=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+
+	full_test_span_gre_dir_vlan_ips "$tundev" "$direction" "$vlan_match" \
+					"$forward_type" "$backward_type" \
+					"$what" 192.0.2.1 192.0.2.2
 }
 
 full_test_span_gre_stp_ips()
@@ -104,27 +134,39 @@ full_test_span_gre_stp_ips()
 	local what=$1; shift
 	local ip1=$1; shift
 	local ip2=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
 	local h3mac=$(mac_get $h3)
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir_ips $tundev $ip1 $ip2 \
+				    "$forward_type" "$backward_type"
 
 	bridge link set dev $nbpdev state disabled
 	sleep 1
-	fail_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+	fail_test_span_gre_dir_ips $tundev $ip1 $ip2
 
 	bridge link set dev $nbpdev state forwarding
 	sleep 1
-	quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+	quick_test_span_gre_dir_ips $tundev $ip1 $ip2 \
+				    "$forward_type" "$backward_type"
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: STP state ($tcflags)"
+	log_test "$what: STP state"
 }
 
 full_test_span_gre_stp()
 {
-	full_test_span_gre_stp_ips "$@" 192.0.2.1 192.0.2.2
+	local tundev=$1; shift
+	local nbpdev=$1; shift
+	local what=$1; shift
+	local forward_type=${1-8}; shift
+	local backward_type=${1-0}; shift
+
+	full_test_span_gre_stp_ips "$tundev" "$nbpdev" "$what" \
+				   192.0.2.1 192.0.2.2 \
+				   "$forward_type" "$backward_type"
 }
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
index fc0508e40fca..2cbfbecf25c8 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
@@ -60,41 +60,32 @@ test_span_gre_neigh()
 	local addr=$1; shift
 	local tundev=$1; shift
 	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
 	local what=$1; shift
 
 	RET=0
 
 	ip neigh replace dev $swp3 $addr lladdr 00:11:22:33:44:55
-	mirror_install $swp1 $direction $tundev "matchall $tcflags"
-	fail_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 $direction $tundev "matchall"
+	fail_test_span_gre_dir $tundev "$forward_type" "$backward_type"
 	ip neigh del dev $swp3 $addr
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev "$forward_type" "$backward_type"
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction $what: neighbor change ($tcflags)"
+	log_test "$direction $what: neighbor change"
 }
 
 test_gretap()
 {
-	test_span_gre_neigh 192.0.2.130 gt4 ingress "mirror to gretap"
-	test_span_gre_neigh 192.0.2.130 gt4 egress "mirror to gretap"
+	test_span_gre_neigh 192.0.2.130 gt4 ingress 8 0 "mirror to gretap"
+	test_span_gre_neigh 192.0.2.130 gt4 egress 0 8 "mirror to gretap"
 }
 
 test_ip6gretap()
 {
-	test_span_gre_neigh 2001:db8:2::2 gt6 ingress "mirror to ip6gretap"
-	test_span_gre_neigh 2001:db8:2::2 gt6 egress "mirror to ip6gretap"
-}
-
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
+	test_span_gre_neigh 2001:db8:2::2 gt6 ingress 8 0 "mirror to ip6gretap"
+	test_span_gre_neigh 2001:db8:2::2 gt6 egress 0 8 "mirror to ip6gretap"
 }
 
 trap cleanup EXIT
@@ -102,14 +93,6 @@ trap cleanup EXIT
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
index 6f9ef1820e93..34bc646938e3 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
@@ -75,42 +75,31 @@ cleanup()
 test_gretap()
 {
 	RET=0
-	mirror_install $swp1 ingress gt4 "matchall $tcflags"
+	mirror_install $swp1 ingress gt4 "matchall"
 
 	# For IPv4, test that there's no mirroring without the route directing
 	# the traffic to tunnel remote address. Then add it and test that
 	# mirroring starts. For IPv6 we can't test this due to the limitation
 	# that routes for locally-specified IPv6 addresses can't be added.
-	fail_test_span_gre_dir gt4 ingress
+	fail_test_span_gre_dir gt4
 
 	ip route add 192.0.2.130/32 via 192.0.2.162
-	quick_test_span_gre_dir gt4 ingress
+	quick_test_span_gre_dir gt4
 	ip route del 192.0.2.130/32 via 192.0.2.162
 
 	mirror_uninstall $swp1 ingress
-	log_test "mirror to gre with next-hop remote ($tcflags)"
+	log_test "mirror to gre with next-hop remote"
 }
 
 test_ip6gretap()
 {
 	RET=0
 
-	mirror_install $swp1 ingress gt6 "matchall $tcflags"
-	quick_test_span_gre_dir gt6 ingress
+	mirror_install $swp1 ingress gt6 "matchall"
+	quick_test_span_gre_dir gt6
 	mirror_uninstall $swp1 ingress
 
-	log_test "mirror to ip6gre with next-hop remote ($tcflags)"
-}
-
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
+	log_test "mirror to ip6gre with next-hop remote"
 }
 
 trap cleanup EXIT
@@ -118,14 +107,6 @@ trap cleanup EXIT
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
index 88cecdb9a861..63689928cb51 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
@@ -63,30 +63,11 @@ test_gretap()
 	full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
index c8a9b5bd841f..1b902cc579f6 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -153,21 +153,21 @@ test_span_gre_forbidden_cpu()
 	RET=0
 
 	# Run the pass-test first, to prime neighbor table.
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 
 	# Now forbid the VLAN at the bridge and see it fail.
 	bridge vlan del dev br1 vid 555 self
 	sleep 1
-	fail_test_span_gre_dir $tundev ingress
+	fail_test_span_gre_dir $tundev
 
 	bridge vlan add dev br1 vid 555 self
 	sleep 1
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: vlan forbidden at a bridge ($tcflags)"
+	log_test "$what: vlan forbidden at a bridge"
 }
 
 test_gretap_forbidden_cpu()
@@ -187,22 +187,22 @@ test_span_gre_forbidden_egress()
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 
 	bridge vlan del dev $swp3 vid 555
 	sleep 1
-	fail_test_span_gre_dir $tundev ingress
+	fail_test_span_gre_dir $tundev
 
 	bridge vlan add dev $swp3 vid 555
 	# Re-prime FDB
 	$ARPING -I br1.555 192.0.2.130 -fqc 1
 	sleep 1
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: vlan forbidden at a bridge egress ($tcflags)"
+	log_test "$what: vlan forbidden at a bridge egress"
 }
 
 test_gretap_forbidden_egress()
@@ -223,30 +223,30 @@ test_span_gre_untagged_egress()
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	mirror_install $swp1 ingress $tundev "matchall"
 
-	quick_test_span_gre_dir $tundev ingress
-	quick_test_span_vlan_dir $h3 555 ingress "$ul_proto"
+	quick_test_span_gre_dir $tundev
+	quick_test_span_vlan_dir $h3 555 "$ul_proto"
 
 	h3_addr_add_del del $h3.555
 	bridge vlan add dev $swp3 vid 555 pvid untagged
 	h3_addr_add_del add $h3
 	sleep 5
 
-	quick_test_span_gre_dir $tundev ingress
-	fail_test_span_vlan_dir $h3 555 ingress "$ul_proto"
+	quick_test_span_gre_dir $tundev
+	fail_test_span_vlan_dir $h3 555 "$ul_proto"
 
 	h3_addr_add_del del $h3
 	bridge vlan add dev $swp3 vid 555
 	h3_addr_add_del add $h3.555
 	sleep 5
 
-	quick_test_span_gre_dir $tundev ingress
-	quick_test_span_vlan_dir $h3 555 ingress "$ul_proto"
+	quick_test_span_gre_dir $tundev
+	quick_test_span_vlan_dir $h3 555 "$ul_proto"
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: vlan untagged at a bridge egress ($tcflags)"
+	log_test "$what: vlan untagged at a bridge egress"
 }
 
 test_gretap_untagged_egress()
@@ -267,19 +267,19 @@ test_span_gre_fdb_roaming()
 
 	RET=0
 
-	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	quick_test_span_gre_dir $tundev ingress
+	mirror_install $swp1 ingress $tundev "matchall"
+	quick_test_span_gre_dir $tundev
 
 	while ((RET == 0)); do
 		bridge fdb del dev $swp3 $h3mac vlan 555 master 2>/dev/null
 		bridge fdb add dev $swp2 $h3mac vlan 555 master static
 		sleep 1
-		fail_test_span_gre_dir $tundev ingress
+		fail_test_span_gre_dir $tundev
 
 		if ! bridge fdb sh dev $swp2 vlan 555 master \
 		    | grep -q $h3mac; then
 			printf "TEST: %-60s  [RETRY]\n" \
-				"$what: MAC roaming ($tcflags)"
+				"$what: MAC roaming"
 			# ARP or ND probably reprimed the FDB while the test
 			# was running. We would get a spurious failure.
 			RET=0
@@ -292,11 +292,11 @@ test_span_gre_fdb_roaming()
 	# Re-prime FDB
 	$ARPING -I br1.555 192.0.2.130 -fqc 1
 	sleep 1
-	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_gre_dir $tundev
 
 	mirror_uninstall $swp1 ingress
 
-	log_test "$what: MAC roaming ($tcflags)"
+	log_test "$what: MAC roaming"
 }
 
 test_gretap_fdb_roaming()
@@ -319,30 +319,11 @@ test_ip6gretap_stp()
 	full_test_span_gre_stp gt6 $swp3 "mirror to ip6gretap"
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
-	tests_run
-
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
index 3e8ebeff3019..6bf9d5ae933c 100644
--- a/tools/testing/selftests/net/forwarding/mirror_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -44,14 +44,17 @@ mirror_test()
 		local type="icmp echoreq"
 	fi
 
+	if [[ -z ${expect//[[:digit:]]/} ]]; then
+		expect="== $expect"
+	fi
+
 	local t0=$(tc_rule_stats_get $dev $pref)
 	$MZ $proto $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \
 	    -c 10 -d 100msec -t $type
 	sleep 0.5
 	local t1=$(tc_rule_stats_get $dev $pref)
 	local delta=$((t1 - t0))
-	# Tolerate a couple stray extra packets.
-	((expect <= delta && delta <= expect + 2))
+	((delta $expect))
 	check_err $? "Expected to capture $expect packets, got $delta."
 }
 
@@ -59,36 +62,42 @@ do_test_span_dir_ips()
 {
 	local expect=$1; shift
 	local dev=$1; shift
-	local direction=$1; shift
 	local ip1=$1; shift
 	local ip2=$1; shift
+	local forward_type=${1-8}; shift
+	local backward_type=${1-0}; shift
 
-	icmp_capture_install $dev
+	icmp_capture_install $dev "type $forward_type"
 	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+	icmp_capture_uninstall $dev
+
+	icmp_capture_install $dev "type $backward_type"
 	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
 	icmp_capture_uninstall $dev
 }
 
 quick_test_span_dir_ips()
 {
-	do_test_span_dir_ips 10 "$@"
-}
+	local dev=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local forward_type=${1-8}; shift
+	local backward_type=${1-0}; shift
 
-fail_test_span_dir_ips()
-{
-	do_test_span_dir_ips 0 "$@"
+	do_test_span_dir_ips 10 "$dev" "$ip1" "$ip2" \
+			     "$forward_type" "$backward_type"
 }
 
 test_span_dir_ips()
 {
 	local dev=$1; shift
-	local direction=$1; shift
 	local forward_type=$1; shift
 	local backward_type=$1; shift
 	local ip1=$1; shift
 	local ip2=$1; shift
 
-	quick_test_span_dir_ips "$dev" "$direction" "$ip1" "$ip2"
+	quick_test_span_dir_ips "$dev" "$ip1" "$ip2" \
+				"$forward_type" "$backward_type"
 
 	icmp_capture_install $dev "type $forward_type"
 	mirror_test v$h1 $ip1 $ip2 $dev 100 10
@@ -99,14 +108,14 @@ test_span_dir_ips()
 	icmp_capture_uninstall $dev
 }
 
-fail_test_span_dir()
-{
-	fail_test_span_dir_ips "$@" 192.0.2.1 192.0.2.2
-}
-
 test_span_dir()
 {
-	test_span_dir_ips "$@" 192.0.2.1 192.0.2.2
+	local dev=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+
+	test_span_dir_ips "$dev" "$forward_type" "$backward_type" \
+			  192.0.2.1 192.0.2.2
 }
 
 do_test_span_vlan_dir_ips()
@@ -114,7 +123,6 @@ do_test_span_vlan_dir_ips()
 	local expect=$1; shift
 	local dev=$1; shift
 	local vid=$1; shift
-	local direction=$1; shift
 	local ul_proto=$1; shift
 	local ip1=$1; shift
 	local ip2=$1; shift
@@ -123,27 +131,50 @@ do_test_span_vlan_dir_ips()
 	# The traffic is meant for local box anyway, so will be trapped to
 	# kernel.
 	vlan_capture_install $dev "skip_hw vlan_id $vid vlan_ethtype $ul_proto"
-	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
-	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+	mirror_test v$h1 $ip1 $ip2 $dev 100 "$expect"
+	mirror_test v$h2 $ip2 $ip1 $dev 100 "$expect"
 	vlan_capture_uninstall $dev
 }
 
 quick_test_span_vlan_dir_ips()
 {
-	do_test_span_vlan_dir_ips 10 "$@"
+	local dev=$1; shift
+	local vid=$1; shift
+	local ul_proto=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	do_test_span_vlan_dir_ips '>= 10' "$dev" "$vid" "$ul_proto" \
+				  "$ip1" "$ip2"
 }
 
 fail_test_span_vlan_dir_ips()
 {
-	do_test_span_vlan_dir_ips 0 "$@"
+	local dev=$1; shift
+	local vid=$1; shift
+	local ul_proto=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	do_test_span_vlan_dir_ips 0 "$dev" "$vid" "$ul_proto" "$ip1" "$ip2"
 }
 
 quick_test_span_vlan_dir()
 {
-	quick_test_span_vlan_dir_ips "$@" 192.0.2.1 192.0.2.2
+	local dev=$1; shift
+	local vid=$1; shift
+	local ul_proto=$1; shift
+
+	quick_test_span_vlan_dir_ips "$dev" "$vid" "$ul_proto" \
+				     192.0.2.1 192.0.2.2
 }
 
 fail_test_span_vlan_dir()
 {
-	fail_test_span_vlan_dir_ips "$@" 192.0.2.1 192.0.2.2
+	local dev=$1; shift
+	local vid=$1; shift
+	local ul_proto=$1; shift
+
+	fail_test_span_vlan_dir_ips "$dev" "$vid" "$ul_proto" \
+				    192.0.2.1 192.0.2.2
 }
diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
index 0b44e148235e..2f150a414d38 100755
--- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -40,12 +40,16 @@ setup_prepare()
 
 	vlan_create $h2 111 v$h2 192.0.2.18/28
 	bridge vlan add dev $swp2 vid 111
+
+	trap_install $h3 ingress
 }
 
 cleanup()
 {
 	pre_cleanup
 
+	trap_uninstall $h3 ingress
+
 	vlan_destroy $h2 111
 	vlan_destroy $h1 111
 	vlan_destroy $h3 555
@@ -63,11 +67,11 @@ test_vlan_dir()
 
 	RET=0
 
-	mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
-	test_span_dir "$h3.555" "$direction" "$forward_type" "$backward_type"
+	mirror_install $swp1 $direction $swp3.555 "matchall"
+	test_span_dir "$h3.555" "$forward_type" "$backward_type"
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction mirror to vlan ($tcflags)"
+	log_test "$direction mirror to vlan"
 }
 
 test_vlan()
@@ -84,14 +88,12 @@ test_tagged_vlan_dir()
 
 	RET=0
 
-	mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
-	do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" ip \
-				  192.0.2.17 192.0.2.18
-	do_test_span_vlan_dir_ips  0 "$h3.555" 555 "$direction" ip \
-				  192.0.2.17 192.0.2.18
+	mirror_install $swp1 $direction $swp3.555 "matchall"
+	do_test_span_vlan_dir_ips '>= 10' "$h3.555" 111 ip 192.0.2.17 192.0.2.18
+	do_test_span_vlan_dir_ips  0 "$h3.555" 555 ip 192.0.2.17 192.0.2.18
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction mirror tagged to vlan ($tcflags)"
+	log_test "$direction mirror tagged to vlan"
 }
 
 test_tagged_vlan()
@@ -100,32 +102,11 @@ test_tagged_vlan()
 	test_tagged_vlan_dir egress 0 8
 }
 
-test_all()
-{
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-	trap_install $h3 ingress
-
-	tests_run
-
-	trap_uninstall $h3 ingress
-	slow_path_trap_uninstall $swp1 egress
-	slow_path_trap_uninstall $swp1 ingress
-}
-
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-tcflags="skip_hw"
-test_all
-
-if ! tc_offload_check; then
-	echo "WARN: Could not test offloaded functionality"
-else
-	tcflags="skip_sw"
-	test_all
-fi
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh
index af3b398d13f0..694ece9ba3a7 100755
--- a/tools/testing/selftests/net/forwarding/no_forwarding.sh
+++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh
@@ -202,7 +202,7 @@ one_bridge_two_pvids()
 	ip link set $swp2 master br0
 
 	bridge vlan add dev $swp1 vid 1 pvid untagged
-	bridge vlan add dev $swp1 vid 2 pvid untagged
+	bridge vlan add dev $swp2 vid 2 pvid untagged
 
 	run_test "Switch ports in VLAN-aware bridge with different PVIDs"
 
@@ -233,6 +233,9 @@ cleanup()
 {
 	pre_cleanup
 
+	ip link set dev $swp2 down
+	ip link set dev $swp1 down
+
 	h2_destroy
 	h1_destroy
 
diff --git a/tools/testing/selftests/net/forwarding/router_bridge_1d_lag.sh b/tools/testing/selftests/net/forwarding/router_bridge_1d_lag.sh
index e064b946e821..16583a470ec3 100755
--- a/tools/testing/selftests/net/forwarding/router_bridge_1d_lag.sh
+++ b/tools/testing/selftests/net/forwarding/router_bridge_1d_lag.sh
@@ -109,6 +109,7 @@ ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
 "
+REQUIRE_TEAMD="yes"
 NUM_NETIFS=8
 source lib.sh
 
diff --git a/tools/testing/selftests/net/forwarding/router_bridge_lag.sh b/tools/testing/selftests/net/forwarding/router_bridge_lag.sh
index f05ffe213c46..2a4cd1af1b85 100755
--- a/tools/testing/selftests/net/forwarding/router_bridge_lag.sh
+++ b/tools/testing/selftests/net/forwarding/router_bridge_lag.sh
@@ -76,6 +76,7 @@
 	ping_ipv4
 	ping_ipv6
     "}
+REQUIRE_TEAMD="yes"
 NUM_NETIFS=8
 : ${lib_dir:=.}
 source $lib_dir/lib.sh
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
index 3f0f5dc95542..a7d8399c8d4f 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
@@ -1,10 +1,46 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+
+# |  H1                     |
+# |               $h1 +     |
+# |      192.0.2.2/24 |     |
+# |  2001:db8:1::2/64 |     |
+# +-------------------|-----+
+#                     |
+# +-------------------|----------------------+
+# |                   |                   R1 |
+# |             $rp11 +                      |
+# |      192.0.2.1/24                        |
+# |  2001:db8:1::1/64                        |
+# |                                          |
+# |  + $rp12              + $rp13            |
+# |  | 169.254.2.12/24    | 169.254.3.13/24  |
+# |  | fe80:2::12/64      | fe80:3::13/64    |
+# +--|--------------------|------------------+
+#    |                    |
+# +--|--------------------|------------------+
+# |  + $rp22              + $rp23            |
+# |    169.254.2.22/24      169.254.3.23/24  |
+# |    fe80:2::22/64        fe80:3::23/64    |
+# |                                          |
+# |             $rp21 +                      |
+# |   198.51.100.1/24 |                      |
+# |  2001:db8:2::1/64 |                   R2 |
+# +-------------------|----------------------+
+#                     |
+# +-------------------|-----+
+# |                   |     |
+# |               $h2 +     |
+# |   198.51.100.2/24       |
+# |  2001:db8:2::2/64    H2 |
+# +-------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
 	multipath_test
+	multipath16_test
 	ping_ipv4_blackhole
 	ping_ipv6_blackhole
 	nh_stats_test_v4
@@ -191,9 +227,11 @@ routing_nh_obj()
 
 multipath4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -207,7 +245,8 @@ multipath4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -223,9 +262,11 @@ multipath4_test()
 
 multipath6_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -240,7 +281,8 @@ multipath6_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -278,6 +320,23 @@ multipath_test()
 	multipath6_test "Weighted MP 11:45" 11 45
 }
 
+multipath16_test()
+{
+	check_nhgw16 104 || return
+
+	log_info "Running 16-bit IPv4 multipath tests"
+	multipath4_test "65535:65535" 65535 65535
+	multipath4_test "128:512" 128 512
+	omit_on_slow \
+		multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	log_info "Running 16-bit IPv6 multipath tests"
+	multipath6_test "65535:65535" 65535 65535
+	multipath6_test "128:512" 128 512
+	omit_on_slow \
+		multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+}
+
 ping_ipv4_blackhole()
 {
 	RET=0
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
index 7e7d62161c34..507b2852dabe 100644
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
@@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw()
 	local group_id=$1; shift
 	local mz="$@"
 
-	local used
-
 	nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \
 			 nh_stats_get "${mz[@]}"
 
-	used=$(ip -s -j -d nexthop show id $group_id |
-		   jq '.[].hw_stats.used')
-	kind=$(ip -j -d link show dev $rp11 |
-		   jq -r '.[].linkinfo.info_kind')
-	if [[ $used == true ]]; then
+	xfail_on_veth $rp11 \
 		nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \
 				 nh_stats_get_hw "${mz[@]}"
-	elif [[ $kind == veth ]]; then
-		log_test_skip "HW stats not offloaded on veth topology"
-	fi
 }
 
 nh_stats_test_dispatch()
@@ -83,7 +74,6 @@ nh_stats_test_dispatch()
 	local mz="$@"
 
 	local enabled
-	local kind
 
 	if ! ip nexthop help 2>&1 | grep -q hw_stats; then
 		log_test_skip "NH stats test: ip doesn't support HW stats"
@@ -127,3 +117,16 @@ __nh_stats_test_v6()
 			       $MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2
 	sysctl_restore net.ipv6.fib_multipath_hash_policy
 }
+
+check_nhgw16()
+{
+	local nhid=$1; shift
+
+	ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null
+	if (( $? )); then
+		log_test_skip "16-bit multipath tests" \
+			      "iproute2 or the kernel do not support 16-bit next hop weights"
+		return 1
+	fi
+	ip nexthop del id 9999 ||:
+}
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
index 4b483d24ad00..88ddae05b39d 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
@@ -1,10 +1,46 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+
+# |  H1                     |
+# |               $h1 +     |
+# |      192.0.2.2/24 |     |
+# |  2001:db8:1::2/64 |     |
+# +-------------------|-----+
+#                     |
+# +-------------------|----------------------+
+# |                   |                   R1 |
+# |             $rp11 +                      |
+# |      192.0.2.1/24                        |
+# |  2001:db8:1::1/64                        |
+# |                                          |
+# |  + $rp12              + $rp13            |
+# |  | 169.254.2.12/24    | 169.254.3.13/24  |
+# |  | fe80:2::12/64      | fe80:3::13/64    |
+# +--|--------------------|------------------+
+#    |                    |
+# +--|--------------------|------------------+
+# |  + $rp22              + $rp23            |
+# |    169.254.2.22/24      169.254.3.23/24  |
+# |    fe80:2::22/64        fe80:3::23/64    |
+# |                                          |
+# |             $rp21 +                      |
+# |   198.51.100.1/24 |                      |
+# |  2001:db8:2::1/64 |                   R2 |
+# +-------------------|----------------------+
+#                     |
+# +-------------------|-----+
+# |                   |     |
+# |               $h2 +     |
+# |   198.51.100.2/24       |
+# |  2001:db8:2::2/64    H2 |
+# +-------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
 	multipath_test
+	multipath16_test
 	nh_stats_test_v4
 	nh_stats_test_v6
 "
@@ -193,9 +229,11 @@ routing_nh_obj()
 
 multipath4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -208,7 +246,8 @@ multipath4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -223,9 +262,11 @@ multipath4_test()
 
 multipath6_l4_test()
 {
-	local desc="$1"
-	local weight_rp12=$2
-	local weight_rp13=$3
+	local desc=$1; shift
+	local weight_rp12=$1; shift
+	local weight_rp13=$1; shift
+	local ports=${1-sp=1024,dp=0-32768}; shift
+
 	local t0_rp12 t0_rp13 t1_rp12 t1_rp13
 	local packets_rp12 packets_rp13
 
@@ -238,7 +279,8 @@ multipath6_l4_test()
 	t0_rp13=$(link_stats_tx_packets_get $rp13)
 
 	$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-		-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+		-d $MZ_DELAY -t udp "$ports"
+	sleep 1
 
 	t1_rp12=$(link_stats_tx_packets_get $rp12)
 	t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -336,6 +378,41 @@ multipath_test()
 	ip nexthop replace id 106 group 104,1/105,1 type resilient
 }
 
+multipath16_test()
+{
+	check_nhgw16 104 || return
+
+	log_info "Running 16-bit IPv4 multipath tests"
+	ip nexthop replace id 103 group 101/102 type resilient idle_timer 0
+
+	ip nexthop replace id 103 group 101,65535/102,65535 type resilient
+	multipath4_test "65535:65535" 65535 65535
+
+	ip nexthop replace id 103 group 101,128/102,512 type resilient
+	multipath4_test "128:512" 128 512
+
+	ip nexthop replace id 103 group 101,255/102,65535 type resilient
+	omit_on_slow \
+		multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	ip nexthop replace id 103 group 101,1/102,1 type resilient
+
+	log_info "Running 16-bit IPv6 L4 hash multipath tests"
+	ip nexthop replace id 106 group 104/105 type resilient idle_timer 0
+
+	ip nexthop replace id 106 group 104,65535/105,65535 type resilient
+	multipath6_l4_test "65535:65535" 65535 65535
+
+	ip nexthop replace id 106 group 104,128/105,512 type resilient
+	multipath6_l4_test "128:512" 128 512
+
+	ip nexthop replace id 106 group 104,255/105,65535 type resilient
+	omit_on_slow \
+		multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
+
+	ip nexthop replace id 106 group 104,1/105,1 type resilient
+}
+
 nh_stats_test_v4()
 {
 	__nh_stats_test_v4 resilient
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_seed.sh b/tools/testing/selftests/net/forwarding/router_mpath_seed.sh
new file mode 100755
index 000000000000..314cb906c1eb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_mpath_seed.sh
@@ -0,0 +1,333 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-------------------------+  +-------------------------+
+# |  H1                     |  |                      H2 |
+# |               $h1 +     |  | + $h2                   |
+# |      192.0.2.1/28 |     |  | | 192.0.2.34/28         |
+# |  2001:db8:1::1/64 |     |  | | 2001:db8:3::2/64      |
+# +-------------------|-----+  +-|-----------------------+
+#                     |          |
+# +-------------------|-----+  +-|-----------------------+
+# |  R1               |     |  | |                    R2 |
+# |             $rp11 +     |  | + $rp21                 |
+# |      192.0.2.2/28       |  |   192.0.2.33/28         |
+# |  2001:db8:1::2/64       |  |   2001:db8:3::1/64      |
+# |                         |  |                         |
+# |             $rp12 +     |  | + $rp22                 |
+# |     192.0.2.17/28 |     |  | | 192.0.2.18..27/28     |
+# | 2001:db8:2::17/64 |     |  | | 2001:db8:2::18..27/64 |
+# +-------------------|-----+  +-|-----------------------+
+#                     |          |
+#                     `----------'
+
+ALL_TESTS="
+	ping_ipv4
+	ping_ipv6
+	test_mpath_seed_stability_ipv4
+	test_mpath_seed_stability_ipv6
+	test_mpath_seed_get
+	test_mpath_seed_ipv4
+	test_mpath_seed_ipv6
+"
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+	ip -4 route add 192.0.2.32/28 vrf v$h1 nexthop via 192.0.2.2
+	ip -6 route add 2001:db8:3::/64 vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+h1_destroy()
+{
+	ip -6 route del 2001:db8:3::/64 vrf v$h1 nexthop via 2001:db8:1::2
+	ip -4 route del 192.0.2.32/28 vrf v$h1 nexthop via 192.0.2.2
+	simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 192.0.2.34/28 2001:db8:3::2/64
+	ip -4 route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.33
+	ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:3::1
+}
+
+h2_destroy()
+{
+	ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:3::1
+	ip -4 route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.33
+	simple_if_fini $h2 192.0.2.34/28 2001:db8:3::2/64
+}
+
+router1_create()
+{
+	simple_if_init $rp11 192.0.2.2/28 2001:db8:1::2/64
+	__simple_if_init $rp12 v$rp11 192.0.2.17/28 2001:db8:2::17/64
+}
+
+router1_destroy()
+{
+	__simple_if_fini $rp12 192.0.2.17/28 2001:db8:2::17/64
+	simple_if_fini $rp11 192.0.2.2/28 2001:db8:1::2/64
+}
+
+router2_create()
+{
+	simple_if_init $rp21 192.0.2.33/28 2001:db8:3::1/64
+	__simple_if_init $rp22 v$rp21 192.0.2.18/28 2001:db8:2::18/64
+	ip -4 route add 192.0.2.0/28 vrf v$rp21 nexthop via 192.0.2.17
+	ip -6 route add 2001:db8:1::/64 vrf v$rp21 nexthop via 2001:db8:2::17
+}
+
+router2_destroy()
+{
+	ip -6 route del 2001:db8:1::/64 vrf v$rp21 nexthop via 2001:db8:2::17
+	ip -4 route del 192.0.2.0/28 vrf v$rp21 nexthop via 192.0.2.17
+	__simple_if_fini $rp22 192.0.2.18/28 2001:db8:2::18/64
+	simple_if_fini $rp21 192.0.2.33/28 2001:db8:3::1/64
+}
+
+nexthops_create()
+{
+	local i
+	for i in $(seq 10); do
+		ip nexthop add id $((1000 + i)) via 192.0.2.18 dev $rp12
+		ip nexthop add id $((2000 + i)) via 2001:db8:2::18 dev $rp12
+	done
+
+	ip nexthop add id 1000 group $(seq -s / 1001 1010) hw_stats on
+	ip nexthop add id 2000 group $(seq -s / 2001 2010) hw_stats on
+	ip -4 route add 192.0.2.32/28 vrf v$rp11 nhid 1000
+	ip -6 route add 2001:db8:3::/64 vrf v$rp11 nhid 2000
+}
+
+nexthops_destroy()
+{
+	local i
+
+	ip -6 route del 2001:db8:3::/64 vrf v$rp11 nhid 2000
+	ip -4 route del 192.0.2.32/28 vrf v$rp11 nhid 1000
+	ip nexthop del id 2000
+	ip nexthop del id 1000
+
+	for i in $(seq 10 -1 1); do
+		ip nexthop del id $((2000 + i))
+		ip nexthop del id $((1000 + i))
+	done
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	rp11=${NETIFS[p2]}
+
+	rp12=${NETIFS[p3]}
+	rp22=${NETIFS[p4]}
+
+	rp21=${NETIFS[p5]}
+	h2=${NETIFS[p6]}
+
+	sysctl_save net.ipv4.fib_multipath_hash_seed
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+	router1_create
+	router2_create
+
+	forwarding_enable
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	forwarding_restore
+
+	nexthops_destroy
+	router2_destroy
+	router1_destroy
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+
+	sysctl_restore net.ipv4.fib_multipath_hash_seed
+}
+
+ping_ipv4()
+{
+	ping_test $h1 192.0.2.34
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:3::2
+}
+
+test_mpath_seed_get()
+{
+	RET=0
+
+	local i
+	for ((i = 0; i < 100; i++)); do
+		local seed_w=$((999331 * i))
+		sysctl -qw net.ipv4.fib_multipath_hash_seed=$seed_w
+		local seed_r=$(sysctl -n net.ipv4.fib_multipath_hash_seed)
+		((seed_r == seed_w))
+		check_err $? "mpath seed written as $seed_w, but read as $seed_r"
+	done
+
+	log_test "mpath seed set/get"
+}
+
+nh_stats_snapshot()
+{
+	local group_id=$1; shift
+
+	ip -j -s -s nexthop show id $group_id |
+	    jq -c '[.[].group_stats | sort_by(.id) | .[].packets]'
+}
+
+get_active_nh()
+{
+	local s0=$1; shift
+	local s1=$1; shift
+
+	jq -n --argjson s0 "$s0" --argjson s1 "$s1" -f /dev/stdin <<-"EOF"
+		[range($s0 | length)] |
+		map($s1[.] - $s0[.]) |
+		map(if . > 8 then 1 else 0 end) |
+		index(1)
+	EOF
+}
+
+probe_nh()
+{
+	local group_id=$1; shift
+	local -a mz=("$@")
+
+	local s0=$(nh_stats_snapshot $group_id)
+	"${mz[@]}"
+	local s1=$(nh_stats_snapshot $group_id)
+
+	get_active_nh "$s0" "$s1"
+}
+
+probe_seed()
+{
+	local group_id=$1; shift
+	local seed=$1; shift
+	local -a mz=("$@")
+
+	sysctl -qw net.ipv4.fib_multipath_hash_seed=$seed
+	probe_nh "$group_id" "${mz[@]}"
+}
+
+test_mpath_seed()
+{
+	local group_id=$1; shift
+	local what=$1; shift
+	local -a mz=("$@")
+	local ii
+
+	RET=0
+
+	local -a tally=(0 0 0 0 0 0 0 0 0 0)
+	for ((ii = 0; ii < 100; ii++)); do
+		local act=$(probe_seed $group_id $((999331 * ii)) "${mz[@]}")
+		((tally[act]++))
+	done
+
+	local tally_str="${tally[@]}"
+	for ((ii = 0; ii < ${#tally[@]}; ii++)); do
+		((tally[ii] > 0))
+		check_err $? "NH #$ii not hit, tally='$tally_str'"
+	done
+
+	log_test "mpath seed $what"
+	sysctl -qw net.ipv4.fib_multipath_hash_seed=0
+}
+
+test_mpath_seed_ipv4()
+{
+	test_mpath_seed 1000 IPv4 \
+		$MZ $h1 -A 192.0.2.1 -B 192.0.2.34 -q \
+			-p 64 -d 0 -c 10 -t udp
+}
+
+test_mpath_seed_ipv6()
+{
+	test_mpath_seed 2000 IPv6 \
+		$MZ -6 $h1 -A 2001:db8:1::1 -B 2001:db8:3::2 -q \
+			-p 64 -d 0 -c 10 -t udp
+}
+
+check_mpath_seed_stability()
+{
+	local seed=$1; shift
+	local act_0=$1; shift
+	local act_1=$1; shift
+
+	((act_0 == act_1))
+	check_err $? "seed $seed: active NH moved from $act_0 to $act_1 after seed change"
+}
+
+test_mpath_seed_stability()
+{
+	local group_id=$1; shift
+	local what=$1; shift
+	local -a mz=("$@")
+
+	RET=0
+
+	local seed_0=0
+	local seed_1=3221338814
+	local seed_2=3735928559
+
+	# Initial active NH before touching the seed at all.
+	local act_ini=$(probe_nh $group_id "${mz[@]}")
+
+	local act_0_0=$(probe_seed $group_id $seed_0 "${mz[@]}")
+	local act_1_0=$(probe_seed $group_id $seed_1 "${mz[@]}")
+	local act_2_0=$(probe_seed $group_id $seed_2 "${mz[@]}")
+
+	local act_0_1=$(probe_seed $group_id $seed_0 "${mz[@]}")
+	local act_1_1=$(probe_seed $group_id $seed_1 "${mz[@]}")
+	local act_2_1=$(probe_seed $group_id $seed_2 "${mz[@]}")
+
+	check_mpath_seed_stability initial $act_ini $act_0_0
+	check_mpath_seed_stability $seed_0 $act_0_0 $act_0_1
+	check_mpath_seed_stability $seed_1 $act_1_0 $act_1_1
+	check_mpath_seed_stability $seed_2 $act_2_0 $act_2_1
+
+	log_test "mpath seed stability $what"
+	sysctl -qw net.ipv4.fib_multipath_hash_seed=0
+}
+
+test_mpath_seed_stability_ipv4()
+{
+	test_mpath_seed_stability 1000 IPv4 \
+		$MZ $h1 -A 192.0.2.1 -B 192.0.2.34 -q \
+			-p 64 -d 0 -c 10 -t udp
+}
+
+test_mpath_seed_stability_ipv6()
+{
+	test_mpath_seed_stability 2000 IPv6 \
+		$MZ -6 $h1 -A 2001:db8:1::1 -B 2001:db8:3::2 -q \
+			-p 64 -d 0 -c 10 -t udp
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+nexthops_create
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
index e2be354167a1..46f365b557b7 100755
--- a/tools/testing/selftests/net/forwarding/router_multipath.sh
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -180,6 +180,7 @@ multipath4_test()
 
        ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
 	       -d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+       sleep 1
 
        t1_rp12=$(link_stats_tx_packets_get $rp12)
        t1_rp13=$(link_stats_tx_packets_get $rp13)
@@ -217,6 +218,7 @@ multipath6_test()
 
        $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
 	       -d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
+       sleep 1
 
        t1_rp12=$(link_stats_tx_packets_get $rp12)
        t1_rp13=$(link_stats_tx_packets_get $rp13)
diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh
index f3a53738bdcc..92904b01eae9 100755
--- a/tools/testing/selftests/net/forwarding/router_nh.sh
+++ b/tools/testing/selftests/net/forwarding/router_nh.sh
@@ -1,6 +1,20 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+  +-------------------------+
+# | H1                      |  |                      H2 |
+# |               $h1 +     |  |               $h2 +     |
+# |      192.0.2.2/24 |     |  |   198.51.100.2/24 |     |
+# |  2001:db8:1::2/64 |     |  |  2001:db8:2::2/64 |     |
+# +-------------------|-----+  +-------------------|-----+
+#                     |                            |
+# +-------------------|----------------------------|-----+
+# | R1                |                            |     |
+# |              $rp1 +                       $rp2 +     |
+# |      192.0.2.1/24              198.51.100.1/24       |
+# |  2001:db8:1::1/64             2001:db8:2::1/64       |
+# +------------------------------------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh
index e60c8b4818cc..1f6f53e284b5 100755
--- a/tools/testing/selftests/net/forwarding/sch_ets.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets.sh
@@ -24,15 +24,10 @@ switch_create()
 	# Create a bottleneck so that the DWRR process can kick in.
 	tc qdisc add dev $swp2 root handle 1: tbf \
 	   rate 1Gbit burst 1Mbit latency 100ms
+	defer tc qdisc del dev $swp2 root
 	PARENT="parent 1:"
 }
 
-switch_destroy()
-{
-	ets_switch_destroy
-	tc qdisc del dev $swp2 root
-}
-
 # Callback from sch_ets_tests.sh
 collect_stats()
 {
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_core.sh b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
index f906fcc66572..8f9922c695b0 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_core.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
@@ -166,44 +166,32 @@ h1_create()
 	local i;
 
 	simple_if_init $h1
+	defer simple_if_fini $h1
+
 	mtu_set $h1 9900
+	defer mtu_restore $h1
+
 	for i in {0..2}; do
 		vlan_create $h1 1$i v$h1 $(sip $i)/28
+		defer vlan_destroy $h1 1$i
 		ip link set dev $h1.1$i type vlan egress 0:$i
 	done
 }
 
-h1_destroy()
-{
-	local i
-
-	for i in {0..2}; do
-		vlan_destroy $h1 1$i
-	done
-	mtu_restore $h1
-	simple_if_fini $h1
-}
-
 h2_create()
 {
 	local i
 
 	simple_if_init $h2
-	mtu_set $h2 9900
-	for i in {0..2}; do
-		vlan_create $h2 1$i v$h2 $(dip $i)/28
-	done
-}
+	defer simple_if_fini $h2
 
-h2_destroy()
-{
-	local i
+	mtu_set $h2 9900
+	defer mtu_restore $h2
 
 	for i in {0..2}; do
-		vlan_destroy $h2 1$i
+		vlan_create $h2 1$i v$h2 $(dip $i)/28
+		defer vlan_destroy $h2 1$i
 	done
-	mtu_restore $h2
-	simple_if_fini $h2
 }
 
 ets_switch_create()
@@ -211,44 +199,45 @@ ets_switch_create()
 	local i
 
 	ip link set dev $swp1 up
+	defer ip link set dev $swp1 down
+
 	mtu_set $swp1 9900
+	defer mtu_restore $swp1
 
 	ip link set dev $swp2 up
+	defer ip link set dev $swp2 down
+
 	mtu_set $swp2 9900
+	defer mtu_restore $swp2
 
 	for i in {0..2}; do
 		vlan_create $swp1 1$i
+		defer vlan_destroy $swp1 1$i
 		ip link set dev $swp1.1$i type vlan ingress 0:0 1:1 2:2
 
 		vlan_create $swp2 1$i
+		defer vlan_destroy $swp2 1$i
 
 		ip link add dev br1$i type bridge
+		defer ip link del dev br1$i
+
 		ip link set dev $swp1.1$i master br1$i
+		defer ip link set dev $swp1.1$i nomaster
+
 		ip link set dev $swp2.1$i master br1$i
+		defer ip link set dev $swp2.1$i nomaster
 
 		ip link set dev br1$i up
-		ip link set dev $swp1.1$i up
-		ip link set dev $swp2.1$i up
-	done
-}
+		defer ip link set dev br1$i down
 
-ets_switch_destroy()
-{
-	local i
-
-	ets_delete_qdisc
+		ip link set dev $swp1.1$i up
+		defer ip link set dev $swp1.1$i down
 
-	for i in {0..2}; do
-		ip link del dev br1$i
-		vlan_destroy $swp2 1$i
-		vlan_destroy $swp1 1$i
+		ip link set dev $swp2.1$i up
+		defer ip link set dev $swp2.1$i down
 	done
 
-	mtu_restore $swp2
-	ip link set dev $swp2 down
-
-	mtu_restore $swp1
-	ip link set dev $swp1 down
+	defer ets_delete_qdisc
 }
 
 setup_prepare()
@@ -263,23 +252,13 @@ setup_prepare()
 	hut=$h2
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1.10 $(dip 0) " vlan 10"
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
index cdf689e99458..08240d3e3c87 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -90,6 +90,7 @@ __ets_dwrr_test()
 
 	for stream in ${streams[@]}; do
 		ets_start_traffic $stream
+		defer stop_traffic $!
 	done
 
 	sleep 10
@@ -120,25 +121,24 @@ __ets_dwrr_test()
 				       ${d[0]} ${d[$i]}
 		fi
 	done
-
-	for stream in ${streams[@]}; do
-		stop_traffic
-	done
 }
 
 ets_dwrr_test_012()
 {
-	__ets_dwrr_test 0 1 2
+	in_defer_scope \
+		__ets_dwrr_test 0 1 2
 }
 
 ets_dwrr_test_01()
 {
-	__ets_dwrr_test 0 1
+	in_defer_scope \
+		__ets_dwrr_test 0 1
 }
 
 ets_dwrr_test_12()
 {
-	__ets_dwrr_test 1 2
+	in_defer_scope \
+		__ets_dwrr_test 1 2
 }
 
 ets_qdisc_setup()
@@ -199,25 +199,28 @@ ets_set_dwrr_two_bands()
 ets_test_strict()
 {
 	ets_set_strict
-	ets_dwrr_test_01
-	ets_dwrr_test_12
+	xfail_on_slow ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_12
 }
 
 ets_test_mixed()
 {
 	ets_set_mixed
-	ets_dwrr_test_01
-	ets_dwrr_test_12
+	xfail_on_slow ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_12
 }
 
 ets_test_dwrr()
 {
 	ets_set_dwrr_uniform
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_set_dwrr_varying
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_change_quantum
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_set_dwrr_two_bands
-	ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_01
 }
diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
index 81f31179ac88..af166662b78a 100755
--- a/tools/testing/selftests/net/forwarding/sch_red.sh
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -53,71 +53,63 @@ PKTSZ=1400
 h1_create()
 {
 	simple_if_init $h1 192.0.2.1/28
+	defer simple_if_fini $h1 192.0.2.1/28
+
 	mtu_set $h1 10000
+	defer mtu_restore $h1
+
 	tc qdisc replace dev $h1 root handle 1: tbf \
 	   rate 10Mbit burst 10K limit 1M
-}
-
-h1_destroy()
-{
-	tc qdisc del dev $h1 root
-	mtu_restore $h1
-	simple_if_fini $h1 192.0.2.1/28
+	defer tc qdisc del dev $h1 root
 }
 
 h2_create()
 {
 	simple_if_init $h2 192.0.2.2/28
-	mtu_set $h2 10000
-}
+	defer simple_if_fini $h2 192.0.2.2/28
 
-h2_destroy()
-{
-	mtu_restore $h2
-	simple_if_fini $h2 192.0.2.2/28
+	mtu_set $h2 10000
+	defer mtu_restore $h2
 }
 
 h3_create()
 {
 	simple_if_init $h3 192.0.2.3/28
-	mtu_set $h3 10000
-}
+	defer simple_if_fini $h3 192.0.2.3/28
 
-h3_destroy()
-{
-	mtu_restore $h3
-	simple_if_fini $h3 192.0.2.3/28
+	mtu_set $h3 10000
+	defer mtu_restore $h3
 }
 
 switch_create()
 {
 	ip link add dev br up type bridge
+	defer ip link del dev br
+
 	ip link set dev $swp1 up master br
+	defer ip link set dev $swp1 down nomaster
+
 	ip link set dev $swp2 up master br
+	defer ip link set dev $swp2 down nomaster
+
 	ip link set dev $swp3 up master br
+	defer ip link set dev $swp3 down nomaster
 
 	mtu_set $swp1 10000
+	defer mtu_restore $h1
+
 	mtu_set $swp2 10000
+	defer mtu_restore $h2
+
 	mtu_set $swp3 10000
+	defer mtu_restore $h3
 
 	tc qdisc replace dev $swp3 root handle 1: tbf \
 	   rate 10Mbit burst 10K limit 1M
-	ip link add name _drop_test up type dummy
-}
+	defer tc qdisc del dev $swp3 root
 
-switch_destroy()
-{
-	ip link del dev _drop_test
-	tc qdisc del dev $swp3 root
-
-	mtu_restore $h3
-	mtu_restore $h2
-	mtu_restore $h1
-
-	ip link set dev $swp3 down nomaster
-	ip link set dev $swp2 down nomaster
-	ip link set dev $swp1 down nomaster
-	ip link del dev br
+	ip link add name _drop_test up type dummy
+	defer ip link del dev _drop_test
 }
 
 setup_prepare()
@@ -134,6 +126,7 @@ setup_prepare()
 	h3_mac=$(mac_get $h3)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
@@ -141,18 +134,6 @@ setup_prepare()
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h3_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1 192.0.2.3 " from host 1"
@@ -287,6 +268,7 @@ do_ecn_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	ecn_test_common "$name" $limit
@@ -298,9 +280,6 @@ do_ecn_test()
 	build_backlog $((2 * limit)) udp >/dev/null
 	check_fail $? "UDP traffic went into backlog instead of being early-dropped"
 	log_test "$name backlog > limit: UDP early-dropped"
-
-	stop_traffic
-	sleep 1
 }
 
 do_ecn_nodrop_test()
@@ -310,6 +289,7 @@ do_ecn_nodrop_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	ecn_test_common "$name" $limit
@@ -321,9 +301,6 @@ do_ecn_nodrop_test()
 	build_backlog $((2 * limit)) udp >/dev/null
 	check_err $? "UDP traffic was early-dropped instead of getting into backlog"
 	log_test "$name backlog > limit: UDP not dropped"
-
-	stop_traffic
-	sleep 1
 }
 
 do_red_test()
@@ -336,6 +313,7 @@ do_red_test()
 	# is above limit.
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 
 	# Pushing below the queue limit should work.
 	RET=0
@@ -352,9 +330,6 @@ do_red_test()
 	pct=$(check_marking "== 0")
 	check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
 	log_test "RED backlog > limit"
-
-	stop_traffic
-	sleep 1
 }
 
 do_red_qevent_test()
@@ -369,6 +344,7 @@ do_red_qevent_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t udp -q &
+	defer stop_traffic $!
 	sleep 1
 
 	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
@@ -396,9 +372,6 @@ do_red_qevent_test()
 	check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen"
 
 	log_test "RED early_dropped packets mirrored"
-
-	stop_traffic
-	sleep 1
 }
 
 do_ecn_qevent_test()
@@ -410,6 +383,7 @@ do_ecn_qevent_test()
 
 	$MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
 		-a own -b $h3_mac -t tcp -q tos=0x01 &
+	defer stop_traffic $!
 	sleep 1
 
 	tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
@@ -428,9 +402,6 @@ do_ecn_qevent_test()
 	tc filter del block 10 pref 1234 handle 102 matchall
 
 	log_test "ECN marked packets mirrored"
-
-	stop_traffic
-	sleep 1
 }
 
 install_qdisc()
@@ -451,36 +422,36 @@ uninstall_qdisc()
 ecn_test()
 {
 	install_qdisc ecn
-	do_ecn_test $BACKLOG
-	uninstall_qdisc
+	defer uninstall_qdisc
+	xfail_on_slow do_ecn_test $BACKLOG
 }
 
 ecn_nodrop_test()
 {
 	install_qdisc ecn nodrop
-	do_ecn_nodrop_test $BACKLOG
-	uninstall_qdisc
+	defer uninstall_qdisc
+	xfail_on_slow do_ecn_nodrop_test $BACKLOG
 }
 
 red_test()
 {
 	install_qdisc
-	do_red_test $BACKLOG
-	uninstall_qdisc
+	defer uninstall_qdisc
+	xfail_on_slow do_red_test $BACKLOG
 }
 
 red_qevent_test()
 {
 	install_qdisc qevent early_drop block 10
-	do_red_qevent_test $BACKLOG
-	uninstall_qdisc
+	defer uninstall_qdisc
+	xfail_on_slow do_red_qevent_test $BACKLOG
 }
 
 ecn_qevent_test()
 {
 	install_qdisc ecn qevent mark block 10
-	do_ecn_qevent_test $BACKLOG
-	uninstall_qdisc
+	defer uninstall_qdisc
+	xfail_on_slow do_ecn_qevent_test $BACKLOG
 }
 
 trap cleanup EXIT
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
index d1f26cb7cd73..ec309a5086bc 100644
--- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
@@ -60,68 +60,65 @@ host_create()
 	local host=$1; shift
 
 	simple_if_init $dev
+	defer simple_if_fini $dev
+
 	mtu_set $dev 10000
+	defer mtu_restore $dev
 
 	vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+	defer vlan_destroy $dev 10
 	ip link set dev $dev.10 type vlan egress 0:0
 
 	vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+	defer vlan_destroy $dev 11
 	ip link set dev $dev.11 type vlan egress 0:1
 }
 
-host_destroy()
-{
-	local dev=$1; shift
-
-	vlan_destroy $dev 11
-	vlan_destroy $dev 10
-	mtu_restore $dev
-	simple_if_fini $dev
-}
-
 h1_create()
 {
 	host_create $h1 1
 }
 
-h1_destroy()
-{
-	host_destroy $h1
-}
-
 h2_create()
 {
 	host_create $h2 2
 
 	tc qdisc add dev $h2 clsact
+	defer tc qdisc del dev $h2 clsact
+
 	tc filter add dev $h2 ingress pref 1010 prot 802.1q \
 	   flower $TCFLAGS vlan_id 10 action pass
 	tc filter add dev $h2 ingress pref 1011 prot 802.1q \
 	   flower $TCFLAGS vlan_id 11 action pass
 }
 
-h2_destroy()
-{
-	tc qdisc del dev $h2 clsact
-	host_destroy $h2
-}
-
 switch_create()
 {
 	local intf
 	local vlan
 
 	ip link add dev br10 type bridge
+	defer ip link del dev br10
+
 	ip link add dev br11 type bridge
+	defer ip link del dev br11
 
 	for intf in $swp1 $swp2; do
 		ip link set dev $intf up
+		defer ip link set dev $intf down
+
 		mtu_set $intf 10000
+		defer mtu_restore $intf
 
 		for vlan in 10 11; do
 			vlan_create $intf $vlan
+			defer vlan_destroy $intf $vlan
+
 			ip link set dev $intf.$vlan master br$vlan
+			defer ip link set dev $intf.$vlan nomaster
+
 			ip link set dev $intf.$vlan up
+			defer ip link set dev $intf.$vlan down
 		done
 	done
 
@@ -130,34 +127,10 @@ switch_create()
 	done
 
 	ip link set dev br10 up
-	ip link set dev br11 up
-}
-
-switch_destroy()
-{
-	local intf
-	local vlan
-
-	# A test may have been interrupted mid-run, with Qdisc installed. Delete
-	# it here.
-	tc qdisc del dev $swp2 root 2>/dev/null
-
-	ip link set dev br11 down
-	ip link set dev br10 down
+	defer ip link set dev br10 down
 
-	for intf in $swp2 $swp1; do
-		for vlan in 11 10; do
-			ip link set dev $intf.$vlan down
-			ip link set dev $intf.$vlan nomaster
-			vlan_destroy $intf $vlan
-		done
-
-		mtu_restore $intf
-		ip link set dev $intf down
-	done
-
-	ip link del dev br11
-	ip link del dev br10
+	ip link set dev br11 up
+	defer ip link set dev br11 down
 }
 
 setup_prepare()
@@ -177,23 +150,13 @@ setup_prepare()
 	h2_mac=$(mac_get $h2)
 
 	vrf_prepare
+	defer vrf_cleanup
 
 	h1_create
 	h2_create
 	switch_create
 }
 
-cleanup()
-{
-	pre_cleanup
-
-	switch_destroy
-	h2_destroy
-	h1_destroy
-
-	vrf_cleanup
-}
-
 ping_ipv4()
 {
 	ping_test $h1.10 $(ipaddr 2 10) " vlan 10"
@@ -207,18 +170,18 @@ tbf_get_counter()
 	tc_rule_stats_get $h2 10$vlan ingress .bytes
 }
 
-do_tbf_test()
+__tbf_test()
 {
 	local vlan=$1; shift
 	local mbit=$1; shift
 
 	start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac
+	defer stop_traffic $!
 	sleep 5 # Wait for the burst to dwindle
 
 	local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan)
 	sleep 10
 	local t3=$(tbf_get_counter $vlan)
-	stop_traffic
 
 	RET=0
 
@@ -227,7 +190,13 @@ do_tbf_test()
 	local nr=$(rate $t2 $t3 10)
 	local nr_pct=$((100 * (nr - er) / er))
 	((-5 <= nr_pct && nr_pct <= 5))
-	check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
+	xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
 
 	log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit"
 }
+
+do_tbf_test()
+{
+	in_defer_scope \
+		__tbf_test "$@"
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
index df9bcd6a811a..c182a04282bc 100644
--- a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
@@ -30,8 +30,9 @@ tbf_test()
 	# This test is used for both ETS and PRIO. Even though we only need two
 	# bands, PRIO demands a minimum of three.
 	tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0
+	defer tc qdisc del dev $swp2 root
+
 	tbf_test_one 128K
-	tc qdisc del dev $swp2 root
 }
 
 tbf_root_test()
@@ -42,6 +43,8 @@ tbf_root_test()
 
 	tc qdisc replace dev $swp2 root handle 1: \
 		tbf rate 400Mbit burst $bs limit 1M
+	defer tc qdisc del dev $swp2 root
+
 	tc qdisc replace dev $swp2 parent 1:1 handle 10: \
 		$QDISC 3 priomap 2 1 0
 	tc qdisc replace dev $swp2 parent 10:3 handle 103: \
@@ -53,8 +56,6 @@ tbf_root_test()
 
 	do_tbf_test 10 400 $bs
 	do_tbf_test 11 400 $bs
-
-	tc qdisc del dev $swp2 root
 }
 
 if type -t sch_tbf_pre_hook >/dev/null; then
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
index 96c997be0d03..9f20320f8d84 100755
--- a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
@@ -14,13 +14,14 @@ tbf_test_one()
 
 	tc qdisc replace dev $swp2 root handle 108: tbf \
 	   rate 400Mbit burst $bs limit 1M
+	defer tc qdisc del dev $swp2 root
+
 	do_tbf_test 10 400 $bs
 }
 
 tbf_test()
 {
 	tbf_test_one 128K
-	tc qdisc del dev $swp2 root
 }
 
 if type -t sch_tbf_pre_hook >/dev/null; then
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 589629636502..ea89e558672d 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -4,7 +4,8 @@
 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
 	gact_trap_test mirred_egress_to_ingress_test \
-	mirred_egress_to_ingress_tcp_test"
+	mirred_egress_to_ingress_tcp_test \
+	ingress_2nd_vlan_push egress_2nd_vlan_push"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -244,6 +245,49 @@ mirred_egress_to_ingress_tcp_test()
 	log_test "mirred_egress_to_ingress_tcp ($tcflags)"
 }
 
+ingress_2nd_vlan_push()
+{
+	tc filter add dev $swp1 ingress pref 20 chain 0 handle 20 flower \
+		$tcflags num_of_vlans 1 \
+		action vlan push id 100 protocol 0x8100 action goto chain 5
+	tc filter add dev $swp1 ingress pref 30 chain 5 handle 30 flower \
+		$tcflags num_of_vlans 2 \
+		cvlan_ethtype 0x800 action pass
+
+	$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip -Q 10 -q
+
+	tc_check_packets "dev $swp1 ingress" 30 1
+	check_err $? "No double-vlan packets received"
+
+	tc filter del dev $swp1 ingress pref 20 chain 0 handle 20 flower
+	tc filter del dev $swp1 ingress pref 30 chain 5 handle 30 flower
+
+	log_test "ingress_2nd_vlan_push ($tcflags)"
+}
+
+egress_2nd_vlan_push()
+{
+	tc filter add dev $h1 egress pref 20 chain 0 handle 20 flower \
+		$tcflags num_of_vlans 0 \
+		action vlan push id 10 protocol 0x8100 \
+		pipe action vlan push id 100 protocol 0x8100 action goto chain 5
+	tc filter add dev $h1 egress pref 30 chain 5 handle 30 flower \
+		$tcflags num_of_vlans 2 \
+		cvlan_ethtype 0x800 action pass
+
+	$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip -q
+
+	tc_check_packets "dev $h1 egress" 30 1
+	check_err $? "No double-vlan packets received"
+
+	tc filter del dev $h1 egress pref 20 chain 0 handle 20 flower
+	tc filter del dev $h1 egress pref 30 chain 5 handle 30 flower
+
+	log_test "egress_2nd_vlan_push ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh
index bce8bb8d2b6f..2e3326edfa9a 100644
--- a/tools/testing/selftests/net/forwarding/tc_common.sh
+++ b/tools/testing/selftests/net/forwarding/tc_common.sh
@@ -4,7 +4,7 @@
 CHECK_TC="yes"
 
 # Can be overridden by the configuration file. See lib.sh
-TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms
+: "${TC_HIT_TIMEOUT:=1000}" # ms
 
 tc_check_packets()
 {
diff --git a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh
index 3885a2a91f7d..baed5e380dae 100755
--- a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh
+++ b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh
@@ -20,6 +20,7 @@ ALL_TESTS="
 	test_port_range_ipv4_tcp
 	test_port_range_ipv6_udp
 	test_port_range_ipv6_tcp
+	test_port_range_ipv4_udp_drop
 "
 
 NUM_NETIFS=4
@@ -194,6 +195,51 @@ test_port_range_ipv6_tcp()
 	__test_port_range $proto $ip_proto $sip $dip $mode "$name"
 }
 
+test_port_range_ipv4_udp_drop()
+{
+	local proto=ipv4
+	local ip_proto=udp
+	local sip=192.0.2.1
+	local dip=192.0.2.2
+	local mode="-4"
+	local name="IPv4 UDP Drop"
+	local dmac=$(mac_get $h2)
+	local smac=$(mac_get $h1)
+	local sport_min=2000
+	local sport_max=3000
+	local sport_mid=$((sport_min + (sport_max - sport_min) / 2))
+	local dport=5000
+
+	RET=0
+
+	tc filter add dev $swp1 ingress protocol $proto handle 101 pref 1 \
+		flower src_ip $sip dst_ip $dip ip_proto $ip_proto \
+		src_port $sport_min-$sport_max \
+		dst_port $dport \
+		action drop
+
+	# Test ports outside range - should pass
+	$MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \
+		-t $ip_proto "sp=$((sport_min - 1)),dp=$dport"
+	$MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \
+		-t $ip_proto "sp=$((sport_max + 1)),dp=$dport"
+
+	# Test ports inside range - should be dropped
+	$MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \
+		-t $ip_proto "sp=$sport_min,dp=$dport"
+	$MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \
+		-t $ip_proto "sp=$sport_mid,dp=$dport"
+	$MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \
+		-t $ip_proto "sp=$sport_max,dp=$dport"
+
+	tc_check_packets "dev $swp1 ingress" 101 3
+	check_err $? "Filter did not drop the expected number of packets"
+
+	tc filter del dev $swp1 ingress protocol $proto pref 1 handle 101 flower
+
+	log_test "Port range matching - $name"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh
index 5103f64a71d6..509fdedfcfa1 100755
--- a/tools/testing/selftests/net/forwarding/tc_police.sh
+++ b/tools/testing/selftests/net/forwarding/tc_police.sh
@@ -148,7 +148,7 @@ police_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
 }
 
@@ -198,7 +198,7 @@ police_shared_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 }
 
 police_shared_test()
@@ -278,7 +278,7 @@ police_mirror_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $pol_if $dir protocol ip pref 1 handle 101 flower
 	tc filter del dev $h3 ingress protocol ip pref 1 handle 101 flower
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
@@ -320,7 +320,7 @@ police_pps_common_test()
 
 	log_test "$test_name"
 
-	{ kill %% && wait %%; } 2>/dev/null
+	kill_process %%
 	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
 }
 
diff --git a/tools/testing/selftests/net/forwarding/tc_taprio.sh b/tools/testing/selftests/net/forwarding/tc_taprio.sh
new file mode 100755
index 000000000000..8992aeabfe0b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_taprio.sh
@@ -0,0 +1,421 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS=" \
+	test_clock_jump_backward \
+	test_taprio_after_ptp \
+	test_max_sdu \
+	test_clock_jump_backward_forward \
+"
+NUM_NETIFS=4
+source tc_common.sh
+source lib.sh
+source tsn_lib.sh
+
+require_command python3
+
+# The test assumes the usual topology from the README, where h1 is connected to
+# swp1, h2 to swp2, and swp1 and swp2 are together in a bridge.
+# Additional assumption: h1 and h2 use the same PHC, and so do swp1 and swp2.
+# By synchronizing h1 to swp1 via PTP, h2 is also implicitly synchronized to
+# swp1 (and both to CLOCK_REALTIME).
+h1=${NETIFS[p1]}
+swp1=${NETIFS[p2]}
+swp2=${NETIFS[p3]}
+h2=${NETIFS[p4]}
+
+UDS_ADDRESS_H1="/var/run/ptp4l_h1"
+UDS_ADDRESS_SWP1="/var/run/ptp4l_swp1"
+
+H1_IPV4="192.0.2.1"
+H2_IPV4="192.0.2.2"
+H1_IPV6="2001:db8:1::1"
+H2_IPV6="2001:db8:1::2"
+
+# Tunables
+NUM_PKTS=100
+STREAM_VID=10
+STREAM_PRIO_1=6
+STREAM_PRIO_2=5
+STREAM_PRIO_3=4
+# PTP uses TC 0
+ALL_GATES=$((1 << 0 | 1 << STREAM_PRIO_1 | 1 << STREAM_PRIO_2))
+# Use a conservative cycle of 10 ms to allow the test to still pass when the
+# kernel has some extra overhead like lockdep etc
+CYCLE_TIME_NS=10000000
+# Create two Gate Control List entries, one OPEN and one CLOSE, of equal
+# durations
+GATE_DURATION_NS=$((CYCLE_TIME_NS / 2))
+# Give 2/3 of the cycle time to user space and 1/3 to the kernel
+FUDGE_FACTOR=$((CYCLE_TIME_NS / 3))
+# Shift the isochron base time by half the gate time, so that packets are
+# always received by swp1 close to the middle of the time slot, to minimize
+# inaccuracies due to network sync
+SHIFT_TIME_NS=$((GATE_DURATION_NS / 2))
+
+path_delay=
+
+h1_create()
+{
+	simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_destroy()
+{
+	simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+switch_create()
+{
+	local h2_mac_addr=$(mac_get $h2)
+
+	ip link set $swp1 up
+	ip link set $swp2 up
+
+	ip link add br0 type bridge vlan_filtering 1
+	ip link set $swp1 master br0
+	ip link set $swp2 master br0
+	ip link set br0 up
+
+	bridge vlan add dev $swp2 vid $STREAM_VID
+	bridge vlan add dev $swp1 vid $STREAM_VID
+	bridge fdb add dev $swp2 \
+		$h2_mac_addr vlan $STREAM_VID static master
+}
+
+switch_destroy()
+{
+	ip link del br0
+}
+
+ptp_setup()
+{
+	# Set up swp1 as a master PHC for h1, synchronized to the local
+	# CLOCK_REALTIME.
+	phc2sys_start $UDS_ADDRESS_SWP1
+	ptp4l_start $h1 true $UDS_ADDRESS_H1
+	ptp4l_start $swp1 false $UDS_ADDRESS_SWP1
+}
+
+ptp_cleanup()
+{
+	ptp4l_stop $swp1
+	ptp4l_stop $h1
+	phc2sys_stop
+}
+
+txtime_setup()
+{
+	local if_name=$1
+
+	tc qdisc add dev $if_name clsact
+	# Classify PTP on TC 7 and isochron on TC 6
+	tc filter add dev $if_name egress protocol 0x88f7 \
+		flower action skbedit priority 7
+	tc filter add dev $if_name egress protocol 802.1Q \
+		flower vlan_ethtype 0xdead action skbedit priority 6
+	tc qdisc add dev $if_name handle 100: parent root mqprio num_tc 8 \
+		queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
+		map 0 1 2 3 4 5 6 7 \
+		hw 1
+	# Set up TC 5, 6, 7 for SO_TXTIME. tc-mqprio queues count from 1.
+	tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_1 + 1)) etf \
+		clockid CLOCK_TAI offload delta $FUDGE_FACTOR
+	tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_2 + 1)) etf \
+		clockid CLOCK_TAI offload delta $FUDGE_FACTOR
+	tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_3 + 1)) etf \
+		clockid CLOCK_TAI offload delta $FUDGE_FACTOR
+}
+
+txtime_cleanup()
+{
+	local if_name=$1
+
+	tc qdisc del dev $if_name clsact
+	tc qdisc del dev $if_name root
+}
+
+taprio_replace()
+{
+	local if_name="$1"; shift
+	local extra_args="$1"; shift
+
+	# STREAM_PRIO_1 always has an open gate.
+	# STREAM_PRIO_2 has a gate open for GATE_DURATION_NS (half the cycle time)
+	# STREAM_PRIO_3 always has a closed gate.
+	tc qdisc replace dev $if_name root stab overhead 24 taprio num_tc 8 \
+		queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \
+		map 0 1 2 3 4 5 6 7 \
+		sched-entry S $(printf "%x" $ALL_GATES) $GATE_DURATION_NS \
+		sched-entry S $(printf "%x" $((ALL_GATES & ~(1 << STREAM_PRIO_2)))) $GATE_DURATION_NS \
+		base-time 0 flags 0x2 $extra_args
+	taprio_wait_for_admin $if_name
+}
+
+taprio_cleanup()
+{
+	local if_name=$1
+
+	tc qdisc del dev $if_name root
+}
+
+probe_path_delay()
+{
+	local isochron_dat="$(mktemp)"
+	local received
+
+	log_info "Probing path delay"
+
+	isochron_do "$h1" "$h2" "$UDS_ADDRESS_H1" "" 0 \
+		"$CYCLE_TIME_NS" "" "" "$NUM_PKTS" \
+		"$STREAM_VID" "$STREAM_PRIO_1" "" "$isochron_dat"
+
+	received=$(isochron_report_num_received "$isochron_dat")
+	if [ "$received" != "$NUM_PKTS" ]; then
+		echo "Cannot establish basic data path between $h1 and $h2"
+		exit $ksft_fail
+	fi
+
+	printf "pdelay = {}\n" > isochron_data.py
+	isochron report --input-file "$isochron_dat" \
+		--printf-format "pdelay[%u] = %d - %d\n" \
+		--printf-args "qRT" \
+		>> isochron_data.py
+	cat <<-'EOF' > isochron_postprocess.py
+	#!/usr/bin/env python3
+
+	from isochron_data import pdelay
+	import numpy as np
+
+	w = np.array(list(pdelay.values()))
+	print("{}".format(np.max(w)))
+	EOF
+	path_delay=$(python3 ./isochron_postprocess.py)
+
+	log_info "Path delay from $h1 to $h2 estimated at $path_delay ns"
+
+	if [ "$path_delay" -gt "$GATE_DURATION_NS" ]; then
+		echo "Path delay larger than gate duration, aborting"
+		exit $ksft_fail
+	fi
+
+	rm -f ./isochron_data.py 2> /dev/null
+	rm -f ./isochron_postprocess.py 2> /dev/null
+	rm -f "$isochron_dat" 2> /dev/null
+}
+
+setup_prepare()
+{
+	vrf_prepare
+
+	h1_create
+	h2_create
+	switch_create
+
+	txtime_setup $h1
+
+	# Temporarily set up PTP just to probe the end-to-end path delay.
+	ptp_setup
+	probe_path_delay
+	ptp_cleanup
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	isochron_recv_stop
+	txtime_cleanup $h1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+run_test()
+{
+	local base_time=$1; shift
+	local stream_prio=$1; shift
+	local expected_delay=$1; shift
+	local should_fail=$1; shift
+	local test_name=$1; shift
+	local isochron_dat="$(mktemp)"
+	local received
+	local median_delay
+
+	RET=0
+
+	# Set the shift time equal to the cycle time, which effectively
+	# cancels the default advance time. Packets won't be sent early in
+	# software, which ensures that they won't prematurely enter through
+	# the open gate in __test_out_of_band(). Also, the gate is open for
+	# long enough that this won't cause a problem in __test_in_band().
+	isochron_do "$h1" "$h2" "$UDS_ADDRESS_H1" "" "$base_time" \
+		"$CYCLE_TIME_NS" "$SHIFT_TIME_NS" "$GATE_DURATION_NS" \
+		"$NUM_PKTS" "$STREAM_VID" "$stream_prio" "" "$isochron_dat"
+
+	received=$(isochron_report_num_received "$isochron_dat")
+	[ "$received" = "$NUM_PKTS" ]
+	check_err_fail $should_fail $? "Reception of $NUM_PKTS packets"
+
+	if [ $should_fail = 0 ] && [ "$received" = "$NUM_PKTS" ]; then
+		printf "pdelay = {}\n" > isochron_data.py
+		isochron report --input-file "$isochron_dat" \
+			--printf-format "pdelay[%u] = %d - %d\n" \
+			--printf-args "qRT" \
+			>> isochron_data.py
+		cat <<-'EOF' > isochron_postprocess.py
+		#!/usr/bin/env python3
+
+		from isochron_data import pdelay
+		import numpy as np
+
+		w = np.array(list(pdelay.values()))
+		print("{}".format(int(np.median(w))))
+		EOF
+		median_delay=$(python3 ./isochron_postprocess.py)
+
+		# If the condition below is true, packets were delayed by a closed gate
+		[ "$median_delay" -gt $((path_delay + expected_delay)) ]
+		check_fail $? "Median delay $median_delay is greater than expected delay $expected_delay plus path delay $path_delay"
+
+		# If the condition below is true, packets were sent expecting them to
+		# hit a closed gate in the switch, but were not delayed
+		[ "$expected_delay" -gt 0 ] && [ "$median_delay" -lt "$expected_delay" ]
+		check_fail $? "Median delay $median_delay is less than expected delay $expected_delay"
+	fi
+
+	log_test "$test_name"
+
+	rm -f ./isochron_data.py 2> /dev/null
+	rm -f ./isochron_postprocess.py 2> /dev/null
+	rm -f "$isochron_dat" 2> /dev/null
+}
+
+__test_always_open()
+{
+	run_test 0.000000000 $STREAM_PRIO_1 0 0 "Gate always open"
+}
+
+__test_always_closed()
+{
+	run_test 0.000000000 $STREAM_PRIO_3 0 1 "Gate always closed"
+}
+
+__test_in_band()
+{
+	# Send packets in-band with the OPEN gate entry
+	run_test 0.000000000 $STREAM_PRIO_2 0 0 "In band with gate"
+}
+
+__test_out_of_band()
+{
+	# Send packets in-band with the CLOSE gate entry
+	run_test 0.005000000 $STREAM_PRIO_2 \
+		$((GATE_DURATION_NS - SHIFT_TIME_NS)) 0 \
+		"Out of band with gate"
+}
+
+run_subtests()
+{
+	__test_always_open
+	__test_always_closed
+	__test_in_band
+	__test_out_of_band
+}
+
+test_taprio_after_ptp()
+{
+	log_info "Setting up taprio after PTP"
+	ptp_setup
+	taprio_replace $swp2
+	run_subtests
+	taprio_cleanup $swp2
+	ptp_cleanup
+}
+
+__test_under_max_sdu()
+{
+	# Limit max-sdu for STREAM_PRIO_1
+	taprio_replace "$swp2" "max-sdu 0 0 0 0 0 0 100 0"
+	run_test 0.000000000 $STREAM_PRIO_1 0 0 "Under maximum SDU"
+}
+
+__test_over_max_sdu()
+{
+	# Limit max-sdu for STREAM_PRIO_1
+	taprio_replace "$swp2" "max-sdu 0 0 0 0 0 0 20 0"
+	run_test 0.000000000 $STREAM_PRIO_1 0 1 "Over maximum SDU"
+}
+
+test_max_sdu()
+{
+	ptp_setup
+	__test_under_max_sdu
+	__test_over_max_sdu
+	taprio_cleanup $swp2
+	ptp_cleanup
+}
+
+# Perform a clock jump in the past without synchronization running, so that the
+# time base remains where it was set by phc_ctl.
+test_clock_jump_backward()
+{
+	# This is a more complex schedule specifically crafted in a way that
+	# has been problematic on NXP LS1028A. Not much to test with it other
+	# than the fact that it passes traffic.
+	tc qdisc replace dev $swp2 root stab overhead 24 taprio num_tc 8 \
+		queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 map 0 1 2 3 4 5 6 7 \
+		base-time 0 sched-entry S 20 300000 sched-entry S 10 200000 \
+		sched-entry S 20 300000 sched-entry S 48 200000 \
+		sched-entry S 20 300000 sched-entry S 83 200000 \
+		sched-entry S 40 300000 sched-entry S 00 200000 flags 2
+
+	log_info "Forcing a backward clock jump"
+	phc_ctl $swp1 set 0
+
+	ping_test $h1 192.0.2.2
+	taprio_cleanup $swp2
+}
+
+# Test that taprio tolerates clock jumps.
+# Since ptp4l and phc2sys are running, it is expected for the time to
+# eventually recover (through yet another clock jump). Isochron waits
+# until that is the case.
+test_clock_jump_backward_forward()
+{
+	log_info "Forcing a backward and a forward clock jump"
+	taprio_replace $swp2
+	phc_ctl $swp1 set 0
+	ptp_setup
+	ping_test $h1 192.0.2.2
+	run_subtests
+	ptp_cleanup
+	taprio_cleanup $swp2
+}
+
+tc_offload_check
+if [[ $? -ne 0 ]]; then
+	log_test_skip "Could not test offloaded functionality"
+	exit $EXIT_STATUS
+fi
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
index 5a5dd9034819..79775b10b99f 100755
--- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
+++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
 
 ALL_TESTS="tunnel_key_nofrag_test"
 
diff --git a/tools/testing/selftests/net/forwarding/tsn_lib.sh b/tools/testing/selftests/net/forwarding/tsn_lib.sh
index b91bcd8008a9..08c044ff6689 100644
--- a/tools/testing/selftests/net/forwarding/tsn_lib.sh
+++ b/tools/testing/selftests/net/forwarding/tsn_lib.sh
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Copyright 2021-2022 NXP
 
+tc_testing_scripts_dir=$(dirname $0)/../../tc-testing/scripts
+
 REQUIRE_ISOCHRON=${REQUIRE_ISOCHRON:=yes}
 REQUIRE_LINUXPTP=${REQUIRE_LINUXPTP:=yes}
 
@@ -18,6 +20,7 @@ fi
 if [[ "$REQUIRE_LINUXPTP" = "yes" ]]; then
 	require_command phc2sys
 	require_command ptp4l
+	require_command phc_ctl
 fi
 
 phc2sys_start()
@@ -182,6 +185,7 @@ isochron_do()
 	local base_time=$1; shift
 	local cycle_time=$1; shift
 	local shift_time=$1; shift
+	local window_size=$1; shift
 	local num_pkts=$1; shift
 	local vid=$1; shift
 	local priority=$1; shift
@@ -212,6 +216,10 @@ isochron_do()
 		extra_args="${extra_args} --shift-time=${shift_time}"
 	fi
 
+	if ! [ -z "${window_size}" ]; then
+		extra_args="${extra_args} --window-size=${window_size}"
+	fi
+
 	if [ "${use_l2}" = "true" ]; then
 		extra_args="${extra_args} --l2 --etype=0xdead ${vid}"
 		receiver_extra_args="--l2 --etype=0xdead"
@@ -247,3 +255,21 @@ isochron_do()
 
 	cpufreq_restore ${ISOCHRON_CPU}
 }
+
+isochron_report_num_received()
+{
+	local isochron_dat=$1; shift
+
+	# Count all received packets by looking at the non-zero RX timestamps
+	isochron report \
+		--input-file "${isochron_dat}" \
+		--printf-format "%u\n" --printf-args "R" | \
+		grep -w -v '0' | wc -l
+}
+
+taprio_wait_for_admin()
+{
+	local if_name="$1"; shift
+
+	"$tc_testing_scripts_dir/taprio_wait_for_admin.sh" "$(which tc)" "$if_name"
+}
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
index 6f0a2e452ba1..b43816dd998c 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
@@ -428,6 +428,14 @@ __test_flood()
 test_flood()
 {
 	__test_flood de:ad:be:ef:13:37 192.0.2.100 "flood"
+
+	# Add an entry with arbitrary destination IP. Verify that packets are
+	# not duplicated (this can happen if hardware floods the packets, and
+	# then traps them due to misconfiguration, so software data path repeats
+	# flooding and resends packets).
+	bridge fdb append dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self
+	__test_flood de:ad:be:ef:13:37 192.0.2.100 "flood, unresolved FDB entry"
+	bridge fdb del dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self
 }
 
 vxlan_fdb_add_del()
@@ -680,9 +688,9 @@ test_learning()
 	local mac=de:ad:be:ef:13:37
 	local dst=192.0.2.100
 
-	# Enable learning on the VxLAN device and set ageing time to 10 seconds
-	ip link set dev br1 type bridge ageing_time 1000
-	ip link set dev vx1 type vxlan ageing 10
+	# Enable learning on the VxLAN device and set ageing time to 30 seconds
+	ip link set dev br1 type bridge ageing_time 3000
+	ip link set dev vx1 type vxlan ageing 30
 	ip link set dev vx1 type vxlan learning
 	reapply_config
 
@@ -740,7 +748,9 @@ test_learning()
 
 	vxlan_flood_test $mac $dst 0 10 0
 
-	sleep 20
+	# The entry should age out when it only forwards traffic
+	$MZ $h1 -c 50 -d 1sec -p 64 -b $mac -B $dst -t icmp -q &
+	sleep 60
 
 	bridge fdb show brport vx1 | grep $mac | grep -q self
 	check_fail $?
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
index fb9a34cb50c6..afc65647f673 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
@@ -539,6 +539,21 @@ test_flood()
 		10 10 0 10 0
 	__test_flood ca:fe:be:ef:13:37 198.51.100.100 20 "flood vlan 20" \
 		10 0 10 0 10
+
+	# Add entries with arbitrary destination IP. Verify that packets are
+	# not duplicated (this can happen if hardware floods the packets, and
+	# then traps them due to misconfiguration, so software data path repeats
+	# flooding and resends packets).
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self
+
+	__test_flood de:ad:be:ef:13:37 192.0.2.100 10 \
+		"flood vlan 10, unresolved FDB entry" 10 10 0 10 0
+	__test_flood ca:fe:be:ef:13:37 198.51.100.100 20 \
+		"flood vlan 20, unresolved FDB entry" 10 0 10 0 10
+
+	bridge fdb del dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self
+	bridge fdb del dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self
 }
 
 vxlan_fdb_add_del()
diff --git a/tools/testing/selftests/net/forwarding/vxlan_reserved.sh b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh
new file mode 100755
index 000000000000..46c31794b91b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh
@@ -0,0 +1,352 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------+
+# | H1 (vrf)           |
+# |    + $h1           |
+# |    | 192.0.2.1/28  |
+# +----|---------------+
+#      |
+# +----|--------------------------------+
+# | SW |                                |
+# | +--|------------------------------+ |
+# | |  + $swp1           BR1 (802.1d) | |
+# | |                                 | |
+# | |  + vx1 (vxlan)                  | |
+# | |    local 192.0.2.17             | |
+# | |    id 1000 dstport $VXPORT      | |
+# | +---------------------------------+ |
+# |                                     |
+# |  192.0.2.32/28 via 192.0.2.18       |
+# |                                     |
+# |  + $rp1                             |
+# |  | 192.0.2.17/28                    |
+# +--|----------------------------------+
+#    |
+# +--|----------------------------------+
+# |  |                                  |
+# |  + $rp2                             |
+# |    192.0.2.18/28                    |
+# |                                     |
+# | VRP2 (vrf)                          |
+# +-------------------------------------+
+
+: ${VXPORT:=4789}
+: ${ALL_TESTS:="
+	default_test
+	plain_test
+	reserved_0_test
+	reserved_10_test
+	reserved_31_test
+	reserved_56_test
+	reserved_63_test
+    "}
+
+NUM_NETIFS=4
+source lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28
+	defer simple_if_fini $h1 192.0.2.1/28
+
+	tc qdisc add dev $h1 clsact
+	defer tc qdisc del dev $h1 clsact
+
+	tc filter add dev $h1 ingress pref 77 \
+	   prot ip flower skip_hw ip_proto icmp action drop
+	defer tc filter del dev $h1 ingress pref 77
+}
+
+switch_create()
+{
+	ip_link_add br1 type bridge vlan_filtering 0 mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip_link_set_addr br1 $(mac_get $swp1)
+	ip_link_set_up br1
+
+	ip_link_set_up $rp1
+	ip_addr_add $rp1 192.0.2.17/28
+	ip_route_add 192.0.2.32/28 nexthop via 192.0.2.18
+
+	ip_link_set_master $swp1 br1
+	ip_link_set_up $swp1
+}
+
+vrp2_create()
+{
+	simple_if_init $rp2 192.0.2.18/28
+	defer simple_if_fini $rp2 192.0.2.18/28
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	rp1=${NETIFS[p3]}
+	rp2=${NETIFS[p4]}
+
+	vrf_prepare
+	defer vrf_cleanup
+
+	forwarding_enable
+	defer forwarding_restore
+
+	h1_create
+	switch_create
+
+	vrp2_create
+}
+
+vxlan_header_bytes()
+{
+	local vni=$1; shift
+	local -a extra_bits=("$@")
+	local -a bits
+	local i
+
+	for ((i=0; i < 64; i++)); do
+		bits[i]=0
+	done
+
+	# Bit 4 is the I flag and is always on.
+	bits[4]=1
+
+	for i in ${extra_bits[@]}; do
+		bits[i]=1
+	done
+
+	# Bits 32..55 carry the VNI
+	local mask=0x800000
+	for ((i=0; i < 24; i++)); do
+		bits[$((i + 32))]=$(((vni & mask) != 0))
+		((mask >>= 1))
+	done
+
+	local bytes
+	for ((i=0; i < 8; i++)); do
+		local byte=0
+		local j
+		for ((j=0; j < 8; j++)); do
+			local bit=${bits[8 * i + j]}
+			((byte += bit << (7 - j)))
+		done
+		bytes+=$(printf %02x $byte):
+	done
+
+	echo ${bytes%:}
+}
+
+neg_bytes()
+{
+	local bytes=$1; shift
+
+	local -A neg=([0]=f [1]=e [2]=d [3]=c [4]=b [5]=a [6]=9 [7]=8
+		      [8]=7 [9]=6 [a]=5 [b]=4 [c]=3 [d]=2 [e]=1 [f]=0 [:]=:)
+	local out
+	local i
+
+	for ((i=0; i < ${#bytes}; i++)); do
+		local c=${bytes:$i:1}
+		out+=${neg[$c]}
+	done
+	echo $out
+}
+
+vxlan_ping_do()
+{
+	local count=$1; shift
+	local dev=$1; shift
+	local next_hop_mac=$1; shift
+	local dest_ip=$1; shift
+	local dest_mac=$1; shift
+	local vni=$1; shift
+	local reserved_bits=$1; shift
+
+	local vxlan_header=$(vxlan_header_bytes $vni $reserved_bits)
+
+	$MZ $dev -c $count -d 100msec -q \
+		-b $next_hop_mac -B $dest_ip \
+		-t udp sp=23456,dp=$VXPORT,p=$(:
+		    )"$vxlan_header:"$(              : VXLAN
+		    )"$dest_mac:"$(                  : ETH daddr
+		    )"00:11:22:33:44:55:"$(          : ETH saddr
+		    )"08:00:"$(                      : ETH type
+		    )"45:"$(                         : IP version + IHL
+		    )"00:"$(                         : IP TOS
+		    )"00:54:"$(                      : IP total length
+		    )"99:83:"$(                      : IP identification
+		    )"40:00:"$(                      : IP flags + frag off
+		    )"40:"$(                         : IP TTL
+		    )"01:"$(                         : IP proto
+		    )"00:00:"$(                      : IP header csum
+		    )"$(ipv4_to_bytes 192.0.2.3):"$( : IP saddr
+		    )"$(ipv4_to_bytes 192.0.2.1):"$( : IP daddr
+		    )"08:"$(                         : ICMP type
+		    )"00:"$(                         : ICMP code
+		    )"8b:f2:"$(                      : ICMP csum
+		    )"1f:6a:"$(                      : ICMP request identifier
+		    )"00:01:"$(                      : ICMP request seq. number
+		    )"4f:ff:c5:5b:00:00:00:00:"$(    : ICMP payload
+		    )"6d:74:0b:00:00:00:00:00:"$(    :
+		    )"10:11:12:13:14:15:16:17:"$(    :
+		    )"18:19:1a:1b:1c:1d:1e:1f:"$(    :
+		    )"20:21:22:23:24:25:26:27:"$(    :
+		    )"28:29:2a:2b:2c:2d:2e:2f:"$(    :
+		    )"30:31:32:33:34:35:36:37"
+}
+
+vxlan_device_add()
+{
+	ip_link_add vx1 up type vxlan id 1000		\
+		local 192.0.2.17 dstport "$VXPORT"	\
+		nolearning noudpcsum tos inherit ttl 100 "$@"
+	ip_link_set_master vx1 br1
+}
+
+vxlan_all_reserved_bits()
+{
+	local i
+
+	for ((i=0; i < 64; i++)); do
+		if ((i == 4 || i >= 32 && i < 56)); then
+			continue
+		fi
+		echo $i
+	done
+}
+
+vxlan_ping_vanilla()
+{
+	vxlan_ping_do 10 $rp2 $(mac_get $rp1) 192.0.2.17 $(mac_get $h1) 1000
+}
+
+vxlan_ping_reserved()
+{
+	for bit in $(vxlan_all_reserved_bits); do
+		vxlan_ping_do 1 $rp2 $(mac_get $rp1) \
+			      192.0.2.17 $(mac_get $h1) 1000 "$bit"
+		((n++))
+	done
+}
+
+vxlan_ping_test()
+{
+	local what=$1; shift
+	local get_stat=$1; shift
+	local expect=$1; shift
+
+	RET=0
+
+	local t0=$($get_stat)
+
+	"$@"
+	check_err $? "Failure when running $@"
+
+	local t1=$($get_stat)
+	local delta=$((t1 - t0))
+
+	((expect == delta))
+	check_err $? "Expected to capture $expect packets, got $delta."
+
+	log_test "$what"
+}
+
+__default_test_do()
+{
+	local n_allowed_bits=$1; shift
+	local what=$1; shift
+
+	vxlan_ping_test "$what: clean packets" \
+		"tc_rule_stats_get $h1 77 ingress" \
+		10 vxlan_ping_vanilla
+
+	local t0=$(link_stats_get vx1 rx errors)
+	vxlan_ping_test "$what: mangled packets" \
+		"tc_rule_stats_get $h1 77 ingress" \
+		$n_allowed_bits vxlan_ping_reserved
+	local t1=$(link_stats_get vx1 rx errors)
+
+	RET=0
+	local expect=$((39 - n_allowed_bits))
+	local delta=$((t1 - t0))
+	((expect == delta))
+	check_err $? "Expected $expect error packets, got $delta."
+	log_test "$what: drops reported"
+}
+
+default_test_do()
+{
+	vxlan_device_add
+	__default_test_do 0 "Default"
+}
+
+default_test()
+{
+	in_defer_scope \
+	    default_test_do
+}
+
+plain_test_do()
+{
+	vxlan_device_add reserved_bits 0xf7ffffff000000ff
+	__default_test_do 0 "reserved_bits 0xf7ffffff000000ff"
+}
+
+plain_test()
+{
+	in_defer_scope \
+	    plain_test_do
+}
+
+reserved_test()
+{
+	local bit=$1; shift
+
+	local allowed_bytes=$(vxlan_header_bytes 0xffffff $bit)
+	local reserved_bytes=$(neg_bytes $allowed_bytes)
+	local reserved_bits=${reserved_bytes//:/}
+
+	vxlan_device_add reserved_bits 0x$reserved_bits
+	__default_test_do 1 "reserved_bits 0x$reserved_bits"
+}
+
+reserved_0_test()
+{
+	in_defer_scope \
+	    reserved_test 0
+}
+
+reserved_10_test()
+{
+	in_defer_scope \
+	    reserved_test 10
+}
+
+reserved_31_test()
+{
+	in_defer_scope \
+	    reserved_test 31
+}
+
+reserved_56_test()
+{
+	in_defer_scope \
+	    reserved_test 56
+}
+
+reserved_63_test()
+{
+	in_defer_scope \
+	    reserved_test 63
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/gre_ipv6_lladdr.sh b/tools/testing/selftests/net/gre_ipv6_lladdr.sh
new file mode 100755
index 000000000000..5b34f6e1f831
--- /dev/null
+++ b/tools/testing/selftests/net/gre_ipv6_lladdr.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./lib.sh
+
+PAUSE_ON_FAIL="no"
+
+# The trap function handler
+#
+exit_cleanup_all()
+{
+	cleanup_all_ns
+
+	exit "${EXIT_STATUS}"
+}
+
+# Add fake IPv4 and IPv6 networks on the loopback device, to be used as
+# underlay by future GRE devices.
+#
+setup_basenet()
+{
+	ip -netns "${NS0}" link set dev lo up
+	ip -netns "${NS0}" address add dev lo 192.0.2.10/24
+	ip -netns "${NS0}" address add dev lo 2001:db8::10/64 nodad
+}
+
+# Check if network device has an IPv6 link-local address assigned.
+#
+# Parameters:
+#
+#   * $1: The network device to test
+#   * $2: An extra regular expression that should be matched (to verify the
+#         presence of extra attributes)
+#   * $3: The expected return code from grep (to allow checking the absence of
+#         a link-local address)
+#   * $4: The user visible name for the scenario being tested
+#
+check_ipv6_ll_addr()
+{
+	local DEV="$1"
+	local EXTRA_MATCH="$2"
+	local XRET="$3"
+	local MSG="$4"
+
+	RET=0
+	set +e
+	ip -netns "${NS0}" -6 address show dev "${DEV}" scope link | grep "fe80::" | grep -q "${EXTRA_MATCH}"
+	check_err_fail "${XRET}" $? ""
+	log_test "${MSG}"
+	set -e
+}
+
+# Create a GRE device and verify that it gets an IPv6 link-local address as
+# expected.
+#
+# Parameters:
+#
+#   * $1: The device type (gre, ip6gre, gretap or ip6gretap)
+#   * $2: The local underlay IP address (can be an IPv4, an IPv6 or "any")
+#   * $3: The remote underlay IP address (can be an IPv4, an IPv6 or "any")
+#   * $4: The IPv6 interface identifier generation mode to use for the GRE
+#         device (eui64, none, stable-privacy or random).
+#
+test_gre_device()
+{
+	local GRE_TYPE="$1"
+	local LOCAL_IP="$2"
+	local REMOTE_IP="$3"
+	local MODE="$4"
+	local ADDR_GEN_MODE
+	local MATCH_REGEXP
+	local MSG
+
+	ip link add netns "${NS0}" name gretest type "${GRE_TYPE}" local "${LOCAL_IP}" remote "${REMOTE_IP}"
+
+	case "${MODE}" in
+	    "eui64")
+		ADDR_GEN_MODE=0
+		MATCH_REGEXP=""
+		MSG="${GRE_TYPE}, mode: 0 (EUI64), ${LOCAL_IP} -> ${REMOTE_IP}"
+		XRET=0
+		;;
+	    "none")
+		ADDR_GEN_MODE=1
+		MATCH_REGEXP=""
+		MSG="${GRE_TYPE}, mode: 1 (none), ${LOCAL_IP} -> ${REMOTE_IP}"
+		XRET=1 # No link-local address should be generated
+		;;
+	    "stable-privacy")
+		ADDR_GEN_MODE=2
+		MATCH_REGEXP="stable-privacy"
+		MSG="${GRE_TYPE}, mode: 2 (stable privacy), ${LOCAL_IP} -> ${REMOTE_IP}"
+		XRET=0
+		# Initialise stable_secret (required for stable-privacy mode)
+		ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.stable_secret="2001:db8::abcd"
+		;;
+	    "random")
+		ADDR_GEN_MODE=3
+		MATCH_REGEXP="stable-privacy"
+		MSG="${GRE_TYPE}, mode: 3 (random), ${LOCAL_IP} -> ${REMOTE_IP}"
+		XRET=0
+		;;
+	esac
+
+	# Check that IPv6 link-local address is generated when device goes up
+	ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}"
+	ip -netns "${NS0}" link set dev gretest up
+	check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}"
+
+	# Now disable link-local address generation
+	ip -netns "${NS0}" link set dev gretest down
+	ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode=1
+	ip -netns "${NS0}" link set dev gretest up
+
+	# Check that link-local address generation works when re-enabled while
+	# the device is already up
+	ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}"
+	check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}"
+
+	ip -netns "${NS0}" link del dev gretest
+}
+
+test_gre4()
+{
+	local GRE_TYPE
+	local MODE
+
+	for GRE_TYPE in "gre" "gretap"; do
+		printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n"
+
+		for MODE in "eui64" "none" "stable-privacy" "random"; do
+			test_gre_device "${GRE_TYPE}" 192.0.2.10 192.0.2.11 "${MODE}"
+			test_gre_device "${GRE_TYPE}" any 192.0.2.11 "${MODE}"
+			test_gre_device "${GRE_TYPE}" 192.0.2.10 any "${MODE}"
+		done
+	done
+}
+
+test_gre6()
+{
+	local GRE_TYPE
+	local MODE
+
+	for GRE_TYPE in "ip6gre" "ip6gretap"; do
+		printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n"
+
+		for MODE in "eui64" "none" "stable-privacy" "random"; do
+			test_gre_device "${GRE_TYPE}" 2001:db8::10 2001:db8::11 "${MODE}"
+			test_gre_device "${GRE_TYPE}" any 2001:db8::11 "${MODE}"
+			test_gre_device "${GRE_TYPE}" 2001:db8::10 any "${MODE}"
+		done
+	done
+}
+
+usage()
+{
+	echo "Usage: $0 [-p]"
+	exit 1
+}
+
+while getopts :p o
+do
+	case $o in
+		p) PAUSE_ON_FAIL="yes";;
+		*) usage;;
+	esac
+done
+
+setup_ns NS0
+
+set -e
+trap exit_cleanup_all EXIT
+
+setup_basenet
+
+test_gre4
+test_gre6
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 353e1e867fbb..d5824eadea10 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -93,6 +93,7 @@ static bool tx_socket = true;
 static int tcp_offset = -1;
 static int total_hdr_len = -1;
 static int ethhdr_proto = -1;
+static const int num_flush_id_cases = 6;
 
 static void vlog(const char *fmt, ...)
 {
@@ -119,6 +120,9 @@ static void setup_sock_filter(int fd)
 		next_off = offsetof(struct ipv6hdr, nexthdr);
 	ipproto_off = ETH_HLEN + next_off;
 
+	/* Overridden later if exthdrs are used: */
+	opt_ipproto_off = ipproto_off;
+
 	if (strcmp(testname, "ip") == 0) {
 		if (proto == PF_INET)
 			optlen = sizeof(struct ip_timestamp);
@@ -617,6 +621,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext
 	iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
 }
 
+static void fix_ip4_checksum(struct iphdr *iph)
+{
+	iph->check = 0;
+	iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
+}
+
+static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
+{
+	static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf3[MAX_HDR_LEN + PAYLOAD_LEN];
+	bool send_three = false;
+	struct iphdr *iph1;
+	struct iphdr *iph2;
+	struct iphdr *iph3;
+
+	iph1 = (struct iphdr *)(buf1 + ETH_HLEN);
+	iph2 = (struct iphdr *)(buf2 + ETH_HLEN);
+	iph3 = (struct iphdr *)(buf3 + ETH_HLEN);
+
+	create_packet(buf1, 0, 0, PAYLOAD_LEN, 0);
+	create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+
+	switch (tcase) {
+	case 0: /* DF=1, Incrementing - should coalesce */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(9);
+		break;
+
+	case 1: /* DF=1, Fixed - should coalesce */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(8);
+		break;
+
+	case 2: /* DF=0, Incrementing - should coalesce */
+		iph1->frag_off &= ~htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off &= ~htons(IP_DF);
+		iph2->id = htons(9);
+		break;
+
+	case 3: /* DF=0, Fixed - should not coalesce */
+		iph1->frag_off &= ~htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off &= ~htons(IP_DF);
+		iph2->id = htons(8);
+		break;
+
+	case 4: /* DF=1, two packets incrementing, and one fixed - should
+		 * coalesce only the first two packets
+		 */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(9);
+
+		iph3->frag_off |= htons(IP_DF);
+		iph3->id = htons(9);
+		send_three = true;
+		break;
+
+	case 5: /* DF=1, two packets fixed, and one incrementing - should
+		 * coalesce only the first two packets
+		 */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(8);
+
+		iph3->frag_off |= htons(IP_DF);
+		iph3->id = htons(9);
+		send_three = true;
+		break;
+	}
+
+	fix_ip4_checksum(iph1);
+	fix_ip4_checksum(iph2);
+	write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr);
+	write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr);
+
+	if (send_three) {
+		fix_ip4_checksum(iph3);
+		write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr);
+	}
+}
+
+static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt)
+{
+	for (int i = 0; i < num_flush_id_cases; i++) {
+		sleep(1);
+		send_flush_id_case(fd, daddr, i);
+		sleep(1);
+		write_packet(fd, fin_pkt, total_hdr_len, daddr);
+	}
+}
+
 static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2)
 {
 	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
@@ -935,6 +1046,8 @@ static void gro_sender(void)
 			send_fragment4(txfd, &daddr);
 			sleep(1);
 			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+			test_flush_id(txfd, &daddr, fin_pkt);
 		} else if (proto == PF_INET6) {
 			sleep(1);
 			send_fragment6(txfd, &daddr);
@@ -1061,6 +1174,34 @@ static void gro_receiver(void)
 
 			printf("fragmented ip4 doesn't coalesce: ");
 			check_recv_pkts(rxfd, correct_payload, 2);
+
+			/* is_atomic checks */
+			printf("DF=1, Incrementing - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=1, Fixed - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=0, Incrementing - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=0, Fixed - should not coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
+
+			printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
+
+			printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
 		} else if (proto == PF_INET6) {
 			/* GRO doesn't check for ipv6 hop limit when flushing.
 			 * Hence no corresponding test to the ipv4 case.
@@ -1177,11 +1318,13 @@ int main(int argc, char **argv)
 	read_MAC(src_mac, smac);
 	read_MAC(dst_mac, dmac);
 
-	if (tx_socket)
+	if (tx_socket) {
 		gro_sender();
-	else
+	} else {
+		/* Only the receiver exit status determines test success. */
 		gro_receiver();
+		fprintf(stderr, "Gro::%s test passed.\n", testname);
+	}
 
-	fprintf(stderr, "Gro::%s test passed.\n", testname);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh
index 02c21ff4ca81..9e3f186bc2a1 100755
--- a/tools/testing/selftests/net/gro.sh
+++ b/tools/testing/selftests/net/gro.sh
@@ -18,10 +18,10 @@ run_test() {
   "--smac" "${CLIENT_MAC}" "--test" "${test}" "--verbose" )
 
   setup_ns
-  # Each test is run 3 times to deflake, because given the receive timing,
+  # Each test is run 6 times to deflake, because given the receive timing,
   # not all packets that should coalesce will be considered in the same flow
   # on every try.
-  for tries in {1..3}; do
+  for tries in {1..6}; do
     # Actual test starts here
     ip netns exec $server_ns ./gro "${ARGS[@]}" "--rx" "--iface" "server" \
       1>>log.txt &
@@ -100,5 +100,6 @@ trap cleanup EXIT
 if [[ "${test}" == "all" ]]; then
   run_all_tests
 else
-  run_test "${proto}" "${test}"
+  exit_code=$(run_test "${proto}" "${test}")
+  exit $exit_code
 fi;
diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile
index 92c1d9d080cd..884cd2cc0681 100644
--- a/tools/testing/selftests/net/hsr/Makefile
+++ b/tools/testing/selftests/net/hsr/Makefile
@@ -2,6 +2,7 @@
 
 top_srcdir = ../../../../..
 
-TEST_PROGS := hsr_ping.sh
+TEST_PROGS := hsr_ping.sh hsr_redbox.sh
+TEST_FILES += hsr_common.sh
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/hsr/config b/tools/testing/selftests/net/hsr/config
index 22061204fb69..555a868743f0 100644
--- a/tools/testing/selftests/net/hsr/config
+++ b/tools/testing/selftests/net/hsr/config
@@ -2,3 +2,5 @@ CONFIG_IPV6=y
 CONFIG_NET_SCH_NETEM=m
 CONFIG_HSR=y
 CONFIG_VETH=y
+CONFIG_BRIDGE=y
+CONFIG_VLAN_8021Q=m
diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh
new file mode 100644
index 000000000000..1dc882ac1c74
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/hsr_common.sh
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+# Common code for HSR testing scripts
+
+source ../lib.sh
+ret=0
+ksft_skip=4
+
+# $1: IP address
+is_v6()
+{
+	[ -z "${1##*:*}" ]
+}
+
+do_ping()
+{
+	local netns="$1"
+	local connect_addr="$2"
+	local ping_args="-q -c 2 -i 0.1"
+
+	if is_v6 "${connect_addr}"; then
+		$ipv6 || return 0
+		ping_args="${ping_args} -6"
+	fi
+
+	ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null
+	if [ $? -ne 0 ] ; then
+		echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+do_ping_long()
+{
+	local netns="$1"
+	local connect_addr="$2"
+	local ping_args="-q -c 10 -i 0.1"
+
+	if is_v6 "${connect_addr}"; then
+		$ipv6 || return 0
+		ping_args="${ping_args} -6"
+	fi
+
+	OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)"
+	if [ $? -ne 0 ] ; then
+		echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2
+		ret=1
+		return 1
+	fi
+
+	VAL="$(echo $OUT | cut -d' ' -f1-8)"
+	SED_VAL="$(echo ${VAL} | sed -r -e 's/([0-9]{2}).*([0-9]{2}).*[[:space:]]([0-9]+%).*/\1 transmitted \2 received \3 loss/')"
+	if [ "${SED_VAL}" != "10 transmitted 10 received 0% loss" ]
+	then
+		echo "$netns -> $connect_addr ping TEST [ FAIL ]"
+		echo "Expect to send and receive 10 packets and no duplicates."
+		echo "Full message: ${OUT}."
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+stop_if_error()
+{
+	local msg="$1"
+
+	if [ ${ret} -ne 0 ]; then
+		echo "FAIL: ${msg}" 1>&2
+		exit ${ret}
+	fi
+}
+
+check_prerequisites()
+{
+	ip -Version > /dev/null 2>&1
+	if [ $? -ne 0 ];then
+		echo "SKIP: Could not run test without ip tool"
+		exit $ksft_skip
+	fi
+}
diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index 1c6457e54625..5a65f4f836be 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ret=0
-ksft_skip=4
 ipv6=true
 
+source ./hsr_common.sh
+
 optstring="h4"
 usage() {
 	echo "Usage: $0 [OPTION]"
@@ -27,88 +27,6 @@ while getopts "$optstring" option;do
 esac
 done
 
-sec=$(date +%s)
-rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
-ns1="ns1-$rndh"
-ns2="ns2-$rndh"
-ns3="ns3-$rndh"
-
-cleanup()
-{
-	local netns
-	for netns in "$ns1" "$ns2" "$ns3" ;do
-		ip netns del $netns
-	done
-}
-
-# $1: IP address
-is_v6()
-{
-	[ -z "${1##*:*}" ]
-}
-
-do_ping()
-{
-	local netns="$1"
-	local connect_addr="$2"
-	local ping_args="-q -c 2"
-
-	if is_v6 "${connect_addr}"; then
-		$ipv6 || return 0
-		ping_args="${ping_args} -6"
-	fi
-
-	ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null
-	if [ $? -ne 0 ] ; then
-		echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2
-		ret=1
-		return 1
-	fi
-
-	return 0
-}
-
-do_ping_long()
-{
-	local netns="$1"
-	local connect_addr="$2"
-	local ping_args="-q -c 10"
-
-	if is_v6 "${connect_addr}"; then
-		$ipv6 || return 0
-		ping_args="${ping_args} -6"
-	fi
-
-	OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)"
-	if [ $? -ne 0 ] ; then
-		echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2
-		ret=1
-		return 1
-	fi
-
-	VAL="$(echo $OUT | cut -d' ' -f1-8)"
-	if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ]
-	then
-		echo "$netns -> $connect_addr ping TEST [ FAIL ]"
-		echo "Expect to send and receive 10 packets and no duplicates."
-		echo "Full message: ${OUT}."
-		ret=1
-		return 1
-	fi
-
-	return 0
-}
-
-stop_if_error()
-{
-	local msg="$1"
-
-	if [ ${ret} -ne 0 ]; then
-		echo "FAIL: ${msg}" 1>&2
-		exit ${ret}
-	fi
-}
-
 do_complete_ping_test()
 {
 	echo "INFO: Initial validation ping."
@@ -234,6 +152,15 @@ setup_hsr_interfaces()
 	ip -net "$ns3" addr add 100.64.0.3/24 dev hsr3
 	ip -net "$ns3" addr add dead:beef:1::3/64 dev hsr3 nodad
 
+	ip -net "$ns1" link set address 00:11:22:00:01:01 dev ns1eth1
+	ip -net "$ns1" link set address 00:11:22:00:01:02 dev ns1eth2
+
+	ip -net "$ns2" link set address 00:11:22:00:02:01 dev ns2eth1
+	ip -net "$ns2" link set address 00:11:22:00:02:02 dev ns2eth2
+
+	ip -net "$ns3" link set address 00:11:22:00:03:01 dev ns3eth1
+	ip -net "$ns3" link set address 00:11:22:00:03:02 dev ns3eth2
+
 	# All Links up
 	ip -net "$ns1" link set ns1eth1 up
 	ip -net "$ns1" link set ns1eth2 up
@@ -248,29 +175,115 @@ setup_hsr_interfaces()
 	ip -net "$ns3" link set hsr3 up
 }
 
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
+setup_vlan_interfaces() {
+	ip -net "$ns1" link add link hsr1 name hsr1.2 type vlan id 2
+	ip -net "$ns1" link add link hsr1 name hsr1.3 type vlan id 3
+	ip -net "$ns1" link add link hsr1 name hsr1.4 type vlan id 4
+	ip -net "$ns1" link add link hsr1 name hsr1.5 type vlan id 5
+
+	ip -net "$ns2" link add link hsr2 name hsr2.2 type vlan id 2
+	ip -net "$ns2" link add link hsr2 name hsr2.3 type vlan id 3
+	ip -net "$ns2" link add link hsr2 name hsr2.4 type vlan id 4
+	ip -net "$ns2" link add link hsr2 name hsr2.5 type vlan id 5
+
+	ip -net "$ns3" link add link hsr3 name hsr3.2 type vlan id 2
+	ip -net "$ns3" link add link hsr3 name hsr3.3 type vlan id 3
+	ip -net "$ns3" link add link hsr3 name hsr3.4 type vlan id 4
+	ip -net "$ns3" link add link hsr3 name hsr3.5 type vlan id 5
+
+	ip -net "$ns1" addr add 100.64.2.1/24 dev hsr1.2
+	ip -net "$ns1" addr add 100.64.3.1/24 dev hsr1.3
+	ip -net "$ns1" addr add 100.64.4.1/24 dev hsr1.4
+	ip -net "$ns1" addr add 100.64.5.1/24 dev hsr1.5
+
+	ip -net "$ns2" addr add 100.64.2.2/24 dev hsr2.2
+	ip -net "$ns2" addr add 100.64.3.2/24 dev hsr2.3
+	ip -net "$ns2" addr add 100.64.4.2/24 dev hsr2.4
+	ip -net "$ns2" addr add 100.64.5.2/24 dev hsr2.5
+
+	ip -net "$ns3" addr add 100.64.2.3/24 dev hsr3.2
+	ip -net "$ns3" addr add 100.64.3.3/24 dev hsr3.3
+	ip -net "$ns3" addr add 100.64.4.3/24 dev hsr3.4
+	ip -net "$ns3" addr add 100.64.5.3/24 dev hsr3.5
+
+	ip -net "$ns1" link set dev hsr1.2 up
+	ip -net "$ns1" link set dev hsr1.3 up
+	ip -net "$ns1" link set dev hsr1.4 up
+	ip -net "$ns1" link set dev hsr1.5 up
+
+	ip -net "$ns2" link set dev hsr2.2 up
+	ip -net "$ns2" link set dev hsr2.3 up
+	ip -net "$ns2" link set dev hsr2.4 up
+	ip -net "$ns2" link set dev hsr2.5 up
+
+	ip -net "$ns3" link set dev hsr3.2 up
+	ip -net "$ns3" link set dev hsr3.3 up
+	ip -net "$ns3" link set dev hsr3.4 up
+	ip -net "$ns3" link set dev hsr3.5 up
 
-trap cleanup EXIT
+}
 
-for i in "$ns1" "$ns2" "$ns3" ;do
-	ip netns add $i || exit $ksft_skip
-	ip -net $i link set lo up
-done
+hsr_vlan_ping() {
+	do_ping "$ns1" 100.64.2.2
+	do_ping "$ns1" 100.64.3.2
+	do_ping "$ns1" 100.64.4.2
+	do_ping "$ns1" 100.64.5.2
+
+	do_ping "$ns1" 100.64.2.3
+	do_ping "$ns1" 100.64.3.3
+	do_ping "$ns1" 100.64.4.3
+	do_ping "$ns1" 100.64.5.3
+
+	do_ping "$ns2" 100.64.2.1
+	do_ping "$ns2" 100.64.3.1
+	do_ping "$ns2" 100.64.4.1
+	do_ping "$ns2" 100.64.5.1
+
+	do_ping "$ns2" 100.64.2.3
+	do_ping "$ns2" 100.64.3.3
+	do_ping "$ns2" 100.64.4.3
+	do_ping "$ns2" 100.64.5.3
+
+	do_ping "$ns3" 100.64.2.1
+	do_ping "$ns3" 100.64.3.1
+	do_ping "$ns3" 100.64.4.1
+	do_ping "$ns3" 100.64.5.1
+
+	do_ping "$ns3" 100.64.2.2
+	do_ping "$ns3" 100.64.3.2
+	do_ping "$ns3" 100.64.4.2
+	do_ping "$ns3" 100.64.5.2
+}
+
+run_vlan_tests() {
+	vlan_challenged_hsr1=$(ip net exec "$ns1" ethtool -k hsr1 | grep "vlan-challenged" | awk '{print $2}')
+	vlan_challenged_hsr2=$(ip net exec "$ns2" ethtool -k hsr2 | grep "vlan-challenged" | awk '{print $2}')
+	vlan_challenged_hsr3=$(ip net exec "$ns3" ethtool -k hsr3 | grep "vlan-challenged" | awk '{print $2}')
+
+	if [[ "$vlan_challenged_hsr1" = "off" || "$vlan_challenged_hsr2" = "off" || "$vlan_challenged_hsr3" = "off" ]]; then
+		echo "INFO: Running VLAN tests"
+		setup_vlan_interfaces
+		hsr_vlan_ping
+	else
+		echo "INFO: Not Running VLAN tests as the device does not support VLAN"
+	fi
+}
+
+check_prerequisites
+setup_ns ns1 ns2 ns3
+
+trap cleanup_all_ns EXIT
 
 setup_hsr_interfaces 0
 do_complete_ping_test
-cleanup
 
-for i in "$ns1" "$ns2" "$ns3" ;do
-	ip netns add $i || exit $ksft_skip
-	ip -net $i link set lo up
-done
+run_vlan_tests
+
+setup_ns ns1 ns2 ns3
 
 setup_hsr_interfaces 1
 do_complete_ping_test
 
+run_vlan_tests
+
 exit $ret
diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh
new file mode 100755
index 000000000000..998103502d5d
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ipv6=false
+
+source ./hsr_common.sh
+
+do_complete_ping_test()
+{
+	echo "INFO: Initial validation ping (HSR-SAN/RedBox)."
+	# Each node has to be able to reach each one.
+	do_ping "${ns1}" 100.64.0.2
+	do_ping "${ns2}" 100.64.0.1
+	# Ping between SANs (test bridge)
+	do_ping "${ns4}" 100.64.0.51
+	do_ping "${ns5}" 100.64.0.41
+	# Ping from SANs to hsr1 (via hsr2) (and opposite)
+	do_ping "${ns3}" 100.64.0.1
+	do_ping "${ns1}" 100.64.0.3
+	do_ping "${ns1}" 100.64.0.41
+	do_ping "${ns4}" 100.64.0.1
+	do_ping "${ns1}" 100.64.0.51
+	do_ping "${ns5}" 100.64.0.1
+	stop_if_error "Initial validation failed."
+
+	# Wait for MGNT HSR frames being received and nodes being
+	# merged.
+	sleep 5
+
+	echo "INFO: Longer ping test (HSR-SAN/RedBox)."
+	# Ping from SAN to hsr1 (via hsr2)
+	do_ping_long "${ns3}" 100.64.0.1
+	# Ping from hsr1 (via hsr2) to SANs (and opposite)
+	do_ping_long "${ns1}" 100.64.0.3
+	do_ping_long "${ns1}" 100.64.0.41
+	do_ping_long "${ns4}" 100.64.0.1
+	do_ping_long "${ns1}" 100.64.0.51
+	do_ping_long "${ns5}" 100.64.0.1
+	stop_if_error "Longer ping test failed."
+
+	echo "INFO: All good."
+}
+
+setup_hsr_interfaces()
+{
+	local HSRv="$1"
+
+	echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)."
+#
+# IPv4 addresses (100.64.X.Y/24), and [X.Y] is presented on below diagram:
+#
+#
+# |NS1                     |               |NS4                |
+# |       [0.1]            |               |                   |
+# |    /-- hsr1 --\        |               |    [0.41]         |
+# | ns1eth1     ns1eth2    |               |    ns4eth1 (SAN)  |
+# |------------------------|               |-------------------|
+#      |            |                                |
+#      |            |                                |
+#      |            |                                |
+# |------------------------|   |-------------------------------|
+# | ns2eth1     ns2eth2    |   |                  ns3eth2      |
+# |    \-- hsr2 --/        |   |                 /             |
+# |      [0.2] \           |   |                /              |  |------------|
+# |             ns2eth3    |---| ns3eth1 -- ns3br1 -- ns3eth3--|--| ns5eth1    |
+# |             (interlink)|   | [0.3]      [0.11]             |  | [0.51]     |
+# |NS2 (RedBOX)            |   |NS3 (BR)                       |  | NS5 (SAN)  |
+#
+#
+	# Check if iproute2 supports adding interlink port to hsrX device
+	ip link help hsr | grep -q INTERLINK
+	[ $? -ne 0 ] && { echo "iproute2: HSR interlink interface not supported!"; exit 0; }
+
+	# Create interfaces for name spaces
+	ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}"
+	ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}"
+	ip link add ns2eth3 netns "${ns2}" type veth peer name ns3eth1 netns "${ns3}"
+	ip link add ns3eth2 netns "${ns3}" type veth peer name ns4eth1 netns "${ns4}"
+	ip link add ns3eth3 netns "${ns3}" type veth peer name ns5eth1 netns "${ns5}"
+
+	sleep 1
+
+	ip -n "${ns1}" link set ns1eth1 up
+	ip -n "${ns1}" link set ns1eth2 up
+
+	ip -n "${ns2}" link set ns2eth1 up
+	ip -n "${ns2}" link set ns2eth2 up
+	ip -n "${ns2}" link set ns2eth3 up
+
+	ip -n "${ns3}" link add name ns3br1 type bridge
+	ip -n "${ns3}" link set ns3br1 up
+	ip -n "${ns3}" link set ns3eth1 master ns3br1 up
+	ip -n "${ns3}" link set ns3eth2 master ns3br1 up
+	ip -n "${ns3}" link set ns3eth3 master ns3br1 up
+
+	ip -n "${ns4}" link set ns4eth1 up
+	ip -n "${ns5}" link set ns5eth1 up
+
+	ip -net "$ns1" link set address 00:11:22:00:01:01 dev ns1eth1
+	ip -net "$ns1" link set address 00:11:22:00:01:02 dev ns1eth2
+
+	ip -net "$ns2" link set address 00:11:22:00:02:01 dev ns2eth1
+	ip -net "$ns2" link set address 00:11:22:00:02:02 dev ns2eth2
+	ip -net "$ns2" link set address 00:11:22:00:02:03 dev ns2eth3
+
+	ip -net "$ns3" link set address 00:11:22:00:03:11 dev ns3eth1
+	ip -net "$ns3" link set address 00:11:22:00:03:11 dev ns3eth2
+	ip -net "$ns3" link set address 00:11:22:00:03:11 dev ns3eth3
+	ip -net "$ns3" link set address 00:11:22:00:03:11 dev ns3br1
+
+	ip -net "$ns4" link set address 00:11:22:00:04:01 dev ns4eth1
+	ip -net "$ns5" link set address 00:11:22:00:05:01 dev ns5eth1
+
+	ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0
+	ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0
+
+	ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1
+	ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2
+	ip -n "${ns3}" addr add 100.64.0.11/24 dev ns3br1
+	ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1
+	ip -n "${ns4}" addr add 100.64.0.41/24 dev ns4eth1
+	ip -n "${ns5}" addr add 100.64.0.51/24 dev ns5eth1
+
+	ip -n "${ns1}" link set hsr1 up
+	ip -n "${ns2}" link set hsr2 up
+}
+
+check_prerequisites
+setup_ns ns1 ns2 ns3 ns4 ns5
+
+trap cleanup_all_ns EXIT
+
+setup_hsr_interfaces 1
+do_complete_ping_test
+
+exit $ret
diff --git a/tools/testing/selftests/net/hsr/settings b/tools/testing/selftests/net/hsr/settings
new file mode 100644
index 000000000000..0fbc037f2aa8
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/settings
@@ -0,0 +1 @@
+timeout=50
diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh
index d6f0e449c029..b13c89a99ecb 100755
--- a/tools/testing/selftests/net/icmp_redirect.sh
+++ b/tools/testing/selftests/net/icmp_redirect.sh
@@ -178,8 +178,6 @@ setup()
 		else
 			ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
 			ip netns exec $ns sysctl -q -w net.ipv4.conf.all.send_redirects=1
-			ip netns exec $ns sysctl -q -w net.ipv4.conf.default.rp_filter=0
-			ip netns exec $ns sysctl -q -w net.ipv4.conf.all.rp_filter=0
 
 			ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
 			ip netns exec $ns sysctl -q -w net.ipv6.route.mtu_expires=10
diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh
index 12491850ae98..845c26dd01a9 100755
--- a/tools/testing/selftests/net/ioam6.sh
+++ b/tools/testing/selftests/net/ioam6.sh
@@ -3,119 +3,106 @@
 #
 # Author: Justin Iurman <justin.iurman@uliege.be>
 #
-# This script evaluates the IOAM insertion for IPv6 by checking the IOAM data
-# consistency directly inside packets on the receiver side. Tests are divided
-# into three categories: OUTPUT (evaluates the IOAM processing by the sender),
-# INPUT (evaluates the IOAM processing by a receiver) and GLOBAL (evaluates
-# wider use cases that do not fall into the other two categories). Both OUTPUT
-# and INPUT tests only use a two-node topology (alpha and beta), while GLOBAL
-# tests use the entire three-node topology (alpha, beta, gamma). Each test is
-# documented inside its own handler in the code below.
+# This script evaluates IOAM for IPv6 by checking local IOAM configurations and
+# IOAM data inside packets. There are three categories of tests: LOCAL, OUTPUT,
+# and INPUT. The former (LOCAL) checks all IOAM related configurations locally
+# without sending packets. OUTPUT tests verify the processing of an IOAM
+# encapsulating node, while INPUT tests verify the processing of an IOAM transit
+# node. Both OUTPUT and INPUT tests send packets. Each test is documented inside
+# its own handler.
 #
-# An IOAM domain is configured from Alpha to Gamma but not on the reverse path.
-# When either Beta or Gamma is the destination (depending on the test category),
-# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-hop.
+# The topology used for OUTPUT and INPUT tests is made of three nodes:
+# - Alpha (the IOAM encapsulating node)
+# - Beta  (the IOAM transit node)
+# - Gamma (the receiver) **
 #
+# An IOAM domain is configured from Alpha to Beta, but not on the reverse path.
+# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-hop.
 #
-#            +-------------------+            +-------------------+
-#            |                   |            |                   |
-#            |    Alpha netns    |            |    Gamma netns    |
-#            |                   |            |                   |
-#            |  +-------------+  |            |  +-------------+  |
-#            |  |    veth0    |  |            |  |    veth0    |  |
-#            |  |  db01::2/64 |  |            |  |  db02::2/64 |  |
-#            |  +-------------+  |            |  +-------------+  |
-#            |         .         |            |         .         |
-#            +-------------------+            +-------------------+
-#                      .                                .
-#                      .                                .
-#                      .                                .
-#            +----------------------------------------------------+
-#            |         .                                .         |
-#            |  +-------------+                  +-------------+  |
-#            |  |    veth0    |                  |    veth1    |  |
-#            |  |  db01::1/64 | ................ |  db02::1/64 |  |
-#            |  +-------------+                  +-------------+  |
-#            |                                                    |
-#            |                      Beta netns                    |
-#            |                                                    |
-#            +----------------------------------------------------+
+# ** Gamma is required because ioam6_parser.c uses a packet socket and we need
+#    to see IOAM data inserted by the very last node (Beta), which would happen
+#    _after_ we get a copy of the packet on Beta. Note that using an
+#    IPv6 raw socket with IPV6_RECVHOPOPTS on Beta would not be enough: we also
+#    need to access the IPv6 header to check some fields (e.g., source and
+#    destination addresses), which is not possible in that case. As a
+#    consequence, we need Gamma as a receiver to run ioam6_parser.c which uses a
+#    packet socket.
 #
 #
+#         +-----------------------+          +-----------------------+
+#         |                       |          |                       |
+#         |      Alpha netns      |          |      Gamma netns      |
+#         |                       |          |                       |
+#         | +-------------------+ |          | +-------------------+ |
+#         | |       veth0       | |          | |       veth0       | |
+#         | | 2001:db8:1::2/64  | |          | | 2001:db8:2::2/64  | |
+#         | +-------------------+ |          | +-------------------+ |
+#         |           .           |          |           .           |
+#         +-----------.-----------+          +-----------.-----------+
+#                     .                                  .
+#                     .                                  .
+#                     .                                  .
+#         +-----------.----------------------------------.-----------+
+#         |           .                                  .           |
+#         | +-------------------+              +-------------------+ |
+#         | |       veth0       |              |       veth1       | |
+#         | | 2001:db8:1::1/64  | ............ | 2001:db8:2::1/64  | |
+#         | +-------------------+              +-------------------+ |
+#         |                                                          |
+#         |                        Beta netns                        |
+#         |                                                          |
+#         +----------------------------------------------------------+
 #
-#        =============================================================
-#        |                Alpha - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 1                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 11111111                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 0xffff (default value)              |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 0xffffffff (default value)          |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 101                                 |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 101101                              |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee0                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf00dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 777                                 |
-#        +-----------------------------------------------------------+
-#        | Schema Data         | something that will be 4n-aligned   |
-#        +-----------------------------------------------------------+
 #
 #
-#        =============================================================
-#        |                 Beta - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 2                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 22222222                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 201                                 |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 201201                              |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 202                                 |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 202202                              |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee1                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf11dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 666                                 |
-#        +-----------------------------------------------------------+
-#        | Schema Data         | Hello there -Obi                    |
-#        +-----------------------------------------------------------+
+#         +==========================================================+
+#         |                Alpha - IOAM configuration                |
+#         +=====================+====================================+
+#         | Node ID             | 1                                  |
+#         +---------------------+------------------------------------+
+#         | Node Wide ID        | 11111111                           |
+#         +---------------------+------------------------------------+
+#         | Ingress ID          | 0xffff (default value)             |
+#         +---------------------+------------------------------------+
+#         | Ingress Wide ID     | 0xffffffff (default value)         |
+#         +---------------------+------------------------------------+
+#         | Egress ID           | 101                                |
+#         +---------------------+------------------------------------+
+#         | Egress Wide ID      | 101101                             |
+#         +---------------------+------------------------------------+
+#         | Namespace Data      | 0xdeadbeef                         |
+#         +---------------------+------------------------------------+
+#         | Namespace Wide Data | 0xcafec0caf00dc0de                 |
+#         +---------------------+------------------------------------+
+#         | Schema ID           | 777                                |
+#         +---------------------+------------------------------------+
+#         | Schema Data         | something that will be 4n-aligned  |
+#         +---------------------+------------------------------------+
 #
 #
-#        =============================================================
-#        |                Gamma - IOAM configuration                 |
-#        +===========================================================+
-#        | Node ID             | 3                                   |
-#        +-----------------------------------------------------------+
-#        | Node Wide ID        | 33333333                            |
-#        +-----------------------------------------------------------+
-#        | Ingress ID          | 301                                 |
-#        +-----------------------------------------------------------+
-#        | Ingress Wide ID     | 301301                              |
-#        +-----------------------------------------------------------+
-#        | Egress ID           | 0xffff (default value)              |
-#        +-----------------------------------------------------------+
-#        | Egress Wide ID      | 0xffffffff (default value)          |
-#        +-----------------------------------------------------------+
-#        | Namespace Data      | 0xdeadbee2                          |
-#        +-----------------------------------------------------------+
-#        | Namespace Wide Data | 0xcafec0caf22dc0de                  |
-#        +-----------------------------------------------------------+
-#        | Schema ID           | 0xffffff (= None)                   |
-#        +-----------------------------------------------------------+
-#        | Schema Data         |                                     |
-#        +-----------------------------------------------------------+
+#         +==========================================================+
+#         |                 Beta - IOAM configuration                |
+#         +=====================+====================================+
+#         | Node ID             | 2                                  |
+#         +---------------------+------------------------------------+
+#         | Node Wide ID        | 22222222                           |
+#         +---------------------+------------------------------------+
+#         | Ingress ID          | 201                                |
+#         +---------------------+------------------------------------+
+#         | Ingress Wide ID     | 201201                             |
+#         +---------------------+------------------------------------+
+#         | Egress ID           | 202                                |
+#         +---------------------+------------------------------------+
+#         | Egress Wide ID      | 202202                             |
+#         +---------------------+------------------------------------+
+#         | Namespace Data      | 0xffffffff (default value)         |
+#         +---------------------+------------------------------------+
+#         | Namespace Wide Data | 0xffffffffffffffff (default value) |
+#         +---------------------+------------------------------------+
+#         | Schema ID           | 0xffffff (= None)                  |
+#         +---------------------+------------------------------------+
+#         | Schema Data         |                                    |
+#         +---------------------+------------------------------------+
 
 source lib.sh
 
@@ -128,64 +115,69 @@ source lib.sh
 ################################################################################
 
 ALPHA=(
-	1					# ID
-	11111111				# Wide ID
-	0xffff					# Ingress ID
-	0xffffffff				# Ingress Wide ID
-	101					# Egress ID
-	101101					# Egress Wide ID
-	0xdeadbee0				# Namespace Data
-	0xcafec0caf00dc0de			# Namespace Wide Data
-	777					# Schema ID (0xffffff = None)
-	"something that will be 4n-aligned"	# Schema Data
+  1                                    # ID
+  11111111                             # Wide ID
+  0xffff                               # Ingress ID (default value)
+  0xffffffff                           # Ingress Wide ID (default value)
+  101                                  # Egress ID
+  101101                               # Egress Wide ID
+  0xdeadbeef                           # Namespace Data
+  0xcafec0caf00dc0de                   # Namespace Wide Data
+  777                                  # Schema ID
+  "something that will be 4n-aligned"  # Schema Data
 )
 
 BETA=(
-	2
-	22222222
-	201
-	201201
-	202
-	202202
-	0xdeadbee1
-	0xcafec0caf11dc0de
-	666
-	"Hello there -Obi"
+  2                                    # ID
+  22222222                             # Wide ID
+  201                                  # Ingress ID
+  201201                               # Ingress Wide ID
+  202                                  # Egress ID
+  202202                               # Egress Wide ID
+  0xffffffff                           # Namespace Data (empty value)
+  0xffffffffffffffff                   # Namespace Wide Data (empty value)
+  0xffffff                             # Schema ID (empty value)
+  ""                                   # Schema Data (empty value)
 )
 
-GAMMA=(
-	3
-	33333333
-	301
-	301301
-	0xffff
-	0xffffffff
-	0xdeadbee2
-	0xcafec0caf22dc0de
-	0xffffff
-	""
-)
+TESTS_LOCAL="
+  local_sysctl_ioam_id
+  local_sysctl_ioam_id_wide
+  local_sysctl_ioam_intf_id
+  local_sysctl_ioam_intf_id_wide
+  local_sysctl_ioam_intf_enabled
+  local_ioam_namespace
+  local_ioam_schema
+  local_ioam_schema_namespace
+  local_route_ns
+  local_route_tunsrc
+  local_route_tundst
+  local_route_trace_type
+  local_route_trace_size
+  local_route_trace_type_bits
+  local_route_trace_size_values
+"
 
 TESTS_OUTPUT="
-	out_undef_ns
-	out_no_room
-	out_bits
-	out_full_supp_trace
+  output_undef_ns
+  output_no_room
+  output_no_room_oss
+  output_bits
+  output_sizes
+  output_full_supp_trace
 "
 
 TESTS_INPUT="
-	in_undef_ns
-	in_no_room
-	in_oflag
-	in_bits
-	in_full_supp_trace
+  input_undef_ns
+  input_no_room
+  input_no_room_oss
+  input_disabled
+  input_oflag
+  input_bits
+  input_sizes
+  input_full_supp_trace
 "
 
-TESTS_GLOBAL="
-	fwd_full_supp_trace
-"
-
-
 ################################################################################
 #                                                                              #
 #                                   LIBRARY                                    #
@@ -194,66 +186,64 @@ TESTS_GLOBAL="
 
 check_kernel_compatibility()
 {
-  setup_ns ioam_tmp_node
-  ip link add name veth0 netns $ioam_tmp_node type veth \
-         peer name veth1 netns $ioam_tmp_node
+  setup_ns ioam_tmp_node &>/dev/null
+  local ret=$?
 
-  ip -netns $ioam_tmp_node link set veth0 up
-  ip -netns $ioam_tmp_node link set veth1 up
+  ip link add name veth0 netns $ioam_tmp_node type veth \
+    peer name veth1 netns $ioam_tmp_node &>/dev/null
+  ret=$((ret + $?))
 
-  ip -netns $ioam_tmp_node ioam namespace add 0
-  ns_ad=$?
+  ip -netns $ioam_tmp_node link set veth0 up &>/dev/null
+  ret=$((ret + $?))
 
-  ip -netns $ioam_tmp_node ioam namespace show | grep -q "namespace 0"
-  ns_sh=$?
+  ip -netns $ioam_tmp_node link set veth1 up &>/dev/null
+  ret=$((ret + $?))
 
-  if [[ $ns_ad != 0 || $ns_sh != 0 ]]
+  if [ $ret != 0 ]
   then
-    echo "SKIP: kernel version probably too old, missing ioam support"
-    ip link del veth0 2>/dev/null || true
-    cleanup_ns $ioam_tmp_node || true
+    echo "SKIP: Setup failed."
+    cleanup_ns $ioam_tmp_node
     exit $ksft_skip
   fi
 
-  ip -netns $ioam_tmp_node route add db02::/64 encap ioam6 mode inline \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
-  tr_ad=$?
+  ip -netns $ioam_tmp_node route add 2001:db8:2::/64 \
+    encap ioam6 trace prealloc type 0x800000 ns 0 size 4 dev veth0 &>/dev/null
+  ret=$?
 
-  ip -netns $ioam_tmp_node -6 route | grep -q "encap ioam6"
-  tr_sh=$?
+  ip -netns $ioam_tmp_node -6 route 2>/dev/null | grep -q "encap ioam6"
+  ret=$((ret + $?))
 
-  if [[ $tr_ad != 0 || $tr_sh != 0 ]]
+  if [ $ret != 0 ]
   then
-    echo "SKIP: cannot attach an ioam trace to a route, did you compile" \
-         "without CONFIG_IPV6_IOAM6_LWTUNNEL?"
-    ip link del veth0 2>/dev/null || true
-    cleanup_ns $ioam_tmp_node || true
+    echo "SKIP: Cannot attach an IOAM trace to a route. Was your kernel" \
+         "compiled without CONFIG_IPV6_IOAM6_LWTUNNEL? Are you running an" \
+         "old kernel? Are you using an old version of iproute2?"
+    cleanup_ns $ioam_tmp_node
     exit $ksft_skip
   fi
 
-  ip link del veth0 2>/dev/null || true
-  cleanup_ns $ioam_tmp_node || true
+  cleanup_ns $ioam_tmp_node
 
-  lsmod | grep -q "ip6_tunnel"
+  lsmod 2>/dev/null | grep -q "ip6_tunnel"
   ip6tnl_loaded=$?
 
-  if [ $ip6tnl_loaded = 0 ]
+  if [ $ip6tnl_loaded == 0 ]
   then
     encap_tests=0
   else
     modprobe ip6_tunnel &>/dev/null
-    lsmod | grep -q "ip6_tunnel"
+    lsmod 2>/dev/null | grep -q "ip6_tunnel"
     encap_tests=$?
 
     if [ $encap_tests != 0 ]
     then
-      ip a | grep -q "ip6tnl0"
+      ip a 2>/dev/null | grep -q "ip6tnl0"
       encap_tests=$?
 
       if [ $encap_tests != 0 ]
       then
         echo "Note: ip6_tunnel not found neither as a module nor inside the" \
-             "kernel, tests that require it (encap mode) will be omitted"
+             "kernel. Any tests that require it will be skipped."
       fi
     fi
   fi
@@ -261,477 +251,1400 @@ check_kernel_compatibility()
 
 cleanup()
 {
-  ip link del ioam-veth-alpha 2>/dev/null || true
-  ip link del ioam-veth-gamma 2>/dev/null || true
-
-  cleanup_ns $ioam_node_alpha $ioam_node_beta $ioam_node_gamma || true
+  cleanup_ns $ioam_node_alpha $ioam_node_beta $ioam_node_gamma
 
   if [ $ip6tnl_loaded != 0 ]
   then
-    modprobe -r ip6_tunnel 2>/dev/null || true
+    modprobe -r ip6_tunnel &>/dev/null
   fi
 }
 
 setup()
 {
-  setup_ns ioam_node_alpha ioam_node_beta ioam_node_gamma
+  setup_ns ioam_node_alpha ioam_node_beta ioam_node_gamma &>/dev/null
 
   ip link add name ioam-veth-alpha netns $ioam_node_alpha type veth \
-         peer name ioam-veth-betaL netns $ioam_node_beta
+    peer name ioam-veth-betaL netns $ioam_node_beta &>/dev/null
   ip link add name ioam-veth-betaR netns $ioam_node_beta type veth \
-         peer name ioam-veth-gamma netns $ioam_node_gamma
-
-  ip -netns $ioam_node_alpha link set ioam-veth-alpha name veth0
-  ip -netns $ioam_node_beta link set ioam-veth-betaL name veth0
-  ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1
-  ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0
-
-  ip -netns $ioam_node_alpha addr add db01::2/64 dev veth0
-  ip -netns $ioam_node_alpha link set veth0 up
-  ip -netns $ioam_node_alpha link set lo up
-  ip -netns $ioam_node_alpha route add db02::/64 via db01::1 dev veth0
-  ip -netns $ioam_node_alpha route del db01::/64
-  ip -netns $ioam_node_alpha route add db01::/64 dev veth0
-
-  ip -netns $ioam_node_beta addr add db01::1/64 dev veth0
-  ip -netns $ioam_node_beta addr add db02::1/64 dev veth1
-  ip -netns $ioam_node_beta link set veth0 up
-  ip -netns $ioam_node_beta link set veth1 up
-  ip -netns $ioam_node_beta link set lo up
-
-  ip -netns $ioam_node_gamma addr add db02::2/64 dev veth0
-  ip -netns $ioam_node_gamma link set veth0 up
-  ip -netns $ioam_node_gamma link set lo up
-  ip -netns $ioam_node_gamma route add db01::/64 via db02::1 dev veth0
-
-  # - IOAM config -
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]}
-  ip netns exec $ioam_node_alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]}
-  ip -netns $ioam_node_alpha ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]}
-  ip -netns $ioam_node_alpha ioam schema add ${ALPHA[8]} "${ALPHA[9]}"
-  ip -netns $ioam_node_alpha ioam namespace set 123 schema ${ALPHA[8]}
-
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.all.forwarding=1
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.ioam6_id=${BETA[0]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]}
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]}
-  ip -netns $ioam_node_beta ioam namespace add 123 data ${BETA[6]} wide ${BETA[7]}
-  ip -netns $ioam_node_beta ioam schema add ${BETA[8]} "${BETA[9]}"
-  ip -netns $ioam_node_beta ioam namespace set 123 schema ${BETA[8]}
-
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.ioam6_id=${GAMMA[0]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.ioam6_id_wide=${GAMMA[1]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id=${GAMMA[2]}
-  ip netns exec $ioam_node_gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${GAMMA[3]}
-  ip -netns $ioam_node_gamma ioam namespace add 123 data ${GAMMA[6]} wide ${GAMMA[7]}
+    peer name ioam-veth-gamma netns $ioam_node_gamma &>/dev/null
+
+  ip -netns $ioam_node_alpha link set ioam-veth-alpha name veth0 &>/dev/null
+  ip -netns $ioam_node_beta link set ioam-veth-betaL name veth0 &>/dev/null
+  ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null
+  ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null
+
+  ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null
+  ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null
+  ip -netns $ioam_node_alpha link set veth0 up &>/dev/null
+  ip -netns $ioam_node_alpha link set lo up &>/dev/null
+  ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  ip -netns $ioam_node_beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null
+  ip -netns $ioam_node_beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null
+  ip -netns $ioam_node_beta link set veth0 up &>/dev/null
+  ip -netns $ioam_node_beta link set veth1 up &>/dev/null
+  ip -netns $ioam_node_beta link set lo up &>/dev/null
+
+  ip -netns $ioam_node_gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null
+  ip -netns $ioam_node_gamma link set veth0 up &>/dev/null
+  ip -netns $ioam_node_gamma link set lo up &>/dev/null
+  ip -netns $ioam_node_gamma route add 2001:db8:1::/64 \
+    via 2001:db8:2::1 dev veth0 &>/dev/null
+
+  # - Alpha: IOAM config -
+  ip netns exec $ioam_node_alpha \
+    sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]} &>/dev/null
+  ip netns exec $ioam_node_alpha \
+    sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]} &>/dev/null
+  ip netns exec $ioam_node_alpha \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]} &>/dev/null
+  ip netns exec $ioam_node_alpha \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]} &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]} &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam schema add ${ALPHA[8]} "${ALPHA[9]}" &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam namespace set 123 schema ${ALPHA[8]} &>/dev/null
+
+  # - Beta: IOAM config -
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.ioam6_id=${BETA[0]} &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]} &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]} &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]} &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]} &>/dev/null
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]} &>/dev/null
+  ip -netns $ioam_node_beta ioam namespace add 123 &>/dev/null
 
   sleep 1
 
-  ip netns exec $ioam_node_alpha ping6 -c 5 -W 1 db02::2 &>/dev/null
+  ip netns exec $ioam_node_alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null
   if [ $? != 0 ]
   then
-    echo "Setup FAILED"
-    cleanup &>/dev/null
-    exit 0
+    echo "SKIP: Setup failed."
+    cleanup
+    exit $ksft_skip
   fi
 }
 
 log_test_passed()
 {
-  local desc=$1
-  printf "TEST: %-60s  [ OK ]\n" "${desc}"
+  printf " - TEST: %-57s  [ OK ]\n" "$1"
+  npassed=$((npassed+1))
 }
 
-log_test_failed()
+log_test_skipped()
 {
-  local desc=$1
-  printf "TEST: %-60s  [FAIL]\n" "${desc}"
+  printf " - TEST: %-57s  [SKIP]\n" "$1"
+  nskipped=$((nskipped+1))
 }
 
-log_results()
+log_test_failed()
 {
-  echo "- Tests passed: ${npassed}"
-  echo "- Tests failed: ${nfailed}"
+  printf " - TEST: %-57s  [FAIL]\n" "$1"
+  nfailed=$((nfailed+1))
 }
 
 run_test()
 {
   local name=$1
   local desc=$2
-  local node_src=$3
-  local node_dst=$4
-  local ip6_dst=$5
-  local trace_type=$6
-  local ioam_ns=$7
-  local type=$8
-
-  ip netns exec $node_dst ./ioam6_parser $name $trace_type $ioam_ns $type &
+  local ip6_src=$3
+  local trace_type=$4
+  local trace_size=$5
+  local ioam_ns=$6
+  local type=$7
+
+  ip netns exec $ioam_node_gamma \
+    ./ioam6_parser veth0 $name $ip6_src 2001:db8:2::2 \
+                   $trace_type $trace_size $ioam_ns $type &
   local spid=$!
   sleep 0.1
 
-  ip netns exec $node_src ping6 -t 64 -c 1 -W 1 $ip6_dst &>/dev/null
+  ip netns exec $ioam_node_alpha ping6 -t 64 -c 1 -W 1 2001:db8:2::2 &>/dev/null
   if [ $? != 0 ]
   then
-    nfailed=$((nfailed+1))
     log_test_failed "${desc}"
     kill -2 $spid &>/dev/null
   else
     wait $spid
-    if [ $? = 0 ]
-    then
-      npassed=$((npassed+1))
-      log_test_passed "${desc}"
-    else
-      nfailed=$((nfailed+1))
-      log_test_failed "${desc}"
-    fi
+    [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
   fi
 }
 
 run()
 {
+  local test
+
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+  printf "| %-28s LOCAL tests %-29s |"
   echo
-  printf "%0.s-" {1..74}
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
-  echo "OUTPUT tests"
-  printf "%0.s-" {1..74}
+
+  echo
+  echo "Global config"
+  for test in $TESTS_LOCAL
+  do
+    $test
+  done
+
+  echo
+  echo "Inline mode"
+  for test in $TESTS_LOCAL
+  do
+    $test "inline"
+  done
+
+  echo
+  echo "Encap mode"
+  for test in $TESTS_LOCAL
+  do
+    $test "encap"
+  done
+
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
+  echo
+  printf "| %-28s OUTPUT tests %-28s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
 
   # set OUTPUT settings
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0 &>/dev/null
 
-  for t in $TESTS_OUTPUT
+  echo
+  echo "Inline mode"
+  for test in $TESTS_OUTPUT
   do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
+    $test "inline"
   done
 
-  # clean OUTPUT settings
-  ip netns exec $ioam_node_beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
-  ip -netns $ioam_node_alpha route change db01::/64 dev veth0
+  echo
+  echo "Encap mode"
+  for test in $TESTS_OUTPUT
+  do
+    $test "encap"
+  done
 
+  echo
+  echo "Encap mode (with tunsrc)"
+  for test in $TESTS_OUTPUT
+  do
+    $test "encap" "tunsrc"
+  done
+
+  # clean OUTPUT settings
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 &>/dev/null
 
   echo
-  printf "%0.s-" {1..74}
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
-  echo "INPUT tests"
-  printf "%0.s-" {1..74}
+  printf "| %-28s INPUT tests %-29s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
 
   # set INPUT settings
-  ip -netns $ioam_node_alpha ioam namespace del 123
+  ip -netns $ioam_node_alpha ioam namespace del 123 &>/dev/null
 
-  for t in $TESTS_INPUT
+  echo
+  echo "Inline mode"
+  for test in $TESTS_INPUT
   do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
+    $test "inline"
+  done
+
+  echo
+  echo "Encap mode"
+  for test in $TESTS_INPUT
+  do
+    $test "encap"
   done
 
   # clean INPUT settings
-  ip -netns $ioam_node_alpha ioam namespace add 123 \
-         data ${ALPHA[6]} wide ${ALPHA[7]}
-  ip -netns $ioam_node_alpha ioam namespace set 123 schema ${ALPHA[8]}
-  ip -netns $ioam_node_alpha route change db01::/64 dev veth0
+  ip -netns $ioam_node_alpha \
+    ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]} &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam namespace set 123 schema ${ALPHA[8]} &>/dev/null
 
   echo
-  printf "%0.s-" {1..74}
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
-  echo "GLOBAL tests"
-  printf "%0.s-" {1..74}
+  printf "| %-30s Results %-31s |"
+  echo
+  printf "+"
+  printf "%0.s-" {1..72}
+  printf "+"
   echo
 
-  for t in $TESTS_GLOBAL
-  do
-    $t "inline"
-    [ $encap_tests = 0 ] && $t "encap"
-  done
-
   echo
-  log_results
+  echo "- Passed:  ${npassed}"
+  echo "- Skipped: ${nskipped}"
+  echo "- Failed:  ${nfailed}"
+  echo
 }
 
 bit2type=(
   0x800000 0x400000 0x200000 0x100000 0x080000 0x040000 0x020000 0x010000
   0x008000 0x004000 0x002000 0x001000 0x000800 0x000400 0x000200 0x000100
-  0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002
+  0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002 0x000001
 )
-bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 )
+bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 0 )
 
 
 ################################################################################
 #                                                                              #
-#                              OUTPUT tests                                    #
+#                                 LOCAL tests                                  #
 #                                                                              #
-#   Two nodes (sender/receiver), IOAM disabled on ingress for the receiver.    #
 ################################################################################
 
-out_undef_ns()
+local_sysctl_ioam_id()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.ioam6_id" works as expected.                #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.ioam6_id"
+
+  [ ! -z $1 ] && return
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.ioam6_id 2>/dev/null | grep -wq ${ALPHA[0]}
+
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_sysctl_ioam_id_wide()
 {
   ##############################################################################
-  # Make sure that the encap node won't fill the trace if the chosen IOAM      #
-  # namespace is not configured locally.                                       #
+  # Make sure the sysctl "net.ipv6.ioam6_id_wide" works as expected.           #
   ##############################################################################
-  local desc="Unknown IOAM namespace"
+  local desc="Sysctl net.ipv6.ioam6_id_wide"
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  [ ! -z $1 ] && return
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.ioam6_id_wide 2>/dev/null | grep -wq ${ALPHA[1]}
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0x800000 0 $1
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+local_sysctl_ioam_intf_id()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_id" works as expected.        #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_id"
+
+  [ ! -z $1 ] && return
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.conf.veth0.ioam6_id 2>/dev/null | grep -wq ${ALPHA[4]}
+
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
 }
 
-out_no_room()
+local_sysctl_ioam_intf_id_wide()
 {
   ##############################################################################
-  # Make sure that the encap node won't fill the trace and will set the        #
-  # Overflow flag since there is no room enough for its data.                  #
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_id_wide" works as expected.   #
   ##############################################################################
-  local desc="Missing trace room"
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_id_wide"
+
+  [ ! -z $1 ] && return
+
+  ip netns exec $ioam_node_alpha \
+    sysctl net.ipv6.conf.veth0.ioam6_id_wide 2>/dev/null | grep -wq ${ALPHA[5]}
+
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+local_sysctl_ioam_intf_enabled()
+{
+  ##############################################################################
+  # Make sure the sysctl "net.ipv6.conf.XX.ioam6_enabled" works as expected.   #
+  ##############################################################################
+  local desc="Sysctl net.ipv6.conf.XX.ioam6_enabled"
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
+  [ ! -z $1 ] && return
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
+  ip netns exec $ioam_node_beta \
+    sysctl net.ipv6.conf.veth0.ioam6_enabled 2>/dev/null | grep -wq 1
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  [ $? == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
 }
 
-out_bits()
+local_ioam_namespace()
 {
   ##############################################################################
-  # Make sure that, for each trace type bit, the encap node will either:       #
-  #  (i)  fill the trace with its data when it is a supported bit              #
-  #  (ii) not fill the trace with its data when it is an unsupported bit       #
+  # Make sure the creation of an IOAM Namespace works as expected.             #
   ##############################################################################
-  local desc="Trace type with bit <n> only"
+  local desc="Create an IOAM Namespace"
 
-  local tmp=${bit2size[22]}
-  bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
+  [ ! -z $1 ] && return
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  ip -netns $ioam_node_alpha \
+    ioam namespace show 2>/dev/null | grep -wq 123
+  local ret=$?
 
-  for i in {0..22}
-  do
-    ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-           trace prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \
-           dev veth0 &>/dev/null
+  ip -netns $ioam_node_alpha \
+    ioam namespace show 2>/dev/null | grep -wq ${ALPHA[6]}
+  ret=$((ret + $?))
 
-    local cmd_res=$?
-    local descr="${desc/<n>/$i}"
+  ip -netns $ioam_node_alpha \
+    ioam namespace show 2>/dev/null | grep -wq ${ALPHA[7]}
+  ret=$((ret + $?))
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_ioam_schema()
+{
+  ##############################################################################
+  # Make sure the creation of an IOAM Schema works as expected.                #
+  ##############################################################################
+  local desc="Create an IOAM Schema"
+
+  [ ! -z $1 ] && return
+
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -wq ${ALPHA[8]}
+  local ret=$?
+
+  local sc_data=$(
+    for i in `seq 0 $((${#ALPHA[9]}-1))`
+    do
+      chr=${ALPHA[9]:i:1}
+      printf "%x " "'${chr}"
+    done
+  )
+
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -q "$sc_data"
+  ret=$((ret + $?))
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_ioam_schema_namespace()
+{
+  ##############################################################################
+  # Make sure the binding of a Schema to a Namespace works as expected.        #
+  ##############################################################################
+  local desc="Bind an IOAM Schema to an IOAM Namespace"
+
+  [ ! -z $1 ] && return
+
+  ip -netns $ioam_node_alpha \
+    ioam namespace show 2>/dev/null | grep -wq ${ALPHA[8]}
+  local ret=$?
+
+  ip -netns $ioam_node_alpha \
+    ioam schema show 2>/dev/null | grep -wq 123
+  ret=$((ret + $?))
+
+  [ $ret == 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+}
+
+local_route_ns()
+{
+  ##############################################################################
+  # Make sure the Namespace-ID is always provided, whatever the mode.          #
+  ##############################################################################
+  local desc="Mandatory Namespace-ID"
+  local mode
+
+  [ -z $1 ] && return
+
+  [ "$1" == "encap" ] && mode="$1 tundst 2001:db8:2::2" || mode="$1"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_tunsrc()
+{
+  ##############################################################################
+  # Make sure the Tunnel Source is only (and possibly) used with encap mode.   #
+  ##############################################################################
+  local desc
+  local mode
+  local mode_tunsrc
 
-    if [[ $i -ge 12 && $i -le 21 ]]
+  [ -z $1 ] && return
+
+  if [ "$1" == "encap" ]
+  then
+    desc="Optional Tunnel Source"
+    mode="$1 tundst 2001:db8:2::2"
+    mode_tunsrc="$1 tunsrc 2001:db8:1::50 tundst 2001:db8:2::2"
+  else
+    desc="Unneeded Tunnel Source"
+    mode="$1"
+    mode_tunsrc="$1 tunsrc 2001:db8:1::50"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode_tunsrc trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?
+
+  if [ "$1" == "encap" ]
+  then
+    [[ $ret1 != 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  else
+    [[ $ret1 != 0 || $ret2 == 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_tundst()
+{
+  ##############################################################################
+  # Make sure the Tunnel Destination is only (and always) used with encap mode.#
+  ##############################################################################
+  local desc
+
+  [ -z $1 ] && return
+
+  [ "$1" == "encap" ] && desc="Mandatory Tunnel Destination" \
+                     || desc="Unneeded Tunnel Destination"
+
+  local mode="$1"
+  local mode_tundst="$1 tundst 2001:db8:2::2"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode_tundst trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?
+
+  if [ "$1" == "encap" ]
+  then
+    [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  else
+    [[ $ret1 != 0 || $ret2 == 0 ]] && log_test_failed "${desc}" \
+                                   || log_test_passed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_type()
+{
+  ##############################################################################
+  # Make sure the Trace Type is always provided, whatever the mode.            #
+  ##############################################################################
+  local desc="Mandatory Trace Type"
+  local mode
+
+  [ -z $1 ] && return
+
+  [ "$1" == "encap" ] && mode="$1 tundst 2001:db8:2::2" || mode="$1"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_size()
+{
+  ##############################################################################
+  # Make sure the Trace Size is always provided, whatever the mode.            #
+  ##############################################################################
+  local desc="Mandatory Trace Size"
+  local mode
+
+  [ -z $1 ] && return
+
+  [ "$1" == "encap" ] && mode="$1 tundst 2001:db8:2::2" || mode="$1"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret1=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size 4 \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  local ret2=$?
+
+  [[ $ret1 == 0 || $ret2 != 0 ]] && log_test_failed "${desc}" \
+                                 || log_test_passed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+}
+
+local_route_trace_type_bits()
+{
+  ##############################################################################
+  # Make sure only allowed bits (0-11 and 22) are accepted.                    #
+  ##############################################################################
+  local desc="Trace Type bits"
+  local mode
+
+  [ -z $1 ] && return
+
+  [ "$1" == "encap" ] && mode="$1 tundst 2001:db8:2::2" || mode="$1"
+
+  local i
+  for i in {0..23}
+  do
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type ${bit2type[$i]} ns 0 size 4 \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [[ ($? == 0 && (($i -ge 12 && $i -le 21) || $i == 23)) ||
+          ($? != 0 && (($i -ge 0 && $i -le 11) || $i == 22)) ]]
     then
-      if [ $cmd_res != 0 ]
-      then
-        npassed=$((npassed+1))
-        log_test_passed "$descr ($1 mode)"
-      else
-        nfailed=$((nfailed+1))
-        log_test_failed "$descr ($1 mode)"
-      fi
-    else
-	run_test "out_bit$i" "$descr ($1 mode)" $ioam_node_alpha \
-           $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1
+      local err=1
+      break
     fi
   done
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  [ -z $err ] && log_test_passed "${desc}" || log_test_failed "${desc}"
 
-  bit2size[22]=$tmp
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
 }
 
-out_full_supp_trace()
+local_route_trace_size_values()
 {
   ##############################################################################
-  # Make sure that the encap node will correctly fill a full trace. Be careful,#
-  # "full trace" here does NOT mean all bits (only supported ones).            #
+  # Make sure only allowed sizes (multiples of four in [4,244]) are accepted.  #
   ##############################################################################
-  local desc="Full supported trace"
+  local desc="Trace Size values"
+  local mode
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  [ -z $1 ] && return
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 100 dev veth0
+  [ "$1" == "encap" ] && mode="$1 tundst 2001:db8:2::2" || mode="$1"
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xfff002 123 $1
+  # we also try the next multiple of four after the MAX to check it's refused
+  local i
+  for i in {0..248}
+  do
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type 0x800000 ns 0 size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+    if [[ ($? == 0 && ($i == 0 || $i == 248 || $(( $i % 4 )) != 0)) ||
+          ($? != 0 && $i != 0 && $i != 248 && $(( $i % 4 )) == 0) ]]
+    then
+      local err=1
+      break
+    fi
+  done
+
+  [ -z $err ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
 }
 
 
 ################################################################################
 #                                                                              #
-#                               INPUT tests                                    #
+#                                 OUTPUT tests                                 #
 #                                                                              #
-#     Two nodes (sender/receiver), the sender MUST NOT fill the trace upon     #
-#     insertion -> the IOAM namespace configured on the sender is removed      #
-#     and is used in the inserted trace to force the sender not to fill it.    #
 ################################################################################
 
-in_undef_ns()
+output_undef_ns()
 {
   ##############################################################################
-  # Make sure that the receiving node won't fill the trace if the related IOAM #
-  # namespace is not configured locally.                                       #
+  # Make sure an IOAM encapsulating node does NOT fill the trace when the      #
+  # corresponding IOAM Namespace-ID is not configured locally.                 #
   ##############################################################################
-  local desc="Unknown IOAM namespace"
+  local desc="Unknown IOAM Namespace-ID"
+  local ns=0
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0x800000 ns 0 size 4 dev veth0
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0x800000 0 $1
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 }
 
-in_no_room()
+output_no_room()
 {
   ##############################################################################
-  # Make sure that the receiving node won't fill the trace and will set the    #
-  # Overflow flag if there is no room enough for its data.                     #
+  # Make sure an IOAM encapsulating node does NOT fill the trace AND sets the  #
+  # Overflow flag when there is not enough room for its data.                  #
   ##############################################################################
-  local desc="Missing trace room"
+  local desc="Missing room for data"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 }
 
-in_bits()
+output_no_room_oss()
 {
   ##############################################################################
-  # Make sure that, for each trace type bit, the receiving node will either:   #
-  #  (i)  fill the trace with its data when it is a supported bit              #
-  #  (ii) not fill the trace with its data when it is an unsupported bit       #
+  # Make sure an IOAM encapsulating node does NOT fill the trace AND sets the  #
+  # Overflow flag when there is not enough room for the Opaque State Snapshot. #
   ##############################################################################
-  local desc="Trace type with bit <n> only"
+  local desc="Missing room for Opaque State Snapshot"
+  local ns=123
+  local tr_type=0x000002
+  local tr_size=4
+  local mode="$1"
+  local saddr="2001:db8:1::2"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
 
-  local tmp=${bit2size[22]}
-  bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+output_bits()
+{
+  ##############################################################################
+  # Make sure an IOAM encapsulating node implements all supported bits by      #
+  # checking it correctly fills the trace with its data.                       #
+  ##############################################################################
+  local desc="Trace Type with supported bit <n> only"
+  local ns=123
+  local mode="$1"
+  local saddr="2001:db8:1::2"
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  if [ "$1" == "encap" ]
+  then
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  local tmp=${bit2size[22]}
+  bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
 
+  local i
   for i in {0..11} {22..22}
   do
-    ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-           trace prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} \
-           dev veth0
+    local descr="${desc/<n>/$i}"
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
 
-    run_test "in_bit$i" "${desc/<n>/$i} ($1 mode)" $ioam_node_alpha \
-           $ioam_node_beta db01::1 ${bit2type[$i]} 123 $1
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc \
+      type ${bit2type[$i]} ns $ns size ${bit2size[$i]} \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]
+    then
+      run_test "output_bit$i" "${descr}" $saddr \
+        ${bit2type[$i]} ${bit2size[$i]} $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
   done
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 
   bit2size[22]=$tmp
 }
 
-in_oflag()
+output_sizes()
 {
   ##############################################################################
-  # Make sure that the receiving node won't fill the trace since the Overflow  #
-  # flag is set.                                                               #
+  # Make sure an IOAM encapsulating node allocates supported sizes correctly.  #
   ##############################################################################
-  local desc="Overflow flag is set"
+  local desc="Trace Size of <n> bytes"
+  local ns=0
+  local tr_type=0x800000
+  local mode="$1"
+  local saddr="2001:db8:1::2"
 
-  # Exception:
-  #   Here, we need the sender to set the Overflow flag. For that, we will add
-  #   back the IOAM namespace that was previously configured on the sender.
-  ip -netns $ioam_node_alpha ioam namespace add 123
+  if [ "$1" == "encap" ]
+  then
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xc00000 ns 123 size 4 dev veth0
+  local i
+  for i in $(seq 4 4 244)
+  do
+    local descr="${desc/<n>/$i}"
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xc00000 123 $1
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  # And we clean the exception for this test to get things back to normal for
-  # other INPUT tests
-  ip -netns $ioam_node_alpha ioam namespace del 123
+    if [ $? == 0 ]
+    then
+      run_test "output_size$i" "${descr}" $saddr $tr_type $i $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 }
 
-in_full_supp_trace()
+output_full_supp_trace()
 {
   ##############################################################################
-  # Make sure that the receiving node will correctly fill a full trace. Be     #
-  # careful, "full trace" here does NOT mean all bits (only supported ones).   #
+  # Make sure an IOAM encapsulating node correctly fills a trace when all      #
+  # supported bits are set.                                                    #
   ##############################################################################
   local desc="Full supported trace"
+  local ns=123
+  local tr_type=0xfff002
+  local tr_size
+  local mode="$1"
+  local saddr="2001:db8:1::2"
 
-  [ "$1" = "encap" ] && mode="$1 tundst db01::1" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 up
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
 
-  ip -netns $ioam_node_alpha route change db01::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 80 dev veth0
+    if [ "$2" == "tunsrc" ]
+    then
+      saddr="2001:db8:1::50"
+      mode+=" tunsrc 2001:db8:1::50"
+    fi
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_beta \
-         db01::1 0xfff002 123 $1
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_beta link set ip6tnl0 down
+  local i
+  tr_size=$(( ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
+  for i in {0..11} {22..22}
+  do
+    tr_size=$((tr_size + bit2size[$i]))
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" $saddr $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 }
 
 
 ################################################################################
 #                                                                              #
-#                              GLOBAL tests                                    #
+#                                 INPUT tests                                  #
 #                                                                              #
-#   Three nodes (sender/router/receiver), IOAM fully enabled on every node.    #
 ################################################################################
 
-fwd_full_supp_trace()
+input_undef_ns()
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when the corresponding IOAM #
+  # Namespace-ID is not configured locally.                                    #
+  ##############################################################################
+  local desc="Unknown IOAM Namespace-ID"
+  local ns=0
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_no_room()
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace AND sets the Overflow flag  #
+  # when there is not enough room for its data.                                #
+  ##############################################################################
+  local desc="Missing room for data"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_no_room_oss()
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace AND sets the Overflow flag  #
+  # when there is not enough room for the Opaque State Snapshot.               #
+  ##############################################################################
+  local desc="Missing room for Opaque State Snapshot"
+  local ns=123
+  local tr_type=0x000002
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_disabled()
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when IOAM is not enabled on #
+  # the corresponding (ingress) interface.                                     #
+  ##############################################################################
+  local desc="IOAM disabled on ingress interface"
+  local ns=123
+  local tr_type=0x800000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  # Exception: disable IOAM on ingress interface
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0 &>/dev/null
+  local ret=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  ret=$((ret + $?))
+
+  if [ $ret == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  # Clean Exception
+  ip netns exec $ioam_node_beta \
+    sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 &>/dev/null
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_oflag()
+{
+  ##############################################################################
+  # Make sure an IOAM node does NOT fill the trace when the Overflow flag is   #
+  # set.                                                                       #
+  ##############################################################################
+  local desc="Overflow flag is set"
+  local ns=123
+  local tr_type=0xc00000
+  local tr_size=4
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  # Exception:
+  #   Here, we need the sender to set the Overflow flag. For that, we will add
+  #   back the IOAM namespace that was previously configured on the sender.
+  ip -netns $ioam_node_alpha ioam namespace add 123 &>/dev/null
+  local ret=$?
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+  ret=$((ret + $?))
+
+  if [ $ret == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  # Clean Exception
+  ip -netns $ioam_node_alpha ioam namespace del 123 &>/dev/null
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_bits()
+{
+  ##############################################################################
+  # Make sure an IOAM node implements all supported bits by checking it        #
+  # correctly fills the trace with its data.                                   #
+  ##############################################################################
+  local desc="Trace Type with supported bit <n> only"
+  local ns=123
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  local tmp=${bit2size[22]}
+  bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
+
+  local i
+  for i in {0..11} {22..22}
+  do
+    local descr="${desc/<n>/$i}"
+
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
+
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc \
+      type ${bit2type[$i]} ns $ns size ${bit2size[$i]} \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
+
+    if [ $? == 0 ]
+    then
+      run_test "input_bit$i" "${descr}" 2001:db8:1::2 \
+        ${bit2type[$i]} ${bit2size[$i]} $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+
+  bit2size[22]=$tmp
+}
+
+input_sizes()
 {
   ##############################################################################
-  # Make sure that all three nodes correctly filled the full supported trace   #
-  # by checking that the trace data is consistent with the predefined config.  #
+  # Make sure an IOAM node handles all supported sizes correctly.              #
   ##############################################################################
-  local desc="Forward - Full supported trace"
+  local desc="Trace Size of <n> bytes"
+  local ns=123
+  local tr_type=0x800000
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
 
-  [ "$1" = "encap" ] && mode="$1 tundst db02::2" || mode="$1"
-  [ "$1" = "encap" ] && ip -netns $ioam_node_gamma link set ip6tnl0 up
+  local i
+  for i in $(seq 4 4 244)
+  do
+    local descr="${desc/<n>/$i}"
 
-  ip -netns $ioam_node_alpha route change db02::/64 encap ioam6 mode $mode \
-         trace prealloc type 0xfff002 ns 123 size 244 via db01::1 dev veth0
+    if [[ "$1" == "encap" && $encap_tests != 0 ]]
+    then
+      log_test_skipped "${descr}"
+      continue
+    fi
 
-  run_test ${FUNCNAME[0]} "${desc} ($1 mode)" $ioam_node_alpha $ioam_node_gamma \
-         db02::2 0xfff002 123 $1
+    ip -netns $ioam_node_alpha \
+      route change 2001:db8:2::/64 \
+      encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $i \
+      via 2001:db8:1::1 dev veth0 &>/dev/null
 
-  [ "$1" = "encap" ] && ip -netns $ioam_node_gamma link set ip6tnl0 down
+    if [ $? == 0 ]
+    then
+      run_test "input_size$i" "${descr}" 2001:db8:1::2 $tr_type $i $ns $1
+    else
+      log_test_failed "${descr}"
+    fi
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
+}
+
+input_full_supp_trace()
+{
+  ##############################################################################
+  # Make sure an IOAM node correctly fills a trace when all supported bits are #
+  # set.                                                                       #
+  ##############################################################################
+  local desc="Full supported trace"
+  local ns=123
+  local tr_type=0xfff002
+  local tr_size
+  local mode="$1"
+
+  if [ "$1" == "encap" ]
+  then
+    if [ $encap_tests != 0 ]
+    then
+      log_test_skipped "${desc}"
+      return
+    fi
+
+    mode+=" tundst 2001:db8:2::2"
+    ip -netns $ioam_node_gamma link set ip6tnl0 up &>/dev/null
+  fi
+
+  local i
+  tr_size=$(( ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) ))
+  for i in {0..11} {22..22}
+  do
+    tr_size=$((tr_size + bit2size[$i]))
+  done
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 \
+    encap ioam6 mode $mode trace prealloc type $tr_type ns $ns size $tr_size \
+    via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  if [ $? == 0 ]
+  then
+    run_test ${FUNCNAME[0]} "${desc}" 2001:db8:1::2 $tr_type $tr_size $ns $1
+  else
+    log_test_failed "${desc}"
+  fi
+
+  ip -netns $ioam_node_alpha \
+    route change 2001:db8:2::/64 via 2001:db8:1::1 dev veth0 &>/dev/null
+
+  [ "$1" == "encap" ] && ip -netns $ioam_node_gamma \
+    link set ip6tnl0 down &>/dev/null
 }
 
 
@@ -742,30 +1655,29 @@ fwd_full_supp_trace()
 ################################################################################
 
 npassed=0
+nskipped=0
 nfailed=0
 
 if [ "$(id -u)" -ne 0 ]
 then
-  echo "SKIP: Need root privileges"
+  echo "SKIP: Need root privileges."
   exit $ksft_skip
 fi
 
 if [ ! -x "$(command -v ip)" ]
 then
-  echo "SKIP: Could not run test without ip tool"
-  exit $ksft_skip
-fi
-
-ip ioam &>/dev/null
-if [ $? = 1 ]
-then
-  echo "SKIP: iproute2 too old, missing ioam command"
+  echo "SKIP: Could not run test without ip tool."
   exit $ksft_skip
 fi
 
 check_kernel_compatibility
-
-cleanup &>/dev/null
 setup
 run
-cleanup &>/dev/null
+cleanup
+
+if [ $nfailed != 0 ]
+then
+  exit $ksft_fail
+fi
+
+exit $ksft_pass
diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c
index 895e5bb5044b..de4b5c9e8a74 100644
--- a/tools/testing/selftests/net/ioam6_parser.c
+++ b/tools/testing/selftests/net/ioam6_parser.c
@@ -8,8 +8,10 @@
 #include <errno.h>
 #include <limits.h>
 #include <linux/const.h>
+#include <linux/if_ether.h>
 #include <linux/ioam6.h>
 #include <linux/ipv6.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
@@ -40,7 +42,7 @@ static struct ioam_config node1 = {
 	.egr_id = 101,
 	.ingr_wide = 0xffffffff, /* default value */
 	.egr_wide = 101101,
-	.ns_data = 0xdeadbee0,
+	.ns_data = 0xdeadbeef,
 	.ns_wide = 0xcafec0caf00dc0de,
 	.sc_id = 777,
 	.sc_data = "something that will be 4n-aligned",
@@ -54,33 +56,22 @@ static struct ioam_config node2 = {
 	.egr_id = 202,
 	.ingr_wide = 201201,
 	.egr_wide = 202202,
-	.ns_data = 0xdeadbee1,
-	.ns_wide = 0xcafec0caf11dc0de,
-	.sc_id = 666,
-	.sc_data = "Hello there -Obi",
-	.hlim = 63,
-};
-
-static struct ioam_config node3 = {
-	.id = 3,
-	.wide = 33333333,
-	.ingr_id = 301,
-	.egr_id = 0xffff, /* default value */
-	.ingr_wide = 301301,
-	.egr_wide = 0xffffffff, /* default value */
-	.ns_data = 0xdeadbee2,
-	.ns_wide = 0xcafec0caf22dc0de,
+	.ns_data = 0xffffffff, /* default value */
+	.ns_wide = 0xffffffffffffffff, /* default value */
 	.sc_id = 0xffffff, /* default value */
 	.sc_data = NULL,
-	.hlim = 62,
+	.hlim = 63,
 };
 
 enum {
 	/**********
 	 * OUTPUT *
 	 **********/
+	__TEST_OUT_MIN,
+
 	TEST_OUT_UNDEF_NS,
 	TEST_OUT_NO_ROOM,
+	TEST_OUT_NO_ROOM_OSS,
 	TEST_OUT_BIT0,
 	TEST_OUT_BIT1,
 	TEST_OUT_BIT2,
@@ -94,13 +85,80 @@ enum {
 	TEST_OUT_BIT10,
 	TEST_OUT_BIT11,
 	TEST_OUT_BIT22,
+	TEST_OUT_SIZE4,
+	TEST_OUT_SIZE8,
+	TEST_OUT_SIZE12,
+	TEST_OUT_SIZE16,
+	TEST_OUT_SIZE20,
+	TEST_OUT_SIZE24,
+	TEST_OUT_SIZE28,
+	TEST_OUT_SIZE32,
+	TEST_OUT_SIZE36,
+	TEST_OUT_SIZE40,
+	TEST_OUT_SIZE44,
+	TEST_OUT_SIZE48,
+	TEST_OUT_SIZE52,
+	TEST_OUT_SIZE56,
+	TEST_OUT_SIZE60,
+	TEST_OUT_SIZE64,
+	TEST_OUT_SIZE68,
+	TEST_OUT_SIZE72,
+	TEST_OUT_SIZE76,
+	TEST_OUT_SIZE80,
+	TEST_OUT_SIZE84,
+	TEST_OUT_SIZE88,
+	TEST_OUT_SIZE92,
+	TEST_OUT_SIZE96,
+	TEST_OUT_SIZE100,
+	TEST_OUT_SIZE104,
+	TEST_OUT_SIZE108,
+	TEST_OUT_SIZE112,
+	TEST_OUT_SIZE116,
+	TEST_OUT_SIZE120,
+	TEST_OUT_SIZE124,
+	TEST_OUT_SIZE128,
+	TEST_OUT_SIZE132,
+	TEST_OUT_SIZE136,
+	TEST_OUT_SIZE140,
+	TEST_OUT_SIZE144,
+	TEST_OUT_SIZE148,
+	TEST_OUT_SIZE152,
+	TEST_OUT_SIZE156,
+	TEST_OUT_SIZE160,
+	TEST_OUT_SIZE164,
+	TEST_OUT_SIZE168,
+	TEST_OUT_SIZE172,
+	TEST_OUT_SIZE176,
+	TEST_OUT_SIZE180,
+	TEST_OUT_SIZE184,
+	TEST_OUT_SIZE188,
+	TEST_OUT_SIZE192,
+	TEST_OUT_SIZE196,
+	TEST_OUT_SIZE200,
+	TEST_OUT_SIZE204,
+	TEST_OUT_SIZE208,
+	TEST_OUT_SIZE212,
+	TEST_OUT_SIZE216,
+	TEST_OUT_SIZE220,
+	TEST_OUT_SIZE224,
+	TEST_OUT_SIZE228,
+	TEST_OUT_SIZE232,
+	TEST_OUT_SIZE236,
+	TEST_OUT_SIZE240,
+	TEST_OUT_SIZE244,
 	TEST_OUT_FULL_SUPP_TRACE,
 
+	__TEST_OUT_MAX,
+
 	/*********
 	 * INPUT *
 	 *********/
+	__TEST_IN_MIN,
+
 	TEST_IN_UNDEF_NS,
 	TEST_IN_NO_ROOM,
+	TEST_IN_NO_ROOM_OSS,
+	TEST_IN_DISABLED,
 	TEST_IN_OFLAG,
 	TEST_IN_BIT0,
 	TEST_IN_BIT1,
@@ -115,36 +173,107 @@ enum {
 	TEST_IN_BIT10,
 	TEST_IN_BIT11,
 	TEST_IN_BIT22,
+	TEST_IN_SIZE4,
+	TEST_IN_SIZE8,
+	TEST_IN_SIZE12,
+	TEST_IN_SIZE16,
+	TEST_IN_SIZE20,
+	TEST_IN_SIZE24,
+	TEST_IN_SIZE28,
+	TEST_IN_SIZE32,
+	TEST_IN_SIZE36,
+	TEST_IN_SIZE40,
+	TEST_IN_SIZE44,
+	TEST_IN_SIZE48,
+	TEST_IN_SIZE52,
+	TEST_IN_SIZE56,
+	TEST_IN_SIZE60,
+	TEST_IN_SIZE64,
+	TEST_IN_SIZE68,
+	TEST_IN_SIZE72,
+	TEST_IN_SIZE76,
+	TEST_IN_SIZE80,
+	TEST_IN_SIZE84,
+	TEST_IN_SIZE88,
+	TEST_IN_SIZE92,
+	TEST_IN_SIZE96,
+	TEST_IN_SIZE100,
+	TEST_IN_SIZE104,
+	TEST_IN_SIZE108,
+	TEST_IN_SIZE112,
+	TEST_IN_SIZE116,
+	TEST_IN_SIZE120,
+	TEST_IN_SIZE124,
+	TEST_IN_SIZE128,
+	TEST_IN_SIZE132,
+	TEST_IN_SIZE136,
+	TEST_IN_SIZE140,
+	TEST_IN_SIZE144,
+	TEST_IN_SIZE148,
+	TEST_IN_SIZE152,
+	TEST_IN_SIZE156,
+	TEST_IN_SIZE160,
+	TEST_IN_SIZE164,
+	TEST_IN_SIZE168,
+	TEST_IN_SIZE172,
+	TEST_IN_SIZE176,
+	TEST_IN_SIZE180,
+	TEST_IN_SIZE184,
+	TEST_IN_SIZE188,
+	TEST_IN_SIZE192,
+	TEST_IN_SIZE196,
+	TEST_IN_SIZE200,
+	TEST_IN_SIZE204,
+	TEST_IN_SIZE208,
+	TEST_IN_SIZE212,
+	TEST_IN_SIZE216,
+	TEST_IN_SIZE220,
+	TEST_IN_SIZE224,
+	TEST_IN_SIZE228,
+	TEST_IN_SIZE232,
+	TEST_IN_SIZE236,
+	TEST_IN_SIZE240,
+	TEST_IN_SIZE244,
 	TEST_IN_FULL_SUPP_TRACE,
 
-	/**********
-	 * GLOBAL *
-	 **********/
-	TEST_FWD_FULL_SUPP_TRACE,
+	__TEST_IN_MAX,
 
 	__TEST_MAX,
 };
 
-static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h,
-			     __u32 trace_type, __u16 ioam_ns)
+static int check_header(int tid, struct ioam6_trace_hdr *trace,
+			__u32 trace_type, __u8 trace_size, __u16 ioam_ns)
 {
-	if (__be16_to_cpu(ioam6h->namespace_id) != ioam_ns ||
-	    __be32_to_cpu(ioam6h->type_be32) != (trace_type << 8))
+	if (__be16_to_cpu(trace->namespace_id) != ioam_ns ||
+	    __be32_to_cpu(trace->type_be32) != (trace_type << 8))
 		return 1;
 
 	switch (tid) {
 	case TEST_OUT_UNDEF_NS:
 	case TEST_IN_UNDEF_NS:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 1 ||
-		       ioam6h->remlen != 1;
+	case TEST_IN_DISABLED:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != 1;
 
 	case TEST_OUT_NO_ROOM:
 	case TEST_IN_NO_ROOM:
 	case TEST_IN_OFLAG:
-		return !ioam6h->overflow ||
-		       ioam6h->nodelen != 2 ||
-		       ioam6h->remlen != 1;
+		return trace->overflow == 0 ||
+		       trace->nodelen != 2 ||
+		       trace->remlen != 1;
+
+	case TEST_OUT_NO_ROOM_OSS:
+		return trace->overflow == 0 ||
+		       trace->nodelen != 0 ||
+		       trace->remlen != 1;
+
+	case TEST_IN_NO_ROOM_OSS:
+	case TEST_OUT_BIT22:
+	case TEST_IN_BIT22:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 0 ||
+		       trace->remlen != 0;
 
 	case TEST_OUT_BIT0:
 	case TEST_IN_BIT0:
@@ -164,9 +293,9 @@ static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h,
 	case TEST_IN_BIT7:
 	case TEST_OUT_BIT11:
 	case TEST_IN_BIT11:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 1 ||
-		       ioam6h->remlen;
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != 0;
 
 	case TEST_OUT_BIT8:
 	case TEST_IN_BIT8:
@@ -174,22 +303,145 @@ static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h,
 	case TEST_IN_BIT9:
 	case TEST_OUT_BIT10:
 	case TEST_IN_BIT10:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 2 ||
-		       ioam6h->remlen;
-
-	case TEST_OUT_BIT22:
-	case TEST_IN_BIT22:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen ||
-		       ioam6h->remlen;
+		return trace->overflow == 1 ||
+		       trace->nodelen != 2 ||
+		       trace->remlen != 0;
+
+	case TEST_OUT_SIZE4:
+	case TEST_OUT_SIZE8:
+	case TEST_OUT_SIZE12:
+	case TEST_OUT_SIZE16:
+	case TEST_OUT_SIZE20:
+	case TEST_OUT_SIZE24:
+	case TEST_OUT_SIZE28:
+	case TEST_OUT_SIZE32:
+	case TEST_OUT_SIZE36:
+	case TEST_OUT_SIZE40:
+	case TEST_OUT_SIZE44:
+	case TEST_OUT_SIZE48:
+	case TEST_OUT_SIZE52:
+	case TEST_OUT_SIZE56:
+	case TEST_OUT_SIZE60:
+	case TEST_OUT_SIZE64:
+	case TEST_OUT_SIZE68:
+	case TEST_OUT_SIZE72:
+	case TEST_OUT_SIZE76:
+	case TEST_OUT_SIZE80:
+	case TEST_OUT_SIZE84:
+	case TEST_OUT_SIZE88:
+	case TEST_OUT_SIZE92:
+	case TEST_OUT_SIZE96:
+	case TEST_OUT_SIZE100:
+	case TEST_OUT_SIZE104:
+	case TEST_OUT_SIZE108:
+	case TEST_OUT_SIZE112:
+	case TEST_OUT_SIZE116:
+	case TEST_OUT_SIZE120:
+	case TEST_OUT_SIZE124:
+	case TEST_OUT_SIZE128:
+	case TEST_OUT_SIZE132:
+	case TEST_OUT_SIZE136:
+	case TEST_OUT_SIZE140:
+	case TEST_OUT_SIZE144:
+	case TEST_OUT_SIZE148:
+	case TEST_OUT_SIZE152:
+	case TEST_OUT_SIZE156:
+	case TEST_OUT_SIZE160:
+	case TEST_OUT_SIZE164:
+	case TEST_OUT_SIZE168:
+	case TEST_OUT_SIZE172:
+	case TEST_OUT_SIZE176:
+	case TEST_OUT_SIZE180:
+	case TEST_OUT_SIZE184:
+	case TEST_OUT_SIZE188:
+	case TEST_OUT_SIZE192:
+	case TEST_OUT_SIZE196:
+	case TEST_OUT_SIZE200:
+	case TEST_OUT_SIZE204:
+	case TEST_OUT_SIZE208:
+	case TEST_OUT_SIZE212:
+	case TEST_OUT_SIZE216:
+	case TEST_OUT_SIZE220:
+	case TEST_OUT_SIZE224:
+	case TEST_OUT_SIZE228:
+	case TEST_OUT_SIZE232:
+	case TEST_OUT_SIZE236:
+	case TEST_OUT_SIZE240:
+	case TEST_OUT_SIZE244:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != trace_size / 4;
+
+	case TEST_IN_SIZE4:
+	case TEST_IN_SIZE8:
+	case TEST_IN_SIZE12:
+	case TEST_IN_SIZE16:
+	case TEST_IN_SIZE20:
+	case TEST_IN_SIZE24:
+	case TEST_IN_SIZE28:
+	case TEST_IN_SIZE32:
+	case TEST_IN_SIZE36:
+	case TEST_IN_SIZE40:
+	case TEST_IN_SIZE44:
+	case TEST_IN_SIZE48:
+	case TEST_IN_SIZE52:
+	case TEST_IN_SIZE56:
+	case TEST_IN_SIZE60:
+	case TEST_IN_SIZE64:
+	case TEST_IN_SIZE68:
+	case TEST_IN_SIZE72:
+	case TEST_IN_SIZE76:
+	case TEST_IN_SIZE80:
+	case TEST_IN_SIZE84:
+	case TEST_IN_SIZE88:
+	case TEST_IN_SIZE92:
+	case TEST_IN_SIZE96:
+	case TEST_IN_SIZE100:
+	case TEST_IN_SIZE104:
+	case TEST_IN_SIZE108:
+	case TEST_IN_SIZE112:
+	case TEST_IN_SIZE116:
+	case TEST_IN_SIZE120:
+	case TEST_IN_SIZE124:
+	case TEST_IN_SIZE128:
+	case TEST_IN_SIZE132:
+	case TEST_IN_SIZE136:
+	case TEST_IN_SIZE140:
+	case TEST_IN_SIZE144:
+	case TEST_IN_SIZE148:
+	case TEST_IN_SIZE152:
+	case TEST_IN_SIZE156:
+	case TEST_IN_SIZE160:
+	case TEST_IN_SIZE164:
+	case TEST_IN_SIZE168:
+	case TEST_IN_SIZE172:
+	case TEST_IN_SIZE176:
+	case TEST_IN_SIZE180:
+	case TEST_IN_SIZE184:
+	case TEST_IN_SIZE188:
+	case TEST_IN_SIZE192:
+	case TEST_IN_SIZE196:
+	case TEST_IN_SIZE200:
+	case TEST_IN_SIZE204:
+	case TEST_IN_SIZE208:
+	case TEST_IN_SIZE212:
+	case TEST_IN_SIZE216:
+	case TEST_IN_SIZE220:
+	case TEST_IN_SIZE224:
+	case TEST_IN_SIZE228:
+	case TEST_IN_SIZE232:
+	case TEST_IN_SIZE236:
+	case TEST_IN_SIZE240:
+	case TEST_IN_SIZE244:
+		return trace->overflow == 1 ||
+		       trace->nodelen != 1 ||
+		       trace->remlen != (trace_size / 4) - trace->nodelen;
 
 	case TEST_OUT_FULL_SUPP_TRACE:
 	case TEST_IN_FULL_SUPP_TRACE:
-	case TEST_FWD_FULL_SUPP_TRACE:
-		return ioam6h->overflow ||
-		       ioam6h->nodelen != 15 ||
-		       ioam6h->remlen;
+		return trace->overflow == 1 ||
+		       trace->nodelen != 15 ||
+		       trace->remlen != 0;
 
 	default:
 		break;
@@ -198,167 +450,137 @@ static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h,
 	return 1;
 }
 
-static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h,
-			    const struct ioam_config cnf)
+static int check_data(struct ioam6_trace_hdr *trace, __u8 trace_size,
+		      const struct ioam_config cnf, bool is_output)
 {
-	unsigned int len;
+	unsigned int len, i;
 	__u8 aligned;
 	__u64 raw64;
 	__u32 raw32;
+	__u8 *p;
 
-	if (ioam6h->type.bit0) {
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
-		if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff))
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit1) {
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
-		if (cnf.ingr_id != (raw32 >> 16) ||
-		    cnf.egr_id != (raw32 & 0xffff))
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit2)
-		*p += sizeof(__u32);
-
-	if (ioam6h->type.bit3)
-		*p += sizeof(__u32);
-
-	if (ioam6h->type.bit4) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit5) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ns_data)
-			return 1;
-		*p += sizeof(__u32);
-	}
-
-	if (ioam6h->type.bit6)
-		*p += sizeof(__u32);
+	if (trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+	    trace->type.bit21 | trace->type.bit23)
+		return 1;
 
-	if (ioam6h->type.bit7) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	for (i = 0; i < trace->remlen * 4; i++) {
+		if (trace->data[i] != 0)
 			return 1;
-		*p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit8) {
-		raw64 = __be64_to_cpu(*((__u64 *)*p));
-		if (cnf.hlim != (raw64 >> 56) ||
-		    cnf.wide != (raw64 & 0xffffffffffffff))
-			return 1;
-		*p += sizeof(__u64);
-	}
+	if (trace->remlen * 4 == trace_size)
+		return 0;
 
-	if (ioam6h->type.bit9) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ingr_wide)
-			return 1;
-		*p += sizeof(__u32);
+	p = trace->data + trace->remlen * 4;
 
-		if (__be32_to_cpu(*((__u32 *)*p)) != cnf.egr_wide)
+	if (trace->type.bit0) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff))
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit10) {
-		if (__be64_to_cpu(*((__u64 *)*p)) != cnf.ns_wide)
+	if (trace->type.bit1) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if (cnf.ingr_id != (raw32 >> 16) ||
+		    cnf.egr_id != (raw32 & 0xffff))
 			return 1;
-		*p += sizeof(__u64);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit11) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit2) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if ((is_output && raw32 != 0xffffffff) ||
+		    (!is_output && (raw32 == 0 || raw32 == 0xffffffff)))
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit12) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit3) {
+		raw32 = __be32_to_cpu(*((__u32 *)p));
+		if ((is_output && raw32 != 0xffffffff) ||
+		    (!is_output && (raw32 == 0 || raw32 == 0xffffffff)))
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit13) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit4) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit14) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit5) {
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.ns_data)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit15) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit6) {
+		if (__be32_to_cpu(*((__u32 *)p)) == 0xffffffff)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit16) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit7) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit17) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit8) {
+		raw64 = __be64_to_cpu(*((__u64 *)p));
+		if (cnf.hlim != (raw64 >> 56) ||
+		    cnf.wide != (raw64 & 0xffffffffffffff))
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u64);
 	}
 
-	if (ioam6h->type.bit18) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit9) {
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.ingr_wide)
 			return 1;
-		*p += sizeof(__u32);
-	}
+		p += sizeof(__u32);
 
-	if (ioam6h->type.bit19) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+		if (__be32_to_cpu(*((__u32 *)p)) != cnf.egr_wide)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit20) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit10) {
+		if (__be64_to_cpu(*((__u64 *)p)) != cnf.ns_wide)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u64);
 	}
 
-	if (ioam6h->type.bit21) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
+	if (trace->type.bit11) {
+		if (__be32_to_cpu(*((__u32 *)p)) != 0xffffffff)
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit22) {
+	if (trace->type.bit22) {
 		len = cnf.sc_data ? strlen(cnf.sc_data) : 0;
 		aligned = cnf.sc_data ? __ALIGN_KERNEL(len, 4) : 0;
 
-		raw32 = __be32_to_cpu(*((__u32 *)*p));
+		raw32 = __be32_to_cpu(*((__u32 *)p));
 		if (aligned != (raw32 >> 24) * 4 ||
 		    cnf.sc_id != (raw32 & 0xffffff))
 			return 1;
-		*p += sizeof(__u32);
+		p += sizeof(__u32);
 
 		if (cnf.sc_data) {
-			if (strncmp((char *)*p, cnf.sc_data, len))
+			if (strncmp((char *)p, cnf.sc_data, len))
 				return 1;
 
-			*p += len;
+			p += len;
 			aligned -= len;
 
 			while (aligned--) {
-				if (**p != '\0')
+				if (*p != '\0')
 					return 1;
-				*p += sizeof(__u8);
+				p += sizeof(__u8);
 			}
 		}
 	}
@@ -366,151 +588,351 @@ static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h,
 	return 0;
 }
 
-static int check_ioam_header_and_data(int tid, struct ioam6_trace_hdr *ioam6h,
-				      __u32 trace_type, __u16 ioam_ns)
+static int check_ioam_trace(int tid, struct ioam6_trace_hdr *trace,
+			    __u32 trace_type, __u8 trace_size, __u16 ioam_ns)
 {
-	__u8 *p;
-
-	if (check_ioam_header(tid, ioam6h, trace_type, ioam_ns))
+	if (check_header(tid, trace, trace_type, trace_size, ioam_ns))
 		return 1;
 
-	p = ioam6h->data + ioam6h->remlen * 4;
-
-	switch (tid) {
-	case TEST_OUT_BIT0:
-	case TEST_OUT_BIT1:
-	case TEST_OUT_BIT2:
-	case TEST_OUT_BIT3:
-	case TEST_OUT_BIT4:
-	case TEST_OUT_BIT5:
-	case TEST_OUT_BIT6:
-	case TEST_OUT_BIT7:
-	case TEST_OUT_BIT8:
-	case TEST_OUT_BIT9:
-	case TEST_OUT_BIT10:
-	case TEST_OUT_BIT11:
-	case TEST_OUT_BIT22:
-	case TEST_OUT_FULL_SUPP_TRACE:
-		return check_ioam6_data(&p, ioam6h, node1);
-
-	case TEST_IN_BIT0:
-	case TEST_IN_BIT1:
-	case TEST_IN_BIT2:
-	case TEST_IN_BIT3:
-	case TEST_IN_BIT4:
-	case TEST_IN_BIT5:
-	case TEST_IN_BIT6:
-	case TEST_IN_BIT7:
-	case TEST_IN_BIT8:
-	case TEST_IN_BIT9:
-	case TEST_IN_BIT10:
-	case TEST_IN_BIT11:
-	case TEST_IN_BIT22:
-	case TEST_IN_FULL_SUPP_TRACE:
-	{
-		__u32 tmp32 = node2.egr_wide;
-		__u16 tmp16 = node2.egr_id;
-		int res;
-
-		node2.egr_id = 0xffff;
-		node2.egr_wide = 0xffffffff;
+	if (tid > __TEST_OUT_MIN && tid < __TEST_OUT_MAX)
+		return check_data(trace, trace_size, node1, true);
 
-		res = check_ioam6_data(&p, ioam6h, node2);
-
-		node2.egr_id = tmp16;
-		node2.egr_wide = tmp32;
-
-		return res;
-	}
-
-	case TEST_FWD_FULL_SUPP_TRACE:
-		if (check_ioam6_data(&p, ioam6h, node3))
-			return 1;
-		if (check_ioam6_data(&p, ioam6h, node2))
-			return 1;
-		return check_ioam6_data(&p, ioam6h, node1);
-
-	default:
-		break;
-	}
+	if (tid > __TEST_IN_MIN && tid < __TEST_IN_MAX)
+		return check_data(trace, trace_size, node2, false);
 
 	return 1;
 }
 
 static int str2id(const char *tname)
 {
-	if (!strcmp("out_undef_ns", tname))
+	if (!strcmp("output_undef_ns", tname))
 		return TEST_OUT_UNDEF_NS;
-	if (!strcmp("out_no_room", tname))
+	if (!strcmp("output_no_room", tname))
 		return TEST_OUT_NO_ROOM;
-	if (!strcmp("out_bit0", tname))
+	if (!strcmp("output_no_room_oss", tname))
+		return TEST_OUT_NO_ROOM_OSS;
+	if (!strcmp("output_bit0", tname))
 		return TEST_OUT_BIT0;
-	if (!strcmp("out_bit1", tname))
+	if (!strcmp("output_bit1", tname))
 		return TEST_OUT_BIT1;
-	if (!strcmp("out_bit2", tname))
+	if (!strcmp("output_bit2", tname))
 		return TEST_OUT_BIT2;
-	if (!strcmp("out_bit3", tname))
+	if (!strcmp("output_bit3", tname))
 		return TEST_OUT_BIT3;
-	if (!strcmp("out_bit4", tname))
+	if (!strcmp("output_bit4", tname))
 		return TEST_OUT_BIT4;
-	if (!strcmp("out_bit5", tname))
+	if (!strcmp("output_bit5", tname))
 		return TEST_OUT_BIT5;
-	if (!strcmp("out_bit6", tname))
+	if (!strcmp("output_bit6", tname))
 		return TEST_OUT_BIT6;
-	if (!strcmp("out_bit7", tname))
+	if (!strcmp("output_bit7", tname))
 		return TEST_OUT_BIT7;
-	if (!strcmp("out_bit8", tname))
+	if (!strcmp("output_bit8", tname))
 		return TEST_OUT_BIT8;
-	if (!strcmp("out_bit9", tname))
+	if (!strcmp("output_bit9", tname))
 		return TEST_OUT_BIT9;
-	if (!strcmp("out_bit10", tname))
+	if (!strcmp("output_bit10", tname))
 		return TEST_OUT_BIT10;
-	if (!strcmp("out_bit11", tname))
+	if (!strcmp("output_bit11", tname))
 		return TEST_OUT_BIT11;
-	if (!strcmp("out_bit22", tname))
+	if (!strcmp("output_bit22", tname))
 		return TEST_OUT_BIT22;
-	if (!strcmp("out_full_supp_trace", tname))
+	if (!strcmp("output_size4", tname))
+		return TEST_OUT_SIZE4;
+	if (!strcmp("output_size8", tname))
+		return TEST_OUT_SIZE8;
+	if (!strcmp("output_size12", tname))
+		return TEST_OUT_SIZE12;
+	if (!strcmp("output_size16", tname))
+		return TEST_OUT_SIZE16;
+	if (!strcmp("output_size20", tname))
+		return TEST_OUT_SIZE20;
+	if (!strcmp("output_size24", tname))
+		return TEST_OUT_SIZE24;
+	if (!strcmp("output_size28", tname))
+		return TEST_OUT_SIZE28;
+	if (!strcmp("output_size32", tname))
+		return TEST_OUT_SIZE32;
+	if (!strcmp("output_size36", tname))
+		return TEST_OUT_SIZE36;
+	if (!strcmp("output_size40", tname))
+		return TEST_OUT_SIZE40;
+	if (!strcmp("output_size44", tname))
+		return TEST_OUT_SIZE44;
+	if (!strcmp("output_size48", tname))
+		return TEST_OUT_SIZE48;
+	if (!strcmp("output_size52", tname))
+		return TEST_OUT_SIZE52;
+	if (!strcmp("output_size56", tname))
+		return TEST_OUT_SIZE56;
+	if (!strcmp("output_size60", tname))
+		return TEST_OUT_SIZE60;
+	if (!strcmp("output_size64", tname))
+		return TEST_OUT_SIZE64;
+	if (!strcmp("output_size68", tname))
+		return TEST_OUT_SIZE68;
+	if (!strcmp("output_size72", tname))
+		return TEST_OUT_SIZE72;
+	if (!strcmp("output_size76", tname))
+		return TEST_OUT_SIZE76;
+	if (!strcmp("output_size80", tname))
+		return TEST_OUT_SIZE80;
+	if (!strcmp("output_size84", tname))
+		return TEST_OUT_SIZE84;
+	if (!strcmp("output_size88", tname))
+		return TEST_OUT_SIZE88;
+	if (!strcmp("output_size92", tname))
+		return TEST_OUT_SIZE92;
+	if (!strcmp("output_size96", tname))
+		return TEST_OUT_SIZE96;
+	if (!strcmp("output_size100", tname))
+		return TEST_OUT_SIZE100;
+	if (!strcmp("output_size104", tname))
+		return TEST_OUT_SIZE104;
+	if (!strcmp("output_size108", tname))
+		return TEST_OUT_SIZE108;
+	if (!strcmp("output_size112", tname))
+		return TEST_OUT_SIZE112;
+	if (!strcmp("output_size116", tname))
+		return TEST_OUT_SIZE116;
+	if (!strcmp("output_size120", tname))
+		return TEST_OUT_SIZE120;
+	if (!strcmp("output_size124", tname))
+		return TEST_OUT_SIZE124;
+	if (!strcmp("output_size128", tname))
+		return TEST_OUT_SIZE128;
+	if (!strcmp("output_size132", tname))
+		return TEST_OUT_SIZE132;
+	if (!strcmp("output_size136", tname))
+		return TEST_OUT_SIZE136;
+	if (!strcmp("output_size140", tname))
+		return TEST_OUT_SIZE140;
+	if (!strcmp("output_size144", tname))
+		return TEST_OUT_SIZE144;
+	if (!strcmp("output_size148", tname))
+		return TEST_OUT_SIZE148;
+	if (!strcmp("output_size152", tname))
+		return TEST_OUT_SIZE152;
+	if (!strcmp("output_size156", tname))
+		return TEST_OUT_SIZE156;
+	if (!strcmp("output_size160", tname))
+		return TEST_OUT_SIZE160;
+	if (!strcmp("output_size164", tname))
+		return TEST_OUT_SIZE164;
+	if (!strcmp("output_size168", tname))
+		return TEST_OUT_SIZE168;
+	if (!strcmp("output_size172", tname))
+		return TEST_OUT_SIZE172;
+	if (!strcmp("output_size176", tname))
+		return TEST_OUT_SIZE176;
+	if (!strcmp("output_size180", tname))
+		return TEST_OUT_SIZE180;
+	if (!strcmp("output_size184", tname))
+		return TEST_OUT_SIZE184;
+	if (!strcmp("output_size188", tname))
+		return TEST_OUT_SIZE188;
+	if (!strcmp("output_size192", tname))
+		return TEST_OUT_SIZE192;
+	if (!strcmp("output_size196", tname))
+		return TEST_OUT_SIZE196;
+	if (!strcmp("output_size200", tname))
+		return TEST_OUT_SIZE200;
+	if (!strcmp("output_size204", tname))
+		return TEST_OUT_SIZE204;
+	if (!strcmp("output_size208", tname))
+		return TEST_OUT_SIZE208;
+	if (!strcmp("output_size212", tname))
+		return TEST_OUT_SIZE212;
+	if (!strcmp("output_size216", tname))
+		return TEST_OUT_SIZE216;
+	if (!strcmp("output_size220", tname))
+		return TEST_OUT_SIZE220;
+	if (!strcmp("output_size224", tname))
+		return TEST_OUT_SIZE224;
+	if (!strcmp("output_size228", tname))
+		return TEST_OUT_SIZE228;
+	if (!strcmp("output_size232", tname))
+		return TEST_OUT_SIZE232;
+	if (!strcmp("output_size236", tname))
+		return TEST_OUT_SIZE236;
+	if (!strcmp("output_size240", tname))
+		return TEST_OUT_SIZE240;
+	if (!strcmp("output_size244", tname))
+		return TEST_OUT_SIZE244;
+	if (!strcmp("output_full_supp_trace", tname))
 		return TEST_OUT_FULL_SUPP_TRACE;
-	if (!strcmp("in_undef_ns", tname))
+	if (!strcmp("input_undef_ns", tname))
 		return TEST_IN_UNDEF_NS;
-	if (!strcmp("in_no_room", tname))
+	if (!strcmp("input_no_room", tname))
 		return TEST_IN_NO_ROOM;
-	if (!strcmp("in_oflag", tname))
+	if (!strcmp("input_no_room_oss", tname))
+		return TEST_IN_NO_ROOM_OSS;
+	if (!strcmp("input_disabled", tname))
+		return TEST_IN_DISABLED;
+	if (!strcmp("input_oflag", tname))
 		return TEST_IN_OFLAG;
-	if (!strcmp("in_bit0", tname))
+	if (!strcmp("input_bit0", tname))
 		return TEST_IN_BIT0;
-	if (!strcmp("in_bit1", tname))
+	if (!strcmp("input_bit1", tname))
 		return TEST_IN_BIT1;
-	if (!strcmp("in_bit2", tname))
+	if (!strcmp("input_bit2", tname))
 		return TEST_IN_BIT2;
-	if (!strcmp("in_bit3", tname))
+	if (!strcmp("input_bit3", tname))
 		return TEST_IN_BIT3;
-	if (!strcmp("in_bit4", tname))
+	if (!strcmp("input_bit4", tname))
 		return TEST_IN_BIT4;
-	if (!strcmp("in_bit5", tname))
+	if (!strcmp("input_bit5", tname))
 		return TEST_IN_BIT5;
-	if (!strcmp("in_bit6", tname))
+	if (!strcmp("input_bit6", tname))
 		return TEST_IN_BIT6;
-	if (!strcmp("in_bit7", tname))
+	if (!strcmp("input_bit7", tname))
 		return TEST_IN_BIT7;
-	if (!strcmp("in_bit8", tname))
+	if (!strcmp("input_bit8", tname))
 		return TEST_IN_BIT8;
-	if (!strcmp("in_bit9", tname))
+	if (!strcmp("input_bit9", tname))
 		return TEST_IN_BIT9;
-	if (!strcmp("in_bit10", tname))
+	if (!strcmp("input_bit10", tname))
 		return TEST_IN_BIT10;
-	if (!strcmp("in_bit11", tname))
+	if (!strcmp("input_bit11", tname))
 		return TEST_IN_BIT11;
-	if (!strcmp("in_bit22", tname))
+	if (!strcmp("input_bit22", tname))
 		return TEST_IN_BIT22;
-	if (!strcmp("in_full_supp_trace", tname))
+	if (!strcmp("input_size4", tname))
+		return TEST_IN_SIZE4;
+	if (!strcmp("input_size8", tname))
+		return TEST_IN_SIZE8;
+	if (!strcmp("input_size12", tname))
+		return TEST_IN_SIZE12;
+	if (!strcmp("input_size16", tname))
+		return TEST_IN_SIZE16;
+	if (!strcmp("input_size20", tname))
+		return TEST_IN_SIZE20;
+	if (!strcmp("input_size24", tname))
+		return TEST_IN_SIZE24;
+	if (!strcmp("input_size28", tname))
+		return TEST_IN_SIZE28;
+	if (!strcmp("input_size32", tname))
+		return TEST_IN_SIZE32;
+	if (!strcmp("input_size36", tname))
+		return TEST_IN_SIZE36;
+	if (!strcmp("input_size40", tname))
+		return TEST_IN_SIZE40;
+	if (!strcmp("input_size44", tname))
+		return TEST_IN_SIZE44;
+	if (!strcmp("input_size48", tname))
+		return TEST_IN_SIZE48;
+	if (!strcmp("input_size52", tname))
+		return TEST_IN_SIZE52;
+	if (!strcmp("input_size56", tname))
+		return TEST_IN_SIZE56;
+	if (!strcmp("input_size60", tname))
+		return TEST_IN_SIZE60;
+	if (!strcmp("input_size64", tname))
+		return TEST_IN_SIZE64;
+	if (!strcmp("input_size68", tname))
+		return TEST_IN_SIZE68;
+	if (!strcmp("input_size72", tname))
+		return TEST_IN_SIZE72;
+	if (!strcmp("input_size76", tname))
+		return TEST_IN_SIZE76;
+	if (!strcmp("input_size80", tname))
+		return TEST_IN_SIZE80;
+	if (!strcmp("input_size84", tname))
+		return TEST_IN_SIZE84;
+	if (!strcmp("input_size88", tname))
+		return TEST_IN_SIZE88;
+	if (!strcmp("input_size92", tname))
+		return TEST_IN_SIZE92;
+	if (!strcmp("input_size96", tname))
+		return TEST_IN_SIZE96;
+	if (!strcmp("input_size100", tname))
+		return TEST_IN_SIZE100;
+	if (!strcmp("input_size104", tname))
+		return TEST_IN_SIZE104;
+	if (!strcmp("input_size108", tname))
+		return TEST_IN_SIZE108;
+	if (!strcmp("input_size112", tname))
+		return TEST_IN_SIZE112;
+	if (!strcmp("input_size116", tname))
+		return TEST_IN_SIZE116;
+	if (!strcmp("input_size120", tname))
+		return TEST_IN_SIZE120;
+	if (!strcmp("input_size124", tname))
+		return TEST_IN_SIZE124;
+	if (!strcmp("input_size128", tname))
+		return TEST_IN_SIZE128;
+	if (!strcmp("input_size132", tname))
+		return TEST_IN_SIZE132;
+	if (!strcmp("input_size136", tname))
+		return TEST_IN_SIZE136;
+	if (!strcmp("input_size140", tname))
+		return TEST_IN_SIZE140;
+	if (!strcmp("input_size144", tname))
+		return TEST_IN_SIZE144;
+	if (!strcmp("input_size148", tname))
+		return TEST_IN_SIZE148;
+	if (!strcmp("input_size152", tname))
+		return TEST_IN_SIZE152;
+	if (!strcmp("input_size156", tname))
+		return TEST_IN_SIZE156;
+	if (!strcmp("input_size160", tname))
+		return TEST_IN_SIZE160;
+	if (!strcmp("input_size164", tname))
+		return TEST_IN_SIZE164;
+	if (!strcmp("input_size168", tname))
+		return TEST_IN_SIZE168;
+	if (!strcmp("input_size172", tname))
+		return TEST_IN_SIZE172;
+	if (!strcmp("input_size176", tname))
+		return TEST_IN_SIZE176;
+	if (!strcmp("input_size180", tname))
+		return TEST_IN_SIZE180;
+	if (!strcmp("input_size184", tname))
+		return TEST_IN_SIZE184;
+	if (!strcmp("input_size188", tname))
+		return TEST_IN_SIZE188;
+	if (!strcmp("input_size192", tname))
+		return TEST_IN_SIZE192;
+	if (!strcmp("input_size196", tname))
+		return TEST_IN_SIZE196;
+	if (!strcmp("input_size200", tname))
+		return TEST_IN_SIZE200;
+	if (!strcmp("input_size204", tname))
+		return TEST_IN_SIZE204;
+	if (!strcmp("input_size208", tname))
+		return TEST_IN_SIZE208;
+	if (!strcmp("input_size212", tname))
+		return TEST_IN_SIZE212;
+	if (!strcmp("input_size216", tname))
+		return TEST_IN_SIZE216;
+	if (!strcmp("input_size220", tname))
+		return TEST_IN_SIZE220;
+	if (!strcmp("input_size224", tname))
+		return TEST_IN_SIZE224;
+	if (!strcmp("input_size228", tname))
+		return TEST_IN_SIZE228;
+	if (!strcmp("input_size232", tname))
+		return TEST_IN_SIZE232;
+	if (!strcmp("input_size236", tname))
+		return TEST_IN_SIZE236;
+	if (!strcmp("input_size240", tname))
+		return TEST_IN_SIZE240;
+	if (!strcmp("input_size244", tname))
+		return TEST_IN_SIZE244;
+	if (!strcmp("input_full_supp_trace", tname))
 		return TEST_IN_FULL_SUPP_TRACE;
-	if (!strcmp("fwd_full_supp_trace", tname))
-		return TEST_FWD_FULL_SUPP_TRACE;
 
 	return -1;
 }
 
+static int ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2)
+{
+	return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
+		(a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
+		(a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
+		(a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0;
+}
+
 static int get_u32(__u32 *val, const char *arg, int base)
 {
 	unsigned long res;
@@ -555,119 +977,124 @@ static int get_u16(__u16 *val, const char *arg, int base)
 	return 0;
 }
 
-static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = {
-	[TEST_OUT_UNDEF_NS]		= check_ioam_header,
-	[TEST_OUT_NO_ROOM]		= check_ioam_header,
-	[TEST_OUT_BIT0]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT1]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT2]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT3]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT4]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT5]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT6]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT7]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT8]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT9]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT10]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT11]		= check_ioam_header_and_data,
-	[TEST_OUT_BIT22]		= check_ioam_header_and_data,
-	[TEST_OUT_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-	[TEST_IN_UNDEF_NS]		= check_ioam_header,
-	[TEST_IN_NO_ROOM]		= check_ioam_header,
-	[TEST_IN_OFLAG]		= check_ioam_header,
-	[TEST_IN_BIT0]			= check_ioam_header_and_data,
-	[TEST_IN_BIT1]			= check_ioam_header_and_data,
-	[TEST_IN_BIT2]			= check_ioam_header_and_data,
-	[TEST_IN_BIT3]			= check_ioam_header_and_data,
-	[TEST_IN_BIT4]			= check_ioam_header_and_data,
-	[TEST_IN_BIT5]			= check_ioam_header_and_data,
-	[TEST_IN_BIT6]			= check_ioam_header_and_data,
-	[TEST_IN_BIT7]			= check_ioam_header_and_data,
-	[TEST_IN_BIT8]			= check_ioam_header_and_data,
-	[TEST_IN_BIT9]			= check_ioam_header_and_data,
-	[TEST_IN_BIT10]		= check_ioam_header_and_data,
-	[TEST_IN_BIT11]		= check_ioam_header_and_data,
-	[TEST_IN_BIT22]		= check_ioam_header_and_data,
-	[TEST_IN_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-	[TEST_FWD_FULL_SUPP_TRACE]	= check_ioam_header_and_data,
-};
+static int get_u8(__u8 *val, const char *arg, int base)
+{
+	unsigned long res;
+	char *ptr;
+
+	if (!arg || !*arg)
+		return -1;
+	res = strtoul(arg, &ptr, base);
+
+	if (!ptr || ptr == arg || *ptr)
+		return -1;
+
+	if (res == ULONG_MAX && errno == ERANGE)
+		return -1;
+
+	if (res > 0xFFUL)
+		return -1;
+
+	*val = res;
+	return 0;
+}
 
 int main(int argc, char **argv)
 {
-	int fd, size, hoplen, tid, ret = 1, on = 1;
-	struct ioam6_hdr *opt;
-	struct cmsghdr *cmsg;
-	struct msghdr msg;
-	struct iovec iov;
-	__u8 buffer[512];
+	__u8 buffer[512], *ptr, nexthdr, tr_size;
+	struct ioam6_trace_hdr *trace;
+	unsigned int hoplen, ret = 1;
+	struct ipv6_hopopt_hdr *hbh;
+	int fd, size, testname_id;
+	struct in6_addr src, dst;
+	struct ioam6_hdr *ioam6;
+	struct timeval timeout;
+	struct ipv6hdr *ipv6;
 	__u32 tr_type;
 	__u16 ioam_ns;
-	__u8 *ptr;
 
-	if (argc != 5)
+	if (argc != 9)
 		goto out;
 
-	tid = str2id(argv[1]);
-	if (tid < 0 || !func[tid])
-		goto out;
+	testname_id = str2id(argv[2]);
 
-	if (get_u32(&tr_type, argv[2], 16) ||
-	    get_u16(&ioam_ns, argv[3], 0))
+	if (testname_id < 0 ||
+	    inet_pton(AF_INET6, argv[3], &src) != 1 ||
+	    inet_pton(AF_INET6, argv[4], &dst) != 1 ||
+	    get_u32(&tr_type, argv[5], 16) ||
+	    get_u8(&tr_size, argv[6], 0) ||
+	    get_u16(&ioam_ns, argv[7], 0))
 		goto out;
 
-	fd = socket(PF_INET6, SOCK_RAW,
-		    !strcmp(argv[4], "encap") ? IPPROTO_IPV6 : IPPROTO_ICMPV6);
+	nexthdr = (!strcmp(argv[8], "encap") ? IPPROTO_IPV6 : IPPROTO_ICMPV6);
+
+	hoplen = sizeof(*hbh);
+	hoplen += 2; // 2-byte padding for alignment
+	hoplen += sizeof(*ioam6); // IOAM option header
+	hoplen += sizeof(*trace); // IOAM trace header
+	hoplen += tr_size; // IOAM trace size
+	hoplen += (tr_size % 8); // optional padding
+
+	fd = socket(AF_PACKET, SOCK_DGRAM, __cpu_to_be16(ETH_P_IPV6));
 	if (fd < 0)
 		goto out;
 
-	setsockopt(fd, IPPROTO_IPV6, IPV6_RECVHOPOPTS,  &on, sizeof(on));
+	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+		       argv[1], strlen(argv[1])))
+		goto close;
 
-	iov.iov_len = 1;
-	iov.iov_base = malloc(CMSG_SPACE(sizeof(buffer)));
-	if (!iov.iov_base)
+	timeout.tv_sec = 1;
+	timeout.tv_usec = 0;
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
+		       (const char *)&timeout, sizeof(timeout)))
 		goto close;
 recv:
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
-	msg.msg_control = buffer;
-	msg.msg_controllen = CMSG_SPACE(sizeof(buffer));
-
-	size = recvmsg(fd, &msg, 0);
+	size = recv(fd, buffer, sizeof(buffer), 0);
 	if (size <= 0)
 		goto close;
 
-	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
-		if (cmsg->cmsg_level != IPPROTO_IPV6 ||
-		    cmsg->cmsg_type != IPV6_HOPOPTS ||
-		    cmsg->cmsg_len < sizeof(struct ipv6_hopopt_hdr))
-			continue;
+	ipv6 = (struct ipv6hdr *)buffer;
+
+	/* Skip packets that do not have the expected src/dst address or that
+	 * do not have a Hop-by-hop.
+	 */
+	if (!ipv6_addr_equal(&ipv6->saddr, &src) ||
+	    !ipv6_addr_equal(&ipv6->daddr, &dst) ||
+	    ipv6->nexthdr != IPPROTO_HOPOPTS)
+		goto recv;
+
+	/* Check Hbh's Next Header and Size. */
+	hbh = (struct ipv6_hopopt_hdr *)(buffer + sizeof(*ipv6));
+	if (hbh->nexthdr != nexthdr || hbh->hdrlen != (hoplen >> 3) - 1)
+		goto close;
 
-		ptr = (__u8 *)CMSG_DATA(cmsg);
+	/* Check we have a 2-byte padding for alignment. */
+	ptr = (__u8 *)hbh + sizeof(*hbh);
+	if (ptr[0] != IPV6_TLV_PADN && ptr[1] != 0)
+		goto close;
 
-		hoplen = (ptr[1] + 1) << 3;
-		ptr += sizeof(struct ipv6_hopopt_hdr);
+	/* Check we now have the IOAM option. */
+	ptr += 2;
+	if (ptr[0] != IPV6_TLV_IOAM)
+		goto close;
 
-		while (hoplen > 0) {
-			opt = (struct ioam6_hdr *)ptr;
+	/* Check its size and the IOAM option type. */
+	ioam6 = (struct ioam6_hdr *)ptr;
+	if (ioam6->opt_len != sizeof(*ioam6) - 2 + sizeof(*trace) + tr_size ||
+	    ioam6->type != IOAM6_TYPE_PREALLOC)
+		goto close;
 
-			if (opt->opt_type == IPV6_TLV_IOAM &&
-			    opt->type == IOAM6_TYPE_PREALLOC) {
-				ptr += sizeof(*opt);
-				ret = func[tid](tid,
-						(struct ioam6_trace_hdr *)ptr,
-						tr_type, ioam_ns);
-				goto close;
-			}
+	trace = (struct ioam6_trace_hdr *)(ptr + sizeof(*ioam6));
 
-			ptr += opt->opt_len + 2;
-			hoplen -= opt->opt_len + 2;
-		}
-	}
+	/* Check the trailing 4-byte padding (potentially). */
+	ptr = (__u8 *)trace + sizeof(*trace) + tr_size;
+	if (tr_size % 8 && ptr[0] != IPV6_TLV_PADN && ptr[1] != 2 &&
+	    ptr[2] != 0 && ptr[3] != 0)
+		goto close;
 
-	goto recv;
+	/* Check the IOAM header and data. */
+	ret = check_ioam_trace(testname_id, trace, tr_type, tr_size, ioam_ns);
 close:
-	free(iov.iov_base);
 	close(fd);
 out:
 	return ret;
diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c
index 193b82745fd8..29451d2244b7 100644
--- a/tools/testing/selftests/net/ip_local_port_range.c
+++ b/tools/testing/selftests/net/ip_local_port_range.c
@@ -359,7 +359,7 @@ TEST_F(ip_local_port_range, late_bind)
 		struct sockaddr_in v4;
 		struct sockaddr_in6 v6;
 	} addr;
-	socklen_t addr_len;
+	socklen_t addr_len = 0;
 	const int one = 1;
 	int fd, err;
 	__u32 range;
diff --git a/tools/testing/selftests/net/ip_local_port_range.sh b/tools/testing/selftests/net/ip_local_port_range.sh
index 6c6ad346eaa0..4ff746db1256 100755
--- a/tools/testing/selftests/net/ip_local_port_range.sh
+++ b/tools/testing/selftests/net/ip_local_port_range.sh
@@ -2,4 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ./in_netns.sh \
-  sh -c 'sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && ./ip_local_port_range'
+  sh -c 'sysctl -q -w net.mptcp.enabled=1 && \
+         sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && \
+         ./ip_local_port_range'
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index be4a30a0d02a..9b44a091802c 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -227,7 +227,8 @@ static int rtattr_pack(struct nlmsghdr *nh, size_t req_sz,
 
 	attr->rta_len = RTA_LENGTH(size);
 	attr->rta_type = rta_type;
-	memcpy(RTA_DATA(attr), payload, size);
+	if (payload)
+		memcpy(RTA_DATA(attr), payload, size);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
new file mode 100755
index 000000000000..c6866e42f95c
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing for potential kernel soft lockup during IPv6 routing table
+# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup
+# occurs, a kernel panic will be triggered to prevent associated issues.
+#
+#
+#                            Test Environment Layout
+#
+# ┌----------------┐                                         ┌----------------┐
+# |     SOURCE_NS  |                                         |     SINK_NS    |
+# |    NAMESPACE   |                                         |    NAMESPACE   |
+# |(iperf3 clients)|                                         |(iperf3 servers)|
+# |                |                                         |                |
+# |                |                                         |                |
+# |    ┌-----------|                             nexthops    |---------┐      |
+# |    |veth_source|<--------------------------------------->|veth_sink|<┐    |
+# |    └-----------|2001:0DB8:1::0:1/96  2001:0DB8:1::1:1/96 |---------┘ |    |
+# |                |         ^           2001:0DB8:1::1:2/96 |           |    |
+# |                |         .                   .           |       fwd |    |
+# |  ┌---------┐   |         .                   .           |           |    |
+# |  |   IPv6  |   |         .                   .           |           V    |
+# |  | routing |   |         .           2001:0DB8:1::1:80/96|        ┌-----┐ |
+# |  |  table  |   |         .                               |        | lo  | |
+# |  | nexthop |   |         .                               └--------┴-----┴-┘
+# |  | update  |   |         ............................> 2001:0DB8:2::1:1/128
+# |  └-------- ┘   |
+# └----------------┘
+#
+# The test script sets up two network namespaces, source_ns and sink_ns,
+# connected via a veth link. Within source_ns, it continuously updates the
+# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop
+# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a
+# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds.
+#
+# Simultaneously, multiple iperf3 clients within source_ns generate heavy
+# outgoing IPv6 traffic. Each client is assigned a unique port number starting
+# at 5000 and incrementing sequentially. Each client targets a unique iperf3
+# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface
+# using the same port number.
+#
+# The number of iperf3 servers and clients is set to half of the total
+# available cores on each machine.
+#
+# NOTE: We have tested this script on machines with various CPU specifications,
+# ranging from lower to higher performance as listed below. The test script
+# effectively triggered a kernel soft lockup on machines running an unpatched
+# kernel in under a minute:
+#
+# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz
+# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz
+# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz
+# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz
+# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz
+# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz
+# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz
+#
+# On less performant machines, you may need to increase the TEST_DURATION
+# parameter to enhance the likelihood of encountering a race condition leading
+# to a kernel soft lockup and avoid a false negative result.
+#
+# NOTE: The test may not produce the expected result in virtualized
+# environments (e.g., qemu) due to differences in timing and CPU handling,
+# which can affect the conditions needed to trigger a soft lockup.
+
+source lib.sh
+
+TEST_DURATION=300
+ROUTING_TABLE_REFRESH_PERIOD=0.01
+
+IPERF3_BITRATE="300m"
+
+
+IPV6_NEXTHOP_ADDR_COUNT="128"
+IPV6_NEXTHOP_ADDR_MASK="96"
+IPV6_NEXTHOP_PREFIX="2001:0DB8:1"
+
+
+SOURCE_TEST_IFACE="veth_source"
+SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96"
+
+SINK_TEST_IFACE="veth_sink"
+# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses:
+# 2001:0DB8:1::1:1  to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT}
+SINK_LOOPBACK_IFACE="lo"
+SINK_LOOPBACK_IP_MASK="128"
+SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1"
+
+nexthop_ip_list=""
+termination_signal=""
+kernel_softlokup_panic_prev_val=""
+
+terminate_ns_processes_by_pattern() {
+	local ns=$1
+	local pattern=$2
+
+	for pid in $(ip netns pids ${ns}); do
+		[ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid
+	done
+}
+
+cleanup() {
+	echo "info: cleaning up namespaces and terminating all processes within them..."
+
+
+	# Terminate iperf3 instances running in the source_ns. To avoid race
+	# conditions, first iterate over the PIDs and terminate those
+	# associated with the bash shells running the
+	# `while true; do iperf3 -c ...; done` loops. In a second iteration,
+	# terminate the individual `iperf3 -c ...` instances.
+	terminate_ns_processes_by_pattern ${source_ns} while
+	terminate_ns_processes_by_pattern ${source_ns} iperf3
+
+	# Repeat the same process for sink_ns
+	terminate_ns_processes_by_pattern ${sink_ns} while
+	terminate_ns_processes_by_pattern ${sink_ns} iperf3
+
+	# Check if any iperf3 instances are still running. This could happen
+	# if a core has entered an infinite loop and the timeout for detecting
+	# the soft lockup has not expired, but either the test interval has
+	# already elapsed or the test was terminated manually (e.g., with ^C)
+	for pid in $(ip netns pids ${source_ns}); do
+		if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then
+			echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!"
+			exit ${ksft_fail}
+		fi
+	done
+
+	if [ "$termination_signal" == "SIGINT" ]; then
+		echo "SKIP: Termination due to ^C (SIGINT)"
+	elif [ "$termination_signal" == "SIGALRM" ]; then
+		echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test"
+	fi
+
+	cleanup_ns ${source_ns} ${sink_ns}
+
+	sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val}
+}
+
+setup_prepare() {
+	setup_ns source_ns sink_ns
+
+	ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns}
+
+	# Setting up the Source namespace
+	ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE}
+	ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000
+	ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up
+	ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1
+
+	# Setting up the Sink namespace
+	ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE}
+	ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up
+	ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1
+
+	ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up
+	ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1
+
+
+	# Populate nexthop IPv6 addresses on the test interface in the sink_ns
+	echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..."
+	for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+		ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE};
+	done
+
+	# Preparing list of nexthops
+	for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+		nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1"
+	done
+}
+
+
+test_soft_lockup_during_routing_table_refresh() {
+	# Start num_of_iperf_servers iperf3 servers in the sink_ns namespace,
+	# each listening on ports starting at 5001 and incrementing
+	# sequentially. Since iperf3 instances may terminate unexpectedly, a
+	# while loop is used to automatically restart them in such cases.
+	echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..."
+	for i in $(seq 1 ${num_of_iperf_servers}); do
+		cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null"
+		ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+	done
+
+	# Wait for the iperf3 servers to be ready
+	for i in $(seq ${num_of_iperf_servers}); do
+		port=$(printf '5%03d' ${i});
+		wait_local_port_listen ${sink_ns} ${port} tcp
+	done
+
+	# Continuously refresh the routing table in the background within
+	# the source_ns namespace
+	ip netns exec ${source_ns} bash -c "
+		while \$(ip netns list | grep -q ${source_ns}); do
+			ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list};
+			sleep ${ROUTING_TABLE_REFRESH_PERIOD};
+			ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK};
+		done &"
+
+	# Start num_of_iperf_servers iperf3 clients in the source_ns namespace,
+	# each sending TCP traffic on sequential ports starting at 5001.
+	# Since iperf3 instances may terminate unexpectedly (e.g., if the route
+	# to the server is deleted in the background during a route refresh), a
+	# while loop is used to automatically restart them in such cases.
+	echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..."
+	for i in $(seq 1 ${num_of_iperf_servers}); do
+		cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null"
+		ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+	done
+
+	echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..."
+	echo "info: A kernel soft lockup, if detected, results in a kernel panic!"
+
+	wait
+}
+
+# Make sure 'iperf3' is installed, skip the test otherwise
+if [ ! -x "$(command -v "iperf3")" ]; then
+	echo "SKIP: 'iperf3' is not installed. Skipping the test."
+	exit ${ksft_skip}
+fi
+
+# Determine the number of cores on the machine
+num_of_iperf_servers=$(( $(nproc)/2 ))
+
+# Check if we are running on a multi-core machine, skip the test otherwise
+if [ "${num_of_iperf_servers}" -eq 0 ]; then
+	echo "SKIP: This test is not valid on a single core machine!"
+	exit ${ksft_skip}
+fi
+
+# Since the kernel soft lockup we're testing causes at least one core to enter
+# an infinite loop, destabilizing the host and likely affecting subsequent
+# tests, we trigger a kernel panic instead of reporting a failure and
+# continuing
+kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic)
+sysctl -qw kernel.softlockup_panic=1
+
+handle_sigint() {
+	termination_signal="SIGINT"
+	cleanup
+	exit ${ksft_skip}
+}
+
+handle_sigalrm() {
+	termination_signal="SIGALRM"
+	cleanup
+	exit ${ksft_pass}
+}
+
+trap handle_sigint SIGINT
+trap handle_sigalrm SIGALRM
+
+(sleep ${TEST_DURATION} && kill -s SIGALRM $$)&
+
+setup_prepare
+test_soft_lockup_during_routing_table_refresh
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index f9fe182dfbd4..006fdadcc4b9 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -1,96 +1,618 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+net_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+source "$net_dir/lib/sh/defer.sh"
+
 ##############################################################################
 # Defines
 
-WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
+: "${WAIT_TIMEOUT:=20}"
+
+# Whether to pause on after a failure.
+: "${PAUSE_ON_FAIL:=no}"
+
 BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms
 
-# Kselftest framework requirement - SKIP code is 4.
+# Kselftest framework constants.
+ksft_pass=0
+ksft_fail=1
+ksft_xfail=2
 ksft_skip=4
+
 # namespace list created by setup_ns
-NS_LIST=""
+NS_LIST=()
+
+# Exit status to return at the end. Set in case one of the tests fails.
+EXIT_STATUS=0
+# Per-test return value. Clear at the beginning of each test.
+RET=0
 
 ##############################################################################
 # Helpers
-busywait()
+
+__ksft_status_merge()
 {
-	local timeout=$1; shift
+	local a=$1; shift
+	local b=$1; shift
+	local -A weights
+	local weight=0
+
+	local i
+	for i in "$@"; do
+		weights[$i]=$((weight++))
+	done
+
+	if [[ ${weights[$a]} > ${weights[$b]} ]]; then
+		echo "$a"
+		return 0
+	else
+		echo "$b"
+		return 1
+	fi
+}
+
+ksft_status_merge()
+{
+	local a=$1; shift
+	local b=$1; shift
+
+	__ksft_status_merge "$a" "$b" \
+		$ksft_pass $ksft_xfail $ksft_skip $ksft_fail
+}
+
+ksft_exit_status_merge()
+{
+	local a=$1; shift
+	local b=$1; shift
+
+	__ksft_status_merge "$a" "$b" \
+		$ksft_xfail $ksft_pass $ksft_skip $ksft_fail
+}
+
+loopy_wait()
+{
+	local sleep_cmd=$1; shift
+	local timeout_ms=$1; shift
 
 	local start_time="$(date -u +%s%3N)"
 	while true
 	do
 		local out
-		out=$("$@")
-		local ret=$?
-		if ((!ret)); then
+		if out=$("$@"); then
 			echo -n "$out"
 			return 0
 		fi
 
 		local current_time="$(date -u +%s%3N)"
-		if ((current_time - start_time > timeout)); then
+		if ((current_time - start_time > timeout_ms)); then
 			echo -n "$out"
 			return 1
 		fi
+
+		$sleep_cmd
+	done
+}
+
+busywait()
+{
+	local timeout_ms=$1; shift
+
+	loopy_wait : "$timeout_ms" "$@"
+}
+
+# timeout in seconds
+slowwait()
+{
+	local timeout_sec=$1; shift
+
+	loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@"
+}
+
+until_counter_is()
+{
+	local expr=$1; shift
+	local current=$("$@")
+
+	echo $((current))
+	((current $expr))
+}
+
+busywait_for_counter()
+{
+	local timeout=$1; shift
+	local delta=$1; shift
+
+	local base=$("$@")
+	busywait "$timeout" until_counter_is ">= $((base + delta))" "$@"
+}
+
+slowwait_for_counter()
+{
+	local timeout=$1; shift
+	local delta=$1; shift
+
+	local base=$("$@")
+	slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@"
+}
+
+# Check for existence of tools which are built as part of selftests
+# but may also already exist in $PATH
+check_gen_prog()
+{
+	local prog_name=$1; shift
+
+	if ! which $prog_name >/dev/null 2>/dev/null; then
+		PATH=$PWD:$PATH
+		if ! which $prog_name >/dev/null; then
+			echo "'$prog_name' command not found; skipping tests"
+			exit $ksft_skip
+		fi
+	fi
+}
+
+remove_ns_list()
+{
+	local item=$1
+	local ns
+	local ns_list=("${NS_LIST[@]}")
+	NS_LIST=()
+
+	for ns in "${ns_list[@]}"; do
+		if [ "${ns}" != "${item}" ]; then
+			NS_LIST+=("${ns}")
+		fi
 	done
 }
 
 cleanup_ns()
 {
 	local ns=""
-	local errexit=0
 	local ret=0
 
-	# disable errexit temporary
-	if [[ $- =~ "e" ]]; then
-		errexit=1
-		set +e
-	fi
-
 	for ns in "$@"; do
-		ip netns delete "${ns}" &> /dev/null
+		[ -z "${ns}" ] && continue
+		ip netns pids "${ns}" 2> /dev/null | xargs -r kill || true
+		ip netns delete "${ns}" &> /dev/null || true
 		if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then
 			echo "Warn: Failed to remove namespace $ns"
 			ret=1
+		else
+			remove_ns_list "${ns}"
 		fi
 	done
 
-	[ $errexit -eq 1 ] && set -e
 	return $ret
 }
 
 cleanup_all_ns()
 {
-	cleanup_ns $NS_LIST
+	cleanup_ns "${NS_LIST[@]}"
 }
 
 # setup netns with given names as prefix. e.g
 # setup_ns local remote
 setup_ns()
 {
-	local ns=""
 	local ns_name=""
-	local ns_list=""
+	local ns_list=()
 	for ns_name in "$@"; do
+		# avoid conflicts with local var: internal error
+		if [ "${ns_name}" = "ns_name" ]; then
+			echo "Failed to setup namespace '${ns_name}': invalid name"
+			cleanup_ns "${ns_list[@]}"
+			exit $ksft_fail
+		fi
+
 		# Some test may setup/remove same netns multi times
-		if unset ${ns_name} 2> /dev/null; then
-			ns="${ns_name,,}-$(mktemp -u XXXXXX)"
-			eval readonly ${ns_name}="$ns"
+		if [ -z "${!ns_name}" ]; then
+			eval "${ns_name}=${ns_name,,}-$(mktemp -u XXXXXX)"
 		else
-			eval ns='$'${ns_name}
-			cleanup_ns "$ns"
-
+			cleanup_ns "${!ns_name}"
 		fi
 
-		if ! ip netns add "$ns"; then
+		if ! ip netns add "${!ns_name}"; then
 			echo "Failed to create namespace $ns_name"
-			cleanup_ns "$ns_list"
+			cleanup_ns "${ns_list[@]}"
 			return $ksft_skip
 		fi
-		ip -n "$ns" link set lo up
-		ns_list="$ns_list $ns"
+		ip -n "${!ns_name}" link set lo up
+		ip netns exec "${!ns_name}" sysctl -wq net.ipv4.conf.all.rp_filter=0
+		ip netns exec "${!ns_name}" sysctl -wq net.ipv4.conf.default.rp_filter=0
+		ns_list+=("${!ns_name}")
+	done
+	NS_LIST+=("${ns_list[@]}")
+}
+
+# Create netdevsim with given id and net namespace.
+create_netdevsim() {
+    local id="$1"
+    local ns="$2"
+
+    modprobe netdevsim &> /dev/null
+    udevadm settle
+
+    echo "$id 1" | ip netns exec $ns tee /sys/bus/netdevsim/new_device >/dev/null
+    local dev=$(ip netns exec $ns ls /sys/bus/netdevsim/devices/netdevsim$id/net)
+    ip -netns $ns link set dev $dev name nsim$id
+    ip -netns $ns link set dev nsim$id up
+
+    echo nsim$id
+}
+
+# Remove netdevsim with given id.
+cleanup_netdevsim() {
+    local id="$1"
+
+    if [ -d "/sys/bus/netdevsim/devices/netdevsim$id/net" ]; then
+        echo "$id" > /sys/bus/netdevsim/del_device
+    fi
+}
+
+tc_rule_stats_get()
+{
+	local dev=$1; shift
+	local pref=$1; shift
+	local dir=${1:-ingress}; shift
+	local selector=${1:-.packets}; shift
+
+	tc -j -s filter show dev $dev $dir pref $pref \
+	    | jq ".[1].options.actions[].stats$selector"
+}
+
+tc_rule_handle_stats_get()
+{
+	local id=$1; shift
+	local handle=$1; shift
+	local selector=${1:-.packets}; shift
+	local netns=${1:-""}; shift
+
+	tc $netns -j -s filter show $id \
+	    | jq ".[] | select(.options.handle == $handle) | \
+		  .options.actions[0].stats$selector"
+}
+
+# attach a qdisc with two children match/no-match and a flower filter to match
+tc_set_flower_counter() {
+	local -r ns=$1
+	local -r ipver=$2
+	local -r dev=$3
+	local -r flower_expr=$4
+
+	tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \
+			priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+
+	tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo
+	tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo
+
+	tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \
+			flower $flower_expr classid 1:2
+}
+
+tc_get_flower_counter() {
+	local -r ns=$1
+	local -r dev=$2
+
+	tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets
+}
+
+ret_set_ksft_status()
+{
+	local ksft_status=$1; shift
+	local msg=$1; shift
+
+	RET=$(ksft_status_merge $RET $ksft_status)
+	if (( $? )); then
+		retmsg=$msg
+	fi
+}
+
+log_test_result()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+	local result=$1; shift
+	local retmsg=$1; shift
+
+	printf "TEST: %-60s  [%s]\n" "$test_name $opt_str" "$result"
+	if [[ $retmsg ]]; then
+		printf "\t%s\n" "$retmsg"
+	fi
+}
+
+pause_on_fail()
+{
+	if [[ $PAUSE_ON_FAIL == yes ]]; then
+		echo "Hit enter to continue, 'q' to quit"
+		read a
+		[[ $a == q ]] && exit 1
+	fi
+}
+
+handle_test_result_pass()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" " OK "
+}
+
+handle_test_result_fail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" FAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_xfail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" XFAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_skip()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" SKIP "$retmsg"
+}
+
+log_test()
+{
+	local test_name=$1
+	local opt_str=$2
+
+	if [[ $# -eq 2 ]]; then
+		opt_str="($opt_str)"
+	fi
+
+	if ((RET == ksft_pass)); then
+		handle_test_result_pass "$test_name" "$opt_str"
+	elif ((RET == ksft_xfail)); then
+		handle_test_result_xfail "$test_name" "$opt_str"
+	elif ((RET == ksft_skip)); then
+		handle_test_result_skip "$test_name" "$opt_str"
+	else
+		handle_test_result_fail "$test_name" "$opt_str"
+	fi
+
+	EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET)
+	return $RET
+}
+
+log_test_skip()
+{
+	RET=$ksft_skip retmsg= log_test "$@"
+}
+
+log_test_xfail()
+{
+	RET=$ksft_xfail retmsg= log_test "$@"
+}
+
+log_info()
+{
+	local msg=$1
+
+	echo "INFO: $msg"
+}
+
+tests_run()
+{
+	local current_test
+
+	for current_test in ${TESTS:-$ALL_TESTS}; do
+		in_defer_scope \
+			$current_test
+	done
+}
+
+# Whether FAILs should be interpreted as XFAILs. Internal.
+FAIL_TO_XFAIL=
+
+check_err()
+{
+	local err=$1
+	local msg=$2
+
+	if ((err)); then
+		if [[ $FAIL_TO_XFAIL = yes ]]; then
+			ret_set_ksft_status $ksft_xfail "$msg"
+		else
+			ret_set_ksft_status $ksft_fail "$msg"
+		fi
+	fi
+}
+
+check_fail()
+{
+	local err=$1
+	local msg=$2
+
+	check_err $((!err)) "$msg"
+}
+
+check_err_fail()
+{
+	local should_fail=$1; shift
+	local err=$1; shift
+	local what=$1; shift
+
+	if ((should_fail)); then
+		check_fail $err "$what succeeded, but should have failed"
+	else
+		check_err $err "$what failed"
+	fi
+}
+
+xfail()
+{
+	FAIL_TO_XFAIL=yes "$@"
+}
+
+xfail_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW = yes ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
+
+omit_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW != yes ]]; then
+		"$@"
+	fi
+}
+
+xfail_on_veth()
+{
+	local dev=$1; shift
+	local kind
+
+	kind=$(ip -j -d link show dev $dev |
+			jq -r '.[].linkinfo.info_kind')
+	if [[ $kind = veth ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
+
+mac_get()
+{
+	local if_name=$1
+
+	ip -j link show dev $if_name | jq -r '.[]["address"]'
+}
+
+kill_process()
+{
+	local pid=$1; shift
+
+	# Suppress noise from killing the process.
+	{ kill $pid && wait $pid; } 2>/dev/null
+}
+
+check_command()
+{
+	local cmd=$1; shift
+
+	if [[ ! -x "$(command -v "$cmd")" ]]; then
+		log_test_skip "$cmd not installed"
+		return $EXIT_STATUS
+	fi
+}
+
+require_command()
+{
+	local cmd=$1; shift
+
+	if ! check_command "$cmd"; then
+		exit $EXIT_STATUS
+	fi
+}
+
+ip_link_add()
+{
+	local name=$1; shift
+
+	ip link add name "$name" "$@"
+	defer ip link del dev "$name"
+}
+
+ip_link_set_master()
+{
+	local member=$1; shift
+	local master=$1; shift
+
+	ip link set dev "$member" master "$master"
+	defer ip link set dev "$member" nomaster
+}
+
+ip_link_set_addr()
+{
+	local name=$1; shift
+	local addr=$1; shift
+
+	local old_addr=$(mac_get "$name")
+	ip link set dev "$name" address "$addr"
+	defer ip link set dev "$name" address "$old_addr"
+}
+
+ip_link_is_up()
+{
+	local name=$1; shift
+
+	local state=$(ip -j link show "$name" |
+		      jq -r '(.[].flags[] | select(. == "UP")) // "DOWN"')
+	[[ $state == "UP" ]]
+}
+
+ip_link_set_up()
+{
+	local name=$1; shift
+
+	if ! ip_link_is_up "$name"; then
+		ip link set dev "$name" up
+		defer ip link set dev "$name" down
+	fi
+}
+
+ip_link_set_down()
+{
+	local name=$1; shift
+
+	if ip_link_is_up "$name"; then
+		ip link set dev "$name" down
+		defer ip link set dev "$name" up
+	fi
+}
+
+ip_addr_add()
+{
+	local name=$1; shift
+
+	ip addr add dev "$name" "$@"
+	defer ip addr del dev "$name" "$@"
+}
+
+ip_route_add()
+{
+	ip route add "$@"
+	defer ip route del "$@"
+}
+
+bridge_vlan_add()
+{
+	bridge vlan add "$@"
+	defer bridge vlan del "$@"
+}
+
+wait_local_port_listen()
+{
+	local listener_ns="${1}"
+	local port="${2}"
+	local protocol="${3}"
+	local pattern
+	local i
+
+	pattern=":$(printf "%04X" "${port}") "
+
+	# for tcp protocol additionally check the socket state
+	[ ${protocol} = "tcp" ] && pattern="${pattern}0A"
+	for i in $(seq 10); do
+		if ip netns exec "${listener_ns}" awk '{print $2" "$4}' \
+		   /proc/net/"${protocol}"* | grep -q "${pattern}"; then
+			break
+		fi
+		sleep 0.1
 	done
-	NS_LIST="$NS_LIST $ns_list"
 }
diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore
new file mode 100644
index 000000000000..bbc97d6bf556
--- /dev/null
+++ b/tools/testing/selftests/net/lib/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+csum
+xdp_helper
diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile
new file mode 100644
index 000000000000..88c4bc461459
--- /dev/null
+++ b/tools/testing/selftests/net/lib/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -Wall -Wl,--no-as-needed -O2 -g
+CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES)
+# Additional include paths needed by kselftest.h
+CFLAGS += -I../../
+
+TEST_FILES := ../../../../../Documentation/netlink/specs
+TEST_FILES += ../../../../net/ynl
+
+TEST_GEN_FILES += csum
+TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c))
+TEST_GEN_FILES += xdp_helper
+
+TEST_INCLUDES := $(wildcard py/*.py sh/*.sh)
+
+include ../../lib.mk
+
+include ../bpf.mk
diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/lib/csum.c
index 90eb06fefa59..27437590eeb5 100644
--- a/tools/testing/selftests/net/csum.c
+++ b/tools/testing/selftests/net/lib/csum.c
@@ -654,10 +654,16 @@ static int recv_verify_packet_ipv4(void *nh, int len)
 {
 	struct iphdr *iph = nh;
 	uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto;
+	uint16_t ip_len;
 
 	if (len < sizeof(*iph) || iph->protocol != proto)
 		return -1;
 
+	ip_len = ntohs(iph->tot_len);
+	if (ip_len > len || ip_len < sizeof(*iph))
+		return -1;
+
+	len = ip_len;
 	iph_addr_p = &iph->saddr;
 	if (proto == IPPROTO_TCP)
 		return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph));
@@ -669,20 +675,24 @@ static int recv_verify_packet_ipv6(void *nh, int len)
 {
 	struct ipv6hdr *ip6h = nh;
 	uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto;
+	uint16_t payload_len;
 
 	if (len < sizeof(*ip6h) || ip6h->nexthdr != proto)
 		return -1;
 
-	iph_addr_p = &ip6h->saddr;
+	payload_len = ntohs(ip6h->payload_len);
+	if (payload_len > len - sizeof(*ip6h))
+		return -1;
 
+	iph_addr_p = &ip6h->saddr;
 	if (proto == IPPROTO_TCP)
-		return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h));
+		return recv_verify_packet_tcp(ip6h + 1, payload_len);
 	else
-		return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h));
+		return recv_verify_packet_udp(ip6h + 1, payload_len);
 }
 
 /* return whether auxdata includes TP_STATUS_CSUM_VALID */
-static bool recv_verify_packet_csum(struct msghdr *msg)
+static uint32_t recv_get_packet_csum_status(struct msghdr *msg)
 {
 	struct tpacket_auxdata *aux = NULL;
 	struct cmsghdr *cm;
@@ -706,7 +716,7 @@ static bool recv_verify_packet_csum(struct msghdr *msg)
 	if (!aux)
 		error(1, 0, "cmsg: no auxdata");
 
-	return aux->tp_status & TP_STATUS_CSUM_VALID;
+	return aux->tp_status;
 }
 
 static int recv_packet(int fd)
@@ -716,6 +726,7 @@ static int recv_packet(int fd)
 	char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 	struct pkt *buf = (void *)_buf;
 	struct msghdr msg = {0};
+	uint32_t tp_status;
 	struct iovec iov;
 	int len, ret;
 
@@ -737,6 +748,17 @@ static int recv_packet(int fd)
 		if (len == -1)
 			error(1, errno, "recv p");
 
+		tp_status = recv_get_packet_csum_status(&msg);
+
+		/* GRO might coalesce randomized packets. Such GSO packets are
+		 * then reinitialized for csum offload (CHECKSUM_PARTIAL), with
+		 * a pseudo csum. Do not try to validate these checksums.
+		 */
+		if (tp_status & TP_STATUS_CSUMNOTREADY) {
+			fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n");
+			continue;
+		}
+
 		if (cfg_family == PF_INET6)
 			ret = recv_verify_packet_ipv6(buf, len);
 		else
@@ -753,7 +775,7 @@ static int recv_packet(int fd)
 		 * Do not fail if kernel does not validate a good csum:
 		 * Absence of validation does not imply invalid.
 		 */
-		if (recv_verify_packet_csum(&msg) && cfg_bad_csum) {
+		if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) {
 			fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n");
 			bad_validations++;
 		}
diff --git a/tools/testing/selftests/net/lib/ksft.h b/tools/testing/selftests/net/lib/ksft.h
new file mode 100644
index 000000000000..17dc34a612c6
--- /dev/null
+++ b/tools/testing/selftests/net/lib/ksft.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(__NET_KSFT_H__)
+#define __NET_KSFT_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static inline void ksft_ready(void)
+{
+	const char msg[7] = "ready\n";
+	char *env_str;
+	int fd;
+
+	env_str = getenv("KSFT_READY_FD");
+	if (env_str) {
+		fd = atoi(env_str);
+		if (!fd) {
+			fprintf(stderr, "invalid KSFT_READY_FD = '%s'\n",
+				env_str);
+			return;
+		}
+	} else {
+		fd = STDOUT_FILENO;
+	}
+
+	write(fd, msg, sizeof(msg));
+	if (fd != STDOUT_FILENO)
+		close(fd);
+}
+
+static inline void ksft_wait(void)
+{
+	char *env_str;
+	char byte;
+	int fd;
+
+	env_str = getenv("KSFT_WAIT_FD");
+	if (env_str) {
+		fd = atoi(env_str);
+		if (!fd) {
+			fprintf(stderr, "invalid KSFT_WAIT_FD = '%s'\n",
+				env_str);
+			return;
+		}
+	} else {
+		/* Not running in KSFT env, wait for input from STDIN instead */
+		fd = STDIN_FILENO;
+	}
+
+	read(fd, &byte, sizeof(byte));
+	if (fd != STDIN_FILENO)
+		close(fd);
+}
+
+#endif
diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py
new file mode 100644
index 000000000000..8697bd27dc30
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from .consts import KSRC
+from .ksft import *
+from .netns import NetNS, NetNSEnter
+from .nsim import *
+from .utils import *
+from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily, RtnlAddrFamily
+from .ynl import NetshaperFamily
diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py
new file mode 100644
index 000000000000..f518ce79d82c
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/consts.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+
+KSFT_DIR = (Path(__file__).parent / "../../..").resolve()
+KSRC = (Path(__file__).parent / "../../../../../..").resolve()
+
+KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name
diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
new file mode 100644
index 000000000000..61287c203b6e
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -0,0 +1,280 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import builtins
+import functools
+import inspect
+import signal
+import sys
+import time
+import traceback
+from .consts import KSFT_MAIN_NAME
+from .utils import global_defer_queue
+
+KSFT_RESULT = None
+KSFT_RESULT_ALL = True
+KSFT_DISRUPTIVE = True
+
+
+class KsftFailEx(Exception):
+    pass
+
+
+class KsftSkipEx(Exception):
+    pass
+
+
+class KsftXfailEx(Exception):
+    pass
+
+
+class KsftTerminate(KeyboardInterrupt):
+    pass
+
+
+def ksft_pr(*objs, **kwargs):
+    print("#", *objs, **kwargs)
+
+
+def _fail(*args):
+    global KSFT_RESULT
+    KSFT_RESULT = False
+
+    stack = inspect.stack()
+    started = False
+    for frame in reversed(stack[2:]):
+        # Start printing from the test case function
+        if not started:
+            if frame.function == 'ksft_run':
+                started = True
+            continue
+
+        ksft_pr("Check| At " + frame.filename + ", line " + str(frame.lineno) +
+                ", in " + frame.function + ":")
+        ksft_pr("Check|     " + frame.code_context[0].strip())
+    ksft_pr(*args)
+
+
+def ksft_eq(a, b, comment=""):
+    global KSFT_RESULT
+    if a != b:
+        _fail("Check failed", a, "!=", b, comment)
+
+
+def ksft_ne(a, b, comment=""):
+    global KSFT_RESULT
+    if a == b:
+        _fail("Check failed", a, "==", b, comment)
+
+
+def ksft_true(a, comment=""):
+    if not a:
+        _fail("Check failed", a, "does not eval to True", comment)
+
+
+def ksft_in(a, b, comment=""):
+    if a not in b:
+        _fail("Check failed", a, "not in", b, comment)
+
+
+def ksft_not_in(a, b, comment=""):
+    if a in b:
+        _fail("Check failed", a, "in", b, comment)
+
+
+def ksft_is(a, b, comment=""):
+    if a is not b:
+        _fail("Check failed", a, "is not", b, comment)
+
+
+def ksft_ge(a, b, comment=""):
+    if a < b:
+        _fail("Check failed", a, "<", b, comment)
+
+
+def ksft_lt(a, b, comment=""):
+    if a >= b:
+        _fail("Check failed", a, ">=", b, comment)
+
+
+class ksft_raises:
+    def __init__(self, expected_type):
+        self.exception = None
+        self.expected_type = expected_type
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is None:
+            _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised")
+        elif self.expected_type != exc_type:
+            _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}")
+        self.exception = exc_val
+        # Suppress the exception if its the expected one
+        return self.expected_type == exc_type
+
+
+def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""):
+    end = time.monotonic() + deadline
+    while True:
+        if cond():
+            return
+        if time.monotonic() > end:
+            _fail("Waiting for condition timed out", comment)
+            return
+        time.sleep(sleep)
+
+
+def ktap_result(ok, cnt=1, case="", comment=""):
+    global KSFT_RESULT_ALL
+    KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok
+
+    res = ""
+    if not ok:
+        res += "not "
+    res += "ok "
+    res += str(cnt) + " "
+    res += KSFT_MAIN_NAME
+    if case:
+        res += "." + str(case.__name__)
+    if comment:
+        res += " # " + comment
+    print(res)
+
+
+def ksft_flush_defer():
+    global KSFT_RESULT
+
+    i = 0
+    qlen_start = len(global_defer_queue)
+    while global_defer_queue:
+        i += 1
+        entry = global_defer_queue.pop()
+        try:
+            entry.exec_only()
+        except:
+            ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!")
+            tb = traceback.format_exc()
+            for line in tb.strip().split('\n'):
+                ksft_pr("Defer Exception|", line)
+            KSFT_RESULT = False
+
+
+def ksft_disruptive(func):
+    """
+    Decorator that marks the test as disruptive (e.g. the test
+    that can down the interface). Disruptive tests can be skipped
+    by passing DISRUPTIVE=False environment variable.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not KSFT_DISRUPTIVE:
+            raise KsftSkipEx(f"marked as disruptive")
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def ksft_setup(env):
+    """
+    Setup test framework global state from the environment.
+    """
+
+    def get_bool(env, name):
+        value = env.get(name, "").lower()
+        if value in ["yes", "true"]:
+            return True
+        if value in ["no", "false"]:
+            return False
+        try:
+            return bool(int(value))
+        except:
+            raise Exception(f"failed to parse {name}")
+
+    if "DISRUPTIVE" in env:
+        global KSFT_DISRUPTIVE
+        KSFT_DISRUPTIVE = get_bool(env, "DISRUPTIVE")
+
+    return env
+
+
+def _ksft_intr(signum, frame):
+    # ksft runner.sh sends 2 SIGTERMs in a row on a timeout
+    # if we don't ignore the second one it will stop us from handling cleanup
+    global term_cnt
+    term_cnt += 1
+    if term_cnt == 1:
+        raise KsftTerminate()
+    else:
+        ksft_pr(f"Ignoring SIGTERM (cnt: {term_cnt}), already exiting...")
+
+
+def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
+    cases = cases or []
+
+    if globs and case_pfx:
+        for key, value in globs.items():
+            if not callable(value):
+                continue
+            for prefix in case_pfx:
+                if key.startswith(prefix):
+                    cases.append(value)
+                    break
+
+    global term_cnt
+    term_cnt = 0
+    prev_sigterm = signal.signal(signal.SIGTERM, _ksft_intr)
+
+    totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0}
+
+    print("TAP version 13")
+    print("1.." + str(len(cases)))
+
+    global KSFT_RESULT
+    cnt = 0
+    stop = False
+    for case in cases:
+        KSFT_RESULT = True
+        cnt += 1
+        comment = ""
+        cnt_key = ""
+
+        try:
+            case(*args)
+        except KsftSkipEx as e:
+            comment = "SKIP " + str(e)
+            cnt_key = 'skip'
+        except KsftXfailEx as e:
+            comment = "XFAIL " + str(e)
+            cnt_key = 'xfail'
+        except BaseException as e:
+            stop |= isinstance(e, KeyboardInterrupt)
+            tb = traceback.format_exc()
+            for line in tb.strip().split('\n'):
+                ksft_pr("Exception|", line)
+            if stop:
+                ksft_pr(f"Stopping tests due to {type(e).__name__}.")
+            KSFT_RESULT = False
+            cnt_key = 'fail'
+
+        ksft_flush_defer()
+
+        if not cnt_key:
+            cnt_key = 'pass' if KSFT_RESULT else 'fail'
+
+        ktap_result(KSFT_RESULT, cnt, case, comment=comment)
+        totals[cnt_key] += 1
+
+        if stop:
+            break
+
+    signal.signal(signal.SIGTERM, prev_sigterm)
+
+    print(
+        f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0"
+    )
+
+
+def ksft_exit():
+    global KSFT_RESULT_ALL
+    sys.exit(0 if KSFT_RESULT_ALL else 1)
diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py
new file mode 100644
index 000000000000..8e9317044eef
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/netns.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from .utils import ip
+import ctypes
+import random
+import string
+
+libc = ctypes.cdll.LoadLibrary('libc.so.6')
+
+
+class NetNS:
+    def __init__(self, name=None):
+        if name:
+            self.name = name
+        else:
+            self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
+        ip('netns add ' + self.name)
+
+    def __del__(self):
+        if self.name:
+            ip('netns del ' + self.name)
+            self.name = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        self.__del__()
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return f"NetNS({self.name})"
+
+
+class NetNSEnter:
+    def __init__(self, ns_name):
+        self.ns_path = f"/run/netns/{ns_name}"
+
+    def __enter__(self):
+        self.saved = open("/proc/thread-self/ns/net")
+        with open(self.ns_path) as ns_file:
+            libc.setns(ns_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        libc.setns(self.saved.fileno(), 0)
+        self.saved.close()
diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py
new file mode 100644
index 000000000000..1a8cbe9acc48
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/nsim.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import errno
+import json
+import os
+import random
+import re
+import time
+from .utils import cmd, ip
+
+
+class NetdevSim:
+    """
+    Class for netdevsim netdevice and its attributes.
+    """
+
+    def __init__(self, nsimdev, port_index, ifname, ns=None):
+        # In case udev renamed the netdev to according to new schema,
+        # check if the name matches the port_index.
+        nsimnamere = re.compile(r"eni\d+np(\d+)")
+        match = nsimnamere.match(ifname)
+        if match and int(match.groups()[0]) != port_index + 1:
+            raise Exception("netdevice name mismatches the expected one")
+
+        self.ifname = ifname
+        self.nsimdev = nsimdev
+        self.port_index = port_index
+        self.ns = ns
+        self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index)
+        ret = ip("-j link show dev %s" % ifname, ns=ns)
+        self.dev = json.loads(ret.stdout)[0]
+        self.ifindex = self.dev["ifindex"]
+
+    def dfs_write(self, path, val):
+        self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val)
+
+
+class NetdevSimDev:
+    """
+    Class for netdevsim bus device and its attributes.
+    """
+    @staticmethod
+    def ctrl_write(path, val):
+        fullpath = os.path.join("/sys/bus/netdevsim/", path)
+        with open(fullpath, "w") as f:
+            f.write(val)
+
+    def dfs_write(self, path, val):
+        fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path)
+        with open(fullpath, "w") as f:
+            f.write(val)
+
+    def __init__(self, port_count=1, queue_count=1, ns=None):
+        # nsim will spawn in init_net, we'll set to actual ns once we switch it there
+        self.ns = None
+
+        if not os.path.exists("/sys/bus/netdevsim"):
+            cmd("modprobe netdevsim")
+
+        addr = random.randrange(1 << 15)
+        while True:
+            try:
+                self.ctrl_write("new_device", "%u %u %u" % (addr, port_count, queue_count))
+            except OSError as e:
+                if e.errno == errno.ENOSPC:
+                    addr = random.randrange(1 << 15)
+                    continue
+                raise e
+            break
+        self.addr = addr
+
+        # As probe of netdevsim device might happen from a workqueue,
+        # so wait here until all netdevs appear.
+        self.wait_for_netdevs(port_count)
+
+        if ns:
+            cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}")
+            self.ns = ns
+
+        cmd("udevadm settle", ns=self.ns)
+        ifnames = self.get_ifnames()
+
+        self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr
+
+        self.nsims = []
+        for port_index in range(port_count):
+            self.nsims.append(self._make_port(port_index, ifnames[port_index]))
+
+        self.removed = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        """
+        __exit__ gets called at the end of a "with" block.
+        """
+        self.remove()
+
+    def _make_port(self, port_index, ifname):
+        return NetdevSim(self, port_index, ifname, self.ns)
+
+    def get_ifnames(self):
+        ifnames = []
+        listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/",
+                      ns=self.ns).stdout.split()
+        for ifname in listdir:
+            ifnames.append(ifname)
+        ifnames.sort()
+        return ifnames
+
+    def wait_for_netdevs(self, port_count):
+        timeout = 5
+        timeout_start = time.time()
+
+        while True:
+            try:
+                ifnames = self.get_ifnames()
+            except FileNotFoundError as e:
+                ifnames = []
+            if len(ifnames) == port_count:
+                break
+            if time.time() < timeout_start + timeout:
+                continue
+            raise Exception("netdevices did not appear within timeout")
+
+    def remove(self):
+        if not self.removed:
+            self.ctrl_write("del_device", "%u" % (self.addr, ))
+            self.removed = True
+
+    def remove_nsim(self, nsim):
+        self.nsims.remove(nsim)
+        self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ),
+                        "%u" % (nsim.port_index, ))
diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
new file mode 100644
index 000000000000..34470d65d871
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import errno
+import json as _json
+import os
+import random
+import re
+import select
+import socket
+import subprocess
+import time
+
+
+class CmdExitFailure(Exception):
+    def __init__(self, msg, cmd_obj):
+        super().__init__(msg)
+        self.cmd = cmd_obj
+
+
+def fd_read_timeout(fd, timeout):
+    rlist, _, _ = select.select([fd], [], [], timeout)
+    if rlist:
+        return os.read(fd, 1024)
+    else:
+        raise TimeoutError("Timeout waiting for fd read")
+
+
+class cmd:
+    """
+    Execute a command on local or remote host.
+
+    Use bkg() instead to run a command in the background.
+    """
+    def __init__(self, comm, shell=True, fail=True, ns=None, background=False,
+                 host=None, timeout=5, ksft_wait=None):
+        if ns:
+            comm = f'ip netns exec {ns} ' + comm
+
+        self.stdout = None
+        self.stderr = None
+        self.ret = None
+        self.ksft_term_fd = None
+
+        self.comm = comm
+        if host:
+            self.proc = host.cmd(comm)
+        else:
+            # ksft_wait lets us wait for the background process to fully start,
+            # we pass an FD to the child process, and wait for it to write back.
+            # Similarly term_fd tells child it's time to exit.
+            pass_fds = ()
+            env = os.environ.copy()
+            if ksft_wait is not None:
+                rfd, ready_fd = os.pipe()
+                wait_fd, self.ksft_term_fd = os.pipe()
+                pass_fds = (ready_fd, wait_fd, )
+                env["KSFT_READY_FD"] = str(ready_fd)
+                env["KSFT_WAIT_FD"]  = str(wait_fd)
+
+            self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE,
+                                         stderr=subprocess.PIPE, pass_fds=pass_fds,
+                                         env=env)
+            if ksft_wait is not None:
+                os.close(ready_fd)
+                os.close(wait_fd)
+                msg = fd_read_timeout(rfd, ksft_wait)
+                os.close(rfd)
+                if not msg:
+                    raise Exception("Did not receive ready message")
+        if not background:
+            self.process(terminate=False, fail=fail, timeout=timeout)
+
+    def process(self, terminate=True, fail=None, timeout=5):
+        if fail is None:
+            fail = not terminate
+
+        if self.ksft_term_fd:
+            os.write(self.ksft_term_fd, b"1")
+        if terminate:
+            self.proc.terminate()
+        stdout, stderr = self.proc.communicate(timeout)
+        self.stdout = stdout.decode("utf-8")
+        self.stderr = stderr.decode("utf-8")
+        self.proc.stdout.close()
+        self.proc.stderr.close()
+        self.ret = self.proc.returncode
+
+        if self.proc.returncode != 0 and fail:
+            if len(stderr) > 0 and stderr[-1] == "\n":
+                stderr = stderr[:-1]
+            raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" %
+                                 (self.proc.args, stdout, stderr), self)
+
+
+class bkg(cmd):
+    """
+    Run a command in the background.
+
+    Examples usage:
+
+    Run a command on remote host, and wait for it to finish.
+    This is usually paired with wait_port_listen() to make sure
+    the command has initialized:
+
+        with bkg("socat ...", exit_wait=True, host=cfg.remote) as nc:
+            ...
+
+    Run a command and expect it to let us know that it's ready
+    by writing to a special file descriptor passed via KSFT_READY_FD.
+    Command will be terminated when we exit the context manager:
+
+        with bkg("my_binary", ksft_wait=5):
+    """
+    def __init__(self, comm, shell=True, fail=None, ns=None, host=None,
+                 exit_wait=False, ksft_wait=None):
+        super().__init__(comm, background=True,
+                         shell=shell, fail=fail, ns=ns, host=host,
+                         ksft_wait=ksft_wait)
+        self.terminate = not exit_wait and not ksft_wait
+        self.check_fail = fail
+
+        if shell and self.terminate:
+            print("# Warning: combining shell and terminate is risky!")
+            print("#          SIGTERM may not reach the child on zsh/ksh!")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        return self.process(terminate=self.terminate, fail=self.check_fail)
+
+
+global_defer_queue = []
+
+
+class defer:
+    def __init__(self, func, *args, **kwargs):
+        global global_defer_queue
+
+        if not callable(func):
+            raise Exception("defer created with un-callable object, did you call the function instead of passing its name?")
+
+        self.func = func
+        self.args = args
+        self.kwargs = kwargs
+
+        self._queue =  global_defer_queue
+        self._queue.append(self)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        return self.exec()
+
+    def exec_only(self):
+        self.func(*self.args, **self.kwargs)
+
+    def cancel(self):
+        self._queue.remove(self)
+
+    def exec(self):
+        self.cancel()
+        self.exec_only()
+
+
+def tool(name, args, json=None, ns=None, host=None):
+    cmd_str = name + ' '
+    if json:
+        cmd_str += '--json '
+    cmd_str += args
+    cmd_obj = cmd(cmd_str, ns=ns, host=host)
+    if json:
+        return _json.loads(cmd_obj.stdout)
+    return cmd_obj
+
+
+def ip(args, json=None, ns=None, host=None):
+    if ns:
+        args = f'-netns {ns} ' + args
+    return tool('ip', args, json=json, host=host)
+
+
+def ethtool(args, json=None, ns=None, host=None):
+    return tool('ethtool', args, json=json, ns=ns, host=host)
+
+
+def rand_port(type=socket.SOCK_STREAM):
+    """
+    Get a random unprivileged port.
+    """
+    with socket.socket(socket.AF_INET6, type) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
+def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5):
+    end = time.monotonic() + deadline
+
+    pattern = f":{port:04X} .* "
+    if proto == "tcp": # for tcp protocol additionally check the socket state
+        pattern += "0A"
+    pattern = re.compile(pattern)
+
+    while True:
+        data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout
+        for row in data.split("\n"):
+            if pattern.search(row):
+                return
+        if time.monotonic() > end:
+            raise Exception("Waiting for port listen timed out")
+        time.sleep(sleep)
diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py
new file mode 100644
index 000000000000..6329ae805abf
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/ynl.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+from .consts import KSRC, KSFT_DIR
+from .ksft import ksft_pr, ktap_result
+
+# Resolve paths
+try:
+    if (KSFT_DIR / "kselftest-list.txt").exists():
+        # Running in "installed" selftests
+        tools_full_path = KSFT_DIR
+        SPEC_PATH = KSFT_DIR / "net/lib/specs"
+
+        sys.path.append(tools_full_path.as_posix())
+        from net.lib.ynl.pyynl.lib import YnlFamily, NlError
+    else:
+        # Running in tree
+        tools_full_path = KSRC / "tools"
+        SPEC_PATH = KSRC / "Documentation/netlink/specs"
+
+        sys.path.append(tools_full_path.as_posix())
+        from net.ynl.pyynl.lib import YnlFamily, NlError
+except ModuleNotFoundError as e:
+    ksft_pr("Failed importing `ynl` library from kernel sources")
+    ksft_pr(str(e))
+    ktap_result(True, comment="SKIP")
+    sys.exit(4)
+
+#
+# Wrapper classes, loading the right specs
+# Set schema='' to avoid jsonschema validation, it's slow
+#
+class EthtoolFamily(YnlFamily):
+    def __init__(self, recv_size=0):
+        super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(),
+                         schema='', recv_size=recv_size)
+
+
+class RtnlFamily(YnlFamily):
+    def __init__(self, recv_size=0):
+        super().__init__((SPEC_PATH / Path('rt-link.yaml')).as_posix(),
+                         schema='', recv_size=recv_size)
+
+class RtnlAddrFamily(YnlFamily):
+    def __init__(self, recv_size=0):
+        super().__init__((SPEC_PATH / Path('rt-addr.yaml')).as_posix(),
+                         schema='', recv_size=recv_size)
+
+class NetdevFamily(YnlFamily):
+    def __init__(self, recv_size=0):
+        super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(),
+                         schema='', recv_size=recv_size)
+
+class NetshaperFamily(YnlFamily):
+    def __init__(self, recv_size=0):
+        super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(),
+                         schema='', recv_size=recv_size)
diff --git a/tools/testing/selftests/net/lib/sh/defer.sh b/tools/testing/selftests/net/lib/sh/defer.sh
new file mode 100644
index 000000000000..082f5d38321b
--- /dev/null
+++ b/tools/testing/selftests/net/lib/sh/defer.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# map[(scope_id,track,cleanup_id) -> cleanup_command]
+# track={d=default | p=priority}
+declare -A __DEFER__JOBS
+
+# map[(scope_id,track) -> # cleanup_commands]
+declare -A __DEFER__NJOBS
+
+# scope_id of the topmost scope.
+__DEFER__SCOPE_ID=0
+
+__defer__ndefer_key()
+{
+	local track=$1; shift
+
+	echo $__DEFER__SCOPE_ID,$track
+}
+
+__defer__defer_key()
+{
+	local track=$1; shift
+	local defer_ix=$1; shift
+
+	echo $__DEFER__SCOPE_ID,$track,$defer_ix
+}
+
+__defer__ndefers()
+{
+	local track=$1; shift
+
+	echo ${__DEFER__NJOBS[$(__defer__ndefer_key $track)]}
+}
+
+__defer__run()
+{
+	local track=$1; shift
+	local defer_ix=$1; shift
+	local defer_key=$(__defer__defer_key $track $defer_ix)
+
+	${__DEFER__JOBS[$defer_key]}
+	unset __DEFER__JOBS[$defer_key]
+}
+
+__defer__schedule()
+{
+	local track=$1; shift
+	local ndefers=$(__defer__ndefers $track)
+	local ndefers_key=$(__defer__ndefer_key $track)
+	local defer_key=$(__defer__defer_key $track $ndefers)
+	local defer="$@"
+
+	__DEFER__JOBS[$defer_key]="$defer"
+	__DEFER__NJOBS[$ndefers_key]=$((ndefers + 1))
+}
+
+__defer__scope_wipe()
+{
+	__DEFER__NJOBS[$(__defer__ndefer_key d)]=0
+	__DEFER__NJOBS[$(__defer__ndefer_key p)]=0
+}
+
+defer_scope_push()
+{
+	((__DEFER__SCOPE_ID++))
+	__defer__scope_wipe
+}
+
+defer_scope_pop()
+{
+	local defer_ix
+
+	for ((defer_ix=$(__defer__ndefers p); defer_ix-->0; )); do
+		__defer__run p $defer_ix
+	done
+
+	for ((defer_ix=$(__defer__ndefers d); defer_ix-->0; )); do
+		__defer__run d $defer_ix
+	done
+
+	__defer__scope_wipe
+	((__DEFER__SCOPE_ID--))
+}
+
+defer()
+{
+	__defer__schedule d "$@"
+}
+
+defer_prio()
+{
+	__defer__schedule p "$@"
+}
+
+defer_scopes_cleanup()
+{
+	while ((__DEFER__SCOPE_ID >= 0)); do
+		defer_scope_pop
+	done
+}
+
+in_defer_scope()
+{
+	local ret
+
+	defer_scope_push
+	"$@"
+	ret=$?
+	defer_scope_pop
+
+	return $ret
+}
+
+__defer__scope_wipe
diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c
index d988b2e0cee8..e73fab3edd9f 100644
--- a/tools/testing/selftests/net/xdp_dummy.c
+++ b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c
@@ -10,4 +10,10 @@ int xdp_dummy_prog(struct xdp_md *ctx)
 	return XDP_PASS;
 }
 
+SEC("xdp.frags")
+int xdp_dummy_prog_frags(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/net/lib/xdp_helper.c b/tools/testing/selftests/net/lib/xdp_helper.c
new file mode 100644
index 000000000000..eb025a9f35b1
--- /dev/null
+++ b/tools/testing/selftests/net/lib/xdp_helper.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <linux/if_xdp.h>
+#include <linux/if_link.h>
+#include <net/if.h>
+#include <inttypes.h>
+
+#include "ksft.h"
+
+#define UMEM_SZ (1U << 16)
+#define NUM_DESC (UMEM_SZ / 2048)
+
+
+static void print_usage(const char *bin)
+{
+	fprintf(stderr, "Usage: %s ifindex queue_id [-z]\n\n"
+		"where:\n\t-z: force zerocopy mode", bin);
+}
+
+/* this is a simple helper program that creates an XDP socket and does the
+ * minimum necessary to get bind() to succeed.
+ *
+ * this test program is not intended to actually process packets, but could be
+ * extended in the future if that is actually needed.
+ *
+ * it is used by queues.py to ensure the xsk netlinux attribute is set
+ * correctly.
+ */
+int main(int argc, char **argv)
+{
+	struct xdp_umem_reg umem_reg = { 0 };
+	struct sockaddr_xdp sxdp = { 0 };
+	int num_desc = NUM_DESC;
+	void *umem_area;
+	int retry = 0;
+	int ifindex;
+	int sock_fd;
+	int queue;
+
+	if (argc != 3 && argc != 4) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	sock_fd = socket(AF_XDP, SOCK_RAW, 0);
+	if (sock_fd < 0) {
+		perror("socket creation failed");
+		/* if the kernel doesn't support AF_XDP, let the test program
+		 * know with -1. All other error paths return 1.
+		 */
+		if (errno == EAFNOSUPPORT)
+			return -1;
+		return 1;
+	}
+
+	/* "Probing mode", just checking if AF_XDP sockets are supported */
+	if (!strcmp(argv[1], "-") && !strcmp(argv[2], "-")) {
+		printf("AF_XDP support detected\n");
+		close(sock_fd);
+		return 0;
+	}
+
+	ifindex = atoi(argv[1]);
+	queue = atoi(argv[2]);
+
+	umem_area = mmap(NULL, UMEM_SZ, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+			MAP_ANONYMOUS, -1, 0);
+	if (umem_area == MAP_FAILED) {
+		perror("mmap failed");
+		return 1;
+	}
+
+	umem_reg.addr = (uintptr_t)umem_area;
+	umem_reg.len = UMEM_SZ;
+	umem_reg.chunk_size = 2048;
+	umem_reg.headroom = 0;
+
+	setsockopt(sock_fd, SOL_XDP, XDP_UMEM_REG, &umem_reg,
+		   sizeof(umem_reg));
+	setsockopt(sock_fd, SOL_XDP, XDP_UMEM_FILL_RING, &num_desc,
+		   sizeof(num_desc));
+	setsockopt(sock_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &num_desc,
+		   sizeof(num_desc));
+	setsockopt(sock_fd, SOL_XDP, XDP_RX_RING, &num_desc, sizeof(num_desc));
+
+	sxdp.sxdp_family = AF_XDP;
+	sxdp.sxdp_ifindex = ifindex;
+	sxdp.sxdp_queue_id = queue;
+	sxdp.sxdp_flags = 0;
+
+	if (argc > 3) {
+		if (!strcmp(argv[3], "-z")) {
+			sxdp.sxdp_flags = XDP_ZEROCOPY;
+		} else {
+			print_usage(argv[0]);
+			return 1;
+		}
+	}
+
+	while (1) {
+		if (bind(sock_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0)
+			break;
+
+		if (errno == EBUSY && retry < 3) {
+			retry++;
+			sleep(1);
+			continue;
+		} else {
+			perror("bind failed");
+			munmap(umem_area, UMEM_SZ);
+			close(sock_fd);
+			return 1;
+		}
+	}
+
+	ksft_ready();
+	ksft_wait();
+
+	/* parent program will write a byte to stdin when its ready for this
+	 * helper to exit
+	 */
+
+	close(sock_fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/net/link_netns.py b/tools/testing/selftests/net/link_netns.py
new file mode 100755
index 000000000000..aab043c59d69
--- /dev/null
+++ b/tools/testing/selftests/net/link_netns.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+
+from lib.py import ksft_run, ksft_exit, ksft_true
+from lib.py import ip
+from lib.py import NetNS, NetNSEnter
+from lib.py import RtnlFamily
+
+
+LINK_NETNSID = 100
+
+
+def test_event() -> None:
+    with NetNS() as ns1, NetNS() as ns2:
+        with NetNSEnter(str(ns2)):
+            rtnl = RtnlFamily()
+
+        rtnl.ntf_subscribe("rtnlgrp-link")
+
+        ip(f"netns set {ns2} {LINK_NETNSID}", ns=str(ns1))
+        ip(f"link add netns {ns1} link-netnsid {LINK_NETNSID} dummy1 type dummy")
+        ip(f"link add netns {ns1} dummy2 type dummy", ns=str(ns2))
+
+        ip("link del dummy1", ns=str(ns1))
+        ip("link del dummy2", ns=str(ns1))
+
+        time.sleep(1)
+        rtnl.check_ntf()
+        ksft_true(rtnl.async_msg_queue.empty(),
+                  "Received unexpected link notification")
+
+
+def validate_link_netns(netns, ifname, link_netnsid) -> bool:
+    link_info = ip(f"-d link show dev {ifname}", ns=netns, json=True)
+    if not link_info:
+        return False
+    return link_info[0].get("link_netnsid") == link_netnsid
+
+
+def test_link_net() -> None:
+    configs = [
+        # type, common args, type args, fallback to dev_net
+        ("ipvlan", "link dummy1", "", False),
+        ("macsec", "link dummy1", "", False),
+        ("macvlan", "link dummy1", "", False),
+        ("macvtap", "link dummy1", "", False),
+        ("vlan", "link dummy1", "id 100", False),
+        ("gre", "", "local 192.0.2.1", True),
+        ("vti", "", "local 192.0.2.1", True),
+        ("ipip", "", "local 192.0.2.1", True),
+        ("ip6gre", "", "local 2001:db8::1", True),
+        ("ip6tnl", "", "local 2001:db8::1", True),
+        ("vti6", "", "local 2001:db8::1", True),
+        ("sit", "", "local 192.0.2.1", True),
+        ("xfrm", "", "if_id 1", True),
+    ]
+
+    with NetNS() as ns1, NetNS() as ns2, NetNS() as ns3:
+        net1, net2, net3 = str(ns1), str(ns2), str(ns3)
+
+        # prepare link netnsid  and a dummy link needed by certain drivers
+        ip(f"netns set {net3} {LINK_NETNSID}", ns=str(net2))
+        ip("link add dummy1 type dummy", ns=net3)
+
+        cases = [
+            # source, "netns", "link-netns", expected link-netns
+            (net3, None, None, None, None),
+            (net3, net2, None, None, LINK_NETNSID),
+            (net2, None, net3, LINK_NETNSID, LINK_NETNSID),
+            (net1, net2, net3, LINK_NETNSID, LINK_NETNSID),
+        ]
+
+        for src_net, netns, link_netns, exp1, exp2 in cases:
+            tgt_net = netns or src_net
+            for typ, cargs, targs, fb_dev_net in configs:
+                cmd = "link add"
+                if netns:
+                    cmd += f" netns {netns}"
+                if link_netns:
+                    cmd += f" link-netns {link_netns}"
+                cmd += f" {cargs} foo type {typ} {targs}"
+                ip(cmd, ns=src_net)
+                if fb_dev_net:
+                    ksft_true(validate_link_netns(tgt_net, "foo", exp1),
+                              f"{typ} link_netns validation failed")
+                else:
+                    ksft_true(validate_link_netns(tgt_net, "foo", exp2),
+                              f"{typ} link_netns validation failed")
+                ip(f"link del foo", ns=tgt_net)
+
+
+def test_peer_net() -> None:
+    types = [
+        "vxcan",
+        "netkit",
+        "veth",
+    ]
+
+    with NetNS() as ns1, NetNS() as ns2, NetNS() as ns3, NetNS() as ns4:
+        net1, net2, net3, net4 = str(ns1), str(ns2), str(ns3), str(ns4)
+
+        ip(f"netns set {net3} {LINK_NETNSID}", ns=str(net2))
+
+        cases = [
+            # source, "netns", "link-netns", "peer netns", expected
+            (net1, None, None, None, None),
+            (net1, net2, None, None, None),
+            (net2, None, net3, None, LINK_NETNSID),
+            (net1, net2, net3, None, None),
+            (net2, None, None, net3, LINK_NETNSID),
+            (net1, net2, None, net3, LINK_NETNSID),
+            (net2, None, net2, net3, LINK_NETNSID),
+            (net1, net2, net4, net3, LINK_NETNSID),
+        ]
+
+        for src_net, netns, link_netns, peer_netns, exp in cases:
+            tgt_net = netns or src_net
+            for typ in types:
+                cmd = "link add"
+                if netns:
+                    cmd += f" netns {netns}"
+                if link_netns:
+                    cmd += f" link-netns {link_netns}"
+                cmd += f" foo type {typ}"
+                if peer_netns:
+                    cmd += f" peer netns {peer_netns}"
+                ip(cmd, ns=src_net)
+                ksft_true(validate_link_netns(tgt_net, "foo", exp),
+                          f"{typ} peer_netns validation failed")
+                ip(f"link del foo", ns=tgt_net)
+
+
+def main() -> None:
+    ksft_run([test_event, test_link_net, test_peer_net])
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh
new file mode 100755
index 000000000000..881eb399798f
--- /dev/null
+++ b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Author: Justin Iurman <justin.iurman@uliege.be>
+#
+# WARNING
+# -------
+# This is just a dummy script that triggers encap cases with possible dst cache
+# reference loops in affected lwt users (see list below). Some cases are
+# pathological configurations for simplicity, others are valid. Overall, we
+# don't want this issue to happen, no matter what. In order to catch any
+# reference loops, kmemleak MUST be used. The results alone are always blindly
+# successful, don't rely on them. Note that the following tests may crash the
+# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is
+# not present.
+#
+# Affected lwt users so far (please update accordingly if needed):
+#  - ila_lwt (output only)
+#  - ioam6_iptunnel (output only)
+#  - rpl_iptunnel (both input and output)
+#  - seg6_iptunnel (both input and output)
+
+source lib.sh
+
+check_compatibility()
+{
+	setup_ns tmp_node &>/dev/null
+	if [ $? != 0 ]; then
+		echo "SKIP: Cannot create netns."
+		exit $ksft_skip
+	fi
+
+	ip link add name veth0 netns $tmp_node type veth \
+		peer name veth1 netns $tmp_node &>/dev/null
+	local ret=$?
+
+	ip -netns $tmp_node link set veth0 up &>/dev/null
+	ret=$((ret + $?))
+
+	ip -netns $tmp_node link set veth1 up &>/dev/null
+	ret=$((ret + $?))
+
+	if [ $ret != 0 ]; then
+		echo "SKIP: Cannot configure links."
+		cleanup_ns $tmp_node
+		exit $ksft_skip
+	fi
+
+	lsmod 2>/dev/null | grep -q "ila"
+	ila_lsmod=$?
+	[ $ila_lsmod != 0 ] && modprobe ila &>/dev/null
+
+	ip -netns $tmp_node route add 2001:db8:1::/64 \
+		encap ila 1:2:3:4 csum-mode no-action ident-type luid \
+			hook-type output \
+		dev veth0 &>/dev/null
+
+	ip -netns $tmp_node route add 2001:db8:2::/64 \
+		encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \
+		dev veth0 &>/dev/null
+
+	ip -netns $tmp_node route add 2001:db8:3::/64 \
+		encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null
+
+	ip -netns $tmp_node route add 2001:db8:4::/64 \
+		encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null
+
+	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ila"
+	skip_ila=$?
+
+	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ioam6"
+	skip_ioam6=$?
+
+	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap rpl"
+	skip_rpl=$?
+
+	ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap seg6"
+	skip_seg6=$?
+
+	cleanup_ns $tmp_node
+}
+
+setup()
+{
+	setup_ns alpha beta gamma &>/dev/null
+
+	ip link add name veth-alpha netns $alpha type veth \
+		peer name veth-betaL netns $beta &>/dev/null
+
+	ip link add name veth-betaR netns $beta type veth \
+		peer name veth-gamma netns $gamma &>/dev/null
+
+	ip -netns $alpha link set veth-alpha name veth0 &>/dev/null
+	ip -netns $beta link set veth-betaL name veth0 &>/dev/null
+	ip -netns $beta link set veth-betaR name veth1 &>/dev/null
+	ip -netns $gamma link set veth-gamma name veth0 &>/dev/null
+
+	ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null
+	ip -netns $alpha link set veth0 up &>/dev/null
+	ip -netns $alpha link set lo up &>/dev/null
+	ip -netns $alpha route add 2001:db8:2::/64 \
+		via 2001:db8:1::1 dev veth0 &>/dev/null
+
+	ip -netns $beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null
+	ip -netns $beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null
+	ip -netns $beta link set veth0 up &>/dev/null
+	ip -netns $beta link set veth1 up &>/dev/null
+	ip -netns $beta link set lo up &>/dev/null
+	ip -netns $beta route del 2001:db8:2::/64
+	ip -netns $beta route add 2001:db8:2::/64 dev veth1
+	ip netns exec $beta \
+		sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null
+
+	ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null
+	ip -netns $gamma link set veth0 up &>/dev/null
+	ip -netns $gamma link set lo up &>/dev/null
+	ip -netns $gamma route add 2001:db8:1::/64 \
+		via 2001:db8:2::1 dev veth0 &>/dev/null
+
+	sleep 1
+
+	ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null
+	if [ $? != 0 ]; then
+		echo "SKIP: Setup failed."
+		exit $ksft_skip
+	fi
+
+	sleep 1
+}
+
+cleanup()
+{
+	cleanup_ns $alpha $beta $gamma
+	[ $ila_lsmod != 0 ] && modprobe -r ila &>/dev/null
+}
+
+run_ila()
+{
+	if [ $skip_ila != 0 ]; then
+		echo "SKIP: ila (output)"
+		return
+	fi
+
+	ip -netns $beta route del 2001:db8:2::/64
+	ip -netns $beta route add 2001:db8:2:0:0:0:0:2/128 \
+		encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \
+			hook-type output \
+		dev veth1 &>/dev/null
+	sleep 1
+
+	echo "TEST: ila (output)"
+	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+
+	ip -netns $beta route del 2001:db8:2:0:0:0:0:2/128
+	ip -netns $beta route add 2001:db8:2::/64 dev veth1
+	sleep 1
+}
+
+run_ioam6()
+{
+	if [ $skip_ioam6 != 0 ]; then
+		echo "SKIP: ioam6 (output)"
+		return
+	fi
+
+	ip -netns $beta route change 2001:db8:2::/64 \
+		encap ioam6 trace prealloc type 0x800000 ns 1 size 4 \
+		dev veth1 &>/dev/null
+	sleep 1
+
+	echo "TEST: ioam6 (output)"
+	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+}
+
+run_rpl()
+{
+	if [ $skip_rpl != 0 ]; then
+		echo "SKIP: rpl (input)"
+		echo "SKIP: rpl (output)"
+		return
+	fi
+
+	ip -netns $beta route change 2001:db8:2::/64 \
+		encap rpl segs 2001:db8:2::2 \
+		dev veth1 &>/dev/null
+	sleep 1
+
+	echo "TEST: rpl (input)"
+	ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+
+	echo "TEST: rpl (output)"
+	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+}
+
+run_seg6()
+{
+	if [ $skip_seg6 != 0 ]; then
+		echo "SKIP: seg6 (input)"
+		echo "SKIP: seg6 (output)"
+		return
+	fi
+
+	ip -netns $beta route change 2001:db8:2::/64 \
+		encap seg6 mode inline segs 2001:db8:2::2 \
+		dev veth1 &>/dev/null
+	sleep 1
+
+	echo "TEST: seg6 (input)"
+	ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+
+	echo "TEST: seg6 (output)"
+	ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null
+	sleep 1
+}
+
+run()
+{
+	run_ila
+	run_ioam6
+	run_rpl
+	run_seg6
+}
+
+if [ "$(id -u)" -ne 0 ]; then
+	echo "SKIP: Need root privileges."
+	exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool."
+	exit $ksft_skip
+fi
+
+check_compatibility
+
+trap cleanup EXIT
+
+setup
+run
+
+exit $ksft_pass
diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore
index 49daae73c41e..833279fb34e2 100644
--- a/tools/testing/selftests/net/mptcp/.gitignore
+++ b/tools/testing/selftests/net/mptcp/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 mptcp_connect
+mptcp_diag
 mptcp_inq
 mptcp_sockopt
 pm_nl_ctl
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 7b936a926859..e47788bfa671 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -2,15 +2,17 @@
 
 top_srcdir = ../../../../..
 
-CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
+CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
 
 TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
 	      simult_flows.sh mptcp_sockopt.sh userspace_pm.sh
 
-TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq
+TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq mptcp_diag
 
 TEST_FILES := mptcp_lib.sh settings
 
+TEST_INCLUDES := ../lib.sh $(wildcard ../lib/sh/*.sh)
+
 EXTRA_CLEAN := *.pcap
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
index bc97ab33a00e..7a3cb4c09e45 100755
--- a/tools/testing/selftests/net/mptcp/diag.sh
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -200,6 +200,114 @@ chk_msk_cestab()
 		 "${expected}" "${msg}" ""
 }
 
+chk_dump_one()
+{
+	local ss_token
+	local token
+	local msg
+
+	ss_token="$(ss -inmHMN $ns |
+		    mptcp_lib_get_info_value "token" "token")"
+
+	token="$(ip netns exec $ns ./mptcp_diag -t $ss_token |\
+		 awk -F':[ \t]+' '/^token/ {print $2}')"
+
+	msg="....chk dump_one"
+
+	mptcp_lib_print_title "$msg"
+	if [ -n "$ss_token" ] && [ "$ss_token" = "$token" ]; then
+		mptcp_lib_pr_ok
+		mptcp_lib_result_pass "${msg}"
+	else
+		mptcp_lib_pr_fail "expected $ss_token found $token"
+		mptcp_lib_result_fail "${msg}"
+		ret=${KSFT_FAIL}
+	fi
+}
+
+chk_dump_subflow()
+{
+	local inet_diag_token
+	local subflow_line
+	local ss_output
+	local ss_token
+	local msg
+
+	ss_output=$(ss -tniN $ns)
+
+	subflow_line=$(echo "$ss_output" | \
+		       grep -m1 -Eo '[0-9.]+:[0-9].+ +[0-9.]+:[0-9.]+')
+
+	ss_token=$(echo "$ss_output" | grep -m1 -Eo 'token:[^ ]+')
+
+	inet_diag_token=$(ip netns exec $ns ./mptcp_diag -s "$subflow_line" | \
+			  grep -Eo 'token:[^ ]+')
+
+	msg="....chk dump_subflow"
+
+	mptcp_lib_print_title "$msg"
+	if [ -n "$ss_token" ] && [ "$ss_token" = "$inet_diag_token" ]; then
+		mptcp_lib_pr_ok
+		mptcp_lib_result_pass "${msg}"
+	else
+		mptcp_lib_pr_fail "expected $ss_token found $inet_diag_token"
+		mptcp_lib_result_fail "${msg}"
+		ret=${KSFT_FAIL}
+	fi
+}
+
+msk_info_get_value()
+{
+	local port="${1}"
+	local info="${2}"
+
+	ss -N "${ns}" -inHM dport "${port}" | \
+		mptcp_lib_get_info_value "${info}" "${info}"
+}
+
+chk_msk_info()
+{
+	local port="${1}"
+	local info="${2}"
+	local cnt="${3}"
+	local msg="....chk ${info}"
+	local delta_ms=250  # half what we waited before, just to be sure
+	local now
+
+	now=$(msk_info_get_value "${port}" "${info}")
+
+	mptcp_lib_print_title "${msg}"
+	if { [ -z "${cnt}" ] || [ -z "${now}" ]; } &&
+	   ! mptcp_lib_expect_all_features; then
+		mptcp_lib_pr_skip "Feature probably not supported"
+		mptcp_lib_result_skip "${msg}"
+	elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then
+		mptcp_lib_pr_ok
+		mptcp_lib_result_pass "${msg}"
+	else
+		mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \
+				  "expected at least ${delta_ms}ms"
+		mptcp_lib_result_fail "${msg}"
+		ret=${KSFT_FAIL}
+	fi
+}
+
+chk_last_time_info()
+{
+	local port="${1}"
+	local data_sent data_recv ack_recv
+
+	data_sent=$(msk_info_get_value "${port}" "last_data_sent")
+	data_recv=$(msk_info_get_value "${port}" "last_data_recv")
+	ack_recv=$(msk_info_get_value "${port}" "last_ack_recv")
+
+	sleep 0.5  # wait to check after if the timestamps difference
+
+	chk_msk_info "${port}" "last_data_sent" "${data_sent}"
+	chk_msk_info "${port}" "last_data_recv" "${data_recv}"
+	chk_msk_info "${port}" "last_ack_recv" "${ack_recv}"
+}
+
 wait_connected()
 {
 	local listener_ns="${1}"
@@ -232,11 +340,14 @@ echo "b" | \
 			./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} -w 20 \
 				127.0.0.1 >/dev/null &
 wait_connected $ns 10000
-chk_msk_nr 2 "after MPC handshake "
+chk_msk_nr 2 "after MPC handshake"
+chk_last_time_info 10000
 chk_msk_remote_key_nr 2 "....chk remote_key"
 chk_msk_fallback_nr 0 "....chk no fallback"
 chk_msk_inuse 2
 chk_msk_cestab 2
+chk_dump_one
+chk_dump_subflow
 flush_pids
 
 chk_msk_inuse 0 "2->0"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index d2043ec3bf6d..ac1349c4b9e5 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -25,6 +25,8 @@
 #include <sys/types.h>
 #include <sys/mman.h>
 
+#include <arpa/inet.h>
+
 #include <netdb.h>
 #include <netinet/in.h>
 
@@ -178,13 +180,26 @@ static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
 }
 
 static void xgetaddrinfo(const char *node, const char *service,
-			 const struct addrinfo *hints,
+			 struct addrinfo *hints,
 			 struct addrinfo **res)
 {
+again:
 	int err = getaddrinfo(node, service, hints, res);
 
 	if (err) {
-		const char *errstr = getxinfo_strerr(err);
+		const char *errstr;
+
+		/* glibc starts to support MPTCP since v2.42.
+		 * For older versions, use IPPROTO_TCP to resolve,
+		 * and use TCP/MPTCP to create socket.
+		 * Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82
+		 */
+		if (err == EAI_SOCKTYPE) {
+			hints->ai_protocol = IPPROTO_TCP;
+			goto again;
+		}
+
+		errstr = getxinfo_strerr(err);
 
 		fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
 			node ? node : "", service ? service : "", errstr);
@@ -290,7 +305,7 @@ static int sock_listen_mptcp(const char * const listenaddr,
 {
 	int sock = -1;
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 		.ai_flags = AI_PASSIVE | AI_NUMERICHOST
 	};
@@ -354,7 +369,7 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 			      int infd, struct wstate *winfo)
 {
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 	};
 	struct addrinfo *a, *addr;
@@ -1115,11 +1130,11 @@ again:
 		return 1;
 	}
 
-	if (--cfg_repeat > 0) {
-		if (cfg_input)
-			close(fd);
+	if (cfg_input)
+		close(fd);
+
+	if (--cfg_repeat > 0)
 		goto again;
-	}
 
 	return 0;
 }
@@ -1211,23 +1226,42 @@ static void parse_setsock_options(const char *name)
 	exit(1);
 }
 
-void xdisconnect(int fd, int addrlen)
+void xdisconnect(int fd)
 {
-	struct sockaddr_storage empty;
+	socklen_t addrlen = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage addr, empty;
 	int msec_sleep = 10;
-	int queued = 1;
-	int i;
+	void *raw_addr;
+	int i, cmdlen;
+	char cmd[128];
+
+	/* get the local address and convert it to string */
+	if (getsockname(fd, (struct sockaddr *)&addr, &addrlen) < 0)
+		xerror("getsockname");
+
+	if (addr.ss_family == AF_INET)
+		raw_addr = &(((struct sockaddr_in *)&addr)->sin_addr);
+	else if (addr.ss_family == AF_INET6)
+		raw_addr = &(((struct sockaddr_in6 *)&addr)->sin6_addr);
+	else
+		xerror("bad family");
+
+	strcpy(cmd, "ss -M | grep -q ");
+	cmdlen = strlen(cmd);
+	if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen],
+		       sizeof(cmd) - cmdlen))
+		xerror("inet_ntop");
 
 	shutdown(fd, SHUT_WR);
 
-	/* while until the pending data is completely flushed, the later
+	/*
+	 * wait until the pending data is completely flushed and all
+	 * the MPTCP sockets reached the closed status.
 	 * disconnect will bypass/ignore/drop any pending data.
 	 */
 	for (i = 0; ; i += msec_sleep) {
-		if (ioctl(fd, SIOCOUTQ, &queued) < 0)
-			xerror("can't query out socket queue: %d", errno);
-
-		if (!queued)
+		/* closed socket are not listed by 'ss' */
+		if (system(cmd) != 0)
 			break;
 
 		if (i > poll_timeout)
@@ -1249,7 +1283,7 @@ int main_loop(void)
 
 	if (cfg_input && cfg_sockopt_types.mptfo) {
 		fd_in = open(cfg_input, O_RDONLY);
-		if (fd < 0)
+		if (fd_in < 0)
 			xerror("can't open %s:%d", cfg_input, errno);
 	}
 
@@ -1272,18 +1306,18 @@ again:
 
 	if (cfg_input && !cfg_sockopt_types.mptfo) {
 		fd_in = open(cfg_input, O_RDONLY);
-		if (fd < 0)
+		if (fd_in < 0)
 			xerror("can't open %s:%d", cfg_input, errno);
 	}
 
 	ret = copyfd_io(fd_in, fd, 1, 0, &winfo);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (cfg_truncate > 0) {
-		xdisconnect(fd, peer->ai_addrlen);
+		shutdown(fd, SHUT_WR);
 	} else if (--cfg_repeat > 0) {
-		xdisconnect(fd, peer->ai_addrlen);
+		xdisconnect(fd);
 
 		/* the socket could be unblocking at this point, we need the
 		 * connect to be blocking
@@ -1299,7 +1333,10 @@ again:
 		close(fd);
 	}
 
-	return 0;
+out:
+	if (cfg_input)
+		close(fd_in);
+	return ret;
 }
 
 int parse_proto(const char *proto)
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 4131f3263a48..5e3c56253274 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -137,7 +137,7 @@ TEST_GROUP=""
 #shellcheck disable=SC2317
 cleanup()
 {
-	rm -f "$cin_disconnect" "$cout_disconnect"
+	rm -f "$cin_disconnect"
 	rm -f "$cin" "$cout"
 	rm -f "$sin" "$sout"
 	rm -f "$capout"
@@ -147,7 +147,7 @@ cleanup()
 
 mptcp_lib_check_mptcp
 mptcp_lib_check_kallsyms
-mptcp_lib_check_tools ip
+mptcp_lib_check_tools ip tc
 
 sin=$(mktemp)
 sout=$(mktemp)
@@ -155,7 +155,6 @@ cin=$(mktemp)
 cout=$(mktemp)
 capout=$(mktemp)
 cin_disconnect="$cin".disconnect
-cout_disconnect="$cout".disconnect
 trap cleanup EXIT
 
 mptcp_lib_ns_init ns1 ns2 ns3 ns4
@@ -259,6 +258,15 @@ check_mptcp_disabled()
 	mptcp_lib_ns_init disabled_ns
 
 	print_larger_title "New MPTCP socket can be blocked via sysctl"
+
+	# mainly to cover more code
+	if ! ip netns exec ${disabled_ns} sysctl net.mptcp >/dev/null; then
+		mptcp_lib_pr_fail "not able to list net.mptcp sysctl knobs"
+		mptcp_lib_result_fail "not able to list net.mptcp sysctl knobs"
+		ret=${KSFT_FAIL}
+		return 1
+	fi
+
 	# net.mptcp.enabled should be enabled by default
 	if [ "$(ip netns exec ${disabled_ns} sysctl net.mptcp.enabled | awk '{ print $3 }')" -ne 1 ]; then
 		mptcp_lib_pr_fail "net.mptcp.enabled sysctl is not 1 by default"
@@ -345,9 +353,11 @@ do_transfer()
 
 	local addr_port
 	addr_port=$(printf "%s:%d" ${connect_addr} ${port})
-	local result_msg
-	result_msg="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})"
-	mptcp_lib_print_title "${result_msg}"
+	local pretty_title
+	pretty_title="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})"
+	mptcp_lib_print_title "${pretty_title}"
+
+	local tap_title="${connector_ns:0:3} ${cl_proto} -> ${listener_ns:0:3} (${addr_port}) ${srv_proto}"
 
 	if $capture; then
 		local capuser
@@ -431,20 +441,15 @@ do_transfer()
 
 	local duration
 	duration=$((stop-start))
-	result_msg+=" # time=${duration}ms"
 	printf "(duration %05sms) " "${duration}"
 	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
 		mptcp_lib_pr_fail "client exit code $retc, server $rets"
-		echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port"
-		cat /tmp/${listener_ns}.out
-		echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
-		[ ${listener_ns} != ${connector_ns} ] && cat /tmp/${connector_ns}.out
+		mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \
+			"/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out"
 
 		echo
 		cat "$capout"
-		mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}"
+		mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}"
 		return 1
 	fi
 
@@ -544,12 +549,12 @@ do_transfer()
 
 	if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then
 		mptcp_lib_pr_ok "${extra:1}"
-		mptcp_lib_result_pass "${TEST_GROUP}: ${result_msg}"
+		mptcp_lib_result_pass "${TEST_GROUP}: ${tap_title}"
 	else
 		if [ -n "${extra}" ]; then
 			mptcp_lib_print_warn "${extra:1}"
 		fi
-		mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}"
+		mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}"
 	fi
 
 	cat "$capout"
@@ -577,7 +582,7 @@ make_file()
 	mptcp_lib_make_file $name 1024 $ksize
 	dd if=/dev/urandom conv=notrunc of="$name" oflag=append bs=1 count=$rem 2> /dev/null
 
-	echo "Created $name (size $(du -b "$name")) containing data sent by $who"
+	echo "Created $name (size $(stat -c "%s" "$name") B) containing data sent by $who"
 }
 
 run_tests_lo()
@@ -848,6 +853,8 @@ stop_if_error()
 make_file "$cin" "client"
 make_file "$sin" "server"
 
+mptcp_lib_subtests_last_ts_reset
+
 check_mptcp_disabled
 
 stop_if_error "The kernel configuration is not valid for MPTCP"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c
new file mode 100644
index 000000000000..e084796e804d
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025, Kylin Software */
+
+#include <linux/sock_diag.h>
+#include <linux/rtnetlink.h>
+#include <linux/inet_diag.h>
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/tcp.h>
+#include <arpa/inet.h>
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+#ifndef IPPROTO_MPTCP
+#define IPPROTO_MPTCP 262
+#endif
+
+#define parse_rtattr_nested(tb, max, rta) \
+	(parse_rtattr_flags((tb), (max), RTA_DATA(rta), RTA_PAYLOAD(rta), \
+			    NLA_F_NESTED))
+
+struct params {
+	__u32 target_token;
+	char subflow_addrs[1024];
+};
+
+struct mptcp_info {
+	__u8	mptcpi_subflows;
+	__u8	mptcpi_add_addr_signal;
+	__u8	mptcpi_add_addr_accepted;
+	__u8	mptcpi_subflows_max;
+	__u8	mptcpi_add_addr_signal_max;
+	__u8	mptcpi_add_addr_accepted_max;
+	__u32	mptcpi_flags;
+	__u32	mptcpi_token;
+	__u64	mptcpi_write_seq;
+	__u64	mptcpi_snd_una;
+	__u64	mptcpi_rcv_nxt;
+	__u8	mptcpi_local_addr_used;
+	__u8	mptcpi_local_addr_max;
+	__u8	mptcpi_csum_enabled;
+	__u32	mptcpi_retransmits;
+	__u64	mptcpi_bytes_retrans;
+	__u64	mptcpi_bytes_sent;
+	__u64	mptcpi_bytes_received;
+	__u64	mptcpi_bytes_acked;
+	__u8	mptcpi_subflows_total;
+	__u8	reserved[3];
+	__u32	mptcpi_last_data_sent;
+	__u32	mptcpi_last_data_recv;
+	__u32	mptcpi_last_ack_recv;
+};
+
+enum {
+	MPTCP_SUBFLOW_ATTR_UNSPEC,
+	MPTCP_SUBFLOW_ATTR_TOKEN_REM,
+	MPTCP_SUBFLOW_ATTR_TOKEN_LOC,
+	MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+	MPTCP_SUBFLOW_ATTR_MAP_SEQ,
+	MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+	MPTCP_SUBFLOW_ATTR_SSN_OFFSET,
+	MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+	MPTCP_SUBFLOW_ATTR_FLAGS,
+	MPTCP_SUBFLOW_ATTR_ID_REM,
+	MPTCP_SUBFLOW_ATTR_ID_LOC,
+	MPTCP_SUBFLOW_ATTR_PAD,
+
+	__MPTCP_SUBFLOW_ATTR_MAX
+};
+
+#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1)
+
+#define MPTCP_SUBFLOW_FLAG_MCAP_REM		_BITUL(0)
+#define MPTCP_SUBFLOW_FLAG_MCAP_LOC		_BITUL(1)
+#define MPTCP_SUBFLOW_FLAG_JOIN_REM		_BITUL(2)
+#define MPTCP_SUBFLOW_FLAG_JOIN_LOC		_BITUL(3)
+#define MPTCP_SUBFLOW_FLAG_BKUP_REM		_BITUL(4)
+#define MPTCP_SUBFLOW_FLAG_BKUP_LOC		_BITUL(5)
+#define MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED	_BITUL(6)
+#define MPTCP_SUBFLOW_FLAG_CONNECTED		_BITUL(7)
+#define MPTCP_SUBFLOW_FLAG_MAPVALID		_BITUL(8)
+
+#define rta_getattr(type, value)		(*(type *)RTA_DATA(value))
+
+static void die_perror(const char *msg)
+{
+	perror(msg);
+	exit(1);
+}
+
+static void die_usage(int r)
+{
+	fprintf(stderr, "Usage:\n"
+			"mptcp_diag -t <token>\n"
+			"mptcp_diag -s \"<saddr>:<sport> <daddr>:<dport>\"\n");
+	exit(r);
+}
+
+static void send_query(int fd, struct inet_diag_req_v2 *r, __u32 proto)
+{
+	struct sockaddr_nl nladdr = {
+		.nl_family = AF_NETLINK
+	};
+	struct {
+		struct nlmsghdr nlh;
+		struct inet_diag_req_v2 r;
+	} req = {
+		.nlh = {
+			.nlmsg_len = sizeof(req),
+			.nlmsg_type = SOCK_DIAG_BY_FAMILY,
+			.nlmsg_flags = NLM_F_REQUEST
+		},
+		.r = *r
+	};
+	struct rtattr rta_proto;
+	struct iovec iov[6];
+	int iovlen = 0;
+
+	iov[iovlen++] = (struct iovec) {
+		.iov_base = &req,
+		.iov_len = sizeof(req)
+	};
+
+	if (proto == IPPROTO_MPTCP) {
+		rta_proto.rta_type = INET_DIAG_REQ_PROTOCOL;
+		rta_proto.rta_len = RTA_LENGTH(sizeof(proto));
+
+		iov[iovlen++] = (struct iovec){ &rta_proto, sizeof(rta_proto)};
+		iov[iovlen++] = (struct iovec){ &proto, sizeof(proto)};
+		req.nlh.nlmsg_len += RTA_LENGTH(sizeof(proto));
+	}
+
+	struct msghdr msg = {
+		.msg_name = &nladdr,
+		.msg_namelen = sizeof(nladdr),
+		.msg_iov = iov,
+		.msg_iovlen = iovlen
+	};
+
+	for (;;) {
+		if (sendmsg(fd, &msg, 0) < 0) {
+			if (errno == EINTR)
+				continue;
+			die_perror("sendmsg");
+		}
+		break;
+	}
+}
+
+static void parse_rtattr_flags(struct rtattr *tb[], int max, struct rtattr *rta,
+			       int len, unsigned short flags)
+{
+	unsigned short type;
+
+	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
+	while (RTA_OK(rta, len)) {
+		type = rta->rta_type & ~flags;
+		if (type <= max && !tb[type])
+			tb[type] = rta;
+		rta = RTA_NEXT(rta, len);
+	}
+}
+
+static void print_info_msg(struct mptcp_info *info)
+{
+	printf("Token & Flags\n");
+	printf("token:        %x\n", info->mptcpi_token);
+	printf("flags:        %x\n", info->mptcpi_flags);
+	printf("csum_enabled: %u\n", info->mptcpi_csum_enabled);
+
+	printf("\nBasic Info\n");
+	printf("subflows:              %u\n", info->mptcpi_subflows);
+	printf("subflows_max:          %u\n", info->mptcpi_subflows_max);
+	printf("subflows_total:        %u\n", info->mptcpi_subflows_total);
+	printf("local_addr_used:       %u\n", info->mptcpi_local_addr_used);
+	printf("local_addr_max:        %u\n", info->mptcpi_local_addr_max);
+	printf("add_addr_signal:       %u\n", info->mptcpi_add_addr_signal);
+	printf("add_addr_accepted:     %u\n", info->mptcpi_add_addr_accepted);
+	printf("add_addr_signal_max:   %u\n", info->mptcpi_add_addr_signal_max);
+	printf("add_addr_accepted_max: %u\n", info->mptcpi_add_addr_accepted_max);
+
+	printf("\nTransmission Info\n");
+	printf("write_seq:        %llu\n", info->mptcpi_write_seq);
+	printf("snd_una:          %llu\n", info->mptcpi_snd_una);
+	printf("rcv_nxt:          %llu\n", info->mptcpi_rcv_nxt);
+	printf("last_data_sent:   %u\n", info->mptcpi_last_data_sent);
+	printf("last_data_recv:   %u\n", info->mptcpi_last_data_recv);
+	printf("last_ack_recv:    %u\n", info->mptcpi_last_ack_recv);
+	printf("retransmits:      %u\n", info->mptcpi_retransmits);
+	printf("retransmit bytes: %llu\n", info->mptcpi_bytes_retrans);
+	printf("bytes_sent:       %llu\n", info->mptcpi_bytes_sent);
+	printf("bytes_received:   %llu\n", info->mptcpi_bytes_received);
+	printf("bytes_acked:      %llu\n", info->mptcpi_bytes_acked);
+}
+
+/*
+ * 'print_subflow_info' is from 'mptcp_subflow_info'
+ * which is a function in 'misc/ss.c' of iproute2.
+ */
+static void print_subflow_info(struct rtattr *tb[])
+{
+	u_int32_t flags = 0;
+
+	printf("It's a mptcp subflow, the subflow info:\n");
+	if (tb[MPTCP_SUBFLOW_ATTR_FLAGS]) {
+		char caps[32 + 1] = { 0 }, *cap = &caps[0];
+
+		flags = rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_FLAGS]);
+
+		if (flags & MPTCP_SUBFLOW_FLAG_MCAP_REM)
+			*cap++ = 'M';
+		if (flags & MPTCP_SUBFLOW_FLAG_MCAP_LOC)
+			*cap++ = 'm';
+		if (flags & MPTCP_SUBFLOW_FLAG_JOIN_REM)
+			*cap++ = 'J';
+		if (flags & MPTCP_SUBFLOW_FLAG_JOIN_LOC)
+			*cap++ = 'j';
+		if (flags & MPTCP_SUBFLOW_FLAG_BKUP_REM)
+			*cap++ = 'B';
+		if (flags & MPTCP_SUBFLOW_FLAG_BKUP_LOC)
+			*cap++ = 'b';
+		if (flags & MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED)
+			*cap++ = 'e';
+		if (flags & MPTCP_SUBFLOW_FLAG_CONNECTED)
+			*cap++ = 'c';
+		if (flags & MPTCP_SUBFLOW_FLAG_MAPVALID)
+			*cap++ = 'v';
+
+		if (flags)
+			printf(" flags:%s", caps);
+	}
+	if (tb[MPTCP_SUBFLOW_ATTR_TOKEN_REM] &&
+	    tb[MPTCP_SUBFLOW_ATTR_TOKEN_LOC] &&
+	    tb[MPTCP_SUBFLOW_ATTR_ID_REM] &&
+	    tb[MPTCP_SUBFLOW_ATTR_ID_LOC])
+		printf(" token:%04x(id:%u)/%04x(id:%u)",
+		       rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_TOKEN_REM]),
+		       rta_getattr(__u8, tb[MPTCP_SUBFLOW_ATTR_ID_REM]),
+		       rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_TOKEN_LOC]),
+		       rta_getattr(__u8, tb[MPTCP_SUBFLOW_ATTR_ID_LOC]));
+	if (tb[MPTCP_SUBFLOW_ATTR_MAP_SEQ])
+		printf(" seq:%llu",
+		       rta_getattr(__u64, tb[MPTCP_SUBFLOW_ATTR_MAP_SEQ]));
+	if (tb[MPTCP_SUBFLOW_ATTR_MAP_SFSEQ])
+		printf(" sfseq:%u",
+		       rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_MAP_SFSEQ]));
+	if (tb[MPTCP_SUBFLOW_ATTR_SSN_OFFSET])
+		printf(" ssnoff:%u",
+		       rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_SSN_OFFSET]));
+	if (tb[MPTCP_SUBFLOW_ATTR_MAP_DATALEN])
+		printf(" maplen:%u",
+		       rta_getattr(__u32, tb[MPTCP_SUBFLOW_ATTR_MAP_DATALEN]));
+	printf("\n");
+}
+
+static void parse_nlmsg(struct nlmsghdr *nlh, __u32 proto)
+{
+	struct inet_diag_msg *r = NLMSG_DATA(nlh);
+	struct rtattr *tb[INET_DIAG_MAX + 1];
+
+	parse_rtattr_flags(tb, INET_DIAG_MAX, (struct rtattr *)(r + 1),
+			   nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)),
+			   NLA_F_NESTED);
+
+	if (proto == IPPROTO_MPTCP && tb[INET_DIAG_INFO]) {
+		int len = RTA_PAYLOAD(tb[INET_DIAG_INFO]);
+		struct mptcp_info *info;
+
+		/* workaround fort older kernels with less fields */
+		if (len < sizeof(*info)) {
+			info = alloca(sizeof(*info));
+			memcpy(info, RTA_DATA(tb[INET_DIAG_INFO]), len);
+			memset((char *)info + len, 0, sizeof(*info) - len);
+		} else {
+			info = RTA_DATA(tb[INET_DIAG_INFO]);
+		}
+		print_info_msg(info);
+	}
+	if (proto == IPPROTO_TCP && tb[INET_DIAG_ULP_INFO]) {
+		struct rtattr *ulpinfo[INET_ULP_INFO_MAX + 1] = { 0 };
+
+		parse_rtattr_nested(ulpinfo, INET_ULP_INFO_MAX,
+				    tb[INET_DIAG_ULP_INFO]);
+
+		if (ulpinfo[INET_ULP_INFO_MPTCP]) {
+			struct rtattr *sfinfo[MPTCP_SUBFLOW_ATTR_MAX + 1] = { 0 };
+
+			parse_rtattr_nested(sfinfo, MPTCP_SUBFLOW_ATTR_MAX,
+					    ulpinfo[INET_ULP_INFO_MPTCP]);
+			print_subflow_info(sfinfo);
+		} else {
+			printf("It's a normal TCP!\n");
+		}
+	}
+}
+
+static void recv_nlmsg(int fd, __u32 proto)
+{
+	char rcv_buff[8192];
+	struct nlmsghdr *nlh = (struct nlmsghdr *)rcv_buff;
+	struct sockaddr_nl rcv_nladdr = {
+		.nl_family = AF_NETLINK
+	};
+	struct iovec rcv_iov = {
+		.iov_base = rcv_buff,
+		.iov_len = sizeof(rcv_buff)
+	};
+	struct msghdr rcv_msg = {
+		.msg_name = &rcv_nladdr,
+		.msg_namelen = sizeof(rcv_nladdr),
+		.msg_iov = &rcv_iov,
+		.msg_iovlen = 1
+	};
+	int len;
+
+	len = recvmsg(fd, &rcv_msg, 0);
+
+	while (NLMSG_OK(nlh, len)) {
+		if (nlh->nlmsg_type == NLMSG_DONE) {
+			printf("NLMSG_DONE\n");
+			break;
+		} else if (nlh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err;
+
+			err = (struct nlmsgerr *)NLMSG_DATA(nlh);
+			printf("Error %d:%s\n",
+			       -(err->error), strerror(-(err->error)));
+			break;
+		}
+		parse_nlmsg(nlh, proto);
+		nlh = NLMSG_NEXT(nlh, len);
+	}
+}
+
+static void get_mptcpinfo(__u32 token)
+{
+	struct inet_diag_req_v2 r = {
+		.sdiag_family           = AF_INET,
+		/* Real proto is set via INET_DIAG_REQ_PROTOCOL */
+		.sdiag_protocol         = IPPROTO_TCP,
+		.idiag_ext              = 1 << (INET_DIAG_INFO - 1),
+		.id.idiag_cookie[0]     = token,
+	};
+	__u32 proto = IPPROTO_MPTCP;
+	int fd;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+	if (fd < 0)
+		die_perror("Netlink socket");
+
+	send_query(fd, &r, proto);
+	recv_nlmsg(fd, proto);
+
+	close(fd);
+}
+
+static void get_subflow_info(char *subflow_addrs)
+{
+	struct inet_diag_req_v2 r = {
+		.sdiag_family           = AF_INET,
+		.sdiag_protocol         = IPPROTO_TCP,
+		.idiag_ext              = 1 << (INET_DIAG_INFO - 1),
+		.id.idiag_cookie[0]     = INET_DIAG_NOCOOKIE,
+		.id.idiag_cookie[1]     = INET_DIAG_NOCOOKIE,
+	};
+	char saddr[64], daddr[64];
+	int sport, dport;
+	int ret;
+	int fd;
+
+	ret = sscanf(subflow_addrs, "%[^:]:%d %[^:]:%d", saddr, &sport, daddr, &dport);
+	if (ret != 4)
+		die_perror("IP PORT Pairs has style problems!");
+
+	printf("%s:%d -> %s:%d\n", saddr, sport, daddr, dport);
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+	if (fd < 0)
+		die_perror("Netlink socket");
+
+	r.id.idiag_sport = htons(sport);
+	r.id.idiag_dport = htons(dport);
+
+	inet_pton(AF_INET, saddr, &r.id.idiag_src);
+	inet_pton(AF_INET, daddr, &r.id.idiag_dst);
+	send_query(fd, &r, IPPROTO_TCP);
+	recv_nlmsg(fd, IPPROTO_TCP);
+}
+
+static void parse_opts(int argc, char **argv, struct params *p)
+{
+	int c;
+
+	if (argc < 2)
+		die_usage(1);
+
+	while ((c = getopt(argc, argv, "ht:s:")) != -1) {
+		switch (c) {
+		case 'h':
+			die_usage(0);
+			break;
+		case 't':
+			sscanf(optarg, "%x", &p->target_token);
+			break;
+		case 's':
+			strncpy(p->subflow_addrs, optarg,
+				sizeof(p->subflow_addrs) - 1);
+			break;
+		default:
+			die_usage(1);
+			break;
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	struct params p = { 0 };
+
+	parse_opts(argc, argv, &p);
+
+	if (p.target_token)
+		get_mptcpinfo(p.target_token);
+
+	if (p.subflow_addrs[0] != '\0')
+		get_subflow_info(p.subflow_addrs);
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c
index 218aac467321..3cf1e2a612ce 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_inq.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c
@@ -72,13 +72,21 @@ static const char *getxinfo_strerr(int err)
 }
 
 static void xgetaddrinfo(const char *node, const char *service,
-			 const struct addrinfo *hints,
+			 struct addrinfo *hints,
 			 struct addrinfo **res)
 {
+again:
 	int err = getaddrinfo(node, service, hints, res);
 
 	if (err) {
-		const char *errstr = getxinfo_strerr(err);
+		const char *errstr;
+
+		if (err == EAI_SOCKTYPE) {
+			hints->ai_protocol = IPPROTO_TCP;
+			goto again;
+		}
+
+		errstr = getxinfo_strerr(err);
 
 		fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
 			node ? node : "", service ? service : "", errstr);
@@ -91,7 +99,7 @@ static int sock_listen_mptcp(const char * const listenaddr,
 {
 	int sock = -1;
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 		.ai_flags = AI_PASSIVE | AI_NUMERICHOST
 	};
@@ -136,7 +144,7 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 			      const char * const port, int proto)
 {
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 	};
 	struct addrinfo *a, *addr;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index e4403236f655..b8af65373b3a 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -23,6 +23,7 @@ tmpfile=""
 cout=""
 err=""
 capout=""
+cappid=""
 ns1=""
 ns2=""
 iptables="iptables"
@@ -31,7 +32,6 @@ timeout_poll=30
 timeout_test=$((timeout_poll * 2 + 1))
 capture=false
 checksum=false
-ip_mptcp=0
 check_invert=0
 validate_checksum=false
 init=0
@@ -62,6 +62,17 @@ unset sflags
 unset fastclose
 unset fullmesh
 unset speed
+unset join_syn_rej
+unset join_csum_ns1
+unset join_csum_ns2
+unset join_fail_nr
+unset join_rst_nr
+unset join_infi_nr
+unset join_corrupted_pkts
+unset join_syn_tx
+unset join_create_err
+unset join_bind_err
+unset join_connect_err
 
 # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) ||
 #				  (ip6 && (ip6[74] & 0xf0) == 0x30)'"
@@ -125,8 +136,8 @@ init_shapers()
 {
 	local i
 	for i in $(seq 1 4); do
-		tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1
-		tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1
+		tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms
+		tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms
 	done
 }
 
@@ -142,7 +153,7 @@ init() {
 
 	mptcp_lib_check_mptcp
 	mptcp_lib_check_kallsyms
-	mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}"
+	mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}"
 
 	sin=$(mktemp)
 	sout=$(mktemp)
@@ -197,6 +208,22 @@ print_skip()
 	mptcp_lib_pr_skip "${@}"
 }
 
+# $1: check name; $2: rc
+print_results()
+{
+	local check="${1}"
+	local rc=${2}
+
+	print_check "${check}"
+	if [ ${rc} = ${KSFT_PASS} ]; then
+		print_ok
+	elif [ ${rc} = ${KSFT_SKIP} ]; then
+		print_skip
+	else
+		fail_test "see above"
+	fi
+}
+
 # [ $1: fail msg ]
 mark_as_skipped()
 {
@@ -262,6 +289,8 @@ reset()
 
 	TEST_NAME="${1}"
 
+	MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified
+
 	if skip_test; then
 		MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1))
 		last_test_ignored=1
@@ -336,7 +365,7 @@ reset_with_checksum()
 	local ns1_enable=$1
 	local ns2_enable=$2
 
-	reset "checksum test ${1} ${2}" || return 1
+	reset "checksum test ${ns1_enable} ${ns2_enable}" || return 1
 
 	ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable
 	ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable
@@ -419,12 +448,17 @@ reset_with_fail()
 	fi
 }
 
+start_events()
+{
+	mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid
+	mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid
+}
+
 reset_with_events()
 {
 	reset "${1}" || return 1
 
-	mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid
-	mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid
+	start_events
 }
 
 reset_with_tcp_filter()
@@ -435,9 +469,10 @@ reset_with_tcp_filter()
 	local ns="${!1}"
 	local src="${2}"
 	local target="${3}"
+	local chain="${4:-INPUT}"
 
 	if ! ip netns exec "${ns}" ${iptables} \
-			-A INPUT \
+			-A "${chain}" \
 			-s "${src}" \
 			-p tcp \
 			-j "${target}"; then
@@ -449,7 +484,9 @@ reset_with_tcp_filter()
 # $1: err msg
 fail_test()
 {
-	ret=${KSFT_FAIL}
+	if ! mptcp_lib_subtest_is_flaky; then
+		ret=${KSFT_FAIL}
+	fi
 
 	if [ ${#} -gt 0 ]; then
 		print_fail "${@}"
@@ -606,173 +643,65 @@ kill_events_pids()
 
 pm_nl_set_limits()
 {
-	local ns=$1
-	local addrs=$2
-	local subflows=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows
-	else
-		ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows
-	fi
+	mptcp_lib_pm_nl_set_limits "${@}"
 }
 
 pm_nl_add_endpoint()
 {
-	local ns=$1
-	local addr=$2
-	local flags _flags
-	local port _port
-	local dev _dev
-	local id _id
-	local nr=2
-
-	local p
-	for p in "${@}"
-	do
-		if [ $p = "flags" ]; then
-			eval _flags=\$"$nr"
-			[ -n "$_flags" ]; flags="flags $_flags"
-		fi
-		if [ $p = "dev" ]; then
-			eval _dev=\$"$nr"
-			[ -n "$_dev" ]; dev="dev $_dev"
-		fi
-		if [ $p = "id" ]; then
-			eval _id=\$"$nr"
-			[ -n "$_id" ]; id="id $_id"
-		fi
-		if [ $p = "port" ]; then
-			eval _port=\$"$nr"
-			[ -n "$_port" ]; port="port $_port"
-		fi
-
-		nr=$((nr + 1))
-	done
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port
-	else
-		ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port
-	fi
+	mptcp_lib_pm_nl_add_endpoint "${@}"
 }
 
 pm_nl_del_endpoint()
 {
-	local ns=$1
-	local id=$2
-	local addr=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		[ $id -ne 0 ] && addr=''
-		ip -n $ns mptcp endpoint delete id $id $addr
-	else
-		ip netns exec $ns ./pm_nl_ctl del $id $addr
-	fi
+	mptcp_lib_pm_nl_del_endpoint "${@}"
 }
 
 pm_nl_flush_endpoint()
 {
-	local ns=$1
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint flush
-	else
-		ip netns exec $ns ./pm_nl_ctl flush
-	fi
+	mptcp_lib_pm_nl_flush_endpoint "${@}"
 }
 
 pm_nl_show_endpoints()
 {
-	local ns=$1
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint show
-	else
-		ip netns exec $ns ./pm_nl_ctl dump
-	fi
+	mptcp_lib_pm_nl_show_endpoints "${@}"
 }
 
 pm_nl_change_endpoint()
 {
-	local ns=$1
-	local id=$2
-	local flags=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint change id $id ${flags//","/" "}
-	else
-		ip netns exec $ns ./pm_nl_ctl set id $id flags $flags
-	fi
+	mptcp_lib_pm_nl_change_endpoint "${@}"
 }
 
 pm_nl_check_endpoint()
 {
-	local line expected_line
 	local msg="$1"
 	local ns=$2
 	local addr=$3
-	local _flags=""
-	local flags
-	local _port
-	local port
-	local dev
-	local _id
-	local id
+	local flags dev id port
 
 	print_check "${msg}"
 
 	shift 3
 	while [ -n "$1" ]; do
-		if [ $1 = "flags" ]; then
-			_flags=$2
-			[ -n "$_flags" ]; flags="flags $_flags"
+		case "${1}" in
+		"flags" | "dev" | "id" | "port")
+			eval "${1}"="${2}"
 			shift
-		elif [ $1 = "dev" ]; then
-			[ -n "$2" ]; dev="dev $2"
-			shift
-		elif [ $1 = "id" ]; then
-			_id=$2
-			[ -n "$_id" ]; id="id $_id"
-			shift
-		elif [ $1 = "port" ]; then
-			_port=$2
-			[ -n "$_port" ]; port=" port $_port"
-			shift
-		fi
+			;;
+		*)
+			;;
+		esac
 
 		shift
 	done
 
-	if [ -z "$id" ]; then
-		test_fail "bad test - missing endpoint id"
+	if [ -z "${id}" ]; then
+		fail_test "bad test - missing endpoint id"
 		return
 	fi
 
-	if [ $ip_mptcp -eq 1 ]; then
-		# get line and trim trailing whitespace
-		line=$(ip -n $ns mptcp endpoint show $id)
-		line="${line% }"
-		# the dump order is: address id flags port dev
-		[ -n "$addr" ] && expected_line="$addr"
-		expected_line+=" $id"
-		[ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}"
-		[ -n "$dev" ] && expected_line+=" $dev"
-		[ -n "$port" ] && expected_line+=" $port"
-	else
-		line=$(ip netns exec $ns ./pm_nl_ctl get $_id)
-		# the dump order is: id flags dev address port
-		expected_line="$id"
-		[ -n "$flags" ] && expected_line+=" $flags"
-		[ -n "$dev" ] && expected_line+=" $dev"
-		[ -n "$addr" ] && expected_line+=" $addr"
-		[ -n "$_port" ] && expected_line+=" $_port"
-	fi
-	if [ "$line" = "$expected_line" ]; then
-		print_ok
-	else
-		fail_test "expected '$expected_line' found '$line'"
-	fi
+	check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \
+		"$(mptcp_lib_pm_nl_format_endpoints \
+			"${id},${addr},${flags//","/" "},${dev},${port}")"
 }
 
 pm_nl_set_endpoint()
@@ -938,7 +867,7 @@ chk_cestab_nr()
 	local cestab=$2
 	local count
 
-	print_check "cestab $cestab"
+	print_check "currently established: $cestab"
 	count=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPCurrEstab")
 	if [ -z "$count" ]; then
 		print_skip
@@ -960,40 +889,62 @@ check_cestab()
 	fi
 }
 
-do_transfer()
+cond_start_capture()
 {
-	local listener_ns="$1"
-	local connector_ns="$2"
-	local cl_proto="$3"
-	local srv_proto="$4"
-	local connect_addr="$5"
-
-	local port=$((10000 + MPTCP_LIB_TEST_COUNTER - 1))
-	local cappid
-	local FAILING_LINKS=${FAILING_LINKS:-""}
-	local fastclose=${fastclose:-""}
-	local speed=${speed:-"fast"}
+	local ns="$1"
 
-	:> "$cout"
-	:> "$sout"
 	:> "$capout"
 
 	if $capture; then
-		local capuser
-		if [ -z $SUDO_USER ] ; then
+		local capuser capfile
+		if [ -z $SUDO_USER ]; then
 			capuser=""
 		else
 			capuser="-Z $SUDO_USER"
 		fi
 
-		capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "${listener_ns}")
+		capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "$ns")
 
 		echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
-		ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
+		ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 &
 		cappid=$!
 
 		sleep 1
 	fi
+}
+
+cond_stop_capture()
+{
+	if $capture; then
+		sleep 1
+		kill $cappid
+		cat "$capout"
+	fi
+}
+
+get_port()
+{
+	echo "$((10000 + MPTCP_LIB_TEST_COUNTER - 1))"
+}
+
+do_transfer()
+{
+	local listener_ns="$1"
+	local connector_ns="$2"
+	local cl_proto="$3"
+	local srv_proto="$4"
+	local connect_addr="$5"
+	local port
+
+	local FAILING_LINKS=${FAILING_LINKS:-""}
+	local fastclose=${fastclose:-""}
+	local speed=${speed:-"fast"}
+	port=$(get_port)
+
+	:> "$cout"
+	:> "$sout"
+
+	cond_start_capture ${listener_ns}
 
 	NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
 		nstat -n
@@ -1080,10 +1031,7 @@ do_transfer()
 	wait $spid
 	local rets=$?
 
-	if $capture; then
-	    sleep 1
-	    kill $cappid
-	fi
+	cond_stop_capture
 
 	NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
 		nstat | grep Tcp > /tmp/${listener_ns}.out
@@ -1092,14 +1040,8 @@ do_transfer()
 
 	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
 		fail_test "client exit code $retc, server $rets"
-		echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port"
-		cat /tmp/${listener_ns}.out
-		echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
-		cat /tmp/${connector_ns}.out
-
-		cat "$capout"
+		mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \
+			"/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out"
 		return 1
 	fi
 
@@ -1116,13 +1058,7 @@ do_transfer()
 	fi
 	rets=$?
 
-	if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
-		cat "$capout"
-		return 0
-	fi
-
-	cat "$capout"
-	return 1
+	[ $retc -eq 0 ] && [ $rets -eq 0 ]
 }
 
 make_file()
@@ -1214,28 +1150,29 @@ chk_csum_nr()
 		csum_ns2=${csum_ns2:1}
 	fi
 
-	print_check "sum"
+	print_check "checksum server"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr")
-	if [ "$count" != "$csum_ns1" ]; then
+	if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then
 		extra_msg+=" ns1=$count"
 	fi
 	if [ -z "$count" ]; then
 		print_skip
 	elif { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } ||
-	   { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then
+	     { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then
 		fail_test "got $count data checksum error[s] expected $csum_ns1"
 	else
 		print_ok
 	fi
-	print_check "csum"
+
+	print_check "checksum client"
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr")
-	if [ "$count" != "$csum_ns2" ]; then
+	if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then
 		extra_msg+=" ns2=$count"
 	fi
 	if [ -z "$count" ]; then
 		print_skip
 	elif { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } ||
-	   { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then
+	     { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then
 		fail_test "got $count data checksum error[s] expected $csum_ns2"
 	else
 		print_ok
@@ -1252,6 +1189,8 @@ chk_fail_nr()
 	local count
 	local ns_tx=$ns1
 	local ns_rx=$ns2
+	local tx="server"
+	local rx="client"
 	local extra_msg=""
 	local allow_tx_lost=0
 	local allow_rx_lost=0
@@ -1259,7 +1198,8 @@ chk_fail_nr()
 	if [[ $ns_invert = "invert" ]]; then
 		ns_tx=$ns2
 		ns_rx=$ns1
-		extra_msg="invert"
+		tx="client"
+		rx="server"
 	fi
 
 	if [[ "${fail_tx}" = "-"* ]]; then
@@ -1271,29 +1211,29 @@ chk_fail_nr()
 		fail_rx=${fail_rx:1}
 	fi
 
-	print_check "ftx"
+	print_check "fail tx ${tx}"
 	count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx")
-	if [ "$count" != "$fail_tx" ]; then
-		extra_msg+=",tx=$count"
+	if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then
+		extra_msg+=" tx=$count"
 	fi
 	if [ -z "$count" ]; then
 		print_skip
 	elif { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } ||
-	   { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then
+	     { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then
 		fail_test "got $count MP_FAIL[s] TX expected $fail_tx"
 	else
 		print_ok
 	fi
 
-	print_check "failrx"
+	print_check "fail rx ${rx}"
 	count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx")
-	if [ "$count" != "$fail_rx" ]; then
-		extra_msg+=",rx=$count"
+	if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then
+		extra_msg+=" rx=$count"
 	fi
 	if [ -z "$count" ]; then
 		print_skip
 	elif { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } ||
-	   { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then
+	     { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then
 		fail_test "got $count MP_FAIL[s] RX expected $fail_rx"
 	else
 		print_ok
@@ -1310,37 +1250,35 @@ chk_fclose_nr()
 	local count
 	local ns_tx=$ns2
 	local ns_rx=$ns1
-	local extra_msg=""
+	local tx="client"
+	local rx="server"
 
 	if [[ $ns_invert = "invert" ]]; then
 		ns_tx=$ns1
 		ns_rx=$ns2
-		extra_msg="invert"
+		tx="server"
+		rx="client"
 	fi
 
-	print_check "ctx"
+	print_check "fast close tx ${tx}"
 	count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx")
 	if [ -z "$count" ]; then
 		print_skip
 	elif [ "$count" != "$fclose_tx" ]; then
-		extra_msg+=",tx=$count"
 		fail_test "got $count MP_FASTCLOSE[s] TX expected $fclose_tx"
 	else
 		print_ok
 	fi
 
-	print_check "fclzrx"
+	print_check "fast close rx ${rx}"
 	count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx")
 	if [ -z "$count" ]; then
 		print_skip
 	elif [ "$count" != "$fclose_rx" ]; then
-		extra_msg+=",rx=$count"
 		fail_test "got $count MP_FASTCLOSE[s] RX expected $fclose_rx"
 	else
 		print_ok
 	fi
-
-	print_info "$extra_msg"
 }
 
 chk_rst_nr()
@@ -1351,15 +1289,17 @@ chk_rst_nr()
 	local count
 	local ns_tx=$ns1
 	local ns_rx=$ns2
-	local extra_msg=""
+	local tx="server"
+	local rx="client"
 
 	if [[ $ns_invert = "invert" ]]; then
 		ns_tx=$ns2
 		ns_rx=$ns1
-		extra_msg="invert"
+		tx="client"
+		rx="server"
 	fi
 
-	print_check "rtx"
+	print_check "reset tx ${tx}"
 	count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1371,7 +1311,7 @@ chk_rst_nr()
 		print_ok
 	fi
 
-	print_check "rstrx"
+	print_check "reset rx ${rx}"
 	count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1382,8 +1322,6 @@ chk_rst_nr()
 	else
 		print_ok
 	fi
-
-	print_info "$extra_msg"
 }
 
 chk_infi_nr()
@@ -1392,7 +1330,7 @@ chk_infi_nr()
 	local infi_rx=$2
 	local count
 
-	print_check "itx"
+	print_check "infi tx client"
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1402,7 +1340,7 @@ chk_infi_nr()
 		print_ok
 	fi
 
-	print_check "infirx"
+	print_check "infi rx server"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1413,17 +1351,67 @@ chk_infi_nr()
 	fi
 }
 
+chk_join_tx_nr()
+{
+	local syn_tx=${join_syn_tx:-0}
+	local create=${join_create_err:-0}
+	local bind=${join_bind_err:-0}
+	local connect=${join_connect_err:-0}
+	local rc=${KSFT_PASS}
+	local count
+
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTx")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$syn_tx" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn tx"
+		fail_test "got $count JOIN[s] syn tx expected $syn_tx"
+	fi
+
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$create" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn tx create socket error"
+		fail_test "got $count JOIN[s] syn tx create socket error expected $create"
+	fi
+
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$bind" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn tx bind error"
+		fail_test "got $count JOIN[s] syn tx bind error expected $bind"
+	fi
+
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$connect" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn tx connect error"
+		fail_test "got $count JOIN[s] syn tx connect error expected $connect"
+	fi
+
+	print_results "join Tx" ${rc}
+}
+
 chk_join_nr()
 {
 	local syn_nr=$1
 	local syn_ack_nr=$2
 	local ack_nr=$3
-	local csum_ns1=${4:-0}
-	local csum_ns2=${5:-0}
-	local fail_nr=${6:-0}
-	local rst_nr=${7:-0}
-	local infi_nr=${8:-0}
-	local corrupted_pkts=${9:-0}
+	local syn_rej=${join_syn_rej:-0}
+	local csum_ns1=${join_csum_ns1:-0}
+	local csum_ns2=${join_csum_ns2:-0}
+	local fail_nr=${join_fail_nr:-0}
+	local rst_nr=${join_rst_nr:-0}
+	local infi_nr=${join_infi_nr:-0}
+	local corrupted_pkts=${join_corrupted_pkts:-0}
+	local rc=${KSFT_PASS}
 	local count
 	local with_cookie
 
@@ -1431,43 +1419,71 @@ chk_join_nr()
 		print_info "${corrupted_pkts} corrupted pkts"
 	fi
 
-	print_check "syn"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynRx")
 	if [ -z "$count" ]; then
-		print_skip
+		rc=${KSFT_SKIP}
 	elif [ "$count" != "$syn_nr" ]; then
-		fail_test "got $count JOIN[s] syn expected $syn_nr"
-	else
-		print_ok
+		rc=${KSFT_FAIL}
+		print_check "syn rx"
+		fail_test "got $count JOIN[s] syn rx expected $syn_nr"
 	fi
 
-	print_check "synack"
 	with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies)
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckRx")
 	if [ -z "$count" ]; then
-		print_skip
+		rc=${KSFT_SKIP}
 	elif [ "$count" != "$syn_ack_nr" ]; then
 		# simult connections exceeding the limit with cookie enabled could go up to
 		# synack validation as the conn limit can be enforced reliably only after
 		# the subflow creation
-		if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then
-			print_ok
-		else
-			fail_test "got $count JOIN[s] synack expected $syn_ack_nr"
+		if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then
+			rc=${KSFT_FAIL}
+			print_check "synack rx"
+			fail_test "got $count JOIN[s] synack rx expected $syn_ack_nr"
 		fi
-	else
-		print_ok
 	fi
 
-	print_check "ack"
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckHMacFailure")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "0" ]; then
+		rc=${KSFT_FAIL}
+		print_check "synack HMAC"
+		fail_test "got $count JOIN[s] synack HMAC failure expected 0"
+	fi
+
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx")
 	if [ -z "$count" ]; then
-		print_skip
+		rc=${KSFT_SKIP}
 	elif [ "$count" != "$ack_nr" ]; then
-		fail_test "got $count JOIN[s] ack expected $ack_nr"
-	else
-		print_ok
+		rc=${KSFT_FAIL}
+		print_check "ack rx"
+		fail_test "got $count JOIN[s] ack rx expected $ack_nr"
+	fi
+
+	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckHMacFailure")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "0" ]; then
+		rc=${KSFT_FAIL}
+		print_check "ack HMAC"
+		fail_test "got $count JOIN[s] ack HMAC failure expected 0"
+	fi
+
+	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinRejected")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$syn_rej" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn rejected"
+		fail_test "got $count JOIN[s] syn rejected expected $syn_rej"
 	fi
+
+	print_results "join Rx" ${rc}
+
+	join_syn_tx="${join_syn_tx:-${syn_nr}}" \
+		chk_join_tx_nr
+
 	if $validate_checksum; then
 		chk_csum_nr $csum_ns1 $csum_ns2
 		chk_fail_nr $fail_nr $fail_nr
@@ -1520,18 +1536,30 @@ chk_add_nr()
 	local add_nr=$1
 	local echo_nr=$2
 	local port_nr=${3:-0}
-	local syn_nr=${4:-$port_nr}
-	local syn_ack_nr=${5:-$port_nr}
-	local ack_nr=${6:-$port_nr}
-	local mis_syn_nr=${7:-0}
-	local mis_ack_nr=${8:-0}
+	local ns_invert=${4:-""}
+	local syn_nr=$port_nr
+	local syn_ack_nr=$port_nr
+	local ack_nr=$port_nr
+	local mis_syn_nr=0
+	local mis_ack_nr=0
+	local ns_tx=$ns1
+	local ns_rx=$ns2
+	local tx=""
+	local rx=""
 	local count
 	local timeout
 
-	timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout)
+	if [[ $ns_invert = "invert" ]]; then
+		ns_tx=$ns2
+		ns_rx=$ns1
+		tx=" client"
+		rx=" server"
+	fi
 
-	print_check "add"
-	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtAddAddr")
+	timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout)
+
+	print_check "add addr rx${rx}"
+	count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr")
 	if [ -z "$count" ]; then
 		print_skip
 	# if the test configured a short timeout tolerate greater then expected
@@ -1542,8 +1570,8 @@ chk_add_nr()
 		print_ok
 	fi
 
-	print_check "echo"
-	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtEchoAdd")
+	print_check "add addr echo rx${tx}"
+	count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd")
 	if [ -z "$count" ]; then
 		print_skip
 	elif [ "$count" != "$echo_nr" ]; then
@@ -1553,8 +1581,8 @@ chk_add_nr()
 	fi
 
 	if [ $port_nr -gt 0 ]; then
-		print_check "pt"
-		count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtPortAdd")
+		print_check "add addr rx with port${rx}"
+		count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$port_nr" ]; then
@@ -1563,8 +1591,8 @@ chk_add_nr()
 			print_ok
 		fi
 
-		print_check "syn"
-		count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortSynRx")
+		print_check "syn rx port${tx}"
+		count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$syn_nr" ]; then
@@ -1574,8 +1602,8 @@ chk_add_nr()
 			print_ok
 		fi
 
-		print_check "synack"
-		count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinPortSynAckRx")
+		print_check "synack rx port${rx}"
+		count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$syn_ack_nr" ]; then
@@ -1585,8 +1613,8 @@ chk_add_nr()
 			print_ok
 		fi
 
-		print_check "ack"
-		count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortAckRx")
+		print_check "ack rx port${tx}"
+		count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$ack_nr" ]; then
@@ -1596,8 +1624,8 @@ chk_add_nr()
 			print_ok
 		fi
 
-		print_check "syn"
-		count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortSynRx")
+		print_check "syn rx port mismatch${tx}"
+		count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$mis_syn_nr" ]; then
@@ -1607,8 +1635,8 @@ chk_add_nr()
 			print_ok
 		fi
 
-		print_check "ack"
-		count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortAckRx")
+		print_check "ack rx port mismatch${tx}"
+		count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx")
 		if [ -z "$count" ]; then
 			print_skip
 		elif [ "$count" != "$mis_ack_nr" ]; then
@@ -1629,7 +1657,7 @@ chk_add_tx_nr()
 
 	timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout)
 
-	print_check "add TX"
+	print_check "add addr tx"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtAddAddrTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1641,7 +1669,7 @@ chk_add_tx_nr()
 		print_ok
 	fi
 
-	print_check "echo TX"
+	print_check "add addr echo tx"
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtEchoAddTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1661,6 +1689,8 @@ chk_rm_nr()
 	local count
 	local addr_ns=$ns1
 	local subflow_ns=$ns2
+	local addr="server"
+	local subflow="client"
 	local extra_msg=""
 
 	shift 2
@@ -1670,16 +1700,14 @@ chk_rm_nr()
 		shift
 	done
 
-	if [ -z $invert ]; then
-		addr_ns=$ns1
-		subflow_ns=$ns2
-	elif [ $invert = "true" ]; then
+	if [ "$invert" = "true" ]; then
 		addr_ns=$ns2
 		subflow_ns=$ns1
-		extra_msg="invert"
+		addr="client"
+		subflow="server"
 	fi
 
-	print_check "rm"
+	print_check "rm addr rx ${addr}"
 	count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1689,7 +1717,7 @@ chk_rm_nr()
 		print_ok
 	fi
 
-	print_check "rmsf"
+	print_check "rm subflow ${subflow}"
 	count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1703,7 +1731,7 @@ chk_rm_nr()
 		count=$((count + cnt))
 		if [ "$count" != "$rm_subflow_nr" ]; then
 			suffix="$count in [$rm_subflow_nr:$((rm_subflow_nr*2))]"
-			extra_msg+=" simult"
+			extra_msg="simult"
 		fi
 		if [ $count -ge "$rm_subflow_nr" ] && \
 		   [ "$count" -le "$((rm_subflow_nr *2 ))" ]; then
@@ -1724,7 +1752,7 @@ chk_rm_tx_nr()
 {
 	local rm_addr_tx_nr=$1
 
-	print_check "rm TX"
+	print_check "rm addr tx client"
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1739,9 +1767,11 @@ chk_prio_nr()
 {
 	local mp_prio_nr_tx=$1
 	local mp_prio_nr_rx=$2
+	local mpj_syn=$3
+	local mpj_syn_ack=$4
 	local count
 
-	print_check "ptx"
+	print_check "mp_prio tx server"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1751,7 +1781,7 @@ chk_prio_nr()
 		print_ok
 	fi
 
-	print_check "prx"
+	print_check "mp_prio rx client"
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx")
 	if [ -z "$count" ]; then
 		print_skip
@@ -1760,6 +1790,26 @@ chk_prio_nr()
 	else
 		print_ok
 	fi
+
+	print_check "syn backup"
+	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynBackupRx")
+	if [ -z "$count" ]; then
+		print_skip
+	elif [ "$count" != "$mpj_syn" ]; then
+		fail_test "got $count JOIN[s] syn with Backup expected $mpj_syn"
+	else
+		print_ok
+	fi
+
+	print_check "synack backup"
+	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckBackupRx")
+	if [ -z "$count" ]; then
+		print_skip
+	elif [ "$count" != "$mpj_syn_ack" ]; then
+		fail_test "got $count JOIN[s] synack with Backup expected $mpj_syn_ack"
+	else
+		print_ok
+	fi
 }
 
 chk_subflow_nr()
@@ -1924,7 +1974,8 @@ subflows_tests()
 		pm_nl_set_limits $ns2 0 1
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
 	fi
 
 	# subflow
@@ -1953,7 +2004,8 @@ subflows_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 2 2 1
+		join_syn_rej=1 \
+			chk_join_nr 2 2 1
 	fi
 
 	# single subflow, dev
@@ -1974,9 +2026,11 @@ subflows_error_tests()
 		pm_nl_set_limits $ns1 0 1
 		pm_nl_set_limits $ns2 0 1
 		pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow
+		pm_nl_add_endpoint $ns2 10.0.12.2 flags subflow
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
+		join_bind_err=1 \
+			chk_join_nr 0 0 0
 	fi
 
 	# multiple subflows, with subflow creation error
@@ -1988,7 +2042,8 @@ subflows_error_tests()
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=2 \
+			chk_join_nr 1 1 1
 	fi
 
 	# multiple subflows, with subflow timeout on MPJ
@@ -2000,7 +2055,8 @@ subflows_error_tests()
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=2 \
+			chk_join_nr 1 1 1
 	fi
 
 	# multiple subflows, check that the endpoint corresponding to
@@ -2021,7 +2077,8 @@ subflows_error_tests()
 
 		# additional subflow could be created only if the PM select
 		# the later endpoint, skipping the already used one
-		chk_join_nr 1 1 1
+		join_syn_tx=2 \
+			chk_join_nr 1 1 1
 	fi
 }
 
@@ -2060,6 +2117,21 @@ signal_address_tests()
 		chk_add_nr 1 1
 	fi
 
+	# uncommon: subflow and signal flags on the same endpoint
+	# or because the user wrongly picked both, but still expects the client
+	# to create additional subflows
+	if reset "subflow and signal together"; then
+		pm_nl_set_limits $ns1 0 2
+		pm_nl_set_limits $ns2 0 2
+		pm_nl_add_endpoint $ns2 10.0.3.2 flags signal,subflow
+		run_tests $ns1 $ns2 10.0.1.1
+		chk_join_nr 1 1 1
+		chk_add_nr 1 1 0 invert  # only initiated by ns2
+		chk_add_nr 0 0 0         # none initiated by ns1
+		chk_rst_nr 0 0 invert    # no RST sent by the client
+		chk_rst_nr 0 0           # no RST sent by the server
+	fi
+
 	# accept and use add_addr with additional subflows
 	if reset "multiple subflows and signal"; then
 		pm_nl_set_limits $ns1 0 3
@@ -2092,7 +2164,8 @@ signal_address_tests()
 		pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
 		pm_nl_set_limits $ns2 3 3
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=3 \
+			chk_join_nr 1 1 1
 		chk_add_nr 3 3
 	fi
 
@@ -2260,7 +2333,8 @@ add_addr_timeout_tests()
 		pm_nl_set_limits $ns2 2 2
 		speed=10 \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=2 \
+			chk_join_nr 1 1 1
 		chk_add_nr 8 0
 	fi
 }
@@ -2354,12 +2428,14 @@ remove_tests()
 	if reset "remove invalid addresses"; then
 		pm_nl_set_limits $ns1 3 3
 		pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+		# broadcast IP: no packet for this address will be received on ns1
+		pm_nl_add_endpoint $ns1 224.0.0.1 flags signal
 		pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
-		pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
-		pm_nl_set_limits $ns2 3 3
+		pm_nl_set_limits $ns2 2 2
 		addr_nr_ns1=-3 speed=10 \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=2 join_connect_err=1 \
+			chk_join_nr 1 1 1
 		chk_add_nr 3 3
 		chk_rm_nr 3 1 invert
 		chk_rst_nr 0 0
@@ -2424,7 +2500,8 @@ remove_tests()
 		pm_nl_set_limits $ns2 3 3
 		addr_nr_ns1=-8 speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1
+		join_syn_tx=3 \
+			chk_join_nr 1 1 1
 		chk_add_nr 3 3
 		chk_rm_nr 3 1 invert
 		chk_rst_nr 0 0
@@ -2716,33 +2793,46 @@ backup_tests()
 		sflags=nobackup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 1 1 1
-		chk_prio_nr 0 1
+		chk_prio_nr 0 1 1 0
 	fi
 
 	# single address, backup
 	if reset "single address, backup" &&
 	   continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
 		pm_nl_set_limits $ns1 0 1
+		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
+		pm_nl_set_limits $ns2 1 1
+		sflags=nobackup speed=slow \
+			run_tests $ns1 $ns2 10.0.1.1
+		chk_join_nr 1 1 1
+		chk_add_nr 1 1
+		chk_prio_nr 1 0 0 1
+	fi
+
+	# single address, switch to backup
+	if reset "single address, switch to backup" &&
+	   continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+		pm_nl_set_limits $ns1 0 1
 		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
 		pm_nl_set_limits $ns2 1 1
 		sflags=backup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 1 1 1
 		chk_add_nr 1 1
-		chk_prio_nr 1 1
+		chk_prio_nr 1 1 0 0
 	fi
 
 	# single address with port, backup
 	if reset "single address with port, backup" &&
 	   continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
 		pm_nl_set_limits $ns1 0 1
-		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup port 10100
 		pm_nl_set_limits $ns2 1 1
-		sflags=backup speed=slow \
+		sflags=nobackup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 1 1 1
 		chk_add_nr 1 1
-		chk_prio_nr 1 1
+		chk_prio_nr 1 0 0 1
 	fi
 
 	if reset "mpc backup" &&
@@ -2751,17 +2841,26 @@ backup_tests()
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 0 0 0
-		chk_prio_nr 0 1
+		chk_prio_nr 0 1 0 0
 	fi
 
 	if reset "mpc backup both sides" &&
 	   continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then
-		pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup
+		pm_nl_set_limits $ns1 0 2
+		pm_nl_set_limits $ns2 1 2
+		pm_nl_add_endpoint $ns1 10.0.1.1 flags signal,backup
 		pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup
+
+		# 10.0.2.2 (non-backup) -> 10.0.1.1 (backup)
+		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
+		# 10.0.1.2 (backup) -> 10.0.2.1 (non-backup)
+		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+		ip -net "$ns2" route add 10.0.2.1 via 10.0.1.1 dev ns2eth1 # force this path
+
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
-		chk_prio_nr 1 1
+		chk_join_nr 2 2 2
+		chk_prio_nr 1 1 1 1
 	fi
 
 	if reset "mpc switch to backup" &&
@@ -2770,7 +2869,7 @@ backup_tests()
 		sflags=backup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 0 0 0
-		chk_prio_nr 0 1
+		chk_prio_nr 0 1 0 0
 	fi
 
 	if reset "mpc switch to backup both sides" &&
@@ -2780,7 +2879,7 @@ backup_tests()
 		sflags=backup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 0 0 0
-		chk_prio_nr 1 1
+		chk_prio_nr 1 1 0 0
 	fi
 }
 
@@ -2813,6 +2912,32 @@ verify_listener_events()
 	fail_test
 }
 
+chk_mpc_endp_attempt()
+{
+	local retl=$1
+	local attempts=$2
+
+	print_check "Connect"
+
+	if [ ${retl} = 124 ]; then
+		fail_test "timeout on connect"
+	elif [ ${retl} = 0 ]; then
+		fail_test "unexpected successful connect"
+	else
+		print_ok
+
+		print_check "Attempts"
+		count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt")
+		if [ -z "$count" ]; then
+			print_skip
+		elif [ "$count" != "$attempts" ]; then
+			fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}"
+		else
+			print_ok
+		fi
+	fi
+}
+
 add_addr_ports_tests()
 {
 	# signal address with port
@@ -2903,6 +3028,22 @@ add_addr_ports_tests()
 		chk_join_nr 2 2 2
 		chk_add_nr 2 2 2
 	fi
+
+	if reset "port-based signal endpoint must not accept mpc"; then
+		local port retl count
+		port=$(get_port)
+
+		cond_start_capture ${ns1}
+		pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port}
+		mptcp_lib_wait_local_port_listen ${ns1} ${port}
+
+		timeout 1 ip netns exec ${ns2} \
+			./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1
+		retl=$?
+		cond_stop_capture
+
+		chk_mpc_endp_attempt ${retl} 1
+	fi
 }
 
 syncookies_tests()
@@ -2933,7 +3074,8 @@ syncookies_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 2 1 1
+		join_syn_rej=1 \
+			chk_join_nr 2 1 1
 	fi
 
 	# test signal address with cookies
@@ -2972,37 +3114,16 @@ syncookies_tests()
 
 checksum_tests()
 {
-	# checksum test 0 0
-	if reset_with_checksum 0 0; then
-		pm_nl_set_limits $ns1 0 1
-		pm_nl_set_limits $ns2 0 1
-		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
-	fi
-
-	# checksum test 1 1
-	if reset_with_checksum 1 1; then
-		pm_nl_set_limits $ns1 0 1
-		pm_nl_set_limits $ns2 0 1
-		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
-	fi
-
-	# checksum test 0 1
-	if reset_with_checksum 0 1; then
-		pm_nl_set_limits $ns1 0 1
-		pm_nl_set_limits $ns2 0 1
-		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
-	fi
-
-	# checksum test 1 0
-	if reset_with_checksum 1 0; then
-		pm_nl_set_limits $ns1 0 1
-		pm_nl_set_limits $ns2 0 1
-		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0
-	fi
+	local checksum_enable
+	for checksum_enable in "0 0" "1 1" "0 1" "1 0"; do
+		# checksum test 0 0, 1 1, 0 1, 1 0
+		if reset_with_checksum ${checksum_enable}; then
+			pm_nl_set_limits $ns1 0 1
+			pm_nl_set_limits $ns2 0 1
+			run_tests $ns1 $ns2 10.0.1.1
+			chk_join_nr 0 0 0
+		fi
+	done
 }
 
 deny_join_id0_tests()
@@ -3091,6 +3212,9 @@ fullmesh_tests()
 		pm_nl_set_limits $ns1 1 3
 		pm_nl_set_limits $ns2 1 3
 		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+		if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
+			pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh
+		fi
 		fullmesh=1 speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 3 3 3
@@ -3157,7 +3281,7 @@ fullmesh_tests()
 		addr_nr_ns2=1 sflags=backup,fullmesh speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 2 2 2
-		chk_prio_nr 0 1
+		chk_prio_nr 0 1 1 0
 		chk_rm_nr 0 1
 	fi
 
@@ -3170,7 +3294,7 @@ fullmesh_tests()
 		sflags=nobackup,nofullmesh speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 2 2 2
-		chk_prio_nr 0 1
+		chk_prio_nr 0 1 1 0
 		chk_rm_nr 0 1
 	fi
 }
@@ -3178,6 +3302,7 @@ fullmesh_tests()
 fastclose_tests()
 {
 	if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then
+		MPTCP_LIB_SUBTEST_FLAKY=1
 		test_linkfail=1024 fastclose=client \
 			run_tests $ns1 $ns2 10.0.1.1
 		chk_join_nr 0 0 0
@@ -3186,9 +3311,11 @@ fastclose_tests()
 	fi
 
 	if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then
+		MPTCP_LIB_SUBTEST_FLAKY=1
 		test_linkfail=1024 fastclose=server \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0 0 0 0 1
+		join_rst_nr=1 \
+			chk_join_nr 0 0 0
 		chk_fclose_nr 1 1 invert
 		chk_rst_nr 1 1
 	fi
@@ -3204,21 +3331,29 @@ fail_tests()
 {
 	# single subflow
 	if reset_with_fail "Infinite map" 1; then
+		MPTCP_LIB_SUBTEST_FLAKY=1
 		test_linkfail=128 \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)"
+		join_csum_ns1=+1 join_csum_ns2=+0 \
+			join_fail_nr=1 join_rst_nr=0 join_infi_nr=1 \
+			join_corrupted_pkts="$(pedit_action_pkts)" \
+			chk_join_nr 0 0 0
 		chk_fail_nr 1 -1 invert
 	fi
 
 	# multiple subflows
 	if reset_with_fail "MP_FAIL MP_RST" 2; then
-		tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5
+		MPTCP_LIB_SUBTEST_FLAKY=1
+		tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms
 		pm_nl_set_limits $ns1 0 1
 		pm_nl_set_limits $ns2 0 1
 		pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
 		test_linkfail=1024 \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 1 1 0 1 1 0 "$(pedit_action_pkts)"
+		join_csum_ns1=1 join_csum_ns2=0 \
+			join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \
+			join_corrupted_pkts="$(pedit_action_pkts)" \
+			chk_join_nr 1 1 1
 	fi
 }
 
@@ -3360,6 +3495,36 @@ userspace_pm_chk_get_addr()
 	fi
 }
 
+# $1: ns ; $2: event type ; $3: count
+chk_evt_nr()
+{
+	local ns=${1}
+	local evt_name="${2}"
+	local exp="${3}"
+
+	local evts="${evts_ns1}"
+	local evt="${!evt_name}"
+	local count
+
+	evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_
+	[ "${ns}" == "ns2" ] && evts="${evts_ns2}"
+
+	print_check "event ${ns} ${evt_name} (${exp})"
+
+	if [[ "${evt_name}" = "LISTENER_"* ]] &&
+	   ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then
+		print_skip "event not supported"
+		return
+	fi
+
+	count=$(grep -cw "type:${evt}" "${evts}")
+	if [ "${count}" != "${exp}" ]; then
+		fail_test "got ${count} events, expected ${exp}"
+	else
+		print_ok
+	fi
+}
+
 userspace_tests()
 {
 	# userspace pm type prevents add_addr
@@ -3394,7 +3559,8 @@ userspace_tests()
 		pm_nl_set_limits $ns2 1 1
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
 	fi
 
 	# userspace pm type does not send join
@@ -3417,8 +3583,9 @@ userspace_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		sflags=backup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
-		chk_prio_nr 0 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
+		chk_prio_nr 0 0 0 0
 	fi
 
 	# userspace pm type prevents rm_addr
@@ -3440,8 +3607,8 @@ userspace_tests()
 	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
 		set_userspace_pm $ns1
 		pm_nl_set_limits $ns2 2 2
-		speed=5 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 		wait_mpj $ns1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
@@ -3456,14 +3623,12 @@ userspace_tests()
 			"signal"
 		userspace_pm_chk_get_addr "${ns1}" "10" "id 10 flags signal 10.0.2.1"
 		userspace_pm_chk_get_addr "${ns1}" "20" "id 20 flags signal 10.0.3.1"
-		userspace_pm_rm_addr $ns1 10
 		userspace_pm_rm_sf $ns1 "::ffff:10.0.2.1" $MPTCP_LIB_EVENT_SUB_ESTABLISHED
 		userspace_pm_chk_dump_addr "${ns1}" \
-			"id 20 flags signal 10.0.3.1" "after rm_addr 10"
+			"id 20 flags signal 10.0.3.1" "after rm_sf 10"
 		userspace_pm_rm_addr $ns1 20
-		userspace_pm_rm_sf $ns1 10.0.3.1 $MPTCP_LIB_EVENT_SUB_ESTABLISHED
 		userspace_pm_chk_dump_addr "${ns1}" "" "after rm_addr 20"
-		chk_rm_nr 2 2 invert
+		chk_rm_nr 1 1 invert
 		chk_mptcp_info subflows 0 subflows 0
 		chk_subflows_total 1 1
 		kill_events_pids
@@ -3475,8 +3640,8 @@ userspace_tests()
 	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
 		set_userspace_pm $ns2
 		pm_nl_set_limits $ns1 0 1
-		speed=5 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 		wait_mpj $ns2
 		userspace_pm_add_sf $ns2 10.0.3.2 20
@@ -3487,12 +3652,11 @@ userspace_tests()
 			"id 20 flags subflow 10.0.3.2" \
 			"subflow"
 		userspace_pm_chk_get_addr "${ns2}" "20" "id 20 flags subflow 10.0.3.2"
-		userspace_pm_rm_addr $ns2 20
 		userspace_pm_rm_sf $ns2 10.0.3.2 $MPTCP_LIB_EVENT_SUB_ESTABLISHED
 		userspace_pm_chk_dump_addr "${ns2}" \
 			"" \
-			"after rm_addr 20"
-		chk_rm_nr 1 1
+			"after rm_sf 20"
+		chk_rm_nr 0 1
 		chk_mptcp_info subflows 0 subflows 0
 		chk_subflows_total 1 1
 		kill_events_pids
@@ -3504,8 +3668,8 @@ userspace_tests()
 	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
 		set_userspace_pm $ns2
 		pm_nl_set_limits $ns1 0 1
-		speed=5 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 		wait_mpj $ns2
 		chk_mptcp_info subflows 0 subflows 0
@@ -3525,8 +3689,8 @@ userspace_tests()
 	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
 		set_userspace_pm $ns2
 		pm_nl_set_limits $ns1 0 1
-		speed=5 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 		wait_mpj $ns2
 		userspace_pm_add_sf $ns2 10.0.3.2 20
@@ -3549,8 +3713,8 @@ userspace_tests()
 	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
 		set_userspace_pm $ns1
 		pm_nl_set_limits $ns2 1 1
-		speed=5 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 		wait_mpj $ns1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
@@ -3580,8 +3744,8 @@ endpoint_tests()
 		pm_nl_set_limits $ns1 2 2
 		pm_nl_set_limits $ns2 2 2
 		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
-		speed=slow \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ speed=slow \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 
 		wait_mpj $ns1
@@ -3600,31 +3764,185 @@ endpoint_tests()
 		mptcp_lib_kill_wait $tests_pid
 	fi
 
-	if reset "delete and re-add" &&
+	if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT &&
 	   mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
-		pm_nl_set_limits $ns1 1 1
-		pm_nl_set_limits $ns2 1 1
+		start_events
+		pm_nl_set_limits $ns1 0 3
+		pm_nl_set_limits $ns2 0 3
+		pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow
 		pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
-		test_linkfail=4 speed=20 \
-			run_tests $ns1 $ns2 10.0.1.1 &
+		{ test_linkfail=4 speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 
 		wait_mpj $ns2
 		pm_nl_check_endpoint "creation" \
 			$ns2 10.0.2.2 id 2 flags subflow dev ns2eth2
-		chk_subflow_nr "before delete" 2
+		chk_subflow_nr "before delete id 2" 2
 		chk_mptcp_info subflows 1 subflows 1
 
 		pm_nl_del_endpoint $ns2 2 10.0.2.2
 		sleep 0.5
-		chk_subflow_nr "after delete" 1
+		chk_subflow_nr "after delete id 2" 1
 		chk_mptcp_info subflows 0 subflows 0
 
-		pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
+		pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
 		wait_mpj $ns2
-		chk_subflow_nr "after re-add" 2
+		chk_subflow_nr "after re-add id 2" 2
+		chk_mptcp_info subflows 1 subflows 1
+
+		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+		wait_attempt_fail $ns2
+		chk_subflow_nr "after new reject" 2
 		chk_mptcp_info subflows 1 subflows 1
+
+		ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
+		pm_nl_del_endpoint $ns2 3 10.0.3.2
+		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+		wait_mpj $ns2
+		chk_subflow_nr "after no reject" 3
+		chk_mptcp_info subflows 2 subflows 2
+
+		local i
+		for i in $(seq 3); do
+			pm_nl_del_endpoint $ns2 1 10.0.1.2
+			sleep 0.5
+			chk_subflow_nr "after delete id 0 ($i)" 2
+			chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf
+
+			pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow
+			wait_mpj $ns2
+			chk_subflow_nr "after re-add id 0 ($i)" 3
+			chk_mptcp_info subflows 3 subflows 3
+		done
+
 		mptcp_lib_kill_wait $tests_pid
+
+		kill_events_pids
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 4
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 4
+
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 0
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 0
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab
+
+		join_syn_tx=7 \
+			chk_join_nr 6 6 6
+		chk_rm_nr 4 4
+	fi
+
+	# remove and re-add
+	if reset_with_events "delete re-add signal" &&
+	   mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+		pm_nl_set_limits $ns1 0 3
+		pm_nl_set_limits $ns2 3 3
+		pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
+		# broadcast IP: no packet for this address will be received on ns1
+		pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
+		pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal
+		{ test_linkfail=4 speed=5 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
+		local tests_pid=$!
+
+		wait_mpj $ns2
+		pm_nl_check_endpoint "creation" \
+			$ns1 10.0.2.1 id 1 flags signal
+		chk_subflow_nr "before delete" 2
+		chk_mptcp_info subflows 1 subflows 1
+
+		pm_nl_del_endpoint $ns1 1 10.0.2.1
+		pm_nl_del_endpoint $ns1 2 224.0.0.1
+		sleep 0.5
+		chk_subflow_nr "after delete" 1
+		chk_mptcp_info subflows 0 subflows 0
+
+		pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
+		pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
+		wait_mpj $ns2
+		chk_subflow_nr "after re-add" 3
+		chk_mptcp_info subflows 2 subflows 2
+
+		pm_nl_del_endpoint $ns1 42 10.0.1.1
+		sleep 0.5
+		chk_subflow_nr "after delete ID 0" 2
+		chk_mptcp_info subflows 2 subflows 2
+
+		pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal
+		wait_mpj $ns2
+		chk_subflow_nr "after re-add ID 0" 3
+		chk_mptcp_info subflows 3 subflows 3
+
+		pm_nl_del_endpoint $ns1 99 10.0.1.1
+		sleep 0.5
+		chk_subflow_nr "after re-delete ID 0" 2
+		chk_mptcp_info subflows 2 subflows 2
+
+		pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal
+		wait_mpj $ns2
+		chk_subflow_nr "after re-re-add ID 0" 3
+		chk_mptcp_info subflows 3 subflows 3
+		mptcp_lib_kill_wait $tests_pid
+
+		kill_events_pids
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5
+		chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 3
+
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 6
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 4
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5
+		chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3
+
+		join_connect_err=1 \
+			chk_join_nr 5 5 5
+		chk_add_nr 6 6
+		chk_rm_nr 4 3 invert
+	fi
+
+	# flush and re-add
+	if reset_with_tcp_filter "flush re-add" ns2 10.0.3.2 REJECT OUTPUT &&
+	   mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+		pm_nl_set_limits $ns1 0 2
+		pm_nl_set_limits $ns2 1 2
+		# broadcast IP: no packet for this address will be received on ns1
+		pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
+		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+		{ test_linkfail=4 speed=20 \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
+		local tests_pid=$!
+
+		wait_attempt_fail $ns2
+		chk_subflow_nr "before flush" 1
+		chk_mptcp_info subflows 0 subflows 0
+
+		pm_nl_flush_endpoint $ns2
+		pm_nl_flush_endpoint $ns1
+		wait_rm_addr $ns2 0
+		ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
+		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+		wait_mpj $ns2
+		pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
+		wait_mpj $ns2
+		mptcp_lib_kill_wait $tests_pid
+
+		join_syn_tx=3 join_connect_err=1 \
+			chk_join_nr 2 2 2
+		chk_add_nr 2 2
+		chk_rm_nr 1 0 invert
 	fi
 }
 
@@ -3702,7 +4020,7 @@ while getopts "${all_tests_args}cCih" opt; do
 			checksum=true
 			;;
 		i)
-			ip_mptcp=1
+			mptcp_lib_set_ip_mptcp
 			;;
 		h)
 			usage
@@ -3727,9 +4045,11 @@ if [ ${#tests[@]} -eq 0 ]; then
 	tests=("${all_tests_names[@]}")
 fi
 
+mptcp_lib_subtests_last_ts_reset
 for subtests in "${tests[@]}"; do
 	"${subtests}"
 done
+append_prev_results
 
 if [ ${ret} -ne 0 ]; then
 	echo
@@ -3740,7 +4060,6 @@ if [ ${ret} -ne 0 ]; then
 	echo
 fi
 
-append_prev_results
 mptcp_lib_result_print_all_tap
 
 exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
index d529b4b37af8..09cd24b2ae46 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -1,6 +1,8 @@
 #! /bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+. "$(dirname "${0}")/../lib.sh"
+
 readonly KSFT_PASS=0
 readonly KSFT_FAIL=1
 readonly KSFT_SKIP=4
@@ -9,10 +11,14 @@ readonly KSFT_SKIP=4
 readonly KSFT_TEST="${MPTCP_LIB_KSFT_TEST:-$(basename "${0}" .sh)}"
 
 # These variables are used in some selftests, read-only
+declare -rx MPTCP_LIB_EVENT_CREATED=1           # MPTCP_EVENT_CREATED
+declare -rx MPTCP_LIB_EVENT_ESTABLISHED=2       # MPTCP_EVENT_ESTABLISHED
+declare -rx MPTCP_LIB_EVENT_CLOSED=3            # MPTCP_EVENT_CLOSED
 declare -rx MPTCP_LIB_EVENT_ANNOUNCED=6         # MPTCP_EVENT_ANNOUNCED
 declare -rx MPTCP_LIB_EVENT_REMOVED=7           # MPTCP_EVENT_REMOVED
 declare -rx MPTCP_LIB_EVENT_SUB_ESTABLISHED=10  # MPTCP_EVENT_SUB_ESTABLISHED
 declare -rx MPTCP_LIB_EVENT_SUB_CLOSED=11       # MPTCP_EVENT_SUB_CLOSED
+declare -rx MPTCP_LIB_EVENT_SUB_PRIORITY=13     # MPTCP_EVENT_SUB_PRIORITY
 declare -rx MPTCP_LIB_EVENT_LISTENER_CREATED=15 # MPTCP_EVENT_LISTENER_CREATED
 declare -rx MPTCP_LIB_EVENT_LISTENER_CLOSED=16  # MPTCP_EVENT_LISTENER_CLOSED
 
@@ -21,8 +27,11 @@ declare -rx MPTCP_LIB_AF_INET6=10
 
 MPTCP_LIB_SUBTESTS=()
 MPTCP_LIB_SUBTESTS_DUPLICATED=0
+MPTCP_LIB_SUBTEST_FLAKY=0
+MPTCP_LIB_SUBTESTS_LAST_TS_MS=
 MPTCP_LIB_TEST_COUNTER=0
 MPTCP_LIB_TEST_FORMAT="%02u %-50s"
+MPTCP_LIB_IP_MPTCP=0
 
 # only if supported (or forced) and not disabled, see no-color.org
 if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } &&
@@ -40,6 +49,16 @@ else
 	readonly MPTCP_LIB_COLOR_RESET=
 fi
 
+# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors
+# from subtests marked as flaky
+mptcp_lib_override_flaky() {
+	[ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ]
+}
+
+mptcp_lib_subtest_is_flaky() {
+	[ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky
+}
+
 # $1: color, $2: text
 mptcp_lib_print_color() {
 	echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}"
@@ -71,13 +90,43 @@ mptcp_lib_pr_skip() {
 }
 
 mptcp_lib_pr_fail() {
-	mptcp_lib_print_err "[FAIL]${1:+ ${*}}"
+	local title cmt
+
+	if mptcp_lib_subtest_is_flaky; then
+		title="IGNO"
+		cmt=" (flaky)"
+	else
+		title="FAIL"
+	fi
+
+	mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}"
 }
 
 mptcp_lib_pr_info() {
 	mptcp_lib_print_info "INFO: ${*}"
 }
 
+# $1-2: listener/connector ns ; $3 port ; $4-5 listener/connector stat file
+mptcp_lib_pr_err_stats() {
+	local lns="${1}"
+	local cns="${2}"
+	local port="${3}"
+	local lstat="${4}"
+	local cstat="${5}"
+
+	echo -en "${MPTCP_LIB_COLOR_RED}"
+	{
+		printf "\nnetns %s (listener) socket stat for %d:\n" "${lns}" "${port}"
+		ip netns exec "${lns}" ss -Menitam -o "sport = :${port}"
+		cat "${lstat}"
+
+		printf "\nnetns %s (connector) socket stat for %d:\n" "${cns}" "${port}"
+		ip netns exec "${cns}" ss -Menitam -o "dport = :${port}"
+		[ "${lstat}" != "${cstat}" ] && cat "${cstat}"
+	} 1>&2
+	echo -en "${MPTCP_LIB_COLOR_RESET}"
+}
+
 # SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var can be set when validating all
 # features using the last version of the kernel and the selftests to make sure
 # a test is not being skipped by mistake.
@@ -177,6 +226,11 @@ mptcp_lib_kversion_ge() {
 	mptcp_lib_fail_if_expected_feature "kernel version ${1} lower than ${v}"
 }
 
+mptcp_lib_subtests_last_ts_reset() {
+	MPTCP_LIB_SUBTESTS_LAST_TS_MS="$(date +%s%3N)"
+}
+mptcp_lib_subtests_last_ts_reset
+
 __mptcp_lib_result_check_duplicated() {
 	local subtest
 
@@ -191,13 +245,22 @@ __mptcp_lib_result_check_duplicated() {
 
 __mptcp_lib_result_add() {
 	local result="${1}"
+	local time="time="
+	local ts_prev_ms
 	shift
 
 	local id=$((${#MPTCP_LIB_SUBTESTS[@]} + 1))
 
 	__mptcp_lib_result_check_duplicated "${*}"
 
-	MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*}")
+	# not to add two '#'
+	[[ "${*}" != *"#"* ]] && time="# ${time}"
+
+	ts_prev_ms="${MPTCP_LIB_SUBTESTS_LAST_TS_MS}"
+	mptcp_lib_subtests_last_ts_reset
+	time+="$((MPTCP_LIB_SUBTESTS_LAST_TS_MS - ts_prev_ms))ms"
+
+	MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*} ${time}")
 }
 
 # $1: test name
@@ -207,7 +270,13 @@ mptcp_lib_result_pass() {
 
 # $1: test name
 mptcp_lib_result_fail() {
-	__mptcp_lib_result_add "not ok" "${1}"
+	if mptcp_lib_subtest_is_flaky; then
+		# It might sound better to use 'not ok # TODO' or 'ok # SKIP',
+		# but some CIs don't understand 'TODO' and treat SKIP as errors.
+		__mptcp_lib_result_add "ok" "${1} # IGNORE Flaky"
+	else
+		__mptcp_lib_result_add "not ok" "${1}"
+	fi
 }
 
 # $1: test name
@@ -261,12 +330,15 @@ mptcp_lib_result_print_all_tap() {
 
 # get the value of keyword $1 in the line marked by keyword $2
 mptcp_lib_get_info_value() {
-	grep "${2}" | sed -n 's/.*\('"${1}"':\)\([0-9a-f:.]*\).*$/\2/p;q'
+	grep "${2}" 2>/dev/null |
+		sed -n 's/.*\('"${1}"':\)\([0-9a-f:.]*\).*$/\2/p;q'
+		# the ';q' at the end limits to the first matched entry.
 }
 
 # $1: info name ; $2: evts_ns ; [$3: event type; [$4: addr]]
 mptcp_lib_evts_get_info() {
-	grep "${4:-}" "${2}" | mptcp_lib_get_info_value "${1}" "^type:${3:-1},"
+	grep "${4:-}" "${2}" 2>/dev/null |
+		mptcp_lib_get_info_value "${1}" "^type:${3:-1},"
 }
 
 # $1: PID
@@ -334,20 +406,7 @@ mptcp_lib_check_transfer() {
 
 # $1: ns, $2: port
 mptcp_lib_wait_local_port_listen() {
-	local listener_ns="${1}"
-	local port="${2}"
-
-	local port_hex
-	port_hex="$(printf "%04X" "${port}")"
-
-	local _
-	for _ in $(seq 10); do
-		ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
-			awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) \
-			     {rc=0; exit}} END {exit rc}" &&
-			break
-		sleep 0.1
-	done
+	wait_local_port_listen "${@}" "tcp"
 }
 
 mptcp_lib_check_output() {
@@ -384,6 +443,12 @@ mptcp_lib_check_tools() {
 				exit ${KSFT_SKIP}
 			fi
 			;;
+		"tc")
+			if ! tc -help &> /dev/null; then
+				mptcp_lib_pr_skip "Could not run test without tc tool"
+				exit ${KSFT_SKIP}
+			fi
+			;;
 		"ss")
 			if ! ss -h | grep -q MPTCP; then
 				mptcp_lib_pr_skip "ss tool does not support MPTCP"
@@ -405,27 +470,22 @@ mptcp_lib_check_tools() {
 }
 
 mptcp_lib_ns_init() {
-	local sec rndh
-
-	sec=$(date +%s)
-	rndh=$(printf %x "${sec}")-$(mktemp -u XXXXXX)
+	if ! setup_ns "${@}"; then
+		mptcp_lib_pr_fail "Failed to setup namespaces ${*}"
+		exit ${KSFT_FAIL}
+	fi
 
 	local netns
 	for netns in "${@}"; do
-		eval "${netns}=${netns}-${rndh}"
-
-		ip netns add "${!netns}" || exit ${KSFT_SKIP}
-		ip -net "${!netns}" link set lo up
 		ip netns exec "${!netns}" sysctl -q net.mptcp.enabled=1
-		ip netns exec "${!netns}" sysctl -q net.ipv4.conf.all.rp_filter=0
-		ip netns exec "${!netns}" sysctl -q net.ipv4.conf.default.rp_filter=0
 	done
 }
 
 mptcp_lib_ns_exit() {
+	cleanup_ns "${@}"
+
 	local netns
 	for netns in "${@}"; do
-		ip netns del "${netns}"
 		rm -f /tmp/"${netns}".{nstat,out}
 	done
 }
@@ -505,3 +565,131 @@ mptcp_lib_verify_listener_events() {
 	mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}"
 	return "${rc}"
 }
+
+mptcp_lib_set_ip_mptcp() {
+	MPTCP_LIB_IP_MPTCP=1
+}
+
+mptcp_lib_is_ip_mptcp() {
+	[ "${MPTCP_LIB_IP_MPTCP}" = "1" ]
+}
+
+# format: <id>,<ip>,<flags>,<dev>
+mptcp_lib_pm_nl_format_endpoints() {
+	local entry id ip flags dev port
+
+	for entry in "${@}"; do
+		IFS=, read -r id ip flags dev port <<< "${entry}"
+		if mptcp_lib_is_ip_mptcp; then
+			echo -n "${ip}"
+			[ -n "${port}" ] && echo -n " port ${port}"
+			echo -n " id ${id}"
+			[ -n "${flags}" ] && echo -n " ${flags}"
+			[ -n "${dev}" ] && echo -n " dev ${dev}"
+			echo " " # always a space at the end
+		else
+			echo -n "id ${id}"
+			echo -n " flags ${flags//" "/","}"
+			[ -n "${dev}" ] && echo -n " dev ${dev}"
+			echo -n " ${ip}"
+			[ -n "${port}" ] && echo -n " ${port}"
+			echo ""
+		fi
+	done
+}
+
+mptcp_lib_pm_nl_get_endpoint() {
+	local ns=${1}
+	local id=${2}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint show id "${id}"
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl get "${id}"
+	fi
+}
+
+mptcp_lib_pm_nl_set_limits() {
+	local ns=${1}
+	local addrs=${2}
+	local subflows=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}"
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}"
+	fi
+}
+
+mptcp_lib_pm_nl_add_endpoint() {
+	local ns=${1}
+	local addr=${2}
+	local flags dev id port
+	local nr=2
+
+	local p
+	for p in "${@}"; do
+		case "${p}" in
+		"flags" | "dev" | "id" | "port")
+			eval "${p}"=\$"${nr}"
+			;;
+		esac
+
+		nr=$((nr + 1))
+	done
+
+	if mptcp_lib_is_ip_mptcp; then
+		# shellcheck disable=SC2086 # blanks in flags, no double quote
+		ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \
+			${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \
+			${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"}
+	fi
+}
+
+mptcp_lib_pm_nl_del_endpoint() {
+	local ns=${1}
+	local id=${2}
+	local addr=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		[ "${id}" -ne 0 ] && addr=''
+		ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}"
+	fi
+}
+
+mptcp_lib_pm_nl_flush_endpoint() {
+	local ns=${1}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint flush
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl flush
+	fi
+}
+
+mptcp_lib_pm_nl_show_endpoints() {
+	local ns=${1}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint show
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl dump
+	fi
+}
+
+mptcp_lib_pm_nl_change_endpoint() {
+	local ns=${1}
+	local id=${2}
+	local flags=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		# shellcheck disable=SC2086 # blanks in flags, no double quote
+		ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}"
+	fi
+}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
index 926b0be87c99..9934a68df237 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
@@ -159,13 +159,21 @@ static const char *getxinfo_strerr(int err)
 }
 
 static void xgetaddrinfo(const char *node, const char *service,
-			 const struct addrinfo *hints,
+			 struct addrinfo *hints,
 			 struct addrinfo **res)
 {
+again:
 	int err = getaddrinfo(node, service, hints, res);
 
 	if (err) {
-		const char *errstr = getxinfo_strerr(err);
+		const char *errstr;
+
+		if (err == EAI_SOCKTYPE) {
+			hints->ai_protocol = IPPROTO_TCP;
+			goto again;
+		}
+
+		errstr = getxinfo_strerr(err);
 
 		fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
 			node ? node : "", service ? service : "", errstr);
@@ -178,7 +186,7 @@ static int sock_listen_mptcp(const char * const listenaddr,
 {
 	int sock = -1;
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 		.ai_flags = AI_PASSIVE | AI_NUMERICHOST
 	};
@@ -223,7 +231,7 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 			      const char * const port, int proto)
 {
 	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
+		.ai_protocol = IPPROTO_MPTCP,
 		.ai_socktype = SOCK_STREAM,
 	};
 	struct addrinfo *a, *addr;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index e2d70c18786e..418a903c3a4d 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -22,6 +22,28 @@ ns1=""
 ns2=""
 ns_sbox=""
 
+usage() {
+	echo "Usage: $0 [ -i ] [ -h ]"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
+	echo -e "\t-h: help"
+}
+
+while getopts "hi" option;do
+	case "$option" in
+	"h")
+		usage "$0"
+		exit ${KSFT_PASS}
+		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
+	"?")
+		usage "$0"
+		exit ${KSFT_FAIL}
+		;;
+	esac
+done
+
 add_mark_rules()
 {
 	local ns=$1
@@ -58,15 +80,15 @@ init()
 		# let $ns2 reach any $ns1 address from any interface
 		ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
 
-		ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal
-		ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal
 
-		ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal
-		ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal
 	done
 
-	ip netns exec $ns1 ./pm_nl_ctl limits 8 8
-	ip netns exec $ns2 ./pm_nl_ctl limits 8 8
+	mptcp_lib_pm_nl_set_limits "${ns1}" 8 8
+	mptcp_lib_pm_nl_set_limits "${ns2}" 8 8
 
 	add_mark_rules $ns1 1
 	add_mark_rules $ns2 2
@@ -147,6 +169,11 @@ do_transfer()
 		cmsg+=",TCPINQ"
 	fi
 
+	NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
+		nstat -n
+	NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \
+		nstat -n
+
 	timeout ${timeout_test} \
 		ip netns exec ${listener_ns} \
 			$mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c "${cmsg}" \
@@ -167,14 +194,16 @@ do_transfer()
 	wait $spid
 	local rets=$?
 
+	NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
+		nstat | grep Tcp > /tmp/${listener_ns}.out
+	NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \
+		nstat | grep Tcp > /tmp/${connector_ns}.out
+
 	print_title "Transfer ${ip:2}"
 	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
 		mptcp_lib_pr_fail "client exit code $retc, server $rets"
-		echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port"
-
-		echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
-		ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
+		mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \
+			"/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out"
 
 		mptcp_lib_result_fail "transfer ${ip}"
 
@@ -327,6 +356,7 @@ init
 make_file "$cin" "client" 1
 make_file "$sin" "server" 1
 trap cleanup EXIT
+mptcp_lib_subtests_last_ts_reset
 
 run_tests $ns1 $ns2 10.0.1.1
 run_tests $ns1 $ns2 dead:beef:1::1
diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh
index 6ab8c5d36340..2e6648a2b2c0 100755
--- a/tools/testing/selftests/net/mptcp/pm_netlink.sh
+++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
@@ -1,28 +1,28 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-# Double quotes to prevent globbing and word splitting is recommended in new
-# code but we accept it, especially because there were too many before having
-# address all other issues detected by shellcheck.
-#shellcheck disable=SC2086
-
 . "$(dirname "${0}")/mptcp_lib.sh"
 
 ret=0
 
 usage() {
-	echo "Usage: $0 [ -h ]"
+	echo "Usage: $0 [ -i ] [ -h ]"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
+	echo -e "\t-h: help"
 }
 
-optstring=h
+optstring=hi
 while getopts "$optstring" option;do
 	case "$option" in
 	"h")
-		usage $0
+		usage "$0"
 		exit ${KSFT_PASS}
 		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
 	"?")
-		usage $0
+		usage "$0"
 		exit ${KSFT_FAIL}
 		;;
 	esac
@@ -35,7 +35,7 @@ err=$(mktemp)
 #shellcheck disable=SC2317
 cleanup()
 {
-	rm -f $err
+	rm -f "${err}"
 	mptcp_lib_ns_exit "${ns1}"
 }
 
@@ -46,6 +46,76 @@ trap cleanup EXIT
 
 mptcp_lib_ns_init ns1
 
+format_limits() {
+	local accept="${1}"
+	local subflows="${2}"
+
+	if mptcp_lib_is_ip_mptcp; then
+		# with a space at the end
+		printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}"
+	else
+		printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}"
+	fi
+}
+
+get_limits() {
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns1}" mptcp limits
+	else
+		ip netns exec "${ns1}" ./pm_nl_ctl limits
+	fi
+}
+
+format_endpoints() {
+	mptcp_lib_pm_nl_format_endpoints "${@}"
+}
+
+get_endpoint() {
+	# shellcheck disable=SC2317 # invoked indirectly
+	mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}"
+}
+
+change_address() {
+	local addr=${1}
+	local flags=${2}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}"
+	else
+		ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}"
+	fi
+}
+
+set_limits()
+{
+	mptcp_lib_pm_nl_set_limits "${ns1}" "${@}"
+}
+
+add_endpoint()
+{
+	mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}"
+}
+
+del_endpoint()
+{
+	mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}"
+}
+
+flush_endpoint()
+{
+	mptcp_lib_pm_nl_flush_endpoint "${ns1}"
+}
+
+show_endpoints()
+{
+	mptcp_lib_pm_nl_show_endpoints "${ns1}"
+}
+
+change_endpoint()
+{
+	mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}"
+}
+
 check()
 {
 	local cmd="$1"
@@ -67,125 +137,128 @@ check()
 	fi
 }
 
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list"
+mptcp_lib_subtests_last_ts_reset
+
+check "show_endpoints" "" "defaults addr list"
 
-default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)"
+default_limits="$(get_limits)"
 if mptcp_lib_expect_all_features; then
-	check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
-subflows 2" "defaults limits"
+	check "get_limits" "$(format_limits 0 2)" "defaults limits"
 fi
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup
-check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags  10.0.1.1" "simple add/get addr"
+add_endpoint 10.0.1.1
+add_endpoint 10.0.1.2 flags subflow dev lo
+add_endpoint 10.0.1.3 flags signal,backup
+check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr"
 
-check "ip netns exec $ns1 ./pm_nl_ctl dump" \
-"id 1 flags  10.0.1.1
-id 2 flags subflow dev lo 10.0.1.2
-id 3 flags signal,backup 10.0.1.3" "dump addrs"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "2,10.0.1.2,subflow,lo" \
+			    "3,10.0.1.3,signal backup")" "dump addrs"
 
-ip netns exec $ns1 ./pm_nl_ctl del 2
-check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr"
-check "ip netns exec $ns1 ./pm_nl_ctl dump" \
-"id 1 flags  10.0.1.1
-id 3 flags signal,backup 10.0.1.3" "dump addrs after del"
+del_endpoint 2
+check "get_endpoint 2" "" "simple del addr"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "3,10.0.1.3,signal backup")" "dump addrs after del"
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr"
+add_endpoint 10.0.1.3 2>/dev/null
+check "get_endpoint 4" "" "duplicate addr"
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal
-check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment"
+add_endpoint 10.0.1.4 flags signal
+check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment"
 
 for i in $(seq 5 9); do
-	ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1
+	add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1
 done
-check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit"
-check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit"
+check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit"
+check "get_endpoint 10" "" "above hard addr limit"
 
-ip netns exec $ns1 ./pm_nl_ctl del 9
+del_endpoint 9
 for i in $(seq 10 255); do
-	ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i
-	ip netns exec $ns1 ./pm_nl_ctl del $i
+	add_endpoint 10.0.0.9 id "${i}"
+	del_endpoint "${i}"
 done
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.1.1
-id 3 flags signal,backup 10.0.1.3
-id 4 flags signal 10.0.1.4
-id 5 flags signal 10.0.1.5
-id 6 flags signal 10.0.1.6
-id 7 flags signal 10.0.1.7
-id 8 flags signal 10.0.1.8" "id limit"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 8 8
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8
-subflows 8" "set limits"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.1.1
-id 2 flags  10.0.1.2
-id 3 flags  10.0.1.7
-id 4 flags  10.0.1.8
-id 100 flags  10.0.1.3
-id 101 flags  10.0.1.4
-id 254 flags  10.0.1.5
-id 255 flags  10.0.1.6" "set ids"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.0.1
-id 2 flags  10.0.0.4
-id 3 flags  10.0.0.6
-id 4 flags  10.0.0.7
-id 5 flags  10.0.0.8
-id 253 flags  10.0.0.5
-id 254 flags  10.0.0.2
-id 255 flags  10.0.0.3" "wrap-around ids"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow
-ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,backup 10.0.1.1" "set flags (backup)"
-ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow 10.0.1.1" "          (nobackup)"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "3,10.0.1.3,signal backup" \
+			    "4,10.0.1.4,signal" \
+			    "5,10.0.1.5,signal" \
+			    "6,10.0.1.6,signal" \
+			    "7,10.0.1.7,signal" \
+			    "8,10.0.1.8,signal")" "id limit"
+
+flush_endpoint
+check "show_endpoints" "" "flush addrs"
+
+set_limits 9 1 2>/dev/null
+check "get_limits" "${default_limits}" "rcv addrs above hard limit"
+
+set_limits 1 9 2>/dev/null
+check "get_limits" "${default_limits}" "subflows above hard limit"
+
+set_limits 8 8
+check "get_limits" "$(format_limits 8 8)" "set limits"
+
+flush_endpoint
+add_endpoint 10.0.1.1
+add_endpoint 10.0.1.2
+add_endpoint 10.0.1.3 id 100
+add_endpoint 10.0.1.4
+add_endpoint 10.0.1.5 id 254
+add_endpoint 10.0.1.6
+add_endpoint 10.0.1.7
+add_endpoint 10.0.1.8
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "2,10.0.1.2" \
+			    "3,10.0.1.7" \
+			    "4,10.0.1.8" \
+			    "100,10.0.1.3" \
+			    "101,10.0.1.4" \
+			    "254,10.0.1.5" \
+			    "255,10.0.1.6")" "set ids"
+
+flush_endpoint
+add_endpoint 10.0.0.1
+add_endpoint 10.0.0.2 id 254
+add_endpoint 10.0.0.3
+add_endpoint 10.0.0.4
+add_endpoint 10.0.0.5 id 253
+add_endpoint 10.0.0.6
+add_endpoint 10.0.0.7
+add_endpoint 10.0.0.8
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.0.1" \
+			    "2,10.0.0.4" \
+			    "3,10.0.0.6" \
+			    "4,10.0.0.7" \
+			    "5,10.0.0.8" \
+			    "253,10.0.0.5" \
+			    "254,10.0.0.2" \
+			    "255,10.0.0.3")" "wrap-around ids"
+
+flush_endpoint
+add_endpoint 10.0.1.1 flags subflow
+change_address 10.0.1.1 backup
+check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \
+	"set flags (backup)"
+change_address 10.0.1.1 nobackup
+check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \
+	"          (nobackup)"
 
 # fullmesh support has been added later
-ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null
-if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" ||
+change_endpoint 1 fullmesh 2>/dev/null
+if show_endpoints | grep -q "fullmesh" ||
    mptcp_lib_expect_all_features; then
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,fullmesh 10.0.1.1" "          (fullmesh)"
-	ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow 10.0.1.1" "          (nofullmesh)"
-	ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,backup,fullmesh 10.0.1.1" "          (backup,fullmesh)"
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \
+		"          (fullmesh)"
+	change_endpoint 1 nofullmesh
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \
+		"          (nofullmesh)"
+	change_endpoint 1 backup,fullmesh
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \
+		"          (backup,fullmesh)"
 else
 	for st in fullmesh nofullmesh backup,fullmesh; do
 		st="          (${st})"
diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
index 7426a2cbd4a0..994a556f46c1 100644
--- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
+++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
@@ -19,12 +19,6 @@
 
 #include "linux/mptcp.h"
 
-#ifndef MPTCP_PM_NAME
-#define MPTCP_PM_NAME		"mptcp_pm"
-#endif
-#ifndef MPTCP_PM_EVENTS
-#define MPTCP_PM_EVENTS		"mptcp_pm_events"
-#endif
 #ifndef IPPROTO_MPTCP
 #define IPPROTO_MPTCP 262
 #endif
@@ -116,7 +110,7 @@ static int capture_events(int fd, int event_group)
 
 	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
 		       &event_group, sizeof(event_group)) < 0)
-		error(1, errno, "could not join the " MPTCP_PM_EVENTS " mcast group");
+		error(1, errno, "could not join the " MPTCP_PM_EV_GRP_NAME " mcast group");
 
 	do {
 		FD_ZERO(&rfds);
@@ -288,7 +282,7 @@ static int genl_parse_getfamily(struct nlmsghdr *nlh, int *pm_family,
 					if (grp->rta_type == CTRL_ATTR_MCAST_GRP_ID)
 						*events_mcast_grp = *(__u32 *)RTA_DATA(grp);
 					else if (grp->rta_type == CTRL_ATTR_MCAST_GRP_NAME &&
-						 !strcmp(RTA_DATA(grp), MPTCP_PM_EVENTS))
+						 !strcmp(RTA_DATA(grp), MPTCP_PM_EV_GRP_NAME))
 						got_events_grp = 1;
 
 					grp = RTA_NEXT(grp, grp_len);
@@ -1276,7 +1270,7 @@ int add_listener(int argc, char *argv[])
 	struct sockaddr_storage addr;
 	struct sockaddr_in6 *a6;
 	struct sockaddr_in *a4;
-	u_int16_t family;
+	u_int16_t family = AF_UNSPEC;
 	int enable = 1;
 	int sock;
 	int err;
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 1b2366220388..2329c2f8519b 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -27,10 +27,11 @@ capout=""
 size=0
 
 usage() {
-	echo "Usage: $0 [ -b ] [ -c ] [ -d ]"
-	echo -e "\t-b: bail out after first error, otherwise runs al testcases"
+	echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]"
+	echo -e "\t-b: bail out after first error, otherwise runs all testcases"
 	echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
 	echo -e "\t-d: debug this script"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
 }
 
 # This function is used in the cleanup trap
@@ -45,7 +46,7 @@ cleanup()
 }
 
 mptcp_lib_check_mptcp
-mptcp_lib_check_tools ip
+mptcp_lib_check_tools ip tc
 
 #  "$ns1"              ns2                    ns3
 #     ns1eth1    ns2eth1   ns2eth3      ns3eth1
@@ -85,8 +86,8 @@ setup()
 	ip -net "$ns1" route add default via 10.0.2.2 metric 101
 	ip -net "$ns1" route add default via dead:beef:2::2 metric 101
 
-	ip netns exec "$ns1" ./pm_nl_ctl limits 1 1
-	ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow
+	mptcp_lib_pm_nl_set_limits "${ns1}" 1 1
+	mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow
 
 	ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
 	ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
@@ -108,7 +109,7 @@ setup()
 	ip -net "$ns3" route add default via 10.0.3.2
 	ip -net "$ns3" route add default via dead:beef:3::2
 
-	ip netns exec "$ns3" ./pm_nl_ctl limits 1 1
+	mptcp_lib_pm_nl_set_limits "${ns3}" 1 1
 
 	# debug build can slow down measurably the test program
 	# we use quite tight time limit on the run-time, to ensure
@@ -154,6 +155,11 @@ do_transfer()
 		sleep 1
 	fi
 
+	NSTAT_HISTORY=/tmp/${ns3}.nstat ip netns exec ${ns3} \
+		nstat -n
+	NSTAT_HISTORY=/tmp/${ns1}.nstat ip netns exec ${ns1} \
+		nstat -n
+
 	timeout ${timeout_test} \
 		ip netns exec ${ns3} \
 			./mptcp_connect -jt ${timeout_poll} -l -p $port -T $max_time \
@@ -179,25 +185,27 @@ do_transfer()
 		kill ${cappid_connector}
 	fi
 
+	NSTAT_HISTORY=/tmp/${ns3}.nstat ip netns exec ${ns3} \
+		nstat | grep Tcp > /tmp/${ns3}.out
+	NSTAT_HISTORY=/tmp/${ns1}.nstat ip netns exec ${ns1} \
+		nstat | grep Tcp > /tmp/${ns1}.out
+
 	cmp $sin $cout > /dev/null 2>&1
 	local cmps=$?
 	cmp $cin $sout > /dev/null 2>&1
 	local cmpc=$?
 
-	printf "%-16s" " max $max_time "
 	if [ $retc -eq 0 ] && [ $rets -eq 0 ] && \
 	   [ $cmpc -eq 0 ] && [ $cmps -eq 0 ]; then
+		printf "%-16s" " max $max_time "
 		mptcp_lib_pr_ok
 		cat "$capout"
 		return 0
 	fi
 
-	mptcp_lib_pr_fail
-	echo "client exit code $retc, server $rets" 1>&2
-	echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
-	ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
-	echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
-	ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+	mptcp_lib_pr_fail "client exit code $retc, server $rets"
+	mptcp_lib_pr_err_stats "${ns3}" "${ns1}" "${port}" \
+		"/tmp/${ns3}.out" "/tmp/${ns1}.out"
 	ls -l $sin $cout
 	ls -l $cin $sout
 
@@ -216,8 +224,8 @@ run_test()
 	shift 4
 	local msg=$*
 
-	[ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1=""
-	[ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2=""
+	[ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1=""
+	[ $delay2 -gt 0 ] && delay2="delay ${delay2}ms" || delay2=""
 
 	for dev in ns1eth1 ns1eth2; do
 		tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1
@@ -243,7 +251,7 @@ run_test()
 	do_transfer $small $large $time
 	lret=$?
 	mptcp_lib_result_code "${lret}" "${msg}"
-	if [ $lret -ne 0 ]; then
+	if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then
 		ret=$lret
 		[ $bail -eq 0 ] || exit $ret
 	fi
@@ -253,13 +261,13 @@ run_test()
 	do_transfer $large $small $time
 	lret=$?
 	mptcp_lib_result_code "${lret}" "${msg}"
-	if [ $lret -ne 0 ]; then
+	if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then
 		ret=$lret
 		[ $bail -eq 0 ] || exit $ret
 	fi
 }
 
-while getopts "bcdh" option;do
+while getopts "bcdhi" option;do
 	case "$option" in
 	"h")
 		usage $0
@@ -274,6 +282,9 @@ while getopts "bcdh" option;do
 	"d")
 		set -x
 		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
 	"?")
 		usage $0
 		exit ${KSFT_FAIL}
@@ -282,11 +293,12 @@ while getopts "bcdh" option;do
 done
 
 setup
+mptcp_lib_subtests_last_ts_reset
 run_test 10 10 0 0 "balanced bwidth"
 run_test 10 10 1 25 "balanced bwidth with unbalanced delay"
 
 # we still need some additional infrastructure to pass the following test-cases
-run_test 10 3 0 0 "unbalanced bwidth"
+MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 "unbalanced bwidth"
 run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay"
 run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay"
 
diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index 9e2981f2d7f5..333064b0b5ac 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -117,7 +117,36 @@ cleanup()
 trap cleanup EXIT
 
 # Create and configure network namespaces for testing
+print_title "Init"
 mptcp_lib_ns_init ns1 ns2
+
+# check path_manager and pm_type sysctl mapping
+if [ -f /proc/sys/net/mptcp/path_manager ]; then
+	ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=userspace
+	pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)"
+	if [ "${pm_type}" != "1" ]; then
+		test_fail "unexpected pm_type: ${pm_type}"
+		mptcp_lib_result_print_all_tap
+		exit ${KSFT_FAIL}
+	fi
+
+	ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=error 2>/dev/null
+	pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)"
+	if [ "${pm_type}" != "1" ]; then
+		test_fail "unexpected pm_type after error: ${pm_type}"
+		mptcp_lib_result_print_all_tap
+		exit ${KSFT_FAIL}
+	fi
+
+	ip netns exec "$ns1" sysctl -q net.mptcp.pm_type=0
+	pm_name="$(ip netns exec "$ns1" sysctl -n net.mptcp.path_manager)"
+	if [ "${pm_name}" != "kernel" ]; then
+		test_fail "unexpected path-manager: ${pm_name}"
+		mptcp_lib_result_print_all_tap
+		exit ${KSFT_FAIL}
+	fi
+fi
+
 for i in "$ns1" "$ns2" ;do
 	ip netns exec "$i" sysctl -q net.mptcp.pm_type=1
 done
@@ -150,8 +179,8 @@ mptcp_lib_events "${ns2}" "${client_evts}" client_evts_pid
 server_evts=$(mktemp)
 mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid
 sleep 0.5
+mptcp_lib_subtests_last_ts_reset
 
-print_title "Init"
 print_test "Created network namespaces ns1, ns2"
 test_pass
 
@@ -160,10 +189,12 @@ make_connection()
 	local is_v6=$1
 	local app_port=$app4_port
 	local connect_addr="10.0.1.1"
+	local client_addr="10.0.1.2"
 	local listen_addr="0.0.0.0"
 	if [ "$is_v6" = "v6" ]
 	then
 		connect_addr="dead:beef:1::1"
+		client_addr="dead:beef:1::2"
 		listen_addr="::"
 		app_port=$app6_port
 	else
@@ -206,6 +237,7 @@ make_connection()
 		   [ "$server_serverside" = 1 ]
 	then
 		test_pass
+		print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}"
 	else
 		test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})"
 		mptcp_lib_result_print_all_tap
@@ -297,7 +329,7 @@ test_announce()
 	ip netns exec "$ns2"\
 	   ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id $client_addr_id dev\
 	   ns2eth1
-	print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, reuse port"
+	print_test "ADD_ADDR id:client 10.0.2.2 (ns2) => ns1, reuse port"
 	sleep 0.5
 	verify_announce_event $server_evts $ANNOUNCED $server4_token "10.0.2.2" $client_addr_id \
 			      "$client4_port"
@@ -306,7 +338,7 @@ test_announce()
 	:>"$server_evts"
 	ip netns exec "$ns2" ./pm_nl_ctl ann\
 	   dead:beef:2::2 token "$client6_token" id $client_addr_id dev ns2eth1
-	print_test "ADD_ADDR6 id:${client_addr_id} dead:beef:2::2 (ns2) => ns1, reuse port"
+	print_test "ADD_ADDR6 id:client dead:beef:2::2 (ns2) => ns1, reuse port"
 	sleep 0.5
 	verify_announce_event "$server_evts" "$ANNOUNCED" "$server6_token" "dead:beef:2::2"\
 			      "$client_addr_id" "$client6_port" "v6"
@@ -316,7 +348,7 @@ test_announce()
 	client_addr_id=$((client_addr_id+1))
 	ip netns exec "$ns2" ./pm_nl_ctl ann 10.0.2.2 token "$client4_token" id\
 	   $client_addr_id dev ns2eth1 port $new4_port
-	print_test "ADD_ADDR id:${client_addr_id} 10.0.2.2 (ns2) => ns1, new port"
+	print_test "ADD_ADDR id:client+1 10.0.2.2 (ns2) => ns1, new port"
 	sleep 0.5
 	verify_announce_event "$server_evts" "$ANNOUNCED" "$server4_token" "10.0.2.2"\
 			      "$client_addr_id" "$new4_port"
@@ -327,7 +359,7 @@ test_announce()
 	# ADD_ADDR from the server to client machine reusing the subflow port
 	ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\
 	   $server_addr_id dev ns1eth2
-	print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port"
+	print_test "ADD_ADDR id:server 10.0.2.1 (ns1) => ns2, reuse port"
 	sleep 0.5
 	verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\
 			      "$server_addr_id" "$app4_port"
@@ -336,7 +368,7 @@ test_announce()
 	:>"$client_evts"
 	ip netns exec "$ns1" ./pm_nl_ctl ann dead:beef:2::1 token "$server6_token" id\
 	   $server_addr_id dev ns1eth2
-	print_test "ADD_ADDR6 id:${server_addr_id} dead:beef:2::1 (ns1) => ns2, reuse port"
+	print_test "ADD_ADDR6 id:server dead:beef:2::1 (ns1) => ns2, reuse port"
 	sleep 0.5
 	verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "dead:beef:2::1"\
 			      "$server_addr_id" "$app6_port" "v6"
@@ -346,7 +378,7 @@ test_announce()
 	server_addr_id=$((server_addr_id+1))
 	ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server4_token" id\
 	   $server_addr_id dev ns1eth2 port $new4_port
-	print_test "ADD_ADDR id:${server_addr_id} 10.0.2.1 (ns1) => ns2, new port"
+	print_test "ADD_ADDR id:server+1 10.0.2.1 (ns1) => ns2, new port"
 	sleep 0.5
 	verify_announce_event "$client_evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\
 			      "$server_addr_id" "$new4_port"
@@ -380,7 +412,7 @@ test_remove()
 	local invalid_token=$(( client4_token - 1 ))
 	ip netns exec "$ns2" ./pm_nl_ctl rem token $invalid_token id\
 	   $client_addr_id > /dev/null 2>&1
-	print_test "RM_ADDR id:${client_addr_id} ns2 => ns1, invalid token"
+	print_test "RM_ADDR id:client ns2 => ns1, invalid token"
 	local type
 	type=$(mptcp_lib_evts_get_info type "$server_evts")
 	if [ "$type" = "" ]
@@ -394,7 +426,7 @@ test_remove()
 	local invalid_id=$(( client_addr_id + 1 ))
 	ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\
 	   $invalid_id > /dev/null 2>&1
-	print_test "RM_ADDR id:${invalid_id} ns2 => ns1, invalid id"
+	print_test "RM_ADDR id:client+1 ns2 => ns1, invalid id"
 	type=$(mptcp_lib_evts_get_info type "$server_evts")
 	if [ "$type" = "" ]
 	then
@@ -407,7 +439,7 @@ test_remove()
 	:>"$server_evts"
 	ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\
 	   $client_addr_id
-	print_test "RM_ADDR id:${client_addr_id} ns2 => ns1"
+	print_test "RM_ADDR id:client ns2 => ns1"
 	sleep 0.5
 	verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id"
 
@@ -416,7 +448,7 @@ test_remove()
 	client_addr_id=$(( client_addr_id - 1 ))
 	ip netns exec "$ns2" ./pm_nl_ctl rem token "$client4_token" id\
 	   $client_addr_id
-	print_test "RM_ADDR id:${client_addr_id} ns2 => ns1"
+	print_test "RM_ADDR id:client-1 ns2 => ns1"
 	sleep 0.5
 	verify_remove_event "$server_evts" "$REMOVED" "$server4_token" "$client_addr_id"
 
@@ -424,7 +456,7 @@ test_remove()
 	:>"$server_evts"
 	ip netns exec "$ns2" ./pm_nl_ctl rem token "$client6_token" id\
 	   $client_addr_id
-	print_test "RM_ADDR6 id:${client_addr_id} ns2 => ns1"
+	print_test "RM_ADDR6 id:client-1 ns2 => ns1"
 	sleep 0.5
 	verify_remove_event "$server_evts" "$REMOVED" "$server6_token" "$client_addr_id"
 
@@ -434,7 +466,7 @@ test_remove()
 	# RM_ADDR from the server to client machine
 	ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\
 	   $server_addr_id
-	print_test "RM_ADDR id:${server_addr_id} ns1 => ns2"
+	print_test "RM_ADDR id:server ns1 => ns2"
 	sleep 0.5
 	verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id"
 
@@ -443,7 +475,7 @@ test_remove()
 	server_addr_id=$(( server_addr_id - 1 ))
 	ip netns exec "$ns1" ./pm_nl_ctl rem token "$server4_token" id\
 	   $server_addr_id
-	print_test "RM_ADDR id:${server_addr_id} ns1 => ns2"
+	print_test "RM_ADDR id:server-1 ns1 => ns2"
 	sleep 0.5
 	verify_remove_event "$client_evts" "$REMOVED" "$client4_token" "$server_addr_id"
 
@@ -451,7 +483,7 @@ test_remove()
 	:>"$client_evts"
 	ip netns exec "$ns1" ./pm_nl_ctl rem token "$server6_token" id\
 	   $server_addr_id
-	print_test "RM_ADDR6 id:${server_addr_id} ns1 => ns2"
+	print_test "RM_ADDR6 id:server-1 ns1 => ns2"
 	sleep 0.5
 	verify_remove_event "$client_evts" "$REMOVED" "$client6_token" "$server_addr_id"
 }
@@ -479,8 +511,14 @@ verify_subflow_events()
 	local locid
 	local remid
 	local info
+	local e_dport_txt
+
+	# only display the fixed ports
+	if [ "${e_dport}" -ge "${app4_port}" ] && [ "${e_dport}" -le "${app6_port}" ]; then
+		e_dport_txt=":${e_dport}"
+	fi
 
-	info="${e_saddr} (${e_from}) => ${e_daddr}:${e_dport} (${e_to})"
+	info="${e_saddr} (${e_from}) => ${e_daddr}${e_dport_txt} (${e_to})"
 
 	if [ "$e_type" = "$SUB_ESTABLISHED" ]
 	then
@@ -766,7 +804,7 @@ test_subflows_v4_v6_mix()
 	:>"$client_evts"
 	ip netns exec "$ns1" ./pm_nl_ctl ann 10.0.2.1 token "$server6_token" id\
 	   $server_addr_id dev ns1eth2
-	print_test "ADD_ADDR4 id:${server_addr_id} 10.0.2.1 (ns1) => ns2, reuse port"
+	print_test "ADD_ADDR4 id:server 10.0.2.1 (ns1) => ns2, reuse port"
 	sleep 0.5
 	verify_announce_event "$client_evts" "$ANNOUNCED" "$client6_token" "10.0.2.1"\
 			      "$server_addr_id" "$app6_port"
@@ -861,7 +899,7 @@ test_listener()
 	local listener_pid=$!
 
 	sleep 0.5
-	print_test "CREATE_LISTENER 10.0.2.2:$client4_port"
+	print_test "CREATE_LISTENER 10.0.2.2 (client port)"
 	verify_listener_events $client_evts $LISTENER_CREATED $AF_INET 10.0.2.2 $client4_port
 
 	# ADD_ADDR from client to server machine reusing the subflow port
@@ -878,13 +916,14 @@ test_listener()
 	mptcp_lib_kill_wait $listener_pid
 
 	sleep 0.5
-	print_test "CLOSE_LISTENER 10.0.2.2:$client4_port"
+	print_test "CLOSE_LISTENER 10.0.2.2 (client port)"
 	verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port
 }
 
 print_title "Make connections"
 make_connection
 make_connection "v6"
+print_title "Will be using address IDs ${client_addr_id} (client) and ${server_addr_id} (server)"
 
 test_announce
 test_remove
diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
index bdc03a2097e8..7ea5fb28c93d 100644
--- a/tools/testing/selftests/net/msg_zerocopy.c
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -85,6 +85,7 @@ static bool cfg_rx;
 static int  cfg_runtime_ms	= 4200;
 static int  cfg_verbose;
 static int  cfg_waittime_ms	= 500;
+static int  cfg_notification_limit = 32;
 static bool cfg_zerocopy;
 
 static socklen_t cfg_alen;
@@ -95,6 +96,7 @@ static char payload[IP_MAXPACKET];
 static long packets, bytes, completions, expected_completions;
 static int  zerocopied = -1;
 static uint32_t next_completion;
+static uint32_t sends_since_notify;
 
 static unsigned long gettimeofday_ms(void)
 {
@@ -208,6 +210,7 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
 		error(1, errno, "send");
 	if (cfg_verbose && ret != len)
 		fprintf(stderr, "send: ret=%u != %u\n", ret, len);
+	sends_since_notify++;
 
 	if (len) {
 		packets++;
@@ -435,7 +438,7 @@ static bool do_recv_completion(int fd, int domain)
 	/* Detect notification gaps. These should not happen often, if at all.
 	 * Gaps can occur due to drops, reordering and retransmissions.
 	 */
-	if (lo != next_completion)
+	if (cfg_verbose && lo != next_completion)
 		fprintf(stderr, "gap: %u..%u does not append to %u\n",
 			lo, hi, next_completion);
 	next_completion = hi + 1;
@@ -460,6 +463,7 @@ static bool do_recv_completion(int fd, int domain)
 static void do_recv_completions(int fd, int domain)
 {
 	while (do_recv_completion(fd, domain)) {}
+	sends_since_notify = 0;
 }
 
 /* Wait for all remaining completions on the errqueue */
@@ -549,6 +553,9 @@ static void do_tx(int domain, int type, int protocol)
 		else
 			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
 
+		if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit)
+			do_recv_completions(fd, domain);
+
 		while (!do_poll(fd, POLLOUT)) {
 			if (cfg_zerocopy)
 				do_recv_completions(fd, domain);
@@ -708,7 +715,7 @@ static void parse_opts(int argc, char **argv)
 
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
+	while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) {
 		switch (c) {
 		case '4':
 			if (cfg_family != PF_UNSPEC)
@@ -736,6 +743,9 @@ static void parse_opts(int argc, char **argv)
 			if (cfg_ifindex == 0)
 				error(1, errno, "invalid iface: %s", optarg);
 			break;
+		case 'l':
+			cfg_notification_limit = strtoul(optarg, NULL, 0);
+			break;
 		case 'm':
 			cfg_cork_mixed = true;
 			break;
diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.bpf.c
index ac54c36b25fc..ac54c36b25fc 100644
--- a/tools/testing/selftests/net/nat6to4.c
+++ b/tools/testing/selftests/net/nat6to4.bpf.c
diff --git a/tools/testing/selftests/net/net_helper.sh b/tools/testing/selftests/net/net_helper.sh
deleted file mode 100644
index 6596fe03c77f..000000000000
--- a/tools/testing/selftests/net/net_helper.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Helper functions
-
-wait_local_port_listen()
-{
-	local listener_ns="${1}"
-	local port="${2}"
-	local protocol="${3}"
-	local pattern
-	local i
-
-	pattern=":$(printf "%04X" "${port}") "
-
-	# for tcp protocol additionally check the socket state
-	[ ${protocol} = "tcp" ] && pattern="${pattern}0A"
-	for i in $(seq 10); do
-		if ip netns exec "${listener_ns}" awk '{print $2" "$4}' \
-		   /proc/net/"${protocol}"* | grep -q "${pattern}"; then
-			break
-		fi
-		sleep 0.1
-	done
-}
diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh
index e3afcb424710..438f7b2acc5f 100755
--- a/tools/testing/selftests/net/netdevice.sh
+++ b/tools/testing/selftests/net/netdevice.sh
@@ -67,8 +67,12 @@ kci_net_setup()
 		return $ksft_skip
 	fi
 
-	# TODO what ipaddr to set ? DHCP ?
-	echo "SKIP: $netdev: set IP address"
+	if [ "$veth_created" ]; then
+		echo "XFAIL: $netdev: set IP address unsupported for veth*"
+	else
+		# TODO what ipaddr to set ? DHCP ?
+		echo "SKIP: $netdev: set IP address"
+	fi
 	return $ksft_skip
 }
 
@@ -86,7 +90,7 @@ kci_netdev_ethtool_test()
 	ret=$?
 	if [ $ret -ne 0 ];then
 		if [ $ret -eq "$1" ];then
-			echo "SKIP: $netdev: ethtool $2 not supported"
+			echo "XFAIL: $netdev: ethtool $2 not supported"
 			return $ksft_skip
 		else
 			echo "FAIL: $netdev: ethtool $2"
@@ -124,11 +128,45 @@ kci_netdev_ethtool()
 		return 1
 	fi
 	echo "PASS: $netdev: ethtool list features"
-	#TODO for each non fixed features, try to turn them on/off
+
+	while read -r FEATURE VALUE FIXED; do
+		[ "$FEATURE" != "Features" ] || continue # Skip "Features"
+		[ "$FIXED" != "[fixed]" ] || continue # Skip fixed features
+		feature="${FEATURE%:*}"
+
+		ethtool --offload "$netdev" "$feature" off
+		if [ $? -eq 0 ]; then
+			echo "PASS: $netdev: Turned off feature: $feature"
+		else
+			echo "FAIL: $netdev: Failed to turn off feature:" \
+				"$feature"
+		fi
+
+		ethtool --offload "$netdev" "$feature" on
+		if [ $? -eq 0 ]; then
+			echo "PASS: $netdev: Turned on feature: $feature"
+		else
+			echo "FAIL: $netdev: Failed to turn on feature:" \
+				"$feature"
+		fi
+
+		#restore the feature to its initial state
+		ethtool --offload "$netdev" "$feature" "$VALUE"
+		if [ $? -eq 0 ]; then
+			echo "PASS: $netdev: Restore feature $feature" \
+				"to initial state $VALUE"
+		else
+			echo "FAIL: $netdev: Failed to restore feature" \
+				"$feature to initial state $VALUE"
+		fi
+
+	done < "$TMP_ETHTOOL_FEATURES"
+
 	rm "$TMP_ETHTOOL_FEATURES"
 
 	kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev"
 	kci_netdev_ethtool_test 94 'stats' "ethtool -S $netdev"
+
 	return 0
 }
 
@@ -196,10 +234,24 @@ if [ ! -e "$TMP_LIST_NETDEV" ];then
 fi
 
 ip link show |grep '^[0-9]' | grep -oE '[[:space:]].*eth[0-9]*:|[[:space:]].*enp[0-9]s[0-9]:' | cut -d\  -f2 | cut -d: -f1> "$TMP_LIST_NETDEV"
+
+if [ ! -s "$TMP_LIST_NETDEV" ]; then
+	echo "No valid network device found, creating veth pair"
+	ip link add veth0 type veth peer name veth1
+	echo "veth0" > "$TMP_LIST_NETDEV"
+	veth_created=1
+fi
+
 while read netdev
 do
 	kci_test_netdev "$netdev"
 done < "$TMP_LIST_NETDEV"
 
+#clean up veth interface pair if it was created
+if [ "$veth_created" ]; then
+	ip link delete veth0
+	echo "Removed veth pair"
+fi
+
 rm "$TMP_LIST_NETDEV"
 exit 0
diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore
new file mode 100644
index 000000000000..64c4f8d9aa6c
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/.gitignore
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+audit_logread
+connect_close
+conntrack_dump_flush
+conntrack_reverse_clash
+sctp_collision
+nf_queue
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
new file mode 100644
index 000000000000..e9b2f553588d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+
+top_srcdir = ../../../../..
+
+HOSTPKG_CONFIG := pkg-config
+MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null)
+MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
+
+TEST_PROGS := br_netfilter.sh bridge_brouter.sh
+TEST_PROGS += br_netfilter_queue.sh
+TEST_PROGS += conntrack_dump_flush.sh
+TEST_PROGS += conntrack_icmp_related.sh
+TEST_PROGS += conntrack_ipip_mtu.sh
+TEST_PROGS += conntrack_tcp_unreplied.sh
+TEST_PROGS += conntrack_resize.sh
+TEST_PROGS += conntrack_sctp_collision.sh
+TEST_PROGS += conntrack_vrf.sh
+TEST_PROGS += conntrack_reverse_clash.sh
+TEST_PROGS += ipvs.sh
+TEST_PROGS += nf_conntrack_packetdrill.sh
+TEST_PROGS += nf_nat_edemux.sh
+TEST_PROGS += nft_audit.sh
+TEST_PROGS += nft_concat_range.sh
+TEST_PROGS += nft_conntrack_helper.sh
+TEST_PROGS += nft_fib.sh
+TEST_PROGS += nft_flowtable.sh
+TEST_PROGS += nft_interface_stress.sh
+TEST_PROGS += nft_meta.sh
+TEST_PROGS += nft_nat.sh
+TEST_PROGS += nft_nat_zones.sh
+TEST_PROGS += nft_queue.sh
+TEST_PROGS += nft_synproxy.sh
+TEST_PROGS += nft_tproxy_tcp.sh
+TEST_PROGS += nft_tproxy_udp.sh
+TEST_PROGS += nft_zones_many.sh
+TEST_PROGS += rpath.sh
+TEST_PROGS += vxlan_mtu_frag.sh
+TEST_PROGS += xt_string.sh
+
+TEST_PROGS_EXTENDED = nft_concat_range_perf.sh
+
+TEST_GEN_FILES = audit_logread
+TEST_GEN_FILES += connect_close nf_queue
+TEST_GEN_FILES += conntrack_dump_flush
+TEST_GEN_FILES += conntrack_reverse_clash
+TEST_GEN_FILES += sctp_collision
+
+include ../../lib.mk
+
+$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS)
+$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS)
+
+$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS)
+$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS)
+
+TEST_FILES := lib.sh
+TEST_FILES += packetdrill
+
+TEST_INCLUDES := \
+	../lib.sh \
+	$(wildcard ../lib/sh/*.sh)
diff --git a/tools/testing/selftests/net/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c
new file mode 100644
index 000000000000..a0a880fc2d9d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/audit_logread.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <linux/audit.h>
+#include <linux/netlink.h>
+
+static int fd;
+
+#define MAX_AUDIT_MESSAGE_LENGTH	8970
+struct audit_message {
+	struct nlmsghdr nlh;
+	union {
+		struct audit_status s;
+		char data[MAX_AUDIT_MESSAGE_LENGTH];
+	} u;
+};
+
+int audit_recv(int fd, struct audit_message *rep)
+{
+	struct sockaddr_nl addr;
+	socklen_t addrlen = sizeof(addr);
+	int ret;
+
+	do {
+		ret = recvfrom(fd, rep, sizeof(*rep), 0,
+			       (struct sockaddr *)&addr, &addrlen);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0 ||
+	    addrlen != sizeof(addr) ||
+	    addr.nl_pid != 0 ||
+	    rep->nlh.nlmsg_type == NLMSG_ERROR) /* short-cut for now */
+		return -1;
+
+	return ret;
+}
+
+int audit_send(int fd, uint16_t type, uint32_t key, uint32_t val)
+{
+	static int seq = 0;
+	struct audit_message msg = {
+		.nlh = {
+			.nlmsg_len   = NLMSG_SPACE(sizeof(msg.u.s)),
+			.nlmsg_type  = type,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+			.nlmsg_seq   = ++seq,
+		},
+		.u.s = {
+			.mask    = key,
+			.enabled = key == AUDIT_STATUS_ENABLED ? val : 0,
+			.pid     = key == AUDIT_STATUS_PID ? val : 0,
+		}
+	};
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+	};
+	int ret;
+
+	do {
+		ret = sendto(fd, &msg, msg.nlh.nlmsg_len, 0,
+			     (struct sockaddr *)&addr, sizeof(addr));
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret != (int)msg.nlh.nlmsg_len)
+		return -1;
+	return 0;
+}
+
+int audit_set(int fd, uint32_t key, uint32_t val)
+{
+	struct audit_message rep = { 0 };
+	int ret;
+
+	ret = audit_send(fd, AUDIT_SET, key, val);
+	if (ret)
+		return ret;
+
+	ret = audit_recv(fd, &rep);
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+int readlog(int fd)
+{
+	struct audit_message rep = { 0 };
+	int ret = audit_recv(fd, &rep);
+	const char *sep = "";
+	char *k, *v;
+
+	if (ret < 0)
+		return ret;
+
+	if (rep.nlh.nlmsg_type != AUDIT_NETFILTER_CFG)
+		return 0;
+
+	/* skip the initial "audit(...): " part */
+	strtok(rep.u.data, " ");
+
+	while ((k = strtok(NULL, "="))) {
+		v = strtok(NULL, " ");
+
+		/* these vary and/or are uninteresting, ignore */
+		if (!strcmp(k, "pid") ||
+		    !strcmp(k, "comm") ||
+		    !strcmp(k, "subj"))
+			continue;
+
+		/* strip the varying sequence number */
+		if (!strcmp(k, "table"))
+			*strchrnul(v, ':') = '\0';
+
+		printf("%s%s=%s", sep, k, v);
+		sep = " ";
+	}
+	if (*sep) {
+		printf("\n");
+		fflush(stdout);
+	}
+	return 0;
+}
+
+void cleanup(int sig)
+{
+	audit_set(fd, AUDIT_STATUS_ENABLED, 0);
+	close(fd);
+	if (sig)
+		exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	struct sigaction act = {
+		.sa_handler = cleanup,
+	};
+
+	fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
+	if (fd < 0) {
+		perror("Can't open netlink socket");
+		return -1;
+	}
+
+	if (sigaction(SIGTERM, &act, NULL) < 0 ||
+	    sigaction(SIGINT, &act, NULL) < 0) {
+		perror("Can't set signal handler");
+		close(fd);
+		return -1;
+	}
+
+	audit_set(fd, AUDIT_STATUS_ENABLED, 1);
+	audit_set(fd, AUDIT_STATUS_PID, getpid());
+
+	while (1)
+		readlog(fd);
+}
diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh
new file mode 100755
index 000000000000..011de8763094
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test for legacy br_netfilter module combined with connection tracking,
+# a combination that doesn't really work.
+# Multicast/broadcast packets race for hash table insertion.
+
+#           eth0    br0     eth0
+# setup is: ns1 <->,ns0 <-> ns3
+#           ns2 <-'    `'-> ns4
+
+source lib.sh
+
+checktool "nft --version" "run test without nft tool"
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -ne 0 ];then
+	echo SKIP: kernel is tainted
+	exit $ksft_skip
+fi
+
+cleanup() {
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns ns0 ns1 ns2 ns3 ns4
+
+ret=0
+
+do_ping()
+{
+	fromns="$1"
+	dstip="$2"
+
+	if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then
+		echo "ERROR: ping from $fromns to $dstip"
+		ip netns exec "$ns0" nft list ruleset
+		ret=1
+	fi
+}
+
+bcast_ping()
+{
+	fromns="$1"
+	dstip="$2"
+
+	local packets=500
+
+	[ "$KSFT_MACHINE_SLOW" = yes ] && packets=100
+
+	for i in $(seq 1 $packets); do
+		if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then
+			echo "ERROR: ping -b from $fromns to $dstip"
+			ip netns exec "$ns0" nft list ruleset
+			ret=1
+			break
+		fi
+	done
+}
+
+if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then
+	echo "SKIP: Can't create veth device"
+	exit $ksft_skip
+fi
+
+ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2"
+ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3"
+ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4"
+
+for i in $(seq 1 4); do
+  ip -net "$ns0" link set "veth$i" up
+done
+
+if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then
+	echo "SKIP: Can't create bridge br0"
+	exit $ksft_skip
+fi
+
+# make veth0,1,2 part of bridge.
+for i in $(seq 1 3); do
+  ip -net "$ns0" link set "veth$i" master br0
+done
+
+# add a macvlan on top of the bridge.
+MACVLAN_ADDR=ba:f3:13:37:42:23
+ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private
+ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR}
+ip -net "$ns0" link set macvlan0 up
+ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0
+
+# add a macvlan on top of veth4.
+MACVLAN_ADDR=ba:f3:13:37:42:24
+ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru
+ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR}
+ip -net "$ns0" link set macvlan4 up
+
+# make the macvlan part of the bridge.
+# veth4 is not a bridge port, only the macvlan on top of it.
+ip -net "$ns0" link set macvlan4 master br0
+
+ip -net "$ns0" link set br0 up
+ip -net "$ns0" addr add 10.0.0.1/24 dev br0
+
+modprobe -q br_netfilter
+if ! ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then
+	echo "SKIP: bridge netfilter not available"
+	ret=$ksft_skip
+fi
+
+# for testing, so namespaces will reply to ping -b probes.
+ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
+
+# enable conntrack in ns0 and drop broadcast packets in forward to
+# avoid them from getting confirmed in the postrouting hook before
+# the cloned skb is passed up the stack.
+ip netns exec "$ns0" nft -f - <<EOF
+table ip filter {
+	chain input {
+		type filter hook input priority 1; policy accept
+		iifname br0 counter
+		ct state new accept
+	}
+}
+
+table bridge filter {
+	chain forward {
+		type filter hook forward priority 0; policy accept
+		meta pkttype broadcast ip protocol icmp counter drop
+	}
+}
+EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: could not add nftables ruleset"
+	exit $ksft_skip
+fi
+
+# place 1, 2 & 3 in same subnet, connected via ns0:br0.
+# ns4 is placed in same subnet as well, but its not
+# part of the bridge: the corresponding veth4 is not
+# part of the bridge, only its macvlan interface.
+for i in $(seq 1 4); do
+  eval ip -net \$ns"$i" link set eth0 up
+done
+for i in $(seq 1 2); do
+  eval ip -net \$ns"$i" addr add "10.0.0.1$i/24" dev eth0
+done
+
+ip -net "$ns3" addr add 10.23.0.13/24 dev eth0
+ip -net "$ns4" addr add 10.23.0.14/24 dev eth0
+
+# test basic connectivity
+do_ping "$ns1" 10.0.0.12
+do_ping "$ns3" 10.23.0.1
+do_ping "$ns4" 10.23.0.1
+
+bcast_ping "$ns1" 10.0.0.255
+
+# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
+bcast_ping "$ns3" 10.23.0.255
+
+# same, this time via veth4:macvlan4.
+bcast_ping "$ns4" 10.23.0.255
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -eq 0 ];then
+	echo PASS: kernel not tainted
+else
+	echo ERROR: kernel is tainted
+	dmesg
+	ret=1
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh
new file mode 100755
index 000000000000..4788641717d9
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+source lib.sh
+
+checktool "nft --version" "run test without nft tool"
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -ne 0 ];then
+	echo SKIP: kernel is tainted
+	exit $ksft_skip
+fi
+
+cleanup() {
+	cleanup_all_ns
+}
+
+setup_ns c1 c2 c3 sender
+
+trap cleanup EXIT
+
+nf_queue_wait()
+{
+	grep -q "^ *$1 " "/proc/self/net/netfilter/nfnetlink_queue"
+}
+
+port_add() {
+	ns="$1"
+	dev="$2"
+	a="$3"
+
+	ip link add name "$dev" type veth peer name "$dev" netns "$ns"
+
+	ip -net "$ns" addr add 192.168.1."$a"/24 dev "$dev"
+	ip -net "$ns" link set "$dev" up
+
+	ip link set "$dev" master br0
+	ip link set "$dev" up
+}
+
+[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
+
+ip link add br0 type bridge
+ip addr add 192.168.1.254/24 dev br0
+
+port_add "$c1" "c1" 1
+port_add "$c2" "c2" 2
+port_add "$c3" "c3" 3
+port_add "$sender" "sender" 253
+
+ip link set br0 up
+
+modprobe -q br_netfilter
+
+sysctl net.bridge.bridge-nf-call-iptables=1 || exit 1
+
+ip netns exec "$sender" ping -I sender -c1 192.168.1.1 || exit 1
+ip netns exec "$sender" ping -I sender -c1 192.168.1.2 || exit 2
+ip netns exec "$sender" ping -I sender -c1 192.168.1.3 || exit 3
+
+nft -f /dev/stdin <<EOF
+table ip filter {
+	chain forward {
+		type filter hook forward priority 0; policy accept;
+		ct state new counter
+		ip protocol icmp counter queue num 0 bypass
+	}
+}
+EOF
+./nf_queue -t 5 > /dev/null &
+
+busywait 5000 nf_queue_wait
+
+for i in $(seq 1 5); do conntrack -F > /dev/null 2> /dev/null; sleep 0.1 ; done &
+ip netns exec "$sender" ping -I sender -f -c 50 -b 192.168.1.255
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -eq 0 ];then
+	echo PASS: kernel not tainted
+else
+	echo ERROR: kernel is tainted
+	dmesg
+	exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/bridge_brouter.sh b/tools/testing/selftests/net/netfilter/bridge_brouter.sh
new file mode 100755
index 000000000000..ea76f2bc2f59
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/bridge_brouter.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+#
+# This test is for bridge 'brouting', i.e. make some packets being routed
+# rather than getting bridged even though they arrive on interface that is
+# part of a bridge.
+
+#           eth0    br0     eth0
+# setup is: ns1 <-> nsbr <-> ns2
+
+source lib.sh
+
+if ! ebtables -V > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without ebtables"
+	exit $ksft_skip
+fi
+
+cleanup() {
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns nsbr ns1 ns2
+
+if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then
+	echo "SKIP: Can't create veth device"
+	exit $ksft_skip
+fi
+ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2"
+
+if ! ip -net "$nsbr" link add br0 type bridge; then
+	echo "SKIP: Can't create bridge br0"
+	exit $ksft_skip
+fi
+
+ip -net "$nsbr" link set veth0 up
+ip -net "$nsbr" link set veth1 up
+
+ip -net "$nsbr" link set veth0 master br0
+ip -net "$nsbr" link set veth1 master br0
+ip -net "$nsbr" link set br0 up
+ip -net "$nsbr" addr add 10.0.0.1/24 dev br0
+
+# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns1" addr add 10.0.0.11/24 dev eth0
+ip -net "$ns2" addr add 10.0.0.12/24 dev eth0
+
+test_ebtables_broute()
+{
+	# redirect is needed so the dstmac is rewritten to the bridge itself,
+	# ip stack won't process OTHERHOST (foreign unicast mac) packets.
+	if ! ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then
+		echo "SKIP: Could not add ebtables broute redirect rule"
+		return $ksft_skip
+	fi
+
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0
+
+	# ping net${ns1}, expected to not work (ip forwarding is off)
+	if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then
+		echo "ERROR: ping works, should have failed" 1>&2
+		return 1
+	fi
+
+	# enable forwarding on both interfaces.
+	# neither needs an ip address, but at least the bridge needs
+	# an ip address in same network segment as ${ns1} and ${ns2} (${nsbr}
+	# needs to be able to determine route for to-be-forwarded packet).
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1
+
+	if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2
+		return 1
+	fi
+
+	echo "PASS: ${ns1}/${ns2} connectivity with active broute rule"
+	ip netns exec "$nsbr" ebtables -t broute -F
+
+	# ping net${ns1}, expected to work (frames are bridged)
+	if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (bridged)" 1>&2
+		return 1
+	fi
+
+	ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP
+
+	# ping net${ns1}, expected to not work (DROP in bridge forward)
+	if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then
+		echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2
+		return 1
+	fi
+
+	# re-activate brouter
+	ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2
+		return 1
+	fi
+
+	echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop"
+	return 0
+}
+
+# test basic connectivity
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then
+    echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2
+    exit 1
+fi
+
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then
+    echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2
+    exit 1
+fi
+
+test_ebtables_broute
+exit $?
diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config
new file mode 100644
index 000000000000..363646f4fefe
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/config
@@ -0,0 +1,95 @@
+CONFIG_AUDIT=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_VLAN_FILTERING=y
+CONFIG_CGROUP_BPF=y
+CONFIG_DUMMY=m
+CONFIG_INET_ESP=m
+CONFIG_IP_NF_MATCH_RPFILTER=m
+CONFIG_IP6_NF_MATCH_RPFILTER=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP6_NF_RAW=m
+CONFIG_IP_SCTP=m
+CONFIG_IP_VS=m
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_RR=m
+CONFIG_IPV6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_MACVLAN=m
+CONFIG_NAMESPACES=y
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_L3_MASTER_DEV=y
+CONFIG_NET_NS=y
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_IPIP=m
+CONFIG_NET_VRF=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_SYNPROXY=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_NAT=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_TARGET_REDIRECT=m
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_PROCFS=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=m
+CONFIG_NF_CONNTRACK_MARK=y
+CONFIG_NF_CONNTRACK_ZONES=y
+CONFIG_NF_CT_NETLINK=m
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_FLOW_TABLE=m
+CONFIG_NF_LOG_IPV4=m
+CONFIG_NF_LOG_IPV6=m
+CONFIG_NF_NAT=m
+CONFIG_NF_NAT_REDIRECT=y
+CONFIG_NF_NAT_MASQUERADE=y
+CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_IPV4=y
+CONFIG_NF_TABLES_IPV6=y
+CONFIG_NF_TABLES_NETDEV=y
+CONFIG_NF_FLOW_TABLE_INET=m
+CONFIG_NFT_BRIDGE_META=m
+CONFIG_NFT_COMPAT=m
+CONFIG_NFT_CT=m
+CONFIG_NFT_FIB=m
+CONFIG_NFT_FIB_INET=m
+CONFIG_NFT_FIB_IPV4=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NFT_FLOW_OFFLOAD=m
+CONFIG_NFT_LIMIT=m
+CONFIG_NFT_LOG=m
+CONFIG_NFT_MASQ=m
+CONFIG_NFT_NAT=m
+CONFIG_NFT_NUMGEN=m
+CONFIG_NFT_QUEUE=m
+CONFIG_NFT_QUOTA=m
+CONFIG_NFT_REDIR=m
+CONFIG_NFT_SYNPROXY=m
+CONFIG_NFT_TPROXY=m
+CONFIG_VETH=m
+CONFIG_VLAN_8021Q=m
+CONFIG_VXLAN=m
+CONFIG_XFRM_USER=m
+CONFIG_XFRM_STATISTICS=y
+CONFIG_NET_PKTGEN=m
+CONFIG_TUN=m
+CONFIG_INET_DIAG=m
+CONFIG_SCTP_DIAG=m
diff --git a/tools/testing/selftests/net/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c
new file mode 100644
index 000000000000..1c3b0add54c4
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/connect_close.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include <arpa/inet.h>
+#include <sys/socket.h>
+
+#define PORT 12345
+#define RUNTIME 10
+
+static struct {
+	unsigned int timeout;
+	unsigned int port;
+} opts = {
+	.timeout = RUNTIME,
+	.port = PORT,
+};
+
+static void handler(int sig)
+{
+	_exit(sig == SIGALRM ? 0 : 1);
+}
+
+static void set_timeout(void)
+{
+	struct sigaction action = {
+		.sa_handler = handler,
+	};
+
+	sigaction(SIGALRM, &action, NULL);
+
+	alarm(opts.timeout);
+}
+
+static void do_connect(const struct sockaddr_in *dst)
+{
+	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+	if (s >= 0)
+		fcntl(s, F_SETFL, O_NONBLOCK);
+
+	connect(s, (struct sockaddr *)dst, sizeof(*dst));
+	close(s);
+}
+
+static void do_accept(const struct sockaddr_in *src)
+{
+	int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+
+	if (s < 0)
+		return;
+
+	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+	setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
+
+	bind(s, (struct sockaddr *)src, sizeof(*src));
+
+	listen(s, 16);
+
+	c = accept(s, NULL, NULL);
+	if (c >= 0)
+		close(c);
+
+	close(s);
+}
+
+static int accept_loop(void)
+{
+	struct sockaddr_in src = {
+		.sin_family = AF_INET,
+		.sin_port = htons(opts.port),
+	};
+
+	inet_pton(AF_INET, "127.0.0.1", &src.sin_addr);
+
+	set_timeout();
+
+	for (;;)
+		do_accept(&src);
+
+	return 1;
+}
+
+static int connect_loop(void)
+{
+	struct sockaddr_in dst = {
+		.sin_family = AF_INET,
+		.sin_port = htons(opts.port),
+	};
+
+	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
+
+	set_timeout();
+
+	for (;;)
+		do_connect(&dst);
+
+	return 1;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "t:p:")) != -1) {
+		switch (c) {
+		case 't':
+			opts.timeout = atoi(optarg);
+			break;
+		case 'p':
+			opts.port = atoi(optarg);
+			break;
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t p;
+
+	parse_opts(argc, argv);
+
+	p = fork();
+	if (p < 0)
+		return 111;
+
+	if (p > 0)
+		return accept_loop();
+
+	return connect_loop();
+}
diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
new file mode 100644
index 000000000000..5f827e10717d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <time.h>
+#include <libmnl/libmnl.h>
+#include <netinet/ip.h>
+
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include "../../kselftest_harness.h"
+
+#define TEST_ZONE_ID 123
+#define NF_CT_DEFAULT_ZONE_ID 0
+
+static int reply_counter;
+
+static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type,
+			      uint32_t src_ip, uint32_t dst_ip,
+			      uint16_t src_port, uint16_t dst_port)
+{
+	struct nlattr *nest, *nest_ip, *nest_proto;
+
+	nest = mnl_attr_nest_start(nlh, type);
+	if (!nest)
+		return -1;
+
+	nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP);
+	if (!nest_ip)
+		return -1;
+	mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip);
+	mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip);
+	mnl_attr_nest_end(nlh, nest_ip);
+
+	nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO);
+	if (!nest_proto)
+		return -1;
+	mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6);
+	mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port));
+	mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port));
+	mnl_attr_nest_end(nlh, nest_proto);
+
+	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
+}
+
+static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type,
+			      struct in6_addr src_ip, struct in6_addr dst_ip,
+			      uint16_t src_port, uint16_t dst_port)
+{
+	struct nlattr *nest, *nest_ip, *nest_proto;
+
+	nest = mnl_attr_nest_start(nlh, type);
+	if (!nest)
+		return -1;
+
+	nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP);
+	if (!nest_ip)
+		return -1;
+	mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip);
+	mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip);
+	mnl_attr_nest_end(nlh, nest_ip);
+
+	nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO);
+	if (!nest_proto)
+		return -1;
+	mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6);
+	mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port));
+	mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port));
+	mnl_attr_nest_end(nlh, nest_proto);
+
+	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
+}
+
+static int build_cta_proto(struct nlmsghdr *nlh)
+{
+	struct nlattr *nest, *nest_proto;
+
+	nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO);
+	if (!nest)
+		return -1;
+
+	nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP);
+	if (!nest_proto)
+		return -1;
+	mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED);
+	mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a);
+	mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a);
+	mnl_attr_nest_end(nlh, nest_proto);
+
+	mnl_attr_nest_end(nlh, nest);
+
+	return 0;
+}
+
+static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh,
+				 uint16_t zone)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *rplnlh;
+	unsigned int portid;
+	int ret;
+
+	portid = mnl_socket_get_portid(sock);
+
+	ret = build_cta_proto(nlh);
+	if (ret < 0) {
+		perror("build_cta_proto");
+		return -1;
+	}
+	mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000));
+	mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone));
+
+	if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) {
+		perror("mnl_socket_sendto");
+		return -1;
+	}
+
+	ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE);
+	if (ret < 0) {
+		perror("mnl_socket_recvfrom");
+		return ret;
+	}
+
+	ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL);
+	if (ret < 0) {
+		if (errno == EEXIST) {
+			/* The entries are probably still there from a previous
+			 * run. So we are good
+			 */
+			return 0;
+		}
+		perror("mnl_cb_run");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip,
+				      uint32_t dst_ip, uint16_t zone)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfh;
+	int ret;
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_ACK | NLM_F_EXCL;
+	nlh->nlmsg_seq = time(NULL);
+
+	nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
+	nfh->nfgen_family = AF_INET;
+	nfh->version = NFNETLINK_V0;
+	nfh->res_id = 0;
+
+	ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443);
+	if (ret < 0) {
+		perror("build_cta_tuple_v4");
+		return ret;
+	}
+	ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345);
+	if (ret < 0) {
+		perror("build_cta_tuple_v4");
+		return ret;
+	}
+	return conntrack_data_insert(sock, nlh, zone);
+}
+
+static int conntrack_data_generate_v6(struct mnl_socket *sock,
+				      struct in6_addr src_ip,
+				      struct in6_addr dst_ip,
+				      uint16_t zone)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfh;
+	int ret;
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_ACK | NLM_F_EXCL;
+	nlh->nlmsg_seq = time(NULL);
+
+	nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
+	nfh->nfgen_family = AF_INET6;
+	nfh->version = NFNETLINK_V0;
+	nfh->res_id = 0;
+
+	ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip,
+				 12345, 443);
+	if (ret < 0) {
+		perror("build_cta_tuple_v6");
+		return ret;
+	}
+	ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip,
+				 12345, 443);
+	if (ret < 0) {
+		perror("build_cta_tuple_v6");
+		return ret;
+	}
+	return conntrack_data_insert(sock, nlh, zone);
+}
+
+static int count_entries(const struct nlmsghdr *nlh, void *data)
+{
+	reply_counter++;
+	return MNL_CB_OK;
+}
+
+static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh, *rplnlh;
+	struct nfgenmsg *nfh;
+	struct nlattr *nest;
+	unsigned int portid;
+	int ret;
+
+	portid = mnl_socket_get_portid(sock);
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type	= (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	nlh->nlmsg_seq = time(NULL);
+
+	nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
+	nfh->nfgen_family = AF_UNSPEC;
+	nfh->version = NFNETLINK_V0;
+	nfh->res_id = 0;
+
+	mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone));
+
+	ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len);
+	if (ret < 0) {
+		perror("mnl_socket_sendto");
+		return ret;
+	}
+
+	reply_counter = 0;
+	ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE);
+	while (ret > 0) {
+		ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid,
+				 count_entries, NULL);
+		if (ret <= MNL_CB_STOP)
+			break;
+
+		ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE);
+	}
+	if (ret < 0) {
+		perror("mnl_socket_recvfrom");
+		return ret;
+	}
+
+	return reply_counter;
+}
+
+static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh, *rplnlh;
+	struct nfgenmsg *nfh;
+	struct nlattr *nest;
+	unsigned int portid;
+	int ret;
+
+	portid = mnl_socket_get_portid(sock);
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type	= (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	nlh->nlmsg_seq = time(NULL);
+
+	nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg));
+	nfh->nfgen_family = AF_UNSPEC;
+	nfh->version = NFNETLINK_V0;
+	nfh->res_id = 0;
+
+	mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone));
+
+	ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len);
+	if (ret < 0) {
+		perror("mnl_socket_sendto");
+		return ret;
+	}
+
+	ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE);
+	if (ret < 0) {
+		perror("mnl_socket_recvfrom");
+		return ret;
+	}
+
+	ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL);
+	if (ret < 0) {
+		perror("mnl_cb_run");
+		return ret;
+	}
+
+	return 0;
+}
+
+FIXTURE(conntrack_dump_flush)
+{
+	struct mnl_socket *sock;
+};
+
+FIXTURE_SETUP(conntrack_dump_flush)
+{
+	struct in6_addr src, dst;
+	int ret;
+
+	self->sock = mnl_socket_open(NETLINK_NETFILTER);
+	if (!self->sock) {
+		perror("mnl_socket_open");
+		SKIP(return, "cannot open netlink_netfilter socket");
+	}
+
+	ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID);
+	EXPECT_EQ(ret, 0);
+
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
+	if (ret < 0 && errno == EPERM)
+		SKIP(return, "Needs to be run as root");
+	else if (ret < 0 && errno == EOPNOTSUPP)
+		SKIP(return, "Kernel does not seem to support conntrack zones");
+
+	ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1,
+					 TEST_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+	ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3,
+					 TEST_ZONE_ID + 1);
+	EXPECT_EQ(ret, 0);
+	ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5,
+					 TEST_ZONE_ID + 2);
+	EXPECT_EQ(ret, 0);
+	ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7,
+					 NF_CT_DEFAULT_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+
+	src = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x01000000
+		}
+	}};
+	dst = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x02000000
+		}
+	}};
+	ret = conntrack_data_generate_v6(self->sock, src, dst,
+					 TEST_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+	src = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x03000000
+		}
+	}};
+	dst = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x04000000
+		}
+	}};
+	ret = conntrack_data_generate_v6(self->sock, src, dst,
+					 TEST_ZONE_ID + 1);
+	EXPECT_EQ(ret, 0);
+	src = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x05000000
+		}
+	}};
+	dst = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x06000000
+		}
+	}};
+	ret = conntrack_data_generate_v6(self->sock, src, dst,
+					 TEST_ZONE_ID + 2);
+	EXPECT_EQ(ret, 0);
+
+	src = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x07000000
+		}
+	}};
+	dst = (struct in6_addr) {{
+		.__u6_addr32 = {
+			0xb80d0120,
+			0x00000000,
+			0x00000000,
+			0x08000000
+		}
+	}};
+	ret = conntrack_data_generate_v6(self->sock, src, dst,
+					 NF_CT_DEFAULT_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
+	EXPECT_GE(ret, 2);
+	if (ret > 2)
+		SKIP(return, "kernel does not support filtering by zone");
+}
+
+FIXTURE_TEARDOWN(conntrack_dump_flush)
+{
+}
+
+TEST_F(conntrack_dump_flush, test_dump_by_zone)
+{
+	int ret;
+
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
+	EXPECT_EQ(ret, 2);
+}
+
+TEST_F(conntrack_dump_flush, test_flush_by_zone)
+{
+	int ret;
+
+	ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1);
+	EXPECT_EQ(ret, 2);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2);
+	EXPECT_EQ(ret, 2);
+	ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID);
+	EXPECT_EQ(ret, 2);
+}
+
+TEST_F(conntrack_dump_flush, test_flush_by_zone_default)
+{
+	int ret;
+
+	ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
+	EXPECT_EQ(ret, 2);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1);
+	EXPECT_EQ(ret, 2);
+	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2);
+	EXPECT_EQ(ret, 2);
+	ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh
new file mode 100755
index 000000000000..8b0935385849
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+exec unshare -n ./conntrack_dump_flush
diff --git a/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh
new file mode 100755
index 000000000000..c63d840ead61
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh
@@ -0,0 +1,278 @@
+#!/bin/bash
+#
+# check that ICMP df-needed/pkttoobig icmp are set are set as related
+# state
+#
+# Setup is:
+#
+# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2
+# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280).
+# ping nsclient2 from nsclient1, checking that conntrack did set RELATED
+# 'fragmentation needed' icmp packet.
+#
+# In addition, nsrouter1 will perform IP masquerading, i.e. also
+# check the icmp errors are propagated to the correct host as per
+# nat of "established" icmp-echo "connection".
+
+source lib.sh
+
+if ! nft --version > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+cleanup() {
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2
+
+ret=0
+
+add_addr()
+{
+	ns=$1
+	dev=$2
+	i=$3
+
+	ip -net "$ns" link set "$dev" up
+	ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev"
+	ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad
+}
+
+check_counter()
+{
+	ns=$1
+	name=$2
+	expect=$3
+	local lret=0
+
+	if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then
+		echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
+		ip netns exec "$ns" nft list counter inet filter "$name" 1>&2
+		lret=1
+	fi
+
+	return $lret
+}
+
+check_unknown()
+{
+	expect="packets 0 bytes 0"
+	for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do
+		if ! check_counter "$n" "unknown" "$expect"; then
+			return 1
+		fi
+	done
+
+	return 0
+}
+
+DEV=veth0
+ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1"
+ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2"
+ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2"
+
+add_addr "$nsclient1" $DEV 1
+add_addr "$nsclient2" $DEV 2
+
+ip -net "$nsrouter1" link set eth1 up
+ip -net "$nsrouter1" link set $DEV up
+
+ip -net "$nsrouter2" link set eth1 mtu 1280 up
+ip -net "$nsrouter2" link set eth2 up
+
+ip -net "$nsclient1" route add default via 192.168.1.1
+ip -net "$nsclient1" -6 route add default via dead:1::1
+
+ip -net "$nsclient2" route add default via 192.168.2.1
+ip -net "$nsclient2" route add default via dead:2::1
+ip -net "$nsclient2" link set veth0 mtu 1280
+
+ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1
+ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0
+ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad
+ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad
+ip -net "$nsrouter1" route add default via 192.168.3.10
+ip -net "$nsrouter1" -6 route add default via dead:3::10
+
+ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1
+ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2
+ip -net "$nsrouter2" addr add dead:2::1/64  dev eth1 nodad
+ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad
+ip -net "$nsrouter2" route add default via 192.168.3.1
+ip -net "$nsrouter2" route add default via dead:3::1
+
+for i in 4 6; do
+	ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1
+	ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1
+done
+
+for netns in "$nsrouter1" "$nsrouter2"; do
+ip netns exec "$netns" nft -f - <<EOF
+table inet filter {
+	counter unknown { }
+	counter related { }
+	chain forward {
+		type filter hook forward priority 0; policy accept;
+		meta l4proto icmpv6 icmpv6 type "packet-too-big" ct state "related" counter name "related" accept
+		meta l4proto icmp icmp type "destination-unreachable" ct state "related" counter name "related" accept
+		meta l4proto { icmp, icmpv6 } ct state new,established accept
+		counter name "unknown" drop
+	}
+}
+EOF
+done
+
+ip netns exec "$nsclient1" nft -f - <<EOF
+table inet filter {
+	counter unknown { }
+	counter related { }
+	counter redir4 { }
+	counter redir6 { }
+	chain input {
+		type filter hook input priority 0; policy accept;
+
+		icmp type "redirect" ct state "related" counter name "redir4" accept
+		icmpv6 type "nd-redirect" ct state "related" counter name "redir6" accept
+
+		meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+		meta l4proto { icmp, icmpv6 } ct state "related" counter name "related" accept
+
+		counter name "unknown" drop
+	}
+}
+EOF
+
+ip netns exec "$nsclient2" nft -f - <<EOF
+table inet filter {
+	counter unknown { }
+	counter new { }
+	counter established { }
+
+	chain input {
+		type filter hook input priority 0; policy accept;
+		meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+
+		meta l4proto { icmp, icmpv6 } ct state "new" counter name "new" accept
+		meta l4proto { icmp, icmpv6 } ct state "established" counter name "established" accept
+		counter name "unknown" drop
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+		meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+
+		meta l4proto { icmp, icmpv6 } ct state "new" counter name "new"
+		meta l4proto { icmp, icmpv6 } ct state "established" counter name "established"
+		counter name "unknown" drop
+	}
+}
+EOF
+
+# make sure NAT core rewrites adress of icmp error if nat is used according to
+# conntrack nat information (icmp error will be directed at nsrouter1 address,
+# but it needs to be routed to nsclient1 address).
+ip netns exec "$nsrouter1" nft -f - <<EOF
+table ip nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		ip protocol icmp oifname "veth0" counter masquerade
+	}
+}
+table ip6 nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		ip6 nexthdr icmpv6 oifname "veth0" counter masquerade
+	}
+}
+EOF
+
+if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q -M "do" 192.168.2.2 >/dev/null; then
+	echo "ERROR: netns ip routing/connectivity broken" 1>&2
+	exit 1
+fi
+if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then
+	echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2
+	exit 1
+fi
+
+if ! check_unknown; then
+	ret=1
+fi
+
+expect="packets 0 bytes 0"
+for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do
+	if ! check_counter "$netns" "related" "$expect"; then
+		ret=1
+	fi
+done
+
+expect="packets 2 bytes 2076"
+if ! check_counter "$nsclient2" "new" "$expect"; then
+	ret=1
+fi
+
+if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then
+	echo "ERROR: ping should have failed with PMTU too big error" 1>&2
+	ret=1
+fi
+
+# nsrouter2 should have generated the icmp error, so
+# related counter should be 0 (its in forward).
+expect="packets 0 bytes 0"
+if ! check_counter "$nsrouter2" "related" "$expect"; then
+	ret=1
+fi
+
+# but nsrouter1 should have seen it, same for nsclient1.
+expect="packets 1 bytes 576"
+for netns in ${nsrouter1} ${nsclient1};do
+	if ! check_counter "$netns" "related" "$expect"; then
+		ret=1
+	fi
+done
+
+if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then
+	echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2
+	ret=1
+fi
+
+expect="packets 2 bytes 1856"
+for netns in "${nsrouter1}" "${nsclient1}";do
+	if ! check_counter "$netns" "related" "$expect"; then
+		ret=1
+	fi
+done
+
+if [ $ret -eq 0 ];then
+	echo "PASS: icmp mtu error had RELATED state"
+else
+	echo "ERROR: icmp error RELATED state test has failed"
+fi
+
+# add 'bad' route,  expect icmp REDIRECT to be generated
+ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1
+ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1
+
+ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null
+
+expect="packets 1 bytes 112"
+if ! check_counter "$nsclient1" "redir4" "$expect"; then
+	ret=1
+fi
+
+ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null
+expect="packets 1 bytes 192"
+if ! check_counter "$nsclient1" "redir6" "$expect"; then
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: icmp redirects had RELATED state"
+else
+	echo "ERROR: icmp redirect RELATED state test has failed"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh
new file mode 100755
index 000000000000..9832a5d0198a
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+# Conntrack needs to reassemble fragments in order to have complete
+# packets for rule matching.  Reassembly can lead to packet loss.
+
+# Consider the following setup:
+#            +--------+       +---------+       +--------+
+#            |Router A|-------|Wanrouter|-------|Router B|
+#            |        |.IPIP..|         |..IPIP.|        |
+#            +--------+       +---------+       +--------+
+#           /                  mtu 1400                   \
+#          /                                               \
+#+--------+                                                 +--------+
+#|Client A|                                                 |Client B|
+#|        |                                                 |        |
+#+--------+                                                 +--------+
+
+# Router A and Router B use IPIP tunnel interfaces to tunnel traffic
+# between Client A and Client B over WAN. Wanrouter has MTU 1400 set
+# on its interfaces.
+
+rx=$(mktemp)
+
+checktool "iptables --version" "run test without iptables"
+checktool "socat -h" "run test without socat"
+
+setup_ns r_a r_b r_w c_a c_b
+
+cleanup() {
+	cleanup_all_ns
+	rm -f "$rx"
+}
+
+trap cleanup EXIT
+
+listener_ready()
+{
+	ns="$1"
+	port="$2"
+	ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port"
+}
+
+test_path() {
+	msg="$1"
+
+	ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null &
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000
+
+	for i in 1 2 3; do
+		head -c1400 /dev/zero | tr "\000" "a" | \
+			ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000
+	done
+
+	wait
+
+	bytes=$(wc -c < "$rx")
+
+	if [ "$bytes" -eq 1400 ];then
+		echo "OK: PMTU $msg connection tracking"
+	else
+		echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400"
+		exit 1
+	fi
+}
+
+# Detailed setup for Router A
+# ---------------------------
+# Interfaces:
+# eth0: 10.2.2.1/24
+# eth1: 192.168.10.1/24
+# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1
+# Routes:
+# 192.168.20.0/24 dev ipip0    (192.168.20.0/24 is subnet of Client B)
+# 10.4.4.1 via 10.2.2.254      (Router B via Wanrouter)
+# No iptables rules at all.
+
+ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w"
+ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a"
+
+l_addr="10.2.2.1"
+r_addr="10.4.4.1"
+ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip
+
+for dev in lo veth0 veth1 ipip0; do
+    ip -net "$r_a" link set "$dev" up
+done
+
+ip -net "$r_a" addr add 10.2.2.1/24 dev veth0
+ip -net "$r_a" addr add 192.168.10.1/24 dev veth1
+
+ip -net "$r_a" route add 192.168.20.0/24 dev ipip0
+ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254
+
+ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+
+# Detailed setup for Router B
+# ---------------------------
+# Interfaces:
+# eth0: 10.4.4.1/24
+# eth1: 192.168.20.1/24
+# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1
+# Routes:
+# 192.168.10.0/24 dev ipip0    (192.168.10.0/24 is subnet of Client A)
+# 10.2.2.1 via 10.4.4.254      (Router A via Wanrouter)
+# No iptables rules at all.
+
+ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w"
+ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b"
+
+l_addr="10.4.4.1"
+r_addr="10.2.2.1"
+
+ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip
+
+for dev in veth0 veth1 ipip0; do
+	ip -net "$r_b" link set $dev up
+done
+
+ip -net "$r_b" addr add 10.4.4.1/24 dev veth0
+ip -net "$r_b" addr add 192.168.20.1/24 dev veth1
+
+ip -net "$r_b" route add 192.168.10.0/24 dev ipip0
+ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254
+ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+
+# Client A
+ip -net "$c_a" addr add 192.168.10.2/24 dev veth0
+ip -net "$c_a" link set dev veth0 up
+ip -net "$c_a" route add default via 192.168.10.1
+
+# Client A
+ip -net "$c_b" addr add 192.168.20.2/24 dev veth0
+ip -net "$c_b" link set dev veth0 up
+ip -net "$c_b" route add default via 192.168.20.1
+
+# Wan
+ip -net "$r_w" addr add 10.2.2.254/24 dev veth0
+ip -net "$r_w" addr add 10.4.4.254/24 dev veth1
+
+ip -net "$r_w" link set dev veth0 up mtu 1400
+ip -net "$r_w" link set dev veth1 up mtu 1400
+
+ip -net "$r_a" link set dev veth0 mtu 1400
+ip -net "$r_b" link set dev veth0 mtu 1400
+
+ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+
+# Path MTU discovery
+# ------------------
+# Running tracepath from Client A to Client B shows PMTU discovery is working
+# as expected:
+#
+# clienta:~# tracepath 192.168.20.2
+# 1?: [LOCALHOST]                      pmtu 1500
+# 1:  192.168.10.1                                          0.867ms
+# 1:  192.168.10.1                                          0.302ms
+# 2:  192.168.10.1                                          0.312ms pmtu 1480
+# 2:  no reply
+# 3:  192.168.10.1                                          0.510ms pmtu 1380
+# 3:  192.168.20.2                                          2.320ms reached
+# Resume: pmtu 1380 hops 3 back 3
+
+# ip netns exec ${c_a} traceroute --mtu 192.168.20.2
+
+# Router A has learned PMTU (1400) to Router B from Wanrouter.
+# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B
+# from Router A.
+
+#Send large UDP packet
+#---------------------
+#Now we send a 1400 bytes UDP packet from Client A to Client B:
+
+# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000
+test_path "without"
+
+# The IPv4 stack on Client A already knows the PMTU to Client B, so the
+# UDP packet is sent as two fragments (1380 + 20). Router A forwards the
+# fragments between eth1 and ipip0. The fragments fit into the tunnel and
+# reach their destination.
+
+#When sending the large UDP packet again, Router A now reassembles the
+#fragments before routing the packet over ipip0. The resulting IPIP
+#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is
+#dropped on Router A before sending.
+
+ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW
+test_path "with"
diff --git a/tools/testing/selftests/net/netfilter/conntrack_resize.sh b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
new file mode 100755
index 000000000000..9e033e80219e
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
@@ -0,0 +1,427 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+checktool "conntrack --version" "run test without conntrack"
+checktool "nft --version" "run test without nft tool"
+
+init_net_max=0
+ct_buckets=0
+tmpfile=""
+tmpfile_proc=""
+tmpfile_uniq=""
+ret=0
+
+insert_count=2000
+[ "$KSFT_MACHINE_SLOW" = "yes" ] && insert_count=400
+
+modprobe -q nf_conntrack
+if ! sysctl -q net.netfilter.nf_conntrack_max >/dev/null;then
+	echo "SKIP: conntrack sysctls not available"
+	exit $KSFT_SKIP
+fi
+
+init_net_max=$(sysctl -n net.netfilter.nf_conntrack_max) || exit 1
+ct_buckets=$(sysctl -n net.netfilter.nf_conntrack_buckets) || exit 1
+
+cleanup() {
+	cleanup_all_ns
+
+	rm -f "$tmpfile" "$tmpfile_proc" "$tmpfile_uniq"
+
+	# restore original sysctl setting
+	sysctl -q net.netfilter.nf_conntrack_max=$init_net_max
+	sysctl -q net.netfilter.nf_conntrack_buckets=$ct_buckets
+}
+trap cleanup EXIT
+
+check_max_alias()
+{
+	local expected="$1"
+	# old name, expected to alias to the first, i.e. changing one
+	# changes the other as well.
+	local lv=$(sysctl -n net.nf_conntrack_max)
+
+	if [ $expected -ne "$lv" ];then
+		echo "nf_conntrack_max sysctls should have identical values"
+		exit 1
+	fi
+}
+
+insert_ctnetlink() {
+	local ns="$1"
+	local count="$2"
+	local i=0
+	local bulk=16
+
+	while [ $i -lt $count ] ;do
+		ip netns exec "$ns" bash -c "for i in \$(seq 1 $bulk); do \
+			if ! conntrack -I -s \$((\$RANDOM%256)).\$((\$RANDOM%256)).\$((\$RANDOM%256)).\$((\$RANDOM%255+1)) \
+					  -d \$((\$RANDOM%256)).\$((\$RANDOM%256)).\$((\$RANDOM%256)).\$((\$RANDOM%255+1)) \
+					  --protonum 17 --timeout 3600 --status ASSURED,SEEN_REPLY --sport \$RANDOM --dport 53; then \
+					  return;\
+			fi & \
+		done ; wait" 2>/dev/null
+
+		i=$((i+bulk))
+	done
+}
+
+check_ctcount() {
+	local ns="$1"
+	local count="$2"
+	local msg="$3"
+
+	local now=$(ip netns exec "$ns" conntrack -C)
+
+	if [ $now -ne "$count" ] ;then
+		echo "expected $count entries in $ns, not $now: $msg"
+		exit 1
+	fi
+
+	echo "PASS: got $count connections: $msg"
+}
+
+ctresize() {
+	local duration="$1"
+	local now=$(date +%s)
+	local end=$((now + duration))
+
+	while [ $now -lt $end ]; do
+		sysctl -q net.netfilter.nf_conntrack_buckets=$RANDOM
+		now=$(date +%s)
+	done
+}
+
+do_rsleep() {
+	local limit="$1"
+	local r=$RANDOM
+
+	r=$((r%limit))
+	sleep "$r"
+}
+
+ct_flush_once() {
+	local ns="$1"
+
+	ip netns exec "$ns" conntrack -F 2>/dev/null
+}
+
+ctflush() {
+	local ns="$1"
+	local duration="$2"
+	local now=$(date +%s)
+	local end=$((now + duration))
+
+	do_rsleep "$duration"
+
+        while [ $now -lt $end ]; do
+		ct_flush_once "$ns"
+		do_rsleep "$duration"
+		now=$(date +%s)
+        done
+}
+
+ctflood()
+{
+	local ns="$1"
+	local duration="$2"
+	local msg="$3"
+	local now=$(date +%s)
+	local end=$((now + duration))
+	local j=0
+	local k=0
+
+        while [ $now -lt $end ]; do
+		j=$((j%256))
+		k=$((k%256))
+
+		ip netns exec "$ns" bash -c \
+			"j=$j k=$k; for i in \$(seq 1 254); do ping -q -c 1 127.\$k.\$j.\$i & done; wait" >/dev/null 2>&1
+
+		j=$((j+1))
+
+		if [ $j -eq 256 ];then
+			k=$((k+1))
+		fi
+
+		now=$(date +%s)
+	done
+
+	wait
+}
+
+# dump to /dev/null.  We don't want dumps to cause infinite loops
+# or use-after-free even when conntrack table is altered while dumps
+# are in progress.
+ct_nulldump()
+{
+	local ns="$1"
+
+	ip netns exec "$ns" conntrack -L > /dev/null 2>&1 &
+
+	# Don't require /proc support in conntrack
+	if [ -r /proc/self/net/nf_conntrack ] ; then
+		ip netns exec "$ns" bash -c "wc -l < /proc/self/net/nf_conntrack" > /dev/null &
+	fi
+
+	wait
+}
+
+check_taint()
+{
+	local tainted_then="$1"
+	local msg="$2"
+
+	local tainted_now=0
+
+	if [ "$tainted_then" -ne 0 ];then
+		return
+	fi
+
+	read tainted_now < /proc/sys/kernel/tainted
+
+	if [ "$tainted_now" -eq 0 ];then
+		echo "PASS: $msg"
+	else
+		echo "TAINT: $msg"
+		dmesg
+		exit 1
+	fi
+}
+
+insert_flood()
+{
+	local n="$1"
+	local r=0
+
+	r=$((RANDOM%$insert_count))
+
+	ctflood "$n" "$timeout" "floodresize" &
+	insert_ctnetlink "$n" "$r" &
+	ctflush "$n" "$timeout" &
+	ct_nulldump "$n" &
+
+	wait
+}
+
+test_floodresize_all()
+{
+	local timeout=20
+	local n=""
+	local tainted_then=""
+
+	read tainted_then < /proc/sys/kernel/tainted
+
+	for n in "$nsclient1" "$nsclient2";do
+		insert_flood "$n" &
+	done
+
+	# resize table constantly while flood/insert/dump/flushs
+	# are happening in parallel.
+	ctresize "$timeout"
+
+	# wait for subshells to complete, everything is limited
+	# by $timeout.
+	wait
+
+	check_taint "$tainted_then" "resize+flood"
+}
+
+check_dump()
+{
+	local ns="$1"
+	local protoname="$2"
+	local c=0
+	local proto=0
+	local proc=0
+	local unique=""
+	local lret=0
+
+	# NOTE: assumes timeouts are large enough to not have
+	# expirations in all following tests.
+	l=$(ip netns exec "$ns" conntrack -L 2>/dev/null | sort | tee "$tmpfile" | wc -l)
+	c=$(ip netns exec "$ns" conntrack -C)
+
+	if [ "$c" -eq 0 ]; then
+		echo "FAIL: conntrack count for $ns is 0"
+		lret=1
+	fi
+
+	if [ "$c" -ne "$l" ]; then
+		echo "FAIL: conntrack count inconsistency for $ns -L: $c != $l"
+		lret=1
+	fi
+
+	# check the dump we retrieved is free of duplicated entries.
+	unique=$(uniq "$tmpfile" | tee "$tmpfile_uniq" | wc -l)
+	if [ "$l" -ne "$unique" ]; then
+		echo "FAIL: listing contained redundant entries for $ns: $l != $unique"
+		diff -u "$tmpfile" "$tmpfile_uniq"
+		lret=1
+	fi
+
+	# we either inserted icmp or only udp, hence, --proto should return same entry count as without filter.
+	proto=$(ip netns exec "$ns" conntrack -L --proto $protoname 2>/dev/null | sort | uniq | tee "$tmpfile_uniq" | wc -l)
+	if [ "$l" -ne "$proto" ]; then
+		echo "FAIL: dump inconsistency for $ns -L --proto $protoname: $l != $proto"
+		diff -u "$tmpfile" "$tmpfile_uniq"
+		lret=1
+	fi
+
+	if [ -r /proc/self/net/nf_conntrack ] ; then
+		proc=$(ip netns exec "$ns" bash -c "sort < /proc/self/net/nf_conntrack | tee \"$tmpfile_proc\" | wc -l")
+
+		if [ "$l" -ne "$proc" ]; then
+			echo "FAIL: proc inconsistency for $ns: $l != $proc"
+			lret=1
+		fi
+
+		proc=$(uniq "$tmpfile_proc" | tee "$tmpfile_uniq" | wc -l)
+		if [ "$l" -ne "$proc" ]; then
+			echo "FAIL: proc inconsistency after uniq filter for $ns: $l != $proc"
+			diff -u "$tmpfile_proc" "$tmpfile_uniq"
+			lret=1
+		fi
+	fi
+
+	if [ $lret -eq 0 ];then
+		echo "PASS: dump in netns $ns had same entry count (-C $c, -L $l, -p $proto, /proc $proc)"
+	else
+		echo "FAIL: dump in netns $ns had different entry count (-C $c, -L $l, -p $proto, /proc $proc)"
+		ret=1
+	fi
+}
+
+test_dump_all()
+{
+	local timeout=3
+	local tainted_then=""
+
+	read tainted_then < /proc/sys/kernel/tainted
+
+	ct_flush_once "$nsclient1"
+	ct_flush_once "$nsclient2"
+
+	ip netns exec "$nsclient1" sysctl -q net.netfilter.nf_conntrack_icmp_timeout=3600
+
+	ctflood "$nsclient1" $timeout "dumpall" &
+	insert_ctnetlink "$nsclient2" $insert_count
+
+	wait
+
+	check_dump "$nsclient1" "icmp"
+	check_dump "$nsclient2" "udp"
+
+	check_taint "$tainted_then" "test parallel conntrack dumps"
+}
+
+check_sysctl_immutable()
+{
+	local ns="$1"
+	local name="$2"
+	local failhard="$3"
+	local o=0
+	local n=0
+
+	o=$(ip netns exec "$ns" sysctl -n "$name" 2>/dev/null)
+	n=$((o+1))
+
+	# return value isn't reliable, need to read it back
+	ip netns exec "$ns" sysctl -q "$name"=$n 2>/dev/null >/dev/null
+
+	n=$(ip netns exec "$ns" sysctl -n "$name" 2>/dev/null)
+
+	[ -z "$n" ] && return 1
+
+	if [ $o -ne $n ]; then
+		if [ $failhard -gt 0 ] ;then
+			echo "FAIL: net.$name should not be changeable from namespace (now $n)"
+			ret=1
+		fi
+		return 0
+	fi
+
+	return 1
+}
+
+test_conntrack_max_limit()
+{
+	sysctl -q net.netfilter.nf_conntrack_max=100
+	insert_ctnetlink "$nsclient1" 101
+
+	# check netns is clamped by init_net, i.e., either netns follows
+	# init_net value, or a higher pernet limit (compared to init_net) is ignored.
+	check_ctcount "$nsclient1" 100 "netns conntrack_max is init_net bound"
+
+	sysctl -q net.netfilter.nf_conntrack_max=$init_net_max
+}
+
+test_conntrack_disable()
+{
+	local timeout=2
+
+	# disable conntrack pickups
+	ip netns exec "$nsclient1" nft flush table ip test_ct
+
+	ct_flush_once "$nsclient1"
+	ct_flush_once "$nsclient2"
+
+	ctflood "$nsclient1" "$timeout" "conntrack disable"
+	ip netns exec "$nsclient2" ping -q -c 1 127.0.0.1 >/dev/null 2>&1
+
+	# Disabled, should not have picked up any connection.
+	check_ctcount "$nsclient1" 0 "conntrack disabled"
+
+	# This one is still active, expect 1 connection.
+	check_ctcount "$nsclient2" 1 "conntrack enabled"
+}
+
+init_net_max=$(sysctl -n net.netfilter.nf_conntrack_max)
+
+check_max_alias $init_net_max
+
+sysctl -q net.netfilter.nf_conntrack_max="262000"
+check_max_alias 262000
+
+setup_ns nsclient1 nsclient2
+
+# check this only works from init_net
+for n in netfilter.nf_conntrack_buckets netfilter.nf_conntrack_expect_max net.nf_conntrack_max;do
+	check_sysctl_immutable "$nsclient1" "net.$n" 1
+done
+
+# won't work on older kernels. If it works, check that the netns obeys the limit
+if check_sysctl_immutable "$nsclient1" net.netfilter.nf_conntrack_max 0;then
+	# subtest: if pernet is changeable, check that reducing it in pernet
+	# limits the pernet entries.  Inverse, pernet clamped by a lower init_net
+	# setting, is already checked by "test_conntrack_max_limit" test.
+
+	ip netns exec "$nsclient1" sysctl -q net.netfilter.nf_conntrack_max=1
+	insert_ctnetlink "$nsclient1" 2
+	check_ctcount "$nsclient1" 1 "netns conntrack_max is pernet bound"
+	ip netns exec "$nsclient1" sysctl -q net.netfilter.nf_conntrack_max=$init_net_max
+fi
+
+for n in "$nsclient1" "$nsclient2";do
+# enable conntrack in both namespaces
+ip netns exec "$n" nft -f - <<EOF
+table ip test_ct {
+	chain input {
+		type filter hook input priority 0
+		ct state new counter
+	}
+}
+EOF
+done
+
+tmpfile=$(mktemp)
+tmpfile_proc=$(mktemp)
+tmpfile_uniq=$(mktemp)
+test_conntrack_max_limit
+test_dump_all
+test_floodresize_all
+test_conntrack_disable
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c
new file mode 100644
index 000000000000..507930cee8cb
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Needs something like:
+ *
+ * iptables -t nat -A POSTROUTING -o nomatch -j MASQUERADE
+ *
+ * so NAT engine attaches a NAT null-binding to each connection.
+ *
+ * With unmodified kernels, child or parent will exit with
+ * "Port number changed" error, even though no port translation
+ * was requested.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+
+#define LEN 512
+#define PORT 56789
+#define TEST_TIME 5
+
+static void die(const char *e)
+{
+	perror(e);
+	exit(111);
+}
+
+static void die_port(uint16_t got, uint16_t want)
+{
+	fprintf(stderr, "Port number changed, wanted %d got %d\n", want, ntohs(got));
+	exit(1);
+}
+
+static int udp_socket(void)
+{
+	static const struct timeval tv = {
+		.tv_sec = 1,
+	};
+	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+
+	if (fd < 0)
+		die("socket");
+
+	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+	return fd;
+}
+
+int main(int argc, char *argv[])
+{
+	struct sockaddr_in sa1 = {
+		.sin_family = AF_INET,
+	};
+	struct sockaddr_in sa2 = {
+		.sin_family = AF_INET,
+	};
+	int s1, s2, status;
+	time_t end, now;
+	socklen_t plen;
+	char buf[LEN];
+	bool child;
+
+	sa1.sin_port = htons(PORT);
+	sa2.sin_port = htons(PORT + 1);
+
+	s1 = udp_socket();
+	s2 = udp_socket();
+
+	inet_pton(AF_INET, "127.0.0.11", &sa1.sin_addr);
+	inet_pton(AF_INET, "127.0.0.12", &sa2.sin_addr);
+
+	if (bind(s1, (struct sockaddr *)&sa1, sizeof(sa1)) < 0)
+		die("bind 1");
+	if (bind(s2, (struct sockaddr *)&sa2, sizeof(sa2)) < 0)
+		die("bind 2");
+
+	child = fork() == 0;
+
+	now = time(NULL);
+	end = now + TEST_TIME;
+
+	while (now < end) {
+		struct sockaddr_in peer;
+		socklen_t plen = sizeof(peer);
+
+		now = time(NULL);
+
+		if (child) {
+			if (sendto(s1, buf, LEN, 0, (struct sockaddr *)&sa2, sizeof(sa2)) != LEN)
+				continue;
+
+			if (recvfrom(s2, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0)
+				die("child recvfrom");
+
+			if (peer.sin_port != htons(PORT))
+				die_port(peer.sin_port, PORT);
+		} else {
+			if (sendto(s2, buf, LEN, 0, (struct sockaddr *)&sa1, sizeof(sa1)) != LEN)
+				continue;
+
+			if (recvfrom(s1, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0)
+				die("parent recvfrom");
+
+			if (peer.sin_port != htons((PORT + 1)))
+				die_port(peer.sin_port, PORT + 1);
+		}
+	}
+
+	if (child)
+		return 0;
+
+	wait(&status);
+
+	if (WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh
new file mode 100755
index 000000000000..a24c896347a8
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+cleanup()
+{
+	cleanup_all_ns
+}
+
+checktool "nft --version" "run test without nft"
+checktool "conntrack --version" "run test without conntrack"
+
+trap cleanup EXIT
+
+setup_ns ns0
+
+# make loopback connections get nat null bindings assigned
+ip netns exec "$ns0" nft -f - <<EOF
+table ip nat {
+        chain POSTROUTING {
+                type nat hook postrouting priority srcnat; policy accept;
+                oifname "nomatch" counter packets 0 bytes 0 masquerade
+        }
+}
+EOF
+
+do_flush()
+{
+	local end
+	local now
+
+	now=$(date +%s)
+	end=$((now + 5))
+
+	while [ $now -lt $end ];do
+		ip netns exec "$ns0" conntrack -F 2>/dev/null
+		now=$(date +%s)
+	done
+}
+
+do_flush &
+
+if ip netns exec "$ns0" ./conntrack_reverse_clash; then
+	echo "PASS: No SNAT performed for null bindings"
+else
+	echo "ERROR: SNAT performed without any matching snat rule"
+	exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh
new file mode 100755
index 000000000000..d860f7d9744b
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing For SCTP COLLISION SCENARIO as Below:
+#
+#   14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359]
+#   14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187]
+#   14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359]
+#   14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO]
+#   14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK]
+#   14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970]
+#
+# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS
+
+source lib.sh
+
+CLIENT_IP="198.51.200.1"
+CLIENT_PORT=1234
+
+SERVER_IP="198.51.100.1"
+SERVER_PORT=1234
+
+CLIENT_GW="198.51.200.2"
+SERVER_GW="198.51.100.2"
+
+# setup the topo
+setup() {
+	setup_ns CLIENT_NS SERVER_NS ROUTER_NS
+	ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS"
+	ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS"
+
+	ip -n "$SERVER_NS" link set link0 up
+	ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0
+	ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW
+
+	ip -n "$ROUTER_NS" link set link1 up
+	ip -n "$ROUTER_NS" link set link2 up
+	ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1
+	ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2
+	ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1
+
+	ip -n "$CLIENT_NS" link set link3 up
+	ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3
+	ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW
+
+	# simulate the delay on OVS upcall by setting up a delay for INIT_ACK with
+	# tc on $SERVER_NS side
+	tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64
+	tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit
+	tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \
+		0xff match u8 2 0xff at 32 flowid 1:1
+	if ! tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then
+		echo "SKIP: Cannot add netem qdisc"
+		exit $ksft_skip
+	fi
+
+	# simulate the ctstate check on OVS nf_conntrack
+	ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP
+	ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP
+
+	# use a smaller number for assoc's max_retrans to reproduce the issue
+	modprobe -q sctp
+	ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3
+}
+
+cleanup() {
+	ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1
+	ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1
+	cleanup_all_ns
+}
+
+do_test() {
+	ip net exec "$SERVER_NS" ./sctp_collision server \
+		$SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT &
+	ip net exec "$CLIENT_NS" ./sctp_collision client \
+		$CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT
+}
+
+# NOTE: one way to work around the issue is set a smaller hb_interval
+# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500
+
+# run the test case
+trap cleanup EXIT
+setup && \
+echo "Test for SCTP Collision in nf_conntrack:" && \
+do_test && echo "PASS!"
+exit $?
diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh
new file mode 100755
index 000000000000..121ea93c0178
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that UNREPLIED tcp conntrack will eventually timeout.
+#
+
+source lib.sh
+
+if ! nft --version > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+if ! conntrack --version > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without conntrack tool"
+	exit $ksft_skip
+fi
+
+ret=0
+
+cleanup() {
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+ipv4() {
+    echo -n 192.168."$1".2
+}
+
+check_counter()
+{
+	ns=$1
+	name=$2
+	expect=$3
+	local lret=0
+
+	if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then
+		echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
+		ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2
+		lret=1
+	fi
+
+	return $lret
+}
+
+trap cleanup EXIT
+
+# Create test namespaces
+setup_ns ns1 ns2
+
+# Connect the namespace to the host using a veth pair
+ip -net "$ns1" link add name veth1 type veth peer name veth2
+ip -net "$ns1" link set netns "$ns2" dev veth2
+
+ip -net "$ns1" link set up dev lo
+ip -net "$ns2" link set up dev lo
+ip -net "$ns1" link set up dev veth1
+ip -net "$ns2" link set up dev veth2
+
+ip -net "$ns2" addr add 10.11.11.2/24 dev veth2
+ip -net "$ns2" route add default via 10.11.11.1
+
+ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1
+
+# add a rule inside NS so we enable conntrack
+ip netns exec "$ns1" nft -f - <<EOF
+table inet filter {
+	chain input {
+		type filter hook input priority 0; policy accept;
+		ct state established accept
+	}
+}
+EOF
+
+ip -net "$ns1" addr add 10.11.11.1/24 dev veth1
+ip -net "$ns1" route add 10.99.99.99 via 10.11.11.2
+
+# Check connectivity works
+ip netns exec "$ns1" ping -q -c 2 10.11.11.2 >/dev/null || exit 1
+
+ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT &
+
+ip netns exec "$ns2" nft -f - <<EOF
+table inet filter {
+	counter connreq { }
+	counter redir { }
+	chain input {
+		type filter hook input priority 0; policy accept;
+		ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
+		ct state new ct status dnat tcp dport 8080 counter name "redir" accept
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nft rules"
+	exit 1
+fi
+
+ip netns exec "$ns2" sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
+
+echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
+ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do
+	socat -u STDIN TCP:10.99.99.99:80 < /dev/null
+	sleep 0.1
+	done' &
+
+wait_for_attempt()
+{
+	count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
+	if [ "$count" -gt 0 ]; then
+		return 0
+	fi
+
+	return 1
+}
+
+# wait for conntrack to pick the new connection request up before loading
+# the nat redirect rule.
+if ! busywait "$BUSYWAIT_TIMEOUT" wait_for_attempt; then
+	echo "ERROR: $ns2 did not pick up tcp connection from peer"
+	exit 1
+fi
+
+ip netns exec "$ns2" nft -f - <<EOF
+table inet nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nat redirect"
+	exit 1
+fi
+
+wait_for_redirect()
+{
+	count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
+	if [ "$count" -gt 0 ]; then
+		return 0
+	fi
+
+	return 1
+}
+echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect"
+
+busywait "$BUSYWAIT_TIMEOUT" wait_for_redirect
+ret=$?
+
+expect="packets 1 bytes 60"
+if ! check_counter "$ns2" "redir" "$expect"; then
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: redirection counter has expected values"
+else
+	echo "ERROR: no tcp connection was redirected"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
new file mode 100755
index 000000000000..207b79932d91
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+
+# This script demonstrates interaction of conntrack and vrf.
+# The vrf driver calls the netfilter hooks again, with oif/iif
+# pointing at the VRF device.
+#
+# For ingress, this means first iteration has iifname of lower/real
+# device.  In this script, thats veth0.
+# Second iteration is iifname set to vrf device, tvrf in this script.
+#
+# For egress, this is reversed: first iteration has the vrf device,
+# second iteration is done with the lower/real/veth0 device.
+#
+# test_ct_zone_in demonstrates unexpected change of nftables
+# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
+# connection on VRF rcv"
+#
+# It was possible to assign conntrack zone to a packet (or mark it for
+# `notracking`) in the prerouting chain before conntrack, based on real iif.
+#
+# After the change, the zone assignment is lost and the zone is assigned based
+# on the VRF master interface (in case such a rule exists).
+# assignment is lost. Instead, assignment based on the `iif` matching
+# Thus it is impossible to distinguish packets based on the original
+# interface.
+#
+# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem
+# that was supposed to be fixed by the commit mentioned above to make sure
+# that any fix to test case 1 won't break masquerade again.
+
+source lib.sh
+
+IP0=172.30.30.1
+IP1=172.30.30.2
+PFXL=30
+ret=0
+
+cleanup()
+{
+	ip netns pids $ns0 | xargs kill 2>/dev/null
+	ip netns pids $ns1 | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+checktool "nft --version" "run test without nft"
+checktool "conntrack --version" "run test without conntrack"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+
+setup_ns ns0 ns1
+
+if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then
+	echo "SKIP: Could not add veth device"
+	exit $ksft_skip
+fi
+
+if ! ip -net "$ns0" li add tvrf type vrf table 9876; then
+	echo "SKIP: Could not add vrf device"
+	exit $ksft_skip
+fi
+
+ip -net "$ns0" li set veth0 master tvrf
+ip -net "$ns0" li set tvrf up
+ip -net "$ns0" li set veth0 up
+ip -net "$ns1" li set veth0 up
+
+ip -net "$ns0" addr add $IP0/$PFXL dev veth0
+ip -net "$ns1" addr add $IP1/$PFXL dev veth0
+
+listener_ready()
+{
+        local ns="$1"
+
+        ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555"
+}
+
+ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null &
+busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1"
+
+# test vrf ingress handling.
+# The incoming connection should be placed in conntrack zone 1,
+# as decided by the first iteration of the ruleset.
+test_ct_zone_in()
+{
+ip netns exec "$ns0" nft -f - <<EOF
+table testct {
+	chain rawpre {
+		type filter hook prerouting priority raw;
+
+		iif { veth0, tvrf } counter meta nftrace set 1
+		iif veth0 counter ct zone set 1 counter return
+		iif tvrf counter ct zone set 2 counter return
+		ip protocol icmp counter
+		notrack counter
+	}
+
+	chain rawout {
+		type filter hook output priority raw;
+
+		oif veth0 counter ct zone set 1 counter return
+		oif tvrf counter ct zone set 2 counter return
+		notrack counter
+	}
+}
+EOF
+	ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null
+
+	# should be in zone 1, not zone 2
+	count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
+	if [ "$count" -eq 1 ]; then
+		echo "PASS: entry found in conntrack zone 1"
+	else
+		echo "FAIL: entry not found in conntrack zone 1"
+		count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
+		if [ "$count" -eq 1 ]; then
+			echo "FAIL: entry found in zone 2 instead"
+		else
+			echo "FAIL: entry not in zone 1 or 2, dumping table"
+			ip netns exec "$ns0" conntrack -L
+			ip netns exec "$ns0" nft list ruleset
+		fi
+	fi
+}
+
+# add masq rule that gets evaluated w. outif set to vrf device.
+# This tests the first iteration of the packet through conntrack,
+# oifname is the vrf device.
+test_masquerade_vrf()
+{
+	local qdisc=$1
+
+	if [ "$qdisc" != "default" ]; then
+		tc -net "$ns0" qdisc add dev tvrf root "$qdisc"
+	fi
+
+	ip netns exec "$ns0" conntrack -F 2>/dev/null
+
+ip netns exec "$ns0" nft -f - <<EOF
+flush ruleset
+table ip nat {
+	chain rawout {
+		type filter hook output priority raw;
+
+		oif tvrf ct state untracked counter
+	}
+	chain postrouting2 {
+		type filter hook postrouting priority mangle;
+
+		oif tvrf ct state untracked counter
+	}
+	chain postrouting {
+		type nat hook postrouting priority 0;
+		# NB: masquerade should always be combined with 'oif(name) bla',
+		# lack of this is intentional here, we want to exercise double-snat.
+		ip saddr 172.30.30.0/30 counter masquerade random
+	}
+}
+EOF
+	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
+		echo "FAIL: connect failure with masquerade + sport rewrite on vrf device"
+		ret=1
+		return
+	fi
+
+	# must also check that nat table was evaluated on second (lower device) iteration.
+	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' &&
+	   ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then
+		echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
+	else
+		echo "FAIL: vrf rules have unexpected counter value"
+		ret=1
+	fi
+
+	if [ "$qdisc" != "default" ]; then
+		tc -net "$ns0" qdisc del dev tvrf root
+	fi
+}
+
+# add masq rule that gets evaluated w. outif set to veth device.
+# This tests the 2nd iteration of the packet through conntrack,
+# oifname is the lower device (veth0 in this case).
+test_masquerade_veth()
+{
+	ip netns exec "$ns0" conntrack -F 2>/dev/null
+ip netns exec "$ns0" nft -f - <<EOF
+flush ruleset
+table ip nat {
+	chain postrouting {
+		type nat hook postrouting priority 0;
+		meta oif veth0 ip saddr 172.30.30.0/30 counter masquerade random
+	}
+}
+EOF
+	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
+		echo "FAIL: connect failure with masquerade + sport rewrite on veth device"
+		ret=1
+		return
+	fi
+
+	# must also check that nat table was evaluated on second (lower device) iteration.
+	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then
+		echo "PASS: connect with masquerade + sport rewrite on veth device"
+	else
+		echo "FAIL: vrf masq rule has unexpected counter value"
+		ret=1
+	fi
+}
+
+test_ct_zone_in
+test_masquerade_vrf "default"
+test_masquerade_vrf "pfifo"
+test_masquerade_veth
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh
new file mode 100755
index 000000000000..6af2ea3ad6b8
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/ipvs.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# End-to-end ipvs test suite
+# Topology:
+#--------------------------------------------------------------+
+#                      |                                       |
+#         ns0          |         ns1                           |
+#      -----------     |     -----------    -----------        |
+#      | veth01  | --------- | veth10  |    | veth12  |        |
+#      -----------    peer   -----------    -----------        |
+#           |          |                        |              |
+#      -----------     |                        |              |
+#      |  br0    |     |-----------------  peer |--------------|
+#      -----------     |                        |              |
+#           |          |                        |              |
+#      ----------     peer   ----------      -----------       |
+#      |  veth02 | --------- |  veth20 |     | veth21  |       |
+#      ----------      |     ----------      -----------       |
+#                      |         ns2                           |
+#                      |                                       |
+#--------------------------------------------------------------+
+#
+# We assume that all network driver are loaded
+#
+
+source lib.sh
+
+ret=0
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+readonly port=8080
+
+readonly vip_v4=207.175.44.110
+readonly cip_v4=10.0.0.2
+readonly gip_v4=10.0.0.1
+readonly dip_v4=172.16.0.1
+readonly rip_v4=172.16.0.2
+readonly sip_v4=10.0.0.3
+
+readonly infile="$(mktemp)"
+readonly outfile="$(mktemp)"
+readonly datalen=32
+
+sysipvsnet="/proc/sys/net/ipv4/vs/"
+if [ ! -d $sysipvsnet ]; then
+	if ! modprobe -q ip_vs; then
+		echo "skip: could not run test without ipvs module"
+		exit $ksft_skip
+	fi
+fi
+
+checktool "ipvsadm -v" "run test without ipvsadm"
+checktool "socat -h" "run test without socat"
+
+setup() {
+	setup_ns ns0 ns1 ns2
+
+	ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}"
+	ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}"
+	ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}"
+
+	ip netns exec "${ns0}" ip link set veth01 up
+	ip netns exec "${ns0}" ip link set veth02 up
+	ip netns exec "${ns0}" ip link add br0 type bridge
+	ip netns exec "${ns0}" ip link set veth01 master br0
+	ip netns exec "${ns0}" ip link set veth02 master br0
+	ip netns exec "${ns0}" ip link set br0 up
+	ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0
+
+	ip netns exec "${ns1}" ip link set veth10 up
+	ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10
+	ip netns exec "${ns1}" ip link set veth12 up
+	ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12
+
+	ip netns exec "${ns2}" ip link set veth21 up
+	ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21
+	ip netns exec "${ns2}" ip link set veth20 up
+	ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20
+
+	sleep 1
+
+	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
+}
+
+cleanup() {
+	cleanup_all_ns
+
+	if [ -f "${outfile}" ]; then
+		rm "${outfile}"
+	fi
+	if [ -f "${infile}" ]; then
+		rm "${infile}"
+	fi
+}
+
+server_listen() {
+	ip netns exec "$ns2" timeout 5 socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" &
+	server_pid=$!
+	sleep 0.2
+}
+
+client_connect() {
+	ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}"
+}
+
+verify_data() {
+	wait "${server_pid}"
+	cmp "$infile" "$outfile" 2>/dev/null
+}
+
+test_service() {
+	server_listen
+	client_connect
+	verify_data
+}
+
+
+test_dr() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	# avoid incorrect arp response
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+test_nat() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	ip netns exec "${ns2}" ip link del veth20
+	ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21
+
+	test_service
+}
+
+test_tun() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" modprobe -q ipip
+	ip netns exec "${ns1}" ip link set tunl0 up
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r ${rip_v4}:${port}
+	ip netns exec "${ns1}" ip addr add ${vip_v4}/32 dev lo:1
+
+	ip netns exec "${ns2}" modprobe -q ipip
+	ip netns exec "${ns2}" ip link set tunl0 up
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+run_tests() {
+	local errors=
+
+	echo "Testing DR mode..."
+	cleanup
+	setup
+	test_dr
+	errors=$(( $errors + $? ))
+
+	echo "Testing NAT mode..."
+	cleanup
+	setup
+	test_nat
+	errors=$(( $errors + $? ))
+
+	echo "Testing Tunnel mode..."
+	cleanup
+	setup
+	test_tun
+	errors=$(( $errors + $? ))
+
+	return $errors
+}
+
+trap cleanup EXIT
+
+run_tests
+
+if [ $? -ne 0 ]; then
+	echo -e "$(basename $0): ${RED}FAIL${NC}"
+	exit 1
+fi
+echo -e "$(basename $0): ${GREEN}PASS${NC}"
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh
new file mode 100644
index 000000000000..bedd35298e15
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/lib.sh
@@ -0,0 +1,10 @@
+net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "$net_netfilter_dir/../lib.sh"
+
+checktool (){
+	if ! $1 > /dev/null 2>&1; then
+		echo "SKIP: Could not $2"
+		exit $ksft_skip
+	fi
+}
diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
new file mode 100755
index 000000000000..c6fdd2079f4d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+checktool "conntrack --version" "run test without conntrack"
+checktool "iptables --version" "run test without iptables"
+checktool "ip6tables --version" "run test without ip6tables"
+
+modprobe -q tun
+modprobe -q nf_conntrack
+# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns
+
+PDRILL_TIMEOUT=10
+
+files="
+conntrack_ack_loss_stall.pkt
+conntrack_inexact_rst.pkt
+conntrack_syn_challenge_ack.pkt
+conntrack_synack_old.pkt
+conntrack_synack_reuse.pkt
+conntrack_rst_invalid.pkt
+"
+
+if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then
+	echo "SKIP: packetdrill not installed"
+	exit ${ksft_skip}
+fi
+
+ret=0
+
+run_packetdrill()
+{
+	filename="$1"
+	ipver="$2"
+	local mtu=1500
+
+	export NFCT_IP_VERSION="$ipver"
+
+	if [ "$ipver" = "ipv4" ];then
+		export xtables="iptables"
+	elif [ "$ipver" = "ipv6" ];then
+		export xtables="ip6tables"
+		mtu=1520
+	fi
+
+	timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \
+		--tolerance_usecs=1000000 --non_fatal packet "$filename"
+}
+
+run_one_test_file()
+{
+	filename="$1"
+
+	for v in ipv4 ipv6;do
+		printf "%-50s(%s)%-20s" "$filename" "$v" ""
+		if run_packetdrill packetdrill/"$f" "$v";then
+			echo OK
+		else
+			echo FAIL
+			ret=1
+		fi
+	done
+}
+
+echo "Replaying packetdrill test cases:"
+for f in $files;do
+	run_one_test_file packetdrill/"$f"
+done
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
new file mode 100755
index 000000000000..1014551dd769
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test NAT source port clash resolution
+#
+
+source lib.sh
+ret=0
+socatpid=0
+
+cleanup()
+{
+	[ "$socatpid" -gt 0 ] && kill "$socatpid"
+
+	cleanup_all_ns
+}
+
+checktool "socat -h" "run test without socat"
+checktool "iptables --version" "run test without iptables"
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2
+
+# Connect the namespaces using a veth pair
+ip link add name veth2 type veth peer name veth1
+ip link set netns "$ns1" dev veth1
+ip link set netns "$ns2" dev veth2
+
+ip netns exec "$ns1" ip link set up dev lo
+ip netns exec "$ns1" ip link set up dev veth1
+ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1
+
+ip netns exec "$ns2" ip link set up dev lo
+ip netns exec "$ns2" ip link set up dev veth2
+ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2
+
+# Create a server in one namespace
+ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 &
+socatpid=$!
+
+# Restrict source port to just one so we don't have to exhaust
+# all others.
+ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000"
+
+# add a virtual IP using DNAT
+ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201
+
+# ... and route it to the other namespace
+ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1
+
+# add a persistent connection from the other namespace
+ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null &
+
+sleep 1
+
+# ip daddr:dport will be rewritten to 192.168.1.1 5201
+# NAT must reallocate source port 10000 because
+# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use
+echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null
+ret=$?
+
+# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
+if [ $ret -eq 0 ]; then
+	echo "PASS: socat can connect via NAT'd address"
+else
+	echo "FAIL: socat cannot connect via NAT'd address"
+fi
+
+# check sport clashres.
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201
+
+sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null &
+
+# if connect succeeds, client closes instantly due to EOF on stdin.
+# if connect hangs, it will time out after 5s.
+echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null &
+cpid2=$!
+
+time_then=$(date +%s)
+wait $cpid2
+rv=$?
+time_now=$(date +%s)
+
+# Check how much time has elapsed, expectation is for
+# 'cpid2' to connect and then exit (and no connect delay).
+delta=$((time_now - time_then))
+
+if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then
+	echo "PASS: could connect to service via redirected ports"
+else
+	echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)"
+	ret=1
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nf_queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c
new file mode 100644
index 000000000000..9e56b9d47037
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_queue.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <arpa/inet.h>
+
+#include <libmnl/libmnl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+
+struct options {
+	bool count_packets;
+	bool gso_enabled;
+	int verbose;
+	unsigned int queue_num;
+	unsigned int timeout;
+	uint32_t verdict;
+	uint32_t delay_ms;
+};
+
+static unsigned int queue_stats[5];
+static struct options opts;
+
+static void help(const char *p)
+{
+	printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p);
+}
+
+static int parse_attr_cb(const struct nlattr *attr, void *data)
+{
+	const struct nlattr **tb = data;
+	int type = mnl_attr_get_type(attr);
+
+	/* skip unsupported attribute in user-space */
+	if (mnl_attr_type_valid(attr, NFQA_MAX) < 0)
+		return MNL_CB_OK;
+
+	switch (type) {
+	case NFQA_MARK:
+	case NFQA_IFINDEX_INDEV:
+	case NFQA_IFINDEX_OUTDEV:
+	case NFQA_IFINDEX_PHYSINDEV:
+	case NFQA_IFINDEX_PHYSOUTDEV:
+		if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) {
+			perror("mnl_attr_validate");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_TIMESTAMP:
+		if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+		    sizeof(struct nfqnl_msg_packet_timestamp)) < 0) {
+			perror("mnl_attr_validate2");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_HWADDR:
+		if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+		    sizeof(struct nfqnl_msg_packet_hw)) < 0) {
+			perror("mnl_attr_validate2");
+			return MNL_CB_ERROR;
+		}
+		break;
+	case NFQA_PAYLOAD:
+		break;
+	}
+	tb[type] = attr;
+	return MNL_CB_OK;
+}
+
+static int queue_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[NFQA_MAX+1] = { 0 };
+	struct nfqnl_msg_packet_hdr *ph = NULL;
+	uint32_t id = 0;
+
+	(void)data;
+
+	mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb);
+	if (tb[NFQA_PACKET_HDR]) {
+		ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]);
+		id = ntohl(ph->packet_id);
+
+		if (opts.verbose > 0)
+			printf("packet hook=%u, hwproto 0x%x",
+				ntohs(ph->hw_protocol), ph->hook);
+
+		if (ph->hook >= 5) {
+			fprintf(stderr, "Unknown hook %d\n", ph->hook);
+			return MNL_CB_ERROR;
+		}
+
+		if (opts.verbose > 0) {
+			uint32_t skbinfo = 0;
+
+			if (tb[NFQA_SKB_INFO])
+				skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO]));
+			if (skbinfo & NFQA_SKB_CSUMNOTREADY)
+				printf(" csumnotready");
+			if (skbinfo & NFQA_SKB_GSO)
+				printf(" gso");
+			if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED)
+				printf(" csumnotverified");
+			puts("");
+		}
+
+		if (opts.count_packets)
+			queue_stats[ph->hook]++;
+	}
+
+	return MNL_CB_OK + id;
+}
+
+static struct nlmsghdr *
+nfq_build_cfg_request(char *buf, uint8_t command, int queue_num)
+{
+	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
+	struct nfqnl_msg_config_cmd cmd = {
+		.command = command,
+		.pf = htons(AF_INET),
+	};
+	struct nfgenmsg *nfg;
+
+	nlh->nlmsg_type	= (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd);
+
+	return nlh;
+}
+
+static struct nlmsghdr *
+nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num)
+{
+	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
+	struct nfqnl_msg_config_params params = {
+		.copy_range = htonl(range),
+		.copy_mode = mode,
+	};
+	struct nfgenmsg *nfg;
+
+	nlh->nlmsg_type	= (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), &params);
+
+	return nlh;
+}
+
+static struct nlmsghdr *
+nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd)
+{
+	struct nfqnl_msg_verdict_hdr vh = {
+		.verdict = htonl(verd),
+		.id = htonl(id),
+	};
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfg;
+
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+	nfg->nfgen_family = AF_UNSPEC;
+	nfg->version = NFNETLINK_V0;
+	nfg->res_id = htons(queue_num);
+
+	mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh);
+
+	return nlh;
+}
+
+static void print_stats(void)
+{
+	unsigned int last, total;
+	int i;
+
+	total = 0;
+	last = queue_stats[0];
+
+	for (i = 0; i < 5; i++) {
+		printf("hook %d packets %08u\n", i, queue_stats[i]);
+		last = queue_stats[i];
+		total += last;
+	}
+
+	printf("%u packets total\n", total);
+}
+
+struct mnl_socket *open_queue(void)
+{
+	char buf[MNL_SOCKET_BUFFER_SIZE];
+	unsigned int queue_num;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	struct timeval tv;
+	uint32_t flags;
+
+	nl = mnl_socket_open(NETLINK_NETFILTER);
+	if (nl == NULL) {
+		perror("mnl_socket_open");
+		exit(EXIT_FAILURE);
+	}
+
+	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
+		perror("mnl_socket_bind");
+		exit(EXIT_FAILURE);
+	}
+
+	queue_num = opts.queue_num;
+	nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num);
+
+	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+		perror("mnl_socket_sendto");
+		exit(EXIT_FAILURE);
+	}
+
+	nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num);
+
+	flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0;
+	flags |= NFQA_CFG_F_UID_GID;
+	mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags));
+	mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags));
+
+	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+		perror("mnl_socket_sendto");
+		exit(EXIT_FAILURE);
+	}
+
+	memset(&tv, 0, sizeof(tv));
+	tv.tv_sec = opts.timeout;
+	if (opts.timeout && setsockopt(mnl_socket_get_fd(nl),
+				       SOL_SOCKET, SO_RCVTIMEO,
+				       &tv, sizeof(tv))) {
+		perror("setsockopt(SO_RCVTIMEO)");
+		exit(EXIT_FAILURE);
+	}
+
+	return nl;
+}
+
+static void sleep_ms(uint32_t delay)
+{
+	struct timespec ts = { .tv_sec = delay / 1000 };
+
+	delay %= 1000;
+
+	ts.tv_nsec = delay * 1000llu * 1000llu;
+
+	nanosleep(&ts, NULL);
+}
+
+static int mainloop(void)
+{
+	unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	unsigned int portid;
+	char *buf;
+	int ret;
+
+	buf = malloc(buflen);
+	if (!buf) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	nl = open_queue();
+	portid = mnl_socket_get_portid(nl);
+
+	for (;;) {
+		uint32_t id;
+
+		ret = mnl_socket_recvfrom(nl, buf, buflen);
+		if (ret == -1) {
+			if (errno == ENOBUFS || errno == EINTR)
+				continue;
+
+			if (errno == EAGAIN) {
+				errno = 0;
+				ret = 0;
+				break;
+			}
+
+			perror("mnl_socket_recvfrom");
+			exit(EXIT_FAILURE);
+		}
+
+		ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL);
+		if (ret < 0) {
+			perror("mnl_cb_run");
+			exit(EXIT_FAILURE);
+		}
+
+		id = ret - MNL_CB_OK;
+		if (opts.delay_ms)
+			sleep_ms(opts.delay_ms);
+
+		nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict);
+		if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+			perror("mnl_socket_sendto");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	mnl_socket_close(nl);
+
+	return ret;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) {
+		switch (c) {
+		case 'c':
+			opts.count_packets = true;
+			break;
+		case 'h':
+			help(argv[0]);
+			exit(0);
+			break;
+		case 'q':
+			opts.queue_num = atoi(optarg);
+			if (opts.queue_num > 0xffff)
+				opts.queue_num = 0;
+			break;
+		case 'Q':
+			opts.verdict = atoi(optarg);
+			if (opts.verdict > 0xffff) {
+				fprintf(stderr, "Expected destination queue number\n");
+				exit(1);
+			}
+
+			opts.verdict <<= 16;
+			opts.verdict |= NF_QUEUE;
+			break;
+		case 'd':
+			opts.delay_ms = atoi(optarg);
+			if (opts.delay_ms == 0) {
+				fprintf(stderr, "Expected nonzero delay (in milliseconds)\n");
+				exit(1);
+			}
+			break;
+		case 't':
+			opts.timeout = atoi(optarg);
+			break;
+		case 'G':
+			opts.gso_enabled = false;
+			break;
+		case 'v':
+			opts.verbose++;
+			break;
+		}
+	}
+
+	if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) {
+		fprintf(stderr, "Cannot use same destination and source queue\n");
+		exit(1);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	int ret;
+
+	opts.verdict = NF_ACCEPT;
+	opts.gso_enabled = true;
+
+	parse_opts(argc, argv);
+
+	ret = mainloop();
+	if (opts.count_packets)
+		print_stats();
+
+	return ret;
+}
diff --git a/tools/testing/selftests/net/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh
new file mode 100755
index 000000000000..87f2b4c725aa
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_audit.sh
@@ -0,0 +1,269 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that audit logs generated for nft commands are as expected.
+
+SKIP_RC=4
+RC=0
+
+if [ -r /var/run/auditd.pid ];then
+	read pid < /var/run/auditd.pid
+	p=$(pgrep ^auditd$)
+
+	if [ "$pid" -eq "$p" ]; then
+		echo "SKIP: auditd is running"
+		exit $SKIP_RC
+	fi
+fi
+
+nft --version >/dev/null 2>&1 || {
+	echo "SKIP: missing nft tool"
+	exit $SKIP_RC
+}
+
+# nft must be recent enough to support "reset" keyword.
+nft --check -f /dev/stdin >/dev/null 2>&1 <<EOF
+add table t
+add chain t c
+reset rules t c
+EOF
+
+if [ "$?" -ne 0 ];then
+	echo -n "SKIP: nft reset feature test failed: "
+	nft --version
+	exit $SKIP_RC
+fi
+
+# Run everything in a separate network namespace
+[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
+
+# give other scripts a chance to finish - audit_logread sees all activity
+sleep 1
+
+logfile=$(mktemp)
+rulefile=$(mktemp)
+echo "logging into $logfile"
+./audit_logread >"$logfile" &
+logread_pid=$!
+trap 'kill $logread_pid; rm -f $logfile $rulefile' EXIT
+exec 3<"$logfile"
+
+lsplit='s/^\(.*\) entries=\([^ ]*\) \(.*\)$/pfx="\1"\nval="\2"\nsfx="\3"/'
+summarize_logs() {
+	sum=0
+	while read line; do
+		eval $(sed "$lsplit" <<< "$line")
+		[[ $sum -gt 0 ]] && {
+			[[ "$pfx $sfx" == "$tpfx $tsfx" ]] && {
+				let "sum += val"
+				continue
+			}
+			echo "$tpfx entries=$sum $tsfx"
+		}
+		tpfx="$pfx"
+		tsfx="$sfx"
+		sum=$val
+	done
+	echo "$tpfx entries=$sum $tsfx"
+}
+
+do_test() { # (cmd, log)
+	echo -n "testing for cmd: $1 ... "
+	cat <&3 >/dev/null
+	$1 >/dev/null || exit 1
+	sleep 0.1
+	res=$(diff -a -u <(echo "$2") <(summarize_logs <&3))
+	[ $? -eq 0 ] && { echo "OK"; return; }
+	echo "FAIL"
+	grep -v '^\(---\|+++\|@@\)' <<< "$res"
+	((RC--))
+}
+
+nft flush ruleset
+
+# adding tables, chains and rules
+
+for table in t1 t2; do
+	do_test "nft add table $table" \
+	"table=$table family=2 entries=1 op=nft_register_table"
+
+	do_test "nft add chain $table c1" \
+	"table=$table family=2 entries=1 op=nft_register_chain"
+
+	do_test "nft add chain $table c2; add chain $table c3" \
+	"table=$table family=2 entries=2 op=nft_register_chain"
+
+	cmd="add rule $table c1 counter"
+
+	do_test "nft $cmd" \
+	"table=$table family=2 entries=1 op=nft_register_rule"
+
+	do_test "nft $cmd; $cmd" \
+	"table=$table family=2 entries=2 op=nft_register_rule"
+
+	cmd=""
+	sep=""
+	for chain in c2 c3; do
+		for i in {1..3}; do
+			cmd+="$sep add rule $table $chain counter"
+			sep=";"
+		done
+	done
+	do_test "nft $cmd" \
+	"table=$table family=2 entries=6 op=nft_register_rule"
+done
+
+for ((i = 0; i < 500; i++)); do
+	echo "add rule t2 c3 counter accept comment \"rule $i\""
+done > "$rulefile"
+do_test "nft -f $rulefile" \
+'table=t2 family=2 entries=500 op=nft_register_rule'
+
+# adding sets and elements
+
+settype='type inet_service; counter'
+setelem='{ 22, 80, 443 }'
+setblock="{ $settype; elements = $setelem; }"
+do_test "nft add set t1 s $setblock" \
+"table=t1 family=2 entries=4 op=nft_register_set"
+
+do_test "nft add set t1 s2 $setblock; add set t1 s3 { $settype; }" \
+"table=t1 family=2 entries=5 op=nft_register_set"
+
+do_test "nft add element t1 s3 $setelem" \
+"table=t1 family=2 entries=3 op=nft_register_setelem"
+
+# adding counters
+
+do_test 'nft add counter t1 c1' \
+'table=t1 family=2 entries=1 op=nft_register_obj'
+
+do_test 'nft add counter t2 c1; add counter t2 c2' \
+'table=t2 family=2 entries=2 op=nft_register_obj'
+
+for ((i = 3; i <= 500; i++)); do
+	echo "add counter t2 c$i"
+done > "$rulefile"
+do_test "nft -f $rulefile" \
+'table=t2 family=2 entries=498 op=nft_register_obj'
+
+# adding/updating quotas
+
+do_test 'nft add quota t1 q1 { 10 bytes }' \
+'table=t1 family=2 entries=1 op=nft_register_obj'
+
+do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \
+'table=t2 family=2 entries=2 op=nft_register_obj'
+
+for ((i = 3; i <= 500; i++)); do
+	echo "add quota t2 q$i { 10 bytes }"
+done > "$rulefile"
+do_test "nft -f $rulefile" \
+'table=t2 family=2 entries=498 op=nft_register_obj'
+
+# changing the quota value triggers obj update path
+do_test 'nft add quota t1 q1 { 20 bytes }' \
+'table=t1 family=2 entries=1 op=nft_register_obj'
+
+# resetting rules
+
+do_test 'nft reset rules t1 c2' \
+'table=t1 family=2 entries=3 op=nft_reset_rule'
+
+do_test 'nft reset rules table t1' \
+'table=t1 family=2 entries=9 op=nft_reset_rule'
+
+do_test 'nft reset rules t2 c3' \
+'table=t2 family=2 entries=503 op=nft_reset_rule'
+
+do_test 'nft reset rules t2' \
+'table=t2 family=2 entries=509 op=nft_reset_rule'
+
+do_test 'nft reset rules' \
+'table=t1 family=2 entries=9 op=nft_reset_rule
+table=t2 family=2 entries=509 op=nft_reset_rule'
+
+# resetting sets and elements
+
+elem=(22 ",80" ",443")
+relem=""
+for i in {1..3}; do
+	relem+="${elem[((i - 1))]}"
+	do_test "nft reset element t1 s { $relem }" \
+	"table=t1 family=2 entries=$i op=nft_reset_setelem"
+done
+
+do_test 'nft reset set t1 s' \
+'table=t1 family=2 entries=3 op=nft_reset_setelem'
+
+# resetting counters
+
+do_test 'nft reset counter t1 c1' \
+'table=t1 family=2 entries=1 op=nft_reset_obj'
+
+do_test 'nft reset counters t1' \
+'table=t1 family=2 entries=1 op=nft_reset_obj'
+
+do_test 'nft reset counters t2' \
+'table=t2 family=2 entries=500 op=nft_reset_obj'
+
+do_test 'nft reset counters' \
+'table=t1 family=2 entries=1 op=nft_reset_obj
+table=t2 family=2 entries=500 op=nft_reset_obj'
+
+# resetting quotas
+
+do_test 'nft reset quota t1 q1' \
+'table=t1 family=2 entries=1 op=nft_reset_obj'
+
+do_test 'nft reset quotas t1' \
+'table=t1 family=2 entries=1 op=nft_reset_obj'
+
+do_test 'nft reset quotas t2' \
+'table=t2 family=2 entries=500 op=nft_reset_obj'
+
+do_test 'nft reset quotas' \
+'table=t1 family=2 entries=1 op=nft_reset_obj
+table=t2 family=2 entries=500 op=nft_reset_obj'
+
+# deleting rules
+
+readarray -t handles < <(nft -a list chain t1 c1 | \
+			 sed -n 's/.*counter.* handle \(.*\)$/\1/p')
+
+do_test "nft delete rule t1 c1 handle ${handles[0]}" \
+'table=t1 family=2 entries=1 op=nft_unregister_rule'
+
+cmd='delete rule t1 c1 handle'
+do_test "nft $cmd ${handles[1]}; $cmd ${handles[2]}" \
+'table=t1 family=2 entries=2 op=nft_unregister_rule'
+
+do_test 'nft flush chain t1 c2' \
+'table=t1 family=2 entries=3 op=nft_unregister_rule'
+
+do_test 'nft flush table t2' \
+'table=t2 family=2 entries=509 op=nft_unregister_rule'
+
+# deleting chains
+
+do_test 'nft delete chain t2 c2' \
+'table=t2 family=2 entries=1 op=nft_unregister_chain'
+
+# deleting sets and elements
+
+do_test 'nft delete element t1 s { 22 }' \
+'table=t1 family=2 entries=1 op=nft_unregister_setelem'
+
+do_test 'nft delete element t1 s { 80, 443 }' \
+'table=t1 family=2 entries=2 op=nft_unregister_setelem'
+
+do_test 'nft flush set t1 s2' \
+'table=t1 family=2 entries=3 op=nft_unregister_setelem'
+
+do_test 'nft delete set t1 s2' \
+'table=t1 family=2 entries=1 op=nft_unregister_set'
+
+do_test 'nft delete set t1 s3' \
+'table=t1 family=2 entries=1 op=nft_unregister_set'
+
+exit $RC
diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
new file mode 100755
index 000000000000..efea93cf23d4
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
@@ -0,0 +1,1890 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# nft_concat_range.sh - Tests for sets with concatenation of ranged fields
+#
+# Copyright (c) 2019 Red Hat GmbH
+#
+# Author: Stefano Brivio <sbrivio@redhat.com>
+#
+# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317
+# ^ Configuration and templates sourced with eval, counters reused in subshells
+
+source lib.sh
+
+# Available test groups:
+# - reported_issues: check for issues that were reported in the past
+# - correctness: check that packets match given entries, and only those
+# - correctness_large: same but with additional non-matching entries
+# - concurrency: attempt races between insertion, deletion and lookup
+# - timeout: check that packets match entries until they expire
+# - performance: estimate matching rate, compare with rbtree and hash baselines
+TESTS="reported_issues correctness correctness_large concurrency timeout"
+
+[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}"
+
+# Set types, defined by TYPE_ variables below
+TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto
+       net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp
+       net6_port_net6_port net_port_mac_proto_net"
+
+# Reported bugs, also described by TYPE_ variables below
+BUGS="flush_remove_add reload net_port_proto_match avx2_mismatch"
+
+# List of possible paths to pktgen script from kernel tree for performance tests
+PKTGEN_SCRIPT_PATHS="
+	../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
+	pktgen/pktgen_bench_xmit_mode_netif_receive.sh"
+
+# Definition of set types:
+# display	display text for test report
+# type_spec	nftables set type specifier
+# chain_spec	nftables type specifier for rules mapping to set
+# dst		call sequence of format_*() functions for destination fields
+# src		call sequence of format_*() functions for source fields
+# start		initial integer used to generate addresses and ports
+# count		count of entries to generate and match
+# src_delta	number summed to destination generator for source fields
+# tools		list of tools for correctness and timeout tests, any can be used
+# proto		L4 protocol of test packets
+#
+# race_repeat	race attempts per thread, 0 disables concurrency test for type
+# flood_tools	list of tools for concurrency tests, any can be used
+# flood_proto	L4 protocol of test packets for concurrency tests
+# flood_spec	nftables type specifier for concurrency tests
+#
+# perf_duration	duration of single pktgen injection test
+# perf_spec	nftables type specifier for performance tests
+# perf_dst	format_*() functions for destination fields in performance test
+# perf_src	format_*() functions for source fields in performance test
+# perf_entries	number of set entries for performance test
+# perf_proto	L3 protocol of test packets
+TYPE_net_port="
+display		net,port
+type_spec	ipv4_addr . inet_service
+chain_spec	ip daddr . udp dport
+dst		addr4 port
+src		 
+start		1
+count		5
+src_delta	2000
+tools		sendip bash
+proto		udp
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	udp
+flood_spec	ip daddr . udp dport
+
+perf_duration	5
+perf_spec	ip daddr . udp dport
+perf_dst	addr4 port
+perf_src	 
+perf_entries	1000
+perf_proto	ipv4
+"
+
+TYPE_port_net="
+display		port,net
+type_spec	inet_service . ipv4_addr
+chain_spec	udp dport . ip daddr
+dst		port addr4
+src		 
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	udp
+flood_spec	udp dport . ip daddr
+
+perf_duration	5
+perf_spec	udp dport . ip daddr
+perf_dst	port addr4
+perf_src	 
+perf_entries	100
+perf_proto	ipv4
+"
+
+TYPE_net6_port="
+display		net6,port
+type_spec	ipv6_addr . inet_service
+chain_spec	ip6 daddr . udp dport
+dst		addr6 port
+src		 
+start		10
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp6
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	tcp6
+flood_spec	ip6 daddr . udp dport
+
+perf_duration	5
+perf_spec	ip6 daddr . udp dport
+perf_dst	addr6 port
+perf_src	 
+perf_entries	1000
+perf_proto	ipv6
+"
+
+TYPE_port_proto="
+display		port,proto
+type_spec	inet_service . inet_proto
+chain_spec	udp dport . meta l4proto
+dst		port proto
+src		 
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	5
+perf_spec	udp dport . meta l4proto
+perf_dst	port proto
+perf_src	 
+perf_entries	30000
+perf_proto	ipv4
+"
+
+TYPE_net6_port_mac="
+display		net6,port,mac
+type_spec	ipv6_addr . inet_service . ether_addr
+chain_spec	ip6 daddr . udp dport . ether saddr
+dst		addr6 port
+src		mac
+start		10
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp6
+
+race_repeat	0
+
+perf_duration	5
+perf_spec	ip6 daddr . udp dport . ether daddr
+perf_dst	addr6 port mac
+perf_src	 
+perf_entries	10
+perf_proto	ipv6
+"
+
+TYPE_net6_port_mac_proto="
+display		net6,port,mac,proto
+type_spec	ipv6_addr . inet_service . ether_addr . inet_proto
+chain_spec	ip6 daddr . udp dport . ether saddr . meta l4proto
+dst		addr6 port
+src		mac proto
+start		10
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp6
+
+race_repeat	0
+
+perf_duration	5
+perf_spec	ip6 daddr . udp dport . ether daddr . meta l4proto
+perf_dst	addr6 port mac proto
+perf_src	 
+perf_entries	1000
+perf_proto	ipv6
+"
+
+TYPE_net_port_net="
+display		net,port,net
+type_spec	ipv4_addr . inet_service . ipv4_addr
+chain_spec	ip daddr . udp dport . ip saddr
+dst		addr4 port
+src		addr4
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	tcp
+flood_spec	ip daddr . udp dport . ip saddr
+
+perf_duration	0
+"
+
+TYPE_net6_port_net6_port="
+display		net6,port,net6,port
+type_spec	ipv6_addr . inet_service . ipv6_addr . inet_service
+chain_spec	ip6 daddr . udp dport . ip6 saddr . udp sport
+dst		addr6 port
+src		addr6 port
+start		10
+count		5
+src_delta	2000
+tools		sendip socat
+proto		udp6
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	tcp6
+flood_spec	ip6 daddr . tcp dport . ip6 saddr . tcp sport
+
+perf_duration	0
+"
+
+TYPE_net_port_mac_proto_net="
+display		net,port,mac,proto,net
+type_spec	ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr
+chain_spec	ip daddr . udp dport . ether saddr . meta l4proto . ip saddr
+dst		addr4 port
+src		mac proto addr4
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_net_mac="
+display		net,mac
+type_spec	ipv4_addr . ether_addr
+chain_spec	ip daddr . ether saddr
+dst		addr4
+src		mac
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	5
+perf_spec	ip daddr . ether daddr
+perf_dst	addr4 mac
+perf_src	 
+perf_entries	1000
+perf_proto	ipv4
+"
+
+TYPE_mac_net="
+display		mac,net
+type_spec	ether_addr . ipv4_addr
+chain_spec	ether saddr . ip saddr
+dst		 
+src		mac addr4
+start		1
+count		5
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_net_mac_icmp="
+display		net,mac - ICMP
+type_spec	ipv4_addr . ether_addr
+chain_spec	ip daddr . ether saddr
+dst		addr4
+src		mac
+start		1
+count		5
+src_delta	2000
+tools		ping
+proto		icmp
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_net6_mac_icmp="
+display		net6,mac - ICMPv6
+type_spec	ipv6_addr . ether_addr
+chain_spec	ip6 daddr . ether saddr
+dst		addr6
+src		mac
+start		10
+count		50
+src_delta	2000
+tools		ping
+proto		icmp6
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_net_port_proto_net="
+display		net,port,proto,net
+type_spec	ipv4_addr . inet_service . inet_proto . ipv4_addr
+chain_spec	ip daddr . udp dport . meta l4proto . ip saddr
+dst		addr4 port proto
+src		addr4
+start		1
+count		5
+src_delta	2000
+tools		sendip socat
+proto		udp
+
+race_repeat	3
+flood_tools	iperf3 iperf netperf
+flood_proto	tcp
+flood_spec	ip daddr . tcp dport . meta l4proto . ip saddr
+
+perf_duration	0
+"
+
+# Definition of tests for bugs reported in the past:
+# display	display text for test report
+TYPE_flush_remove_add="
+display		Add two elements, flush, re-add
+"
+
+TYPE_reload="
+display		net,mac with reload
+type_spec	ipv4_addr . ether_addr
+chain_spec	ip daddr . ether saddr
+dst		addr4
+src		mac
+start		1
+count		1
+src_delta	2000
+tools		sendip socat bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_net_port_proto_match="
+display		net,port,proto
+type_spec	ipv4_addr . inet_service . inet_proto
+chain_spec	ip daddr . udp dport . meta l4proto
+dst		addr4 port proto
+src
+start		1
+count		9
+src_delta	9
+tools		sendip bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	0
+"
+
+TYPE_avx2_mismatch="
+display		avx2 false match
+type_spec	inet_proto . ipv6_addr
+chain_spec	meta l4proto . ip6 daddr
+dst		proto addr6
+src
+start		1
+count		1
+src_delta	1
+tools		ping
+proto		icmp6
+
+race_repeat	0
+
+perf_duration	0
+"
+
+
+# Set template for all tests, types and rules are filled in depending on test
+set_template='
+flush ruleset
+
+table inet filter {
+	counter test {
+		packets 0 bytes 0
+	}
+
+	set test {
+		type ${type_spec}
+		flags interval,timeout
+	}
+
+	chain input {
+		type filter hook prerouting priority 0; policy accept;
+		${chain_spec} @test counter name \"test\"
+	}
+}
+
+table netdev perf {
+	counter test {
+		packets 0 bytes 0
+	}
+
+	counter match {
+		packets 0 bytes 0
+	}
+
+	set test {
+		type ${type_spec}
+		flags interval
+	}
+
+	set norange {
+		type ${type_spec}
+	}
+
+	set noconcat {
+		type ${type_spec%% *}
+		flags interval
+	}
+
+	chain test {
+		type filter hook ingress device veth_a priority 0;
+	}
+}
+'
+
+err_buf=
+info_buf=
+
+# Append string to error buffer
+err() {
+	err_buf="${err_buf}${1}
+"
+}
+
+# Append string to information buffer
+info() {
+	info_buf="${info_buf}${1}
+"
+}
+
+# Flush error buffer to stdout
+err_flush() {
+	printf "%s" "${err_buf}"
+	err_buf=
+}
+
+# Flush information buffer to stdout
+info_flush() {
+	printf "%s" "${info_buf}"
+	info_buf=
+}
+
+# Setup veth pair: this namespace receives traffic, B generates it
+setup_veth() {
+	ip netns add B
+	ip link add veth_a type veth peer name veth_b || return 1
+
+	ip link set veth_a up
+	ip link set veth_b netns B
+
+	ip -n B link set veth_b up
+
+	ip addr add dev veth_a 10.0.0.1
+	ip route add default dev veth_a
+
+	ip -6 addr add fe80::1/64 dev veth_a nodad
+	ip -6 addr add 2001:db8::1/64 dev veth_a nodad
+	ip -6 route add default dev veth_a
+
+	ip -n B route add default dev veth_b
+
+	ip -6 -n B addr add fe80::2/64 dev veth_b nodad
+	ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad
+	ip -6 -n B route add default dev veth_b
+
+	B() {
+		ip netns exec B "$@" >/dev/null 2>&1
+	}
+}
+
+# Fill in set template and initialise set
+setup_set() {
+	eval "echo \"${set_template}\"" | nft -f -
+}
+
+# Check that at least one of the needed tools is available
+check_tools() {
+	[ -z "${tools}" ] && return 0
+
+	__tools=
+	for tool in ${tools}; do
+		__tools="${__tools} ${tool}"
+
+		command -v "${tool}" >/dev/null && return 0
+	done
+	err "need one of:${__tools}, skipping" && return 1
+}
+
+# Set up function to send ICMP packets
+setup_send_icmp() {
+	send_icmp() {
+		B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1
+	}
+}
+
+# Set up function to send ICMPv6 packets
+setup_send_icmp6() {
+	if command -v ping6 >/dev/null; then
+		send_icmp6() {
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+			B ping6 -q -c1 -W1 "${dst_addr6}"
+		}
+	else
+		send_icmp6() {
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+			B ping -q -6 -c1 -W1 "${dst_addr6}"
+		}
+	fi
+}
+
+# Set up function to send single UDP packets on IPv4
+setup_send_udp() {
+	if command -v sendip >/dev/null; then
+		send_udp() {
+			[ -n "${src_port}" ] && src_port="-us ${src_port}"
+			[ -n "${dst_port}" ] && dst_port="-ud ${dst_port}"
+			[ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}"
+
+			# shellcheck disable=SC2086 # sendip needs split options
+			B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \
+						${dst_port} "${dst_addr4}"
+
+			src_port=
+			dst_port=
+			src_addr4=
+		}
+	elif command -v socat -v >/dev/null; then
+		send_udp() {
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}" dev veth_b
+				__socatbind=",bind=${src_addr4}"
+				if [ -n "${src_port}" ];then
+					__socatbind="${__socatbind}:${src_port}"
+				fi
+			fi
+
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+			[ -z "${dst_port}" ] && dst_port=12345
+
+			echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}"
+
+			src_addr4=
+			src_port=
+		}
+	elif [ -z "$(bash -c 'type -p')" ]; then
+		send_udp() {
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+				B ip route add default dev veth_b
+			fi
+
+			B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}"
+
+			if [ -n "${src_addr4}" ]; then
+				B ip addr del "${src_addr4}/16" dev veth_b
+			fi
+			src_addr4=
+		}
+	else
+		return 1
+	fi
+}
+
+# Set up function to send single UDP packets on IPv6
+setup_send_udp6() {
+	if command -v sendip >/dev/null; then
+		send_udp6() {
+			[ -n "${src_port}" ] && src_port="-us ${src_port}"
+			[ -n "${dst_port}" ] && dst_port="-ud ${dst_port}"
+			if [ -n "${src_addr6}" ]; then
+				src_addr6="-6s ${src_addr6}"
+			else
+				src_addr6="-6s 2001:db8::2"
+			fi
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \
+						${dst_port} "${dst_addr6}"
+
+			src_port=
+			dst_port=
+			src_addr6=
+		}
+	elif command -v socat -v >/dev/null; then
+		send_udp6() {
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			__socatbind6=
+
+			if [ -n "${src_addr6}" ]; then
+				B ip addr add "${src_addr6}" dev veth_b nodad
+
+				__socatbind6=",bind=[${src_addr6}]"
+
+				if [ -n "${src_port}" ] ;then
+					__socatbind6="${__socatbind6}:${src_port}"
+				fi
+			fi
+
+			echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}"
+		}
+	elif [ -z "$(bash -c 'type -p')" ]; then
+		send_udp6() {
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+			B ip addr add "${src_addr6}" dev veth_b nodad
+			B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}"
+			ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null
+		}
+	else
+		return 1
+	fi
+}
+
+listener_ready()
+{
+	port="$1"
+	ss -lnt -o "sport = :$port" | grep -q "$port"
+}
+
+# Set up function to send TCP traffic on IPv4
+setup_flood_tcp() {
+	if command -v iperf3 >/dev/null; then
+		flood_tcp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+				src_addr4="-B ${src_addr4}"
+			else
+				B ip addr add dev veth_b 10.0.0.2
+				src_addr4="-B 10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_port="--cport ${src_port}"
+			fi
+			B ip route add default dev veth_b 2>/dev/null
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \
+				${src_addr4} -l16 -t 1000
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	elif command -v iperf >/dev/null; then
+		flood_tcp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+				src_addr4="-B ${src_addr4}"
+			else
+				B ip addr add dev veth_b 10.0.0.2 2>/dev/null
+				src_addr4="-B 10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_addr4="${src_addr4}:${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \
+				-l20 -t 1000
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	elif command -v netperf >/dev/null; then
+		flood_tcp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+			else
+				B ip addr add dev veth_b 10.0.0.2
+				src_addr4="10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				dst_port="${dst_port},${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			netserver -4 ${dst_port} -L "${dst_addr4}" \
+				>/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B netperf -4 -H "${dst_addr4}" ${dst_port} \
+				-L "${src_addr4}" -l 1000 -t TCP_STREAM
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	else
+		return 1
+	fi
+}
+
+# Set up function to send TCP traffic on IPv6
+setup_flood_tcp6() {
+	if command -v iperf3 >/dev/null; then
+		flood_tcp6() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr6}" ]; then
+				B ip addr add "${src_addr6}" dev veth_b nodad
+				src_addr6="-B ${src_addr6}"
+			else
+				src_addr6="-B 2001:db8::2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_port="--cport ${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf3 -c "${dst_addr6}" ${dst_port} \
+				${src_port} ${src_addr6} -l16 -t 1000
+
+			src_addr6=
+			src_port=
+			dst_port=
+		}
+	elif command -v iperf >/dev/null; then
+		flood_tcp6() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr6}" ]; then
+				B ip addr add "${src_addr6}" dev veth_b nodad
+				src_addr6="-B ${src_addr6}"
+			else
+				src_addr6="-B 2001:db8::2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_addr6="${src_addr6}:${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf -c "${dst_addr6}" -V ${dst_port} \
+				${src_addr6} -l1 -t 1000
+
+			src_addr6=
+			src_port=
+			dst_port=
+		}
+	elif command -v netperf >/dev/null; then
+		flood_tcp6() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr6}" ]; then
+				B ip addr add "${src_addr6}" dev veth_b nodad
+			else
+				src_addr6="2001:db8::2"
+			fi
+			if [ -n "${src_port}" ]; then
+				dst_port="${dst_port},${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+				2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			netserver -6 ${dst_port} -L "${dst_addr6}" \
+				>/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B netperf -6 -H "${dst_addr6}" ${dst_port} \
+				-L "${src_addr6}" -l 1000 -t TCP_STREAM
+
+			src_addr6=
+			src_port=
+			dst_port=
+		}
+	else
+		return 1
+	fi
+}
+
+# Set up function to send UDP traffic on IPv4
+setup_flood_udp() {
+	if command -v iperf3 >/dev/null; then
+		flood_udp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+				src_addr4="-B ${src_addr4}"
+			else
+				B ip addr add dev veth_b 10.0.0.2 2>/dev/null
+				src_addr4="-B 10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_port="--cport ${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf3 -s -DB "${dst_addr4}" ${dst_port}
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \
+				${dst_port} ${src_port} ${src_addr4}
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	elif command -v iperf >/dev/null; then
+		flood_udp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+				src_addr4="-B ${src_addr4}"
+			else
+				B ip addr add dev veth_b 10.0.0.2
+				src_addr4="-B 10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				src_addr4="${src_addr4}:${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \
+				${dst_port} ${src_addr4}
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	elif command -v netperf >/dev/null; then
+		flood_udp() {
+			local n_port="${dst_port}"
+			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+			if [ -n "${src_addr4}" ]; then
+				B ip addr add "${src_addr4}/16" dev veth_b
+			else
+				B ip addr add dev veth_b 10.0.0.2
+				src_addr4="10.0.0.2"
+			fi
+			if [ -n "${src_port}" ]; then
+				dst_port="${dst_port},${src_port}"
+			fi
+			B ip route add default dev veth_b
+			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+			# shellcheck disable=SC2086 # this needs split options
+			netserver -4 ${dst_port} -L "${dst_addr4}" \
+				>/dev/null 2>&1
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
+
+			# shellcheck disable=SC2086 # this needs split options
+			B netperf -4 -H "${dst_addr4}" ${dst_port} \
+				-L "${src_addr4}" -l 1000 -t UDP_STREAM
+
+			src_addr4=
+			src_port=
+			dst_port=
+		}
+	else
+		return 1
+	fi
+}
+
+# Find pktgen script and set up function to start pktgen injection
+setup_perf() {
+	for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do
+		command -v "${pktgen_script_path}" >/dev/null && break
+	done
+	[ "${pktgen_script_path}" = "__notfound" ] && return 1
+
+	perf_ipv4() {
+		${pktgen_script_path} -s80 \
+			-i veth_a -d "${dst_addr4}" -p "${dst_port}" \
+			-m "${dst_mac}" \
+			-t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null &
+		perf_pid=$!
+	}
+	perf_ipv6() {
+		IP6=6 ${pktgen_script_path} -s100 \
+			-i veth_a -d "${dst_addr6}" -p "${dst_port}" \
+			-m "${dst_mac}" \
+			-t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null &
+		perf_pid=$!
+	}
+}
+
+# Clean up before each test
+cleanup() {
+	nft reset counter inet filter test	>/dev/null 2>&1
+	nft flush ruleset			>/dev/null 2>&1
+	ip link del dummy0			2>/dev/null
+	ip route del default			2>/dev/null
+	ip -6 route del default			2>/dev/null
+	ip netns pids B				2>/dev/null | xargs kill 2>/dev/null
+	ip netns del B				2>/dev/null
+	ip link del veth_a			2>/dev/null
+	timeout=
+	killall iperf3				2>/dev/null
+	killall iperf				2>/dev/null
+	killall netperf				2>/dev/null
+	killall netserver			2>/dev/null
+}
+
+cleanup_exit() {
+	cleanup
+	rm -f "$tmp"
+}
+
+# Entry point for setup functions
+setup() {
+	if [ "$(id -u)" -ne 0 ]; then
+		echo "  need to run as root"
+		exit ${ksft_skip}
+	fi
+
+	cleanup
+	check_tools || return 1
+	for arg do
+		if ! eval setup_"${arg}"; then
+			err "  ${arg} not supported"
+			return 1
+		fi
+	done
+}
+
+# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it
+format_addr4() {
+	a=$((${1} + 16777216 * 10 + 5))
+	printf "%i.%i.%i.%i"						\
+	       "$((a / 16777216))" "$((a % 16777216 / 65536))"	\
+	       "$((a % 65536 / 256))" "$((a % 256))"
+}
+
+# Format integer into IPv6 address, summing 2001:db8:: to it
+format_addr6() {
+	printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))"
+}
+
+# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it
+format_mac() {
+	printf "00:01:%02x:%02x:%02x:%02x" \
+	       "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))"	\
+	       "$((${1} % 65536 / 256))" "$((${1} % 256))"
+}
+
+# Format integer into port, avoid 0 port
+format_port() {
+	printf "%i" "$((${1} % 65534 + 1))"
+}
+
+# Drop suffixed '6' from L4 protocol, if any
+format_proto() {
+	printf "%s" "${proto}" | tr -d 6
+}
+
+# Format destination and source fields into nft concatenated type
+format() {
+	__start=
+	__end=
+	__expr="{ "
+
+	for f in ${dst}; do
+		[ "${__expr}" != "{ " ] && __expr="${__expr} . "
+
+		__start="$(eval format_"${f}" "${start}")"
+		__end="$(eval format_"${f}" "${end}")"
+
+		if [ "${f}" = "proto" ]; then
+			__expr="${__expr}${__start}"
+		else
+			__expr="${__expr}${__start}-${__end}"
+		fi
+	done
+	for f in ${src}; do
+		[ "${__expr}" != "{ " ] && __expr="${__expr} . "
+
+		__start="$(eval format_"${f}" "${srcstart}")"
+		__end="$(eval format_"${f}" "${srcend}")"
+
+		if [ "${f}" = "proto" ]; then
+			__expr="${__expr}${__start}"
+		else
+			__expr="${__expr}${__start}-${__end}"
+		fi
+	done
+
+	if [ -n "${timeout}" ]; then
+		echo "${__expr} timeout ${timeout}s }"
+	else
+		echo "${__expr} }"
+	fi
+}
+
+# Format destination and source fields into nft type, start element only
+format_norange() {
+	__expr="{ "
+
+	for f in ${dst}; do
+		[ "${__expr}" != "{ " ] && __expr="${__expr} . "
+
+		__expr="${__expr}$(eval format_"${f}" "${start}")"
+	done
+	for f in ${src}; do
+		__expr="${__expr} . $(eval format_"${f}" "${start}")"
+	done
+
+	echo "${__expr} }"
+}
+
+# Format first destination field into nft type
+format_noconcat() {
+	for f in ${dst}; do
+		__start="$(eval format_"${f}" "${start}")"
+		__end="$(eval format_"${f}" "${end}")"
+
+		if [ "${f}" = "proto" ]; then
+			echo "{ ${__start} }"
+		else
+			echo "{ ${__start}-${__end} }"
+		fi
+		return
+	done
+}
+
+# Add single entry to 'test' set in 'inet filter' table
+add() {
+	if ! nft add element inet filter test "${1}"; then
+		err "Failed to add ${1} given ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+# Format and output entries for sets in 'netdev perf' table
+add_perf() {
+	if [ "${1}" = "test" ]; then
+		echo "add element netdev perf test $(format)"
+	elif [ "${1}" = "norange" ]; then
+		echo "add element netdev perf norange $(format_norange)"
+	elif [ "${1}" = "noconcat" ]; then
+		echo "add element netdev perf noconcat $(format_noconcat)"
+	fi
+}
+
+# Add single entry to 'norange' set in 'netdev perf' table
+add_perf_norange() {
+	if ! nft add element netdev perf norange "${1}"; then
+		err "Failed to add ${1} given ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+# Add single entry to 'noconcat' set in 'netdev perf' table
+add_perf_noconcat() {
+	if ! nft add element netdev perf noconcat "${1}"; then
+		err "Failed to add ${1} given ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+# Delete single entry from set
+del() {
+	if ! nft delete element inet filter test "${1}"; then
+		err "Failed to delete ${1} given ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+# Return packet count from 'test' counter in 'inet filter' table
+count_packets() {
+	found=0
+	for token in $(nft list counter inet filter test); do
+		[ ${found} -eq 1 ] && echo "${token}" && return
+		[ "${token}" = "packets" ] && found=1
+	done
+}
+
+# Return packet count from 'test' counter in 'netdev perf' table
+count_perf_packets() {
+	found=0
+	for token in $(nft list counter netdev perf test); do
+		[ ${found} -eq 1 ] && echo "${token}" && return
+		[ "${token}" = "packets" ] && found=1
+	done
+}
+
+# Set MAC addresses, send traffic according to specifier
+flood() {
+	ip link set veth_a address "$(format_mac "${1}")"
+	ip -n B link set veth_b address "$(format_mac "${2}")"
+
+	for f in ${dst}; do
+		eval dst_"$f"=\$\(format_\$f "${1}"\)
+	done
+	for f in ${src}; do
+		eval src_"$f"=\$\(format_\$f "${2}"\)
+	done
+	eval flood_\$proto
+}
+
+# Set MAC addresses, start pktgen injection
+perf() {
+	dst_mac="$(format_mac "${1}")"
+	ip link set veth_a address "${dst_mac}"
+
+	for f in ${dst}; do
+		eval dst_"$f"=\$\(format_\$f "${1}"\)
+	done
+	for f in ${src}; do
+		eval src_"$f"=\$\(format_\$f "${2}"\)
+	done
+	eval perf_\$perf_proto
+}
+
+# Set MAC addresses, send single packet, check that it matches, reset counter
+send_match() {
+	ip link set veth_a address "$(format_mac "${1}")"
+	ip -n B link set veth_b address "$(format_mac "${2}")"
+
+	for f in ${dst}; do
+		eval dst_"$f"=\$\(format_\$f "${1}"\)
+	done
+	for f in ${src}; do
+		eval src_"$f"=\$\(format_\$f "${2}"\)
+	done
+	eval send_\$proto
+	if [ "$(count_packets)" != "1" ]; then
+		err "${proto} packet to:"
+		err "  $(for f in ${dst}; do
+			 eval format_\$f "${1}"; printf ' '; done)"
+		err "from:"
+		err "  $(for f in ${src}; do
+			 eval format_\$f "${2}"; printf ' '; done)"
+		err "should have matched ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+	nft reset counter inet filter test >/dev/null
+}
+
+# Set MAC addresses, send single packet, check that it doesn't match
+send_nomatch() {
+	ip link set veth_a address "$(format_mac "${1}")"
+	ip -n B link set veth_b address "$(format_mac "${2}")"
+
+	for f in ${dst}; do
+		eval dst_"$f"=\$\(format_\$f "${1}"\)
+	done
+	for f in ${src}; do
+		eval src_"$f"=\$\(format_\$f "${2}"\)
+	done
+	eval send_\$proto
+	if [ "$(count_packets)" != "0" ]; then
+		err "${proto} packet to:"
+		err "  $(for f in ${dst}; do
+			 eval format_\$f "${1}"; printf ' '; done)"
+		err "from:"
+		err "  $(for f in ${src}; do
+			 eval format_\$f "${2}"; printf ' '; done)"
+		err "should not have matched ruleset:"
+		err "$(nft -a list ruleset)"
+		return 1
+	fi
+}
+
+# Correctness test template:
+# - add ranged element, check that packets match it
+# - check that packets outside range don't match it
+# - remove some elements, check that packets don't match anymore
+test_correctness_main() {
+	range_size=1
+	for i in $(seq "${start}" $((start + count))); do
+		end=$((start + range_size))
+
+		# Avoid negative or zero-sized port ranges
+		if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+			start=${end}
+			end=$((end + 1))
+		fi
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		add "$(format)" || return 1
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
+			send_match "${j}" $((j + src_delta)) || return 1
+		done
+		send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
+
+		# Delete elements now and then
+		if [ $((i % 3)) -eq 0 ]; then
+			del "$(format)" || return 1
+			for j in $(seq "$start" \
+				   $((range_size / 2 + 1)) ${end}); do
+				send_nomatch "${j}" $((j + src_delta)) \
+					|| return 1
+			done
+		fi
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+}
+
+test_correctness() {
+	setup veth send_"${proto}" set || return ${ksft_skip}
+
+	test_correctness_main
+}
+
+# Repeat the correctness tests, but add extra non-matching entries.
+# This exercises the more compact '4 bit group' representation that
+# gets picked when the default 8-bit representation exceed
+# NFT_PIPAPO_LT_SIZE_HIGH bytes of memory.
+# See usage of NFT_PIPAPO_LT_SIZE_HIGH in pipapo_lt_bits_adjust().
+#
+# The format() helper is way too slow when generating lots of
+# entries so its not used here.
+test_correctness_large() {
+	setup veth send_"${proto}" set || return ${ksft_skip}
+	# number of dummy (filler) entries to add.
+	local dcount=16385
+
+	(
+	echo -n "add element inet filter test { "
+
+	case "$type_spec" in
+	"ether_addr . ipv4_addr")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_mac $((1000000 + i))
+			printf ". 172.%i.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256))
+		done
+		;;
+	"inet_proto . ipv6_addr")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "%i . " $((RANDOM%256))
+			format_addr6 $((1000000 + i))
+		done
+		;;
+	"inet_service . inet_proto")
+		# smaller key sizes, need more entries to hit the
+		# 4-bit threshold.
+		dcount=65536
+		for i in $(seq 1 $dcount); do
+			local proto=$((RANDOM%256))
+
+			# Test uses UDP to match, as it also fails when matching
+			# an entry that doesn't exist, so skip 'udp' entries
+			# to not trigger a wrong failure.
+			[ $proto -eq 17 ] && proto=18
+			[ $i -gt 1 ] && echo ", "
+			printf "%i . %i " $(((i%65534) + 1)) $((proto))
+		done
+		;;
+	"inet_service . ipv4_addr")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "%i . 172.%i.%i.%i " $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((i%256))
+		done
+		;;
+	"ipv4_addr . ether_addr")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256))
+			format_mac $((1000000 + i))
+		done
+		;;
+	"ipv4_addr . inet_service")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . %i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1))
+		done
+		;;
+	"ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr")
+		dcount=65536
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . %i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1))
+			format_mac $((1000000 + i))
+			printf ". %i . 192.168.%i.%i" $((RANDOM%256)) $((RANDOM%256)) $((i%256))
+		done
+		;;
+	"ipv4_addr . inet_service . inet_proto")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . %i . %i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256))
+		done
+		;;
+	"ipv4_addr . inet_service . inet_proto . ipv4_addr")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . %i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256))
+		done
+		;;
+	"ipv4_addr . inet_service . ipv4_addr")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			printf "172.%i.%i.%i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256))
+		done
+		;;
+	"ipv6_addr . ether_addr")
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_addr6 $((i + 1000000))
+			echo -n " . "
+			format_mac $((1000000 + i))
+		done
+		;;
+	"ipv6_addr . inet_service")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_addr6 $((i + 1000000))
+			echo -n " .  $(((RANDOM%65534) + 1))"
+		done
+		;;
+	"ipv6_addr . inet_service . ether_addr")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_addr6 $((i + 1000000))
+			echo -n " .  $(((RANDOM%65534) + 1)) . "
+			format_mac $((i + 1000000))
+		done
+		;;
+	"ipv6_addr . inet_service . ether_addr . inet_proto")
+		dcount=65536
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_addr6 $((i + 1000000))
+			echo -n " .  $(((RANDOM%65534) + 1)) . "
+			format_mac $((i + 1000000))
+			echo -n " .  $((RANDOM%256))"
+		done
+		;;
+	"ipv6_addr . inet_service . ipv6_addr . inet_service")
+		dcount=32768
+		for i in $(seq 1 $dcount); do
+			[ $i -gt 1 ] && echo ", "
+			format_addr6 $((i + 1000000))
+			echo -n " .  $(((RANDOM%65534) + 1)) . "
+			format_addr6 $((i + 2123456))
+			echo -n " .  $((RANDOM%256))"
+		done
+		;;
+	*)
+		"Unhandled $type_spec"
+		return 1
+	esac
+	echo -n "}"
+
+	) | nft -f - || return 1
+
+	test_correctness_main
+}
+
+# Concurrency test template:
+# - add all the elements
+# - start a thread for each physical thread that:
+#   - adds all the elements
+#   - flushes the set
+#   - adds all the elements
+#   - flushes the entire ruleset
+#   - adds the set back
+#   - adds all the elements
+#   - delete all the elements
+test_concurrency() {
+	proto=${flood_proto}
+	tools=${flood_tools}
+	chain_spec=${flood_spec}
+	setup veth flood_"${proto}" set || return ${ksft_skip}
+
+	range_size=1
+	cstart=${start}
+	flood_pids=
+	for i in $(seq "$start" $((start + count))); do
+		end=$((start + range_size))
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		add "$(format)" || return 1
+
+		flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!"
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+
+	sleep $((RANDOM%10))
+
+	pids=
+	for c in $(seq 1 "$(nproc)"); do (
+		for r in $(seq 1 "${race_repeat}"); do
+			range_size=1
+
+			# $start needs to be local to this subshell
+			# shellcheck disable=SC2030
+			start=${cstart}
+			for i in $(seq "$start" $((start + count))); do
+				end=$((start + range_size))
+				srcstart=$((start + src_delta))
+				srcend=$((end + src_delta))
+
+				add "$(format)" 2>/dev/null
+
+				range_size=$((range_size + 1))
+				start=$((end + range_size))
+			done
+
+			nft flush inet filter test 2>/dev/null
+
+			range_size=1
+			start=${cstart}
+			for i in $(seq "$start" $((start + count))); do
+				end=$((start + range_size))
+				srcstart=$((start + src_delta))
+				srcend=$((end + src_delta))
+
+				add "$(format)" 2>/dev/null
+
+				range_size=$((range_size + 1))
+				start=$((end + range_size))
+			done
+
+			nft flush ruleset
+			setup set 2>/dev/null
+
+			range_size=1
+			start=${cstart}
+			for i in $(seq "$start" $((start + count))); do
+				end=$((start + range_size))
+				srcstart=$((start + src_delta))
+				srcend=$((end + src_delta))
+
+				add "$(format)" 2>/dev/null
+
+				range_size=$((range_size + 1))
+				start=$((end + range_size))
+			done
+
+			range_size=1
+			start=${cstart}
+			for i in $(seq "$start" $((start + count))); do
+				end=$((start + range_size))
+				srcstart=$((start + src_delta))
+				srcend=$((end + src_delta))
+
+				del "$(format)" 2>/dev/null
+
+				range_size=$((range_size + 1))
+				start=$((end + range_size))
+			done
+		done
+	) & pids="${pids} $!"
+	done
+
+	# shellcheck disable=SC2046,SC2086 # word splitting wanted here
+	wait $(for pid in ${pids}; do echo ${pid}; done)
+	# shellcheck disable=SC2046,SC2086
+	kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null
+	# shellcheck disable=SC2046,SC2086
+	wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null
+
+	return 0
+}
+
+# Timeout test template:
+# - add all the elements with 3s timeout while checking that packets match
+# - wait 3s after the last insertion, check that packets don't match any entry
+test_timeout() {
+	setup veth send_"${proto}" set || return ${ksft_skip}
+
+	timeout=3
+
+	[ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8
+
+	range_size=1
+	for i in $(seq "$start" $((start + count))); do
+		end=$((start + range_size))
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		add "$(format)" || return 1
+
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
+			send_match "${j}" $((j + src_delta)) || return 1
+		done
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+	sleep $timeout
+	for i in $(seq "$start" $((start + count))); do
+		end=$((start + range_size))
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
+			send_nomatch "${j}" $((j + src_delta)) || return 1
+		done
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+}
+
+# Performance test template:
+# - add concatenated ranged entries
+# - add non-ranged concatenated entries (for hash set matching rate baseline)
+# - add ranged entries with first field only (for rbhash baseline)
+# - start pktgen injection directly on device rx path of this namespace
+# - measure drop only rate, hash and rbtree baselines, then matching rate
+test_performance() {
+	chain_spec=${perf_spec}
+	dst="${perf_dst}"
+	src="${perf_src}"
+	setup veth perf set || return ${ksft_skip}
+
+	first=${start}
+	range_size=1
+	for set in test norange noconcat; do
+		start=${first}
+		for i in $(seq "$start" $((start + perf_entries))); do
+			end=$((start + range_size))
+			srcstart=$((start + src_delta))
+			srcend=$((end + src_delta))
+
+			if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+				start=${end}
+				end=$((end + 1))
+			elif [ "$start" -eq "$end" ]; then
+				end=$((start + 1))
+			fi
+
+			add_perf ${set}
+
+			start=$((end + range_size))
+		done > "${tmp}"
+		nft -f "${tmp}"
+	done
+
+	perf $((end - 1)) "$srcstart"
+
+	sleep 2
+
+	nft add rule netdev perf test counter name \"test\" drop
+	nft reset counter netdev perf test >/dev/null 2>&1
+	sleep "${perf_duration}"
+	pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+	info "    baseline (drop from netdev hook):            ${pps}pps"
+	handle="$(nft -a list chain netdev perf test | grep counter)"
+	handle="${handle##* }"
+	nft delete rule netdev perf test handle "${handle}"
+
+	nft add rule "netdev perf test ${chain_spec} @norange \
+		counter name \"test\" drop"
+	nft reset counter netdev perf test >/dev/null 2>&1
+	sleep "${perf_duration}"
+	pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+	info "    baseline hash (non-ranged entries):          ${pps}pps"
+	handle="$(nft -a list chain netdev perf test | grep counter)"
+	handle="${handle##* }"
+	nft delete rule netdev perf test handle "${handle}"
+
+	nft add rule "netdev perf test ${chain_spec%%. *} @noconcat \
+		counter name \"test\" drop"
+	nft reset counter netdev perf test >/dev/null 2>&1
+	sleep "${perf_duration}"
+	pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+	info "    baseline rbtree (match on first field only): ${pps}pps"
+	handle="$(nft -a list chain netdev perf test | grep counter)"
+	handle="${handle##* }"
+	nft delete rule netdev perf test handle "${handle}"
+
+	nft add rule "netdev perf test ${chain_spec} @test \
+		counter name \"test\" drop"
+	nft reset counter netdev perf test >/dev/null 2>&1
+	sleep "${perf_duration}"
+	pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+	p5="$(printf %5s "${perf_entries}")"
+	info "    set with ${p5} full, ranged entries:         ${pps}pps"
+	kill "${perf_pid}"
+}
+
+test_bug_flush_remove_add() {
+	rounds=100
+	[ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10
+
+	set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }'
+	elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }'
+	elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }'
+	for i in $(seq 1 $rounds); do
+		nft add table t "$set_cmd"	|| return ${ksft_skip}
+		nft add element t s "$elem1"	2>/dev/null || return 1
+		nft flush set t s		2>/dev/null || return 1
+		nft add element t s "$elem2"	2>/dev/null || return 1
+	done
+	nft flush ruleset
+}
+
+# - add ranged element, check that packets match it
+# - reload the set, check packets still match
+test_bug_reload() {
+	setup veth send_"${proto}" set || return ${ksft_skip}
+	rstart=${start}
+
+	range_size=1
+	for i in $(seq "${start}" $((start + count))); do
+		end=$((start + range_size))
+
+		# Avoid negative or zero-sized port ranges
+		if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+			start=${end}
+			end=$((end + 1))
+		fi
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		add "$(format)" || return 1
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+
+	# check kernel does allocate pcpu sctrach map
+	# for reload with no elemet add/delete
+	( echo flush set inet filter test ;
+	  nft list set inet filter test ) | nft -f -
+
+	start=${rstart}
+	range_size=1
+
+	for i in $(seq "${start}" $((start + count))); do
+		end=$((start + range_size))
+
+		# Avoid negative or zero-sized port ranges
+		if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+			start=${end}
+			end=$((end + 1))
+		fi
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
+			send_match "${j}" $((j + src_delta)) || return 1
+		done
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+
+	nft flush ruleset
+}
+
+# - add ranged element, check that packets match it
+# - delete element again, check it is gone
+test_bug_net_port_proto_match() {
+	setup veth send_"${proto}" set || return ${ksft_skip}
+	rstart=${start}
+
+	range_size=1
+	for i in $(seq 1 10); do
+		for j in $(seq 1 20) ; do
+			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+
+			nft "add element inet filter test { $elem }" || return 1
+			nft "get element inet filter test { $elem }" | grep -q "$elem"
+			if [ $? -ne 0 ];then
+				local got=$(nft "get element inet filter test { $elem }")
+				err "post-add: should have returned $elem but got $got"
+				return 1
+			fi
+		done
+	done
+
+	# recheck after set was filled
+	for i in $(seq 1 10); do
+		for j in $(seq 1 20) ; do
+			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+
+			nft "get element inet filter test { $elem }" | grep -q "$elem"
+			if [ $? -ne 0 ];then
+				local got=$(nft "get element inet filter test { $elem }")
+				err "post-fill: should have returned $elem but got $got"
+				return 1
+			fi
+		done
+	done
+
+	# random del and re-fetch
+	for i in $(seq 1 10); do
+		for j in $(seq 1 20) ; do
+			local rnd=$((RANDOM%10))
+			local got=""
+
+			elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+			if [ $rnd -gt 0 ];then
+				continue
+			fi
+
+			nft "delete element inet filter test { $elem }"
+			got=$(nft "get element inet filter test { $elem }" 2>/dev/null)
+			if [ $? -eq 0 ];then
+				err "post-delete: query for $elem returned $got instead of error."
+				return 1
+			fi
+		done
+	done
+
+	nft flush ruleset
+}
+
+test_bug_avx2_mismatch()
+{
+	setup veth send_"${proto}" set || return ${ksft_skip}
+
+	local a1="fe80:dead:01ff:0a02:0b03:6007:8009:a001"
+	local a2="fe80:dead:01fe:0a02:0b03:6007:8009:a001"
+
+	nft "add element inet filter test { icmpv6 . $a1 }"
+
+	dst_addr6="$a2"
+	send_icmp6
+
+	if [ "$(count_packets)" -gt "0" ]; then
+		err "False match for $a2"
+		return 1
+	fi
+}
+
+test_reported_issues() {
+	eval test_bug_"${subtest}"
+}
+
+# Run everything in a separate network namespace
+[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
+tmp="$(mktemp)"
+trap cleanup_exit EXIT
+
+# Entry point for test runs
+passed=0
+for name in ${TESTS}; do
+	printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')"
+	if [ "${name}" = "reported_issues" ]; then
+		SUBTESTS="${BUGS}"
+	else
+		SUBTESTS="${TYPES}"
+	fi
+
+	for subtest in ${SUBTESTS}; do
+		eval desc=\$TYPE_"${subtest}"
+		IFS='
+'
+		for __line in ${desc}; do
+			# shellcheck disable=SC2086
+			eval ${__line%%	*}=\"${__line##*	}\";
+		done
+		IFS=' 	
+'
+
+		if [ "${name}" = "concurrency" ] && \
+		   [ "${race_repeat}" = "0" ]; then
+			continue
+		fi
+		if [ "${name}" = "performance" ] && \
+		   [ "${perf_duration}" = "0" ]; then
+			continue
+		fi
+
+		[ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1
+
+		printf "  %-32s  " "${display}"
+		tthen=$(date +%s)
+		eval test_"${name}"
+		ret=$?
+
+		tnow=$(date +%s)
+		printf "%5ds%-30s" $((tnow-tthen))
+
+		if [ $ret -eq 0 ]; then
+			printf "[ OK ]\n"
+			info_flush
+			passed=$((passed + 1))
+		elif [ $ret -eq 1 ]; then
+			printf "[FAIL]\n"
+			err_flush
+			exit 1
+		elif [ $ret -eq ${ksft_skip} ]; then
+			printf "[SKIP]\n"
+			err_flush
+		fi
+	done
+done
+
+[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0
diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh
new file mode 100755
index 000000000000..5d276995a5c5
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+
+source lib.sh
+
+[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip}
+
+NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh
diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh
new file mode 100755
index 000000000000..abcaa7337197
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+#
+# This tests connection tracking helper assignment:
+# 1. can attach ftp helper to a connection from nft ruleset.
+# 2. auto-assign still works.
+#
+# Kselftest framework requirement - SKIP code is 4.
+
+source lib.sh
+
+ret=0
+
+testipv6=1
+
+checktool "socat -h" "run test without socat"
+checktool "conntrack --version" "run test without conntrack"
+checktool "nft --version" "run test without nft"
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+
+	ip netns del "$ns1"
+	ip netns del "$ns2"
+}
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2
+
+if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+
+ip -net "$ns1" link set veth0 up
+ip -net "$ns2" link set veth0 up
+
+ip -net "$ns1" addr add 10.0.1.1/24 dev veth0
+ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$ns2" addr add 10.0.1.2/24 dev veth0
+ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad
+
+load_ruleset_family() {
+	local family=$1
+	local ns=$2
+
+ip netns exec "$ns" nft -f - <<EOF
+table $family raw {
+	ct helper ftp {
+             type "ftp" protocol tcp
+        }
+	chain pre {
+		type filter hook prerouting priority 0; policy accept;
+		tcp dport 2121 ct helper set "ftp"
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+		tcp dport 2121 ct helper set "ftp"
+	}
+}
+EOF
+	return $?
+}
+
+check_for_helper()
+{
+	local netns=$1
+	local message=$2
+	local port=$3
+
+	if echo "$message" |grep -q 'ipv6';then
+		local family="ipv6"
+	else
+		local family="ipv4"
+	fi
+
+	if ! ip netns exec "$netns" conntrack -L -f $family -p tcp --dport "$port" 2> /dev/null |grep -q 'helper=ftp';then
+		if [ "$autoassign" -eq 0 ] ;then
+			echo "FAIL: ${netns} did not show attached helper $message" 1>&2
+			ret=1
+		else
+			echo "PASS: ${netns} did not show attached helper $message" 1>&2
+		fi
+	else
+		if [ "$autoassign" -eq 0 ] ;then
+			echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
+		else
+			echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2
+			ret=1
+		fi
+	fi
+
+	return 0
+}
+
+listener_ready()
+{
+	ns="$1"
+	port="$2"
+	proto="$3"
+	ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port"
+}
+
+test_helper()
+{
+	local port=$1
+	local autoassign=$2
+
+	if [ "$autoassign" -eq 0 ] ;then
+		msg="set via ruleset"
+	else
+		msg="auto-assign"
+	fi
+
+	ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null &
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4"
+
+	ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null
+
+	check_for_helper "$ns1" "ip $msg" "$port" "$autoassign"
+	check_for_helper "$ns2" "ip $msg" "$port" "$autoassign"
+
+	if [ $testipv6 -eq 0 ] ;then
+		return 0
+	fi
+
+	ip netns exec "$ns1" conntrack -F 2> /dev/null
+	ip netns exec "$ns2" conntrack -F 2> /dev/null
+
+	ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null &
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6"
+
+	ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null
+
+	check_for_helper "$ns1" "ipv6 $msg" "$port"
+	check_for_helper "$ns2" "ipv6 $msg" "$port"
+}
+
+if ! load_ruleset_family ip "$ns1"; then
+	echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2
+	exit 1
+fi
+
+if ! load_ruleset_family ip6 "$ns1"; then
+	echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2
+	testipv6=0
+fi
+
+if ! load_ruleset_family inet "${ns2}"; then
+	echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2
+	if ! load_ruleset_family ip "${ns2}"; then
+		echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2
+		exit 1
+	fi
+
+	if [ "$testipv6" -eq 1 ] ;then
+		if ! load_ruleset_family ip6 "$ns2"; then
+			echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2
+			exit 1
+		fi
+	fi
+fi
+
+test_helper 2121 0
+ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+test_helper 21 1
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh
new file mode 100755
index 000000000000..9929a9ffef65
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_fib.sh
@@ -0,0 +1,847 @@
+#!/bin/bash
+#
+# This tests the fib expression.
+#
+# Kselftest framework requirement - SKIP code is 4.
+#
+#  10.0.1.99     10.0.1.1           10.0.2.1         10.0.2.99
+# dead:1::99    dead:1::1          dead:2::1        dead:2::99
+# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2
+
+source lib.sh
+
+ret=0
+
+timeout=4
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+cleanup()
+{
+	cleanup_all_ns
+
+	[ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
+}
+
+checktool "nft --version" "run test without nft"
+
+setup_ns nsrouter ns1 ns2
+
+trap cleanup EXIT
+
+if dmesg | grep -q ' nft_rpfilter: ';then
+	dmesg -c | grep ' nft_rpfilter: '
+	echo "WARN: a previous test run has failed" 1>&2
+fi
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+
+load_ruleset() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+	        fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop
+	}
+}
+EOF
+}
+
+load_input_ruleset() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain input {
+		type filter hook input priority 0; policy accept;
+	        fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop
+	}
+}
+EOF
+}
+
+load_pbr_ruleset() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain forward {
+		type filter hook forward priority raw;
+		fib saddr . iif oif gt 0 accept
+		log drop
+	}
+}
+EOF
+}
+
+load_type_ruleset() {
+	local netns=$1
+
+	for family in ip ip6;do
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table $family filter {
+	chain type_match_in {
+		fib daddr type local counter comment "daddr configured on other iface"
+		fib daddr . iif type local counter comment "daddr configured on iif"
+		fib daddr type unicast counter comment "daddr not local"
+		fib daddr . iif type unicast counter comment "daddr not configured on iif"
+	}
+
+	chain type_match_out {
+		fib daddr type unicast counter
+		fib daddr . oif type unicast counter
+		fib daddr type local counter
+		fib daddr . oif type local counter
+	}
+
+	chain prerouting {
+		type filter hook prerouting priority 0;
+		icmp type echo-request counter jump type_match_in
+		icmpv6 type echo-request counter jump type_match_in
+	}
+
+	chain input {
+		type filter hook input priority 0;
+		icmp type echo-request counter jump type_match_in
+		icmpv6 type echo-request counter jump type_match_in
+	}
+
+	chain forward {
+		type filter hook forward priority 0;
+		icmp type echo-request counter jump type_match_in
+		icmpv6 type echo-request counter jump type_match_in
+	}
+
+	chain output {
+		type filter hook output priority 0;
+		icmp type echo-request counter jump type_match_out
+		icmpv6 type echo-request counter jump type_match_out
+	}
+
+	chain postrouting {
+		type filter hook postrouting priority 0;
+		icmp type echo-request counter jump type_match_out
+		icmpv6 type echo-request counter jump type_match_out
+	}
+}
+EOF
+done
+}
+
+reload_type_ruleset() {
+	ip netns exec "$1" nft flush table ip filter
+	ip netns exec "$1" nft flush table ip6 filter
+	load_type_ruleset "$1"
+}
+
+check_fib_type_counter_family() {
+	local family="$1"
+	local want="$2"
+	local ns="$3"
+	local chain="$4"
+	local what="$5"
+	local errmsg="$6"
+
+	if ! ip netns exec "$ns" nft list chain "$family" filter "$chain" | grep "$what" | grep -q "packets $want";then
+		echo "Netns $ns $family fib type counter doesn't match expected packet count of $want for $what $errmsg" 1>&2
+		ip netns exec "$ns" nft list chain "$family" filter "$chain"
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+check_fib_type_counter() {
+	check_fib_type_counter_family "ip" "$@" || return 1
+	check_fib_type_counter_family "ip6" "$@" || return 1
+}
+
+load_ruleset_count() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+		ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop
+		ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop
+	}
+}
+EOF
+}
+
+check_drops() {
+	if dmesg | grep -q ' nft_rpfilter: ';then
+		dmesg | grep ' nft_rpfilter: '
+		echo "FAIL: rpfilter did drop packets"
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+check_fib_counter() {
+	local want=$1
+	local ns=$2
+	local address=$3
+
+	if ! ip netns exec "$ns" nft list table inet filter | grep 'fib saddr . iif' | grep "$address" | grep -q "packets $want";then
+		echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2
+		ip netns exec "$ns" nft list table inet filter
+		return 1
+	fi
+
+	if [ "$want" -gt 0 ]; then
+		echo "PASS: fib expression did drop packets for $address"
+	fi
+
+	return 0
+}
+
+load_ruleset "$nsrouter"
+load_ruleset "$ns1"
+load_ruleset "$ns2"
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+test_ping() {
+  local daddr4=$1
+  local daddr6=$2
+
+  if ! ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2
+	return 1
+  fi
+
+  return 0
+}
+
+test_ping_unreachable() {
+  local daddr4=$1
+  local daddr6=$2
+
+  if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr4" > /dev/null; then
+	echo "FAIL: ${ns1} could reach $daddr4" 1>&2
+	return 1
+  fi
+
+  if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr6" > /dev/null; then
+	echo "FAIL: ${ns1} could reach $daddr6" 1>&2
+	return 1
+  fi
+
+  return 0
+}
+
+test_fib_type() {
+	local notice="$1"
+	local errmsg="addr-on-if"
+	local lret=0
+
+	if ! load_type_ruleset "$nsrouter";then
+		echo "SKIP: Could not load fib type ruleset"
+		[ $ret -eq 0 ] && ret=$ksft_skip
+		return
+	fi
+
+	# makes router receive packet for addresses configured on incoming
+	# interface.
+	test_ping 10.0.1.1 dead:1::1 || return 1
+
+	# expectation: triggers all 'local' in prerouting/input.
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type local" "$errmsg" || lret=1
+
+	reload_type_ruleset "$nsrouter"
+	# makes router receive packet for address configured on a different (but local)
+	# interface.
+	test_ping 10.0.2.1 dead:2::1 || return 1
+
+	# expectation: triggers 'unicast' in prerouting/input for daddr . iif and local for 'daddr'.
+	errmsg="addr-on-host"
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1
+
+	reload_type_ruleset "$nsrouter"
+	test_ping 10.0.2.99 dead:2::99 || return 1
+	errmsg="addr-on-otherhost"
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type unicast" "$errmsg" || lret=1
+	check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1
+
+	if [ $lret -eq 0 ];then
+		echo "PASS: fib expression address types match ($notice)"
+	else
+		echo "FAIL: fib expression address types match ($notice)"
+		ret=1
+	fi
+}
+
+test_fib_vrf_dev_add_dummy()
+{
+	if ! ip -net "$nsrouter" link add dummy0 type dummy ;then
+		echo "SKIP: VRF tests: dummy device type not supported"
+		return 1
+	fi
+
+	if ! ip -net "$nsrouter" link add tvrf type vrf table 9876;then
+		echo "SKIP: VRF tests: vrf device type not supported"
+		return 1
+	fi
+
+	ip -net "$nsrouter" link set dummy0 master tvrf
+	ip -net "$nsrouter" link set dummy0 up
+	ip -net "$nsrouter" link set tvrf up
+}
+
+load_ruleset_vrf()
+{
+# Due to the many different possible combinations using named counters
+# or one-rule-per-expected-result is complex.
+#
+# Instead, add dynamic sets for the fib modes
+# (fib address type, fib output interface lookup .. ),
+# and then add the obtained fib results to them.
+#
+# The test is successful if the sets contain the expected results
+# and no unexpected extra entries existed.
+ip netns exec "$nsrouter" nft -f - <<EOF
+flush ruleset
+table inet t {
+	set fibif4 {
+		typeof meta iif . ip daddr . fib daddr oif
+		flags dynamic
+		counter
+	}
+
+	set fibif4iif {
+		typeof meta iif . ip daddr . fib daddr . iif oif
+		flags dynamic
+		counter
+	}
+
+	set fibif6 {
+		typeof meta iif . ip6 daddr . fib daddr oif
+		flags dynamic
+		counter
+	}
+
+	set fibif6iif {
+		typeof meta iif . ip6 daddr . fib daddr . iif oif
+		flags dynamic
+		counter
+	}
+
+	set fibtype4 {
+		typeof meta iif . ip daddr . fib daddr type
+		flags dynamic
+		counter
+	}
+
+	set fibtype4iif {
+		typeof meta iif . ip daddr . fib daddr . iif type
+		flags dynamic
+		counter
+	}
+
+	set fibtype6 {
+		typeof meta iif . ip6 daddr . fib daddr type
+		flags dynamic
+		counter
+	}
+
+	set fibtype6iif {
+		typeof meta iif . ip6 daddr . fib daddr . iif type
+		flags dynamic
+		counter
+	}
+
+	chain fib_test {
+		meta nfproto ipv4 jump {
+			add @fibif4 { meta iif . ip daddr . fib daddr oif }
+			add @fibif4iif { meta iif . ip daddr . fib daddr . iif oif }
+			add @fibtype4 { meta iif . ip daddr . fib daddr type }
+			add @fibtype4iif { meta iif . ip daddr . fib daddr . iif type }
+
+			add @fibif4 { meta iif . ip saddr . fib saddr oif }
+			add @fibif4iif { meta iif . ip saddr . fib saddr . iif oif }
+		}
+
+		meta nfproto ipv6 jump {
+			add @fibif6    { meta iif . ip6 daddr . fib daddr oif }
+			add @fibif6iif { meta iif . ip6 daddr . fib daddr . iif oif }
+			add @fibtype6    { meta iif . ip6 daddr . fib daddr type }
+			add @fibtype6iif { meta iif . ip6 daddr . fib daddr . iif type }
+
+			add @fibif6 { meta iif . ip6 saddr . fib saddr oif }
+			add @fibif6iif { meta iif . ip6 saddr . fib saddr . iif oif }
+		}
+	}
+
+	chain prerouting {
+		type filter hook prerouting priority 0;
+		icmp type echo-request counter jump fib_test
+
+		# neighbour discovery to be ignored.
+		icmpv6 type echo-request counter jump fib_test
+	}
+}
+EOF
+
+if [ $? -ne 0 ] ;then
+	echo "SKIP: Could not load ruleset for fib vrf test"
+	[ $ret -eq 0 ] && ret=$ksft_skip
+	return 1
+fi
+}
+
+check_type()
+{
+	local setname="$1"
+	local iifname="$2"
+	local addr="$3"
+	local type="$4"
+	local count="$5"
+
+	[ -z "$count" ] && count=1
+
+	if ! ip netns exec "$nsrouter" nft get element inet t "$setname" { "$iifname" . "$addr" . "$type" } |grep -q "counter packets $count";then
+		echo "FAIL: did not find $iifname . $addr . $type in $setname"
+		ip netns exec "$nsrouter" nft list set inet t "$setname"
+		ret=1
+		return 1
+	fi
+
+	# delete the entry, this allows to check if anything unexpected appeared
+	# at the end of the test run: all dynamic sets should be empty by then.
+	if ! ip netns exec "$nsrouter" nft delete element inet t "$setname" { "$iifname" . "$addr" . "$type" } ; then
+		echo "FAIL: can't delete $iifname . $addr . $type in $setname"
+		ip netns exec "$nsrouter" nft list set inet t "$setname"
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+check_local()
+{
+	check_type $@ "local" 1
+}
+
+check_unicast()
+{
+	check_type $@ "unicast" 1
+}
+
+check_rpf()
+{
+	check_type $@
+}
+
+check_fib_vrf_sets_empty()
+{
+	local setname=""
+	local lret=0
+
+	# A non-empty set means that we have seen unexpected packets OR
+	# that a fib lookup provided unexpected results.
+	for setname in "fibif4" "fibif4iif" "fibif6" "fibif6iif" \
+		       "fibtype4" "fibtype4iif" "fibtype6" "fibtype6iif";do
+		if ip netns exec "$nsrouter" nft list set inet t "$setname" | grep -q elements;then
+			echo "FAIL: $setname not empty"
+	                ip netns exec "$nsrouter" nft list set inet t "$setname"
+			ret=1
+			lret=1
+		fi
+	done
+
+	return $lret
+}
+
+check_fib_vrf_type()
+{
+	local msg="$1"
+
+	local addr
+	# the incoming interface is always veth0.  As its not linked to a VRF,
+	# the 'tvrf' device should NOT show up anywhere.
+	local ifname="veth0"
+	local lret=0
+
+	# local_veth0, local_veth1
+	for addr in "10.0.1.1" "10.0.2.1"; do
+		check_local fibtype4  "$ifname" "$addr" || lret=1
+		check_type  fibif4    "$ifname" "$addr" "0" || lret=1
+	done
+	for addr in "dead:1::1" "dead:2::1";do
+		check_local fibtype6  "$ifname" "$addr" || lret=1
+		check_type  fibif6    "$ifname" "$addr" "0" || lret=1
+	done
+
+	# when restricted to the incoming interface, 10.0.1.1 should
+	# be 'local', but 10.0.2.1 unicast.
+	check_local fibtype4iif   "$ifname" "10.0.1.1" || lret=1
+	check_unicast fibtype4iif "$ifname" "10.0.2.1" || lret=1
+
+	# same for the ipv6 addresses.
+	check_local fibtype6iif   "$ifname" "dead:1::1" || lret=1
+	check_unicast fibtype6iif "$ifname" "dead:2::1" || lret=1
+
+	# None of these addresses should find a valid route when restricting
+	# to the incoming interface (we ask for daddr - 10.0.1.1/2.1 are
+	# reachable via 'lo'.
+	for addr in "10.0.1.1" "10.0.2.1" "10.9.9.1" "10.9.9.2";do
+		check_type fibif4iif "$ifname" "$addr" "0" || lret=1
+	done
+
+	# expect default route (veth1), dummy0 is part of VRF but iif isn't.
+	for addr in "10.9.9.1" "10.9.9.2";do
+		check_unicast fibtype4    "$ifname" "$addr" || lret=1
+		check_unicast fibtype4iif "$ifname" "$addr" || lret=1
+		check_type fibif4 "$ifname" "$addr" "veth1" || lret=1
+	done
+	for addr in "dead:9::1" "dead:9::2";do
+		check_unicast fibtype6    "$ifname" "$addr" || lret=1
+		check_unicast fibtype6iif "$ifname" "$addr" || lret=1
+		check_type fibif6 "$ifname" "$addr" "veth1" || lret=1
+	done
+
+	# same for the IPv6 equivalent addresses.
+	for addr in "dead:1::1" "dead:2::1" "dead:9::1" "dead:9::2";do
+		check_type  fibif6iif "$ifname" "$addr" "0" || lret=1
+	done
+
+	check_unicast fibtype4    "$ifname" "10.0.2.99" || lret=1
+	check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1
+	check_unicast fibtype6    "$ifname" "dead:2::99" || lret=1
+	check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1
+
+	check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1
+	check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1
+	check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1
+	check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1
+
+	check_rpf  fibif4    "$ifname" "10.0.1.99" "veth0" 5 || lret=1
+	check_rpf  fibif4iif "$ifname" "10.0.1.99" "veth0" 5 || lret=1
+	check_rpf  fibif6    "$ifname" "dead:1::99" "veth0" 5 || lret=1
+	check_rpf  fibif6iif "$ifname" "dead:1::99" "veth0" 5 || lret=1
+
+	check_fib_vrf_sets_empty || lret=1
+
+	if [ $lret -eq 0 ];then
+		echo "PASS: $msg"
+	else
+		echo "FAIL: $msg"
+		ret=1
+	fi
+}
+
+check_fib_veth_vrf_type()
+{
+	local msg="$1"
+
+	local addr
+	local ifname
+	local setname
+	local lret=0
+
+	# as veth0 is now part of tvrf interface, packets will be seen
+	# twice, once with iif veth0, then with iif tvrf.
+
+	for ifname in "veth0" "tvrf"; do
+		for addr in "10.0.1.1" "10.9.9.1"; do
+			check_local fibtype4  "$ifname" "$addr" || lret=1
+			# addr local, but nft_fib doesn't return routes with RTN_LOCAL.
+			check_type  fibif4    "$ifname" "$addr" 0 || lret=1
+			check_type  fibif4iif "$ifname" "$addr" 0 || lret=1
+		done
+
+		for addr in "dead:1::1" "dead:9::1"; do
+			check_local fibtype6 "$ifname" "$addr" || lret=1
+			# same, address is local but no route is returned for lo.
+			check_type  fibif6    "$ifname" "$addr" 0 || lret=1
+			check_type  fibif6iif "$ifname" "$addr" 0 || lret=1
+		done
+
+		for t in fibtype4 fibtype4iif; do
+			check_unicast "$t" "$ifname" 10.9.9.2 || lret=1
+		done
+		for t in fibtype6 fibtype6iif; do
+			check_unicast "$t" "$ifname" dead:9::2 || lret=1
+		done
+
+		check_unicast fibtype4iif "$ifname" "10.9.9.1" || lret=1
+		check_unicast fibtype6iif "$ifname" "dead:9::1" || lret=1
+
+		check_unicast fibtype4    "$ifname" "10.0.2.99" || lret=1
+		check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1
+
+		check_unicast fibtype6    "$ifname" "dead:2::99" || lret=1
+		check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1
+
+		check_type fibif4    "$ifname"  "10.0.2.99" "veth1" || lret=1
+		check_type fibif6    "$ifname" "dead:2::99" "veth1" || lret=1
+		check_type fibif4    "$ifname"   "10.9.9.2" "dummy0" || lret=1
+		check_type fibif6    "$ifname"  "dead:9::2" "dummy0" || lret=1
+
+		# restricted to iif -- MUST NOT provide result, its != $ifname.
+		check_type fibif4iif "$ifname"  "10.0.2.99" 0 || lret=1
+		check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1
+
+		check_rpf  fibif4 "$ifname" "10.0.1.99" "veth0" 4 || lret=1
+		check_rpf  fibif6 "$ifname" "dead:1::99" "veth0" 4 || lret=1
+		check_rpf  fibif4iif "$ifname" "10.0.1.99" "$ifname" 4 || lret=1
+		check_rpf  fibif6iif "$ifname" "dead:1::99" "$ifname" 4 || lret=1
+	done
+
+	check_local fibtype4iif "veth0" "10.0.1.1" || lret=1
+	check_local fibtype6iif "veth0" "dead:1::1" || lret=1
+
+	check_unicast fibtype4iif "tvrf" "10.0.1.1" || lret=1
+	check_unicast fibtype6iif "tvrf" "dead:1::1" || lret=1
+
+	# 10.9.9.2 should not provide a result for iif veth, but
+	# should when iif is tvrf.
+	# This is because its reachable via dummy0 which is part of
+	# tvrf.  iif veth0 MUST conceal the dummy0 result (i.e. return oif 0).
+	check_type fibif4iif "veth0" "10.9.9.2" 0 || lret=1
+	check_type fibif6iif "veth0"  "dead:9::2" 0 || lret=1
+
+	check_type fibif4iif "tvrf" "10.9.9.2" "tvrf" || lret=1
+	check_type fibif6iif "tvrf" "dead:9::2" "tvrf" || lret=1
+
+	check_fib_vrf_sets_empty || lret=1
+
+	if [ $lret -eq 0 ];then
+		echo "PASS: $msg"
+	else
+		echo "FAIL: $msg"
+		ret=1
+	fi
+}
+
+# Extends nsrouter config by adding dummy0+vrf.
+#
+#  10.0.1.99     10.0.1.1           10.0.2.1         10.0.2.99
+# dead:1::99    dead:1::1          dead:2::1        dead:2::99
+# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2
+#                         [dummy0]
+#                         10.9.9.1
+#                        dead:9::1
+#                          [tvrf]
+test_fib_vrf()
+{
+	local cntname=""
+
+	if ! test_fib_vrf_dev_add_dummy; then
+		[ $ret -eq 0 ] && ret=$ksft_skip
+		return
+	fi
+
+	ip -net "$nsrouter" addr add "10.9.9.1"/24 dev dummy0
+	ip -net "$nsrouter" addr add "dead:9::1"/64 dev dummy0 nodad
+
+	ip -net "$nsrouter" route add default via 10.0.2.99
+	ip -net "$nsrouter" route add default via dead:2::99
+
+	load_ruleset_vrf || return
+
+	# no echo reply for these addresses: The dummy interface is part of tvrf,
+	# but veth0 (incoming interface) isn't linked to it.
+	test_ping_unreachable "10.9.9.1" "dead:9::1" &
+	test_ping_unreachable "10.9.9.2" "dead:9::2" &
+
+	# expect replies from these.
+	test_ping "10.0.1.1" "dead:1::1"
+	test_ping "10.0.2.1" "dead:2::1"
+	test_ping "10.0.2.99" "dead:2::99"
+
+	wait
+
+	check_fib_vrf_type "fib expression address types match (iif not in vrf)"
+
+	# second round: this time, make veth0 (rx interface) part of the vrf.
+	# 10.9.9.1 / dead:9::1 become reachable from ns1, while ns2
+	# becomes unreachable.
+	ip -net "$nsrouter" link set veth0 master tvrf
+	ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+	# this reload should not be needed, but in case
+	# there is some error (missing or unexpected entry) this will prevent them
+	# from leaking into round 2.
+	load_ruleset_vrf || return
+
+	test_ping "10.0.1.1" "dead:1::1"
+	test_ping "10.9.9.1" "dead:9::1"
+
+	# ns2 should no longer be reachable (veth1 not in vrf)
+	test_ping_unreachable "10.0.2.99" "dead:2::99" &
+
+	# vrf via dummy0, but host doesn't exist
+	test_ping_unreachable "10.9.9.2" "dead:9::2" &
+
+	wait
+
+	check_fib_veth_vrf_type "fib expression address types match (iif in vrf)"
+}
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+test_ping 10.0.2.1 dead:2::1 || exit 1
+check_drops
+
+test_ping 10.0.2.99 dead:2::99 || exit 1
+check_drops
+
+[ $ret -eq 0 ] && echo "PASS: fib expression did not cause unwanted packet drops"
+
+load_input_ruleset "$ns1"
+
+test_ping 127.0.0.1 ::1
+check_drops
+
+test_ping 10.0.1.99 dead:1::99
+check_drops
+
+[ $ret -eq 0 ] && echo "PASS: fib expression did not discard loopback packets"
+
+load_input_ruleset "$ns1"
+
+test_ping 127.0.0.1 ::1 || exit 1
+check_drops || exit 1
+
+test_ping 10.0.1.99 dead:1::99 || exit 1
+check_drops || exit 1
+
+echo "PASS: fib expression did not discard loopback packets"
+
+ip netns exec "$nsrouter" nft flush table inet filter
+
+ip -net "$ns1" route del default
+ip -net "$ns1" -6 route del default
+
+ip -net "$ns1" addr del 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr del dead:1::99/64 dev eth0
+
+ip -net "$ns1" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad
+
+ip -net "$ns1" route add default via 10.0.2.1
+ip -net "$ns1" -6 route add default via dead:2::1
+
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad
+
+# switch to ruleset that doesn't log, this time
+# its expected that this does drop the packets.
+load_ruleset_count "$nsrouter"
+
+# ns1 has a default route, but nsrouter does not.
+# must not check return value, ping to 1.1.1.1 will
+# fail.
+check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1
+check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1
+
+ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null
+check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1
+
+ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null
+check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1
+
+# delete all rules
+ip netns exec "$ns1" nft flush ruleset
+ip netns exec "$ns2" nft flush ruleset
+ip netns exec "$nsrouter" nft flush ruleset
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+
+ip -net "$ns1" addr del 10.0.2.99/24 dev eth0
+ip -net "$ns1" addr del dead:2::99/64 dev eth0
+
+ip -net "$nsrouter" addr del dead:2::1/64 dev veth0
+
+# ... pbr ruleset for the router, check iif+oif.
+if ! load_pbr_ruleset "$nsrouter";then
+	echo "SKIP: Could not load fib forward ruleset"
+	[ "$ret" -eq 0 ] && ret=$ksft_skip
+fi
+
+ip -net "$nsrouter" rule add from all table 128
+ip -net "$nsrouter" rule add from all iif veth0 table 129
+ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0
+ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1
+
+# drop main ipv4 table
+ip -net "$nsrouter" -4 rule delete table main
+
+if test_ping 10.0.2.99 dead:2::99;then
+	echo "PASS: fib expression forward check with policy based routing"
+else
+	echo "FAIL: fib expression forward check with policy based routing"
+	ret=1
+fi
+
+test_fib_type "policy routing"
+ip netns exec "$nsrouter" nft delete table ip filter
+ip netns exec "$nsrouter" nft delete table ip6 filter
+
+# Un-do policy routing changes
+ip -net "$nsrouter" rule del from all table 128
+ip -net "$nsrouter" rule del from all iif veth0 table 129
+
+ip -net "$nsrouter" route del table 128 to 10.0.1.0/24 dev veth0
+ip -net "$nsrouter" route del table 129 to 10.0.2.0/24 dev veth1
+
+ip -net "$ns1" -4 route del default
+ip -net "$ns1" -6 route del default
+
+ip -net "$ns1" -4 route add default via 10.0.1.1
+ip -net "$ns1" -6 route add default via dead:1::1
+
+ip -net "$nsrouter" -4 rule add from all table main priority 32766
+
+test_fib_type "default table"
+ip netns exec "$nsrouter" nft delete table ip filter
+ip netns exec "$nsrouter" nft delete table ip6 filter
+
+test_fib_vrf
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
new file mode 100755
index 000000000000..a4ee5496f2a1
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -0,0 +1,674 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This tests basic flowtable functionality.
+# Creates following default topology:
+#
+# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
+# Router1 is the one doing flow offloading, Router2 has no special
+# purpose other than having a link that is smaller than either Originator
+# and responder, i.e. TCPMSS announced values are too large and will still
+# result in fragmentation and/or PMTU discovery.
+#
+# You can check with different Orgininator/Link/Responder MTU eg:
+# nft_flowtable.sh -o8000 -l1500 -r2000
+#
+
+source lib.sh
+
+ret=0
+SOCAT_TIMEOUT=60
+
+nsin=""
+ns1out=""
+ns2out=""
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+checktool "nft --version" "run test without nft tool"
+checktool "socat -h" "run test without socat"
+
+setup_ns ns1 ns2 nsr1 nsr2
+
+cleanup() {
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+
+	rm -f "$nsin" "$ns1out" "$ns2out"
+
+	[ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
+}
+
+trap cleanup EXIT
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+
+ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1"
+ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2"
+
+ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2"
+
+for dev in veth0 veth1; do
+    ip -net "$nsr1" link set "$dev" up
+    ip -net "$nsr2" link set "$dev" up
+done
+
+ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad
+
+# set different MTUs so we need to push packets coming from ns1 (large MTU)
+# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
+# or to do PTMU discovery (send ICMP error back to originator).
+# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
+# is NOT the lowest link mtu.
+
+omtu=9000
+lmtu=1500
+rmtu=2000
+
+filesize=$((2 * 1024 * 1024))
+
+usage(){
+	echo "nft_flowtable.sh [OPTIONS]"
+	echo
+	echo "MTU options"
+	echo "   -o originator"
+	echo "   -l link"
+	echo "   -r responder"
+	exit 1
+}
+
+while getopts "o:l:r:s:" o
+do
+	case $o in
+		o) omtu=$OPTARG;;
+		l) lmtu=$OPTARG;;
+		r) rmtu=$OPTARG;;
+		s) filesize=$OPTARG;;
+		*) usage;;
+	esac
+done
+
+if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then
+	exit 1
+fi
+
+ip -net "$ns1" link set eth0 mtu "$omtu"
+
+if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then
+	exit 1
+fi
+
+if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then
+	exit 1
+fi
+
+if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then
+	exit 1
+fi
+
+ip -net "$ns2" link set eth0 mtu "$rmtu"
+
+# transfer-net between nsr1 and nsr2.
+# these addresses are not used for connections.
+ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1
+ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad
+
+ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0
+ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad
+
+for i in 0 1; do
+  ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
+  ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
+done
+
+for ns in "$ns1" "$ns2";do
+  ip -net "$ns" link set eth0 up
+
+  if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
+	echo "ERROR: Check Originator/Responder values (problem during address addition)"
+	exit 1
+  fi
+  # don't set ip DF bit for first two tests
+  ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
+done
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via dead:1::1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$nsr1" route add default via 192.168.10.2
+ip -net "$nsr2" route add default via 192.168.10.1
+
+ip netns exec "$nsr1" nft -f - <<EOF
+table inet filter {
+  flowtable f1 {
+     hook ingress priority 0
+     devices = { veth0, veth1 }
+   }
+
+   counter routed_orig { }
+   counter routed_repl { }
+
+   chain forward {
+      type filter hook forward priority 0; policy drop;
+
+      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
+      meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept
+
+      # count packets supposedly offloaded as per direction.
+      ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept
+
+      ct state established,related accept
+
+      meta nfproto ipv4 meta l4proto icmp accept
+      meta nfproto ipv6 meta l4proto icmpv6 accept
+   }
+}
+EOF
+
+if [ $? -ne 0 ]; then
+	echo "SKIP: Could not load nft ruleset"
+	exit $ksft_skip
+fi
+
+ip netns exec "$ns2" nft -f - <<EOF
+table inet filter {
+   counter ip4dscp0 { }
+   counter ip4dscp3 { }
+
+   chain input {
+      type filter hook input priority 0; policy accept;
+      meta l4proto tcp goto {
+	      ip dscp cs3 counter name ip4dscp3 accept
+	      ip dscp 0 counter name ip4dscp0 accept
+      }
+   }
+}
+EOF
+
+if [ $? -ne 0 ]; then
+	echo -n "SKIP: Could not load ruleset: "
+	nft --version
+	exit $ksft_skip
+fi
+
+# test basic connectivity
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+  echo "ERROR: $ns1 cannot reach ns2" 1>&2
+  exit 1
+fi
+
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
+  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
+  exit 1
+fi
+
+nsin=$(mktemp)
+ns1out=$(mktemp)
+ns2out=$(mktemp)
+
+make_file()
+{
+	name="$1"
+	sz="$2"
+
+	head -c "$sz" < /dev/urandom > "$name"
+}
+
+check_counters()
+{
+	local what=$1
+	local ok=1
+
+	local orig repl
+	orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets)
+	repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets)
+
+	local orig_cnt=${orig#*bytes}
+	local repl_cnt=${repl#*bytes}
+
+	local fs
+	fs=$(du -sb "$nsin")
+	local max_orig=${fs%%/*}
+	local max_repl=$((max_orig))
+
+	# flowtable fastpath should bypass normal routing one, i.e. the counters in forward hook
+	# should always be lower than the size of the transmitted file (max_orig).
+	if [ "$orig_cnt" -gt "$max_orig" ];then
+		echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig, reply counter $repl_cnt" 1>&2
+		ret=1
+		ok=0
+	fi
+
+	if [ "$repl_cnt" -gt $max_repl ];then
+		echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl, original counter $orig_cnt" 1>&2
+		ret=1
+		ok=0
+	fi
+
+	if [ $ok -eq 1 ]; then
+		echo "PASS: $what"
+	fi
+}
+
+check_dscp()
+{
+	local what=$1
+	local ok=1
+
+	local counter
+	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets)
+
+	local pc4=${counter%*bytes*}
+	local pc4=${pc4#*packets}
+
+	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets)
+	local pc4z=${counter%*bytes*}
+	local pc4z=${pc4z#*packets}
+
+	case "$what" in
+	"dscp_none")
+		if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then
+			echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
+			ret=1
+			ok=0
+		fi
+		;;
+	"dscp_fwd")
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then
+			echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
+			ret=1
+			ok=0
+		fi
+		;;
+	"dscp_ingress")
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
+			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
+			ret=1
+			ok=0
+		fi
+		;;
+	"dscp_egress")
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
+			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
+			ret=1
+			ok=0
+		fi
+		;;
+	*)
+		echo "FAIL: Unknown DSCP check" 1>&2
+		ret=1
+		ok=0
+	esac
+
+	if [ "$ok" -eq 1 ] ;then
+		echo "PASS: $what: dscp packet counters match"
+	fi
+}
+
+check_transfer()
+{
+	in=$1
+	out=$2
+	what=$3
+
+	if ! cmp "$in" "$out" > /dev/null 2>&1; then
+		echo "FAIL: file mismatch for $what" 1>&2
+		ls -l "$in"
+		ls -l "$out"
+		return 1
+	fi
+
+	return 0
+}
+
+listener_ready()
+{
+	ss -N "$nsb" -lnt -o "sport = :12345" | grep -q 12345
+}
+
+test_tcp_forwarding_ip()
+{
+	local nsa=$1
+	local nsb=$2
+	local dstip=$3
+	local dstport=$4
+	local lret=0
+
+	timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" &
+	lpid=$!
+
+	busywait 1000 listener_ready
+
+	timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out"
+
+	wait $lpid
+
+	if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
+		lret=1
+		ret=1
+	fi
+
+	if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
+		lret=1
+		ret=1
+	fi
+
+	return $lret
+}
+
+test_tcp_forwarding()
+{
+	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
+
+	return $?
+}
+
+test_tcp_forwarding_set_dscp()
+{
+	check_dscp "dscp_none"
+
+ip netns exec "$nsr1" nft -f - <<EOF
+table netdev dscpmangle {
+   chain setdscp0 {
+      type filter hook ingress device "veth0" priority 0; policy accept
+	ip dscp set cs3
+  }
+}
+EOF
+if [ $? -eq 0 ]; then
+	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
+	check_dscp "dscp_ingress"
+
+	ip netns exec "$nsr1" nft delete table netdev dscpmangle
+else
+	echo "SKIP: Could not load netdev:ingress for veth0"
+fi
+
+ip netns exec "$nsr1" nft -f - <<EOF
+table netdev dscpmangle {
+   chain setdscp0 {
+      type filter hook egress device "veth1" priority 0; policy accept
+      ip dscp set cs3
+  }
+}
+EOF
+if [ $? -eq 0 ]; then
+	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
+	check_dscp "dscp_egress"
+
+	ip netns exec "$nsr1" nft flush table netdev dscpmangle
+else
+	echo "SKIP: Could not load netdev:egress for veth1"
+fi
+
+	# partial.  If flowtable really works, then both dscp-is-0 and dscp-is-cs3
+	# counters should have seen packets (before and after ft offload kicks in).
+	ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3
+	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
+	check_dscp "dscp_fwd"
+}
+
+test_tcp_forwarding_nat()
+{
+	local lret
+	local pmtu
+
+	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
+	lret=$?
+
+	pmtu=$3
+	what=$4
+
+	if [ "$lret" -eq 0 ] ; then
+		if [ "$pmtu" -eq 1 ] ;then
+			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
+		else
+			echo "PASS: flow offload for ns1/ns2 with masquerade $what"
+		fi
+
+		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
+		lret=$?
+		if [ "$pmtu" -eq 1 ] ;then
+			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
+		elif [ "$lret" -eq 0 ] ; then
+			echo "PASS: flow offload for ns1/ns2 with dnat $what"
+		fi
+	fi
+
+	return $lret
+}
+
+make_file "$nsin" "$filesize"
+
+# First test:
+# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
+# Due to MTU mismatch in both directions, all packets (except small packets like pure
+# acks) have to be handled by normal forwarding path.  Therefore, packet counters
+# are not checked.
+if test_tcp_forwarding "$ns1" "$ns2"; then
+	echo "PASS: flow offloaded for ns1/ns2"
+else
+	echo "FAIL: flow offload for ns1/ns2:" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
+# delete default route, i.e. ns2 won't be able to reach ns1 and
+# will depend on ns1 being masqueraded in nsr1.
+# expect ns1 has nsr1 address.
+ip -net "$ns2" route del default via 10.0.2.1
+ip -net "$ns2" route del default via dead:2::1
+ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1
+
+# Second test:
+# Same, but with NAT enabled.  Same as in first test: we expect normal forward path
+# to handle most packets.
+ip netns exec "$nsr1" nft -f - <<EOF
+table ip nat {
+   chain prerouting {
+      type nat hook prerouting priority 0; policy accept;
+      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
+   }
+
+   chain postrouting {
+      type nat hook postrouting priority 0; policy accept;
+      meta oifname "veth1" counter masquerade
+   }
+}
+EOF
+
+if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then
+	echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
+	exit 0
+fi
+
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then
+	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
+# Third test:
+# Same as second test, but with PMTU discovery enabled. This
+# means that we expect the fastpath to handle packets as soon
+# as the endpoints adjust the packet size.
+ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+
+# reset counters.
+# With pmtu in-place we'll also check that nft counters
+# are lower than file size and packets were forwarded via flowtable layer.
+# For earlier tests (large mtus), packets cannot be handled via flowtable
+# (except pure acks and other small packets).
+ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null
+
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then
+	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+fi
+
+# Another test:
+# Add bridge interface br0 to Router1, with NAT enabled.
+test_bridge() {
+if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then
+	echo "SKIP: could not add bridge br0"
+	[ "$ret" -eq 0 ] && ret=$ksft_skip
+	return
+fi
+ip -net "$nsr1" addr flush dev veth0
+ip -net "$nsr1" link set up dev veth0
+ip -net "$nsr1" link set veth0 master br0
+ip -net "$nsr1" addr add 10.0.1.1/24 dev br0
+ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad
+ip -net "$nsr1" link set up dev br0
+
+ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null
+
+# br0 with NAT enabled.
+ip netns exec "$nsr1" nft -f - <<EOF
+flush table ip nat
+table ip nat {
+   chain prerouting {
+      type nat hook prerouting priority 0; policy accept;
+      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
+   }
+
+   chain postrouting {
+      type nat hook postrouting priority 0; policy accept;
+      meta oifname "veth1" counter masquerade
+   }
+}
+EOF
+
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then
+	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
+
+# Another test:
+# Add bridge interface br0 to Router1, with NAT and VLAN.
+ip -net "$nsr1" link set veth0 nomaster
+ip -net "$nsr1" link set down dev veth0
+ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10
+ip -net "$nsr1" link set up dev veth0
+ip -net "$nsr1" link set up dev veth0.10
+ip -net "$nsr1" link set veth0.10 master br0
+
+ip -net "$ns1" addr flush dev eth0
+ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10
+ip -net "$ns1" link set eth0 up
+ip -net "$ns1" link set eth0.10 up
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad
+
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then
+	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
+# restore test topology (remove bridge and VLAN)
+ip -net "$nsr1" link set veth0 nomaster
+ip -net "$nsr1" link set veth0 down
+ip -net "$nsr1" link set veth0.10 down
+ip -net "$nsr1" link delete veth0.10 type vlan
+ip -net "$nsr1" link delete br0 type bridge
+ip -net "$ns1" addr flush dev eth0.10
+ip -net "$ns1" link set eth0.10 down
+ip -net "$ns1" link set eth0 down
+ip -net "$ns1" link delete eth0.10 type vlan
+
+# restore address in ns1 and nsr1
+ip -net "$ns1" link set eth0 up
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via dead:1::1
+ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad
+ip -net "$nsr1" link set up dev veth0
+}
+
+test_bridge
+
+KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)
+KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1)
+SPI1=$RANDOM
+SPI2=$RANDOM
+
+if [ $SPI1 -eq $SPI2 ]; then
+	SPI2=$((SPI2+1))
+fi
+
+do_esp() {
+    local ns=$1
+    local me=$2
+    local remote=$3
+    local lnet=$4
+    local rnet=$5
+    local spi_out=$6
+    local spi_in=$7
+
+    ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in"  enc aes "$KEY_AES"  auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet"
+    ip -net "$ns" xfrm state add src "$me"  dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet"
+
+    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+    ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow
+    # to fwd decrypted packets after esp processing:
+    ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow
+}
+
+do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2"
+
+do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1"
+
+ip netns exec "$nsr1" nft delete table ip nat
+
+# restore default routes
+ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+if test_tcp_forwarding "$ns1" "$ns2"; then
+	check_counters "ipsec tunnel mode for ns1/ns2"
+else
+	echo "FAIL: ipsec tunnel mode for ns1/ns2"
+	ip netns exec "$nsr1" nft list ruleset 1>&2
+	ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2
+fi
+
+if [ "$1" = "" ]; then
+	low=1280
+	mtu=$((65536 - low))
+	o=$(((RANDOM%mtu) + low))
+	l=$(((RANDOM%mtu) + low))
+	r=$(((RANDOM%mtu) + low))
+
+	MINSIZE=$((2 *  1000 * 1000))
+	MAXSIZE=$((64 * 1000 * 1000))
+
+	filesize=$(((RANDOM * RANDOM) % MAXSIZE))
+	if [ "$filesize" -lt "$MINSIZE" ]; then
+		filesize=$((filesize+MINSIZE))
+	fi
+
+	echo "re-run with random mtus and file size: -o $o -l $l -r $r -s $filesize"
+	$0 -o "$o" -l "$l" -r "$r" -s "$filesize"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_interface_stress.sh b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh
new file mode 100755
index 000000000000..5ff7be9daeee
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_interface_stress.sh
@@ -0,0 +1,154 @@
+#!/bin/bash -e
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+# Torture nftables' netdevice notifier callbacks and related code by frequent
+# renaming of interfaces which netdev-family chains and flowtables hook into.
+
+source lib.sh
+
+checktool "nft --version" "run test without nft tool"
+checktool "iperf3 --version" "run test without iperf3 tool"
+
+# how many seconds to torture the kernel?
+# default to 80% of max run time but don't exceed 48s
+TEST_RUNTIME=$((${kselftest_timeout:-60} * 8 / 10))
+[[ $TEST_RUNTIME -gt 48 ]] && TEST_RUNTIME=48
+
+trap "cleanup_all_ns" EXIT
+
+setup_ns nsc nsr nss
+
+ip -net $nsc link add cr0 type veth peer name rc0 netns $nsr
+ip -net $nsc addr add 10.0.0.1/24 dev cr0
+ip -net $nsc link set cr0 up
+ip -net $nsc route add default via 10.0.0.2
+
+ip -net $nss link add sr0 type veth peer name rs0 netns $nsr
+ip -net $nss addr add 10.1.0.1/24 dev sr0
+ip -net $nss link set sr0 up
+ip -net $nss route add default via 10.1.0.2
+
+ip -net $nsr addr add 10.0.0.2/24 dev rc0
+ip -net $nsr link set rc0 up
+ip -net $nsr addr add 10.1.0.2/24 dev rs0
+ip -net $nsr link set rs0 up
+ip netns exec $nsr sysctl -q net.ipv4.ip_forward=1
+ip netns exec $nsr sysctl -q net.ipv4.conf.all.forwarding=1
+
+{
+	echo "table netdev t {"
+	for ((i = 0; i < 10; i++)); do
+		cat <<-EOF
+		chain chain_rc$i {
+			type filter hook ingress device rc$i priority 0
+			counter
+		}
+		chain chain_rs$i {
+			type filter hook ingress device rs$i priority 0
+			counter
+		}
+		EOF
+	done
+	echo "}"
+	echo "table ip t {"
+	for ((i = 0; i < 10; i++)); do
+		cat <<-EOF
+		flowtable ft_${i} {
+			hook ingress priority 0
+			devices = { rc$i, rs$i }
+		}
+		EOF
+	done
+	echo "chain c {"
+	echo "type filter hook forward priority 0"
+	for ((i = 0; i < 10; i++)); do
+		echo -n "iifname rc$i oifname rs$i "
+		echo    "ip protocol tcp counter flow add @ft_${i}"
+	done
+	echo "counter"
+	echo "}"
+	echo "}"
+} | ip netns exec $nsr nft -f - || {
+	echo "SKIP: Could not load nft ruleset"
+	exit $ksft_skip
+}
+
+for ((o=0, n=1; ; o=n, n++, n %= 10)); do
+	ip -net $nsr link set rc$o name rc$n
+	ip -net $nsr link set rs$o name rs$n
+done &
+rename_loop_pid=$!
+
+while true; do ip netns exec $nsr nft list ruleset >/dev/null 2>&1; done &
+nft_list_pid=$!
+
+ip netns exec $nsr nft monitor >/dev/null &
+nft_monitor_pid=$!
+
+ip netns exec $nss iperf3 --server --daemon -1
+summary_expr='s,^\[SUM\] .* \([0-9\.]\+\) Kbits/sec .* receiver,\1,p'
+rate=$(ip netns exec $nsc iperf3 \
+	--format k -c 10.1.0.1 --time $TEST_RUNTIME \
+	--length 56 --parallel 10 -i 0 | sed -n "$summary_expr")
+
+kill $nft_list_pid
+kill $nft_monitor_pid
+kill $rename_loop_pid
+wait
+
+wildcard_prep() {
+	ip netns exec $nsr nft -f - <<EOF
+table ip t {
+	flowtable ft_wild {
+		hook ingress priority 0
+		devices = { wild* }
+	}
+}
+EOF
+}
+
+if ! wildcard_prep; then
+	echo "SKIP wildcard tests: not supported by host's nft?"
+else
+	for ((i = 0; i < 100; i++)); do
+		ip -net $nsr link add wild$i type dummy &
+	done
+	wait
+	for ((i = 80; i < 100; i++)); do
+		ip -net $nsr link del wild$i &
+	done
+	for ((i = 0; i < 80; i++)); do
+		ip -net $nsr link del wild$i &
+	done
+	wait
+	for ((i = 0; i < 100; i += 10)); do
+		(
+		for ((j = 0; j < 10; j++)); do
+			ip -net $nsr link add wild$((i + j)) type dummy
+		done
+		for ((j = 0; j < 10; j++)); do
+			ip -net $nsr link del wild$((i + j))
+		done
+		) &
+	done
+	wait
+fi
+
+[[ $(</proc/sys/kernel/tainted) -eq 0 ]] || {
+	echo "FAIL: Kernel is tainted!"
+	exit $ksft_fail
+}
+
+[[ $rate -gt 0 ]] || {
+	echo "FAIL: Zero throughput in iperf3"
+	exit $ksft_fail
+}
+
+[[ -f /sys/kernel/debug/kmemleak && \
+   -n $(</sys/kernel/debug/kmemleak) ]] && {
+	echo "FAIL: non-empty kmemleak report"
+	exit $ksft_fail
+}
+
+exit $ksft_pass
diff --git a/tools/testing/selftests/net/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh
new file mode 100755
index 000000000000..71505b6cb252
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_meta.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+# check iif/iifname/oifgroup/iiftype match.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+sfx=$(mktemp -u "XXXXXXXX")
+ns0="ns0-$sfx"
+
+if ! nft --version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+cleanup()
+{
+	ip netns del "$ns0"
+}
+
+ip netns add "$ns0"
+ip -net "$ns0" link set lo up
+ip -net "$ns0" addr add 127.0.0.1 dev lo
+
+trap cleanup EXIT
+
+currentyear=$(date +%Y)
+lastyear=$((currentyear-1))
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table inet filter {
+	counter iifcount {}
+	counter iifnamecount {}
+	counter iifgroupcount {}
+	counter iiftypecount {}
+	counter infproto4count {}
+	counter il4protocounter {}
+	counter imarkcounter {}
+	counter icpu0counter {}
+	counter ilastyearcounter {}
+	counter icurrentyearcounter {}
+
+	counter oifcount {}
+	counter oifnamecount {}
+	counter oifgroupcount {}
+	counter oiftypecount {}
+	counter onfproto4count {}
+	counter ol4protocounter {}
+	counter oskuidcounter {}
+	counter oskgidcounter {}
+	counter omarkcounter {}
+
+	chain input {
+		type filter hook input priority 0; policy accept;
+
+		meta iif lo counter name "iifcount"
+		meta iifname "lo" counter name "iifnamecount"
+		meta iifgroup "default" counter name "iifgroupcount"
+		meta iiftype "loopback" counter name "iiftypecount"
+		meta nfproto ipv4 counter name "infproto4count"
+		meta l4proto icmp counter name "il4protocounter"
+		meta mark 42 counter name "imarkcounter"
+		meta cpu 0 counter name "icpu0counter"
+		meta time "$lastyear-01-01" - "$lastyear-12-31" counter name ilastyearcounter
+		meta time "$currentyear-01-01" - "$currentyear-12-31" counter name icurrentyearcounter
+	}
+
+	chain output {
+		type filter hook output priority 0; policy accept;
+		meta oif lo counter name "oifcount" counter
+		meta oifname "lo" counter name "oifnamecount"
+		meta oifgroup "default" counter name "oifgroupcount"
+		meta oiftype "loopback" counter name "oiftypecount"
+		meta nfproto ipv4 counter name "onfproto4count"
+		meta l4proto icmp counter name "ol4protocounter"
+		meta skuid 0 counter name "oskuidcounter"
+		meta skgid 0 counter name "oskgidcounter"
+		meta mark 42 counter name "omarkcounter"
+	}
+}
+EOF
+
+if [ $? -ne 0 ]; then
+	echo "SKIP: Could not add test ruleset"
+	exit $ksft_skip
+fi
+
+ret=0
+
+check_one_counter()
+{
+	local cname="$1"
+	local want="packets $2"
+	local verbose="$3"
+
+	if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -q "$want"; then
+		echo "FAIL: $cname, want \"$want\", got"
+		ret=1
+		ip netns exec "$ns0" nft list counter inet filter "$cname"
+	fi
+}
+
+check_lo_counters()
+{
+	local want="$1"
+	local verbose="$2"
+	local counter
+
+	for counter in iifcount iifnamecount iifgroupcount iiftypecount infproto4count \
+		       oifcount oifnamecount oifgroupcount oiftypecount onfproto4count \
+		       il4protocounter icurrentyearcounter ol4protocounter \
+	     ; do
+		check_one_counter "$counter" "$want" "$verbose"
+	done
+}
+
+check_lo_counters "0" false
+ip netns exec "$ns0" ping -q -c 1 127.0.0.1 -m 42 > /dev/null
+
+check_lo_counters "2" true
+
+check_one_counter oskuidcounter "1" true
+check_one_counter oskgidcounter "1" true
+check_one_counter imarkcounter "1" true
+check_one_counter omarkcounter "1" true
+check_one_counter ilastyearcounter "0" true
+
+if [ $ret -eq 0 ];then
+	echo "OK: nftables meta iif/oif counters at expected values"
+else
+	exit $ret
+fi
+
+#First CPU execution and counter
+taskset -p 01 $$ > /dev/null
+ip netns exec "$ns0" nft reset counters > /dev/null
+ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null
+check_one_counter icpu0counter "2" true
+
+if [ $ret -eq 0 ];then
+	echo "OK: nftables meta cpu counter at expected values"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh
new file mode 100755
index 000000000000..9e39de26455f
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_nat.sh
@@ -0,0 +1,1156 @@
+#!/bin/bash
+#
+# This test is for basic NAT functionality: snat, dnat, redirect, masquerade.
+#
+
+source lib.sh
+
+ret=0
+test_inet_nat=true
+
+checktool "nft --version" "run test without nft tool"
+checktool "socat -h" "run test without socat"
+
+cleanup()
+{
+	ip netns pids "$ns0" | xargs kill 2>/dev/null
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	rm -f "$INFILE" "$OUTFILE"
+
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+INFILE=$(mktemp)
+OUTFILE=$(mktemp)
+
+setup_ns ns0 ns1 ns2
+
+if ! ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2"
+
+ip -net "$ns0" link set veth0 up
+ip -net "$ns0" addr add 10.0.1.1/24 dev veth0
+ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$ns0" link set veth1 up
+ip -net "$ns0" addr add 10.0.2.1/24 dev veth1
+ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad
+
+do_config()
+{
+	ns="$1"
+	subnet="$2"
+
+	ip -net "$ns" link set eth0 up
+	ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0
+	ip -net "$ns" route add default via "10.0.$subnet.1"
+	ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad
+	ip -net "$ns" route add default via "dead:$subnet::1"
+}
+
+do_config "$ns1" 1
+do_config "$ns2" 2
+
+bad_counter()
+{
+	local ns=$1
+	local counter=$2
+	local expect=$3
+	local tag=$4
+
+	echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2
+	ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2
+}
+
+check_counters()
+{
+	ns=$1
+	local lret=0
+
+	if ! ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then
+		bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then
+		bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 104"
+	if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then
+		bad_counter "$ns" ns0in6 "$expect" "check_counters 3"
+		lret=1
+	fi
+	if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then
+		bad_counter "$ns" ns0out6 "$expect" "check_counters 4"
+		lret=1
+	fi
+
+	return $lret
+}
+
+check_ns0_counters()
+{
+	local ns=$1
+	local lret=0
+
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then
+		bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then
+		bad_counter "$ns0" ns0in6 "packets 0 bytes 0"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then
+		bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2"
+		lret=1
+	fi
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then
+		bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 "
+		lret=1
+	fi
+
+	for dir in "in" "out" ; do
+		expect="packets 1 bytes 84"
+		if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4"
+			lret=1
+		fi
+
+		expect="packets 1 bytes 104"
+		if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then
+			bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5"
+			lret=1
+		fi
+	done
+
+	return $lret
+}
+
+reset_counters()
+{
+	for i in "$ns0" "$ns1" "$ns2" ;do
+		ip netns exec "$i" nft reset counters inet > /dev/null
+	done
+}
+
+test_local_dnat6()
+{
+	local family=$1
+	local lret=0
+	local IPF=""
+
+	if [ "$family" = "inet" ];then
+		IPF="ip6"
+	fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		ip6 daddr dead:1::99 dnat $IPF to dead:2::99
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family dnat hook"
+		return $ksft_skip
+	fi
+
+	# ping netns1, expect rewrite to netns2
+	if ! ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null;then
+		lret=1
+		echo "ERROR: ping6 failed"
+		return $lret
+	fi
+
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1"
+			lret=1
+		fi
+	done
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2"
+			lret=1
+		fi
+	done
+
+	# expect 0 count in ns1
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3"
+			lret=1
+		fi
+	done
+
+	# expect 1 packet in ns2
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2"
+	ip netns exec "$ns0" nft flush chain ip6 nat output
+
+	return $lret
+}
+
+test_local_dnat()
+{
+	local family=$1
+	local lret=0
+	local IPF=""
+
+	if [ "$family" = "inet" ];then
+		IPF="ip"
+	fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF 2>/dev/null
+table $family nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		if [ "$family" = "inet" ];then
+			echo "SKIP: inet nat tests"
+			test_inet_nat=false
+			return $ksft_skip
+		fi
+		echo "SKIP: Could not add add $family dnat hook"
+		return $ksft_skip
+	fi
+
+	# ping netns1, expect rewrite to netns2
+	if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then
+		lret=1
+		echo "ERROR: ping failed"
+		return $lret
+	fi
+
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1"
+			lret=1
+		fi
+	done
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2"
+			lret=1
+		fi
+	done
+
+	# expect 0 count in ns1
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3"
+			lret=1
+		fi
+	done
+
+	# expect 1 packet in ns2
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then
+			bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2"
+
+	ip netns exec "$ns0" nft flush chain "$family" nat output
+
+	reset_counters
+	if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then
+		lret=1
+		echo "ERROR: ping failed"
+		return $lret
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5"
+			lret=1
+		fi
+	done
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6"
+			lret=1
+		fi
+	done
+
+	# expect 1 count in ns1
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7"
+			lret=1
+		fi
+	done
+
+	# expect 0 packet in ns2
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush"
+
+	return $lret
+}
+
+listener_ready()
+{
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_local_dnat_portonly()
+{
+	local family=$1
+	local daddr=$2
+	local lret=0
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		meta l4proto tcp dnat to :2000
+
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		if [ "$family" = "inet" ];then
+			echo "SKIP: inet port test"
+			test_inet_nat=false
+			return
+		fi
+		echo "SKIP: Could not add $family dnat hook"
+		return
+	fi
+
+	echo "SERVER-$family" | ip netns exec "$ns1" timeout 3 socat -u STDIN TCP-LISTEN:2000 &
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 2000 "-t"
+
+	result=$(ip netns exec "$ns0" timeout 1 socat -u TCP:"$daddr":2000 STDOUT)
+
+	if [ "$result" = "SERVER-inet" ];then
+		echo "PASS: inet port rewrite without l3 address"
+	else
+		echo "ERROR: inet port rewrite without l3 address, got $result"
+		ret=1
+	fi
+}
+
+test_masquerade6()
+{
+	local family=$1
+	local natflags=$2
+	local lret=0
+
+	ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 via ipv6"
+		return 1
+	fi
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add masquerading rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		meta oif veth0 masquerade $natflags
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family masquerade hook"
+		return $ksft_skip
+	fi
+
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
+		lret=1
+	fi
+
+	# ns1 should have seen packets from ns0, due to masquerade
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4"
+			lret=1
+		fi
+	done
+
+	# ns1 should not have seen packets from ns2, due to masquerade
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6"
+			lret=1
+		fi
+	done
+
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then
+		echo "ERROR: Could not flush $family nat postrouting" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2"
+
+	return $lret
+}
+
+test_masquerade()
+{
+	local family=$1
+	local natflags=$2
+	local lret=0
+
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 $natflags"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add masquerading rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		meta oif veth0 masquerade $natflags
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family masquerade hook"
+		return $ksft_skip
+	fi
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
+		lret=1
+	fi
+
+	# ns1 should have seen packets from ns0, due to masquerade
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4"
+			lret=1
+		fi
+	done
+
+	# ns1 should not have seen packets from ns2, due to masquerade
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6"
+			lret=1
+		fi
+	done
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then
+		echo "ERROR: Could not flush $family nat postrouting" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2"
+
+	return $lret
+}
+
+test_redirect6()
+{
+	local family=$1
+	local lret=0
+
+	ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
+		echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add redirect rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		meta iif veth1 meta l4proto icmpv6 ip6 saddr dead:2::99 ip6 daddr dead:1::99 redirect
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family redirect hook"
+		return $ksft_skip
+	fi
+
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect"
+		lret=1
+	fi
+
+	# ns1 should have seen no packets from ns2, due to redirection
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3"
+			lret=1
+		fi
+	done
+
+	# ns0 should have seen packets from ns2, due to masquerade
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4"
+			lret=1
+		fi
+	done
+
+	if ! ip netns exec "$ns0" nft delete table "$family" nat;then
+		echo "ERROR: Could not delete $family nat table" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2"
+
+	return $lret
+}
+
+test_redirect()
+{
+	local family=$1
+	local lret=0
+
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then
+			bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add redirect rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		meta iif veth1 ip protocol icmp ip saddr 10.0.2.99 ip daddr 10.0.1.99 redirect
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family redirect hook"
+		return $ksft_skip
+	fi
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect"
+		lret=1
+	fi
+
+	# ns1 should have seen no packets from ns2, due to redirection
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3"
+			lret=1
+		fi
+	done
+
+	# ns0 should have seen packets from ns2, due to masquerade
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4"
+			lret=1
+		fi
+	done
+
+	if ! ip netns exec "$ns0" nft delete table "$family" nat;then
+		echo "ERROR: Could not delete $family nat table" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2"
+
+	return $lret
+}
+
+# test port shadowing.
+# create two listening services, one on router (ns0), one
+# on client (ns2), which is masqueraded from ns1 point of view.
+# ns2 sends udp packet coming from service port to ns1, on a highport.
+# Later, if n1 uses same highport to connect to ns0:service, packet
+# might be port-forwarded to ns2 instead.
+
+# second argument tells if we expect the 'fake-entry' to take effect
+# (CLIENT) or not (ROUTER).
+test_port_shadow()
+{
+	local test=$1
+	local expect=$2
+	local daddrc="10.0.1.99"
+	local daddrs="10.0.1.1"
+	local result=""
+	local logmsg=""
+
+	# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
+	echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405
+
+	echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null &
+	local sc_r=$!
+	echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null &
+	local sc_c=$!
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u"
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u"
+
+	# ns1 tries to connect to ns0:1405.  With default settings this should connect
+	# to client, it matches the conntrack entry created above.
+
+	result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404)
+
+	if [ "$result" = "$expect" ] ;then
+		echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}"
+	else
+		echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended"
+		ret=1
+	fi
+
+	kill $sc_r $sc_c 2>/dev/null
+
+	# flush udp entries for next test round, if any
+	ip netns exec "$ns0" conntrack -F >/dev/null 2>&1
+}
+
+# This prevents port shadow of router service via packet filter,
+# packets claiming to originate from service port from internal
+# network are dropped.
+test_port_shadow_filter()
+{
+	local family=$1
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family filter {
+	chain forward {
+		type filter hook forward priority 0; policy accept;
+		meta iif veth1 udp sport 1405 drop
+	}
+}
+EOF
+	test_port_shadow "port-filter" "ROUTER"
+
+	ip netns exec "$ns0" nft delete table "$family" filter
+}
+
+# This prevents port shadow of router service via notrack.
+test_port_shadow_notrack()
+{
+	local family=$1
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family raw {
+	chain prerouting {
+		type filter hook prerouting priority -300; policy accept;
+		meta iif veth0 udp dport 1405 notrack
+	}
+	chain output {
+		type filter hook output priority -300; policy accept;
+		meta oif veth0 udp sport 1405 notrack
+	}
+}
+EOF
+	test_port_shadow "port-notrack" "ROUTER"
+
+	ip netns exec "$ns0" nft delete table "$family" raw
+}
+
+# This prevents port shadow of router service via sport remap.
+test_port_shadow_pat()
+{
+	local family=$1
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family pat {
+	chain postrouting {
+		type nat hook postrouting priority -1; policy accept;
+		meta iif veth1 udp sport <= 1405 masquerade to : 1406-65535 random
+	}
+}
+EOF
+	test_port_shadow "pat" "ROUTER"
+
+	ip netns exec "$ns0" nft delete table "$family" pat
+}
+
+test_port_shadowing()
+{
+	local family="ip"
+
+	if ! conntrack -h >/dev/null 2>&1;then
+		echo "SKIP: Could not run nat port shadowing test without conntrack tool"
+		return
+	fi
+
+	if ! socat -h > /dev/null 2>&1;then
+		echo "SKIP: Could not run nat port shadowing test without socat tool"
+		return
+	fi
+
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		meta oif veth0 masquerade
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add $family masquerade hook"
+		return $ksft_skip
+	fi
+
+	# test default behaviour. Packet from ns1 to ns0 is redirected to ns2.
+	test_port_shadow "default" "CLIENT"
+
+	# test packet filter based mitigation: prevent forwarding of
+	# packets claiming to come from the service port.
+	test_port_shadow_filter "$family"
+
+	# test conntrack based mitigation: connections going or coming
+	# from router:service bypass connection tracking.
+	test_port_shadow_notrack "$family"
+
+	# test nat based mitigation: fowarded packets coming from service port
+	# are masqueraded with random highport.
+	test_port_shadow_pat "$family"
+
+	ip netns exec "$ns0" nft delete table $family nat
+}
+
+test_stateless_nat_ip()
+{
+	local lret=0
+
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules"
+		return 1
+	fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table ip stateless {
+	map xlate_in {
+		typeof meta iifname . ip saddr . ip daddr : ip daddr
+		elements = {
+			"veth1" . 10.0.2.99 . 10.0.1.99 : 10.0.2.2,
+		}
+	}
+	map xlate_out {
+		typeof meta iifname . ip saddr . ip daddr : ip daddr
+		elements = {
+			"veth0" . 10.0.1.99 . 10.0.2.2 : 10.0.2.99
+		}
+	}
+
+	chain prerouting {
+		type filter hook prerouting priority -400; policy accept;
+		ip saddr set meta iifname . ip saddr . ip daddr map @xlate_in
+		ip daddr set meta iifname . ip saddr . ip daddr map @xlate_out
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add ip statless rules"
+		return $ksft_skip
+	fi
+
+	reset_counters
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null; then
+		echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules"
+		lret=1
+	fi
+
+	# ns1 should have seen packets from .2.2, due to stateless rewrite.
+	expect="packets 1 bytes 84"
+	if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then
+		bad_counter "$ns1" ns0insl "$expect" "test_stateless 1"
+		lret=1
+	fi
+
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then
+			bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2"
+			lret=1
+		fi
+	done
+
+	# ns1 should not have seen packets from ns2, due to masquerade
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		if ! ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then
+			bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3"
+			lret=1
+		fi
+
+		if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then
+			bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+	if ! socat -h > /dev/null 2>&1;then
+		echo "SKIP: Could not run stateless nat frag test without socat tool"
+		if [ $lret -eq 0 ]; then
+			return $ksft_skip
+		fi
+
+		ip netns exec "$ns0" nft delete table ip stateless
+		return $lret
+	fi
+
+	dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null
+
+	ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null &
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u"
+
+	# re-do with large ping -> ip fragmentation
+	if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then
+		echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2
+		lret=1
+	fi
+
+	wait
+
+	if ! cmp "$INFILE" "$OUTFILE";then
+		ls -l "$INFILE" "$OUTFILE"
+		echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2
+		lret=1
+	fi
+
+	:> "$OUTFILE"
+
+	# ns1 should have seen packets from 2.2, due to stateless rewrite.
+	expect="packets 3 bytes 4164"
+	if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then
+		bad_counter "$ns1" ns0insl "$expect" "test_stateless 5"
+		lret=1
+	fi
+
+	if ! ip netns exec "$ns0" nft delete table ip stateless; then
+		echo "ERROR: Could not delete table ip stateless" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: IP statless for $ns2"
+
+	return $lret
+}
+
+# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99
+for i in "$ns0" "$ns1" "$ns2" ;do
+ip netns exec "$i" nft -f /dev/stdin <<EOF
+table inet filter {
+	counter ns0in {}
+	counter ns1in {}
+	counter ns2in {}
+
+	counter ns0out {}
+	counter ns1out {}
+	counter ns2out {}
+
+	counter ns0in6 {}
+	counter ns1in6 {}
+	counter ns2in6 {}
+
+	counter ns0out6 {}
+	counter ns1out6 {}
+	counter ns2out6 {}
+
+	map nsincounter {
+		type ipv4_addr : counter
+		elements = { 10.0.1.1 : "ns0in",
+			     10.0.2.1 : "ns0in",
+			     10.0.1.99 : "ns1in",
+			     10.0.2.99 : "ns2in" }
+	}
+
+	map nsincounter6 {
+		type ipv6_addr : counter
+		elements = { dead:1::1 : "ns0in6",
+			     dead:2::1 : "ns0in6",
+			     dead:1::99 : "ns1in6",
+			     dead:2::99 : "ns2in6" }
+	}
+
+	map nsoutcounter {
+		type ipv4_addr : counter
+		elements = { 10.0.1.1 : "ns0out",
+			     10.0.2.1 : "ns0out",
+			     10.0.1.99: "ns1out",
+			     10.0.2.99: "ns2out" }
+	}
+
+	map nsoutcounter6 {
+		type ipv6_addr : counter
+		elements = { dead:1::1 : "ns0out6",
+			     dead:2::1 : "ns0out6",
+			     dead:1::99 : "ns1out6",
+			     dead:2::99 : "ns2out6" }
+	}
+
+	chain input {
+		type filter hook input priority 0; policy accept;
+		counter name ip saddr map @nsincounter
+		icmpv6 type { "echo-request", "echo-reply" } counter name ip6 saddr map @nsincounter6
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+		counter name ip daddr map @nsoutcounter
+		icmpv6 type { "echo-request", "echo-reply" } counter name ip6 daddr map @nsoutcounter6
+	}
+}
+EOF
+done
+
+# special case for stateless nat check, counter needs to
+# be done before (input) ip defragmentation
+ip netns exec "$ns1" nft -f /dev/stdin <<EOF
+table inet filter {
+	counter ns0insl {}
+
+	chain pre {
+		type filter hook prerouting priority -400; policy accept;
+		ip saddr 10.0.2.2 counter name "ns0insl"
+	}
+}
+EOF
+
+ping_basic()
+{
+	i="$1"
+	if ! ip netns exec "$ns0" ping -c 1 -q 10.0."$i".99 > /dev/null;then
+		echo "ERROR: Could not reach other namespace(s)" 1>&2
+		ret=1
+	fi
+
+	if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then
+		echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2
+		ret=1
+	fi
+}
+
+test_basic_conn()
+{
+	local nsexec
+	name="$1"
+
+	nsexec=$(eval echo \$"$1")
+
+	ping_basic 1
+	ping_basic 2
+
+	if ! check_counters "$nsexec";then
+		return 1
+	fi
+
+	if ! check_ns0_counters "$name";then
+		return 1
+	fi
+
+	reset_counters
+	return 0
+}
+
+if ! test_basic_conn "ns1" ; then
+	echo "ERROR: basic test for ns1 failed" 1>&2
+	exit 1
+fi
+if ! test_basic_conn "ns2"; then
+	echo "ERROR: basic test for ns1 failed" 1>&2
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2"
+fi
+
+reset_counters
+test_local_dnat ip
+test_local_dnat6 ip6
+
+reset_counters
+test_local_dnat_portonly inet 10.0.1.99
+
+reset_counters
+$test_inet_nat && test_local_dnat inet
+$test_inet_nat && test_local_dnat6 inet
+
+for flags in "" "fully-random"; do
+reset_counters
+test_masquerade ip $flags
+test_masquerade6 ip6 $flags
+reset_counters
+$test_inet_nat && test_masquerade inet $flags
+$test_inet_nat && test_masquerade6 inet $flags
+done
+
+reset_counters
+test_redirect ip
+test_redirect6 ip6
+reset_counters
+$test_inet_nat && test_redirect inet
+$test_inet_nat && test_redirect6 inet
+
+test_port_shadowing
+test_stateless_nat_ip
+
+if [ $ret -ne 0 ];then
+	echo -n "FAIL: "
+	nft --version
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh
new file mode 100755
index 000000000000..9f200f80253a
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+#
+# Test connection tracking zone and NAT source port reallocation support.
+#
+
+source lib.sh
+
+# Don't increase too much, 2000 clients should work
+# just fine but script can then take several minutes with
+# KASAN/debug builds.
+maxclients=100
+
+have_socat=0
+ret=0
+
+[ "$KSFT_MACHINE_SLOW" = yes ] && maxclients=40
+# client1---.
+#            veth1-.
+#                  |
+#               NAT Gateway --veth0--> Server
+#                  | |
+#            veth2-' |
+# client2---'        |
+#  ....              |
+# clientX----vethX---'
+
+# All clients share identical IP address.
+# NAT Gateway uses policy routing and conntrack zones to isolate client
+# namespaces.  Each client connects to Server, each with colliding tuples:
+#   clientsaddr:10000 -> serveraddr:dport
+#   NAT Gateway is supposed to do port reallocation for each of the
+#   connections.
+
+v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null)
+v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null)
+v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null)
+v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null)
+v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null)
+v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null)
+
+cleanup()
+{
+	cleanup_all_ns
+
+	sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null
+	sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null
+	sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null
+}
+
+checktool "nft --version" echo "run test without nft tool"
+checktool "conntrack -V" "run test without conntrack tool"
+
+if socat -h >/dev/null 2>&1; then
+	have_socat=1
+fi
+
+setup_ns gw srv
+
+trap cleanup EXIT
+
+ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv"
+ip -net "$gw" link set veth0 up
+ip -net "$srv" link set eth0 up
+
+sysctl -q net.ipv6.neigh.default.gc_thresh1=512  2>/dev/null
+sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null
+sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null
+sysctl -q net.ipv4.neigh.default.gc_thresh1=512  2>/dev/null
+sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null
+sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null
+
+for i in $(seq 1 "$maxclients");do
+  setup_ns "cl$i"
+
+  cl=$(eval echo \$cl"$i")
+  if ! ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+  fi
+done
+
+for i in $(seq 1 "$maxclients");do
+  cl=$(eval echo \$cl"$i")
+  echo netns exec "$cl" ip link set eth0 up
+  echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2
+  echo netns exec "$gw" ip link set "veth$i" up
+  echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2
+
+  # clients have same IP addresses.
+  echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0
+  echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad
+  echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0
+  echo netns exec "$cl" ip route add default via dead:1::2 dev eth0
+
+  # NB: same addresses on client-facing interfaces.
+  echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i"
+  echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad
+
+  # gw: policy routing
+  echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i))
+  echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i))
+  echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i))
+  echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i))
+  echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i))
+done | ip -batch /dev/stdin
+
+ip -net "$gw" addr add 10.3.0.1/24 dev veth0
+ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad
+
+ip -net "$srv" addr add 10.3.0.99/24 dev eth0
+ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad
+
+ip netns exec "$gw" nft -f /dev/stdin<<EOF
+table inet raw {
+	map iiftomark {
+		type ifname : mark
+	}
+
+	map iiftozone {
+		typeof iifname : ct zone
+	}
+
+	set inicmp {
+		flags dynamic
+		type ipv4_addr . ifname . ipv4_addr
+	}
+	set inflows {
+		flags dynamic
+		type ipv4_addr . inet_service . ifname . ipv4_addr . inet_service
+	}
+
+	set inflows6 {
+		flags dynamic
+		type ipv6_addr . inet_service . ifname . ipv6_addr . inet_service
+	}
+
+	chain prerouting {
+		type filter hook prerouting priority -64000; policy accept;
+		ct original zone set meta iifname map @iiftozone
+		meta mark set meta iifname map @iiftomark
+
+		tcp flags & (syn|ack) == ack add @inflows { ip saddr . tcp sport . meta iifname . ip daddr . tcp dport counter }
+		add @inflows6 { ip6 saddr . tcp sport . meta iifname . ip6 daddr . tcp dport counter }
+		ip protocol icmp add @inicmp { ip saddr . meta iifname . ip daddr counter }
+	}
+
+	chain nat_postrouting {
+		type nat hook postrouting priority 0; policy accept;
+                ct mark set meta mark meta oifname veth0 masquerade
+	}
+
+	chain mangle_prerouting {
+		type filter hook prerouting priority -100; policy accept;
+		ct direction reply meta mark set ct mark
+	}
+}
+EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: Could not add nftables rules"
+	exit $ksft_skip
+fi
+
+( echo add element inet raw iiftomark \{
+	for i in $(seq 1 $((maxclients-1))); do
+		echo \"veth"$i"\" : "$i",
+	done
+	echo \"veth"$maxclients"\" : "$maxclients" \}
+	echo add element inet raw iiftozone \{
+	for i in $(seq 1 $((maxclients-1))); do
+		echo \"veth"$i"\" : "$i",
+	done
+	echo \"veth$maxclients\" : $maxclients \}
+) | ip netns exec "$gw" nft -f /dev/stdin
+
+ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null
+
+# useful for debugging: allows to use 'ping' from clients to gateway.
+ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null
+ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null
+
+for i in $(seq 1 "$maxclients"); do
+  cl=$(eval echo \$cl"$i")
+  ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
+done
+
+wait || ret=1
+
+[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2
+
+for i in $(seq 1 "$maxclients"); do
+   if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then
+      ret=1
+      echo "FAIL: counter icmp mismatch for veth$i" 1>&2
+      ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2
+      break
+   fi
+done
+
+if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then
+    ret=1
+    echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"
+    ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2
+fi
+
+if [ $ret -eq 0 ]; then
+	echo "PASS: ping test from all $maxclients namespaces"
+fi
+
+if [ $have_socat -eq 0 ];then
+	echo "SKIP: socat not installed"
+	if [ $ret -ne 0 ];then
+	    exit $ret
+	fi
+	exit $ksft_skip
+fi
+
+listener_ready()
+{
+	ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201
+}
+
+ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null &
+socatpid=$!
+
+busywait 1000 listener_ready "$srv"
+
+for i in $(seq 1 "$maxclients"); do
+  if [ $ret -ne 0 ]; then
+     break
+  fi
+  cl=$(eval echo \$cl"$i")
+  if ! ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then
+     echo "FAIL: Failure to connect for $cl" 1>&2
+     ip netns exec "$gw" conntrack -S 1>&2
+     ret=1
+  fi
+done
+if [ $ret -eq 0 ];then
+	echo "PASS: socat connections for all $maxclients net namespaces"
+fi
+
+kill $socatpid
+wait
+
+for i in $(seq 1 "$maxclients"); do
+   if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then
+      ret=1
+      echo "FAIL: can't find expected tcp entry for veth$i" 1>&2
+      break
+   fi
+done
+if [ $ret -eq 0 ];then
+	echo "PASS: Found client connection for all $maxclients net namespaces"
+fi
+
+if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then
+    ret=1
+    echo "FAIL: cannot find return entry on veth0" 1>&2
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
new file mode 100755
index 000000000000..6136ceec45e0
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -0,0 +1,672 @@
+#!/bin/bash
+#
+# This tests nf_queue:
+# 1. can process packets from all hooks
+# 2. support running nfqueue from more than one base chain
+#
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+timeout=5
+
+SCTP_TEST_TIMEOUT=60
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+	ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+
+	rm -f "$TMPINPUT"
+	rm -f "$TMPFILE0"
+	rm -f "$TMPFILE1"
+	rm -f "$TMPFILE2" "$TMPFILE3"
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+modprobe -q sctp
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2 ns3 nsrouter
+
+TMPFILE0=$(mktemp)
+TMPFILE1=$(mktemp)
+TMPFILE2=$(mktemp)
+TMPFILE3=$(mktemp)
+
+TMPINPUT=$(mktemp)
+COUNT=200
+[ "$KSFT_MACHINE_SLOW" = "yes" ] && COUNT=$((COUNT/8))
+dd conv=sparse status=none if=/dev/zero bs=1M count=$COUNT of="$TMPINPUT"
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+load_ruleset() {
+	local name=$1
+	local prio=$2
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table inet $name {
+	chain nfq {
+		ip protocol icmp queue bypass
+		icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass
+	}
+	chain pre {
+		type filter hook prerouting priority $prio; policy accept;
+		jump nfq
+	}
+	chain input {
+		type filter hook input priority $prio; policy accept;
+		jump nfq
+	}
+	chain forward {
+		type filter hook forward priority $prio; policy accept;
+		tcp dport 12345 queue num 2
+		jump nfq
+	}
+	chain output {
+		type filter hook output priority $prio; policy accept;
+		tcp dport 12345 queue num 3
+		tcp sport 23456 queue num 3
+		jump nfq
+	}
+	chain post {
+		type filter hook postrouting priority $prio; policy accept;
+		jump nfq
+	}
+}
+EOF
+}
+
+load_counter_ruleset() {
+	local prio=$1
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table inet countrules {
+	chain pre {
+		type filter hook prerouting priority $prio; policy accept;
+		counter
+	}
+	chain input {
+		type filter hook input priority $prio; policy accept;
+		counter
+	}
+	chain forward {
+		type filter hook forward priority $prio; policy accept;
+		counter
+	}
+	chain output {
+		type filter hook output priority $prio; policy accept;
+		counter
+	}
+	chain post {
+		type filter hook postrouting priority $prio; policy accept;
+		counter
+	}
+}
+EOF
+}
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
+
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
+
+  return 0
+}
+
+test_queue_blackhole() {
+	local proto=$1
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table $proto blackh {
+	chain forward {
+	type filter hook forward priority 0; policy accept;
+		queue num 600
+	}
+}
+EOF
+	if [ "$proto" = "ip" ] ;then
+		ip netns exec "$ns1" ping -W 2 -c 1 -q 10.0.2.99 > /dev/null
+		lret=$?
+	elif [ "$proto" = "ip6" ]; then
+		ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null
+		lret=$?
+	else
+		lret=111
+	fi
+
+	# queue without bypass keyword should drop traffic if no listener exists.
+	if [ "$lret" -eq 0 ];then
+		echo "FAIL: $proto expected failure, got $lret" 1>&2
+		exit 1
+	fi
+
+	if ! ip netns exec "$nsrouter" nft delete table "$proto" blackh; then
+	        echo "FAIL: $proto: Could not delete blackh table"
+	        exit 1
+	fi
+
+        echo "PASS: $proto: statement with no listener results in packet drop"
+}
+
+nf_queue_wait()
+{
+	local procfile="/proc/self/net/netfilter/nfnetlink_queue"
+	local netns id
+
+	netns="$1"
+	id="$2"
+
+	# if this file doesn't exist, nfnetlink_module isn't loaded.
+	# rather than loading it ourselves, wait for kernel module autoload
+	# completion, nfnetlink should do so automatically because nf_queue
+	# helper program, spawned in the background, asked for this functionality.
+	test -f "$procfile" &&
+		ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id "
+}
+
+test_queue()
+{
+	local expected="$1"
+	local last=""
+
+	# spawn nf_queue listeners
+	ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" &
+	ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" &
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1
+
+	if ! test_ping;then
+		echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	if ! test_ping_router;then
+		echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	wait
+	ret=$?
+
+	for file in $TMPFILE0 $TMPFILE1; do
+		last=$(tail -n1 "$file")
+		if [ x"$last" != x"$expected packets total" ]; then
+			echo "FAIL: Expected $expected packets total, but got $last" 1>&2
+			ip netns exec "$nsrouter" nft list ruleset
+			exit 1
+		fi
+	done
+
+	echo "PASS: Expected and received $last"
+}
+
+listener_ready()
+{
+	ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345
+}
+
+test_tcp_forward()
+{
+	ip netns exec "$nsrouter" ./nf_queue -q 2 &
+	local nfqpid=$!
+
+	timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2"
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 2
+
+	local tthen=$(date +%s)
+
+	ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null
+
+	wait_and_check_retval "$rpid" "tcp and nfqueue in forward chain" "$tthen"
+	kill "$nfqpid"
+}
+
+test_tcp_localhost()
+{
+	timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -q 3 &
+	local nfqpid=$!
+	local tthen=$(date +%s)
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter"
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3
+
+	ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null
+
+	wait_and_check_retval "$rpid" "tcp via loopback" "$tthen"
+	kill "$nfqpid"
+}
+
+test_tcp_localhost_connectclose()
+{
+	ip netns exec "$nsrouter" ./nf_queue -q 3 &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3
+
+	timeout 10 ip netns exec "$nsrouter" ./connect_close -p 23456 -t 3
+
+	kill "$nfqpid"
+	wait && echo "PASS: tcp via loopback with connect/close"
+}
+
+test_tcp_localhost_requeue()
+{
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+}
+EOF
+	timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" &
+
+	# nfqueue 1 will be called via output hook.  But this time,
+        # re-queue the packet to nfqueue program on queue 2.
+	ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" &
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter"
+	ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null
+
+	wait
+
+	if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then
+		echo "FAIL: lost packets during requeue?!" 1>&2
+		return
+	fi
+
+	echo "PASS: tcp via loopback and re-queueing"
+}
+
+test_icmp_vrf() {
+	if ! ip -net "$ns1" link add tvrf type vrf table 9876;then
+		echo "SKIP: Could not add vrf device"
+		return
+	fi
+
+	ip -net "$ns1" li set eth0 master tvrf
+	ip -net "$ns1" li set tvrf up
+
+	ip -net "$ns1" route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876
+ip netns exec "$ns1" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+}
+EOF
+	ip netns exec "$ns1" ./nf_queue -q 1 &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1
+
+	ip netns exec "$ns1" ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null
+
+	for n in output post; do
+		for d in tvrf eth0; do
+			if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then
+				kill "$nfqpid"
+				echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2
+				ip netns exec "$ns1" nft list ruleset
+				ret=1
+				return
+			fi
+		done
+	done
+
+	kill "$nfqpid"
+	echo "PASS: icmp+nfqueue via vrf"
+}
+
+sctp_listener_ready()
+{
+	ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345
+}
+
+check_output_files()
+{
+	local f1="$1"
+	local f2="$2"
+	local err="$3"
+
+	if ! cmp "$f1" "$f2" ; then
+		echo "FAIL: $err: input and output file differ" 1>&2
+		echo -n " Input file" 1>&2
+		ls -l "$f1" 1>&2
+		echo -n "Output file" 1>&2
+		ls -l "$f2" 1>&2
+		ret=1
+	fi
+}
+
+wait_and_check_retval()
+{
+	local rpid="$1"
+	local msg="$2"
+	local tthen="$3"
+	local tnow=$(date +%s)
+
+	if wait "$rpid";then
+		echo -n "PASS: "
+	else
+		echo -n "FAIL: "
+		ret=1
+	fi
+
+	printf "%s (duration: %ds)\n" "$msg" $((tnow-tthen))
+}
+
+test_sctp_forward()
+{
+	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet sctpq {
+        chain forward {
+        type filter hook forward priority 0; policy accept;
+                sctp dport 12345 queue num 10
+        }
+}
+EOF
+	timeout "$SCTP_TEST_TIMEOUT" ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" &
+	local rpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2"
+
+	ip netns exec "$nsrouter" ./nf_queue -q 10 -G &
+	local nfqpid=$!
+	local tthen=$(date +%s)
+
+	ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null
+
+	if ! ip netns exec "$nsrouter" nft delete table inet sctpq; then
+		echo "FAIL:  Could not delete sctpq table"
+		exit 1
+	fi
+
+	wait_and_check_retval "$rpid" "sctp and nfqueue in forward chain" "$tthen"
+	kill "$nfqpid"
+
+	check_output_files "$TMPINPUT" "$TMPFILE1" "sctp forward"
+}
+
+test_sctp_output()
+{
+        ip netns exec "$ns1" nft -f /dev/stdin <<EOF
+table inet sctpq {
+        chain output {
+        type filter hook output priority 0; policy accept;
+                sctp dport 12345 queue num 11
+        }
+}
+EOF
+	# reduce test file size, software segmentation causes sk wmem increase.
+	dd conv=sparse status=none if=/dev/zero bs=1M count=$((COUNT/2)) of="$TMPINPUT"
+
+	timeout "$SCTP_TEST_TIMEOUT" ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" &
+	local rpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2"
+
+	ip netns exec "$ns1" ./nf_queue -q 11 &
+	local nfqpid=$!
+	local tthen=$(date +%s)
+
+	ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null
+
+	if ! ip netns exec "$ns1" nft delete table inet sctpq; then
+		echo "FAIL:  Could not delete sctpq table"
+		exit 1
+	fi
+
+	# must wait before checking completeness of output file.
+	wait_and_check_retval "$rpid" "sctp and nfqueue in output chain with GSO" "$tthen"
+	kill "$nfqpid"
+
+	check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output"
+}
+
+udp_listener_ready()
+{
+	ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
+}
+
+output_files_written()
+{
+	test -s "$1" && test -s "$2"
+}
+
+test_udp_ct_race()
+{
+        ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet udpq {
+	chain prerouting {
+		type nat hook prerouting priority dstnat - 5; policy accept;
+		ip daddr 10.6.6.6 udp dport 12345 counter dnat to numgen inc mod 2 map { 0 : 10.0.2.99, 1 : 10.0.3.99 }
+	}
+        chain postrouting {
+		type filter hook postrouting priority srcnat - 5; policy accept;
+		udp dport 12345 counter queue num 12
+        }
+}
+EOF
+	:> "$TMPFILE1"
+	:> "$TMPFILE2"
+
+	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
+	local rpid1=$!
+
+	timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork,pf=ipv4 OPEN:"$TMPFILE2",trunc &
+	local rpid2=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
+
+	# Send two packets, one should end up in ns1, other in ns2.
+	# This is because nfqueue will delay packet for long enough so that
+	# second packet will not find existing conntrack entry.
+	echo "Packet 1" | ip netns exec "$ns1" socat -u STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+	echo "Packet 2" | ip netns exec "$ns1" socat -u STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+
+	busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
+
+	kill "$nfqpid"
+
+	if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
+		echo "FAIL: Expected One udp conntrack entry"
+		ip netns exec "$nsrouter" conntrack -L -p udp --dport 12345
+		ret=1
+	fi
+
+	if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
+		echo "FAIL: Could not delete udpq table"
+		ret=1
+		return
+	fi
+
+	NUMLINES1=$(wc -l < "$TMPFILE1")
+	NUMLINES2=$(wc -l < "$TMPFILE2")
+
+	if [ "$NUMLINES1" -ne 1 ] || [ "$NUMLINES2" -ne 1 ]; then
+		ret=1
+		echo "FAIL: uneven udp packet distribution: $NUMLINES1 $NUMLINES2"
+		echo -n "$TMPFILE1: ";cat "$TMPFILE1"
+		echo -n "$TMPFILE2: ";cat "$TMPFILE2"
+		return
+	fi
+
+	echo "PASS: both udp receivers got one packet each"
+}
+
+test_queue_removal()
+{
+	read tainted_then < /proc/sys/kernel/tainted
+
+	ip netns exec "$ns1" nft -f - <<EOF
+flush ruleset
+table ip filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		ip protocol icmp queue num 0
+	}
+}
+EOF
+	ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 0
+
+	ip netns exec "$ns1" ping -w 2 -f -c 10 127.0.0.1 -q >/dev/null
+	kill $nfqpid
+
+	ip netns exec "$ns1" nft flush ruleset
+
+	if [ "$tainted_then" -ne 0 ];then
+		return
+	fi
+
+	read tainted_now < /proc/sys/kernel/tainted
+	if [ "$tainted_now" -eq 0 ];then
+		echo "PASS: queue program exiting while packets queued"
+	else
+		echo "TAINT: queue program exiting while packets queued"
+		dmesg
+		ret=1
+	fi
+}
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+load_ruleset "filter" 0
+
+if test_ping; then
+	# queue bypass works (rules were skipped, no listener)
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+test_queue_blackhole ip
+test_queue_blackhole ip6
+
+# dummy ruleset to add base chains between the
+# queueing rules.  We don't want the second reinject
+# to re-execute the old hooks.
+load_counter_ruleset 10
+
+# we are hooking all: prerouting/input/forward/output/postrouting.
+# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so:
+# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply).
+# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply.
+# so we expect that userspace program receives 10 packets.
+test_queue 10
+
+# same.  We queue to a second program as well.
+load_ruleset "filter2" 20
+test_queue 20
+ip netns exec "$ns1" nft flush ruleset
+
+test_tcp_forward
+test_tcp_localhost
+test_tcp_localhost_connectclose
+test_tcp_localhost_requeue
+test_sctp_forward
+test_sctp_output
+test_udp_ct_race
+
+# should be last, adds vrf device in ns1 and changes routes
+test_icmp_vrf
+test_queue_removal
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh
new file mode 100755
index 000000000000..293f667a6aec
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+ret=0
+
+checktool "nft --version" "run test without nft tool"
+checktool "iperf3 --version" "run test without iperf3"
+
+setup_ns nsr ns1 ns2
+
+modprobe -q nf_conntrack
+
+cleanup() {
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1"
+ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2"
+
+for dev in veth0 veth1; do
+	ip -net "$nsr" link set "$dev" up
+done
+
+ip -net "$nsr" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr" addr add 10.0.2.1/24 dev veth1
+
+ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1
+ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1
+ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0
+
+for n in $ns1 $ns2; do
+  ip -net "$n" link set eth0 up
+done
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns2" route add default via 10.0.2.1
+
+# test basic connectivity
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+  echo "ERROR: $ns1 cannot reach $ns2" 1>&2
+  exit 1
+fi
+
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
+  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
+  exit 1
+fi
+
+ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 &
+# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 &
+
+sleep 1
+
+ip netns exec "$nsr" nft -f - <<EOF
+table inet filter {
+   chain prerouting {
+      type filter hook prerouting priority -300; policy accept;
+      meta iif veth0 tcp flags syn counter notrack
+   }
+
+  chain forward {
+      type filter hook forward priority 0; policy accept;
+
+      ct state new,established counter accept
+
+      meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp
+
+      ct state invalid counter drop
+
+      # make ns2 unreachable w.o. tcp synproxy
+      tcp flags syn counter drop
+   }
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "SKIP: Cannot add nft synproxy"
+	exit $ksft_skip
+fi
+
+if ! ip netns exec "$ns1" timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null; then
+	echo "FAIL: iperf3 returned an error" 1>&2
+	ret=1
+	ip netns exec "$nsr" nft list ruleset
+else
+	echo "PASS: synproxy connection successful"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh
new file mode 100755
index 000000000000..e208fb03eeb7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh
@@ -0,0 +1,358 @@
+#!/bin/bash
+#
+# This tests tproxy on the following scenario:
+#
+#                         +------------+
+# +-------+               |  nsrouter  |                  +-------+
+# |ns1    |.99          .1|            |.1             .99|    ns2|
+# |   eth0|---------------|veth0  veth1|------------------|eth0   |
+# |       |  10.0.1.0/24  |            |   10.0.2.0/24    |       |
+# +-------+  dead:1::/64  |    veth2   |   dead:2::/64    +-------+
+#                         +------------+
+#                                |.1
+#                                |
+#                                |
+#                                |                        +-------+
+#                                |                     .99|    ns3|
+#                                +------------------------|eth0   |
+#                                       10.0.3.0/24       |       |
+#                                       dead:3::/64       +-------+
+#
+# The tproxy implementation acts as an echo server so the client
+# must receive the same message it sent if it has been proxied.
+# If is not proxied the servers return PONG_NS# with the number
+# of the namespace the server is running.
+#
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+timeout=5
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+	ip netns pids "$ns3" | xargs kill 2>/dev/null
+	ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
+
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
+
+  return 0
+}
+
+
+listener_ready()
+{
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_tproxy()
+{
+	local traffic_origin="$1"
+	local ip_proto="$2"
+	local expect_ns1_ns2="$3"
+	local expect_ns1_ns3="$4"
+	local expect_nsrouter_ns2="$5"
+	local expect_nsrouter_ns3="$6"
+
+	# derived variables
+	local testname="test_${ip_proto}_tcp_${traffic_origin}"
+	local socat_ipproto
+	local ns1_ip
+	local ns2_ip
+	local ns3_ip
+	local ns2_target
+	local ns3_target
+	local nftables_subject
+	local ip_command
+
+	# socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+	case $ip_proto in
+	"ip")
+		socat_ipproto="-4"
+		ns1_ip=10.0.1.99
+		ns2_ip=10.0.2.99
+		ns3_ip=10.0.3.99
+		ns2_target="tcp:$ns2_ip:8080"
+		ns3_target="tcp:$ns3_ip:8080"
+		nftables_subject="ip daddr $ns2_ip tcp dport 8080"
+		ip_command="ip"
+	;;
+	"ip6")
+		socat_ipproto="-6"
+		ns1_ip=dead:1::99
+		ns2_ip=dead:2::99
+		ns3_ip=dead:3::99
+		ns2_target="tcp:[$ns2_ip]:8080"
+		ns3_target="tcp:[$ns3_ip]:8080"
+		nftables_subject="ip6 daddr $ns2_ip tcp dport 8080"
+		ip_command="ip -6"
+	;;
+	*)
+	echo "FAIL: unsupported protocol"
+	exit 255
+	;;
+	esac
+
+	case $traffic_origin in
+	# to capture the local originated traffic we need to mark the outgoing
+	# traffic so the policy based routing rule redirects it and can be processed
+	# in the prerouting chain.
+	"local")
+		nftables_rules="
+flush ruleset
+table inet filter {
+	chain divert {
+		type filter hook prerouting priority 0; policy accept;
+		$nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
+	}
+	chain output {
+		type route hook output priority 0; policy accept;
+		$nftables_subject meta mark set 1 accept
+	}
+}"
+	;;
+	"forward")
+		nftables_rules="
+flush ruleset
+table inet filter {
+	chain divert {
+		type filter hook prerouting priority 0; policy accept;
+		$nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
+	}
+}"
+	;;
+	*)
+	echo "FAIL: unsupported parameter for traffic origin"
+	exit 255
+	;;
+	esac
+
+	# shellcheck disable=SC2046 # Intended splitting of ip_command
+	ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100
+	ip netns exec "$nsrouter" $ip_command route add local "${ns2_ip}" dev lo table 100
+	echo "$nftables_rules" | ip netns exec "$nsrouter" nft -f /dev/stdin
+
+	timeout "$timeout" ip netns exec "$nsrouter" socat "$socat_ipproto" tcp-listen:12345,fork,ip-transparent SYSTEM:"cat" 2>/dev/null &
+	local tproxy_pid=$!
+
+	timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+	local server2_pid=$!
+
+	timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+	local server3_pid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-t"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-t"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-t"
+
+	local result
+	# request from ns1 to ns2 (forwarded traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns2_target")
+	if [ "$result" == "$expect_ns1_ns2" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
+	else
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
+		ret=1
+	fi
+
+	# request from ns1 to ns3(forwarded traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns3_target")
+	if [ "$result" = "$expect_ns1_ns3" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
+		ret=1
+	fi
+
+	# request from nsrouter to ns2 (localy originated traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns2_target")
+	if [ "$result" == "$expect_nsrouter_ns2" ] ;then
+		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2"
+	else
+		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended"
+		ret=1
+	fi
+
+	# request from nsrouter to ns3 (localy originated traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns3_target")
+	if [ "$result" = "$expect_nsrouter_ns3" ] ;then
+		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\"  as intended"
+		ret=1
+	fi
+
+	# cleanup
+	kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
+	# shellcheck disable=SC2046 # Intended splitting of ip_command
+	ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100
+	ip netns exec "$nsrouter" $ip_command route flush table 100
+}
+
+
+test_ipv4_tcp_forward()
+{
+	local traffic_origin="forward"
+	local ip_proto="ip"
+	local expect_ns1_ns2="I_M_PROXIED"
+	local expect_ns1_ns3="PONG_NS3"
+	local expect_nsrouter_ns2="PONG_NS2"
+	local expect_nsrouter_ns3="PONG_NS3"
+
+	test_tproxy     "$traffic_origin" \
+			"$ip_proto" \
+			"$expect_ns1_ns2" \
+			"$expect_ns1_ns3" \
+			"$expect_nsrouter_ns2" \
+			"$expect_nsrouter_ns3"
+}
+
+test_ipv4_tcp_local()
+{
+	local traffic_origin="local"
+	local ip_proto="ip"
+	local expect_ns1_ns2="I_M_PROXIED"
+	local expect_ns1_ns3="PONG_NS3"
+	local expect_nsrouter_ns2="I_M_PROXIED"
+	local expect_nsrouter_ns3="PONG_NS3"
+
+	test_tproxy     "$traffic_origin" \
+			"$ip_proto" \
+			"$expect_ns1_ns2" \
+			"$expect_ns1_ns3" \
+			"$expect_nsrouter_ns2" \
+			"$expect_nsrouter_ns3"
+}
+
+test_ipv6_tcp_forward()
+{
+	local traffic_origin="forward"
+	local ip_proto="ip6"
+	local expect_ns1_ns2="I_M_PROXIED"
+	local expect_ns1_ns3="PONG_NS3"
+	local expect_nsrouter_ns2="PONG_NS2"
+	local expect_nsrouter_ns3="PONG_NS3"
+
+	test_tproxy     "$traffic_origin" \
+			"$ip_proto" \
+			"$expect_ns1_ns2" \
+			"$expect_ns1_ns3" \
+			"$expect_nsrouter_ns2" \
+			"$expect_nsrouter_ns3"
+}
+
+test_ipv6_tcp_local()
+{
+	local traffic_origin="local"
+	local ip_proto="ip6"
+	local expect_ns1_ns2="I_M_PROXIED"
+	local expect_ns1_ns3="PONG_NS3"
+	local expect_nsrouter_ns2="I_M_PROXIED"
+	local expect_nsrouter_ns3="PONG_NS3"
+
+	test_tproxy     "$traffic_origin" \
+			"$ip_proto" \
+			"$expect_ns1_ns2" \
+			"$expect_ns1_ns3" \
+			"$expect_nsrouter_ns2" \
+			"$expect_nsrouter_ns3"
+}
+
+if test_ping; then
+	# queue bypass works (rules were skipped, no listener)
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+test_ipv4_tcp_forward
+test_ipv4_tcp_local
+test_ipv6_tcp_forward
+test_ipv6_tcp_local
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh
new file mode 100755
index 000000000000..d16de13fe5a7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+#
+# This tests tproxy on the following scenario:
+#
+#                         +------------+
+# +-------+               |  nsrouter  |                  +-------+
+# |ns1    |.99          .1|            |.1             .99|    ns2|
+# |   eth0|---------------|veth0  veth1|------------------|eth0   |
+# |       |  10.0.1.0/24  |            |   10.0.2.0/24    |       |
+# +-------+  dead:1::/64  |    veth2   |   dead:2::/64    +-------+
+#                         +------------+
+#                                |.1
+#                                |
+#                                |
+#                                |                        +-------+
+#                                |                     .99|    ns3|
+#                                +------------------------|eth0   |
+#                                       10.0.3.0/24       |       |
+#                                       dead:3::/64       +-------+
+#
+# The tproxy implementation acts as an echo server so the client
+# must receive the same message it sent if it has been proxied.
+# If is not proxied the servers return PONG_NS# with the number
+# of the namespace the server is running.
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+# UDP is slow
+timeout=15
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+	ip netns pids "$ns3" | xargs kill 2>/dev/null
+	ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
+
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
+
+  return 0
+}
+
+
+listener_ready()
+{
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_tproxy_udp_forward()
+{
+	local ip_proto="$1"
+
+	local expect_ns1_ns2="I_M_PROXIED"
+	local expect_ns1_ns3="PONG_NS3"
+	local expect_nsrouter_ns2="PONG_NS2"
+	local expect_nsrouter_ns3="PONG_NS3"
+
+	# derived variables
+	local testname="test_${ip_proto}_udp_forward"
+	local socat_ipproto
+	local ns1_ip
+	local ns2_ip
+	local ns3_ip
+	local ns1_ip_port
+	local ns2_ip_port
+	local ns3_ip_port
+	local ip_command
+
+	# socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+	case $ip_proto in
+	"ip")
+		socat_ipproto="-4"
+		ns1_ip=10.0.1.99
+		ns2_ip=10.0.2.99
+		ns3_ip=10.0.3.99
+		ns1_ip_port="$ns1_ip:18888"
+		ns2_ip_port="$ns2_ip:8080"
+		ns3_ip_port="$ns3_ip:8080"
+		ip_command="ip"
+	;;
+	"ip6")
+		socat_ipproto="-6"
+		ns1_ip=dead:1::99
+		ns2_ip=dead:2::99
+		ns3_ip=dead:3::99
+		ns1_ip_port="[$ns1_ip]:18888"
+		ns2_ip_port="[$ns2_ip]:8080"
+		ns3_ip_port="[$ns3_ip]:8080"
+		ip_command="ip -6"
+	;;
+	*)
+	echo "FAIL: unsupported protocol"
+	exit 255
+	;;
+	esac
+
+	# shellcheck disable=SC2046 # Intended splitting of ip_command
+	ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100
+	ip netns exec "$nsrouter" $ip_command route add local "$ns2_ip" dev lo table 100
+	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain divert {
+		type filter hook prerouting priority 0; policy accept;
+		$ip_proto daddr $ns2_ip udp dport 8080 tproxy $ip_proto to :12345 meta mark set 1 accept
+	}
+}
+EOF
+
+	timeout "$timeout" ip netns exec "$nsrouter" socat -u "$socat_ipproto" udp-listen:12345,fork,ip-transparent,reuseport udp:"$ns1_ip_port",ip-transparent,reuseport,bind="$ns2_ip_port" 2>/dev/null &
+	local tproxy_pid=$!
+
+	timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+	local server2_pid=$!
+
+	timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+	local server3_pid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-u"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u"
+
+	local result
+	# request from ns1 to ns2 (forwarded traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888)
+	if [ "$result" == "$expect_ns1_ns2" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
+	else
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
+		ret=1
+	fi
+
+	# request from ns1 to ns3 (forwarded traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port")
+	if [ "$result" = "$expect_ns1_ns3" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
+		ret=1
+	fi
+
+	# request from nsrouter to ns2 (localy originated traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port")
+	if [ "$result" == "$expect_nsrouter_ns2" ] ;then
+		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2"
+	else
+		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended"
+		ret=1
+	fi
+
+	# request from nsrouter to ns3 (localy originated traffic)
+	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port")
+	if [ "$result" = "$expect_nsrouter_ns3" ] ;then
+		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\"  as intended"
+		ret=1
+	fi
+
+	# cleanup
+	kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
+	# shellcheck disable=SC2046 # Intended splitting of ip_command
+	ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100
+	ip netns exec "$nsrouter" $ip_command route flush table 100
+}
+
+
+if test_ping; then
+	# queue bypass works (rules were skipped, no listener)
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+test_tproxy_udp_forward "ip"
+test_tproxy_udp_forward "ip6"
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh
new file mode 100755
index 000000000000..7db9982ba5a6
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Test insertion speed for packets with identical addresses/ports
+# that are all placed in distinct conntrack zones.
+
+source lib.sh
+
+zones=2000
+[ "$KSFT_MACHINE_SLOW" = yes ] && zones=500
+
+have_ct_tool=0
+ret=0
+
+cleanup()
+{
+	cleanup_all_ns
+}
+
+checktool "nft --version" "run test without nft tool"
+checktool "socat -V" "run test without socat tool"
+
+setup_ns ns1
+
+trap cleanup EXIT
+
+if conntrack -V > /dev/null 2>&1; then
+	have_ct_tool=1
+fi
+
+test_zones() {
+	local max_zones=$1
+
+ip netns exec "$ns1" nft -f /dev/stdin<<EOF
+flush ruleset
+table inet raw {
+	map rndzone {
+		typeof numgen inc mod $max_zones : ct zone
+	}
+
+	chain output {
+		type filter hook output priority -64000; policy accept;
+		udp dport 12345  ct zone set numgen inc mod 65536 map @rndzone
+	}
+}
+EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: Cannot add nftables rules"
+	exit $ksft_skip
+fi
+
+	ip netns exec "$ns1" sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600
+
+	(
+		echo "add element inet raw rndzone {"
+	for i in $(seq 1 "$max_zones");do
+		echo -n "$i : $i"
+		if [ "$i" -lt "$max_zones" ]; then
+			echo ","
+		else
+			echo "}"
+		fi
+	done
+	) | ip netns exec "$ns1" nft -f /dev/stdin
+
+	local i=0
+	local j=0
+	local outerstart
+	local stop
+	outerstart=$(date +%s%3N)
+	stop=$outerstart
+
+	while [ "$i" -lt "$max_zones" ]; do
+		local start
+		start=$(date +%s%3N)
+		i=$((i + 1000))
+		j=$((j + 1))
+		# nft rule in output places each packet in a different zone.
+		dd if=/dev/zero bs=8k count=1000 2>/dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345
+		if [ $? -ne 0 ] ;then
+			ret=1
+			break
+		fi
+
+		stop=$(date +%s%3N)
+		local duration=$((stop-start))
+		echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)"
+	done
+
+	if [ "$have_ct_tool" -eq 1 ]; then
+		local count duration
+		count=$(ip netns exec "$ns1" conntrack -C)
+		duration=$((stop-outerstart))
+
+		if [ "$count" -ge "$max_zones" ]; then
+			echo "PASS: inserted $count entries from packet path in $duration ms total"
+		else
+			ip netns exec "$ns1" conntrack -S 1>&2
+			echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries"
+			ret=1
+		fi
+	fi
+
+	if [ $ret -ne 0 ];then
+		echo "FAIL: insert $max_zones entries from packet path" 1>&2
+	fi
+}
+
+test_conntrack_tool() {
+	local max_zones=$1
+
+	ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null
+
+	local outerstart start stop i
+	outerstart=$(date +%s%3N)
+	start=$(date +%s%3N)
+	stop="$start"
+	i=0
+	while [ "$i" -lt "$max_zones" ]; do
+		i=$((i + 1))
+		ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
+	                 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1
+		if [ $? -ne 0 ];then
+			ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
+	                 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null
+			echo "FAIL: conntrack -I returned an error"
+			ret=1
+			break
+		fi
+
+		if [ $((i%1000)) -eq 0 ];then
+			stop=$(date +%s%3N)
+
+			local duration=$((stop-start))
+			echo "PASS: added 1000 entries in $duration ms (now $i total)"
+			start=$stop
+		fi
+	done
+
+	local count
+	local duration
+	count=$(ip netns exec "$ns1" conntrack -C)
+	duration=$((stop-outerstart))
+
+	if [ "$count" -eq "$max_zones" ]; then
+		echo "PASS: inserted $count entries via ctnetlink in $duration ms"
+	else
+		ip netns exec "$ns1" conntrack -S 1>&2
+		echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)"
+		ret=1
+	fi
+}
+
+test_zones $zones
+
+if [ "$have_ct_tool" -eq 1 ];then
+	test_conntrack_tool $zones
+else
+	echo "SKIP: Could not run ctnetlink insertion test without conntrack tool"
+	if [ $ret -eq 0 ];then
+		exit $ksft_skip
+	fi
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
new file mode 100755
index 000000000000..ed36d535196d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# for debugging set net.netfilter.nf_log_all_netns=1 in init_net
+# or do not use net namespaces.
+modprobe -q nf_conntrack
+sysctl -q net.netfilter.nf_conntrack_log_invalid=6
+
+# Flush old cached data (fastopen cookies).
+ip tcp_metrics flush all > /dev/null 2>&1
+
+# TCP min, default, and max receive and send buffer sizes.
+sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))"
+sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304"
+
+# TCP congestion control.
+sysctl -q net.ipv4.tcp_congestion_control=cubic
+
+# TCP slow start after idle.
+sysctl -q net.ipv4.tcp_slow_start_after_idle=0
+
+# TCP Explicit Congestion Notification (ECN)
+sysctl -q net.ipv4.tcp_ecn=0
+
+sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1
+
+# Override the default qdisc on the tun device.
+# Many tests fail with timing errors if the default
+# is FQ and that paces their flows.
+tc qdisc add dev tun0 root pfifo
+
+# Enable conntrack
+$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
new file mode 100644
index 000000000000..d755bd64c54f
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
@@ -0,0 +1,118 @@
+// check that already-acked (retransmitted) packet is let through rather
+// than tagged as INVALID.
+
+`packetdrill/common.sh`
+
+// should set -P DROP but it disconnects VM w.o. extra netns
++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000>
++0 > S. 0:0(0) ack 1 <mss 1460>
++.01 < . 1:1(0) ack 1 win 65535
++0 accept(3, ..., ...) = 4
+
++0.0001 < P. 1:1461(1460) ack 1 win 257
++.0 > . 1:1(0) ack 1461 win 65535
++0.0001 < P. 1461:2921(1460) ack 1 win 257
++.0 > . 1:1(0) ack 2921 win 65535
++0.0001 < P. 2921:4381(1460) ack 1 win 257
++.0 > . 1:1(0) ack 4381 win 65535
++0.0001 < P. 4381:5841(1460) ack 1 win 257
++.0 > . 1:1(0) ack 5841 win 65535
++0.0001 < P. 5841:7301(1460) ack 1 win 257
++.0 > . 1:1(0) ack 7301 win 65535
++0.0001 < P. 7301:8761(1460) ack 1 win 257
++.0 > . 1:1(0) ack 8761 win 65535
++0.0001 < P. 8761:10221(1460) ack 1 win 257
++.0 > . 1:1(0) ack 10221 win 65535
++0.0001 < P. 10221:11681(1460) ack 1 win 257
++.0 > . 1:1(0) ack 11681 win 65535
++0.0001 < P. 11681:13141(1460) ack 1 win 257
++.0 > . 1:1(0) ack 13141 win 65535
++0.0001 < P. 13141:14601(1460) ack 1 win 257
++.0 > . 1:1(0) ack 14601 win 65535
++0.0001 < P. 14601:16061(1460) ack 1 win 257
++.0 > . 1:1(0) ack 16061 win 65535
++0.0001 < P. 16061:17521(1460) ack 1 win 257
++.0 > . 1:1(0) ack 17521 win 65535
++0.0001 < P. 17521:18981(1460) ack 1 win 257
++.0 > . 1:1(0) ack 18981 win 65535
++0.0001 < P. 18981:20441(1460) ack 1 win 257
++.0 > . 1:1(0) ack 20441 win 65535
++0.0001 < P. 20441:21901(1460) ack 1 win 257
++.0 > . 1:1(0) ack 21901 win 65535
++0.0001 < P. 21901:23361(1460) ack 1 win 257
++.0 > . 1:1(0) ack 23361 win 65535
++0.0001 < P. 23361:24821(1460) ack 1 win 257
+0.055 > . 1:1(0) ack 24821 win 65535
++0.0001 < P. 24821:26281(1460) ack 1 win 257
++.0 > . 1:1(0) ack 26281 win 65535
++0.0001 < P. 26281:27741(1460) ack 1 win 257
++.0 > . 1:1(0) ack 27741 win 65535
++0.0001 < P. 27741:29201(1460) ack 1 win 257
++.0 > . 1:1(0) ack 29201 win 65535
++0.0001 < P. 29201:30661(1460) ack 1 win 257
++.0 > . 1:1(0) ack 30661 win 65535
++0.0001 < P. 30661:32121(1460) ack 1 win 257
++.0 > . 1:1(0) ack 32121 win 65535
++0.0001 < P. 32121:33581(1460) ack 1 win 257
++.0 > . 1:1(0) ack 33581 win 65535
++0.0001 < P. 33581:35041(1460) ack 1 win 257
++.0 > . 1:1(0) ack 35041 win 65535
++0.0001 < P. 35041:36501(1460) ack 1 win 257
++.0 > . 1:1(0) ack 36501 win 65535
++0.0001 < P. 36501:37961(1460) ack 1 win 257
++.0 > . 1:1(0) ack 37961 win 65535
++0.0001 < P. 37961:39421(1460) ack 1 win 257
++.0 > . 1:1(0) ack 39421 win 65535
++0.0001 < P. 39421:40881(1460) ack 1 win 257
++.0 > . 1:1(0) ack 40881 win 65535
++0.0001 < P. 40881:42341(1460) ack 1 win 257
++.0 > . 1:1(0) ack 42341 win 65535
++0.0001 < P. 42341:43801(1460) ack 1 win 257
++.0 > . 1:1(0) ack 43801 win 65535
++0.0001 < P. 43801:45261(1460) ack 1 win 257
++.0 > . 1:1(0) ack 45261 win 65535
++0.0001 < P. 45261:46721(1460) ack 1 win 257
++.0 > . 1:1(0) ack 46721 win 65535
++0.0001 < P. 46721:48181(1460) ack 1 win 257
++.0 > . 1:1(0) ack 48181 win 65535
++0.0001 < P. 48181:49641(1460) ack 1 win 257
++.0 > . 1:1(0) ack 49641 win 65535
++0.0001 < P. 49641:51101(1460) ack 1 win 257
++.0 > . 1:1(0) ack 51101 win 65535
++0.0001 < P. 51101:52561(1460) ack 1 win 257
++.0 > . 1:1(0) ack 52561 win 65535
++0.0001 < P. 52561:54021(1460) ack 1 win 257
++.0 > . 1:1(0) ack 54021 win 65535
++0.0001 < P. 54021:55481(1460) ack 1 win 257
++.0 > . 1:1(0) ack 55481 win 65535
++0.0001 < P. 55481:56941(1460) ack 1 win 257
++.0 > . 1:1(0) ack 56941 win 65535
++0.0001 < P. 56941:58401(1460) ack 1 win 257
++.0 > . 1:1(0) ack 58401 win 65535
++0.0001 < P. 58401:59861(1460) ack 1 win 257
++.0 > . 1:1(0) ack 59861 win 65535
++0.0001 < P. 59861:61321(1460) ack 1 win 257
++.0 > . 1:1(0) ack 61321 win 65535
++0.0001 < P. 61321:62781(1460) ack 1 win 257
++.0 > . 1:1(0) ack 62781 win 65535
++0.0001 < P. 62781:64241(1460) ack 1 win 257
++.0 > . 1:1(0) ack 64241 win 65535
++0.0001 < P. 64241:65701(1460) ack 1 win 257
++.0 > . 1:1(0) ack 65701 win 65535
++0.0001 < P. 65701:67161(1460) ack 1 win 257
++.0 > . 1:1(0) ack 67161 win 65535
+
+// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0
++0.0001 < P. 1:1461(1460) ack 1 win 257
+
+// only sent if above packet isn't flagged as invalid
++.0 > . 1:1(0) ack 67161 win 65535
+
++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
new file mode 100644
index 000000000000..dccdd4c009c6
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
@@ -0,0 +1,62 @@
+// check RST packet that doesn't exactly match expected next sequence
+// number still transitions conntrack state to CLOSE iff its already in
+// FIN/CLOSE_WAIT.
+
+`packetdrill/common.sh`
+
+//  5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+//  5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+//  5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334
+//  5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture]
+//  5.788126 server_ip > client_ip TLSv1.2 90 Application Data
+//  5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872
+//  5.788447 client_ip > server_ip TLSv1.2 90 Application Data
+//  5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350
+//  5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0
+
++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
+
+// Conntrack should move to FIN_WAIT, then CLOSE_WAIT.
++0 < F. 3001:3001(0) ack 1001 win 65535
++0 >  . 1001:1001(0) ack 3002 win 65535
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT`
+
++1 close(3) = 0
+// RST: unread data. FIN was seen, hence ack + 1
++0 > R. 1001:1001(0) ack 3002 win 65535
+// ... and then, CLOSE.
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
+
+// Spurious RST from peer -- no sk state.  Should NOT get
+// marked INVALID, because conntrack is already closing.
++0.1 < R 2001:2001(0) win 0
+
+// No packets should have been marked INVALID
++0 `iptables -v -S INPUT  | grep INVALID | grep -q -- "-c 0 0"`
++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
new file mode 100644
index 000000000000..686f18a3d9ef
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
@@ -0,0 +1,59 @@
+// check that out of window resets are marked as INVALID and conntrack remains
+// in ESTABLISHED state.
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
+// out of window
++0.0 < R	0:0(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// out of window
++0.0 < R	1000000:1000000(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// in-window but not exact match
++0.0 < R	42:42(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0 < . 3001:3001(0) ack 1001 win 65535
+
++0.0 < R. 3000:3000(0) ack 1001 win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// exact next sequence
++0.0 < R. 3001:3001(0) ack 1001 win 0
+// Conntrack should move to CLOSE
+
+// Expect four invalid RSTs
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 4 "`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
new file mode 100644
index 000000000000..3442cd29bc93
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
@@ -0,0 +1,44 @@
+// Check connection re-use, i.e. peer that receives the SYN answers with
+// a challenge-ACK.
+// Check that conntrack lets all packets pass, including the challenge ack,
+// and that a new connection is established.
+
+`packetdrill/common.sh`
+
+// S  >
+//  . < (challnge-ack)
+// R. >
+// S  >
+// S. <
+// Expected outcome: established connection.
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// Challenge ACK, old incarnation.
+0.1 < . 145824453:145824453(0) ack 643160523 win 240 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
++0.01 > R 643160523:643160523(0) win 0
+
++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// Must go through.
++0.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// correct synack
++0.1 < S. 0:0(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// 3whs completes.
++0.01 > . 1:1(0) ack 1 win 256 <nop,nop,TS val 1 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
+// No packets should have been marked INVALID
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
new file mode 100644
index 000000000000..3047160c4bf3
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
@@ -0,0 +1,51 @@
+// Check conntrack copes with syn/ack reply for a previous, old incarnation.
+
+// tcpdump with buggy sequence
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0
+// OLD synack, for old/previous S
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0
+// This reset never makes it to the endpoint, elided in the packetdrill script
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0
+// Syn retransmit, no change
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0
+// CORRECT synack, should be accepted, but conntrack classified this as INVALID:
+// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 ..
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// bogus/outdated synack, invalid ack value
+0.1 < S. 145824453:145824453(0) ack 643160523 win 240 <mss 1440,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// syn retransmitted
+1.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// correct synack
++0 < S. 145758918:145758918(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
++0 write(3, ..., 1) = 1
+
+// with buggy conntrack above packet is dropped, so SYN rtx is seen:
+// script packet:  1.054007 . 1:1(0) ack 16777958 win 256 <nop,nop,TS val 1033 ecr 1>
+// actual packet:  3.010000 S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
++0 > P. 1:2(1) ack 4294901762 win 256 <nop,nop,TS val 1067 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED`
+
+// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 1 "`
+
++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
new file mode 100644
index 000000000000..842242f8ccf7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
@@ -0,0 +1,34 @@
+// Check reception of another SYN while we have an established conntrack state.
+// Challenge ACK is supposed to pass through, RST reply should clear conntrack
+// state and SYN retransmit should give us new 'SYN_RECV' connection state.
+
+`packetdrill/common.sh`
+
+// should show a match if bug is present:
++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7, TS val 1 ecr 0,nop,nop>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,TS val 100 ecr 1,nop,wscale 8>
++.01 < . 1:1(0) ack 1 win 257 <TS val 1 ecr 100,nop,nop>
++0 accept(3, ..., ...) = 4
+
++0 < P. 1:101(100) ack 1 win 257 <TS val 2 ecr 100,nop,nop>
++.001 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 110 ecr 2>
++0 read(4, ..., 101) = 100
+
+1.0 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 7, TS val 233 ecr 0,nop,nop>
+// Won't expect this: challenge ack.
+
++0 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 112 ecr 2>
++0 < R. 101:101(0) ack 1 win 257
++0 close(4) = 0
+
+1.5 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 0, TS val 233 ecr 0,nop,nop>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV`
++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh
new file mode 100755
index 000000000000..24ad41d526d9
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/rpath.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+# search for legacy iptables (it uses the xtables extensions
+if iptables-legacy --version >/dev/null 2>&1; then
+	iptables='iptables-legacy'
+elif iptables --version >/dev/null 2>&1; then
+	iptables='iptables'
+else
+	iptables=''
+fi
+
+if ip6tables-legacy --version >/dev/null 2>&1; then
+	ip6tables='ip6tables-legacy'
+elif ip6tables --version >/dev/null 2>&1; then
+	ip6tables='ip6tables'
+else
+	ip6tables=''
+fi
+
+if nft --version >/dev/null 2>&1; then
+	nft='nft'
+else
+	nft=''
+fi
+
+if [ -z "$iptables$ip6tables$nft" ]; then
+	echo "SKIP: Test needs iptables, ip6tables or nft"
+	exit $ksft_skip
+fi
+
+trap cleanup_all_ns EXIT
+
+# create two netns, keep IPv6 address when moving into VRF
+setup_ns ns1 ns2
+ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1
+
+# a standard connection between the netns, should not trigger rp filter
+ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2"
+ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up
+ip -net "$ns1" a a 192.168.23.2/24 dev v0
+ip -net "$ns2" a a 192.168.23.1/24 dev v0
+ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad
+ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad
+
+# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0
+ip -net "$ns2" link add d0 type dummy
+ip -net "$ns2" link set d0 up
+ip -net "$ns1" a a 192.168.42.2/24 dev v0
+ip -net "$ns2" a a 192.168.42.1/24 dev d0
+ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad
+ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad
+
+# avoid neighbor lookups and enable martian IPv6 pings
+ns2_hwaddr=$(ip -net "$ns2" link show dev v0 | \
+	     sed -n 's, *link/ether \([^ ]*\) .*,\1,p')
+ns1_hwaddr=$(ip -net "$ns1" link show dev v0 | \
+	     sed -n 's, *link/ether \([^ ]*\) .*,\1,p')
+ip -net "$ns1" neigh add fec0:42::1 lladdr "$ns2_hwaddr" nud permanent dev v0
+ip -net "$ns1" neigh add fec0:23::1 lladdr "$ns2_hwaddr" nud permanent dev v0
+ip -net "$ns2" neigh add fec0:42::2 lladdr "$ns1_hwaddr" nud permanent dev d0
+ip -net "$ns2" neigh add fec0:23::2 lladdr "$ns1_hwaddr" nud permanent dev v0
+
+# firewall matches to test
+[ -n "$iptables" ] && {
+	common='-t raw -A PREROUTING -s 192.168.0.0/16'
+	common+=' -p icmp --icmp-type echo-request'
+	if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then
+		echo "Cannot add rpfilter rule"
+		exit $ksft_skip
+	fi
+	ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert
+}
+[ -n "$ip6tables" ] && {
+	common='-t raw -A PREROUTING -s fec0::/16'
+	common+=' -p icmpv6 --icmpv6-type echo-request'
+	if ! ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then
+		echo "Cannot add rpfilter rule"
+		exit $ksft_skip
+	fi
+	ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert
+}
+[ -n "$nft" ] && ip netns exec "$ns2" $nft -f - <<EOF
+table inet t {
+	chain c {
+		type filter hook prerouting priority raw;
+		ip saddr 192.168.0.0/16 icmp type echo-request \
+			fib saddr . iif oif exists counter
+		ip6 saddr fec0::/16 icmpv6 type echo-request \
+			fib saddr . iif oif exists counter
+	}
+}
+EOF
+
+die() {
+	echo "FAIL: $*"
+	#ip netns exec "$ns2" "$iptables" -t raw -vS
+	#ip netns exec "$ns2" "$ip6tables" -t raw -vS
+	#ip netns exec "$ns2" nft list ruleset
+	exit 1
+}
+
+# check rule counters, return true if rule did not match
+ipt_zero_rule() { # (command)
+	[ -n "$1" ] || return 0
+	ip netns exec "$ns2" "$1" -t raw -vS | grep -q -- "-m rpfilter -c 0 0"
+}
+ipt_zero_reverse_rule() { # (command)
+	[ -n "$1" ] || return 0
+	ip netns exec "$ns2" "$1" -t raw -vS | \
+		grep -q -- "-m rpfilter --invert -c 0 0"
+}
+nft_zero_rule() { # (family)
+	[ -n "$nft" ] || return 0
+	ip netns exec "$ns2" "$nft" list chain inet t c | \
+		grep -q "$1 saddr .* counter packets 0 bytes 0"
+}
+
+netns_ping() { # (netns, args...)
+	local netns="$1"
+	shift
+	ip netns exec "$netns" ping -q -c 1 -W 1 "$@" >/dev/null
+}
+
+clear_counters() {
+	[ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z
+	[ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z
+	if [ -n "$nft" ]; then
+		(
+			echo "delete table inet t";
+			ip netns exec "$ns2" $nft -s list table inet t;
+		) | ip netns exec "$ns2" $nft -f -
+	fi
+}
+
+testrun() {
+	clear_counters
+
+	# test 1: martian traffic should fail rpfilter matches
+	netns_ping "$ns1" -I v0 192.168.42.1 && \
+		die "martian ping 192.168.42.1 succeeded"
+	netns_ping "$ns1" -I v0 fec0:42::1 && \
+		die "martian ping fec0:42::1 succeeded"
+
+	ipt_zero_rule "$iptables" || die "iptables matched martian"
+	ipt_zero_rule "$ip6tables" || die "ip6tables matched martian"
+	ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian"
+	ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian"
+	nft_zero_rule ip || die "nft IPv4 matched martian"
+	nft_zero_rule ip6 || die "nft IPv6 matched martian"
+
+	clear_counters
+
+	# test 2: rpfilter match should pass for regular traffic
+	netns_ping "$ns1" 192.168.23.1 || \
+		die "regular ping 192.168.23.1 failed"
+	netns_ping "$ns1" fec0:23::1 || \
+		die "regular ping fec0:23::1 failed"
+
+	ipt_zero_rule "$iptables" && die "iptables match not effective"
+	ipt_zero_rule "$ip6tables" && die "ip6tables match not effective"
+	ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective"
+	ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective"
+	nft_zero_rule ip && die "nft IPv4 match not effective"
+	nft_zero_rule ip6 && die "nft IPv6 match not effective"
+
+}
+
+testrun
+
+# repeat test with vrf device in $ns2
+ip -net "$ns2" link add vrf0 type vrf table 10
+ip -net "$ns2" link set vrf0 up
+ip -net "$ns2" link set v0 master vrf0
+
+testrun
+
+echo "PASS: netfilter reverse path match works as intended"
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c
new file mode 100644
index 000000000000..21bb1cfd8a85
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/sctp_collision.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+int main(int argc, char *argv[])
+{
+	struct sockaddr_in saddr = {}, daddr = {};
+	int sd, ret, len = sizeof(daddr);
+	struct timeval tv = {25, 0};
+	char buf[] = "hello";
+
+	if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) {
+		printf("%s <server|client> <LOCAL_IP> <LOCAL_PORT> <REMOTE_IP> <REMOTE_PORT>\n",
+		       argv[0]);
+		return -1;
+	}
+
+	sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
+	if (sd < 0) {
+		printf("Failed to create sd\n");
+		return -1;
+	}
+
+	saddr.sin_family = AF_INET;
+	saddr.sin_addr.s_addr = inet_addr(argv[2]);
+	saddr.sin_port = htons(atoi(argv[3]));
+
+	ret = bind(sd, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (ret < 0) {
+		printf("Failed to bind to address\n");
+		goto out;
+	}
+
+	ret = listen(sd, 5);
+	if (ret < 0) {
+		printf("Failed to listen on port\n");
+		goto out;
+	}
+
+	daddr.sin_family = AF_INET;
+	daddr.sin_addr.s_addr = inet_addr(argv[4]);
+	daddr.sin_port = htons(atoi(argv[5]));
+
+	/* make test shorter than 25s */
+	ret = setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+	if (ret < 0) {
+		printf("Failed to setsockopt SO_RCVTIMEO\n");
+		goto out;
+	}
+
+	if (!strcmp(argv[1], "server")) {
+		sleep(1); /* wait a bit for client's INIT */
+		ret = connect(sd, (struct sockaddr *)&daddr, len);
+		if (ret < 0) {
+			printf("Failed to connect to peer\n");
+			goto out;
+		}
+		ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len);
+		if (ret < 0) {
+			printf("Failed to recv msg %d\n", ret);
+			goto out;
+		}
+		ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len);
+		if (ret < 0) {
+			printf("Failed to send msg %d\n", ret);
+			goto out;
+		}
+		printf("Server: sent! %d\n", ret);
+	}
+
+	if (!strcmp(argv[1], "client")) {
+		usleep(300000); /* wait a bit for server's listening */
+		ret = connect(sd, (struct sockaddr *)&daddr, len);
+		if (ret < 0) {
+			printf("Failed to connect to peer\n");
+			goto out;
+		}
+		sleep(1); /* wait a bit for server's delayed INIT_ACK to reproduce the issue */
+		ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len);
+		if (ret < 0) {
+			printf("Failed to send msg %d\n", ret);
+			goto out;
+		}
+		ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len);
+		if (ret < 0) {
+			printf("Failed to recv msg %d\n", ret);
+			goto out;
+		}
+		printf("Client: rcvd! %d\n", ret);
+	}
+	ret = 0;
+out:
+	close(sd);
+	return ret;
+}
diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings
new file mode 100644
index 000000000000..abc5648b59ab
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/settings
@@ -0,0 +1 @@
+timeout=1800
diff --git a/tools/testing/selftests/net/netfilter/vxlan_mtu_frag.sh b/tools/testing/selftests/net/netfilter/vxlan_mtu_frag.sh
new file mode 100755
index 000000000000..912cb9583af1
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/vxlan_mtu_frag.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+if ! modprobe -q -n br_netfilter 2>&1; then
+        echo "SKIP: Test needs br_netfilter kernel module"
+        exit $ksft_skip
+fi
+
+cleanup()
+{
+        cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns host vtep router
+
+create_topology()
+{
+    ip link add host-eth0 netns "$host" type veth peer name vtep-host netns "$vtep"
+    ip link add vtep-router netns "$vtep" type veth peer name router-vtep netns "$router"
+}
+
+setup_host()
+{
+    # bring ports up
+    ip -n "$host" addr add 10.0.0.1/24 dev host-eth0
+    ip -n "$host" link set host-eth0 up
+
+    # Add VLAN 10,20
+    for vid in 10 20; do
+        ip -n "$host" link add link host-eth0 name host-eth0.$vid type vlan id $vid
+        ip -n "$host" addr add 10.0.$vid.1/24 dev host-eth0.$vid
+        ip -n "$host" link set host-eth0.$vid up
+    done
+}
+
+setup_vtep()
+{
+    # create bridge on vtep
+    ip -n "$vtep" link add name br0 type bridge
+    ip -n "$vtep" link set br0 type bridge vlan_filtering 1
+
+    # VLAN 10 is untagged PVID
+    ip -n "$vtep" link set dev vtep-host master br0
+    bridge -n "$vtep" vlan add dev vtep-host vid 10 pvid untagged
+
+    # VLAN 20 as other VID
+    ip -n "$vtep" link set dev vtep-host master br0
+    bridge -n "$vtep" vlan add dev vtep-host vid 20
+
+    # single-vxlan device on vtep
+    ip -n "$vtep" address add dev vtep-router 60.0.0.1/24
+    ip -n "$vtep" link add dev vxd type vxlan external \
+        vnifilter local 60.0.0.1 remote 60.0.0.2 dstport 4789 ttl 64
+    ip -n "$vtep" link set vxd master br0
+
+    # Add VLAN-VNI 1-1 mappings
+    bridge -n "$vtep" link set dev vxd vlan_tunnel on
+    for vid in 10 20; do
+        bridge -n "$vtep" vlan add dev vxd vid $vid
+        bridge -n "$vtep" vlan add dev vxd vid $vid tunnel_info id $vid
+        bridge -n "$vtep" vni add dev vxd vni $vid
+    done
+
+    # bring ports up
+    ip -n "$vtep" link set vxd up
+    ip -n "$vtep" link set vtep-router up
+    ip -n "$vtep" link set vtep-host up
+    ip -n "$vtep" link set dev br0 up
+}
+
+setup_router()
+{
+    # bring ports up
+    ip -n "$router" link set router-vtep up
+}
+
+setup()
+{
+    modprobe -q br_netfilter
+    create_topology
+    setup_host
+    setup_vtep
+    setup_router
+}
+
+test_large_mtu_untagged_traffic()
+{
+    ip -n "$vtep" link set vxd mtu 1000
+    ip -n "$host" neigh add 10.0.0.2 lladdr ca:fe:ba:be:00:01 dev host-eth0
+    ip netns exec "$host" \
+        ping -q 10.0.0.2 -I host-eth0 -c 1 -W 0.5 -s2000 > /dev/null 2>&1
+    return 0
+}
+
+test_large_mtu_tagged_traffic()
+{
+    for vid in 10 20; do
+        ip -n "$vtep" link set vxd mtu 1000
+        ip -n "$host" neigh add 10.0.$vid.2 lladdr ca:fe:ba:be:00:01 dev host-eth0.$vid
+        ip netns exec "$host" \
+            ping -q 10.0.$vid.2 -I host-eth0.$vid -c 1 -W 0.5 -s2000 > /dev/null 2>&1
+    done
+    return 0
+}
+
+do_test()
+{
+    # Frames will be dropped so ping will not succeed
+    # If it doesn't panic, it passes
+    test_large_mtu_tagged_traffic
+    test_large_mtu_untagged_traffic
+}
+
+setup && \
+echo "Test for VxLAN fragmentation with large MTU in br_netfilter:" && \
+do_test && echo "PASS!"
+exit $?
diff --git a/tools/testing/selftests/net/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh
new file mode 100755
index 000000000000..8d401c69e317
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/xt_string.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# return code to signal skipped test
+ksft_skip=4
+rc=0
+
+source lib.sh
+
+checktool "socat -h" "run test without socat"
+checktool "iptables --version" "test needs iptables"
+
+infile=$(mktemp)
+
+cleanup()
+{
+	ip netns del "$netns"
+	rm -f "$infile"
+}
+
+trap cleanup EXIT
+
+setup_ns netns
+
+ip -net "$netns" link add d0 type dummy
+ip -net "$netns" link set d0 up
+ip -net "$netns" addr add 10.1.2.1/24 dev d0
+
+pattern="foo bar baz"
+patlen=11
+hdrlen=$((20 + 8)) # IPv4 + UDP
+
+#ip netns exec "$netns" tcpdump -npXi d0 &
+#tcpdump_pid=$!
+#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT
+
+add_rule() { # (alg, from, to)
+	ip netns exec "$netns" \
+		iptables -A OUTPUT -o d0 -m string \
+			--string "$pattern" --algo "$1" --from "$2" --to "$3"
+}
+showrules() { # ()
+	ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A'
+}
+zerorules() {
+	ip netns exec "$netns" iptables -Z OUTPUT
+}
+countrule() { # (pattern)
+	showrules | grep -c -- "$*"
+}
+send() { # (offset)
+	( for ((i = 0; i < $1 - hdrlen; i++)); do
+		echo -n " "
+	  done
+	  echo -n "$pattern"
+	) > "$infile"
+
+	ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile"
+}
+
+add_rule bm 1000 1500
+add_rule bm 1400 1600
+add_rule kmp 1000 1500
+add_rule kmp 1400 1600
+
+zerorules
+send 0
+send $((1000 - patlen))
+if [ "$(countrule -c 0 0)" -ne 4 ]; then
+	echo "FAIL: rules match data before --from"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send 1000
+send $((1400 - patlen))
+if [ "$(countrule -c 2)" -ne 2 ]; then
+	echo "FAIL: only two rules should match at low offset"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send $((1500 - patlen))
+if [ "$(countrule -c 1)" -ne 4 ]; then
+	echo "FAIL: all rules should match at end of packet"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send 1495
+if [ "$(countrule -c 1)" -ne 1 ]; then
+	echo "FAIL: only kmp with proper --to should match pattern spanning fragments"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send 1500
+if [ "$(countrule -c 1)" -ne 2 ]; then
+	echo "FAIL: two rules should match pattern at start of second fragment"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send $((1600 - patlen))
+if [ "$(countrule -c 1)" -ne 2 ]; then
+	echo "FAIL: two rules should match pattern at end of largest --to"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send $((1600 - patlen + 1))
+if [ "$(countrule -c 1)" -ne 0 ]; then
+	echo "FAIL: no rules should match pattern extending largest --to"
+	showrules
+	((rc--))
+fi
+
+zerorules
+send 1600
+if [ "$(countrule -c 1)" -ne 0 ]; then
+	echo "FAIL: no rule should match pattern past largest --to"
+	showrules
+	((rc--))
+fi
+
+[ $rc -eq 0 ] && echo "PASS: string match tests"
+exit $rc
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
new file mode 100644
index 000000000000..07423f256f96
--- /dev/null
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/genetlink.h>
+#include <linux/neighbour.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/mqueue.h>
+#include <linux/rtnetlink.h>
+
+#include "../kselftest_harness.h"
+
+#include <ynl.h>
+
+struct ext_ack {
+	int err;
+
+	__u32 attr_offs;
+	__u32 miss_type;
+	__u32 miss_nest;
+	const char *str;
+};
+
+/* 0: no done, 1: done found, 2: extack found, -1: error */
+static int nl_get_extack(char *buf, size_t n, struct ext_ack *ea)
+{
+	const struct nlmsghdr *nlh;
+	const struct nlattr *attr;
+	ssize_t rem;
+
+	for (rem = n; rem > 0; NLMSG_NEXT(nlh, rem)) {
+		nlh = (struct nlmsghdr *)&buf[n - rem];
+		if (!NLMSG_OK(nlh, rem))
+			return -1;
+
+		if (nlh->nlmsg_type != NLMSG_DONE)
+			continue;
+
+		ea->err = -*(int *)NLMSG_DATA(nlh);
+
+		if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+			return 1;
+
+		ynl_attr_for_each(attr, nlh, sizeof(int)) {
+			switch (ynl_attr_type(attr)) {
+			case NLMSGERR_ATTR_OFFS:
+				ea->attr_offs = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MISS_TYPE:
+				ea->miss_type = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MISS_NEST:
+				ea->miss_nest = ynl_attr_get_u32(attr);
+				break;
+			case NLMSGERR_ATTR_MSG:
+				ea->str = ynl_attr_get_str(attr);
+				break;
+			}
+		}
+
+		return 2;
+	}
+
+	return 0;
+}
+
+static const struct {
+	struct nlmsghdr nlhdr;
+	struct ndmsg ndm;
+	struct nlattr ahdr;
+	__u32 val;
+} dump_neigh_bad = {
+	.nlhdr = {
+		.nlmsg_len	= sizeof(dump_neigh_bad),
+		.nlmsg_type	= RTM_GETNEIGH,
+		.nlmsg_flags	= NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		.nlmsg_seq	= 1,
+	},
+	.ndm = {
+		.ndm_family	= 123,
+	},
+	.ahdr = {
+		.nla_len	= 4 + 4,
+		.nla_type	= NDA_FLAGS_EXT,
+	},
+	.val = -1, // should fail MASK validation
+};
+
+TEST(dump_extack)
+{
+	int netlink_sock;
+	char buf[8192];
+	int one = 1;
+	int i, cnt;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_CAP_ACK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+	n = setsockopt(netlink_sock, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
+		       &one, sizeof(one));
+	ASSERT_EQ(n, 0);
+
+	/* Dump so many times we fill up the buffer */
+	cnt = 64;
+	for (i = 0; i < cnt; i++) {
+		n = send(netlink_sock, &dump_neigh_bad,
+			 sizeof(dump_neigh_bad), 0);
+		ASSERT_EQ(n, sizeof(dump_neigh_bad));
+	}
+
+	/* Read out the ENOBUFS */
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	EXPECT_EQ(n, -1);
+	EXPECT_EQ(errno, ENOBUFS);
+
+	for (i = 0; i < cnt; i++) {
+		struct ext_ack ea = {};
+
+		n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+		if (n < 0) {
+			ASSERT_GE(i, 10);
+			break;
+		}
+		ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
+
+		EXPECT_EQ(nl_get_extack(buf, n, &ea), 2);
+		EXPECT_EQ(ea.attr_offs,
+			  sizeof(struct nlmsghdr) + sizeof(struct ndmsg));
+	}
+}
+
+static const struct {
+	struct nlmsghdr nlhdr;
+	struct genlmsghdr genlhdr;
+	struct nlattr ahdr;
+	__u16 val;
+	__u16 pad;
+} dump_policies = {
+	.nlhdr = {
+		.nlmsg_len	= sizeof(dump_policies),
+		.nlmsg_type	= GENL_ID_CTRL,
+		.nlmsg_flags	= NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+		.nlmsg_seq	= 1,
+	},
+	.genlhdr = {
+		.cmd		= CTRL_CMD_GETPOLICY,
+		.version	= 2,
+	},
+	.ahdr = {
+		.nla_len	= 6,
+		.nla_type	= CTRL_ATTR_FAMILY_ID,
+	},
+	.val = GENL_ID_CTRL,
+	.pad = 0,
+};
+
+// Sanity check for the test itself, make sure the dump doesn't fit in one msg
+TEST(test_sanity)
+{
+	int netlink_sock;
+	char buf[8192];
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
+
+	n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+	ASSERT_GE(n, (ssize_t)sizeof(struct nlmsghdr));
+
+	close(netlink_sock);
+}
+
+TEST(close_in_progress)
+{
+	int netlink_sock;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	close(netlink_sock);
+}
+
+TEST(close_with_ref)
+{
+	char cookie[NOTIFY_COOKIE_LEN] = {};
+	int netlink_sock, mq_fd;
+	struct sigevent sigev;
+	ssize_t n;
+
+	netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	ASSERT_GE(netlink_sock, 0);
+
+	n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+	ASSERT_EQ(n, sizeof(dump_policies));
+
+	mq_fd = syscall(__NR_mq_open, "sed", O_CREAT | O_WRONLY, 0600, 0);
+	ASSERT_GE(mq_fd, 0);
+
+	memset(&sigev, 0, sizeof(sigev));
+	sigev.sigev_notify		= SIGEV_THREAD;
+	sigev.sigev_value.sival_ptr	= cookie;
+	sigev.sigev_signo		= netlink_sock;
+
+	syscall(__NR_mq_notify, mq_fd, &sigev);
+
+	close(netlink_sock);
+
+	// give mqueue time to fire
+	usleep(100 * 1000);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/netns-name.sh b/tools/testing/selftests/net/netns-name.sh
index 6974474c26f3..38871bdef67f 100755
--- a/tools/testing/selftests/net/netns-name.sh
+++ b/tools/testing/selftests/net/netns-name.sh
@@ -7,10 +7,12 @@ set -o pipefail
 DEV=dummy-dev0
 DEV2=dummy-dev1
 ALT_NAME=some-alt-name
+NSIM_ADDR=2025
 
 RET_CODE=0
 
 cleanup() {
+    cleanup_netdevsim $NSIM_ADDR
     cleanup_ns $NS $test_ns
 }
 
@@ -25,12 +27,15 @@ setup_ns NS test_ns
 
 #
 # Test basic move without a rename
+# Use netdevsim because it has extra asserts for notifiers.
 #
-ip -netns $NS link add name $DEV type dummy || fail
-ip -netns $NS link set dev $DEV netns $test_ns ||
+
+nsim=$(create_netdevsim $NSIM_ADDR $NS)
+ip -netns $NS link set dev $nsim netns $test_ns ||
     fail "Can't perform a netns move"
-ip -netns $test_ns link show dev $DEV >> /dev/null || fail "Device not found after move"
-ip -netns $test_ns link del $DEV || fail
+ip -netns $test_ns link show dev $nsim >> /dev/null ||
+    fail "Device not found after move"
+cleanup_netdevsim $NSIM_ADDR
 
 #
 # Test move with a conflict
@@ -78,6 +83,16 @@ ip -netns $NS link show dev $ALT_NAME 2> /dev/null &&
     fail "Can still find alt-name after move"
 ip -netns $test_ns link del $DEV || fail
 
+#
+# Test no conflict of the same name/ifindex in different netns
+#
+ip -netns $NS link add name $DEV index 100 type dummy || fail
+ip -netns $NS link add netns $test_ns name $DEV index 100 type dummy ||
+    fail "Can create in netns without moving"
+ip -netns $test_ns link show dev $DEV >> /dev/null || fail "Device not found"
+ip -netns $NS link del $DEV || fail
+ip -netns $test_ns link del $DEV || fail
+
 echo -ne "$(basename $0) \t\t\t\t"
 if [ $RET_CODE -eq 0 ]; then
     echo "[  OK  ]"
diff --git a/tools/testing/selftests/net/netns-sysctl.sh b/tools/testing/selftests/net/netns-sysctl.sh
new file mode 100755
index 000000000000..45c34a3b9aae
--- /dev/null
+++ b/tools/testing/selftests/net/netns-sysctl.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -e
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test checks that the network buffer sysctls are present
+# in a network namespaces, and that they are readonly.
+
+source lib.sh
+
+cleanup() {
+    cleanup_ns $test_ns
+}
+
+trap cleanup EXIT
+
+fail() {
+	echo "ERROR: $*" >&2
+	exit 1
+}
+
+setup_ns test_ns
+
+for sc in {r,w}mem_{default,max}; do
+	# check that this is writable in a netns
+	[ -w "/proc/sys/net/core/$sc" ] ||
+		fail "$sc isn't writable in the init netns!"
+
+	# change the value in the host netns
+	sysctl -qw "net.core.$sc=300000" ||
+		fail "Can't write $sc in init netns!"
+
+	# check that the value is read from the init netns
+	[ "$(ip netns exec $test_ns sysctl -n "net.core.$sc")" -eq 300000 ] ||
+		fail "Value for $sc mismatch!"
+
+	# check that this isn't writable in a netns
+	ip netns exec $test_ns [ -w "/proc/sys/net/core/$sc" ] &&
+		fail "$sc is writable in a netns!"
+done
+
+echo 'Test passed OK'
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
new file mode 100755
index 000000000000..beaee5e4e2aa
--- /dev/null
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import ksft_eq, ksft_ge, ksft_busy_wait
+from lib.py import NetdevFamily, NetdevSimDev, ip
+
+
+def empty_check(nf) -> None:
+    devs = nf.dev_get({}, dump=True)
+    ksft_ge(len(devs), 1)
+
+
+def lo_check(nf) -> None:
+    lo_info = nf.dev_get({"ifindex": 1})
+    ksft_eq(len(lo_info['xdp-features']), 0)
+    ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0)
+
+
+def napi_list_check(nf) -> None:
+    with NetdevSimDev(queue_count=100) as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        ip(f"link set dev {nsim.ifname} up")
+
+        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+        ksft_eq(len(napis), 100)
+
+        for q in [50, 0, 99]:
+            for i in range(4):
+                nsim.dfs_write("queue_reset", f"{q} {i}")
+                napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+                ksft_eq(len(napis), 100,
+                        comment=f"queue count after reset queue {q} mode {i}")
+
+
+def nsim_rxq_reset_down(nf) -> None:
+    """
+    Test that the queue API supports resetting a queue
+    while the interface is down. We should convert this
+    test to testing real HW once more devices support
+    queue API.
+    """
+    with NetdevSimDev(queue_count=4) as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        ip(f"link set dev {nsim.ifname} down")
+        for i in [0, 2, 3]:
+            nsim.dfs_write("queue_reset", f"1 {i}")
+
+
+def page_pool_check(nf) -> None:
+    with NetdevSimDev() as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        def up():
+            ip(f"link set dev {nsim.ifname} up")
+
+        def down():
+            ip(f"link set dev {nsim.ifname} down")
+
+        def get_pp():
+            pp_list = nf.page_pool_get({}, dump=True)
+            return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex]
+
+        # No page pools when down
+        down()
+        ksft_eq(len(get_pp()), 0)
+
+        # Up, empty page pool appears
+        up()
+        pp_list = get_pp()
+        ksft_ge(len(pp_list), 0)
+        refs = sum([pp["inflight"] for pp in pp_list])
+        ksft_eq(refs, 0)
+
+        # Down, it disappears, again
+        down()
+        pp_list = get_pp()
+        ksft_eq(len(pp_list), 0)
+
+        # Up, allocate a page
+        up()
+        nsim.dfs_write("pp_hold", "y")
+        pp_list = nf.page_pool_get({}, dump=True)
+        refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex])
+        ksft_ge(refs, 1)
+
+        # Now let's leak a page
+        down()
+        pp_list = get_pp()
+        ksft_eq(len(pp_list), 1)
+        refs = sum([pp["inflight"] for pp in pp_list])
+        ksft_eq(refs, 1)
+        attached = [pp for pp in pp_list if "detach-time" not in pp]
+        ksft_eq(len(attached), 0)
+
+        # New pp can get created, and we'll have two
+        up()
+        pp_list = get_pp()
+        attached = [pp for pp in pp_list if "detach-time" not in pp]
+        detached = [pp for pp in pp_list if "detach-time" in pp]
+        ksft_eq(len(attached), 1)
+        ksft_eq(len(detached), 1)
+
+        # Free the old page and the old pp is gone
+        nsim.dfs_write("pp_hold", "n")
+        # Freeing check is once a second so we may need to retry
+        ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2)
+
+        # And down...
+        down()
+        ksft_eq(len(get_pp()), 0)
+
+        # Last, leave the page hanging for destroy, nothing to check
+        # we're trying to exercise the orphaning path in the kernel
+        up()
+        nsim.dfs_write("pp_hold", "y")
+
+
+def main() -> None:
+    nf = NetdevFamily()
+    ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
+              nsim_rxq_reset_down],
+             args=(nf, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/openvswitch/Makefile b/tools/testing/selftests/net/openvswitch/Makefile
index 2f1508abc826..3fd1da2ec07d 100644
--- a/tools/testing/selftests/net/openvswitch/Makefile
+++ b/tools/testing/selftests/net/openvswitch/Makefile
@@ -2,7 +2,7 @@
 
 top_srcdir = ../../../../..
 
-CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
+CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES)
 
 TEST_PROGS := openvswitch.sh
 
diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh
index 5cae53543849..3c8d3455d8e7 100755
--- a/tools/testing/selftests/net/openvswitch/openvswitch.sh
+++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 #
 # OVS kernel module self tests
@@ -11,6 +11,11 @@ ksft_skip=4
 PAUSE_ON_FAIL=no
 VERBOSE=0
 TRACING=0
+WAIT_TIMEOUT=5
+
+if test "X$KSFT_MACHINE_SLOW" == "Xyes"; then
+	WAIT_TIMEOUT=10
+fi
 
 tests="
 	arp_ping				eth-arp: Basic arp ping between two NS
@@ -20,10 +25,37 @@ tests="
 	nat_related_v4				ip4-nat-related: ICMP related matches work with SNAT
 	netlink_checks				ovsnl: validate netlink attrs and settings
 	upcall_interfaces			ovs: test the upcall interfaces
-	drop_reason				drop: test drop reasons are emitted"
+	drop_reason				drop: test drop reasons are emitted
+	psample					psample: Sampling packets with psample"
 
 info() {
-    [ $VERBOSE = 0 ] || echo $*
+	[ "${ovs_dir}" != "" ] &&
+		echo "`date +"[%m-%d %H:%M:%S]"` $*" >> ${ovs_dir}/debug.log
+	[ $VERBOSE = 0 ] || echo $*
+}
+
+ovs_wait() {
+	info "waiting $WAIT_TIMEOUT s for: $@"
+
+	if "$@" ; then
+		info "wait succeeded immediately"
+		return 0
+	fi
+
+	# A quick re-check helps speed up small races in fast systems.
+	# However, fractional sleeps might not necessarily work.
+	local start=0
+	sleep 0.1 || { sleep 1; start=1; }
+
+	for (( i=start; i<WAIT_TIMEOUT; i++ )); do
+		if "$@" ; then
+			info "wait succeeded after $i seconds"
+			return 0
+		fi
+		sleep 1
+	done
+	info "wait failed after $i seconds"
+	return 1
 }
 
 ovs_base=`pwd`
@@ -65,7 +97,8 @@ ovs_setenv() {
 
 ovs_sbx() {
 	if test "X$2" != X; then
-		(ovs_setenv $1; shift; "$@" >> ${ovs_dir}/debug.log)
+		(ovs_setenv $1; shift;
+		 info "run cmd: $@"; "$@" >> ${ovs_dir}/debug.log)
 	else
 		ovs_setenv $1
 	fi
@@ -102,12 +135,21 @@ ovs_netns_spawn_daemon() {
 	shift
 	netns=$1
 	shift
-	info "spawning cmd: $*"
-	ip netns exec $netns $*  >> $ovs_dir/stdout  2>> $ovs_dir/stderr &
+	if [ "$netns" == "_default" ]; then
+		$*  >> $ovs_dir/stdout  2>> $ovs_dir/stderr &
+	else
+		ip netns exec $netns $*  >> $ovs_dir/stdout  2>> $ovs_dir/stderr &
+	fi
 	pid=$!
 	ovs_sbx "$sbx" on_exit "kill -TERM $pid 2>/dev/null"
 }
 
+ovs_spawn_daemon() {
+	sbx=$1
+	shift
+	ovs_netns_spawn_daemon $sbx "_default" $*
+}
+
 ovs_add_netns_and_veths () {
 	info "Adding netns attached: sbx:$1 dp:$2 {$3, $4, $5}"
 	ovs_sbx "$1" ip netns add "$3" || return 1
@@ -129,8 +171,10 @@ ovs_add_netns_and_veths () {
 		ovs_add_if "$1" "$2" "$4" -u || return 1
 	fi
 
-	[ $TRACING -eq 1 ] && ovs_netns_spawn_daemon "$1" "$ns" \
-			tcpdump -i any -s 65535
+	if [ $TRACING -eq 1 ]; then
+		ovs_netns_spawn_daemon "$1" "$3" tcpdump -l -i any -s 6553
+		ovs_wait grep -q "listening on any" ${ovs_dir}/stderr
+	fi
 
 	return 0
 }
@@ -139,7 +183,7 @@ ovs_add_flow () {
 	info "Adding flow to DP: sbx:$1 br:$2 flow:$3 act:$4"
 	ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-flow "$2" "$3" "$4"
 	if [ $? -ne 0 ]; then
-		echo "Flow [ $3 : $4 ] failed" >> ${ovs_dir}/debug.log
+		info "Flow [ $3 : $4 ] failed"
 		return 1
 	fi
 	return 0
@@ -170,6 +214,19 @@ ovs_drop_reason_count()
 	return `echo "$perf_output" | grep "$pattern" | wc -l`
 }
 
+ovs_test_flow_fails () {
+	ERR_MSG="Flow actions may not be safe on all matching packets"
+
+	PRE_TEST=$(dmesg | grep -c "${ERR_MSG}")
+	ovs_add_flow $@ &> /dev/null $@ && return 1
+	POST_TEST=$(dmesg | grep -c "${ERR_MSG}")
+
+	if [ "$PRE_TEST" == "$POST_TEST" ]; then
+		return 1
+	fi
+	return 0
+}
+
 usage() {
 	echo
 	echo "$0 [OPTIONS] [TEST]..."
@@ -184,10 +241,100 @@ usage() {
 	exit 1
 }
 
+
+# psample test
+# - use psample to observe packets
+test_psample() {
+	sbx_add "test_psample" || return $?
+
+	# Add a datapath with per-vport dispatching.
+	ovs_add_dp "test_psample" psample -V 2:1 || return 1
+
+	info "create namespaces"
+	ovs_add_netns_and_veths "test_psample" "psample" \
+		client c0 c1 172.31.110.10/24 -u || return 1
+	ovs_add_netns_and_veths "test_psample" "psample" \
+		server s0 s1 172.31.110.20/24 -u || return 1
+
+	# Check if psample actions can be configured.
+	ovs_add_flow "test_psample" psample \
+	'in_port(1),eth(),eth_type(0x0806),arp()' 'psample(group=1)' &> /dev/null
+	if [ $? == 1 ]; then
+		info "no support for psample - skipping"
+		ovs_exit_sig
+		return $ksft_skip
+	fi
+
+	ovs_del_flows "test_psample" psample
+
+	# Test action verification.
+	OLDIFS=$IFS
+	IFS='*'
+	min_key='in_port(1),eth(),eth_type(0x0800),ipv4()'
+	for testcase in \
+		"cookie to large"*"psample(group=1,cookie=1615141312111009080706050403020100)" \
+		"no group with cookie"*"psample(cookie=abcd)" \
+		"no group"*"psample()";
+	do
+		set -- $testcase;
+		ovs_test_flow_fails "test_psample" psample $min_key $2
+		if [ $? == 1 ]; then
+			info "failed - $1"
+			return 1
+		fi
+	done
+	IFS=$OLDIFS
+
+	ovs_del_flows "test_psample" psample
+	# Allow ARP
+	ovs_add_flow "test_psample" psample \
+		'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1
+	ovs_add_flow "test_psample" psample \
+		'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1
+
+	# Sample first 14 bytes of all traffic.
+	ovs_add_flow "test_psample" psample \
+	    "in_port(1),eth(),eth_type(0x0800),ipv4()" \
+            "trunc(14),psample(group=1,cookie=c0ffee),2"
+
+	# Sample all traffic. In this case, use a sample() action with both
+	# psample and an upcall emulating simultaneous local sampling and
+	# sFlow / IPFIX.
+	nlpid=$(grep -E "listening on upcall packet handler" \
+            $ovs_dir/s0.out | cut -d ":" -f 2 | tr -d ' ')
+
+	ovs_add_flow "test_psample" psample \
+            "in_port(2),eth(),eth_type(0x0800),ipv4()" \
+            "sample(sample=100%,actions(psample(group=2,cookie=eeff0c),userspace(pid=${nlpid},userdata=eeff0c))),1"
+
+	# Record psample data.
+	ovs_spawn_daemon "test_psample" python3 $ovs_base/ovs-dpctl.py psample-events
+	ovs_wait grep -q "listening for psample events" ${ovs_dir}/stdout
+
+	# Send a single ping.
+	ovs_sbx "test_psample" ip netns exec client ping -I c1 172.31.110.20 -c 1 || return 1
+
+	# We should have received one userspace action upcall and 2 psample packets.
+	ovs_wait grep -q "userspace action command" $ovs_dir/s0.out || return 1
+
+	# client -> server samples should only contain the first 14 bytes of the packet.
+	ovs_wait grep -qE "rate:4294967295,group:1,cookie:c0ffee data:[0-9a-f]{28}$" \
+		$ovs_dir/stdout || return 1
+
+	ovs_wait grep -q "rate:4294967295,group:2,cookie:eeff0c" $ovs_dir/stdout || return 1
+
+	return 0
+}
+
 # drop_reason test
 # - drop packets and verify the right drop reason is reported
 test_drop_reason() {
 	which perf >/dev/null 2>&1 || return $ksft_skip
+	which pahole >/dev/null 2>&1 || return $ksft_skip
+
+	ovs_drop_subsys=$(pahole -C skb_drop_reason_subsys |
+			      awk '/OPENVSWITCH/ { print $3; }' |
+			      tr -d ,)
 
 	sbx_add "test_drop_reason" || return $?
 
@@ -231,7 +378,7 @@ test_drop_reason() {
 		"in_port(2),eth(),eth_type(0x0800),ipv4(src=172.31.110.20,proto=1),icmp()" 'drop'
 
 	ovs_drop_record_and_run "test_drop_reason" ip netns exec client ping -c 2 172.31.110.20
-	ovs_drop_reason_count 0x30001 # OVS_DROP_FLOW_ACTION
+	ovs_drop_reason_count 0x${ovs_drop_subsys}0001 # OVS_DROP_FLOW_ACTION
 	if [[ "$?" -ne "2" ]]; then
 		info "Did not detect expected drops: $?"
 		return 1
@@ -248,7 +395,7 @@ test_drop_reason() {
 
 	ovs_drop_record_and_run \
             "test_drop_reason" ip netns exec client nc -i 1 -zuv 172.31.110.20 6000
-	ovs_drop_reason_count 0x30004 # OVS_DROP_EXPLICIT_ACTION_ERROR
+	ovs_drop_reason_count 0x${ovs_drop_subsys}0004 # OVS_DROP_EXPLICIT_ACTION_ERROR
 	if [[ "$?" -ne "1" ]]; then
 		info "Did not detect expected explicit error drops: $?"
 		return 1
@@ -256,7 +403,7 @@ test_drop_reason() {
 
 	ovs_drop_record_and_run \
             "test_drop_reason" ip netns exec client nc -i 1 -zuv 172.31.110.20 7000
-	ovs_drop_reason_count 0x30003 # OVS_DROP_EXPLICIT_ACTION
+	ovs_drop_reason_count 0x${ovs_drop_subsys}0003 # OVS_DROP_EXPLICIT_ACTION
 	if [[ "$?" -ne "1" ]]; then
 		info "Did not detect expected explicit drops: $?"
 		return 1
@@ -599,7 +746,8 @@ test_upcall_interfaces() {
 	ovs_add_netns_and_veths "test_upcall_interfaces" ui0 upc left0 l0 \
 	    172.31.110.1/24 -u || return 1
 
-	sleep 1
+	ovs_wait grep -q "listening on upcall packet handler" ${ovs_dir}/left0.out
+
 	info "sending arping"
 	ip netns exec upc arping -I l0 172.31.110.20 -c 1 \
 	    >$ovs_dir/arping.stdout 2>$ovs_dir/arping.stderr
@@ -613,16 +761,20 @@ run_test() {
 	tname="$1"
 	tdesc="$2"
 
-	if ! lsmod | grep openvswitch >/dev/null 2>&1; then
-		stdbuf -o0 printf "TEST: %-60s  [NOMOD]\n" "${tdesc}"
-		return $ksft_skip
-	fi
-
 	if python3 ovs-dpctl.py -h 2>&1 | \
 	     grep -E "Need to (install|upgrade) the python" >/dev/null 2>&1; then
 		stdbuf -o0 printf "TEST: %-60s  [PYLIB]\n" "${tdesc}"
 		return $ksft_skip
 	fi
+
+	python3 ovs-dpctl.py show >/dev/null 2>&1 || \
+		echo "[DPCTL] show exception."
+
+	if ! lsmod | grep openvswitch >/dev/null 2>&1; then
+		stdbuf -o0 printf "TEST: %-60s  [NOMOD]\n" "${tdesc}"
+		return $ksft_skip
+	fi
+
 	printf "TEST: %-60s  [START]\n" "${tname}"
 
 	unset IFS
diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
index 5e0e539a323d..8a0396bfaf99 100644
--- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
+++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
@@ -8,8 +8,10 @@ import argparse
 import errno
 import ipaddress
 import logging
+import math
 import multiprocessing
 import re
+import socket
 import struct
 import sys
 import time
@@ -26,13 +28,16 @@ try:
     from pyroute2.netlink import genlmsg
     from pyroute2.netlink import nla
     from pyroute2.netlink import nlmsg_atoms
+    from pyroute2.netlink.event import EventSocket
     from pyroute2.netlink.exceptions import NetlinkError
     from pyroute2.netlink.generic import GenericNetlinkSocket
+    from pyroute2.netlink.nlsocket import Marshal
     import pyroute2
+    import pyroute2.iproute
 
 except ModuleNotFoundError:
     print("Need to install the python pyroute2 package >= 0.6.")
-    sys.exit(0)
+    sys.exit(1)
 
 
 OVS_DATAPATH_FAMILY = "ovs_datapath"
@@ -58,6 +63,7 @@ OVS_FLOW_CMD_DEL = 2
 OVS_FLOW_CMD_GET = 3
 OVS_FLOW_CMD_SET = 4
 
+UINT32_MAX = 0xFFFFFFFF
 
 def macstr(mac):
     outstr = ":".join(["%02X" % i for i in mac])
@@ -198,6 +204,18 @@ def convert_ipv4(data):
 
     return int(ipaddress.IPv4Address(ip)), int(ipaddress.IPv4Address(mask))
 
+def convert_ipv6(data):
+    ip, _, mask = data.partition('/')
+
+    if not ip:
+        ip = mask = 0
+    elif not mask:
+        mask = 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff'
+    elif mask.isdigit():
+        mask = ipaddress.IPv6Network("::/" + mask).hostmask
+
+    return ipaddress.IPv6Address(ip).packed, ipaddress.IPv6Address(mask).packed
+
 def convert_int(size):
     def convert_int_sized(data):
         value, _, mask = data.partition('/')
@@ -267,6 +285,75 @@ def parse_extract_field(
     return str_skipped, data
 
 
+def parse_attrs(actstr, attr_desc):
+    """Parses the given action string and returns a list of netlink
+    attributes based on a list of attribute descriptions.
+
+    Each element in the attribute description list is a tuple such as:
+        (name, attr_name, parse_func)
+    where:
+        name: is the string representing the attribute
+        attr_name: is the name of the attribute as defined in the uAPI.
+        parse_func: is a callable accepting a string and returning either
+            a single object (the parsed attribute value) or a tuple of
+            two values (the parsed attribute value and the remaining string)
+
+    Returns a list of attributes and the remaining string.
+    """
+    def parse_attr(actstr, key, func):
+        actstr = actstr[len(key) :]
+
+        if not func:
+            return None, actstr
+
+        delim = actstr[0]
+        actstr = actstr[1:]
+
+        if delim == "=":
+            pos = strcspn(actstr, ",)")
+            ret = func(actstr[:pos])
+        else:
+            ret = func(actstr)
+
+        if isinstance(ret, tuple):
+            (datum, actstr) = ret
+        else:
+            datum = ret
+            actstr = actstr[strcspn(actstr, ",)"):]
+
+        if delim == "(":
+            if not actstr or actstr[0] != ")":
+                raise ValueError("Action contains unbalanced parentheses")
+
+            actstr = actstr[1:]
+
+        actstr = actstr[strspn(actstr, ", ") :]
+
+        return datum, actstr
+
+    attrs = []
+    attr_desc = list(attr_desc)
+    while actstr and actstr[0] != ")" and attr_desc:
+        found = False
+        for i, (key, attr, func) in enumerate(attr_desc):
+            if actstr.startswith(key):
+                datum, actstr = parse_attr(actstr, key, func)
+                attrs.append([attr, datum])
+                found = True
+                del attr_desc[i]
+
+        if not found:
+            raise ValueError("Unknown attribute: '%s'" % actstr)
+
+        actstr = actstr[strspn(actstr, ", ") :]
+
+    if actstr[0] != ")":
+        raise ValueError("Action string contains extra garbage or has "
+                         "unbalanced parenthesis: '%s'" % actstr)
+
+    return attrs, actstr[1:]
+
+
 class ovs_dp_msg(genlmsg):
     # include the OVS version
     # We need a custom header rather than just being able to rely on
@@ -282,15 +369,15 @@ class ovsactions(nla):
         ("OVS_ACTION_ATTR_UNSPEC", "none"),
         ("OVS_ACTION_ATTR_OUTPUT", "uint32"),
         ("OVS_ACTION_ATTR_USERSPACE", "userspace"),
-        ("OVS_ACTION_ATTR_SET", "none"),
+        ("OVS_ACTION_ATTR_SET", "ovskey"),
         ("OVS_ACTION_ATTR_PUSH_VLAN", "none"),
         ("OVS_ACTION_ATTR_POP_VLAN", "flag"),
-        ("OVS_ACTION_ATTR_SAMPLE", "none"),
+        ("OVS_ACTION_ATTR_SAMPLE", "sample"),
         ("OVS_ACTION_ATTR_RECIRC", "uint32"),
         ("OVS_ACTION_ATTR_HASH", "none"),
         ("OVS_ACTION_ATTR_PUSH_MPLS", "none"),
         ("OVS_ACTION_ATTR_POP_MPLS", "flag"),
-        ("OVS_ACTION_ATTR_SET_MASKED", "none"),
+        ("OVS_ACTION_ATTR_SET_MASKED", "ovskey"),
         ("OVS_ACTION_ATTR_CT", "ctact"),
         ("OVS_ACTION_ATTR_TRUNC", "uint32"),
         ("OVS_ACTION_ATTR_PUSH_ETH", "none"),
@@ -304,8 +391,85 @@ class ovsactions(nla):
         ("OVS_ACTION_ATTR_ADD_MPLS", "none"),
         ("OVS_ACTION_ATTR_DEC_TTL", "none"),
         ("OVS_ACTION_ATTR_DROP", "uint32"),
+        ("OVS_ACTION_ATTR_PSAMPLE", "psample"),
     )
 
+    class psample(nla):
+        nla_flags = NLA_F_NESTED
+
+        nla_map = (
+            ("OVS_PSAMPLE_ATTR_UNSPEC", "none"),
+            ("OVS_PSAMPLE_ATTR_GROUP", "uint32"),
+            ("OVS_PSAMPLE_ATTR_COOKIE", "array(uint8)"),
+        )
+
+        def dpstr(self, more=False):
+            args = "group=%d" % self.get_attr("OVS_PSAMPLE_ATTR_GROUP")
+
+            cookie = self.get_attr("OVS_PSAMPLE_ATTR_COOKIE")
+            if cookie:
+                args += ",cookie(%s)" % \
+                        "".join(format(x, "02x") for x in cookie)
+
+            return "psample(%s)" % args
+
+        def parse(self, actstr):
+            desc = (
+                ("group", "OVS_PSAMPLE_ATTR_GROUP", int),
+                ("cookie", "OVS_PSAMPLE_ATTR_COOKIE",
+                    lambda x: list(bytearray.fromhex(x)))
+            )
+
+            attrs, actstr = parse_attrs(actstr, desc)
+
+            for attr in attrs:
+                self["attrs"].append(attr)
+
+            return actstr
+
+    class sample(nla):
+        nla_flags = NLA_F_NESTED
+
+        nla_map = (
+            ("OVS_SAMPLE_ATTR_UNSPEC", "none"),
+            ("OVS_SAMPLE_ATTR_PROBABILITY", "uint32"),
+            ("OVS_SAMPLE_ATTR_ACTIONS", "ovsactions"),
+        )
+
+        def dpstr(self, more=False):
+            args = []
+
+            args.append("sample={:.2f}%".format(
+                100 * self.get_attr("OVS_SAMPLE_ATTR_PROBABILITY") /
+                UINT32_MAX))
+
+            actions = self.get_attr("OVS_SAMPLE_ATTR_ACTIONS")
+            if actions:
+                args.append("actions(%s)" % actions.dpstr(more))
+
+            return "sample(%s)" % ",".join(args)
+
+        def parse(self, actstr):
+            def parse_nested_actions(actstr):
+                subacts = ovsactions()
+                parsed_len = subacts.parse(actstr)
+                return subacts, actstr[parsed_len :]
+
+            def percent_to_rate(percent):
+                percent = float(percent.strip('%'))
+                return int(math.floor(UINT32_MAX * (percent / 100.0) + .5))
+
+            desc = (
+                ("sample", "OVS_SAMPLE_ATTR_PROBABILITY", percent_to_rate),
+                ("actions", "OVS_SAMPLE_ATTR_ACTIONS", parse_nested_actions),
+            )
+            attrs, actstr = parse_attrs(actstr, desc)
+
+            for attr in attrs:
+                self["attrs"].append(attr)
+
+            return actstr
+
     class ctact(nla):
         nla_flags = NLA_F_NESTED
 
@@ -427,50 +591,77 @@ class ovsactions(nla):
                 print_str += "userdata="
                 for f in self.get_attr("OVS_USERSPACE_ATTR_USERDATA"):
                     print_str += "%x." % f
-            if self.get_attr("OVS_USERSPACE_ATTR_TUN_PORT") is not None:
+            if self.get_attr("OVS_USERSPACE_ATTR_EGRESS_TUN_PORT") is not None:
                 print_str += "egress_tun_port=%d" % self.get_attr(
-                    "OVS_USERSPACE_ATTR_TUN_PORT"
+                    "OVS_USERSPACE_ATTR_EGRESS_TUN_PORT"
                 )
             print_str += ")"
             return print_str
 
+        def parse(self, actstr):
+            attrs_desc = (
+                ("pid", "OVS_USERSPACE_ATTR_PID", int),
+                ("userdata", "OVS_USERSPACE_ATTR_USERDATA",
+                    lambda x: list(bytearray.fromhex(x))),
+                ("egress_tun_port", "OVS_USERSPACE_ATTR_EGRESS_TUN_PORT", int)
+            )
+
+            attrs, actstr = parse_attrs(actstr, attrs_desc)
+            for attr in attrs:
+                self["attrs"].append(attr)
+
+            return actstr
+
     def dpstr(self, more=False):
         print_str = ""
 
-        for field in self.nla_map:
+        for field in self["attrs"]:
             if field[1] == "none" or self.get_attr(field[0]) is None:
                 continue
             if print_str != "":
                 print_str += ","
 
-            if field[1] == "uint32":
-                if field[0] == "OVS_ACTION_ATTR_OUTPUT":
-                    print_str += "%d" % int(self.get_attr(field[0]))
-                elif field[0] == "OVS_ACTION_ATTR_RECIRC":
-                    print_str += "recirc(0x%x)" % int(self.get_attr(field[0]))
-                elif field[0] == "OVS_ACTION_ATTR_TRUNC":
-                    print_str += "trunc(%d)" % int(self.get_attr(field[0]))
-                elif field[0] == "OVS_ACTION_ATTR_DROP":
-                    print_str += "drop(%d)" % int(self.get_attr(field[0]))
-            elif field[1] == "flag":
-                if field[0] == "OVS_ACTION_ATTR_CT_CLEAR":
-                    print_str += "ct_clear"
-                elif field[0] == "OVS_ACTION_ATTR_POP_VLAN":
-                    print_str += "pop_vlan"
-                elif field[0] == "OVS_ACTION_ATTR_POP_ETH":
-                    print_str += "pop_eth"
-                elif field[0] == "OVS_ACTION_ATTR_POP_NSH":
-                    print_str += "pop_nsh"
-                elif field[0] == "OVS_ACTION_ATTR_POP_MPLS":
-                    print_str += "pop_mpls"
+            if field[0] == "OVS_ACTION_ATTR_OUTPUT":
+                print_str += "%d" % int(self.get_attr(field[0]))
+            elif field[0] == "OVS_ACTION_ATTR_RECIRC":
+                print_str += "recirc(0x%x)" % int(self.get_attr(field[0]))
+            elif field[0] == "OVS_ACTION_ATTR_TRUNC":
+                print_str += "trunc(%d)" % int(self.get_attr(field[0]))
+            elif field[0] == "OVS_ACTION_ATTR_DROP":
+                print_str += "drop(%d)" % int(self.get_attr(field[0]))
+            elif field[0] == "OVS_ACTION_ATTR_CT_CLEAR":
+                print_str += "ct_clear"
+            elif field[0] == "OVS_ACTION_ATTR_POP_VLAN":
+                print_str += "pop_vlan"
+            elif field[0] == "OVS_ACTION_ATTR_POP_ETH":
+                print_str += "pop_eth"
+            elif field[0] == "OVS_ACTION_ATTR_POP_NSH":
+                print_str += "pop_nsh"
+            elif field[0] == "OVS_ACTION_ATTR_POP_MPLS":
+                print_str += "pop_mpls"
             else:
                 datum = self.get_attr(field[0])
                 if field[0] == "OVS_ACTION_ATTR_CLONE":
                     print_str += "clone("
                     print_str += datum.dpstr(more)
                     print_str += ")"
+                elif field[0] == "OVS_ACTION_ATTR_SET" or \
+                     field[0] == "OVS_ACTION_ATTR_SET_MASKED":
+                    print_str += "set"
+                    field = datum
+                    mask = None
+                    if field[0] == "OVS_ACTION_ATTR_SET_MASKED":
+                        print_str += "_masked"
+                        field = datum[0]
+                        mask = datum[1]
+                    print_str += "("
+                    print_str += field.dpstr(mask, more)
+                    print_str += ")"
                 else:
-                    print_str += datum.dpstr(more)
+                    try:
+                        print_str += datum.dpstr(more)
+                    except:
+                        print_str += "{ATTR: %s not decoded}" % field[0]
 
         return print_str
 
@@ -489,7 +680,7 @@ class ovsactions(nla):
                     actstr, reason = parse_extract_field(
                         actstr,
                         "drop(",
-                        "([0-9]+)",
+                        r"([0-9]+)",
                         lambda x: int(x, 0),
                         False,
                         None,
@@ -502,9 +693,9 @@ class ovsactions(nla):
                     actstr = actstr[len("drop"): ]
                     return (totallen - len(actstr))
 
-            elif parse_starts_block(actstr, "^(\d+)", False, True):
+            elif parse_starts_block(actstr, r"^(\d+)", False, True):
                 actstr, output = parse_extract_field(
-                    actstr, None, "(\d+)", lambda x: int(x), False, "0"
+                    actstr, None, r"(\d+)", lambda x: int(x), False, "0"
                 )
                 self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output])
                 parsed = True
@@ -512,7 +703,7 @@ class ovsactions(nla):
                 actstr, recircid = parse_extract_field(
                     actstr,
                     "recirc(",
-                    "([0-9a-fA-Fx]+)",
+                    r"([0-9a-fA-Fx]+)",
                     lambda x: int(x, 0),
                     False,
                     0,
@@ -531,7 +722,7 @@ class ovsactions(nla):
             for flat_act in parse_flat_map:
                 if parse_starts_block(actstr, flat_act[0], False):
                     actstr = actstr[len(flat_act[0]):]
-                    self["attrs"].append([flat_act[1]])
+                    self["attrs"].append([flat_act[1], True])
                     actstr = actstr[strspn(actstr, ", ") :]
                     parsed = True
 
@@ -544,6 +735,25 @@ class ovsactions(nla):
                 self["attrs"].append(("OVS_ACTION_ATTR_CLONE", subacts))
                 actstr = actstr[parsedLen:]
                 parsed = True
+            elif parse_starts_block(actstr, "set(", False):
+                parencount += 1
+                k = ovskey()
+                actstr = actstr[len("set("):]
+                actstr = k.parse(actstr, None)
+                self["attrs"].append(("OVS_ACTION_ATTR_SET", k))
+                if not actstr.startswith(")"):
+                    actstr = ")" + actstr
+                parsed = True
+            elif parse_starts_block(actstr, "set_masked(", False):
+                parencount += 1
+                k = ovskey()
+                m = ovskey()
+                actstr = actstr[len("set_masked("):]
+                actstr = k.parse(actstr, m)
+                self["attrs"].append(("OVS_ACTION_ATTR_SET_MASKED", [k, m]))
+                if not actstr.startswith(")"):
+                    actstr = ")" + actstr
+                parsed = True
             elif parse_starts_block(actstr, "ct(", False):
                 parencount += 1
                 actstr = actstr[len("ct(") :]
@@ -588,17 +798,17 @@ class ovsactions(nla):
                                 actstr = actstr[3:]
 
                             actstr, ip_block_min = parse_extract_field(
-                                actstr, "=", "([0-9a-fA-F\.]+)", str, False
+                                actstr, "=", r"([0-9a-fA-F\.]+)", str, False
                             )
                             actstr, ip_block_max = parse_extract_field(
-                                actstr, "-", "([0-9a-fA-F\.]+)", str, False
+                                actstr, "-", r"([0-9a-fA-F\.]+)", str, False
                             )
 
                             actstr, proto_min = parse_extract_field(
-                                actstr, ":", "(\d+)", int, False
+                                actstr, ":", r"(\d+)", int, False
                             )
                             actstr, proto_max = parse_extract_field(
-                                actstr, "-", "(\d+)", int, False
+                                actstr, "-", r"(\d+)", int, False
                             )
 
                             if t is not None:
@@ -637,6 +847,37 @@ class ovsactions(nla):
                 self["attrs"].append(["OVS_ACTION_ATTR_CT", ctact])
                 parsed = True
 
+            elif parse_starts_block(actstr, "sample(", False):
+                sampleact = self.sample()
+                actstr = sampleact.parse(actstr[len("sample(") : ])
+                self["attrs"].append(["OVS_ACTION_ATTR_SAMPLE", sampleact])
+                parsed = True
+
+            elif parse_starts_block(actstr, "psample(", False):
+                psampleact = self.psample()
+                actstr = psampleact.parse(actstr[len("psample(") : ])
+                self["attrs"].append(["OVS_ACTION_ATTR_PSAMPLE", psampleact])
+                parsed = True
+
+            elif parse_starts_block(actstr, "userspace(", False):
+                uact = self.userspace()
+                actstr = uact.parse(actstr[len("userspace(") : ])
+                self["attrs"].append(["OVS_ACTION_ATTR_USERSPACE", uact])
+                parsed = True
+
+            elif parse_starts_block(actstr, "trunc(", False):
+                parencount += 1
+                actstr, val = parse_extract_field(
+                    actstr,
+                    "trunc(",
+                    r"([0-9]+)",
+                    int,
+                    False,
+                    None,
+                )
+                self["attrs"].append(["OVS_ACTION_ATTR_TRUNC", val])
+                parsed = True
+
             actstr = actstr[strspn(actstr, ", ") :]
             while parencount > 0:
                 parencount -= 1
@@ -675,7 +916,7 @@ class ovskey(nla):
         ("OVS_KEY_ATTR_ARP", "ovs_key_arp"),
         ("OVS_KEY_ATTR_ND", "ovs_key_nd"),
         ("OVS_KEY_ATTR_SKB_MARK", "uint32"),
-        ("OVS_KEY_ATTR_TUNNEL", "none"),
+        ("OVS_KEY_ATTR_TUNNEL", "ovs_key_tunnel"),
         ("OVS_KEY_ATTR_SCTP", "ovs_key_sctp"),
         ("OVS_KEY_ATTR_TCP_FLAGS", "be16"),
         ("OVS_KEY_ATTR_DP_HASH", "uint32"),
@@ -907,21 +1148,21 @@ class ovskey(nla):
                 "src",
                 "src",
                 lambda x: str(ipaddress.IPv6Address(x)),
-                lambda x: int.from_bytes(x, "big"),
-                lambda x: ipaddress.IPv6Address(x),
+                lambda x: ipaddress.IPv6Address(x).packed if x else 0,
+                convert_ipv6,
             ),
             (
                 "dst",
                 "dst",
                 lambda x: str(ipaddress.IPv6Address(x)),
-                lambda x: int.from_bytes(x, "big"),
-                lambda x: ipaddress.IPv6Address(x),
+                lambda x: ipaddress.IPv6Address(x).packed if x else 0,
+                convert_ipv6,
             ),
-            ("label", "label", "%d", int),
-            ("proto", "proto", "%d", int),
-            ("tclass", "tclass", "%d", int),
-            ("hlimit", "hlimit", "%d", int),
-            ("frag", "frag", "%d", int),
+            ("label", "label", "%d", lambda x: int(x) if x else 0),
+            ("proto", "proto", "%d", lambda x: int(x) if x else 0),
+            ("tclass", "tclass", "%d", lambda x: int(x) if x else 0),
+            ("hlimit", "hlimit", "%d", lambda x: int(x) if x else 0),
+            ("frag", "frag", "%d", lambda x: int(x) if x else 0),
         )
 
         def __init__(
@@ -1119,7 +1360,7 @@ class ovskey(nla):
                 "target",
                 "target",
                 lambda x: str(ipaddress.IPv6Address(x)),
-                lambda x: int.from_bytes(x, "big"),
+                convert_ipv6,
             ),
             ("sll", "sll", macstr, lambda x: int.from_bytes(x, "big")),
             ("tll", "tll", macstr, lambda x: int.from_bytes(x, "big")),
@@ -1204,13 +1445,13 @@ class ovskey(nla):
                 "src",
                 "src",
                 lambda x: str(ipaddress.IPv6Address(x)),
-                lambda x: int.from_bytes(x, "big", convertmac),
+                convert_ipv6,
             ),
             (
                 "dst",
                 "dst",
                 lambda x: str(ipaddress.IPv6Address(x)),
-                lambda x: int.from_bytes(x, "big"),
+                convert_ipv6,
             ),
             ("tp_src", "tp_src", "%d", int),
             ("tp_dst", "tp_dst", "%d", int),
@@ -1235,6 +1476,163 @@ class ovskey(nla):
                 init=init,
             )
 
+    class ovs_key_tunnel(nla):
+        nla_flags = NLA_F_NESTED
+
+        nla_map = (
+            ("OVS_TUNNEL_KEY_ATTR_ID", "be64"),
+            ("OVS_TUNNEL_KEY_ATTR_IPV4_SRC", "ipaddr"),
+            ("OVS_TUNNEL_KEY_ATTR_IPV4_DST", "ipaddr"),
+            ("OVS_TUNNEL_KEY_ATTR_TOS", "uint8"),
+            ("OVS_TUNNEL_KEY_ATTR_TTL", "uint8"),
+            ("OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT", "flag"),
+            ("OVS_TUNNEL_KEY_ATTR_CSUM", "flag"),
+            ("OVS_TUNNEL_KEY_ATTR_OAM", "flag"),
+            ("OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS", "array(uint32)"),
+            ("OVS_TUNNEL_KEY_ATTR_TP_SRC", "be16"),
+            ("OVS_TUNNEL_KEY_ATTR_TP_DST", "be16"),
+            ("OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS", "none"),
+            ("OVS_TUNNEL_KEY_ATTR_IPV6_SRC", "ipaddr"),
+            ("OVS_TUNNEL_KEY_ATTR_IPV6_DST", "ipaddr"),
+            ("OVS_TUNNEL_KEY_ATTR_PAD", "none"),
+            ("OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS", "none"),
+            ("OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE", "flag"),
+        )
+
+        def parse(self, flowstr, mask=None):
+            if not flowstr.startswith("tunnel("):
+                return None, None
+
+            k = ovskey.ovs_key_tunnel()
+            if mask is not None:
+                mask = ovskey.ovs_key_tunnel()
+
+            flowstr = flowstr[len("tunnel("):]
+
+            v6_address = None
+
+            fields = [
+                ("tun_id=", r"(\d+)", int, "OVS_TUNNEL_KEY_ATTR_ID",
+                 0xffffffffffffffff, None, None),
+
+                ("src=", r"([0-9a-fA-F\.]+)", str,
+                 "OVS_TUNNEL_KEY_ATTR_IPV4_SRC", "255.255.255.255", "0.0.0.0",
+                 False),
+                ("dst=", r"([0-9a-fA-F\.]+)", str,
+                 "OVS_TUNNEL_KEY_ATTR_IPV4_DST", "255.255.255.255", "0.0.0.0",
+                 False),
+
+                ("ipv6_src=", r"([0-9a-fA-F:]+)", str,
+                 "OVS_TUNNEL_KEY_ATTR_IPV6_SRC",
+                 "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", "::", True),
+                ("ipv6_dst=", r"([0-9a-fA-F:]+)", str,
+                 "OVS_TUNNEL_KEY_ATTR_IPV6_DST",
+                 "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", "::", True),
+
+                ("tos=", r"(\d+)", int, "OVS_TUNNEL_KEY_ATTR_TOS", 255, 0,
+                 None),
+                ("ttl=", r"(\d+)", int, "OVS_TUNNEL_KEY_ATTR_TTL", 255, 0,
+                 None),
+
+                ("tp_src=", r"(\d+)", int, "OVS_TUNNEL_KEY_ATTR_TP_SRC",
+                 65535, 0, None),
+                ("tp_dst=", r"(\d+)", int, "OVS_TUNNEL_KEY_ATTR_TP_DST",
+                 65535, 0, None),
+            ]
+
+            forced_include = ["OVS_TUNNEL_KEY_ATTR_TTL"]
+
+            for prefix, regex, typ, attr_name, mask_val, default_val, v46_flag in fields:
+                flowstr, value = parse_extract_field(flowstr, prefix, regex, typ, False)
+                if not attr_name:
+                    raise Exception("Bad list value in tunnel fields")
+
+                if value is None and attr_name in forced_include:
+                    value = default_val
+                    mask_val = default_val
+
+                if value is not None:
+                    if v46_flag is not None:
+                        if v6_address is None:
+                            v6_address = v46_flag
+                        if v46_flag != v6_address:
+                            raise ValueError("Cannot mix v6 and v4 addresses")
+                    k["attrs"].append([attr_name, value])
+                    if mask is not None:
+                        mask["attrs"].append([attr_name, mask_val])
+                else:
+                    if v46_flag is not None:
+                        if v6_address is None or v46_flag != v6_address:
+                            continue
+                    if mask is not None:
+                        mask["attrs"].append([attr_name, default_val])
+
+            if k["attrs"][0][0] != "OVS_TUNNEL_KEY_ATTR_ID":
+                raise ValueError("Needs a tunid set")
+
+            if flowstr.startswith("flags("):
+                flowstr = flowstr[len("flags("):]
+                flagspos = flowstr.find(")")
+                flags = flowstr[:flagspos]
+                flowstr = flowstr[flagspos + 1:]
+
+                flag_attrs = {
+                    "df": "OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT",
+                    "csum": "OVS_TUNNEL_KEY_ATTR_CSUM",
+                    "oam": "OVS_TUNNEL_KEY_ATTR_OAM"
+                }
+
+                for flag in flags.split("|"):
+                    if flag in flag_attrs:
+                        k["attrs"].append([flag_attrs[flag], True])
+                        if mask is not None:
+                            mask["attrs"].append([flag_attrs[flag], True])
+
+            flowstr = flowstr[strspn(flowstr, ", ") :]
+            return flowstr, k, mask
+
+        def dpstr(self, mask=None, more=False):
+            print_str = "tunnel("
+
+            flagsattrs = []
+            for k in self["attrs"]:
+                noprint = False
+                if k[0] == "OVS_TUNNEL_KEY_ATTR_ID":
+                    print_str += "tun_id=%d" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_IPV4_SRC":
+                    print_str += "src=%s" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_IPV4_DST":
+                    print_str += "dst=%s" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_IPV6_SRC":
+                    print_str += "ipv6_src=%s" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_IPV6_DST":
+                    print_str += "ipv6_dst=%s" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_TOS":
+                    print_str += "tos=%d" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_TTL":
+                    print_str += "ttl=%d" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_TP_SRC":
+                    print_str += "tp_src=%d" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_TP_DST":
+                    print_str += "tp_dst=%d" % k[1]
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT":
+                    noprint = True
+                    flagsattrs.append("df")
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_CSUM":
+                    noprint = True
+                    flagsattrs.append("csum")
+                elif k[0] == "OVS_TUNNEL_KEY_ATTR_OAM":
+                    noprint = True
+                    flagsattrs.append("oam")
+
+                if not noprint:
+                    print_str += ","
+
+            if len(flagsattrs):
+                print_str += "flags(" + "|".join(flagsattrs) + ")"
+            print_str += ")"
+            return print_str
+
     class ovs_key_mpls(nla):
         fields = (("lse", ">I"),)
 
@@ -1243,6 +1641,7 @@ class ovskey(nla):
             ("OVS_KEY_ATTR_PRIORITY", "skb_priority", intparse),
             ("OVS_KEY_ATTR_SKB_MARK", "skb_mark", intparse),
             ("OVS_KEY_ATTR_RECIRC_ID", "recirc_id", intparse),
+            ("OVS_KEY_ATTR_TUNNEL", "tunnel", ovskey.ovs_key_tunnel),
             ("OVS_KEY_ATTR_DP_HASH", "dp_hash", intparse),
             ("OVS_KEY_ATTR_CT_STATE", "ct_state", parse_ct_state),
             ("OVS_KEY_ATTR_CT_ZONE", "ct_zone", intparse),
@@ -1309,7 +1708,7 @@ class ovskey(nla):
                 mask["attrs"].append([field[0], m])
             self["attrs"].append([field[0], k])
 
-            flowstr = flowstr[strspn(flowstr, "),") :]
+            flowstr = flowstr[strspn(flowstr, "), ") :]
 
         return flowstr
 
@@ -1346,6 +1745,13 @@ class ovskey(nla):
                 True,
             ),
             (
+                "OVS_KEY_ATTR_TUNNEL",
+                "tunnel",
+                None,
+                False,
+                False,
+            ),
+            (
                 "OVS_KEY_ATTR_CT_STATE",
                 "ct_state",
                 "0x%04x",
@@ -1617,7 +2023,7 @@ class OvsVport(GenericNetlinkSocket):
             ("OVS_VPORT_ATTR_PORT_NO", "uint32"),
             ("OVS_VPORT_ATTR_TYPE", "uint32"),
             ("OVS_VPORT_ATTR_NAME", "asciiz"),
-            ("OVS_VPORT_ATTR_OPTIONS", "none"),
+            ("OVS_VPORT_ATTR_OPTIONS", "vportopts"),
             ("OVS_VPORT_ATTR_UPCALL_PID", "array(uint32)"),
             ("OVS_VPORT_ATTR_STATS", "vportstats"),
             ("OVS_VPORT_ATTR_PAD", "none"),
@@ -1625,6 +2031,13 @@ class OvsVport(GenericNetlinkSocket):
             ("OVS_VPORT_ATTR_NETNSID", "uint32"),
         )
 
+        class vportopts(nla):
+            nla_map = (
+                ("OVS_TUNNEL_ATTR_UNSPEC", "none"),
+                ("OVS_TUNNEL_ATTR_DST_PORT", "uint16"),
+                ("OVS_TUNNEL_ATTR_EXTENSION", "none"),
+            )
+
         class vportstats(nla):
             fields = (
                 ("rx_packets", "=Q"),
@@ -1693,7 +2106,7 @@ class OvsVport(GenericNetlinkSocket):
                 raise ne
         return reply
 
-    def attach(self, dpindex, vport_ifname, ptype):
+    def attach(self, dpindex, vport_ifname, ptype, dport, lwt):
         msg = OvsVport.ovs_vport_msg()
 
         msg["cmd"] = OVS_VPORT_CMD_NEW
@@ -1702,12 +2115,43 @@ class OvsVport(GenericNetlinkSocket):
         msg["dpifindex"] = dpindex
         port_type = OvsVport.str_to_type(ptype)
 
-        msg["attrs"].append(["OVS_VPORT_ATTR_TYPE", port_type])
         msg["attrs"].append(["OVS_VPORT_ATTR_NAME", vport_ifname])
         msg["attrs"].append(
             ["OVS_VPORT_ATTR_UPCALL_PID", [self.upcall_packet.epid]]
         )
 
+        TUNNEL_DEFAULTS = [("geneve", 6081),
+                           ("vxlan", 4789)]
+
+        for tnl in TUNNEL_DEFAULTS:
+            if ptype == tnl[0]:
+                if not dport:
+                    dport = tnl[1]
+
+                if not lwt:
+                    vportopt = OvsVport.ovs_vport_msg.vportopts()
+                    vportopt["attrs"].append(
+                        ["OVS_TUNNEL_ATTR_DST_PORT", socket.htons(dport)]
+                    )
+                    msg["attrs"].append(
+                        ["OVS_VPORT_ATTR_OPTIONS", vportopt]
+                    )
+                else:
+                    port_type = OvsVport.OVS_VPORT_TYPE_NETDEV
+                    ipr = pyroute2.iproute.IPRoute()
+
+                    if tnl[0] == "geneve":
+                        ipr.link("add", ifname=vport_ifname, kind=tnl[0],
+                                 geneve_port=dport,
+                                 geneve_collect_metadata=True,
+                                 geneve_udp_zero_csum6_rx=1)
+                    elif tnl[0] == "vxlan":
+                        ipr.link("add", ifname=vport_ifname, kind=tnl[0],
+                                 vxlan_learning=0, vxlan_collect_metadata=1,
+                                 vxlan_udp_zero_csum6_rx=1, vxlan_port=dport)
+                break
+        msg["attrs"].append(["OVS_VPORT_ATTR_TYPE", port_type])
+
         try:
             reply = self.nlm_request(
                 msg, msg_type=self.prid, msg_flags=NLM_F_REQUEST | NLM_F_ACK
@@ -2018,10 +2462,71 @@ class OvsFlow(GenericNetlinkSocket):
         print("MISS upcall[%d/%s]: %s" % (seq, pktpres, keystr), flush=True)
 
     def execute(self, packetmsg):
-        print("userspace execute command")
+        print("userspace execute command", flush=True)
 
     def action(self, packetmsg):
-        print("userspace action command")
+        print("userspace action command", flush=True)
+
+
+class psample_sample(genlmsg):
+    nla_map = (
+        ("PSAMPLE_ATTR_IIFINDEX", "none"),
+        ("PSAMPLE_ATTR_OIFINDEX", "none"),
+        ("PSAMPLE_ATTR_ORIGSIZE", "none"),
+        ("PSAMPLE_ATTR_SAMPLE_GROUP", "uint32"),
+        ("PSAMPLE_ATTR_GROUP_SEQ", "none"),
+        ("PSAMPLE_ATTR_SAMPLE_RATE", "uint32"),
+        ("PSAMPLE_ATTR_DATA", "array(uint8)"),
+        ("PSAMPLE_ATTR_GROUP_REFCOUNT", "none"),
+        ("PSAMPLE_ATTR_TUNNEL", "none"),
+        ("PSAMPLE_ATTR_PAD", "none"),
+        ("PSAMPLE_ATTR_OUT_TC", "none"),
+        ("PSAMPLE_ATTR_OUT_TC_OCC", "none"),
+        ("PSAMPLE_ATTR_LATENCY", "none"),
+        ("PSAMPLE_ATTR_TIMESTAMP", "none"),
+        ("PSAMPLE_ATTR_PROTO", "none"),
+        ("PSAMPLE_ATTR_USER_COOKIE", "array(uint8)"),
+    )
+
+    def dpstr(self):
+        fields = []
+        data = ""
+        for (attr, value) in self["attrs"]:
+            if attr == "PSAMPLE_ATTR_SAMPLE_GROUP":
+                fields.append("group:%d" % value)
+            if attr == "PSAMPLE_ATTR_SAMPLE_RATE":
+                fields.append("rate:%d" % value)
+            if attr == "PSAMPLE_ATTR_USER_COOKIE":
+                value = "".join(format(x, "02x") for x in value)
+                fields.append("cookie:%s" % value)
+            if attr == "PSAMPLE_ATTR_DATA" and len(value) > 0:
+                data = "data:%s" % "".join(format(x, "02x") for x in value)
+
+        return ("%s %s" % (",".join(fields), data)).strip()
+
+
+class psample_msg(Marshal):
+    PSAMPLE_CMD_SAMPLE = 0
+    PSAMPLE_CMD_GET_GROUP = 1
+    PSAMPLE_CMD_NEW_GROUP = 2
+    PSAMPLE_CMD_DEL_GROUP = 3
+    PSAMPLE_CMD_SET_FILTER = 4
+    msg_map = {PSAMPLE_CMD_SAMPLE: psample_sample}
+
+
+class PsampleEvent(EventSocket):
+    genl_family = "psample"
+    mcast_groups = ["packets"]
+    marshal_class = psample_msg
+
+    def read_samples(self):
+        print("listening for psample events", flush=True)
+        while True:
+            try:
+                for msg in self.get():
+                    print(msg.dpstr(), flush=True)
+            except NetlinkError as ne:
+                raise ne
 
 
 def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()):
@@ -2053,12 +2558,19 @@ def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()):
     for iface in ndb.interfaces:
         rep = vpl.info(iface.ifname, ifindex)
         if rep is not None:
+            opts = ""
+            vpo = rep.get_attr("OVS_VPORT_ATTR_OPTIONS")
+            if vpo:
+                dpo = vpo.get_attr("OVS_TUNNEL_ATTR_DST_PORT")
+                if dpo:
+                    opts += " tnl-dport:%s" % socket.ntohs(dpo)
             print(
-                "  port %d: %s (%s)"
+                "  port %d: %s (%s%s)"
                 % (
                     rep.get_attr("OVS_VPORT_ATTR_PORT_NO"),
                     rep.get_attr("OVS_VPORT_ATTR_NAME"),
                     OvsVport.type_to_str(rep.get_attr("OVS_VPORT_ATTR_TYPE")),
+                    opts,
                 )
             )
 
@@ -2081,7 +2593,7 @@ def main(argv):
         help="Increment 'verbose' output counter.",
         default=0,
     )
-    subparsers = parser.add_subparsers()
+    subparsers = parser.add_subparsers(dest="subcommand")
 
     showdpcmd = subparsers.add_parser("show")
     showdpcmd.add_argument(
@@ -2120,12 +2632,30 @@ def main(argv):
         "--ptype",
         type=str,
         default="netdev",
-        choices=["netdev", "internal"],
+        choices=["netdev", "internal", "geneve", "vxlan"],
         help="Interface type (default netdev)",
     )
+    addifcmd.add_argument(
+        "-p",
+        "--dport",
+        type=int,
+        default=0,
+        help="Destination port (0 for default)"
+    )
+    addifcmd.add_argument(
+        "-l",
+        "--lwt",
+        type=bool,
+        default=True,
+        help="Use LWT infrastructure instead of vport (default true)."
+    )
     delifcmd = subparsers.add_parser("del-if")
     delifcmd.add_argument("dpname", help="Datapath Name")
     delifcmd.add_argument("delif", help="Interface name for adding")
+    delifcmd.add_argument("-d",
+                          "--dellink",
+                          type=bool, default=False,
+                          help="Delete the link as well.")
 
     dumpflcmd = subparsers.add_parser("dump-flows")
     dumpflcmd.add_argument("dumpdp", help="Datapath Name")
@@ -2138,6 +2668,8 @@ def main(argv):
     delfscmd = subparsers.add_parser("del-flows")
     delfscmd.add_argument("flsbr", help="Datapath name")
 
+    subparsers.add_parser("psample-events")
+
     args = parser.parse_args()
 
     if args.verbose > 0:
@@ -2152,6 +2684,9 @@ def main(argv):
 
     sys.setrecursionlimit(100000)
 
+    if args.subcommand == "psample-events":
+        PsampleEvent().read_samples()
+
     if hasattr(args, "showdp"):
         found = False
         for iface in ndb.interfaces:
@@ -2186,7 +2721,8 @@ def main(argv):
             print("DP '%s' not found." % args.dpname)
             return 1
         dpindex = rep["dpifindex"]
-        rep = ovsvp.attach(rep["dpifindex"], args.addif, args.ptype)
+        rep = ovsvp.attach(rep["dpifindex"], args.addif, args.ptype,
+                           args.dport, args.lwt)
         msg = "vport '%s'" % args.addif
         if rep and rep["header"]["error"] is None:
             msg += " added."
@@ -2207,6 +2743,9 @@ def main(argv):
             msg += " removed."
         else:
             msg += " failed to remove."
+        if args.dellink:
+            ipr = pyroute2.iproute.IPRoute()
+            ipr.link("del", index=ipr.link_lookup(ifname=args.delif)[0])
     elif hasattr(args, "dumpdp"):
         rep = ovsdp.info(args.dumpdp, 0)
         if rep is None:
diff --git a/tools/testing/selftests/net/openvswitch/settings b/tools/testing/selftests/net/openvswitch/settings
new file mode 100644
index 000000000000..e2206265f67c
--- /dev/null
+++ b/tools/testing/selftests/net/openvswitch/settings
@@ -0,0 +1 @@
+timeout=900
diff --git a/tools/testing/selftests/net/ovpn/.gitignore b/tools/testing/selftests/net/ovpn/.gitignore
new file mode 100644
index 000000000000..ee44c081ca7c
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0+
+ovpn-cli
diff --git a/tools/testing/selftests/net/ovpn/Makefile b/tools/testing/selftests/net/ovpn/Makefile
new file mode 100644
index 000000000000..e0926d76b4c8
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020-2025 OpenVPN, Inc.
+#
+CFLAGS = -pedantic -Wextra -Wall -Wl,--no-as-needed -g -O0 -ggdb $(KHDR_INCLUDES)
+VAR_CFLAGS = $(shell pkg-config --cflags libnl-3.0 libnl-genl-3.0 2>/dev/null)
+ifeq ($(VAR_CFLAGS),)
+VAR_CFLAGS = -I/usr/include/libnl3
+endif
+CFLAGS += $(VAR_CFLAGS)
+
+
+LDLIBS = -lmbedtls -lmbedcrypto
+VAR_LDLIBS = $(shell pkg-config --libs libnl-3.0 libnl-genl-3.0 2>/dev/null)
+ifeq ($(VAR_LDLIBS),)
+VAR_LDLIBS = -lnl-genl-3 -lnl-3
+endif
+LDLIBS += $(VAR_LDLIBS)
+
+
+TEST_FILES = common.sh
+
+TEST_PROGS = test.sh \
+	test-large-mtu.sh \
+	test-chachapoly.sh \
+	test-tcp.sh \
+	test-float.sh \
+	test-close-socket.sh \
+	test-close-socket-tcp.sh
+
+TEST_GEN_FILES := ovpn-cli
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/ovpn/common.sh b/tools/testing/selftests/net/ovpn/common.sh
new file mode 100644
index 000000000000..88869c675d03
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/common.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020-2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+UDP_PEERS_FILE=${UDP_PEERS_FILE:-udp_peers.txt}
+TCP_PEERS_FILE=${TCP_PEERS_FILE:-tcp_peers.txt}
+OVPN_CLI=${OVPN_CLI:-./ovpn-cli}
+ALG=${ALG:-aes}
+PROTO=${PROTO:-UDP}
+FLOAT=${FLOAT:-0}
+
+LAN_IP="11.11.11.11"
+
+create_ns() {
+	ip netns add peer${1}
+}
+
+setup_ns() {
+	MODE="P2P"
+
+	if [ ${1} -eq 0 ]; then
+		MODE="MP"
+		for p in $(seq 1 ${NUM_PEERS}); do
+			ip link add veth${p} netns peer0 type veth peer name veth${p} netns peer${p}
+
+			ip -n peer0 addr add 10.10.${p}.1/24 dev veth${p}
+			ip -n peer0 addr add fd00:0:0:${p}::1/64 dev veth${p}
+			ip -n peer0 link set veth${p} up
+
+			ip -n peer${p} addr add 10.10.${p}.2/24 dev veth${p}
+			ip -n peer${p} addr add fd00:0:0:${p}::2/64 dev veth${p}
+			ip -n peer${p} link set veth${p} up
+		done
+	fi
+
+	ip netns exec peer${1} ${OVPN_CLI} new_iface tun${1} $MODE
+	ip -n peer${1} addr add ${2} dev tun${1}
+	# add a secondary IP to peer 1, to test a LAN behind a client
+	if [ ${1} -eq 1 -a -n "${LAN_IP}" ]; then
+		ip -n peer${1} addr add ${LAN_IP} dev tun${1}
+		ip -n peer0 route add ${LAN_IP} via $(echo ${2} |sed -e s'!/.*!!') dev tun0
+	fi
+	if [ -n "${3}" ]; then
+		ip -n peer${1} link set mtu ${3} dev tun${1}
+	fi
+	ip -n peer${1} link set tun${1} up
+}
+
+add_peer() {
+	if [ "${PROTO}" == "UDP" ]; then
+		if [ ${1} -eq 0 ]; then
+			ip netns exec peer0 ${OVPN_CLI} new_multi_peer tun0 1 ${UDP_PEERS_FILE}
+
+			for p in $(seq 1 ${NUM_PEERS}); do
+				ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 ${ALG} 0 \
+					data64.key
+			done
+		else
+			RADDR=$(awk "NR == ${1} {print \$2}" ${UDP_PEERS_FILE})
+			RPORT=$(awk "NR == ${1} {print \$3}" ${UDP_PEERS_FILE})
+			LPORT=$(awk "NR == ${1} {print \$5}" ${UDP_PEERS_FILE})
+			ip netns exec peer${1} ${OVPN_CLI} new_peer tun${1} ${1} ${LPORT} \
+				${RADDR} ${RPORT}
+			ip netns exec peer${1} ${OVPN_CLI} new_key tun${1} ${1} 1 0 ${ALG} 1 \
+				data64.key
+		fi
+	else
+		if [ ${1} -eq 0 ]; then
+			(ip netns exec peer0 ${OVPN_CLI} listen tun0 1 ${TCP_PEERS_FILE} && {
+				for p in $(seq 1 ${NUM_PEERS}); do
+					ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 \
+						${ALG} 0 data64.key
+				done
+			}) &
+			sleep 5
+		else
+			ip netns exec peer${1} ${OVPN_CLI} connect tun${1} ${1} 10.10.${1}.1 1 \
+				data64.key
+		fi
+	fi
+}
+
+cleanup() {
+	# some ovpn-cli processes sleep in background so they need manual poking
+	killall $(basename ${OVPN_CLI}) 2>/dev/null || true
+
+	# netns peer0 is deleted without erasing ifaces first
+	for p in $(seq 1 10); do
+		ip -n peer${p} link set tun${p} down 2>/dev/null || true
+		ip netns exec peer${p} ${OVPN_CLI} del_iface tun${p} 2>/dev/null || true
+	done
+	for p in $(seq 1 10); do
+		ip -n peer0 link del veth${p} 2>/dev/null || true
+	done
+	for p in $(seq 0 10); do
+		ip netns del peer${p} 2>/dev/null || true
+	done
+}
+
+if [ "${PROTO}" == "UDP" ]; then
+	NUM_PEERS=${NUM_PEERS:-$(wc -l ${UDP_PEERS_FILE} | awk '{print $1}')}
+else
+	NUM_PEERS=${NUM_PEERS:-$(wc -l ${TCP_PEERS_FILE} | awk '{print $1}')}
+fi
+
+
diff --git a/tools/testing/selftests/net/ovpn/config b/tools/testing/selftests/net/ovpn/config
new file mode 100644
index 000000000000..71946ba9fa17
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/config
@@ -0,0 +1,10 @@
+CONFIG_NET=y
+CONFIG_INET=y
+CONFIG_STREAM_PARSER=y
+CONFIG_NET_UDP_TUNNEL=y
+CONFIG_DST_CACHE=y
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_AES=y
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_CHACHA20POLY1305=y
+CONFIG_OVPN=m
diff --git a/tools/testing/selftests/net/ovpn/data64.key b/tools/testing/selftests/net/ovpn/data64.key
new file mode 100644
index 000000000000..a99e88c4e290
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/data64.key
@@ -0,0 +1,5 @@
+jRqMACN7d7/aFQNT8S7jkrBD8uwrgHbG5OQZP2eu4R1Y7tfpS2bf5RHv06Vi163CGoaIiTX99R3B
+ia9ycAH8Wz1+9PWv51dnBLur9jbShlgZ2QHLtUc4a/gfT7zZwULXuuxdLnvR21DDeMBaTbkgbai9
+uvAa7ne1liIgGFzbv+Bas4HDVrygxIxuAnP5Qgc3648IJkZ0QEXPF+O9f0n5+QIvGCxkAUVx+5K6
+KIs+SoeWXnAopELmoGSjUpFtJbagXK82HfdqpuUxT2Tnuef0/14SzVE/vNleBNu2ZbyrSAaah8tE
+BofkPJUBFY+YQcfZNM5Dgrw3i+Bpmpq/gpdg5w==
diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c
new file mode 100644
index 000000000000..de9c26f98b2e
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c
@@ -0,0 +1,2383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  OpenVPN data channel accelerator
+ *
+ *  Copyright (C) 2020-2025 OpenVPN, Inc.
+ *
+ *  Author:	Antonio Quartulli <antonio@openvpn.net>
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <time.h>
+
+#include <linux/ovpn.h>
+#include <linux/types.h>
+#include <linux/netlink.h>
+
+#include <netlink/socket.h>
+#include <netlink/netlink.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/family.h>
+#include <netlink/genl/ctrl.h>
+
+#include <mbedtls/base64.h>
+#include <mbedtls/error.h>
+
+#include <sys/socket.h>
+
+/* defines to make checkpatch happy */
+#define strscpy strncpy
+#define __always_unused __attribute__((__unused__))
+
+/* libnl < 3.5.0 does not set the NLA_F_NESTED on its own, therefore we
+ * have to explicitly do it to prevent the kernel from failing upon
+ * parsing of the message
+ */
+#define nla_nest_start(_msg, _type) \
+	nla_nest_start(_msg, (_type) | NLA_F_NESTED)
+
+/* libnl < 3.11.0 does not implement nla_get_uint() */
+uint64_t ovpn_nla_get_uint(struct nlattr *attr)
+{
+	if (nla_len(attr) == sizeof(uint32_t))
+		return nla_get_u32(attr);
+	else
+		return nla_get_u64(attr);
+}
+
+typedef int (*ovpn_nl_cb)(struct nl_msg *msg, void *arg);
+
+enum ovpn_key_direction {
+	KEY_DIR_IN = 0,
+	KEY_DIR_OUT,
+};
+
+#define KEY_LEN (256 / 8)
+#define NONCE_LEN 8
+
+#define PEER_ID_UNDEF 0x00FFFFFF
+#define MAX_PEERS 10
+
+struct nl_ctx {
+	struct nl_sock *nl_sock;
+	struct nl_msg *nl_msg;
+	struct nl_cb *nl_cb;
+
+	int ovpn_dco_id;
+};
+
+enum ovpn_cmd {
+	CMD_INVALID,
+	CMD_NEW_IFACE,
+	CMD_DEL_IFACE,
+	CMD_LISTEN,
+	CMD_CONNECT,
+	CMD_NEW_PEER,
+	CMD_NEW_MULTI_PEER,
+	CMD_SET_PEER,
+	CMD_DEL_PEER,
+	CMD_GET_PEER,
+	CMD_NEW_KEY,
+	CMD_DEL_KEY,
+	CMD_GET_KEY,
+	CMD_SWAP_KEYS,
+	CMD_LISTEN_MCAST,
+};
+
+struct ovpn_ctx {
+	enum ovpn_cmd cmd;
+
+	__u8 key_enc[KEY_LEN];
+	__u8 key_dec[KEY_LEN];
+	__u8 nonce[NONCE_LEN];
+
+	enum ovpn_cipher_alg cipher;
+
+	sa_family_t sa_family;
+
+	unsigned long peer_id;
+	unsigned long lport;
+
+	union {
+		struct sockaddr_in in4;
+		struct sockaddr_in6 in6;
+	} remote;
+
+	union {
+		struct sockaddr_in in4;
+		struct sockaddr_in6 in6;
+	} peer_ip;
+
+	bool peer_ip_set;
+
+	unsigned int ifindex;
+	char ifname[IFNAMSIZ];
+	enum ovpn_mode mode;
+	bool mode_set;
+
+	int socket;
+	int cli_sockets[MAX_PEERS];
+
+	__u32 keepalive_interval;
+	__u32 keepalive_timeout;
+
+	enum ovpn_key_direction key_dir;
+	enum ovpn_key_slot key_slot;
+	int key_id;
+
+	const char *peers_file;
+};
+
+static int ovpn_nl_recvmsgs(struct nl_ctx *ctx)
+{
+	int ret;
+
+	ret = nl_recvmsgs(ctx->nl_sock, ctx->nl_cb);
+
+	switch (ret) {
+	case -NLE_INTR:
+		fprintf(stderr,
+			"netlink received interrupt due to signal - ignoring\n");
+		break;
+	case -NLE_NOMEM:
+		fprintf(stderr, "netlink out of memory error\n");
+		break;
+	case -NLE_AGAIN:
+		fprintf(stderr,
+			"netlink reports blocking read - aborting wait\n");
+		break;
+	default:
+		if (ret)
+			fprintf(stderr, "netlink reports error (%d): %s\n",
+				ret, nl_geterror(-ret));
+		break;
+	}
+
+	return ret;
+}
+
+static struct nl_ctx *nl_ctx_alloc_flags(struct ovpn_ctx *ovpn, int cmd,
+					 int flags)
+{
+	struct nl_ctx *ctx;
+	int err, ret;
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (!ctx)
+		return NULL;
+
+	ctx->nl_sock = nl_socket_alloc();
+	if (!ctx->nl_sock) {
+		fprintf(stderr, "cannot allocate netlink socket\n");
+		goto err_free;
+	}
+
+	nl_socket_set_buffer_size(ctx->nl_sock, 8192, 8192);
+
+	ret = genl_connect(ctx->nl_sock);
+	if (ret) {
+		fprintf(stderr, "cannot connect to generic netlink: %s\n",
+			nl_geterror(ret));
+		goto err_sock;
+	}
+
+	/* enable Extended ACK for detailed error reporting */
+	err = 1;
+	setsockopt(nl_socket_get_fd(ctx->nl_sock), SOL_NETLINK, NETLINK_EXT_ACK,
+		   &err, sizeof(err));
+
+	ctx->ovpn_dco_id = genl_ctrl_resolve(ctx->nl_sock, OVPN_FAMILY_NAME);
+	if (ctx->ovpn_dco_id < 0) {
+		fprintf(stderr, "cannot find ovpn_dco netlink component: %d\n",
+			ctx->ovpn_dco_id);
+		goto err_free;
+	}
+
+	ctx->nl_msg = nlmsg_alloc();
+	if (!ctx->nl_msg) {
+		fprintf(stderr, "cannot allocate netlink message\n");
+		goto err_sock;
+	}
+
+	ctx->nl_cb = nl_cb_alloc(NL_CB_DEFAULT);
+	if (!ctx->nl_cb) {
+		fprintf(stderr, "failed to allocate netlink callback\n");
+		goto err_msg;
+	}
+
+	nl_socket_set_cb(ctx->nl_sock, ctx->nl_cb);
+
+	genlmsg_put(ctx->nl_msg, 0, 0, ctx->ovpn_dco_id, 0, flags, cmd, 0);
+
+	if (ovpn->ifindex > 0)
+		NLA_PUT_U32(ctx->nl_msg, OVPN_A_IFINDEX, ovpn->ifindex);
+
+	return ctx;
+nla_put_failure:
+err_msg:
+	nlmsg_free(ctx->nl_msg);
+err_sock:
+	nl_socket_free(ctx->nl_sock);
+err_free:
+	free(ctx);
+	return NULL;
+}
+
+static struct nl_ctx *nl_ctx_alloc(struct ovpn_ctx *ovpn, int cmd)
+{
+	return nl_ctx_alloc_flags(ovpn, cmd, 0);
+}
+
+static void nl_ctx_free(struct nl_ctx *ctx)
+{
+	if (!ctx)
+		return;
+
+	nl_socket_free(ctx->nl_sock);
+	nlmsg_free(ctx->nl_msg);
+	nl_cb_put(ctx->nl_cb);
+	free(ctx);
+}
+
+static int ovpn_nl_cb_error(struct sockaddr_nl (*nla)__always_unused,
+			    struct nlmsgerr *err, void *arg)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)err - 1;
+	struct nlattr *tb_msg[NLMSGERR_ATTR_MAX + 1];
+	int len = nlh->nlmsg_len;
+	struct nlattr *attrs;
+	int *ret = arg;
+	int ack_len = sizeof(*nlh) + sizeof(int) + sizeof(*nlh);
+
+	*ret = err->error;
+
+	if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+		return NL_STOP;
+
+	if (!(nlh->nlmsg_flags & NLM_F_CAPPED))
+		ack_len += err->msg.nlmsg_len - sizeof(*nlh);
+
+	if (len <= ack_len)
+		return NL_STOP;
+
+	attrs = (void *)((uint8_t *)nlh + ack_len);
+	len -= ack_len;
+
+	nla_parse(tb_msg, NLMSGERR_ATTR_MAX, attrs, len, NULL);
+	if (tb_msg[NLMSGERR_ATTR_MSG]) {
+		len = strnlen((char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG]),
+			      nla_len(tb_msg[NLMSGERR_ATTR_MSG]));
+		fprintf(stderr, "kernel error: %*s\n", len,
+			(char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG]));
+	}
+
+	if (tb_msg[NLMSGERR_ATTR_MISS_NEST]) {
+		fprintf(stderr, "missing required nesting type %u\n",
+			nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_NEST]));
+	}
+
+	if (tb_msg[NLMSGERR_ATTR_MISS_TYPE]) {
+		fprintf(stderr, "missing required attribute type %u\n",
+			nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_TYPE]));
+	}
+
+	return NL_STOP;
+}
+
+static int ovpn_nl_cb_finish(struct nl_msg (*msg)__always_unused,
+			     void *arg)
+{
+	int *status = arg;
+
+	*status = 0;
+	return NL_SKIP;
+}
+
+static int ovpn_nl_cb_ack(struct nl_msg (*msg)__always_unused,
+			  void *arg)
+{
+	int *status = arg;
+
+	*status = 0;
+	return NL_STOP;
+}
+
+static int ovpn_nl_msg_send(struct nl_ctx *ctx, ovpn_nl_cb cb)
+{
+	int status = 1;
+
+	nl_cb_err(ctx->nl_cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &status);
+	nl_cb_set(ctx->nl_cb, NL_CB_FINISH, NL_CB_CUSTOM, ovpn_nl_cb_finish,
+		  &status);
+	nl_cb_set(ctx->nl_cb, NL_CB_ACK, NL_CB_CUSTOM, ovpn_nl_cb_ack, &status);
+
+	if (cb)
+		nl_cb_set(ctx->nl_cb, NL_CB_VALID, NL_CB_CUSTOM, cb, ctx);
+
+	nl_send_auto_complete(ctx->nl_sock, ctx->nl_msg);
+
+	while (status == 1)
+		ovpn_nl_recvmsgs(ctx);
+
+	if (status < 0)
+		fprintf(stderr, "failed to send netlink message: %s (%d)\n",
+			strerror(-status), status);
+
+	return status;
+}
+
+static int ovpn_parse_key(const char *file, struct ovpn_ctx *ctx)
+{
+	int idx_enc, idx_dec, ret = -1;
+	unsigned char *ckey = NULL;
+	__u8 *bkey = NULL;
+	size_t olen = 0;
+	long ckey_len;
+	FILE *fp;
+
+	fp = fopen(file, "r");
+	if (!fp) {
+		fprintf(stderr, "cannot open: %s\n", file);
+		return -1;
+	}
+
+	/* get file size */
+	fseek(fp, 0L, SEEK_END);
+	ckey_len = ftell(fp);
+	rewind(fp);
+
+	/* if the file is longer, let's just read a portion */
+	if (ckey_len > 256)
+		ckey_len = 256;
+
+	ckey = malloc(ckey_len);
+	if (!ckey)
+		goto err;
+
+	ret = fread(ckey, 1, ckey_len, fp);
+	if (ret != ckey_len) {
+		fprintf(stderr,
+			"couldn't read enough data from key file: %dbytes read\n",
+			ret);
+		goto err;
+	}
+
+	olen = 0;
+	ret = mbedtls_base64_decode(NULL, 0, &olen, ckey, ckey_len);
+	if (ret != MBEDTLS_ERR_BASE64_BUFFER_TOO_SMALL) {
+		char buf[256];
+
+		mbedtls_strerror(ret, buf, sizeof(buf));
+		fprintf(stderr, "unexpected base64 error1: %s (%d)\n", buf,
+			ret);
+
+		goto err;
+	}
+
+	bkey = malloc(olen);
+	if (!bkey) {
+		fprintf(stderr, "cannot allocate binary key buffer\n");
+		goto err;
+	}
+
+	ret = mbedtls_base64_decode(bkey, olen, &olen, ckey, ckey_len);
+	if (ret) {
+		char buf[256];
+
+		mbedtls_strerror(ret, buf, sizeof(buf));
+		fprintf(stderr, "unexpected base64 error2: %s (%d)\n", buf,
+			ret);
+
+		goto err;
+	}
+
+	if (olen < 2 * KEY_LEN + NONCE_LEN) {
+		fprintf(stderr,
+			"not enough data in key file, found %zdB but needs %dB\n",
+			olen, 2 * KEY_LEN + NONCE_LEN);
+		goto err;
+	}
+
+	switch (ctx->key_dir) {
+	case KEY_DIR_IN:
+		idx_enc = 0;
+		idx_dec = 1;
+		break;
+	case KEY_DIR_OUT:
+		idx_enc = 1;
+		idx_dec = 0;
+		break;
+	default:
+		goto err;
+	}
+
+	memcpy(ctx->key_enc, bkey + KEY_LEN * idx_enc, KEY_LEN);
+	memcpy(ctx->key_dec, bkey + KEY_LEN * idx_dec, KEY_LEN);
+	memcpy(ctx->nonce, bkey + 2 * KEY_LEN, NONCE_LEN);
+
+	ret = 0;
+
+err:
+	fclose(fp);
+	free(bkey);
+	free(ckey);
+
+	return ret;
+}
+
+static int ovpn_parse_cipher(const char *cipher, struct ovpn_ctx *ctx)
+{
+	if (strcmp(cipher, "aes") == 0)
+		ctx->cipher = OVPN_CIPHER_ALG_AES_GCM;
+	else if (strcmp(cipher, "chachapoly") == 0)
+		ctx->cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305;
+	else if (strcmp(cipher, "none") == 0)
+		ctx->cipher = OVPN_CIPHER_ALG_NONE;
+	else
+		return -ENOTSUP;
+
+	return 0;
+}
+
+static int ovpn_parse_key_direction(const char *dir, struct ovpn_ctx *ctx)
+{
+	int in_dir;
+
+	in_dir = strtoll(dir, NULL, 10);
+	switch (in_dir) {
+	case KEY_DIR_IN:
+	case KEY_DIR_OUT:
+		ctx->key_dir = in_dir;
+		break;
+	default:
+		fprintf(stderr,
+			"invalid key direction provided. Can be 0 or 1 only\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int ovpn_socket(struct ovpn_ctx *ctx, sa_family_t family, int proto)
+{
+	struct sockaddr_storage local_sock = { 0 };
+	struct sockaddr_in6 *in6;
+	struct sockaddr_in *in;
+	int ret, s, sock_type;
+	size_t sock_len;
+
+	if (proto == IPPROTO_UDP)
+		sock_type = SOCK_DGRAM;
+	else if (proto == IPPROTO_TCP)
+		sock_type = SOCK_STREAM;
+	else
+		return -EINVAL;
+
+	s = socket(family, sock_type, 0);
+	if (s < 0) {
+		perror("cannot create socket");
+		return -1;
+	}
+
+	switch (family) {
+	case AF_INET:
+		in = (struct sockaddr_in *)&local_sock;
+		in->sin_family = family;
+		in->sin_port = htons(ctx->lport);
+		in->sin_addr.s_addr = htonl(INADDR_ANY);
+		sock_len = sizeof(*in);
+		break;
+	case AF_INET6:
+		in6 = (struct sockaddr_in6 *)&local_sock;
+		in6->sin6_family = family;
+		in6->sin6_port = htons(ctx->lport);
+		in6->sin6_addr = in6addr_any;
+		sock_len = sizeof(*in6);
+		break;
+	default:
+		return -1;
+	}
+
+	int opt = 1;
+
+	ret = setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+	if (ret < 0) {
+		perror("setsockopt for SO_REUSEADDR");
+		return ret;
+	}
+
+	ret = setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
+	if (ret < 0) {
+		perror("setsockopt for SO_REUSEPORT");
+		return ret;
+	}
+
+	if (family == AF_INET6) {
+		opt = 0;
+		if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &opt,
+			       sizeof(opt))) {
+			perror("failed to set IPV6_V6ONLY");
+			return -1;
+		}
+	}
+
+	ret = bind(s, (struct sockaddr *)&local_sock, sock_len);
+	if (ret < 0) {
+		perror("cannot bind socket");
+		goto err_socket;
+	}
+
+	ctx->socket = s;
+	ctx->sa_family = family;
+	return 0;
+
+err_socket:
+	close(s);
+	return -1;
+}
+
+static int ovpn_udp_socket(struct ovpn_ctx *ctx, sa_family_t family)
+{
+	return ovpn_socket(ctx, family, IPPROTO_UDP);
+}
+
+static int ovpn_listen(struct ovpn_ctx *ctx, sa_family_t family)
+{
+	int ret;
+
+	ret = ovpn_socket(ctx, family, IPPROTO_TCP);
+	if (ret < 0)
+		return ret;
+
+	ret = listen(ctx->socket, 10);
+	if (ret < 0) {
+		perror("listen");
+		close(ctx->socket);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int ovpn_accept(struct ovpn_ctx *ctx)
+{
+	socklen_t socklen;
+	int ret;
+
+	socklen = sizeof(ctx->remote);
+	ret = accept(ctx->socket, (struct sockaddr *)&ctx->remote, &socklen);
+	if (ret < 0) {
+		perror("accept");
+		goto err;
+	}
+
+	fprintf(stderr, "Connection received!\n");
+
+	switch (socklen) {
+	case sizeof(struct sockaddr_in):
+	case sizeof(struct sockaddr_in6):
+		break;
+	default:
+		fprintf(stderr, "error: expecting IPv4 or IPv6 connection\n");
+		close(ret);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return ret;
+err:
+	close(ctx->socket);
+	return ret;
+}
+
+static int ovpn_connect(struct ovpn_ctx *ovpn)
+{
+	socklen_t socklen;
+	int s, ret;
+
+	s = socket(ovpn->remote.in4.sin_family, SOCK_STREAM, 0);
+	if (s < 0) {
+		perror("cannot create socket");
+		return -1;
+	}
+
+	switch (ovpn->remote.in4.sin_family) {
+	case AF_INET:
+		socklen = sizeof(struct sockaddr_in);
+		break;
+	case AF_INET6:
+		socklen = sizeof(struct sockaddr_in6);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	ret = connect(s, (struct sockaddr *)&ovpn->remote, socklen);
+	if (ret < 0) {
+		perror("connect");
+		goto err;
+	}
+
+	fprintf(stderr, "connected\n");
+
+	ovpn->socket = s;
+
+	return 0;
+err:
+	close(s);
+	return ret;
+}
+
+static int ovpn_new_peer(struct ovpn_ctx *ovpn, bool is_tcp)
+{
+	struct nlattr *attr;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_NEW);
+	if (!ctx)
+		return -ENOMEM;
+
+	attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_SOCKET, ovpn->socket);
+
+	if (!is_tcp) {
+		switch (ovpn->remote.in4.sin_family) {
+		case AF_INET:
+			NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV4,
+				    ovpn->remote.in4.sin_addr.s_addr);
+			NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT,
+				    ovpn->remote.in4.sin_port);
+			break;
+		case AF_INET6:
+			NLA_PUT(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV6,
+				sizeof(ovpn->remote.in6.sin6_addr),
+				&ovpn->remote.in6.sin6_addr);
+			NLA_PUT_U32(ctx->nl_msg,
+				    OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID,
+				    ovpn->remote.in6.sin6_scope_id);
+			NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT,
+				    ovpn->remote.in6.sin6_port);
+			break;
+		default:
+			fprintf(stderr,
+				"Invalid family for remote socket address\n");
+			goto nla_put_failure;
+		}
+	}
+
+	if (ovpn->peer_ip_set) {
+		switch (ovpn->peer_ip.in4.sin_family) {
+		case AF_INET:
+			NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_VPN_IPV4,
+				    ovpn->peer_ip.in4.sin_addr.s_addr);
+			break;
+		case AF_INET6:
+			NLA_PUT(ctx->nl_msg, OVPN_A_PEER_VPN_IPV6,
+				sizeof(struct in6_addr),
+				&ovpn->peer_ip.in6.sin6_addr);
+			break;
+		default:
+			fprintf(stderr, "Invalid family for peer address\n");
+			goto nla_put_failure;
+		}
+	}
+
+	nla_nest_end(ctx->nl_msg, attr);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_set_peer(struct ovpn_ctx *ovpn)
+{
+	struct nlattr *attr;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_SET);
+	if (!ctx)
+		return -ENOMEM;
+
+	attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_INTERVAL,
+		    ovpn->keepalive_interval);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_TIMEOUT,
+		    ovpn->keepalive_timeout);
+	nla_nest_end(ctx->nl_msg, attr);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_del_peer(struct ovpn_ctx *ovpn)
+{
+	struct nlattr *attr;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_DEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id);
+	nla_nest_end(ctx->nl_msg, attr);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_handle_peer(struct nl_msg *msg, void (*arg)__always_unused)
+{
+	struct nlattr *pattrs[OVPN_A_PEER_MAX + 1];
+	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
+	struct nlattr *attrs[OVPN_A_MAX + 1];
+	__u16 rport = 0, lport = 0;
+
+	nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0),
+		  genlmsg_attrlen(gnlh, 0), NULL);
+
+	if (!attrs[OVPN_A_PEER]) {
+		fprintf(stderr, "no packet content in netlink message\n");
+		return NL_SKIP;
+	}
+
+	nla_parse(pattrs, OVPN_A_PEER_MAX, nla_data(attrs[OVPN_A_PEER]),
+		  nla_len(attrs[OVPN_A_PEER]), NULL);
+
+	if (pattrs[OVPN_A_PEER_ID])
+		fprintf(stderr, "* Peer %u\n",
+			nla_get_u32(pattrs[OVPN_A_PEER_ID]));
+
+	if (pattrs[OVPN_A_PEER_SOCKET_NETNSID])
+		fprintf(stderr, "\tsocket NetNS ID: %d\n",
+			nla_get_s32(pattrs[OVPN_A_PEER_SOCKET_NETNSID]));
+
+	if (pattrs[OVPN_A_PEER_VPN_IPV4]) {
+		char buf[INET_ADDRSTRLEN];
+
+		inet_ntop(AF_INET, nla_data(pattrs[OVPN_A_PEER_VPN_IPV4]),
+			  buf, sizeof(buf));
+		fprintf(stderr, "\tVPN IPv4: %s\n", buf);
+	}
+
+	if (pattrs[OVPN_A_PEER_VPN_IPV6]) {
+		char buf[INET6_ADDRSTRLEN];
+
+		inet_ntop(AF_INET6, nla_data(pattrs[OVPN_A_PEER_VPN_IPV6]),
+			  buf, sizeof(buf));
+		fprintf(stderr, "\tVPN IPv6: %s\n", buf);
+	}
+
+	if (pattrs[OVPN_A_PEER_LOCAL_PORT])
+		lport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_LOCAL_PORT]));
+
+	if (pattrs[OVPN_A_PEER_REMOTE_PORT])
+		rport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_REMOTE_PORT]));
+
+	if (pattrs[OVPN_A_PEER_REMOTE_IPV6]) {
+		void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV6];
+		char buf[INET6_ADDRSTRLEN];
+		int scope_id = -1;
+
+		if (pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) {
+			void *p = pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID];
+
+			scope_id = nla_get_u32(p);
+		}
+
+		inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf));
+		fprintf(stderr, "\tRemote: %s:%hu (scope-id: %u)\n", buf, rport,
+			scope_id);
+
+		if (pattrs[OVPN_A_PEER_LOCAL_IPV6]) {
+			void *ip = pattrs[OVPN_A_PEER_LOCAL_IPV6];
+
+			inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf));
+			fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport);
+		}
+	}
+
+	if (pattrs[OVPN_A_PEER_REMOTE_IPV4]) {
+		void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV4];
+		char buf[INET_ADDRSTRLEN];
+
+		inet_ntop(AF_INET, nla_data(ip), buf, sizeof(buf));
+		fprintf(stderr, "\tRemote: %s:%hu\n", buf, rport);
+
+		if (pattrs[OVPN_A_PEER_LOCAL_IPV4]) {
+			void *p = pattrs[OVPN_A_PEER_LOCAL_IPV4];
+
+			inet_ntop(AF_INET, nla_data(p), buf, sizeof(buf));
+			fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport);
+		}
+	}
+
+	if (pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]) {
+		void *p = pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL];
+
+		fprintf(stderr, "\tKeepalive interval: %u sec\n",
+			nla_get_u32(p));
+	}
+
+	if (pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])
+		fprintf(stderr, "\tKeepalive timeout: %u sec\n",
+			nla_get_u32(pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]));
+
+	if (pattrs[OVPN_A_PEER_VPN_RX_BYTES])
+		fprintf(stderr, "\tVPN RX bytes: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_BYTES]));
+
+	if (pattrs[OVPN_A_PEER_VPN_TX_BYTES])
+		fprintf(stderr, "\tVPN TX bytes: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_BYTES]));
+
+	if (pattrs[OVPN_A_PEER_VPN_RX_PACKETS])
+		fprintf(stderr, "\tVPN RX packets: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_PACKETS]));
+
+	if (pattrs[OVPN_A_PEER_VPN_TX_PACKETS])
+		fprintf(stderr, "\tVPN TX packets: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_PACKETS]));
+
+	if (pattrs[OVPN_A_PEER_LINK_RX_BYTES])
+		fprintf(stderr, "\tLINK RX bytes: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_BYTES]));
+
+	if (pattrs[OVPN_A_PEER_LINK_TX_BYTES])
+		fprintf(stderr, "\tLINK TX bytes: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_BYTES]));
+
+	if (pattrs[OVPN_A_PEER_LINK_RX_PACKETS])
+		fprintf(stderr, "\tLINK RX packets: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_PACKETS]));
+
+	if (pattrs[OVPN_A_PEER_LINK_TX_PACKETS])
+		fprintf(stderr, "\tLINK TX packets: %" PRIu64 "\n",
+			ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_PACKETS]));
+
+	return NL_SKIP;
+}
+
+static int ovpn_get_peer(struct ovpn_ctx *ovpn)
+{
+	int flags = 0, ret = -1;
+	struct nlattr *attr;
+	struct nl_ctx *ctx;
+
+	if (ovpn->peer_id == PEER_ID_UNDEF)
+		flags = NLM_F_DUMP;
+
+	ctx = nl_ctx_alloc_flags(ovpn, OVPN_CMD_PEER_GET, flags);
+	if (!ctx)
+		return -ENOMEM;
+
+	if (ovpn->peer_id != PEER_ID_UNDEF) {
+		attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER);
+		NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id);
+		nla_nest_end(ctx->nl_msg, attr);
+	}
+
+	ret = ovpn_nl_msg_send(ctx, ovpn_handle_peer);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_new_key(struct ovpn_ctx *ovpn)
+{
+	struct nlattr *keyconf, *key_dir;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_NEW);
+	if (!ctx)
+		return -ENOMEM;
+
+	keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_KEY_ID, ovpn->key_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_CIPHER_ALG, ovpn->cipher);
+
+	key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_ENCRYPT_DIR);
+	NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_enc);
+	NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce);
+	nla_nest_end(ctx->nl_msg, key_dir);
+
+	key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_DECRYPT_DIR);
+	NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_dec);
+	NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce);
+	nla_nest_end(ctx->nl_msg, key_dir);
+
+	nla_nest_end(ctx->nl_msg, keyconf);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_del_key(struct ovpn_ctx *ovpn)
+{
+	struct nlattr *keyconf;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_DEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot);
+	nla_nest_end(ctx->nl_msg, keyconf);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_handle_key(struct nl_msg *msg, void (*arg)__always_unused)
+{
+	struct nlattr *kattrs[OVPN_A_KEYCONF_MAX + 1];
+	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
+	struct nlattr *attrs[OVPN_A_MAX + 1];
+
+	nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0),
+		  genlmsg_attrlen(gnlh, 0), NULL);
+
+	if (!attrs[OVPN_A_KEYCONF]) {
+		fprintf(stderr, "no packet content in netlink message\n");
+		return NL_SKIP;
+	}
+
+	nla_parse(kattrs, OVPN_A_KEYCONF_MAX, nla_data(attrs[OVPN_A_KEYCONF]),
+		  nla_len(attrs[OVPN_A_KEYCONF]), NULL);
+
+	if (kattrs[OVPN_A_KEYCONF_PEER_ID])
+		fprintf(stderr, "* Peer %u\n",
+			nla_get_u32(kattrs[OVPN_A_KEYCONF_PEER_ID]));
+	if (kattrs[OVPN_A_KEYCONF_SLOT]) {
+		fprintf(stderr, "\t- Slot: ");
+		switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])) {
+		case OVPN_KEY_SLOT_PRIMARY:
+			fprintf(stderr, "primary\n");
+			break;
+		case OVPN_KEY_SLOT_SECONDARY:
+			fprintf(stderr, "secondary\n");
+			break;
+		default:
+			fprintf(stderr, "invalid (%u)\n",
+				nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT]));
+			break;
+		}
+	}
+	if (kattrs[OVPN_A_KEYCONF_KEY_ID])
+		fprintf(stderr, "\t- Key ID: %u\n",
+			nla_get_u32(kattrs[OVPN_A_KEYCONF_KEY_ID]));
+	if (kattrs[OVPN_A_KEYCONF_CIPHER_ALG]) {
+		fprintf(stderr, "\t- Cipher: ");
+		switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])) {
+		case OVPN_CIPHER_ALG_NONE:
+			fprintf(stderr, "none\n");
+			break;
+		case OVPN_CIPHER_ALG_AES_GCM:
+			fprintf(stderr, "aes-gcm\n");
+			break;
+		case OVPN_CIPHER_ALG_CHACHA20_POLY1305:
+			fprintf(stderr, "chacha20poly1305\n");
+			break;
+		default:
+			fprintf(stderr, "invalid (%u)\n",
+				nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG]));
+			break;
+		}
+	}
+
+	return NL_SKIP;
+}
+
+static int ovpn_get_key(struct ovpn_ctx *ovpn)
+{
+	struct nlattr *keyconf;
+	struct nl_ctx *ctx;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_GET);
+	if (!ctx)
+		return -ENOMEM;
+
+	keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot);
+	nla_nest_end(ctx->nl_msg, keyconf);
+
+	ret = ovpn_nl_msg_send(ctx, ovpn_handle_key);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+static int ovpn_swap_keys(struct ovpn_ctx *ovpn)
+{
+	struct nl_ctx *ctx;
+	struct nlattr *kc;
+	int ret = -1;
+
+	ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_SWAP);
+	if (!ctx)
+		return -ENOMEM;
+
+	kc = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF);
+	NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id);
+	nla_nest_end(ctx->nl_msg, kc);
+
+	ret = ovpn_nl_msg_send(ctx, NULL);
+nla_put_failure:
+	nl_ctx_free(ctx);
+	return ret;
+}
+
+/* Helper function used to easily add attributes to a rtnl message */
+static int ovpn_addattr(struct nlmsghdr *n, int maxlen, int type,
+			const void *data, int alen)
+{
+	int len = RTA_LENGTH(alen);
+	struct rtattr *rta;
+
+	if ((int)(NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len)) > maxlen)	{
+		fprintf(stderr, "%s: rtnl: message exceeded bound of %d\n",
+			__func__, maxlen);
+		return -EMSGSIZE;
+	}
+
+	rta = nlmsg_tail(n);
+	rta->rta_type = type;
+	rta->rta_len = len;
+
+	if (!data)
+		memset(RTA_DATA(rta), 0, alen);
+	else
+		memcpy(RTA_DATA(rta), data, alen);
+
+	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
+
+	return 0;
+}
+
+static struct rtattr *ovpn_nest_start(struct nlmsghdr *msg, size_t max_size,
+				      int attr)
+{
+	struct rtattr *nest = nlmsg_tail(msg);
+
+	if (ovpn_addattr(msg, max_size, attr, NULL, 0) < 0)
+		return NULL;
+
+	return nest;
+}
+
+static void ovpn_nest_end(struct nlmsghdr *msg, struct rtattr *nest)
+{
+	nest->rta_len = (uint8_t *)nlmsg_tail(msg) - (uint8_t *)nest;
+}
+
+#define RT_SNDBUF_SIZE (1024 * 2)
+#define RT_RCVBUF_SIZE (1024 * 4)
+
+/* Open RTNL socket */
+static int ovpn_rt_socket(void)
+{
+	int sndbuf = RT_SNDBUF_SIZE, rcvbuf = RT_RCVBUF_SIZE, fd;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (fd < 0) {
+		fprintf(stderr, "%s: cannot open netlink socket\n", __func__);
+		return fd;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf,
+		       sizeof(sndbuf)) < 0) {
+		fprintf(stderr, "%s: SO_SNDBUF\n", __func__);
+		close(fd);
+		return -1;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
+		       sizeof(rcvbuf)) < 0) {
+		fprintf(stderr, "%s: SO_RCVBUF\n", __func__);
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/* Bind socket to Netlink subsystem */
+static int ovpn_rt_bind(int fd, uint32_t groups)
+{
+	struct sockaddr_nl local = { 0 };
+	socklen_t addr_len;
+
+	local.nl_family = AF_NETLINK;
+	local.nl_groups = groups;
+
+	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		fprintf(stderr, "%s: cannot bind netlink socket: %d\n",
+			__func__, errno);
+		return -errno;
+	}
+
+	addr_len = sizeof(local);
+	if (getsockname(fd, (struct sockaddr *)&local, &addr_len) < 0) {
+		fprintf(stderr, "%s: cannot getsockname: %d\n", __func__,
+			errno);
+		return -errno;
+	}
+
+	if (addr_len != sizeof(local)) {
+		fprintf(stderr, "%s: wrong address length %d\n", __func__,
+			addr_len);
+		return -EINVAL;
+	}
+
+	if (local.nl_family != AF_NETLINK) {
+		fprintf(stderr, "%s: wrong address family %d\n", __func__,
+			local.nl_family);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+typedef int (*ovpn_parse_reply_cb)(struct nlmsghdr *msg, void *arg);
+
+/* Send Netlink message and run callback on reply (if specified) */
+static int ovpn_rt_send(struct nlmsghdr *payload, pid_t peer,
+			unsigned int groups, ovpn_parse_reply_cb cb,
+			void *arg_cb)
+{
+	int len, rem_len, fd, ret, rcv_len;
+	struct sockaddr_nl nladdr = { 0 };
+	struct nlmsgerr *err;
+	struct nlmsghdr *h;
+	char buf[1024 * 16];
+	struct iovec iov = {
+		.iov_base = payload,
+		.iov_len = payload->nlmsg_len,
+	};
+	struct msghdr nlmsg = {
+		.msg_name = &nladdr,
+		.msg_namelen = sizeof(nladdr),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	nladdr.nl_family = AF_NETLINK;
+	nladdr.nl_pid = peer;
+	nladdr.nl_groups = groups;
+
+	payload->nlmsg_seq = time(NULL);
+
+	/* no need to send reply */
+	if (!cb)
+		payload->nlmsg_flags |= NLM_F_ACK;
+
+	fd = ovpn_rt_socket();
+	if (fd < 0) {
+		fprintf(stderr, "%s: can't open rtnl socket\n", __func__);
+		return -errno;
+	}
+
+	ret = ovpn_rt_bind(fd, 0);
+	if (ret < 0) {
+		fprintf(stderr, "%s: can't bind rtnl socket\n", __func__);
+		ret = -errno;
+		goto out;
+	}
+
+	ret = sendmsg(fd, &nlmsg, 0);
+	if (ret < 0) {
+		fprintf(stderr, "%s: rtnl: error on sendmsg()\n", __func__);
+		ret = -errno;
+		goto out;
+	}
+
+	/* prepare buffer to store RTNL replies */
+	memset(buf, 0, sizeof(buf));
+	iov.iov_base = buf;
+
+	while (1) {
+		/*
+		 * iov_len is modified by recvmsg(), therefore has to be initialized before
+		 * using it again
+		 */
+		iov.iov_len = sizeof(buf);
+		rcv_len = recvmsg(fd, &nlmsg, 0);
+		if (rcv_len < 0) {
+			if (errno == EINTR || errno == EAGAIN) {
+				fprintf(stderr, "%s: interrupted call\n",
+					__func__);
+				continue;
+			}
+			fprintf(stderr, "%s: rtnl: error on recvmsg()\n",
+				__func__);
+			ret = -errno;
+			goto out;
+		}
+
+		if (rcv_len == 0) {
+			fprintf(stderr,
+				"%s: rtnl: socket reached unexpected EOF\n",
+				__func__);
+			ret = -EIO;
+			goto out;
+		}
+
+		if (nlmsg.msg_namelen != sizeof(nladdr)) {
+			fprintf(stderr,
+				"%s: sender address length: %u (expected %zu)\n",
+				__func__, nlmsg.msg_namelen, sizeof(nladdr));
+			ret = -EIO;
+			goto out;
+		}
+
+		h = (struct nlmsghdr *)buf;
+		while (rcv_len >= (int)sizeof(*h)) {
+			len = h->nlmsg_len;
+			rem_len = len - sizeof(*h);
+
+			if (rem_len < 0 || len > rcv_len) {
+				if (nlmsg.msg_flags & MSG_TRUNC) {
+					fprintf(stderr, "%s: truncated message\n",
+						__func__);
+					ret = -EIO;
+					goto out;
+				}
+				fprintf(stderr, "%s: malformed message: len=%d\n",
+					__func__, len);
+				ret = -EIO;
+				goto out;
+			}
+
+			if (h->nlmsg_type == NLMSG_DONE) {
+				ret = 0;
+				goto out;
+			}
+
+			if (h->nlmsg_type == NLMSG_ERROR) {
+				err = (struct nlmsgerr *)NLMSG_DATA(h);
+				if (rem_len < (int)sizeof(struct nlmsgerr)) {
+					fprintf(stderr, "%s: ERROR truncated\n",
+						__func__);
+					ret = -EIO;
+					goto out;
+				}
+
+				if (err->error) {
+					fprintf(stderr, "%s: (%d) %s\n",
+						__func__, err->error,
+						strerror(-err->error));
+					ret = err->error;
+					goto out;
+				}
+
+				ret = 0;
+				if (cb)	{
+					int r = cb(h, arg_cb);
+
+					if (r <= 0)
+						ret = r;
+				}
+				goto out;
+			}
+
+			if (cb) {
+				int r = cb(h, arg_cb);
+
+				if (r <= 0) {
+					ret = r;
+					goto out;
+				}
+			} else {
+				fprintf(stderr, "%s: RTNL: unexpected reply\n",
+					__func__);
+			}
+
+			rcv_len -= NLMSG_ALIGN(len);
+			h = (struct nlmsghdr *)((uint8_t *)h +
+						NLMSG_ALIGN(len));
+		}
+
+		if (nlmsg.msg_flags & MSG_TRUNC) {
+			fprintf(stderr, "%s: message truncated\n", __func__);
+			continue;
+		}
+
+		if (rcv_len) {
+			fprintf(stderr, "%s: rtnl: %d not parsed bytes\n",
+				__func__, rcv_len);
+			ret = -1;
+			goto out;
+		}
+	}
+out:
+	close(fd);
+
+	return ret;
+}
+
+struct ovpn_link_req {
+	struct nlmsghdr n;
+	struct ifinfomsg i;
+	char buf[256];
+};
+
+static int ovpn_new_iface(struct ovpn_ctx *ovpn)
+{
+	struct rtattr *linkinfo, *data;
+	struct ovpn_link_req req = { 0 };
+	int ret = -1;
+
+	fprintf(stdout, "Creating interface %s with mode %u\n", ovpn->ifname,
+		ovpn->mode);
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i));
+	req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	req.n.nlmsg_type = RTM_NEWLINK;
+
+	if (ovpn_addattr(&req.n, sizeof(req), IFLA_IFNAME, ovpn->ifname,
+			 strlen(ovpn->ifname) + 1) < 0)
+		goto err;
+
+	linkinfo = ovpn_nest_start(&req.n, sizeof(req), IFLA_LINKINFO);
+	if (!linkinfo)
+		goto err;
+
+	if (ovpn_addattr(&req.n, sizeof(req), IFLA_INFO_KIND, OVPN_FAMILY_NAME,
+			 strlen(OVPN_FAMILY_NAME) + 1) < 0)
+		goto err;
+
+	if (ovpn->mode_set) {
+		data = ovpn_nest_start(&req.n, sizeof(req), IFLA_INFO_DATA);
+		if (!data)
+			goto err;
+
+		if (ovpn_addattr(&req.n, sizeof(req), IFLA_OVPN_MODE,
+				 &ovpn->mode, sizeof(uint8_t)) < 0)
+			goto err;
+
+		ovpn_nest_end(&req.n, data);
+	}
+
+	ovpn_nest_end(&req.n, linkinfo);
+
+	req.i.ifi_family = AF_PACKET;
+
+	ret = ovpn_rt_send(&req.n, 0, 0, NULL, NULL);
+err:
+	return ret;
+}
+
+static int ovpn_del_iface(struct ovpn_ctx *ovpn)
+{
+	struct ovpn_link_req req = { 0 };
+
+	fprintf(stdout, "Deleting interface %s ifindex %u\n", ovpn->ifname,
+		ovpn->ifindex);
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i));
+	req.n.nlmsg_flags = NLM_F_REQUEST;
+	req.n.nlmsg_type = RTM_DELLINK;
+
+	req.i.ifi_family = AF_PACKET;
+	req.i.ifi_index = ovpn->ifindex;
+
+	return ovpn_rt_send(&req.n, 0, 0, NULL, NULL);
+}
+
+static int nl_seq_check(struct nl_msg (*msg)__always_unused,
+			void (*arg)__always_unused)
+{
+	return NL_OK;
+}
+
+struct mcast_handler_args {
+	const char *group;
+	int id;
+};
+
+static int mcast_family_handler(struct nl_msg *msg, void *arg)
+{
+	struct mcast_handler_args *grp = arg;
+	struct nlattr *tb[CTRL_ATTR_MAX + 1];
+	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
+	struct nlattr *mcgrp;
+	int rem_mcgrp;
+
+	nla_parse(tb, CTRL_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
+		  genlmsg_attrlen(gnlh, 0), NULL);
+
+	if (!tb[CTRL_ATTR_MCAST_GROUPS])
+		return NL_SKIP;
+
+	nla_for_each_nested(mcgrp, tb[CTRL_ATTR_MCAST_GROUPS], rem_mcgrp) {
+		struct nlattr *tb_mcgrp[CTRL_ATTR_MCAST_GRP_MAX + 1];
+
+		nla_parse(tb_mcgrp, CTRL_ATTR_MCAST_GRP_MAX,
+			  nla_data(mcgrp), nla_len(mcgrp), NULL);
+
+		if (!tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME] ||
+		    !tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID])
+			continue;
+		if (strncmp(nla_data(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]),
+			    grp->group, nla_len(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME])))
+			continue;
+		grp->id = nla_get_u32(tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]);
+		break;
+	}
+
+	return NL_SKIP;
+}
+
+static int mcast_error_handler(struct sockaddr_nl (*nla)__always_unused,
+			       struct nlmsgerr *err, void *arg)
+{
+	int *ret = arg;
+
+	*ret = err->error;
+	return NL_STOP;
+}
+
+static int mcast_ack_handler(struct nl_msg (*msg)__always_unused, void *arg)
+{
+	int *ret = arg;
+
+	*ret = 0;
+	return NL_STOP;
+}
+
+static int ovpn_handle_msg(struct nl_msg *msg, void *arg)
+{
+	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
+	struct nlattr *attrs[OVPN_A_MAX + 1];
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	char ifname[IF_NAMESIZE];
+	int *ret = arg;
+	__u32 ifindex;
+
+	fprintf(stderr, "received message from ovpn-dco\n");
+
+	*ret = -1;
+
+	if (!genlmsg_valid_hdr(nlh, 0)) {
+		fprintf(stderr, "invalid header\n");
+		return NL_STOP;
+	}
+
+	if (nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0),
+		      genlmsg_attrlen(gnlh, 0), NULL)) {
+		fprintf(stderr, "received bogus data from ovpn-dco\n");
+		return NL_STOP;
+	}
+
+	if (!attrs[OVPN_A_IFINDEX]) {
+		fprintf(stderr, "no ifindex in this message\n");
+		return NL_STOP;
+	}
+
+	ifindex = nla_get_u32(attrs[OVPN_A_IFINDEX]);
+	if (!if_indextoname(ifindex, ifname)) {
+		fprintf(stderr, "cannot resolve ifname for ifindex: %u\n",
+			ifindex);
+		return NL_STOP;
+	}
+
+	switch (gnlh->cmd) {
+	case OVPN_CMD_PEER_DEL_NTF:
+		fprintf(stdout, "received CMD_PEER_DEL_NTF\n");
+		break;
+	case OVPN_CMD_KEY_SWAP_NTF:
+		fprintf(stdout, "received CMD_KEY_SWAP_NTF\n");
+		break;
+	default:
+		fprintf(stderr, "received unknown command: %d\n", gnlh->cmd);
+		return NL_STOP;
+	}
+
+	*ret = 0;
+	return NL_OK;
+}
+
+static int ovpn_get_mcast_id(struct nl_sock *sock, const char *family,
+			     const char *group)
+{
+	struct nl_msg *msg;
+	struct nl_cb *cb;
+	int ret, ctrlid;
+	struct mcast_handler_args grp = {
+		.group = group,
+		.id = -ENOENT,
+	};
+
+	msg = nlmsg_alloc();
+	if (!msg)
+		return -ENOMEM;
+
+	cb = nl_cb_alloc(NL_CB_DEFAULT);
+	if (!cb) {
+		ret = -ENOMEM;
+		goto out_fail_cb;
+	}
+
+	ctrlid = genl_ctrl_resolve(sock, "nlctrl");
+
+	genlmsg_put(msg, 0, 0, ctrlid, 0, 0, CTRL_CMD_GETFAMILY, 0);
+
+	ret = -ENOBUFS;
+	NLA_PUT_STRING(msg, CTRL_ATTR_FAMILY_NAME, family);
+
+	ret = nl_send_auto_complete(sock, msg);
+	if (ret < 0)
+		goto nla_put_failure;
+
+	ret = 1;
+
+	nl_cb_err(cb, NL_CB_CUSTOM, mcast_error_handler, &ret);
+	nl_cb_set(cb, NL_CB_ACK, NL_CB_CUSTOM, mcast_ack_handler, &ret);
+	nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, mcast_family_handler, &grp);
+
+	while (ret > 0)
+		nl_recvmsgs(sock, cb);
+
+	if (ret == 0)
+		ret = grp.id;
+ nla_put_failure:
+	nl_cb_put(cb);
+ out_fail_cb:
+	nlmsg_free(msg);
+	return ret;
+}
+
+static int ovpn_listen_mcast(void)
+{
+	struct nl_sock *sock;
+	struct nl_cb *cb;
+	int mcid, ret;
+
+	sock = nl_socket_alloc();
+	if (!sock) {
+		fprintf(stderr, "cannot allocate netlink socket\n");
+		goto err_free;
+	}
+
+	nl_socket_set_buffer_size(sock, 8192, 8192);
+
+	ret = genl_connect(sock);
+	if (ret < 0) {
+		fprintf(stderr, "cannot connect to generic netlink: %s\n",
+			nl_geterror(ret));
+		goto err_free;
+	}
+
+	mcid = ovpn_get_mcast_id(sock, OVPN_FAMILY_NAME, OVPN_MCGRP_PEERS);
+	if (mcid < 0) {
+		fprintf(stderr, "cannot get mcast group: %s\n",
+			nl_geterror(mcid));
+		goto err_free;
+	}
+
+	ret = nl_socket_add_membership(sock, mcid);
+	if (ret) {
+		fprintf(stderr, "failed to join mcast group: %d\n", ret);
+		goto err_free;
+	}
+
+	ret = 1;
+	cb = nl_cb_alloc(NL_CB_DEFAULT);
+	nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, nl_seq_check, NULL);
+	nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, ovpn_handle_msg, &ret);
+	nl_cb_err(cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &ret);
+
+	while (ret == 1) {
+		int err = nl_recvmsgs(sock, cb);
+
+		if (err < 0) {
+			fprintf(stderr,
+				"cannot receive netlink message: (%d) %s\n",
+				err, nl_geterror(-err));
+			ret = -1;
+			break;
+		}
+	}
+
+	nl_cb_put(cb);
+err_free:
+	nl_socket_free(sock);
+	return ret;
+}
+
+static void usage(const char *cmd)
+{
+	fprintf(stderr,
+		"Usage %s <command> <iface> [arguments..]\n",
+		cmd);
+	fprintf(stderr, "where <command> can be one of the following\n\n");
+
+	fprintf(stderr, "* new_iface <iface> [mode]: create new ovpn interface\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tmode:\n");
+	fprintf(stderr, "\t\t- P2P for peer-to-peer mode (i.e. client)\n");
+	fprintf(stderr, "\t\t- MP for multi-peer mode (i.e. server)\n");
+
+	fprintf(stderr, "* del_iface <iface>: delete ovpn interface\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+
+	fprintf(stderr,
+		"* listen <iface> <lport> <peers_file> [ipv6]: listen for incoming peer TCP connections\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tlport: TCP port to listen to\n");
+	fprintf(stderr,
+		"\tpeers_file: file containing one peer per line: Line format:\n");
+	fprintf(stderr, "\t\t<peer_id> <vpnaddr>\n");
+	fprintf(stderr,
+		"\tipv6: whether the socket should listen to the IPv6 wildcard address\n");
+
+	fprintf(stderr,
+		"* connect <iface> <peer_id> <raddr> <rport> [key_file]: start connecting peer of TCP-based VPN session\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the connecting peer\n");
+	fprintf(stderr, "\traddr: peer IP address to connect to\n");
+	fprintf(stderr, "\trport: peer TCP port to connect to\n");
+	fprintf(stderr,
+		"\tkey_file: file containing the symmetric key for encryption\n");
+
+	fprintf(stderr,
+		"* new_peer <iface> <peer_id> <lport> <raddr> <rport> [vpnaddr]: add new peer\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tlport: local UDP port to bind to\n");
+	fprintf(stderr,
+		"\tpeer_id: peer ID to be used in data packets to/from this peer\n");
+	fprintf(stderr, "\traddr: peer IP address\n");
+	fprintf(stderr, "\trport: peer UDP port\n");
+	fprintf(stderr, "\tvpnaddr: peer VPN IP\n");
+
+	fprintf(stderr,
+		"* new_multi_peer <iface> <lport> <peers_file>: add multiple peers as listed in the file\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tlport: local UDP port to bind to\n");
+	fprintf(stderr,
+		"\tpeers_file: text file containing one peer per line. Line format:\n");
+	fprintf(stderr, "\t\t<peer_id> <raddr> <rport> <vpnaddr>\n");
+
+	fprintf(stderr,
+		"* set_peer <iface> <peer_id> <keepalive_interval> <keepalive_timeout>: set peer attributes\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n");
+	fprintf(stderr,
+		"\tkeepalive_interval: interval for sending ping messages\n");
+	fprintf(stderr,
+		"\tkeepalive_timeout: time after which a peer is timed out\n");
+
+	fprintf(stderr, "* del_peer <iface> <peer_id>: delete peer\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the peer to delete\n");
+
+	fprintf(stderr, "* get_peer <iface> [peer_id]: retrieve peer(s) status\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr,
+		"\tpeer_id: peer ID of the peer to query. All peers are returned if omitted\n");
+
+	fprintf(stderr,
+		"* new_key <iface> <peer_id> <slot> <key_id> <cipher> <key_dir> <key_file>: set data channel key\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr,
+		"\tpeer_id: peer ID of the peer to configure the key for\n");
+	fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n");
+	fprintf(stderr, "\tkey_id: an ID from 0 to 7\n");
+	fprintf(stderr,
+		"\tcipher: cipher to use, supported: aes (AES-GCM), chachapoly (CHACHA20POLY1305)\n");
+	fprintf(stderr,
+		"\tkey_dir: key direction, must 0 on one host and 1 on the other\n");
+	fprintf(stderr, "\tkey_file: file containing the pre-shared key\n");
+
+	fprintf(stderr,
+		"* del_key <iface> <peer_id> [slot]: erase existing data channel key\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n");
+	fprintf(stderr, "\tslot: slot to erase. PRIMARY if omitted\n");
+
+	fprintf(stderr,
+		"* get_key <iface> <peer_id> <slot>: retrieve non sensible key data\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the peer to query\n");
+	fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n");
+
+	fprintf(stderr,
+		"* swap_keys <iface> <peer_id>: swap content of primary and secondary key slots\n");
+	fprintf(stderr, "\tiface: ovpn interface name\n");
+	fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n");
+
+	fprintf(stderr,
+		"* listen_mcast: listen to ovpn netlink multicast messages\n");
+}
+
+static int ovpn_parse_remote(struct ovpn_ctx *ovpn, const char *host,
+			     const char *service, const char *vpnip)
+{
+	int ret;
+	struct addrinfo *result;
+	struct addrinfo hints = {
+		.ai_family = ovpn->sa_family,
+		.ai_socktype = SOCK_DGRAM,
+		.ai_protocol = IPPROTO_UDP
+	};
+
+	if (host) {
+		ret = getaddrinfo(host, service, &hints, &result);
+		if (ret) {
+			fprintf(stderr, "getaddrinfo on remote error: %s\n",
+				gai_strerror(ret));
+			return -1;
+		}
+
+		if (!(result->ai_family == AF_INET &&
+		      result->ai_addrlen == sizeof(struct sockaddr_in)) &&
+		    !(result->ai_family == AF_INET6 &&
+		      result->ai_addrlen == sizeof(struct sockaddr_in6))) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		memcpy(&ovpn->remote, result->ai_addr, result->ai_addrlen);
+	}
+
+	if (vpnip) {
+		ret = getaddrinfo(vpnip, NULL, &hints, &result);
+		if (ret) {
+			fprintf(stderr, "getaddrinfo on vpnip error: %s\n",
+				gai_strerror(ret));
+			return -1;
+		}
+
+		if (!(result->ai_family == AF_INET &&
+		      result->ai_addrlen == sizeof(struct sockaddr_in)) &&
+		    !(result->ai_family == AF_INET6 &&
+		      result->ai_addrlen == sizeof(struct sockaddr_in6))) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		memcpy(&ovpn->peer_ip, result->ai_addr, result->ai_addrlen);
+		ovpn->sa_family = result->ai_family;
+
+		ovpn->peer_ip_set = true;
+	}
+
+	ret = 0;
+out:
+	freeaddrinfo(result);
+	return ret;
+}
+
+static int ovpn_parse_new_peer(struct ovpn_ctx *ovpn, const char *peer_id,
+			       const char *raddr, const char *rport,
+			       const char *vpnip)
+{
+	ovpn->peer_id = strtoul(peer_id, NULL, 10);
+	if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) {
+		fprintf(stderr, "peer ID value out of range\n");
+		return -1;
+	}
+
+	return ovpn_parse_remote(ovpn, raddr, rport, vpnip);
+}
+
+static int ovpn_parse_key_slot(const char *arg, struct ovpn_ctx *ovpn)
+{
+	int slot = strtoul(arg, NULL, 10);
+
+	if (errno == ERANGE || slot < 1 || slot > 2) {
+		fprintf(stderr, "key slot out of range\n");
+		return -1;
+	}
+
+	switch (slot) {
+	case 1:
+		ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY;
+		break;
+	case 2:
+		ovpn->key_slot = OVPN_KEY_SLOT_SECONDARY;
+		break;
+	}
+
+	return 0;
+}
+
+static int ovpn_send_tcp_data(int socket)
+{
+	uint16_t len = htons(1000);
+	uint8_t buf[1002];
+	int ret;
+
+	memcpy(buf, &len, sizeof(len));
+	memset(buf + sizeof(len), 0x86, sizeof(buf) - sizeof(len));
+
+	ret = send(socket, buf, sizeof(buf), MSG_NOSIGNAL);
+
+	fprintf(stdout, "Sent %u bytes over TCP socket\n", ret);
+
+	return ret > 0 ? 0 : ret;
+}
+
+static int ovpn_recv_tcp_data(int socket)
+{
+	uint8_t buf[1002];
+	uint16_t len;
+	int ret;
+
+	ret = recv(socket, buf, sizeof(buf), MSG_NOSIGNAL);
+
+	if (ret < 2) {
+		fprintf(stderr, ">>>> Error while reading TCP data: %d\n", ret);
+		return ret;
+	}
+
+	memcpy(&len, buf, sizeof(len));
+	len = ntohs(len);
+
+	fprintf(stdout, ">>>> Received %u bytes over TCP socket, header: %u\n",
+		ret, len);
+
+	return 0;
+}
+
+static enum ovpn_cmd ovpn_parse_cmd(const char *cmd)
+{
+	if (!strcmp(cmd, "new_iface"))
+		return CMD_NEW_IFACE;
+
+	if (!strcmp(cmd, "del_iface"))
+		return CMD_DEL_IFACE;
+
+	if (!strcmp(cmd, "listen"))
+		return CMD_LISTEN;
+
+	if (!strcmp(cmd, "connect"))
+		return CMD_CONNECT;
+
+	if (!strcmp(cmd, "new_peer"))
+		return CMD_NEW_PEER;
+
+	if (!strcmp(cmd, "new_multi_peer"))
+		return CMD_NEW_MULTI_PEER;
+
+	if (!strcmp(cmd, "set_peer"))
+		return CMD_SET_PEER;
+
+	if (!strcmp(cmd, "del_peer"))
+		return CMD_DEL_PEER;
+
+	if (!strcmp(cmd, "get_peer"))
+		return CMD_GET_PEER;
+
+	if (!strcmp(cmd, "new_key"))
+		return CMD_NEW_KEY;
+
+	if (!strcmp(cmd, "del_key"))
+		return CMD_DEL_KEY;
+
+	if (!strcmp(cmd, "get_key"))
+		return CMD_GET_KEY;
+
+	if (!strcmp(cmd, "swap_keys"))
+		return CMD_SWAP_KEYS;
+
+	if (!strcmp(cmd, "listen_mcast"))
+		return CMD_LISTEN_MCAST;
+
+	return CMD_INVALID;
+}
+
+/* Send process to background and waits for signal.
+ *
+ * This helper is called at the end of commands
+ * creating sockets, so that the latter stay alive
+ * along with the process that created them.
+ *
+ * A signal is expected to be delivered in order to
+ * terminate the waiting processes
+ */
+static void ovpn_waitbg(void)
+{
+	daemon(1, 1);
+	pause();
+}
+
+static int ovpn_run_cmd(struct ovpn_ctx *ovpn)
+{
+	char peer_id[10], vpnip[INET6_ADDRSTRLEN], laddr[128], lport[10];
+	char raddr[128], rport[10];
+	int n, ret;
+	FILE *fp;
+
+	switch (ovpn->cmd) {
+	case CMD_NEW_IFACE:
+		ret = ovpn_new_iface(ovpn);
+		break;
+	case CMD_DEL_IFACE:
+		ret = ovpn_del_iface(ovpn);
+		break;
+	case CMD_LISTEN:
+		ret = ovpn_listen(ovpn, ovpn->sa_family);
+		if (ret < 0) {
+			fprintf(stderr, "cannot listen on TCP socket\n");
+			return ret;
+		}
+
+		fp = fopen(ovpn->peers_file, "r");
+		if (!fp) {
+			fprintf(stderr, "cannot open file: %s\n",
+				ovpn->peers_file);
+			return -1;
+		}
+
+		int num_peers = 0;
+
+		while ((n = fscanf(fp, "%s %s\n", peer_id, vpnip)) == 2) {
+			struct ovpn_ctx peer_ctx = { 0 };
+
+			if (num_peers == MAX_PEERS) {
+				fprintf(stderr, "max peers reached!\n");
+				return -E2BIG;
+			}
+
+			peer_ctx.ifindex = ovpn->ifindex;
+			peer_ctx.sa_family = ovpn->sa_family;
+
+			peer_ctx.socket = ovpn_accept(ovpn);
+			if (peer_ctx.socket < 0) {
+				fprintf(stderr, "cannot accept connection!\n");
+				return -1;
+			}
+
+			/* store peer sockets to test TCP I/O */
+			ovpn->cli_sockets[num_peers] = peer_ctx.socket;
+
+			ret = ovpn_parse_new_peer(&peer_ctx, peer_id, NULL,
+						  NULL, vpnip);
+			if (ret < 0) {
+				fprintf(stderr, "error while parsing line\n");
+				return -1;
+			}
+
+			ret = ovpn_new_peer(&peer_ctx, true);
+			if (ret < 0) {
+				fprintf(stderr,
+					"cannot add peer to VPN: %s %s\n",
+					peer_id, vpnip);
+				return ret;
+			}
+			num_peers++;
+		}
+
+		for (int i = 0; i < num_peers; i++) {
+			ret = ovpn_recv_tcp_data(ovpn->cli_sockets[i]);
+			if (ret < 0)
+				break;
+		}
+		ovpn_waitbg();
+		break;
+	case CMD_CONNECT:
+		ret = ovpn_connect(ovpn);
+		if (ret < 0) {
+			fprintf(stderr, "cannot connect TCP socket\n");
+			return ret;
+		}
+
+		ret = ovpn_new_peer(ovpn, true);
+		if (ret < 0) {
+			fprintf(stderr, "cannot add peer to VPN\n");
+			close(ovpn->socket);
+			return ret;
+		}
+
+		if (ovpn->cipher != OVPN_CIPHER_ALG_NONE) {
+			ret = ovpn_new_key(ovpn);
+			if (ret < 0) {
+				fprintf(stderr, "cannot set key\n");
+				return ret;
+			}
+		}
+
+		ret = ovpn_send_tcp_data(ovpn->socket);
+		ovpn_waitbg();
+		break;
+	case CMD_NEW_PEER:
+		ret = ovpn_udp_socket(ovpn, AF_INET6);
+		if (ret < 0)
+			return ret;
+
+		ret = ovpn_new_peer(ovpn, false);
+		ovpn_waitbg();
+		break;
+	case CMD_NEW_MULTI_PEER:
+		ret = ovpn_udp_socket(ovpn, AF_INET6);
+		if (ret < 0)
+			return ret;
+
+		fp = fopen(ovpn->peers_file, "r");
+		if (!fp) {
+			fprintf(stderr, "cannot open file: %s\n",
+				ovpn->peers_file);
+			return -1;
+		}
+
+		while ((n = fscanf(fp, "%s %s %s %s %s %s\n", peer_id, laddr,
+				   lport, raddr, rport, vpnip)) == 6) {
+			struct ovpn_ctx peer_ctx = { 0 };
+
+			peer_ctx.ifindex = ovpn->ifindex;
+			peer_ctx.socket = ovpn->socket;
+			peer_ctx.sa_family = AF_UNSPEC;
+
+			ret = ovpn_parse_new_peer(&peer_ctx, peer_id, raddr,
+						  rport, vpnip);
+			if (ret < 0) {
+				fprintf(stderr, "error while parsing line\n");
+				return -1;
+			}
+
+			ret = ovpn_new_peer(&peer_ctx, false);
+			if (ret < 0) {
+				fprintf(stderr,
+					"cannot add peer to VPN: %s %s %s %s\n",
+					peer_id, raddr, rport, vpnip);
+				return ret;
+			}
+		}
+		ovpn_waitbg();
+		break;
+	case CMD_SET_PEER:
+		ret = ovpn_set_peer(ovpn);
+		break;
+	case CMD_DEL_PEER:
+		ret = ovpn_del_peer(ovpn);
+		break;
+	case CMD_GET_PEER:
+		if (ovpn->peer_id == PEER_ID_UNDEF)
+			fprintf(stderr, "List of peers connected to: %s\n",
+				ovpn->ifname);
+
+		ret = ovpn_get_peer(ovpn);
+		break;
+	case CMD_NEW_KEY:
+		ret = ovpn_new_key(ovpn);
+		break;
+	case CMD_DEL_KEY:
+		ret = ovpn_del_key(ovpn);
+		break;
+	case CMD_GET_KEY:
+		ret = ovpn_get_key(ovpn);
+		break;
+	case CMD_SWAP_KEYS:
+		ret = ovpn_swap_keys(ovpn);
+		break;
+	case CMD_LISTEN_MCAST:
+		ret = ovpn_listen_mcast();
+		break;
+	case CMD_INVALID:
+		break;
+	}
+
+	return ret;
+}
+
+static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[])
+{
+	int ret;
+
+	/* no args required for LISTEN_MCAST */
+	if (ovpn->cmd == CMD_LISTEN_MCAST)
+		return 0;
+
+	/* all commands need an ifname */
+	if (argc < 3)
+		return -EINVAL;
+
+	strscpy(ovpn->ifname, argv[2], IFNAMSIZ - 1);
+	ovpn->ifname[IFNAMSIZ - 1] = '\0';
+
+	/* all commands, except NEW_IFNAME, needs an ifindex */
+	if (ovpn->cmd != CMD_NEW_IFACE) {
+		ovpn->ifindex = if_nametoindex(ovpn->ifname);
+		if (!ovpn->ifindex) {
+			fprintf(stderr, "cannot find interface: %s\n",
+				strerror(errno));
+			return -1;
+		}
+	}
+
+	switch (ovpn->cmd) {
+	case CMD_NEW_IFACE:
+		if (argc < 4)
+			break;
+
+		if (!strcmp(argv[3], "P2P")) {
+			ovpn->mode = OVPN_MODE_P2P;
+		} else if (!strcmp(argv[3], "MP")) {
+			ovpn->mode = OVPN_MODE_MP;
+		} else {
+			fprintf(stderr, "Cannot parse iface mode: %s\n",
+				argv[3]);
+			return -1;
+		}
+		ovpn->mode_set = true;
+		break;
+	case CMD_DEL_IFACE:
+		break;
+	case CMD_LISTEN:
+		if (argc < 5)
+			return -EINVAL;
+
+		ovpn->lport = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE || ovpn->lport > 65535) {
+			fprintf(stderr, "lport value out of range\n");
+			return -1;
+		}
+
+		ovpn->peers_file = argv[4];
+
+		if (argc > 5 && !strcmp(argv[5], "ipv6"))
+			ovpn->sa_family = AF_INET6;
+		break;
+	case CMD_CONNECT:
+		if (argc < 6)
+			return -EINVAL;
+
+		ovpn->sa_family = AF_INET;
+
+		ret = ovpn_parse_new_peer(ovpn, argv[3], argv[4], argv[5],
+					  NULL);
+		if (ret < 0) {
+			fprintf(stderr, "Cannot parse remote peer data\n");
+			return -1;
+		}
+
+		if (argc > 6) {
+			ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY;
+			ovpn->key_id = 0;
+			ovpn->cipher = OVPN_CIPHER_ALG_AES_GCM;
+			ovpn->key_dir = KEY_DIR_OUT;
+
+			ret = ovpn_parse_key(argv[6], ovpn);
+			if (ret)
+				return -1;
+		}
+		break;
+	case CMD_NEW_PEER:
+		if (argc < 7)
+			return -EINVAL;
+
+		ovpn->lport = strtoul(argv[4], NULL, 10);
+		if (errno == ERANGE || ovpn->lport > 65535) {
+			fprintf(stderr, "lport value out of range\n");
+			return -1;
+		}
+
+		const char *vpnip = (argc > 7) ? argv[7] : NULL;
+
+		ret = ovpn_parse_new_peer(ovpn, argv[3], argv[5], argv[6],
+					  vpnip);
+		if (ret < 0)
+			return -1;
+		break;
+	case CMD_NEW_MULTI_PEER:
+		if (argc < 5)
+			return -EINVAL;
+
+		ovpn->lport = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE || ovpn->lport > 65535) {
+			fprintf(stderr, "lport value out of range\n");
+			return -1;
+		}
+
+		ovpn->peers_file = argv[4];
+		break;
+	case CMD_SET_PEER:
+		if (argc < 6)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+
+		ovpn->keepalive_interval = strtoul(argv[4], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr,
+				"keepalive interval value out of range\n");
+			return -1;
+		}
+
+		ovpn->keepalive_timeout = strtoul(argv[5], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr,
+				"keepalive interval value out of range\n");
+			return -1;
+		}
+		break;
+	case CMD_DEL_PEER:
+		if (argc < 4)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+		break;
+	case CMD_GET_PEER:
+		ovpn->peer_id = PEER_ID_UNDEF;
+		if (argc > 3) {
+			ovpn->peer_id = strtoul(argv[3], NULL, 10);
+			if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) {
+				fprintf(stderr, "peer ID value out of range\n");
+				return -1;
+			}
+		}
+		break;
+	case CMD_NEW_KEY:
+		if (argc < 9)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+
+		ret = ovpn_parse_key_slot(argv[4], ovpn);
+		if (ret)
+			return -1;
+
+		ovpn->key_id = strtoul(argv[5], NULL, 10);
+		if (errno == ERANGE || ovpn->key_id > 2) {
+			fprintf(stderr, "key ID out of range\n");
+			return -1;
+		}
+
+		ret = ovpn_parse_cipher(argv[6], ovpn);
+		if (ret < 0)
+			return -1;
+
+		ret = ovpn_parse_key_direction(argv[7], ovpn);
+		if (ret < 0)
+			return -1;
+
+		ret = ovpn_parse_key(argv[8], ovpn);
+		if (ret)
+			return -1;
+		break;
+	case CMD_DEL_KEY:
+		if (argc < 4)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+
+		ret = ovpn_parse_key_slot(argv[4], ovpn);
+		if (ret)
+			return ret;
+		break;
+	case CMD_GET_KEY:
+		if (argc < 5)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+
+		ret = ovpn_parse_key_slot(argv[4], ovpn);
+		if (ret)
+			return ret;
+		break;
+	case CMD_SWAP_KEYS:
+		if (argc < 4)
+			return -EINVAL;
+
+		ovpn->peer_id = strtoul(argv[3], NULL, 10);
+		if (errno == ERANGE) {
+			fprintf(stderr, "peer ID value out of range\n");
+			return -1;
+		}
+		break;
+	case CMD_LISTEN_MCAST:
+		break;
+	case CMD_INVALID:
+		break;
+	}
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	struct ovpn_ctx ovpn;
+	int ret;
+
+	if (argc < 2) {
+		usage(argv[0]);
+		return -1;
+	}
+
+	memset(&ovpn, 0, sizeof(ovpn));
+	ovpn.sa_family = AF_UNSPEC;
+	ovpn.cipher = OVPN_CIPHER_ALG_NONE;
+
+	ovpn.cmd = ovpn_parse_cmd(argv[1]);
+	if (ovpn.cmd == CMD_INVALID) {
+		fprintf(stderr, "Error: unknown command.\n\n");
+		usage(argv[0]);
+		return -1;
+	}
+
+	ret = ovpn_parse_cmd_args(&ovpn, argc, argv);
+	if (ret < 0) {
+		fprintf(stderr, "Error: invalid arguments.\n\n");
+		if (ret == -EINVAL)
+			usage(argv[0]);
+		return ret;
+	}
+
+	ret = ovpn_run_cmd(&ovpn);
+	if (ret)
+		fprintf(stderr, "Cannot execute command: %s (%d)\n",
+			strerror(-ret), ret);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/net/ovpn/tcp_peers.txt b/tools/testing/selftests/net/ovpn/tcp_peers.txt
new file mode 100644
index 000000000000..d753eebe8716
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/tcp_peers.txt
@@ -0,0 +1,5 @@
+1 5.5.5.2
+2 5.5.5.3
+3 5.5.5.4
+4 5.5.5.5
+5 5.5.5.6
diff --git a/tools/testing/selftests/net/ovpn/test-chachapoly.sh b/tools/testing/selftests/net/ovpn/test-chachapoly.sh
new file mode 100755
index 000000000000..32504079a2b8
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-chachapoly.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+ALG="chachapoly"
+
+source test.sh
diff --git a/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh
new file mode 100755
index 000000000000..093d44772ffd
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+PROTO="TCP"
+
+source test-close-socket.sh
diff --git a/tools/testing/selftests/net/ovpn/test-close-socket.sh b/tools/testing/selftests/net/ovpn/test-close-socket.sh
new file mode 100755
index 000000000000..5e48a8b67928
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-close-socket.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020-2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+#set -x
+set -e
+
+source ./common.sh
+
+cleanup
+
+modprobe -q ovpn || true
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	create_ns ${p}
+done
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	setup_ns ${p} 5.5.5.$((${p} + 1))/24
+done
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	add_peer ${p}
+done
+
+for p in $(seq 1 ${NUM_PEERS}); do
+	ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120
+	ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120
+done
+
+sleep 1
+
+for p in $(seq 1 ${NUM_PEERS}); do
+	ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1))
+done
+
+ip netns exec peer0 iperf3 -1 -s &
+sleep 1
+ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1
+
+cleanup
+
+modprobe -r ovpn || true
diff --git a/tools/testing/selftests/net/ovpn/test-float.sh b/tools/testing/selftests/net/ovpn/test-float.sh
new file mode 100755
index 000000000000..ba5d725e18b0
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-float.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+FLOAT="1"
+
+source test.sh
diff --git a/tools/testing/selftests/net/ovpn/test-tcp.sh b/tools/testing/selftests/net/ovpn/test-tcp.sh
new file mode 100755
index 000000000000..ba3f1f315a34
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-tcp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+PROTO="TCP"
+
+source test.sh
diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh
new file mode 100755
index 000000000000..e8acdc303307
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020-2025 OpenVPN, Inc.
+#
+#  Author:	Antonio Quartulli <antonio@openvpn.net>
+
+#set -x
+set -e
+
+source ./common.sh
+
+cleanup
+
+modprobe -q ovpn || true
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	create_ns ${p}
+done
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	setup_ns ${p} 5.5.5.$((${p} + 1))/24 ${MTU}
+done
+
+for p in $(seq 0 ${NUM_PEERS}); do
+	add_peer ${p}
+done
+
+for p in $(seq 1 ${NUM_PEERS}); do
+	ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120
+	ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120
+done
+
+sleep 1
+
+for p in $(seq 1 ${NUM_PEERS}); do
+	ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1))
+	ip netns exec peer0 ping -qfc 500 -s 3000 -w 3 5.5.5.$((${p} + 1))
+done
+
+# ping LAN behind client 1
+ip netns exec peer0 ping -qfc 500 -w 3 ${LAN_IP}
+
+if [ "$FLOAT" == "1" ]; then
+	# make clients float..
+	for p in $(seq 1 ${NUM_PEERS}); do
+		ip -n peer${p} addr del 10.10.${p}.2/24 dev veth${p}
+		ip -n peer${p} addr add 10.10.${p}.3/24 dev veth${p}
+	done
+	for p in $(seq 1 ${NUM_PEERS}); do
+		ip netns exec peer${p} ping -qfc 500 -w 3 5.5.5.1
+	done
+fi
+
+ip netns exec peer0 iperf3 -1 -s &
+sleep 1
+ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1
+
+echo "Adding secondary key and then swap:"
+for p in $(seq 1 ${NUM_PEERS}); do
+	ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 2 1 ${ALG} 0 data64.key
+	ip netns exec peer${p} ${OVPN_CLI} new_key tun${p} ${p} 2 1 ${ALG} 1 data64.key
+	ip netns exec peer${p} ${OVPN_CLI} swap_keys tun${p} ${p}
+done
+
+sleep 1
+
+echo "Querying all peers:"
+ip netns exec peer0 ${OVPN_CLI} get_peer tun0
+ip netns exec peer1 ${OVPN_CLI} get_peer tun1
+
+echo "Querying peer 1:"
+ip netns exec peer0 ${OVPN_CLI} get_peer tun0 1
+
+echo "Querying non-existent peer 10:"
+ip netns exec peer0 ${OVPN_CLI} get_peer tun0 10 || true
+
+echo "Deleting peer 1:"
+ip netns exec peer0 ${OVPN_CLI} del_peer tun0 1
+ip netns exec peer1 ${OVPN_CLI} del_peer tun1 1
+
+echo "Querying keys:"
+for p in $(seq 2 ${NUM_PEERS}); do
+	ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 1
+	ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 2
+done
+
+echo "Deleting peer while sending traffic:"
+(ip netns exec peer2 ping -qf -w 4 5.5.5.1)&
+sleep 2
+ip netns exec peer0 ${OVPN_CLI} del_peer tun0 2
+# following command fails in TCP mode
+# (both ends get conn reset when one peer disconnects)
+ip netns exec peer2 ${OVPN_CLI} del_peer tun2 2 || true
+
+echo "Deleting keys:"
+for p in $(seq 3 ${NUM_PEERS}); do
+	ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 1
+	ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 2
+done
+
+echo "Setting timeout to 3s MP:"
+for p in $(seq 3 ${NUM_PEERS}); do
+	ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 3 3 || true
+	ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 0 0
+done
+# wait for peers to timeout
+sleep 5
+
+echo "Setting timeout to 3s P2P:"
+for p in $(seq 3 ${NUM_PEERS}); do
+	ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 3 3
+done
+sleep 5
+
+cleanup
+
+modprobe -r ovpn || true
diff --git a/tools/testing/selftests/net/ovpn/udp_peers.txt b/tools/testing/selftests/net/ovpn/udp_peers.txt
new file mode 100644
index 000000000000..e9773ddf875c
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/udp_peers.txt
@@ -0,0 +1,6 @@
+1 10.10.1.1 1 10.10.1.2 1 5.5.5.2
+2 10.10.2.1 1 10.10.2.2 1 5.5.5.3
+3 10.10.3.1 1 10.10.3.2 1 5.5.5.4
+4 fd00:0:0:4::1 1 fd00:0:0:4::2 1 5.5.5.5
+5 fd00:0:0:5::1 1 fd00:0:0:5::2 1 5.5.5.6
+6 fd00:0:0:6::1 1 fd00:0:0:6::2 1 5.5.5.7
diff --git a/tools/testing/selftests/net/packetdrill/Makefile b/tools/testing/selftests/net/packetdrill/Makefile
new file mode 100644
index 000000000000..31cfb666ba8b
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_INCLUDES := ksft_runner.sh \
+		 defaults.sh \
+		 set_sysctls.py \
+		 ../../kselftest/ktap_helpers.sh
+
+TEST_PROGS := $(wildcard *.pkt)
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/packetdrill/config b/tools/testing/selftests/net/packetdrill/config
new file mode 100644
index 000000000000..0237ed98f3c0
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/config
@@ -0,0 +1,11 @@
+CONFIG_IPV6=y
+CONFIG_HZ_1000=y
+CONFIG_HZ=1000
+CONFIG_NET_NS=y
+CONFIG_NET_SCH_FIFO=y
+CONFIG_NET_SCH_FQ=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_SYN_COOKIES=y
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_TCP_MD5SIG=y
+CONFIG_TUN=y
diff --git a/tools/testing/selftests/net/packetdrill/defaults.sh b/tools/testing/selftests/net/packetdrill/defaults.sh
new file mode 100755
index 000000000000..1095a7b22f44
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/defaults.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Set standard production config values that relate to TCP behavior.
+
+# Flush old cached data (fastopen cookies).
+ip tcp_metrics flush all > /dev/null 2>&1
+
+# TCP min, default, and max receive and send buffer sizes.
+sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))"
+sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304"
+
+# TCP timestamps.
+sysctl -q net.ipv4.tcp_timestamps=1
+
+# TCP SYN(ACK) retry thresholds
+sysctl -q net.ipv4.tcp_syn_retries=5
+sysctl -q net.ipv4.tcp_synack_retries=5
+
+# TCP Forward RTO-Recovery, RFC 5682.
+sysctl -q net.ipv4.tcp_frto=2
+
+# TCP Selective Acknowledgements (SACK)
+sysctl -q net.ipv4.tcp_sack=1
+
+# TCP Duplicate Selective Acknowledgements (DSACK)
+sysctl -q net.ipv4.tcp_dsack=1
+
+# TCP FACK (Forward Acknowldgement)
+sysctl -q net.ipv4.tcp_fack=0
+
+# TCP reordering degree ("dupthresh" threshold for entering Fast Recovery).
+sysctl -q net.ipv4.tcp_reordering=3
+
+# TCP congestion control.
+sysctl -q net.ipv4.tcp_congestion_control=cubic
+
+# TCP slow start after idle.
+sysctl -q net.ipv4.tcp_slow_start_after_idle=0
+
+# TCP RACK and TLP.
+sysctl -q net.ipv4.tcp_early_retrans=4 net.ipv4.tcp_recovery=1
+
+# TCP method for deciding when to defer sending to accumulate big TSO packets.
+sysctl -q net.ipv4.tcp_tso_win_divisor=3
+
+# TCP Explicit Congestion Notification (ECN)
+sysctl -q net.ipv4.tcp_ecn=0
+
+sysctl -q net.ipv4.tcp_pacing_ss_ratio=200
+sysctl -q net.ipv4.tcp_pacing_ca_ratio=120
+sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1
+
+sysctl -q net.ipv4.tcp_fastopen=0x70403
+sysctl -q net.ipv4.tcp_fastopen_key=a1a1a1a1-b2b2b2b2-c3c3c3c3-d4d4d4d4
+
+sysctl -q net.ipv4.tcp_syncookies=1
+
+# Override the default qdisc on the tun device.
+# Many tests fail with timing errors if the default
+# is FQ and that paces their flows.
+tc qdisc add dev tun0 root pfifo
+
diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
new file mode 100755
index 000000000000..ef8b25a606d8
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source "$(dirname $(realpath $0))/../../kselftest/ktap_helpers.sh"
+
+readonly ipv4_args=('--ip_version=ipv4 '
+		    '--local_ip=192.168.0.1 '
+		    '--gateway_ip=192.168.0.1 '
+		    '--netmask_ip=255.255.0.0 '
+		    '--remote_ip=192.0.2.1 '
+		    '-D CMSG_LEVEL_IP=SOL_IP '
+		    '-D CMSG_TYPE_RECVERR=IP_RECVERR ')
+
+readonly ipv6_args=('--ip_version=ipv6 '
+		    '--mtu=1520 '
+		    '--local_ip=fd3d:0a0b:17d6::1 '
+		    '--gateway_ip=fd3d:0a0b:17d6:8888::1 '
+		    '--remote_ip=fd3d:fa7b:d17d::1 '
+		    '-D CMSG_LEVEL_IP=SOL_IPV6 '
+		    '-D CMSG_TYPE_RECVERR=IPV6_RECVERR ')
+
+if [ $# -ne 1 ]; then
+	ktap_exit_fail_msg "usage: $0 <script>"
+	exit "$KSFT_FAIL"
+fi
+script="$(basename $1)"
+
+if [ -z "$(which packetdrill)" ]; then
+	ktap_skip_all "packetdrill not found in PATH"
+	exit "$KSFT_SKIP"
+fi
+
+declare -a optargs
+failfunc=ktap_test_fail
+
+if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then
+	optargs+=('--tolerance_usecs=14000')
+
+	# xfail tests that are known flaky with dbg config, not fixable.
+	# still run them for coverage (and expect 100% pass without dbg).
+	declare -ar xfail_list=(
+		"tcp_eor_no-coalesce-retrans.pkt"
+		"tcp_fast_recovery_prr-ss.*.pkt"
+		"tcp_slow_start_slow-start-after-win-update.pkt"
+		"tcp_timestamping.*.pkt"
+		"tcp_user_timeout_user-timeout-probe.pkt"
+		"tcp_zerocopy_epoll_.*.pkt"
+		"tcp_tcp_info_tcp-info-.*-limited.pkt"
+	)
+	readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$"
+	[[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail
+fi
+
+ktap_print_header
+ktap_set_plan 2
+
+unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $script > /dev/null \
+	&& ktap_test_pass "ipv4" || $failfunc "ipv4"
+unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $script > /dev/null \
+	&& ktap_test_pass "ipv6" || $failfunc "ipv6"
+
+ktap_finished
diff --git a/tools/testing/selftests/net/packetdrill/set_sysctls.py b/tools/testing/selftests/net/packetdrill/set_sysctls.py
new file mode 100755
index 000000000000..5ddf456ae973
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/set_sysctls.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""Sets sysctl values and writes a file that restores them.
+
+The arguments are of the form "<proc-file>=<val>" separated by spaces.
+The program first reads the current value of the proc-file and creates
+a shell script named "/tmp/sysctl_restore_${PACKETDRILL_PID}.sh" which
+restores the values when executed. It then sets the new values.
+
+PACKETDRILL_PID is set by packetdrill to the pid of itself, so a .pkt
+file could restore sysctls by running `/tmp/sysctl_restore_${PPID}.sh`
+at the end.
+"""
+
+import os
+import subprocess
+import sys
+
+filename = '/tmp/sysctl_restore_%s.sh' % os.environ['PACKETDRILL_PID']
+
+# Open file for restoring sysctl values
+restore_file = open(filename, 'w')
+print('#!/bin/bash', file=restore_file)
+
+for a in sys.argv[1:]:
+  sysctl = a.split('=')
+  # sysctl[0] contains the proc-file name, sysctl[1] the new value
+
+  # read current value and add restore command to file
+  cur_val = subprocess.check_output(['cat', sysctl[0]], universal_newlines=True)
+  print('echo "%s" > %s' % (cur_val.strip(), sysctl[0]), file=restore_file)
+
+  # set new value
+  cmd = 'echo "%s" > %s' % (sysctl[1], sysctl[0])
+  os.system(cmd)
+
+os.system('chmod u+x %s' % filename)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt
new file mode 100644
index 000000000000..38535701656e
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test for blocking accept.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+   +0...0.200 accept(3, ..., ...) = 4
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+
+  +.1 write(4, ..., 2000) = 2000
+   +0 > P. 1:2001(2000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt
new file mode 100644
index 000000000000..3692ef102381
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test for blocking connect.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+
+  +.1...0.200 connect(3, ..., ...) = 0
+
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+  +.1 < S. 0:0(0) ack 1 win 5792 <mss 1460,nop,wscale 2,nop,nop,sackOK>
+   +0 > . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt
new file mode 100644
index 000000000000..914eabab367a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test for blocking read.
+--tolerance_usecs=10000
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+
+   +0...0.100 read(4, ..., 2000) = 2000
+  +.1 < P. 1:2001(2000) ack 1 win 257
+   +0 > . 1:1(0) ack 2001
+
+  +.1...0.200 read(4, ..., 2000) = 2000
+  +.1 < P. 2001:4001(2000) ack 1 win 257
+   +0 > . 1:1(0) ack 4001
+
+  +.1 < P. 4001:6001(2000) ack 1 win 257
+   +0 > . 1:1(0) ack 6001
+   +0...0.000 read(4, ..., 1000) = 1000
+   +0...0.000 read(4, ..., 1000) = 1000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt
new file mode 100644
index 000000000000..cec5a0725d95
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test for blocking write.
+--tolerance_usecs=10000
+
+`./defaults.sh
+./set_sysctls.py /proc/sys/net/ipv4/tcp_min_tso_segs=10
+`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 50000 <mss 1000,nop,wscale 0>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 50000
+   +0 accept(3, ..., ...) = 4
+
+// Kernel doubles our value -> sk->sk_sndbuf is set to 42000
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF,  [21000], 4) = 0
+   +0 getsockopt(4, SOL_SOCKET, SO_SNDBUF,  [42000], [4]) = 0
+
+// A write of 60000 does not block.
+   +0...0.300 write(4, ..., 61000) = 61000    // this write() blocks
+
+  +.1 < . 1:1(0) ack 10001 win 50000
+
+  +.1 < . 1:1(0) ack 30001 win 50000
+
+// This ACK should wakeup the write(). An ACK of 35001 does not.
+  +.1 < . 1:1(0) ack 36001 win 50000
+
+// Reset to sysctls defaults.
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt
new file mode 100644
index 000000000000..8514d6bdbb6d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test basic connection teardown where local process closes first:
+// the local process calls close() first, so we send a FIN, and receive an ACK.
+// Then we receive a FIN and ACK it.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +.01...0.011 connect(3, ..., ...) = 0
+   +0 >  S 0:0(0) <...>
+   +0 < S. 0:0(0) ack 1 win 32768 <mss 1000,nop,wscale 6,nop,nop,sackOK>
+   +0 >  . 1:1(0) ack 1
+
+   +0 write(3, ..., 1000) = 1000
+   +0 >  P. 1:1001(1000) ack 1
+   +0 < . 1:1(0) ack 1001 win 257
+
+   +0 close(3) = 0
+   +0 >  F. 1001:1001(0) ack 1
+   +0 < . 1:1(0) ack 1002 win 257
+
+   +0 < F. 1:1(0) ack 1002 win 257
+   +0 >  . 1002:1002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt
new file mode 100644
index 000000000000..04103134bd99
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test to make sure no RST is being sent when close()
+// is called on a socket with SYN_SENT state.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+   +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <...>
+
+// Application decideds to close the socket in SYN_SENT state
+// Make sure no RST is sent after close().
+   +0 close(3) = 0
+
+// Receive syn-ack to trigger the send side packet examination:
+// If a RESET were sent right after close(), it would have failed with
+// a mismatched timestamp.
+  +.1 < S. 0:0(0) ack 1 win 32000 <mss 1460,nop,wscale 7>
+   +0 > R 1:1(0)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt
new file mode 100644
index 000000000000..5f3a2914213a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+// Verify behavior for the sequence: remote side sends FIN, then we close().
+// Since the remote side (client) closes first, we test our LAST_ACK code path.
+
+`./defaults.sh`
+
+// Initialize a server socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Client closes first.
+ +.01 < F. 1:1(0) ack 1 win 257
+   +0 > . 1:1(0) ack 2
+
+// App notices that client closed.
+   +0 read(4, ..., 1000) = 0
+
+// Then we close.
+ +.01 close(4) = 0
+   +0 > F. 1:1(0) ack 2
+
+// Client ACKs our FIN.
+ +.01 < . 2:2(0) ack 2 win 257
+
+// Verify that we send RST in response to any incoming segments
+// (because the kernel no longer has any record of this socket).
+ +.01 < . 2:2(0) ack 2 win 257
+   +0 > R 2:2(0)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt
new file mode 100644
index 000000000000..643baf3267cf
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test ECN: verify that Linux TCP ECN sending code uses ECT0 (not ECT1).
+//
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=1  # fully enabled
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+
+// ECN handshake: send EW flags in SYN packet, E flag in SYN-ACK response
++.002 ... 0.004 connect(4, ..., ...) = 0
+
+   +0 > SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.002 < SE. 0:0(0) ack 1 win 32767 <mss 1000,nop,wscale 6,nop,nop,sackOK>
+   +0 > . 1:1(0) ack 1
+
+// Write 1 MSS.
++.002 write(4, ..., 1000) = 1000
+// Send 1 MSS with ect0.
+   +0 > [ect0] P. 1:1001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt
new file mode 100644
index 000000000000..f95b9b3c9fa1
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from consequent writes to the tail
+// skb created for the chunk. The large chunk itself should be packetized as
+// usual.
+`./defaults.sh
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+   +0 write(4, ..., 10400) = 10400
+
+// Write another 10040B chunk with no coalescing options.
+   +0 send(4, ..., 10400, MSG_EOR) = 10400
+
+// Write a 2KB chunk. This chunk should not be appended to the packets created
+// the previous chunk.
+   +0 write(4, ..., 2000) = 2000
+
+   +0 > P. 1:10001(10000) ack 1
++.001 < .  1:1(0) ack 10001 win 514
+// Now we have enough room to send out the 2 x 400B packets out.
+   +0 > P. 10001:20801(10800) ack 1
++.001 < .  1:1(0) ack 20801 win 514
+// This 2KB packet should be sent alone.
+   +0 > P. 20801:22801(2000) ack 1
++.001 < .  1:1(0) ack 22801 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt
new file mode 100644
index 000000000000..2ff66075288e
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from consequent writes to the tail
+// skb created for the chunk. Also, when packets are retransmitted, they
+// will not be coalesce into the same skb.
+`./defaults.sh
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+   +0 write(4, ..., 10400) = 10400
+
+// Write 10 400B chunks with no coalescing options.
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+// This chunk should not be appended to the skbs created for the previous chunk.
+   +0 write(4, ..., 10000) = 10000
+
+   +0 > P. 1:10001(10000) ack 1
++.001 < .  1:1(0) ack 10001 win 514
+// Now we have enough room to send out the 2 x 400B packets out.
+   +0 > P. 10001:10801(800) ack 1
+// The 9 remaining 400B chunks should be sent as individual packets.
+   +0 > P. 10801:11201(400) ack 1
+   +0 > P. 11201:11601(400) ack 1
+   +0 > P. 11601:12001(400) ack 1
+   +0 > P. 12001:12401(400) ack 1
+   +0 > P. 12401:12801(400) ack 1
+   +0 > P. 12801:13201(400) ack 1
+   +0 > P. 13201:13601(400) ack 1
+   +0 > P. 13601:14001(400) ack 1
+   +0 > P. 14001:14401(400) ack 1
+// The last 10KB chunk should be sent separately.
+   +0 > P. 14401:24401(10000) ack 1
+
++.001 < .  1:1(0) ack 10401 win 514
++.001 < .  1:1(0) ack 10801 win 514
++.001 < .  1:1(0) ack 11201 win 514
++.001 < .  1:1(0) ack 11601 win 514
++.001 < .  1:1(0) ack 12001 win 514 <sack 13201:14401,nop,nop>
+// TCP should fill the hole but no coalescing should happen, and all
+// retransmissions should be sent out as individual packets.
+
+// Note : This is timeout based retransmit.
+// Do not put +0 here or flakes will come back.
++.004~+.008 > P. 12001:12401(400) ack 1
+
++.001 < .  1:1(0) ack 12401 win 514 <sack 13201:14401,nop,nop>
+   +0 > P. 12401:12801(400) ack 1
+   +0 > P. 12801:13201(400) ack 1
++.001 < .  1:1(0) ack 12801 win 514 <sack 13201:14401,nop,nop>
++.001 < .  1:1(0) ack 14401 win 514
++.001 < .  1:1(0) ack 24401 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt
new file mode 100644
index 000000000000..77039c5aac39
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from consequent writes to the tail
+// skb created for the chunk.
+`./defaults.sh
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+   +0 write(4, ..., 10400) = 10400
+
+// Write a 400B chunk with no coalescing options.
+   +0 send(4, ..., 400, MSG_EOR) = 400
+
+// This chunk should not be appended to the skbs created for the previous chunk.
+   +0 write(4, ..., 10000) = 10000
+
+   +0 > P. 1:10001(10000) ack 1
++.001 < .  1:1(0) ack 10001 win 514
+// Now we have enough room to send out the 2 x 400B packets out.
+   +0 > P. 10001:10801(800) ack 1
+   +0 > P. 10801:20801(10000) ack 1
++.001 < .  1:1(0) ack 10401 win 514
++.001 < .  1:1(0) ack 10801 win 514
++.001 < .  1:1(0) ack 20801 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt
new file mode 100644
index 000000000000..dd5a06250595
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from consequent writes to the tail
+// skb created for the chunk even though we have 10 back-to-back small
+// writes.
+`./defaults.sh
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+   +0 write(4, ..., 10400) = 10400
+
+// Write 10 400B chunks with no coalescing options.
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+   +0 send(4, ..., 400, MSG_EOR) = 400
+// This chunk should not be appended to the skbs created for the previous chunk.
+   +0 write(4, ..., 10000) = 10000
+
+   +0 > P. 1:10001(10000) ack 1
++.001 < .  1:1(0) ack 10001 win 514
+// Now we have enough room to send out the 2 x 400B packets out.
+   +0 > P. 10001:10801(800) ack 1
+// The 9 remaining 400B chunks should be sent as individual packets.
+   +0 > P. 10801:11201(400) ack 1
+   +0 > P. 11201:11601(400) ack 1
+   +0 > P. 11601:12001(400) ack 1
+   +0 > P. 12001:12401(400) ack 1
+   +0 > P. 12401:12801(400) ack 1
+   +0 > P. 12801:13201(400) ack 1
+   +0 > P. 13201:13601(400) ack 1
+   +0 > P. 13601:14001(400) ack 1
+   +0 > P. 14001:14401(400) ack 1
+// The last 10KB chunk should be sent separately.
+   +0 > P. 14401:24401(10000) ack 1
+
++.001 < .  1:1(0) ack 10401 win 514
++.001 < .  1:1(0) ack 10801 win 514
++.001 < .  1:1(0) ack 11201 win 514
++.001 < .  1:1(0) ack 11601 win 514
++.001 < .  1:1(0) ack 12001 win 514
++.001 < .  1:1(0) ack 12401 win 514
++.001 < .  1:1(0) ack 12801 win 514
++.001 < .  1:1(0) ack 13201 win 514
++.001 < .  1:1(0) ack 13601 win 514
++.001 < .  1:1(0) ack 14001 win 514
++.001 < .  1:1(0) ack 14401 win 514
++.001 < .  1:1(0) ack 24401 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt
new file mode 100644
index 000000000000..0d3c8077e830
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation.
+// In this variant we test a simple case where in-flight == ssthresh
+// all the way through recovery, so during fast recovery we send one segment
+// for each segment SACKed/ACKed.
+
+// Set up config.
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// RTT 100ms
+  +.1 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+// Lost packet 1:1001.
+ +.11 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:4001,nop,nop>
+// Enter fast recovery.
+   +0 > . 1:1001(1000) ack 1
+ +.01 %{
+assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state
+assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd
+assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh
+}%
+
+// Write some more, which we will send 1 MSS at a time,
+// as in-flight segments are SACKed or ACKed.
+ +.01 write(4, ..., 7000) = 7000
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:5001,nop,nop>
+   +0 > . 10001:11001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:6001,nop,nop>
+   +0 > . 11001:12001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:7001,nop,nop>
+   +0 > . 12001:13001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:8001,nop,nop>
+   +0 > . 13001:14001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:9001,nop,nop>
+   +0 > . 14001:15001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:10001,nop,nop>
+   +0 > . 15001:16001(1000) ack 1
+
+ +.02 < . 1:1(0) ack 10001 win 320
+   +0 > P. 16001:17001(1000) ack 1
+// Leave fast recovery.
+ +.01 %{
+assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state
+assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd
+assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh
+}%
+
+ +.03 < . 1:1(0) ack 12001 win 320
+ +.02 < . 1:1(0) ack 14001 win 320
+ +.02 < . 1:1(0) ack 16001 win 320
+ +.02 < . 1:1(0) ack 17001 win 320
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt
new file mode 100644
index 000000000000..7842a10b6967
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation. The sender sends 20 packets. Packet
+// 1 to 4, and 11 to 16 are dropped.
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+ +.01 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Write 20 data segments.
+   +0 write(4, ..., 20000) = 20000
+   +0 > P. 1:10001(10000) ack 1
+
+// Receive first DUPACK, entering PRR part
+ +.01 < . 1:1(0) ack 1 win 320 <sack 4001:5001,nop,nop>
+   +0 > . 10001:11001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:6001,nop,nop>
+   +0 > . 11001:12001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:7001,nop,nop>
+   +0 > . 1:1001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:8001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:9001,nop,nop>
+   +0 > . 2001:3001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:10001,nop,nop>
+   +0 > . 3001:4001(1000) ack 1
+// Enter PRR CRB
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:11001,nop,nop>
+   +0 > . 12001:13001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:12001,nop,nop>
+   +0 > . 13001:14001(1000) ack 1
+// Enter PRR slow start
+ +.01 < . 1:1(0) ack 1001 win 320 <sack 4001:12001,nop,nop>
+   +0 > P. 14001:16001(2000) ack 1
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:12001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
+   +0 > . 16001:17001(1000) ack 1
+// inflight reaches ssthresh, goes into packet conservation mode
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:13001,nop,nop>
+   +0 > . 17001:18001(1000) ack 1
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:14001,nop,nop>
+   +0 > . 18001:19001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt
new file mode 100644
index 000000000000..b66d7644c3b6
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation. The sender sends 20 packets. Packet
+// 1 to 4 are lost. The sender writes another 10 packets.
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+ +.01 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Send 20 data segments.
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+// Lost packet 1,2,3,4
+ +.01 < . 1:1(0) ack 1 win 320 <sack 4001:5001,nop,nop>
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:6001,nop,nop>
+   +0 < . 1:1(0) ack 1 win 320 <sack 4001:7001,nop,nop>
+   +0 > . 1:1001(1000) ack 1
+   +0 < . 1:1(0) ack 1 win 320 <sack 4001:8001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
+   +0 < . 1:1(0) ack 1 win 320 <sack 4001:9001,nop,nop>
+   +0 > . 2001:3001(1000) ack 1
+   +0 < . 1:1(0) ack 1 win 320 <sack 4001:10001,nop,nop>
+   +0 > . 3001:4001(1000) ack 1
+
+// Receiver ACKs all data.
+ +.01 < . 1:1(0) ack 1001 win 320 <sack 4001:10001,nop,nop>
+   +0 < . 1:1(0) ack 2001 win 320 <sack 4001:10001,nop,nop>
+   +0 < . 1:1(0) ack 3001 win 320 <sack 4001:10001,nop,nop>
+   +0 < . 1:1(0) ack 10001 win 320
+
+// Writes another 10 packets, which the ssthresh*mss amount
+// should be sent right away
+ +.01 write(4, ..., 10000) = 10000
+   +0 > . 10001:17001(7000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt
new file mode 100644
index 000000000000..8e87bfecabb5
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation.
+// In this variant we verify that the sender uses SACK info on an ACK
+// below snd_una.
+
+// Set up config.
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 8>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// RTT 10ms
+ +.01 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+// Lost packet 1:1001,4001:5001,7001:8001.
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+   +0 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+   +0 < . 1:1(0) ack 1 win 320 <sack 1001:3001 8001:9001,nop,nop>
+   +0 > . 1:1001(1000) ack 1
+
++.012 < . 1:1(0) ack 4001 win 320 <sack 8001:9001,nop,nop>
+   +0 > . 4001:7001(3000) ack 1
+
+   +0 write(4, ..., 10000) = 10000
+
+// The following ACK was reordered - delayed so that it arrives with
+// an ACK field below snd_una. Here we check that the newly-SACKed
+// 2MSS at 5001:7001 cause us to send out 2 more MSS.
++.002 < . 1:1(0) ack 3001 win 320 <sack 5001:7001,nop,nop>
+   +0 > . 7001:8001(1000) ack 1
+   +0 > . 10001:11001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt
new file mode 100644
index 000000000000..df49c67645ac
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP_INQ and TCP_CM_INQ on the client side.
+`./defaults.sh
+`
+
+// Create a socket and set it to non-blocking.
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0	fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Connect to the server and enable TCP_INQ.
+   +0	connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0	setsockopt(3, SOL_TCP, TCP_INQ, [1], 4) = 0
+
+   +0	> S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+ +.01	< S. 0:0(0) ack 1 win 5792 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 7>
+   +0	> . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700>
+
+// Now we have 10K of data ready on the socket.
+   +0	< . 1:10001(10000) ack 1 win 514
+   +0	> . 1:1(0) ack 10001 <nop,nop,TS val 200 ecr 700>
+
+// We read 1K and we should have 9K ready to read.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 1000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=9000}]}, 0) = 1000
+// We read 9K and we should have no further data ready to read.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 9000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=0}]}, 0) = 9000
+
+// Server sends more data and closes the connections.
+   +0	< F. 10001:20001(10000) ack 1 win 514
+   +0	> . 1:1(0) ack 20002 <nop,nop,TS val 200 ecr 700>
+
+// We read 10K and we should have one "fake" byte because the connection is
+// closed.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 10000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=1}]}, 0) = 10000
+// Now, receive EOF.
+   +0	read(3, ..., 2000) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt
new file mode 100644
index 000000000000..04a5e2590c62
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP_INQ and TCP_CM_INQ on the server side.
+`./defaults.sh
+`
+
+// Initialize connection
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0	bind(3, ..., ...) = 0
+   +0	listen(3, 1) = 0
+
+   +0	< S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0	> S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01	< . 1:1(0) ack 1 win 514
+
+// Accept the connection and enable TCP_INQ.
+   +0	accept(3, ..., ...) = 4
+   +0	setsockopt(4, SOL_TCP, TCP_INQ, [1], 4) = 0
+
+// Now we have 10K of data ready on the socket.
+   +0	< . 1:10001(10000) ack 1 win 514
+   +0	> . 1:1(0) ack 10001
+
+// We read 2K and we should have 8K ready to read.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 2000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=8000}]}, 0) = 2000
+// We read 8K and we should have no further data ready to read.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 8000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=0}]}, 0) = 8000
+// Client sends more data and closes the connections.
+   +0	< F. 10001:20001(10000) ack 1 win 514
+   +0	> . 1:1(0) ack 20002
+
+// We read 10K and we should have one "fake" byte because the connection is
+// closed.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{..., 10000}],
+		    msg_flags=0,
+		    msg_control=[{cmsg_level=SOL_TCP,
+				  cmsg_type=TCP_CM_INQ,
+				  cmsg_data=1}]}, 0) = 10000
+// Now, receive error.
+   +0	read(3, ..., 2000) = -1 ENOTCONN (Transport endpoint is not connected)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt
new file mode 100644
index 000000000000..96b01eb5b7a4
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test RFC 3042 "Limited Transmit": "sending a new data segment in
+// response to each of the first two duplicate acknowledgments that
+// arrive at the sender".
+// This variation tests a receiver that doesn't support SACK.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Write some data, and send the initial congestion window.
+   +0 write(4, ..., 15000) = 15000
+   +0 > P. 1:10001(10000) ack 1
+
+// Limited transmit: on first dupack, send a new data segment.
+ +.11 < . 1:1(0) ack 1 win 320
+   +0 > . 10001:11001(1000) ack 1
+
+// Limited transmit: on second dupack, send a new data segment.
+ +.01 < . 1:1(0) ack 1 win 320
+   +0 > . 11001:12001(1000) ack 1
+
+// It turned out to be reordering, not loss.
+// We have one packet newly acked (1001:3001 were DUP-ACK'd)
+// So we revert state back to Open. Slow start cwnd from 10 to 11
+// and send 11 - 9 = 2 packets
+ +.01 < . 1:1(0) ack 3001 win 320
+   +0 > P. 12001:14001(2000) ack 1
+
+ +.02 < . 1:1(0) ack 5001 win 320
+   +0 > P. 14001:15001(1000) ack 1
+
+// Client gradually ACKs all data.
+ +.02 < . 1:1(0) ack 7001 win 320
+ +.02 < . 1:1(0) ack 9001 win 320
+ +.02 < . 1:1(0) ack 11001 win 320
+ +.02 < . 1:1(0) ack 13001 win 320
+ +.02 < . 1:1(0) ack 15001 win 320
+
+// Clean up.
+ +.17 close(4) = 0
+   +0 > F. 15001:15001(0) ack 1
+  +.1 < F. 1:1(0) ack 15002 win 257
+   +0 > . 15002:15002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt
new file mode 100644
index 000000000000..642da51ec3a4
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test RFC 3042 "Limited Transmit": "sending a new data segment in
+// response to each of the first two duplicate acknowledgments that
+// arrive at the sender".
+// This variation tests a receiver that supports SACK.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 320
+   +0 accept(3, ..., ...) = 4
+
+// Write some data, and send the initial congestion window.
+   +0 write(4, ..., 15000) = 15000
+   +0 > P. 1:10001(10000) ack 1
+
+// Limited transmit: on first dupack, send a new data segment.
+ +.11 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+   +0 > . 10001:11001(1000) ack 1
+
+// Limited transmit: on second dupack, send a new data segment.
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+   +0 > . 11001:12001(1000) ack 1
+
+// It turned out to be reordering, not loss.
+ +.01 < . 1:1(0) ack 3001 win 320
+   +0 > P. 12001:14001(2000) ack 1
+
+ +.02 < . 1:1(0) ack 5001 win 320
+   +0 > P. 14001:15001(1000) ack 1
+
+// Client gradually ACKs all data.
+ +.02 < . 1:1(0) ack 7001 win 320
+ +.02 < . 1:1(0) ack 9001 win 320
+ +.02 < . 1:1(0) ack 11001 win 320
+ +.02 < . 1:1(0) ack 13001 win 320
+ +.02 < . 1:1(0) ack 15001 win 320
+
+// Clean up.
+ +.17 close(4) = 0
+   +0 > F. 15001:15001(0) ack 1
+  +.1 < F. 1:1(0) ack 15002 win 257
+   +0 > . 15002:15002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt
new file mode 100644
index 000000000000..25dfef95d3f8
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test what happens when client does not provide MD5 on SYN,
+// but then does on the ACK that completes the three-way handshake.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// Ooh, weird: client provides MD5 option on the ACK:
+ +.01 < . 1:1(0) ack 1 win 514 <md5 000102030405060708090a0b0c0d0e0f,nop,nop>
+ +.01 < . 1:1(0) ack 1 win 514 <md5 000102030405060708090a0b0c0d0e0f,nop,nop>
+
+// The TCP listener refcount should be 2, but on buggy kernels it can be 0:
+   +0 `grep " 0A " /proc/net/tcp /proc/net/tcp6 | grep ":1F90"`
+
+// Now here comes the legit ACK:
+ +.01 < . 1:1(0) ack 1 win 514
+
+// Make sure the connection is OK:
+   +0 accept(3, ..., ...) = 4
+
+ +.01 write(4, ..., 1000) = 1000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt
new file mode 100644
index 000000000000..7adae7a9ef4a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+// This is a test inspired by an Android client app using SSL. This
+// test verifies using TCP_NODELAY would save application latency
+// (Perhaps even better with TCP_NAGLE).
+//
+`./defaults.sh
+ethtool -K tun0 tso off gso off
+./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+   +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+
+   +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < S. 0:0(0) ack 1 win 5792 <mss 974,nop,nop,sackOK,nop,wscale 7>
+   +0 > . 1:1(0) ack 1
+
+// SSL handshake (resumed session)
+   +0 write(4, ..., 517) = 517
+   +0 > P. 1:518(517) ack 1
+  +.1 < . 1:1(0) ack 518 win 229
+
+   +0 < P. 1:144(143) ack 1 win 229
+   +0 > . 518:518(0) ack 144
+   +0 read(4, ..., 1000) = 143
+
+// Application POST header (51B) and body (2002B)
+   +0 write(4, ..., 51) = 51
+   +0 > P. 518:569(51) ack 144
+ +.03 write(4, ..., 2002) = 2002
+   +0 > . 569:1543(974) ack 144
+   +0 > P. 1543:2517(974) ack 144
+// Without disabling Nagle, this packet will not happen until the remote ACK.
+   +0 > P. 2517:2571(54) ack 144
+
+  +.1 < . 1:1(0) ack 2571 win 229
+
+// Reset sysctls
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt
new file mode 100644
index 000000000000..fa9c01813996
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test the MSG_MORE flag will correctly corks the tiny writes
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+// Disable Nagle by default on this socket.
+   +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+
+// Test the basic case: MSG_MORE overwrites TCP_NODELAY and enables Nagle.
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 40}], msg_flags=0}, MSG_MORE) = 40
+ +.21~+.215 > P. 1:41(40) ack 1
+ +.01 < . 1:1(0) ack 41 win 257
+
+// Test unsetting MSG_MORE releases the packet
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 100}], msg_flags=0}, MSG_MORE) = 100
++.005 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 160}], msg_flags=0}, MSG_MORE) = 160
+ +.01 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(3)=[{..., 100}, {..., 200}, {..., 195}],
+		  msg_flags=0}, MSG_MORE) = 495
++.008 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 5}], msg_flags=0}, 0) = 5
+   +0 > P. 41:801(760) ack 1
+ +.02 < . 1:1(0) ack 801 win 257
+
+
+// Test >MSS write will unleash MSS packets but hold on the remaining data.
+  +.1 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 3100}], msg_flags=0}, MSG_MORE) = 3100
+   +0 > . 801:3801(3000) ack 1
++.003 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 50}], msg_flags=0}, MSG_MORE) = 50
+
+ +.01 < . 1:1(0) ack 2801 win 257
+// Err... we relase the remaining right after the ACK? note that PUSH is reset
+   +0 > . 3801:3951(150) ack 1
+
+// Test we'll hold on the subsequent writes when inflight (3801:3951) > 0
++.001 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 1}], msg_flags=0}, MSG_MORE) = 1
++.002 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 2}], msg_flags=0}, MSG_MORE) = 2
++.003 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 3}], msg_flags=0}, MSG_MORE) = 3
++.004 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 4}], msg_flags=0}, MSG_MORE) = 4
+ +.02 < . 1:1(0) ack 3951 win 257
+   +0 > . 3951:3961(10) ack 1
+ +.02 < . 1:1(0) ack 3961 win 257
+
+
+// Test the case a MSG_MORE send followed by a write flushes the data
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{..., 20}], msg_flags=0}, MSG_MORE) = 20
+ +.05 write(4, ..., 20) = 20
+   +0 > P. 3961:4001(40) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt
new file mode 100644
index 000000000000..0ddec5f7dc1a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP_CORK and TCP_NODELAY sockopt behavior
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+// Set TCP_CORK sockopt to hold small packets
+   +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0
+
+   +0 write(4, ..., 40) = 40
+ +.05 write(4, ..., 40) = 40
+
+// Unset TCP_CORK should push pending bytes out
+ +.01 setsockopt(4, SOL_TCP, TCP_CORK, [0], 4) = 0
+   +0 > P. 1:81(80) ack 1
+ +.01 < . 1:1(0) ack 81 win 257
+
+// Set TCP_CORK sockopt to hold small packets
+   +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0
+
+   +0 write(4, ..., 40) = 40
+ +.05 write(4, ..., 40) = 40
+
+// Set TCP_NODELAY sockopt should push pending bytes out
+   +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+   +0 > P. 81:161(80) ack 1
+ +.01 < . 1:1(0) ack 161 win 257
+
+// Set MSG_MORE to hold small packets
+   +0 send(4, ..., 40, MSG_MORE) = 40
+ +.05 send(4, ..., 40, MSG_MORE) = 40
+
+// Set TCP_NODELAY sockopt should push pending bytes out
+ +.01 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+   +0 > . 161:241(80) ack 1
+ +.01 < . 1:1(0) ack 241 win 257
diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt
new file mode 100644
index 000000000000..310ef31518da
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+// Verify that setsockopt calls that force a route refresh do not
+// cause problems matching SACKs with packets in the write queue.
+// This variant tests IP_TOS.
+
+`./defaults.sh`
+
+// Establish a connection.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_IP, IP_MTU_DISCOVER, [IP_PMTUDISC_DONT], 1) = 0
+   +0...0.010 connect(3, ..., ...) = 0
+
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+ +.01 < S. 0:0(0) ack 1 win 65535 <mss 1460,nop,wscale 2,nop,nop,sackOK>
+   +0 > . 1:1(0) ack 1
+
+ +.01 write(3, ..., 5840) = 5840
+   +0 > P. 1:5841(5840) ack 1
+ +.01 < . 1:1(0) ack 5841 win 65535
+
+ +.01 write(3, ..., 5840) = 5840
+   +0 > P. 5841:11681(5840) ack 1
+ +.01 < . 1:1(0) ack 11681 win 65535
+
+ +.01 write(3, ..., 14600) = 14600
+   +0 > P. 11681:26281(14600) ack 1
+
+// Try the socket option that we know can force a route refresh.
+   +0 setsockopt(3, SOL_IP, IP_TOS, [4], 1) = 0
+// Then revert to avoid routing/mangling/etc implications of that setting.
+   +0 setsockopt(3, SOL_IP, IP_TOS, [0], 1) = 0
+
+// Verify that we do not retransmit the SACKed segments.
+ +.01 < . 1:1(0) ack 13141 win 65535 <sack 16061:17521 20441:26281,nop,nop>
+   +0 > . 13141:16061(2920) ack 1
+   +0 > P. 17521:20441(2920) ack 1
+ +.01 < . 1:1(0) ack 26281 win 65535
diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt
new file mode 100644
index 000000000000..f185e1ac57ea
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb.
+// This variant tests non-FACK SACK with SACKs coming in the order
+// 2 6 8 3 9, to test what happens when we get a new SACKed range
+// (for packet 3) that is on the right of an existing SACKed range
+// (for packet 2).
+
+`./defaults.sh`
+
+// Establish a connection and send 10 MSS.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 1024
+   +0 accept(3, ..., ...) = 4
+
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+  +.1 < . 1:1(0) ack 1 win 257 <sack 2001:3001,nop,nop>
++.001 < . 1:1(0) ack 1 win 257 <sack 2001:3001 6001:7001,nop,nop>
++.001 < . 1:1(0) ack 1 win 257 <sack 2001:3001 6001:7001 8001:9001,nop,nop>
+
+// 3 SACKed packets, so we enter Fast Recovery.
+   +0 > . 1:1001(1000) ack 1
+   +0 %{ assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state }%
+   +0 %{ assert tcpi_lost == 6, tcpi_lost }%
+
+// SACK for 3001:4001.
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
++.007 < . 1:1(0) ack 1 win 257 <sack 2001:4001 6001:7001 8001:9001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
+   +0 %{ assert tcpi_lost == 5, tcpi_lost }%
+   +0 %{ assert tcpi_reordering == 6, tcpi_reordering }%   // 8001:9001 -> 3001:4001 is 6
+
+// SACK for 9001:10001.
+ +.01 < . 1:1(0) ack 1 win 257 <sack 2001:4001 6001:7001 8001:10001,nop,nop>
+   +0 %{ assert tcpi_lost == 5, tcpi_lost }%
+
+// ACK for 1:1001 as packets from t=0.303 arrive.
++.083 < . 1:1(0) ack 1001 win 257 <sack 2001:4001 6001:7001 8001:10001,nop,nop>
+   +0 %{ assert tcpi_lost == 4,tcpi_lost }%
+
+// ACK for 1:4001 as packets from t=0.310 arrive.
++.017 < . 1:1(0) ack 4001 win 257 <sack 6001:7001 8001:10001,nop,nop>
+   +0 %{ assert tcpi_lost == 3,tcpi_lost }%
+
+// ACK for 1:7001 as packets from t=0.320 arrive.
+ +.01 < . 1:1(0) ack 7001 win 257 <sack 8001:10001,nop,nop>
+
+// ACK for all data as packets from t=0.403 arrive.
+  +.1 < . 1:1(0) ack 10001 win 257
+   +0 %{
+assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state
+assert tcpi_unacked == 0, tcpi_unacked
+assert tcpi_sacked == 0, tcpi_sacked
+assert tcpi_lost == 0, tcpi_lost
+assert tcpi_retrans == 0, tcpi_retrans
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt
new file mode 100644
index 000000000000..0093b4973934
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb.
+// This variant tests the case where we mark packets 0-4 lost, then
+// get a SACK for 3, and then a SACK for 4.
+
+`./defaults.sh`
+
+// Establish a connection and send 10 MSS.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 1024
+   +0 accept(3, ..., ...) = 4
+
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+// SACK for 7001:8001. Using RACK we delay the fast retransmit.
+  +.1 < . 1:1(0) ack 1 win 257 <sack 7001:8001,nop,nop>
+// RACK reordering timer
++.027 > . 1:1001(1000) ack 1
+   +0 %{
+assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state
+assert tcpi_lost == 7, tcpi_lost  # RACK thinks 1:7001 are lost
+assert tcpi_reordering == 3, tcpi_reordering
+}%
+
+// SACK for 3001:4001.
++.002 < . 1:1(0) ack 1 win 257 <sack 3001:4001 7001:8001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
+   +0 %{
+assert tcpi_lost == 6, tcpi_lost              # since 3001:4001 is no longer lost
+assert tcpi_reordering == 5, tcpi_reordering  # 7001:8001 -> 3001:4001
+}%
+
+// SACK for 4001:5001.
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
+// It uses the RFC3517 algorithm to mark 1:3001 lost
+// because >=3 higher-sequence packets are SACKed.
++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:8001,nop,nop>
+   +0 > . 2001:3001(1000) ack 1
+   +0 %{
+assert tcpi_lost == 5,tcpi_lost         # SACK/RFC3517 thinks 1:3001 are lost
+}%
+
+// SACK for 8001:9001.
++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:9001,nop,nop>
+
+// SACK for 9001:10001.
++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:10001,nop,nop>
+   +0 > . 5001:6001(1000) ack 1
+
+// To simplify clean-up, say we get an ACK for all data.
+  +.1 < . 1:1(0) ack 10001 win 257
+   +0 %{
+assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state
+assert tcpi_unacked == 0, tcpi_unacked
+assert tcpi_sacked == 0, tcpi_sacked
+assert tcpi_lost == 0, tcpi_lost
+assert tcpi_retrans == 0, tcpi_retrans
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt
new file mode 100644
index 000000000000..980a832dc81c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb.
+// This variant tests the case where we mark packets 0-4 lost, then
+// get a SACK for 5, and then a SACK for 6.
+
+`./defaults.sh`
+
+// Establish a connection and send 10 MSS.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 1024
+   +0 accept(3, ..., ...) = 4
+
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+
+// SACK for 7001:8001. Using RACK we delay a fast retransmit.
+  +.1 < . 1:1(0) ack 1 win 257 <sack 7001:8001,nop,nop>
++.027 > . 1:1001(1000) ack 1
+   +0 %{
+assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state
+assert tcpi_lost == 7,tcpi_lost         # RACK thinks 1:7001 are lost
+assert tcpi_reordering == 3, tcpi_reordering
+}%
+
+// SACK for 5001:6001.
+   +0 < . 1:1(0) ack 1 win 257 <sack 5001:6001 7001:8001,nop,nop>
+   +0 > . 1001:2001(1000) ack 1
+   +0 %{
+assert tcpi_lost == 6, tcpi_lost
+assert tcpi_reordering == 3, tcpi_reordering  # 7001:8001 -> 5001:6001 is 3
+}%
+
+// SACK for 6001:7001.
+// This SACK for an adjacent range causes the sender to
+// shift the newly-SACKed range onto the previous skb.
+   +0 < . 1:1(0) ack 1 win 257 <sack 5001:8001,nop,nop>
+   +0 > . 2001:3001(1000) ack 1
+   +0 %{ assert tcpi_lost == 5, tcpi_lost }%
+
+// SACK for 8001:9001.
+   +0 < . 1:1(0) ack 1 win 257 <sack 5001:9001,nop,nop>
+   +0 > . 3001:4001(1000) ack 1
+
+// SACK for 9001:10001.
+   +0 < . 1:1(0) ack 1 win 257 <sack 5001:10001,nop,nop>
+   +0 > . 4001:5001(1000) ack 1
+
+// To simplify clean-up, say we get an ACK for all data.
+  +.1 < . 1:1(0) ack 10001 win 257
+   +0 %{
+assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state
+assert tcpi_unacked == 0, tcpi_unacked
+assert tcpi_sacked == 0, tcpi_sacked
+assert tcpi_lost == 0, tcpi_lost
+assert tcpi_retrans == 0, tcpi_retrans
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt
new file mode 100644
index 000000000000..6740859a1360
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+// Simplest possible test of open() and then sendfile().
+// We write some zeroes into a file (since packetdrill expects payloads
+// to be all zeroes) and then open() the file, then use sendfile()
+// and verify that the correct number of zeroes goes out.
+
+`./defaults.sh
+/bin/rm -f /tmp/testfile
+/bin/dd bs=1 count=5 if=/dev/zero of=/tmp/testfile status=none
+`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 open("/tmp/testfile", O_RDONLY) = 5
+   +0 sendfile(4, 5, [0], 5) = 5
+   +0 > P. 1:6(5) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt
new file mode 100644
index 000000000000..795c476d222d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-1pkt.pkt
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when not application-limited, so that
+// the cwnd continues to grow.
+// In this variant, the receiver ACKs every packet.
+
+// Set up config. To keep things simple, disable the
+// mechanism that defers sending in order to send bigger TSO packets.
+`./defaults.sh
+sysctl -q net.ipv4.tcp_tso_win_divisor=100`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 30000) = 30000
+   +0 > P. 1:10001(10000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
++.105 < . 1:1(0) ack 1001 win 257
+   +0 > P. 10001:12001(2000) ack 1
+
+   +0 < . 1:1(0) ack 2001 win 257
+   +0 > P. 12001:14001(2000) ack 1
+
++.005 < . 1:1(0) ack 3001 win 257
+   +0 > P. 14001:16001(2000) ack 1
+
+   +0 < . 1:1(0) ack 4001 win 257
+   +0 > P. 16001:18001(2000) ack 1
+
++.005 < . 1:1(0) ack 5001 win 257
+   +0 > P. 18001:20001(2000) ack 1
+
+   +0 < . 1:1(0) ack 6001 win 257
+   +0 > P. 20001:22001(2000) ack 1
+
++.005 < . 1:1(0) ack 7001 win 257
+   +0 > P. 22001:24001(2000) ack 1
+
+   +0 < . 1:1(0) ack 8001 win 257
+   +0 > P. 24001:26001(2000) ack 1
+
++.005 < . 1:1(0) ack 9001 win 257
+   +0 > P. 26001:28001(2000) ack 1
+
+   +0 < . 1:1(0) ack 10001 win 257
+   +0 > P. 28001:30001(2000) ack 1
+
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt
new file mode 100644
index 000000000000..9212ae1fd0f2
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-5pkt.pkt
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when an outstanding flight of packets is
+// less than the current cwnd, and not big enough to bump up cwnd.
+//
+// In this variant, the receiver ACKs every other packet,
+// approximating standard delayed ACKs.
+
+// Set up config.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+
+// Only send 5 packets.
+   +0 write(4, ..., 5000) = 5000
+   +0 > P. 1:5001(5000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 2001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 4001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 5001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 10, 'cwnd=%d' % tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt
new file mode 100644
index 000000000000..416c901ddf51
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt-send-6pkt.pkt
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when an outstanding flight of packets is
+// less than the current cwnd, but still big enough that in slow
+// start we want to increase our cwnd a little.
+//
+// In this variant, the receiver ACKs every other packet,
+// approximating standard delayed ACKs.
+
+// Set up config.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+
+// Only send 6 packets.
+   +0 write(4, ..., 6000) = 6000
+   +0 > P. 1:6001(6000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 2001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 4001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }%
+
+   +0 < . 1:1(0) ack 6001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 12, 'cwnd=%d' % tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt
new file mode 100644
index 000000000000..a894b7d4559c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-2pkt.pkt
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when not application-limited, so that
+// the cwnd continues to grow.
+// In this variant, the receiver ACKs every other packet,
+// approximating standard delayed ACKs.
+
+// Set up config. To keep things simple, disable the
+// mechanism that defers sending in order to send bigger TSO packets.
+`./defaults.sh
+sysctl -q net.ipv4.tcp_tso_win_divisor=100`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 30000) = 30000
+   +0 > P. 1:10001(10000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
++.105 < . 1:1(0) ack 2001 win 257
+   +0 > P. 10001:14001(4000) ack 1
+
++.005 < . 1:1(0) ack 4001 win 257
+   +0 > P. 14001:18001(4000) ack 1
+
++.005 < . 1:1(0) ack 6001 win 257
+   +0 > P. 18001:22001(4000) ack 1
+
++.005 < . 1:1(0) ack 8001 win 257
+   +0 > P. 22001:26001(4000) ack 1
+
++.005 < . 1:1(0) ack 10001 win 257
+   +0 > P. 26001:30001(4000) ack 1
+
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt
new file mode 100644
index 000000000000..065fae9e9abd
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-ack-per-4pkt.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when not application-limited, so that
+// the cwnd continues to grow.
+// In this variant, the receiver sends one ACK per 4 packets.
+
+// Set up config. To keep things simple, disable the
+// mechanism that defers sending in order to send bigger TSO packets.
+`./defaults.sh
+sysctl -q net.ipv4.tcp_tso_win_divisor=100`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 30000) = 30000
+   +0 > P. 1:10001(10000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+ +.11 < . 1:1(0) ack 4001 win 257
+   +0 > P. 10001:18001(8000) ack 1
+
+ +.01 < . 1:1(0) ack 8001 win 257
+   +0 > P. 18001:26001(8000) ack 1
+
++.005 < . 1:1(0) ack 10001 win 257
+   +0 > P. 26001:30001(4000) ack 1
+
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt
new file mode 100644
index 000000000000..11b213be1138
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-idle.pkt
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start after idle
+// This test expects tso size to be at least initial cwnd * mss
+
+`./defaults.sh
+./set_sysctls.py /proc/sys/net/ipv4/tcp_slow_start_after_idle=1 \
+		 /proc/sys/net/ipv4/tcp_min_tso_segs=10`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 511
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 26000) = 26000
+   +0 > P. 1:5001(5000) ack 1
+   +0 > P. 5001:10001(5000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+  +.1 < . 1:1(0) ack 10001 win 511
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
+   +0 > P. 10001:20001(10000) ack 1
+   +0 > P. 20001:26001(6000) ack 1
+
+  +.1 < . 1:1(0) ack 26001 win 511
+   +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%
+
+   +2 write(4, ..., 20000) = 20000
+// If slow start after idle works properly, we should send 5 MSS here (cwnd/2)
+   +0 > P. 26001:31001(5000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+// Reset sysctls
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt
new file mode 100644
index 000000000000..577ed8c8852c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-after-win-update.pkt
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start after window update
+// This test expects tso size to be at least initial cwnd * mss
+
+`./defaults.sh
+./set_sysctls.py /proc/sys/net/ipv4/tcp_slow_start_after_idle=1 \
+		 /proc/sys/net/ipv4/tcp_min_tso_segs=10`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 511
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 26000) = 26000
+   +0 > P. 1:5001(5000) ack 1
+   +0 > P. 5001:10001(5000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
+  +.1 < . 1:1(0) ack 10001 win 511
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
+   +0 > P. 10001:20001(10000) ack 1
+   +0 > P. 20001:26001(6000) ack 1
+
+  +.1 < . 1:1(0) ack 26001 win 0
+   +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%
+
+   +0 write(4, ..., 20000) = 20000
+// 1st win0 probe
++.3~+.310 > . 26000:26000(0) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%
+
+// 2nd win0 probe
++.6~+.620 > . 26000:26000(0) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%
+
+// 3rd win0 probe
++1.2~+1.240 > . 26000:26000(0) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%
+
+  +.9 < . 1:1(0) ack 26001 win 511
+   +0 > P. 26001:31001(5000) ack 1
+
+// Reset sysctls
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt
new file mode 100644
index 000000000000..869f32c35a2a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited-9-packets-out.pkt
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when application-limited: in this case,
+// with IW10, if we don't fully use our cwnd but instead
+// send just 9 packets, then cwnd should grow to twice that
+// value, or 18 packets.
+
+// Set up config.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 9000) = 9000
+   +0 > P. 1:9001(9000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
++.105 < . 1:1(0) ack 2001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 4001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 6001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 8001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 9001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt
new file mode 100644
index 000000000000..0f77b7955db6
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-app-limited.pkt
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when application-limited: in this case,
+// with IW10, if we send exactly 10 packets then cwnd should grow to 20.
+
+// Set up config.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 10000) = 10000
+   +0 > P. 1:10001(10000) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
++.105 < . 1:1(0) ack 2001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 4001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 6001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 8001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }%
+
++.005 < . 1:1(0) ack 10001 win 257
+   +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt
new file mode 100644
index 000000000000..7e9c83d617c2
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_slow_start_slow-start-fq-ack-per-2pkt.pkt
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test of slow start when not application-limited, so that
+// the cwnd continues to grow, even if TSQ triggers.
+// In this variant, the receiver ACKs every other packet,
+// approximating standard delayed ACKs.
+
+// Note we use FQ/pacing to check if TCP Small Queues is not hurting
+
+`./defaults.sh
+tc qdisc replace dev tun0 root fq
+sysctl -q net/ipv4/tcp_pacing_ss_ratio=200
+sysctl -e -q net.ipv4.tcp_min_tso_segs=2`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+  +.1 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.1 < . 1:1(0) ack 1 win 500
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0
+
+   +0 write(4, ..., 40000) = 40000
+// This might change if we cook the initial packet with 10 MSS.
+   +0 > P. 1:2921(2920) ack 1
+   +0 > P. 2921:5841(2920) ack 1
+   +0 > P. 5841:8761(2920) ack 1
+   +0 > P. 8761:11681(2920) ack 1
+   +0 > P. 11681:14601(2920) ack 1
+   +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+
++.105 < . 1:1(0) ack 2921 win 500
+   +0 %{ assert tcpi_snd_cwnd == 12, tcpi_snd_cwnd }%
+
+// Note: after this commit : "net_sched: sch_fq: account for schedule/timers drifts"
+// FQ notices that this packet missed the 'time to send next packet' computed
+// when prior packet (11681:14601(2920)) was sent.
+// So FQ will allow following packet to be sent a bit earlier (quantum/2)
+// (FQ commit allows an application/cwnd limited flow to get at most quantum/2 extra credit)
+   +0 > P. 14601:17521(2920) ack 1
+
++.003 < . 1:1(0) ack 5841 win 500
+   +0 %{ assert tcpi_snd_cwnd == 14, tcpi_snd_cwnd }%
+
++.001 > P. 17521:20441(2920) ack 1
+
++.001 < . 1:1(0) ack 8761 win 500
+   +0 %{ assert tcpi_snd_cwnd == 16, tcpi_snd_cwnd }%
+
+// remaining packets are delivered at a constant rate.
++.007 > P. 20441:23361(2920) ack 1
+
++.002 < . 1:1(0) ack 11681 win 500
+   +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }%
++.001 < . 1:1(0) ack 14601 win 500
+
++.004 > P. 23361:26281(2920) ack 1
+
++.007 > P. 26281:29201(2920) ack 1
+
+   +0 %{ assert tcpi_snd_cwnd == 20, 'cwnd=%d' % tcpi_snd_cwnd }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt
new file mode 100644
index 000000000000..0cbd43253236
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+`./defaults.sh`
+
+// Initialize a server socket
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_IP, IP_FREEBIND, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+// Connection should get accepted
+   +0 < S 0:0(0) win 32972 <mss 1460,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+   +0 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+
+   +0 pipe([5, 6]) = 0
+   +0 < U. 1:101(100) ack 1 win 257 urg 100
+   +0 splice(4, NULL, 6, NULL, 99, 0) = 99
+   +0 splice(4, NULL, 6, NULL, 1, 0) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt
new file mode 100644
index 000000000000..8940726a3ec2
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP fastopen behavior with NULL as buffer pointer, but a non-zero
+// buffer length.
+`./defaults.sh
+./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0`
+
+// Cache warmup: send a Fast Open cookie request
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress)
++0 > S 0:0(0) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO,nop,nop>
++0 < S. 123:123(0) ack 1 win 14600 <mss 1460,nop,nop,sackOK,nop,wscale 6,FO abcd1234,nop,nop>
++0 > . 1:1(0) ack 1
++0 close(3) = 0
++0 > F. 1:1(0) ack 1
++0 < F. 1:1(0) ack 2 win 92
++0 > .  2:2(0) ack 2
+
+// Test with MSG_FASTOPEN without TCP_FASTOPEN_CONNECT.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 sendto(4, NULL, 1, MSG_FASTOPEN, ..., ...) = -1
++0 close(4) = 0
+
+// Test with TCP_FASTOPEN_CONNECT without MSG_FASTOPEN.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5
++0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(5, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(5, ..., ...) = 0
++0 sendto(5, NULL, 1, 0, ..., ...) = -1
++0 close(5) = 0
+
+// Test with both TCP_FASTOPEN_CONNECT and MSG_FASTOPEN.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 6
++0 fcntl(6, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(6, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(6, ..., ...) = 0
++0 sendto(6, NULL, 1, MSG_FASTOPEN, ..., ...) = -1
++0 close(6) = 0
+
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt
new file mode 100644
index 000000000000..b2b2cdf27e20
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that we correctly skip zero-length IOVs.
+`./defaults.sh`
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 257
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(4)=[{..., 0}, {..., 40}, {..., 0}, {..., 20}],
+                  msg_flags=0}, 0) = 60
+   +0 > P. 1:61(60) ack 1
+ +.01 < . 1:1(0) ack 61 win 257
+
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(4)=[{..., 0}, {..., 0}, {..., 0}, {..., 0}],
+                  msg_flags=0}, MSG_ZEROCOPY) = 0
+
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(4)=[{..., 0}, {..., 10}, {..., 0}, {..., 50}],
+                  msg_flags=0}, MSG_ZEROCOPY) = 60
+   +0 > P. 61:121(60) ack 1
+ +.01 < . 1:1(0) ack 121 win 257
diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt
new file mode 100644
index 000000000000..59f5903f285c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test kernel behavior with NULL as buffer pointer
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+  +.2 < . 1:1(0) ack 1 win 514
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 write(4, NULL, 1000) = -1 EFAULT (Bad address)
+   +0 send(4, NULL, 1000, 0) = -1 EFAULT (Bad address)
+   +0 sendto(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address)
+
+   +0 < . 1:1001(1000) ack 1 win 200
+   +0 read(4, NULL, 1000) = -1 EFAULT (Bad address)
+   +0 recv(4, NULL, 1000, 0) = -1 EFAULT (Bad address)
+   +0 recvfrom(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt
new file mode 100644
index 000000000000..d7fdb43a8e89
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test tcpi_last_data_recv for active session
+`./defaults.sh`
+
+// Create a socket and set it to non-blocking.
++0    socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0    fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
++0    fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
++0    connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
++0    > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.030 < S. 0:0(0) ack 1 win 10000 <mss 1000,sackOK,nop,nop,nop,wscale 8>
++0    > . 1:1(0) ack 1
+
++1 %{ assert 990 <= tcpi_last_data_recv <= 1010, tcpi_last_data_recv }%
+
++0    < . 1:1001(1000) ack 1 win 300
++0    > . 1:1(0) ack 1001
+
++0 %{ assert tcpi_last_data_recv <= 10, tcpi_last_data_recv }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt
new file mode 100644
index 000000000000..a9bcd46f6cb6
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test rwnd limited time in tcp_info for client side.
+
+`./defaults.sh`
+
+// Create a socket and set it to non-blocking.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+   +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+
+// Server advertises 0 receive window.
+ +.01 < S. 0:0(0) ack 1 win 0 <mss 1000,nop,nop,sackOK>
+
+   +0 > . 1:1(0) ack 1
+   +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+   +0 fcntl(3, F_SETFL, O_RDWR) = 0   // set back to blocking
+
+// Make sure that initial rwnd limited time is 0.
+   +0 %{ assert tcpi_rwnd_limited == 0, tcpi_rwnd_limited }%
+
+// Receive window limited time starts here.
+   +0 write(3, ..., 1000) = 1000
+
+// Check that rwnd limited time in tcp_info is around 0.1s.
+  +.1 %{ assert 98000 <= tcpi_rwnd_limited <= 110000, tcpi_rwnd_limited }%
+
+// Server opens the receive window.
+  +.1 < . 1:1(0) ack 1 win 2000
+
+// Check that rwnd limited time in tcp_info is around 0.2s.
+   +0 %{ assert 198000 <= tcpi_rwnd_limited <= 210000, tcpi_rwnd_limited }%
+
+   +0 > P. 1:1001(1000) ack 1
+
+// Server advertises a very small receive window.
+ +.03 < . 1:1(0) ack 1001 win 10
+
+// Receive window limited time starts again.
+   +0 write(3, ..., 1000) = 1000
+
+// Server opens the receive window again.
+  +.1 < . 1:1(0) ack 1001 win 2000
+// Check that rwnd limited time in tcp_info is around 0.3s
+// and busy time is 0.3 + 0.03 (server opened small window temporarily).
+   +0 %{ assert 298000 <= tcpi_rwnd_limited <= 310000, tcpi_rwnd_limited;\
+         assert 328000 <= tcpi_busy_time <= 340000, tcpi_busy_time;\
+}%
+
+   +0 > P. 1001:2001(1000) ack 1
+ +.02 < . 1:1(0) ack 2001 win 2000
+   +0 %{ assert 348000 <= tcpi_busy_time <= 360000, tcpi_busy_time }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt
new file mode 100644
index 000000000000..f0de2acd0f8e
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test send-buffer-limited time in tcp_info for client side.
+`./defaults.sh`
+
+// Create a socket and set it to non-blocking.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+   +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+ +.01 < S. 0:0(0) ack 1 win 10000 <mss 1000,sackOK,nop,nop,nop,wscale 8>
+   +0 > . 1:1(0) ack 1
+   +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+   +0 fcntl(3, F_SETFL, O_RDWR) = 0   // set back to blocking
+   +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [10000], 4) = 0
+   +0 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [20000], [4]) = 0
+
+ +.09...0.14 write(3, ..., 150000) = 150000
+
+ +.01 < . 1:1(0) ack 10001 win 10000
+
+ +.01 < . 1:1(0) ack 30001 win 10000
+
+// cwnd goes from 40(60KB) to 80(120KB), and that we hit the tiny sndbuf limit 10KB
+ +.01 < . 1:1(0) ack 70001 win 10000
+
+ +.02 < . 1:1(0) ack 95001 win 10000
+   +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited; \
+	 assert 49000 <= tcpi_busy_time <= 52000, tcpi_busy_time; \
+	 assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }%
+
+// This ack frees up enough buffer so we are no longer
+// buffer limited (socket flag SOCK_NOSPACE is cleared)
+ +.02 < . 1:1(0) ack 150001 win 10000
+   +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited;\
+	 assert 69000 <= tcpi_busy_time <= 73000, tcpi_busy_time;\
+	 assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt
new file mode 100644
index 000000000000..2087ec0c746a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that tx timestamping sends timestamps only for
+// the last byte of each sendmsg.
+`./defaults.sh
+`
+
+// Create a socket and set it to non-blocking.
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0	fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Establish connection and verify that there was no error.
+   +0	connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+ +.01 < S. 0:0(0) ack 1 win 20000 <mss 1000,nop,nop,sackOK>
+   +0 > . 1:1(0) ack 1
+   +0	getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+   +0	fcntl(3, F_SETFL, O_RDWR) = 0   // set back to blocking
+
+   +0	setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING,
+		   [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
+		    SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE |
+		    SOF_TIMESTAMPING_OPT_ID], 4) = 0
+
+   +0	write(3, ..., 11000) = 11000
+   +0	> P. 1:10001(10000) ack 1
+ +.01	< . 1:1(0) ack 10001 win 4000
+   +0	> P. 10001:11001(1000) ack 1
+ +.01	< . 1:1(0) ack 11001 win 4000
+
+// Make sure that internal TCP timestamps are not overwritten and we have sane
+// RTT measurement.
+   +0	%{
+assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt
+}%
+
+// SCM_TSTAMP_SCHED for the last byte should be received almost immediately
+// once 10001 is acked at t=20ms.
+// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...)
+// is called after when SYN is acked. So, we expect the last byte of the first
+// chunk to have a timestamp key of 10999 (i.e., 11000 - 1).
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=20000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SCHED,
+				    ee_data=10999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_SND for the last byte should be received almost immediately
+// once 10001 is acked at t=20ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=20000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SND,
+				    ee_data=10999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_ACK for the last byte should be received at t=30ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=30000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_ACK,
+				    ee_data=10999}}
+		    ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt
new file mode 100644
index 000000000000..876024a31110
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test tx timestamping for partial writes (IPv4).
+`./defaults.sh
+`
+
+// Create a socket and set it to non-blocking.
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0	fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Establish connection and verify that there was no error.
+   +0	connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0	> S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+ +.01	< S. 0:0(0) ack 1 win 2000 <mss 1000,sackOK,TS val 700 ecr 100,nop,wscale 7>
+   +0	> . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700>
+   +0	getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+
+   +0	setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1000], 4) = 0
+   +0	setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING,
+		   [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
+		    SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE |
+		    SOF_TIMESTAMPING_OPT_ID], 4) = 0
+
+// We have a partial write.
+   +0	write(3, ..., 10000) = 2964
+   +0	> . 1:989(988) ack 1 <nop,nop,TS val 110 ecr 700>
+   +0	> P. 989:1977(988) ack 1 <nop,nop,TS val 110 ecr 700>
+ +.01	< . 1:1(0) ack 1977 win 92 <nop,nop,TS val 800 ecr 200>
+   +0	> P. 1977:2965(988) ack 1 <nop,nop,TS val 114 ecr 800>
+ +.01	< . 1:1(0) ack 2965 win 92 <nop,nop,TS val 800 ecr 200>
+
+// Make sure that internal TCP timestamps are not overwritten and we have sane
+// RTT measurement.
+   +0	%{
+assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt
+}%
+
+// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately
+// after the first ack at t=20ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=20000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SCHED,
+				    ee_data=2963}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_SND for the first chunk should be received almost immediately
+// after the first ack at t=20ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=20000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SND,
+				    ee_data=2963}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_ACK for the first chunk should be received after the last ack at
+// t=30ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=30000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_ACK,
+				    ee_data=2963}}
+		    ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt
new file mode 100644
index 000000000000..84d94780e6be
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test tx timestamping for server-side (IPv4).
+`./defaults.sh
+`
+
+// Initialize connection
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0	bind(3, ..., ...) = 0
+   +0	listen(3, 1) = 0
+
+   +0	< S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+   +0	> S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01	< . 1:1(0) ack 1 win 514
+
+   +0	accept(3, ..., ...) = 4
+   +0	setsockopt(4, SOL_SOCKET, SO_TIMESTAMPING,
+		   [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE |
+		    SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE |
+		    SOF_TIMESTAMPING_OPT_ID], 4) = 0
+
+// Write two 2KB chunks.
+// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...)
+// is called after when SYN is acked. So, we expect the last byte of the first
+// and the second chunks to have timestamp keys of 1999 (i.e., 2000 - 1) and
+// 3999 (i.e., 4000 - 1) respectively.
+   +0	write(4, ..., 2000) = 2000
+   +0	write(4, ..., 2000) = 2000
+   +0	> P. 1:2001(2000) ack 1
+   +0	> P. 2001:4001(2000) ack 1
+ +.01	< .  1:1(0) ack 2001 win 514
+ +.01	< .  1:1(0) ack 4001 win 514
+
+// Make sure that internal TCP timestamps are not overwritten and we have sane
+// RTT measurement.
+   +0	%{
+assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt
+}%
+
+// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately
+// after write at t=10ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=10000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SCHED,
+				    ee_data=1999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_SND for the first chunk should be received almost immediately
+// after write at t=10ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=10000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SND,
+				    ee_data=1999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_SCHED for the second chunk should be received almost immediately
+// after that at t=10ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=10000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SCHED,
+				    ee_data=3999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_SND for the second chunk should be received almost immediately
+// after that at t=10ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=10000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_SND,
+				    ee_data=3999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_ACK for the first chunk should be received at t=20ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=20000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_ACK,
+				    ee_data=1999}}
+		    ]}, MSG_ERRQUEUE) = 0
+// SCM_TSTAMP_ACK for the second chunk should be received at t=30ms.
+   +0	recvmsg(4, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=30000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_ACK,
+				    ee_data=3999}}
+		    ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt
new file mode 100644
index 000000000000..e61424a7bd0a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that we send FIN packet with correct TSval
+--tcp_ts_tick_usecs=1000
+--tolerance_usecs=7000
+
+`./defaults.sh`
+
+// Create a socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+// Establish a connection.
+   +0 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0>
+   +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100>
+  +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+   +0 accept(3, ..., ...) = 4
+
+   +1 close(4) = 0
+// Check that FIN TSval is updated properly, one second has passed since last sent packet.
+   +0 > F. 1:1(0) ack 1 <nop,nop,TS val 1200 ecr 200>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt
new file mode 100644
index 000000000000..174ce9a1bfc0
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that we reject TS val updates on a packet with invalid ACK sequence
+
+`./defaults.sh
+`
+
+// Create a socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+// Establish a connection.
+  +.1 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0>
+   +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100>
+  +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+   +0 accept(3, ..., ...) = 4
+
+// bad packet with high tsval (its ACK sequence is above our sndnxt)
+   +0 < F. 1:1(0) ack 9999 win 20000 <nop,nop,TS val 200000 ecr 100>
+
+
+   +0 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 201 ecr 100>
+   +0 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 201>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt
new file mode 100644
index 000000000000..2e3b3bb7493a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that we send RST packet with correct TSval
+--tcp_ts_tick_usecs=1000
+
+`./defaults.sh`
+
+// Create a socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+// Establish a connection.
+   +0 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0>
+   +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100>
+  +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100>
+   +0 accept(3, ..., ...) = 4
+
+   +0 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 201 ecr 100>
+   +0 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 201>
+
+   +1 close(4) = 0
+// Check that RST TSval is updated properly, one second has passed since last sent packet.
+   +0 > R. 1:1(0) ack 1001 <nop,nop,TS val 1200 ecr 201>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
new file mode 100644
index 000000000000..183051ba0cae
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+
+   +0 < S 0:0(0) win 0 <mss 1460>
+   +0 > S. 0:0(0) ack 1 <mss 1460>
+
+  +.1 < . 1:1(0) ack 1 win 65530
+   +0 accept(3, ..., ...) = 4
+
+   +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
+   +0 write(4, ..., 24) = 24
+   +0 > P. 1:25(24) ack 1
+   +.1 < . 1:1(0) ack 25 win 65530
+   +0 %{ assert tcpi_probes == 0, tcpi_probes; \
+         assert tcpi_backoff == 0, tcpi_backoff }%
+
+// install a qdisc dropping all packets
+   +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0`
+   +0 write(4, ..., 24) = 24
+   // When qdisc is congested we retry every 500ms
+   // (TCP_RESOURCE_PROBE_INTERVAL) and therefore
+   // we retry 6 times before hitting 3s timeout.
+   // First verify that the connection is alive:
++3.250 write(4, ..., 24) = 24
+   // Now verify that shortly after that the socket is dead:
+ +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out)
+
+   +0 %{ assert tcpi_probes == 6, tcpi_probes; \
+         assert tcpi_backoff == 0, tcpi_backoff }%
+   +0 close(4) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt
new file mode 100644
index 000000000000..2efe02bfba9c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+`./defaults.sh`
+
+// Initialize connection
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
+  +.1 < . 1:1(0) ack 1 win 32792
+
+
+   +0 accept(3, ..., ...) = 4
+
+// Okay, we received nothing, and decide to close this idle socket.
+// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth
+// trying hard to cleanly close this flow, at the price of keeping
+// a TCP structure in kernel for about 1 minute !
+   +2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
+   +0 close(4) = 0
+
+   +0 > F. 1:1(0) ack 1
+  +.3~+.400 > F. 1:1(0) ack 1
+  +.3~+.400 > F. 1:1(0) ack 1
+  +.6~+.800 > F. 1:1(0) ack 1
+
+// We finally receive something from the peer, but it is way too late
+// Our socket vanished because TCP_USER_TIMEOUT was really small
+   +0 < . 1:2(1) ack 1 win 32792
+   +0 > R 1:1(0)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt
new file mode 100644
index 000000000000..8bd60226ccfc
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+// Verify that established connections drop a segment without the ACK flag set.
+
+`./defaults.sh`
+
+// Create a socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+// Establish a connection.
+   +0 < S 0:0(0) win 20000 <mss 1000,sackOK,nop,nop>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
+ +.01 < . 1:1(0) ack 1 win 20000
+   +0 accept(3, ..., ...) = 4
+
+// Receive a segment with no flags set, verify that it's not enqueued.
+ +.01 < - 1:1001(1000) win 20000
+   +0 ioctl(4, SIOCINQ, [0]) = 0
+
+// Receive a segment with ACK flag set, verify that it is enqueued.
+ +.01 < . 1:1001(1000) ack 1 win 20000
+   +0 ioctl(4, SIOCINQ, [1000]) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt
new file mode 100644
index 000000000000..a82c8899d36b
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+// basic zerocopy test:
+//
+// send a packet with MSG_ZEROCOPY and receive the notification ID
+// repeat and verify IDs are consecutive
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 4001:8001(4000) ack 1
+   +0 < . 1:1(0) ack 8001 win 257
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=1,
+                                    ee_data=1}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt
new file mode 100644
index 000000000000..c01915e7f4a1
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+// batch zerocopy test:
+//
+// send multiple packets, then read one range of all notifications.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_MARK, [666], 4) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 4001:8001(4000) ack 1
+   +0 < . 1:1(0) ack 8001 win 257
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=1}}
+                  ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt
new file mode 100644
index 000000000000..6509882932e9
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Minimal client-side zerocopy test
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0...0 connect(4, ..., ...) = 0
+
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8>
+   +0 < S. 0:0(0) ack 1 win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > . 1:1(0) ack 1
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt
new file mode 100644
index 000000000000..2cd78755cb2a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+// send with MSG_ZEROCOPY on a non-established socket
+//
+// verify that a send in state TCP_CLOSE correctly aborts the zerocopy
+// operation, specifically it does not increment the zerocopy counter.
+//
+// First send on a closed socket and wait for (absent) notification.
+// Then connect and send and verify that notification nr. is zero.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = -1 EPIPE (Broken pipe)
+
+   +0.1 recvmsg(4, {msg_name(...)=...,
+                    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE,
+                    msg_control=[]}, MSG_ERRQUEUE) = -1 EAGAIN (Resource temporarily unavailable)
+
+   +0...0 connect(4, ..., ...) = 0
+
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8>
+   +0 < S. 0:0(0) ack 1 win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > . 1:1(0) ack 1
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt
new file mode 100644
index 000000000000..7671c20e01cf
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+// epoll zerocopy test:
+//
+// EPOLLERR is known to be not edge-triggered unlike EPOLLIN and EPOLLOUT but
+// it is not level-triggered either.
+//
+// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR
+// is correctly fired only once, when EPOLLET is set. send another packet with
+// MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 epoll_create(1) = 5
+   +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, {events=EPOLLOUT|EPOLLET, fd=4}) = 0
+   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 4001:8001(4000) ack 1
+   +0 < . 1:1(0) ack 8001 win 257
+
+// receive only one EPOLLERR for the two sends above.
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 8001:12001(4000) ack 1
+   +0 < . 1:1(0) ack 12001 win 257
+
+// receive only one EPOLLERR for the third send above.
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=2}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt
new file mode 100644
index 000000000000..fadc480fdb7f
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+// epoll zerocopy test:
+//
+// EPOLLERR is known to be not edge-triggered unlike EPOLLIN and EPOLLOUT but
+// it is not level-triggered either. this tests verify that the same behavior is
+// maintained when we have EPOLLEXCLUSIVE.
+//
+// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR
+// is correctly fired only once, when EPOLLET is set. send another packet with
+// MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 epoll_create(1) = 5
+   +0 epoll_ctl(5, EPOLL_CTL_ADD, 4,
+		{events=EPOLLOUT|EPOLLET|EPOLLEXCLUSIVE, fd=4}) = 0
+   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 4001:8001(4000) ack 1
+   +0 < . 1:1(0) ack 8001 win 257
+
+// receive only one EPOLLERR for the two sends above.
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 8001:12001(4000) ack 1
+   +0 < . 1:1(0) ack 12001 win 257
+
+// receive only one EPOLLERR for the third send above.
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=2}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt
new file mode 100644
index 000000000000..5bfa0d1d2f4a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// epoll zerocopy test:
+//
+// This is a test to confirm that EPOLLERR is only fired once for an FD when
+// EPOLLONESHOT is set.
+//
+// fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR
+// is correctly fired only once, when EPOLLONESHOT is set. send another packet
+// with MSG_ZEROCOPY. confirm that EPOLLERR is not fired. Rearm the FD and
+// confirm that EPOLLERR is correctly set.
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 epoll_create(1) = 5
+   +0 epoll_ctl(5, EPOLL_CTL_ADD, 4,
+		{events=EPOLLOUT|EPOLLET|EPOLLONESHOT, fd=4}) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 1:4001(4000) ack 1
+   +0 < . 1:1(0) ack 4001 win 257
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 4001:8001(4000) ack 1
+   +0 < . 1:1(0) ack 8001 win 257
+
+// receive only one EPOLLERR for the two sends above.
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 send(4, ..., 4000, MSG_ZEROCOPY) = 4000
+   +0 > P. 8001:12001(4000) ack 1
+   +0 < . 1:1(0) ack 12001 win 257
+
+// receive no EPOLLERR for the third send above.
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+// rearm the FD and verify the EPOLLERR is fired again.
+   +0 epoll_ctl(5, EPOLL_CTL_MOD, 4, {events=EPOLLOUT|EPOLLONESHOT, fd=4}) = 0
+   +0 epoll_wait(5, {events=EPOLLERR|EPOLLOUT, fd=4}, 1, 0) = 1
+   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+
+   +0 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=2}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt
new file mode 100644
index 000000000000..4a73bbf46961
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+// Fastopen client zerocopy test:
+//
+// send data with MSG_FASTOPEN | MSG_ZEROCOPY and verify that the
+// kernel returns the notification ID.
+//
+// Fastopen requires a stored cookie. Create two sockets. The first
+// one will have no data in the initial send. On return 0 the
+// zerocopy notification counter is not incremented. Verify this too.
+
+`./defaults.sh`
+
+// Send a FastOpen request, no cookie yet so no data in SYN
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 sendto(3, ..., 500, MSG_FASTOPEN|MSG_ZEROCOPY, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 1000 ecr 0,nop,wscale 8,FO,nop,nop>
+ +.01 < S. 123:123(0) ack 1 win 14600 <mss 940,TS val 2000 ecr 1000,sackOK,nop,wscale 6, FO abcd1234,nop,nop>
+   +0 > . 1:1(0) ack 1 <nop,nop,TS val 1001 ecr 2000>
+
+// Read from error queue: no zerocopy notification
+   +1 recvmsg(3, {msg_name(...)=...,
+                    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE,
+                    msg_control=[]}, MSG_ERRQUEUE) = -1 EAGAIN (Resource temporarily unavailable)
+
+ +.01 close(3) = 0
+   +0 > F. 1:1(0) ack 1 <nop,nop,TS val 1002 ecr 2000>
+ +.01 < F. 1:1(0) ack 2 win 92 <nop,nop,TS val 2001 ecr 1002>
+   +0 > .  2:2(0) ack 2 <nop,nop,TS val 1003 ecr 2001>
+
+// Send another Fastopen request, now SYN will have data
+ +.07 `sysctl -q net.ipv4.tcp_timestamps=0`
+  +.1 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5
+   +0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+   +0 setsockopt(5, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 sendto(5, ..., 500, MSG_FASTOPEN|MSG_ZEROCOPY, ..., ...) = 500
+   +0 > S 0:500(500) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO abcd1234,nop,nop>
+ +.05 < S. 5678:5678(0) ack 501 win 14600 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+   +0 > . 501:501(0) ack 1
+
+// Read from error queue: now has first zerocopy notification
+   +0.5 recvmsg(5, {msg_name(...)=...,
+                    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE,
+                    msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt
new file mode 100644
index 000000000000..36086c5877ce
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+// Fastopen server zerocopy test:
+//
+// send data with MSG_FASTOPEN | MSG_ZEROCOPY and verify that the
+// kernel returns the notification ID.
+
+`./defaults.sh
+ ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x207`
+
+// Set up a TFO server listening socket.
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+  +.1 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+   +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN, [2], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+
+// Client sends a SYN with data.
+  +.1 < S 0:1000(1000) win 32792 <mss 1460,sackOK,nop,nop>
+   +0 > S. 0:0(0) ack 1001 <mss 1460,nop,nop,sackOK>
+
+// Server accepts and replies with data.
++.005 accept(3, ..., ...) = 4
+   +0 read(4, ..., 1024) = 1000
+   +0 sendto(4, ..., 1000, MSG_ZEROCOPY, ..., ...) = 1000
+   +0 > P. 1:1001(1000) ack 1001
+ +.05 < . 1001:1001(0) ack 1001 win 32792
+
+// Read from error queue: now has first zerocopy notification
+  +0.1 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                      {cmsg_level=CMSG_LEVEL_IP,
+                       cmsg_type=CMSG_TYPE_RECVERR,
+                       cmsg_data={ee_errno=0,
+                                  ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                  ee_type=0,
+                                  ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                  ee_info=0,
+                                  ee_data=0}}
+                  ]}, MSG_ERRQUEUE) = 0
+
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt
new file mode 100644
index 000000000000..672f817faca0
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+// tcp_MAX_SKB_FRAGS test
+//
+// Verify that sending an iovec of tcp_MAX_SKB_FRAGS + 1 elements will
+// 1) fit in a single packet without zerocopy
+// 2) spill over into a second packet with zerocopy,
+//    because each iovec element becomes a frag
+// 3) the PSH bit is set on an skb when it runs out of fragments
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+
+   // Each pinned zerocopy page is fully accounted to skb->truesize.
+   // This test generates a worst case packet with each frag storing
+   // one byte, but increasing truesize with a page (64KB on PPC).
+   +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [2000000], 4) = 0
+
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   // send an iov of 18 elements: just becomes a linear skb
+   +0 sendmsg(4, {msg_name(...)=...,
+		  msg_iov(18)=[{..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}],
+		  msg_flags=0}, 0) = 18
+
+   +0 > P. 1:19(18) ack 1
+   +0 < . 1:1(0) ack 19 win 257
+
+   // send a zerocopy iov of 18 elements:
+   +1 sendmsg(4, {msg_name(...)=...,
+		  msg_iov(18)=[{..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}],
+		  msg_flags=0}, MSG_ZEROCOPY) = 18
+
+   // verify that it is split in one skb of 17 frags + 1 of 1 frag
+   // verify that both have the PSH bit set
+   +0 > P. 19:36(17) ack 1
+   +0 < . 1:1(0) ack 36 win 257
+
+   +0 > P. 36:37(1) ack 1
+   +0 < . 1:1(0) ack 37 win 257
+
+   +1 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
+
+   // send a zerocopy iov of 64 elements:
+   +0 sendmsg(4, {msg_name(...)=...,
+                  msg_iov(64)=[{..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1},
+			       {..., 1}, {..., 1}, {..., 1}, {..., 1}],
+                  msg_flags=0}, MSG_ZEROCOPY) = 64
+
+   // verify that it is split in skbs with 17 frags
+   +0 > P. 37:54(17) ack 1
+   +0 < . 1:1(0) ack 54 win 257
+
+   +0 > P. 54:71(17) ack 1
+   +0 < . 1:1(0) ack 71 win 257
+
+   +0 > P. 71:88(17) ack 1
+   +0 < . 1:1(0) ack 88 win 257
+
+   +0 > P. 88:101(13) ack 1
+   +0 < . 1:1(0) ack 101 win 257
+
+   +1 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=1,
+                                    ee_data=1}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt
new file mode 100644
index 000000000000..a9a1ac0aea4f
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+// small packet zerocopy test:
+//
+// verify that SO_EE_CODE_ZEROCOPY_COPIED is set on zerocopy
+// packets of all sizes, including the smallest payload, 1B.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   // send 1B
+   +0 send(4, ..., 1, MSG_ZEROCOPY) = 1
+   +0 > P. 1:2(1) ack 1
+   +0 < . 1:1(0) ack 2 win 257
+
+   +1 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=0,
+                                    ee_data=0}}
+                   ]}, MSG_ERRQUEUE) = 0
+
+   // send 1B again
+   +0 send(4, ..., 1, MSG_ZEROCOPY) = 1
+   +0 > P. 2:3(1) ack 1
+   +0 < . 1:1(0) ack 3 win 257
+
+   +1 recvmsg(4, {msg_name(...)=...,
+                  msg_iov(1)=[{...,0}],
+                  msg_flags=MSG_ERRQUEUE,
+                  msg_control=[
+                        {cmsg_level=CMSG_LEVEL_IP,
+                         cmsg_type=CMSG_TYPE_RECVERR,
+                         cmsg_data={ee_errno=0,
+                                    ee_origin=SO_EE_ORIGIN_ZEROCOPY,
+                                    ee_type=0,
+                                    ee_code=SO_EE_CODE_ZEROCOPY_COPIED,
+                                    ee_info=1,
+                                    ee_data=1}}
+                   ]}, MSG_ERRQUEUE) = 0
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index cfc84958025a..88e914c4eef9 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -197,9 +197,14 @@
 #
 # - pmtu_ipv6_route_change
 #	Same as above but with IPv6
+#
+# - pmtu_ipv4_mp_exceptions
+#	Use the same topology as in pmtu_ipv4, but add routeable addresses
+#	on host A and B on lo reachable via both routers. Host A and B
+#	addresses have multipath routes to each other, b_r1 mtu = 1500.
+#	Check that PMTU exceptions are created for both paths.
 
 source lib.sh
-source net_helper.sh
 
 PAUSE_ON_FAIL=no
 VERBOSE=0
@@ -266,7 +271,8 @@ tests="
 	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
 	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
 	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
-	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
+	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
+	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
 
 # Addressing and routing for tests with routers: four network segments, with
 # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +349,9 @@ tunnel6_a_addr="fd00:2::a"
 tunnel6_b_addr="fd00:2::b"
 tunnel6_mask="64"
 
+host4_a_addr="192.168.99.99"
+host4_b_addr="192.168.88.88"
+
 dummy6_0_prefix="fc00:1000::"
 dummy6_1_prefix="fc00:1001::"
 dummy6_mask="64"
@@ -681,13 +690,7 @@ setup_xfrm() {
 }
 
 setup_nettest_xfrm() {
-	if ! which nettest >/dev/null; then
-		PATH=$PWD:$PATH
-		if ! which nettest >/dev/null; then
-			echo "'nettest' command not found; skipping tests"
-			return 1
-		fi
-	fi
+	check_gen_prog "nettest"
 
 	[ ${1} -eq 6 ] && proto="-6" || proto=""
 	port=${2}
@@ -842,25 +845,97 @@ setup_bridge() {
 	run_cmd ${ns_a} ip link set veth_A-C master br0
 }
 
+setup_ovs_via_internal_utility() {
+	type="${1}"
+	a_addr="${2}"
+	b_addr="${3}"
+	dport="${4}"
+
+	run_cmd python3 ./openvswitch/ovs-dpctl.py add-if ovs_br0 ${type}_a -t ${type} || return 1
+
+	ports=$(python3 ./openvswitch/ovs-dpctl.py show)
+	br0_port=$(echo "$ports" | grep -E "\sovs_br0" | sed -e 's@port @@' | cut -d: -f1 | xargs)
+	type_a_port=$(echo "$ports" | grep ${type}_a | sed -e 's@port @@' | cut -d: -f1 | xargs)
+	veth_a_port=$(echo "$ports" | grep veth_A | sed -e 's@port @@' | cut -d: -f1 | xargs)
+
+	v4_a_tun="${prefix4}.${a_r1}.1"
+	v4_b_tun="${prefix4}.${b_r1}.1"
+
+	v6_a_tun="${prefix6}:${a_r1}::1"
+	v6_b_tun="${prefix6}:${b_r1}::1"
+
+	if [ "${v4_a_tun}" = "${a_addr}" ]; then
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x0800),ipv4()" \
+		    "set(tunnel(tun_id=1,dst=${v4_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x86dd),ipv6()" \
+		    "set(tunnel(tun_id=1,dst=${v4_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,src=${v4_b_tun},dst=${v4_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x0800),ipv4()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,src=${v4_b_tun},dst=${v4_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x86dd),ipv6()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,src=${v4_b_tun},dst=${v4_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x0806),arp()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x0806),arp(sip=${veth4_c_addr},tip=${tunnel4_b_addr})" \
+		    "set(tunnel(tun_id=1,dst=${v4_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+	else
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x0800),ipv4()" \
+		    "set(tunnel(tun_id=1,ipv6_dst=${v6_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x86dd),ipv6()" \
+		    "set(tunnel(tun_id=1,ipv6_dst=${v6_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,ipv6_src=${v6_b_tun},ipv6_dst=${v6_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x0800),ipv4()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,ipv6_src=${v6_b_tun},ipv6_dst=${v6_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x86dd),ipv6()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),tunnel(tun_id=1,ipv6_src=${v6_b_tun},ipv6_dst=${v6_a_tun}),in_port(${type_a_port}),eth(),eth_type(0x0806),arp()" \
+		    "${veth_a_port}"
+		run_cmd python3 ./openvswitch/ovs-dpctl.py add-flow ovs_br0 \
+		    "recirc_id(0),in_port(${veth_a_port}),eth(),eth_type(0x0806),arp(sip=${veth4_c_addr},tip=${tunnel4_b_addr})" \
+		    "set(tunnel(tun_id=1,ipv6_dst=${v6_b_tun},ttl=64,tp_dst=${dport},flags(df|csum))),${type_a_port}"
+	fi
+}
+
+setup_ovs_via_vswitchd() {
+	type="${1}"
+	b_addr="${2}"
+
+	run_cmd ovs-vsctl add-port ovs_br0 ${type}_a -- \
+		set interface ${type}_a type=${type} \
+		options:remote_ip=${b_addr} options:key=1 options:csum=true || return 1
+}
+
 setup_ovs_vxlan_or_geneve() {
 	type="${1}"
 	a_addr="${2}"
 	b_addr="${3}"
+	dport="6081"
 
 	if [ "${type}" = "vxlan" ]; then
+		dport="4789"
 		opts="${opts} ttl 64 dstport 4789"
 		opts_b="local ${b_addr}"
 	fi
 
-	run_cmd ovs-vsctl add-port ovs_br0 ${type}_a -- \
-		set interface ${type}_a type=${type} \
-		options:remote_ip=${b_addr} options:key=1 options:csum=true || return 1
+	setup_ovs_via_internal_utility "${type}" "${a_addr}" "${b_addr}" \
+				       "${dport}" || \
+	    setup_ovs_via_vswitchd "${type}" "${b_addr}" || return 1
 
 	run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts} || return 1
 
 	run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
 	run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
 
+	run_cmd ip link set ${type}_a up
 	run_cmd ${ns_b} ip link set ${type}_b up
 }
 
@@ -880,8 +955,24 @@ setup_ovs_vxlan6() {
 	setup_ovs_vxlan_or_geneve vxlan  ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
 }
 
+setup_ovs_br_internal() {
+	run_cmd python3 ./openvswitch/ovs-dpctl.py add-dp ovs_br0 || \
+		return 1
+}
+
+setup_ovs_br_vswitchd() {
+	run_cmd ovs-vsctl add-br ovs_br0 || return 1
+}
+
+setup_ovs_add_if() {
+	ifname="${1}"
+	run_cmd python3 ./openvswitch/ovs-dpctl.py add-if ovs_br0 \
+		"${ifname}" || \
+		run_cmd ovs-vsctl add-port ovs_br0 "${ifname}"
+}
+
 setup_ovs_bridge() {
-	run_cmd ovs-vsctl add-br ovs_br0 || return $ksft_skip
+	setup_ovs_br_internal || setup_ovs_br_vswitchd || return $ksft_skip
 	run_cmd ip link set ovs_br0 up
 
 	run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
@@ -891,7 +982,7 @@ setup_ovs_bridge() {
 	run_cmd ${ns_c} ip link set veth_C-A up
 	run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
 	run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
-	run_cmd ovs-vsctl add-port ovs_br0 veth_A-C
+	setup_ovs_add_if veth_A-C
 
 	# Move veth_A-R1 to init
 	run_cmd ${ns_a} ip link set veth_A-R1 netns 1
@@ -902,6 +993,52 @@ setup_ovs_bridge() {
 	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
 }
 
+setup_multipath_new() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
+	run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
+	run_cmd ${ns_a} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
+	run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
+	run_cmd ${ns_b} ip nexthop add id 403 group 401/402
+	run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
+}
+
+setup_multipath_old() {
+	# Set up host A with multipath routes to host B host4_b_addr
+	run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+	run_cmd ${ns_a} ip route add ${host4_b_addr} \
+			src ${host4_a_addr} \
+			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${a_r2}.2 weight 1
+
+	# Set up host B with multipath routes to host A host4_a_addr
+	run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+	run_cmd ${ns_b} ip route add ${host4_a_addr} \
+			src ${host4_b_addr} \
+			nexthop via ${prefix4}.${b_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${b_r2}.2 weight 1
+}
+
+setup_multipath() {
+	if [ "$USE_NH" = "yes" ]; then
+		setup_multipath_new
+	else
+		setup_multipath_old
+	fi
+
+	# Set up routers with routes to dummies
+	run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
+	run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
+	run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
+}
+
 setup() {
 	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
 
@@ -922,6 +1059,18 @@ trace() {
 	sleep 1
 }
 
+cleanup_del_ovs_internal() {
+	# squelch the output of the del-if commands since it can be wordy
+	python3 ./openvswitch/ovs-dpctl.py del-if ovs_br0 -d true vxlan_a	>/dev/null	2>&1
+	python3 ./openvswitch/ovs-dpctl.py del-if ovs_br0 -d true geneve_a	>/dev/null	2>&1
+	python3 ./openvswitch/ovs-dpctl.py del-dp ovs_br0			>/dev/null	2>&1
+}
+
+cleanup_del_ovs_vswitchd() {
+	ovs-vsctl --if-exists del-port vxlan_a	2>/dev/null
+	ovs-vsctl --if-exists del-br ovs_br0	2>/dev/null
+}
+
 cleanup() {
 	for pid in ${tcpdump_pids}; do
 		kill ${pid}
@@ -940,10 +1089,10 @@ cleanup() {
 
 	cleanup_all_ns
 
-	ip link del veth_A-C			2>/dev/null
-	ip link del veth_A-R1			2>/dev/null
-	ovs-vsctl --if-exists del-port vxlan_a	2>/dev/null
-	ovs-vsctl --if-exists del-br ovs_br0	2>/dev/null
+	ip link del veth_A-C		2>/dev/null
+	ip link del veth_A-R1		2>/dev/null
+	cleanup_del_ovs_internal
+	cleanup_del_ovs_vswitchd
 	rm -f "$tmpoutfile"
 }
 
@@ -982,23 +1131,15 @@ link_get_mtu() {
 }
 
 route_get_dst_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
-
-	if [ -z "${dsfield}" ]; then
-		dsfield=0
-	fi
+	ns_cmd="${1}"; shift
 
-	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+	${ns_cmd} ip route get "$@"
 }
 
 route_get_dst_pmtu_from_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
 }
 
 check_pmtu_value() {
@@ -1141,10 +1282,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
 	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
 
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -1191,9 +1332,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
 		UDP:"${dst2}":50000,tos="${dsfield}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -1347,7 +1488,7 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
 		size=$(du -sb $tmpoutfile)
 		size=${size%%/tmp/*}
 
-		[ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1
+		[ $size -ne 1048576 ] && err "File size $size mismatches expected value in locally bridged vxlan test" && return 1
 	done
 
 	rm -f "$tmpoutfile"
@@ -1397,6 +1538,12 @@ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
 	outer_family=${3}
 	ll_mtu=4000
 
+	if [ "${type}" = "vxlan" ]; then
+		tun_a="vxlan_sys_4789"
+	elif [ "${type}" = "geneve" ]; then
+		tun_a="genev_sys_6081"
+	fi
+
 	if [ ${outer_family} -eq 4 ]; then
 		setup namespaces routing ovs_bridge ovs_${type}4 || return $ksft_skip
 		#                      IPv4 header   UDP header   VXLAN/GENEVE header   Ethernet header
@@ -1407,17 +1554,11 @@ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
 		exp_mtu=$((${ll_mtu} - 40          - 8          - 8                   - 14))
 	fi
 
-	if [ "${type}" = "vxlan" ]; then
-		tun_a="vxlan_sys_4789"
-	elif [ "${type}" = "geneve" ]; then
-		tun_a="genev_sys_6081"
-	fi
-
-	trace ""        "${tun_a}"  "${ns_b}"  ${type}_b \
-	      ""        veth_A-R1   "${ns_r1}" veth_R1-A \
-	      "${ns_b}" veth_B-R1   "${ns_r1}" veth_R1-B \
-	      ""        ovs_br0     ""         veth-A-C  \
-	      "${ns_c}" veth_C-A
+	trace ""        ${type}_a    "${ns_b}"  ${type}_b \
+	      ""        veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B \
+	      ""        ovs_br0      ""         veth-A_C  \
+	      "${ns_c}" veth_C-A     ""         "${tun_a}"
 
 	if [ ${family} -eq 4 ]; then
 		ping=ping
@@ -1436,8 +1577,9 @@ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
 	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
 	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
 
-	mtu ""        ${tun_a}  $((${ll_mtu} + 1000))
-	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+	mtu ""        ${tun_a}  $((${ll_mtu} + 1000)) 2>/dev/null || \
+		mtu ""        ${type}_a  $((${ll_mtu} + 1000)) 2>/dev/null
+	mtu "${ns_b}" ${type}_b  $((${ll_mtu} + 1000))
 
 	run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 20 -s $((${ll_mtu} + 500)) ${dst} || return 1
 
@@ -1961,7 +2103,7 @@ check_running() {
 	pid=${1}
 	cmd=${2}
 
-	[ "$(cat /proc/${pid}/cmdline 2>/dev/null | tr -d '\0')" = "{cmd}" ]
+	[ "$(cat /proc/${pid}/cmdline 2>/dev/null | tr -d '\0')" = "${cmd}" ]
 }
 
 test_cleanup_vxlanX_exception() {
@@ -2234,6 +2376,36 @@ test_pmtu_ipv6_route_change() {
 	test_pmtu_ipvX_route_change 6
 }
 
+test_pmtu_ipv4_mp_exceptions() {
+	setup namespaces routing multipath || return $ksft_skip
+
+	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
+	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
+	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
+
+	# Set up initial MTU values
+	mtu "${ns_a}"  veth_A-R1 2000
+	mtu "${ns_r1}" veth_R1-A 2000
+	mtu "${ns_r1}" veth_R1-B 1500
+	mtu "${ns_b}"  veth_B-R1 1500
+
+	mtu "${ns_a}"  veth_A-R2 2000
+	mtu "${ns_r2}" veth_R2-A 2000
+	mtu "${ns_r2}" veth_R2-B 1500
+	mtu "${ns_b}"  veth_B-R2 1500
+
+	# Ping and expect two nexthop exceptions for two routes
+	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
+
+	# Check that exceptions have been created with the correct PMTU
+	pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
+	pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
+
+	check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
+	check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
+}
+
 usage() {
 	echo
 	echo "$0 [OPTIONS] [TEST]..."
diff --git a/tools/testing/selftests/net/proc_net_pktgen.c b/tools/testing/selftests/net/proc_net_pktgen.c
new file mode 100644
index 000000000000..69444fb29577
--- /dev/null
+++ b/tools/testing/selftests/net/proc_net_pktgen.c
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * proc_net_pktgen: kselftest for /proc/net/pktgen interface
+ *
+ * Copyright (c) 2025 Peter Seiderer <ps.report@gmx.net>
+ *
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+static const char ctrl_cmd_stop[] = "stop";
+static const char ctrl_cmd_start[] = "start";
+static const char ctrl_cmd_reset[] = "reset";
+
+static const char wrong_ctrl_cmd[] = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
+
+static const char thr_cmd_add_loopback_0[] = "add_device lo@0";
+static const char thr_cmd_rm_loopback_0[] = "rem_device_all";
+
+static const char wrong_thr_cmd[] = "forsureawrongcommand";
+static const char legacy_thr_cmd[] = "max_before_softirq";
+
+static const char wrong_dev_cmd[] = "forsurewrongcommand";
+static const char dev_cmd_min_pkt_size_0[] = "min_pkt_size";
+static const char dev_cmd_min_pkt_size_1[] = "min_pkt_size ";
+static const char dev_cmd_min_pkt_size_2[] = "min_pkt_size 0";
+static const char dev_cmd_min_pkt_size_3[] = "min_pkt_size 1";
+static const char dev_cmd_min_pkt_size_4[] = "min_pkt_size 100";
+static const char dev_cmd_min_pkt_size_5[] = "min_pkt_size=1001";
+static const char dev_cmd_min_pkt_size_6[] = "min_pkt_size =2002";
+static const char dev_cmd_min_pkt_size_7[] = "min_pkt_size= 3003";
+static const char dev_cmd_min_pkt_size_8[] = "min_pkt_size = 4004";
+static const char dev_cmd_max_pkt_size_0[] = "max_pkt_size 200";
+static const char dev_cmd_pkt_size_0[] = "pkt_size 300";
+static const char dev_cmd_imix_weights_0[] = "imix_weights 0,7 576,4 1500,1";
+static const char dev_cmd_imix_weights_1[] = "imix_weights 101,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20";
+static const char dev_cmd_imix_weights_2[] = "imix_weights 100,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20 121,21";
+static const char dev_cmd_imix_weights_3[] = "imix_weights";
+static const char dev_cmd_imix_weights_4[] = "imix_weights ";
+static const char dev_cmd_imix_weights_5[] = "imix_weights 0";
+static const char dev_cmd_imix_weights_6[] = "imix_weights 0,";
+static const char dev_cmd_debug_0[] = "debug 1";
+static const char dev_cmd_debug_1[] = "debug 0";
+static const char dev_cmd_frags_0[] = "frags 100";
+static const char dev_cmd_delay_0[] = "delay 100";
+static const char dev_cmd_delay_1[] = "delay 2147483647";
+static const char dev_cmd_rate_0[] = "rate 0";
+static const char dev_cmd_rate_1[] = "rate 100";
+static const char dev_cmd_ratep_0[] = "ratep 0";
+static const char dev_cmd_ratep_1[] = "ratep 200";
+static const char dev_cmd_udp_src_min_0[] = "udp_src_min 1";
+static const char dev_cmd_udp_dst_min_0[] = "udp_dst_min 2";
+static const char dev_cmd_udp_src_max_0[] = "udp_src_max 3";
+static const char dev_cmd_udp_dst_max_0[] = "udp_dst_max 4";
+static const char dev_cmd_clone_skb_0[] = "clone_skb 1";
+static const char dev_cmd_clone_skb_1[] = "clone_skb 0";
+static const char dev_cmd_count_0[] = "count 100";
+static const char dev_cmd_src_mac_count_0[] = "src_mac_count 100";
+static const char dev_cmd_dst_mac_count_0[] = "dst_mac_count 100";
+static const char dev_cmd_burst_0[] = "burst 0";
+static const char dev_cmd_node_0[] = "node 100";
+static const char dev_cmd_xmit_mode_0[] = "xmit_mode start_xmit";
+static const char dev_cmd_xmit_mode_1[] = "xmit_mode netif_receive";
+static const char dev_cmd_xmit_mode_2[] = "xmit_mode queue_xmit";
+static const char dev_cmd_xmit_mode_3[] = "xmit_mode nonsense";
+static const char dev_cmd_flag_0[] = "flag UDPCSUM";
+static const char dev_cmd_flag_1[] = "flag !UDPCSUM";
+static const char dev_cmd_flag_2[] = "flag nonsense";
+static const char dev_cmd_dst_min_0[] = "dst_min 101.102.103.104";
+static const char dev_cmd_dst_0[] = "dst 101.102.103.104";
+static const char dev_cmd_dst_max_0[] = "dst_max 201.202.203.204";
+static const char dev_cmd_dst6_0[] = "dst6 2001:db38:1234:0000:0000:0000:0000:0000";
+static const char dev_cmd_dst6_min_0[] = "dst6_min 2001:db8:1234:0000:0000:0000:0000:0000";
+static const char dev_cmd_dst6_max_0[] = "dst6_max 2001:db8:1234:0000:0000:0000:0000:0000";
+static const char dev_cmd_src6_0[] = "src6 2001:db38:1234:0000:0000:0000:0000:0000";
+static const char dev_cmd_src_min_0[] = "src_min 101.102.103.104";
+static const char dev_cmd_src_max_0[] = "src_max 201.202.203.204";
+static const char dev_cmd_dst_mac_0[] = "dst_mac 01:02:03:04:05:06";
+static const char dev_cmd_src_mac_0[] = "src_mac 11:12:13:14:15:16";
+static const char dev_cmd_clear_counters_0[] = "clear_counters";
+static const char dev_cmd_flows_0[] = "flows 100";
+static const char dev_cmd_spi_0[] = "spi 100";
+static const char dev_cmd_flowlen_0[] = "flowlen 100";
+static const char dev_cmd_queue_map_min_0[] = "queue_map_min 1";
+static const char dev_cmd_queue_map_max_0[] = "queue_map_max 2";
+static const char dev_cmd_mpls_0[] = "mpls 00000001";
+static const char dev_cmd_mpls_1[] = "mpls 00000001,000000f2";
+static const char dev_cmd_mpls_2[] = "mpls 00000f00,00000f01,00000f02,00000f03,00000f04,00000f05,00000f06,00000f07,00000f08,00000f09,00000f0a,00000f0b,00000f0c,00000f0d,00000f0e,00000f0f";
+static const char dev_cmd_mpls_3[] = "mpls 00000f00,00000f01,00000f02,00000f03,00000f04,00000f05,00000f06,00000f07,00000f08,00000f09,00000f0a,00000f0b,00000f0c,00000f0d,00000f0e,00000f0f,00000f10";
+static const char dev_cmd_vlan_id_0[] = "vlan_id 1";
+static const char dev_cmd_vlan_p_0[] = "vlan_p 1";
+static const char dev_cmd_vlan_cfi_0[] = "vlan_cfi 1";
+static const char dev_cmd_vlan_id_1[] = "vlan_id 4096";
+static const char dev_cmd_svlan_id_0[] = "svlan_id 1";
+static const char dev_cmd_svlan_p_0[] = "svlan_p 1";
+static const char dev_cmd_svlan_cfi_0[] = "svlan_cfi 1";
+static const char dev_cmd_svlan_id_1[] = "svlan_id 4096";
+static const char dev_cmd_tos_0[] = "tos 0";
+static const char dev_cmd_tos_1[] = "tos 0f";
+static const char dev_cmd_tos_2[] = "tos 0ff";
+static const char dev_cmd_traffic_class_0[] = "traffic_class f0";
+static const char dev_cmd_skb_priority_0[] = "skb_priority 999";
+
+FIXTURE(proc_net_pktgen) {
+	int ctrl_fd;
+	int thr_fd;
+	int dev_fd;
+};
+
+FIXTURE_SETUP(proc_net_pktgen) {
+	int r;
+	ssize_t len;
+
+	r = system("modprobe pktgen");
+	ASSERT_EQ(r, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?");
+
+	self->ctrl_fd = open("/proc/net/pktgen/pgctrl", O_RDWR);
+	ASSERT_GE(self->ctrl_fd, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?");
+
+	self->thr_fd = open("/proc/net/pktgen/kpktgend_0", O_RDWR);
+	ASSERT_GE(self->thr_fd, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?");
+
+	len = write(self->thr_fd, thr_cmd_add_loopback_0, sizeof(thr_cmd_add_loopback_0));
+	ASSERT_EQ(len, sizeof(thr_cmd_add_loopback_0)) TH_LOG("device lo@0 already registered?");
+
+	self->dev_fd = open("/proc/net/pktgen/lo@0", O_RDWR);
+	ASSERT_GE(self->dev_fd, 0) TH_LOG("device entry for lo@0 missing?");
+}
+
+FIXTURE_TEARDOWN(proc_net_pktgen) {
+	int ret;
+	ssize_t len;
+
+	ret = close(self->dev_fd);
+	EXPECT_EQ(ret, 0);
+
+	len = write(self->thr_fd, thr_cmd_rm_loopback_0, sizeof(thr_cmd_rm_loopback_0));
+	EXPECT_EQ(len, sizeof(thr_cmd_rm_loopback_0));
+
+	ret = close(self->thr_fd);
+	EXPECT_EQ(ret, 0);
+
+	ret = close(self->ctrl_fd);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_F(proc_net_pktgen, wrong_ctrl_cmd) {
+	for (int i = 0; i <= sizeof(wrong_ctrl_cmd); i++) {
+		ssize_t len;
+
+		len = write(self->ctrl_fd, wrong_ctrl_cmd, i);
+		EXPECT_EQ(len, -1);
+		EXPECT_EQ(errno, EINVAL);
+	}
+}
+
+TEST_F(proc_net_pktgen, ctrl_cmd) {
+	ssize_t len;
+
+	len = write(self->ctrl_fd, ctrl_cmd_stop, sizeof(ctrl_cmd_stop));
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_stop));
+
+	len = write(self->ctrl_fd, ctrl_cmd_stop, sizeof(ctrl_cmd_stop) - 1);
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_stop) - 1);
+
+	len = write(self->ctrl_fd, ctrl_cmd_start, sizeof(ctrl_cmd_start));
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_start));
+
+	len = write(self->ctrl_fd, ctrl_cmd_start, sizeof(ctrl_cmd_start) - 1);
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_start) - 1);
+
+	len = write(self->ctrl_fd, ctrl_cmd_reset, sizeof(ctrl_cmd_reset));
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_reset));
+
+	len = write(self->ctrl_fd, ctrl_cmd_reset, sizeof(ctrl_cmd_reset) - 1);
+	EXPECT_EQ(len,	sizeof(ctrl_cmd_reset) - 1);
+}
+
+TEST_F(proc_net_pktgen, wrong_thr_cmd) {
+	for (int i = 0; i <= sizeof(wrong_thr_cmd); i++) {
+		ssize_t len;
+
+		len = write(self->thr_fd, wrong_thr_cmd, i);
+		EXPECT_EQ(len, -1);
+		EXPECT_EQ(errno, EINVAL);
+	}
+}
+
+TEST_F(proc_net_pktgen, legacy_thr_cmd) {
+	for (int i = 0; i <= sizeof(legacy_thr_cmd); i++) {
+		ssize_t len;
+
+		len = write(self->thr_fd, legacy_thr_cmd, i);
+		if (i < (sizeof(legacy_thr_cmd) - 1)) {
+			/* incomplete command string */
+			EXPECT_EQ(len, -1);
+			EXPECT_EQ(errno, EINVAL);
+		} else {
+			/* complete command string without/with trailing '\0' */
+			EXPECT_EQ(len, i);
+		}
+	}
+}
+
+TEST_F(proc_net_pktgen, wrong_dev_cmd) {
+	for (int i = 0; i <= sizeof(wrong_dev_cmd); i++) {
+		ssize_t len;
+
+		len = write(self->dev_fd, wrong_dev_cmd, i);
+		EXPECT_EQ(len, -1);
+		EXPECT_EQ(errno, EINVAL);
+	}
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_min_pkt_size) {
+	ssize_t len;
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_0, sizeof(dev_cmd_min_pkt_size_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_0));
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_0, sizeof(dev_cmd_min_pkt_size_0) - 1);
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_0) - 1);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_1, sizeof(dev_cmd_min_pkt_size_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_1));
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_1, sizeof(dev_cmd_min_pkt_size_1) - 1);
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_1) - 1);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_2, sizeof(dev_cmd_min_pkt_size_2));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_2));
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_2, sizeof(dev_cmd_min_pkt_size_2) - 1);
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_2) - 1);
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_3, sizeof(dev_cmd_min_pkt_size_3));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_3));
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_4, sizeof(dev_cmd_min_pkt_size_4));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_4));
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_5, sizeof(dev_cmd_min_pkt_size_5));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_5));
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_6, sizeof(dev_cmd_min_pkt_size_6));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_6));
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_7, sizeof(dev_cmd_min_pkt_size_7));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_7));
+
+	len = write(self->dev_fd, dev_cmd_min_pkt_size_8, sizeof(dev_cmd_min_pkt_size_8));
+	EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_8));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_max_pkt_size) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_max_pkt_size_0, sizeof(dev_cmd_max_pkt_size_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_max_pkt_size_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_pkt_size) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_pkt_size_0, sizeof(dev_cmd_pkt_size_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_pkt_size_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_imix_weights) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_imix_weights_0, sizeof(dev_cmd_imix_weights_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_imix_weights_0));
+
+	len = write(self->dev_fd, dev_cmd_imix_weights_1, sizeof(dev_cmd_imix_weights_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_imix_weights_1));
+
+	len = write(self->dev_fd, dev_cmd_imix_weights_2, sizeof(dev_cmd_imix_weights_2));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, E2BIG);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_3, sizeof(dev_cmd_imix_weights_3));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_3, sizeof(dev_cmd_imix_weights_3) - 1);
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_4, sizeof(dev_cmd_imix_weights_4));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_4, sizeof(dev_cmd_imix_weights_4) - 1);
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_5, sizeof(dev_cmd_imix_weights_5));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_5, sizeof(dev_cmd_imix_weights_5) - 1);
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* with trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_6, sizeof(dev_cmd_imix_weights_6));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* without trailing '\0' */
+	len = write(self->dev_fd, dev_cmd_imix_weights_6, sizeof(dev_cmd_imix_weights_6) - 1);
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_debug) {
+	ssize_t len;
+
+	/* debug on */
+	len = write(self->dev_fd, dev_cmd_debug_0, sizeof(dev_cmd_debug_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_debug_0));
+
+	/* debug off */
+	len = write(self->dev_fd, dev_cmd_debug_1, sizeof(dev_cmd_debug_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_debug_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_frags) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_frags_0, sizeof(dev_cmd_frags_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_frags_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_delay) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_delay_0, sizeof(dev_cmd_delay_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_delay_0));
+
+	len = write(self->dev_fd, dev_cmd_delay_1, sizeof(dev_cmd_delay_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_delay_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_rate) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_rate_0, sizeof(dev_cmd_rate_0));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	len = write(self->dev_fd, dev_cmd_rate_1, sizeof(dev_cmd_rate_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_rate_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_ratep) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_ratep_0, sizeof(dev_cmd_ratep_0));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	len = write(self->dev_fd, dev_cmd_ratep_1, sizeof(dev_cmd_ratep_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_ratep_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_udp_src_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_udp_src_min_0, sizeof(dev_cmd_udp_src_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_udp_src_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_udp_dst_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_udp_dst_min_0, sizeof(dev_cmd_udp_dst_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_udp_dst_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_udp_src_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_udp_src_max_0, sizeof(dev_cmd_udp_src_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_udp_src_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_udp_dst_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_udp_dst_max_0, sizeof(dev_cmd_udp_dst_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_udp_dst_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_clone_skb) {
+	ssize_t len;
+
+	/* clone_skb on (gives EOPNOTSUPP on lo device) */
+	len = write(self->dev_fd, dev_cmd_clone_skb_0, sizeof(dev_cmd_clone_skb_0));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, EOPNOTSUPP);
+
+	/* clone_skb off */
+	len = write(self->dev_fd, dev_cmd_clone_skb_1, sizeof(dev_cmd_clone_skb_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_clone_skb_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_count) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_count_0, sizeof(dev_cmd_count_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_count_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_src_mac_count) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_src_mac_count_0, sizeof(dev_cmd_src_mac_count_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_src_mac_count_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst_mac_count) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst_mac_count_0, sizeof(dev_cmd_dst_mac_count_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst_mac_count_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_burst) {
+	ssize_t len;
+
+	/* burst off */
+	len = write(self->dev_fd, dev_cmd_burst_0, sizeof(dev_cmd_burst_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_burst_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_node) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_node_0, sizeof(dev_cmd_node_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_node_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_xmit_mode) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_xmit_mode_0, sizeof(dev_cmd_xmit_mode_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_0));
+
+	len = write(self->dev_fd, dev_cmd_xmit_mode_1, sizeof(dev_cmd_xmit_mode_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_1));
+
+	len = write(self->dev_fd, dev_cmd_xmit_mode_2, sizeof(dev_cmd_xmit_mode_2));
+	EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_2));
+
+	len = write(self->dev_fd, dev_cmd_xmit_mode_3, sizeof(dev_cmd_xmit_mode_3));
+	EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_3));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_flag) {
+	ssize_t len;
+
+	/* flag UDPCSUM on */
+	len = write(self->dev_fd, dev_cmd_flag_0, sizeof(dev_cmd_flag_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_flag_0));
+
+	/* flag UDPCSUM off */
+	len = write(self->dev_fd, dev_cmd_flag_1, sizeof(dev_cmd_flag_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_flag_1));
+
+	/* flag invalid */
+	len = write(self->dev_fd, dev_cmd_flag_2, sizeof(dev_cmd_flag_2));
+	EXPECT_EQ(len, sizeof(dev_cmd_flag_2));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst_min_0, sizeof(dev_cmd_dst_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst_0, sizeof(dev_cmd_dst_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst_max_0, sizeof(dev_cmd_dst_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst6) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst6_0, sizeof(dev_cmd_dst6_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst6_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst6_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst6_min_0, sizeof(dev_cmd_dst6_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst6_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst6_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst6_max_0, sizeof(dev_cmd_dst6_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst6_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_src6) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_src6_0, sizeof(dev_cmd_src6_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_src6_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_src_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_src_min_0, sizeof(dev_cmd_src_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_src_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_src_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_src_max_0, sizeof(dev_cmd_src_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_src_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_dst_mac) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_dst_mac_0, sizeof(dev_cmd_dst_mac_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_dst_mac_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_src_mac) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_src_mac_0, sizeof(dev_cmd_src_mac_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_src_mac_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_clear_counters) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_clear_counters_0, sizeof(dev_cmd_clear_counters_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_clear_counters_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_flows) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_flows_0, sizeof(dev_cmd_flows_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_flows_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_spi) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_spi_0, sizeof(dev_cmd_spi_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_spi_0)) TH_LOG("CONFIG_XFRM not enabled?");
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_flowlen) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_flowlen_0, sizeof(dev_cmd_flowlen_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_flowlen_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_queue_map_min) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_queue_map_min_0, sizeof(dev_cmd_queue_map_min_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_queue_map_min_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_queue_map_max) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_queue_map_max_0, sizeof(dev_cmd_queue_map_max_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_queue_map_max_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_mpls) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_mpls_0, sizeof(dev_cmd_mpls_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_mpls_0));
+
+	len = write(self->dev_fd, dev_cmd_mpls_1, sizeof(dev_cmd_mpls_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_mpls_1));
+
+	len = write(self->dev_fd, dev_cmd_mpls_2, sizeof(dev_cmd_mpls_2));
+	EXPECT_EQ(len, sizeof(dev_cmd_mpls_2));
+
+	len = write(self->dev_fd, dev_cmd_mpls_3, sizeof(dev_cmd_mpls_3));
+	EXPECT_EQ(len, -1);
+	EXPECT_EQ(errno, E2BIG);
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_vlan_id) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_vlan_id_0, sizeof(dev_cmd_vlan_id_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_vlan_id_0));
+
+	len = write(self->dev_fd, dev_cmd_vlan_p_0, sizeof(dev_cmd_vlan_p_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_vlan_p_0));
+
+	len = write(self->dev_fd, dev_cmd_vlan_cfi_0, sizeof(dev_cmd_vlan_cfi_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_vlan_cfi_0));
+
+	len = write(self->dev_fd, dev_cmd_vlan_id_1, sizeof(dev_cmd_vlan_id_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_vlan_id_1));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_svlan_id) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_svlan_id_0, sizeof(dev_cmd_svlan_id_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_svlan_id_0));
+
+	len = write(self->dev_fd, dev_cmd_svlan_p_0, sizeof(dev_cmd_svlan_p_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_svlan_p_0));
+
+	len = write(self->dev_fd, dev_cmd_svlan_cfi_0, sizeof(dev_cmd_svlan_cfi_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_svlan_cfi_0));
+
+	len = write(self->dev_fd, dev_cmd_svlan_id_1, sizeof(dev_cmd_svlan_id_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_svlan_id_1));
+}
+
+
+TEST_F(proc_net_pktgen, dev_cmd_tos) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_tos_0, sizeof(dev_cmd_tos_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_tos_0));
+
+	len = write(self->dev_fd, dev_cmd_tos_1, sizeof(dev_cmd_tos_1));
+	EXPECT_EQ(len, sizeof(dev_cmd_tos_1));
+
+	len = write(self->dev_fd, dev_cmd_tos_2, sizeof(dev_cmd_tos_2));
+	EXPECT_EQ(len, sizeof(dev_cmd_tos_2));
+}
+
+
+TEST_F(proc_net_pktgen, dev_cmd_traffic_class) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_traffic_class_0, sizeof(dev_cmd_traffic_class_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_traffic_class_0));
+}
+
+TEST_F(proc_net_pktgen, dev_cmd_skb_priority) {
+	ssize_t len;
+
+	len = write(self->dev_fd, dev_cmd_skb_priority_0, sizeof(dev_cmd_skb_priority_0));
+	EXPECT_EQ(len, sizeof(dev_cmd_skb_priority_0));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 1a736f700be4..84c524357075 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -48,6 +48,7 @@
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -59,6 +60,33 @@
 
 static uint32_t cfg_max_num_members;
 
+static void loopback_set_up_down(int state_up)
+{
+	struct ifreq ifreq = {};
+	int fd, err;
+
+	fd = socket(AF_PACKET, SOCK_RAW, 0);
+	if (fd < 0) {
+		perror("socket loopback");
+		exit(1);
+	}
+	strcpy(ifreq.ifr_name, "lo");
+	err = ioctl(fd, SIOCGIFFLAGS, &ifreq);
+	if (err) {
+		perror("SIOCGIFFLAGS");
+		exit(1);
+	}
+	if (state_up != !!(ifreq.ifr_flags & IFF_UP)) {
+		ifreq.ifr_flags ^= IFF_UP;
+		err = ioctl(fd, SIOCSIFFLAGS, &ifreq);
+		if (err) {
+			perror("SIOCSIFFLAGS");
+			exit(1);
+		}
+	}
+	close(fd);
+}
+
 /* Open a socket in a given fanout mode.
  * @return -1 if mode is bad, a valid socket otherwise */
 static int sock_fanout_open(uint16_t typeflags, uint16_t group_id)
@@ -165,9 +193,9 @@ static void sock_fanout_set_ebpf(int fd)
 	attr.insns = (unsigned long) prog;
 	attr.insn_cnt = ARRAY_SIZE(prog);
 	attr.license = (unsigned long) "GPL";
-	attr.log_buf = (unsigned long) log_buf,
-	attr.log_size = sizeof(log_buf),
-	attr.log_level = 1,
+	attr.log_buf = (unsigned long) log_buf;
+	attr.log_size = sizeof(log_buf);
+	attr.log_level = 1;
 
 	pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 	if (pfd < 0) {
@@ -251,6 +279,41 @@ static int sock_fanout_read(int fds[], char *rings[], const int expect[])
 	return 0;
 }
 
+/* Test that creating/joining a fanout group fails for unbound socket without
+ * a specified protocol
+ */
+static void test_unbound_fanout(void)
+{
+	int val, fd0, fd1, err;
+
+	fprintf(stderr, "test: unbound fanout\n");
+	fd0 = socket(PF_PACKET, SOCK_RAW, 0);
+	if (fd0 < 0) {
+		perror("socket packet");
+		exit(1);
+	}
+	/* Try to create a new fanout group. Should fail. */
+	val = (PACKET_FANOUT_HASH << 16) | 1;
+	err = setsockopt(fd0, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
+	if (!err) {
+		fprintf(stderr, "ERROR: unbound socket fanout create\n");
+		exit(1);
+	}
+	fd1 = sock_fanout_open(PACKET_FANOUT_HASH, 1);
+	if (fd1 == -1) {
+		fprintf(stderr, "ERROR: failed to open HASH socket\n");
+		exit(1);
+	}
+	/* Try to join an existing fanout group. Should fail. */
+	err = setsockopt(fd0, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
+	if (!err) {
+		fprintf(stderr, "ERROR: unbound socket fanout join\n");
+		exit(1);
+	}
+	close(fd0);
+	close(fd1);
+}
+
 /* Test illegal mode + flag combination */
 static void test_control_single(void)
 {
@@ -264,17 +327,22 @@ static void test_control_single(void)
 }
 
 /* Test illegal group with different modes or flags */
-static void test_control_group(void)
+static void test_control_group(int toggle)
 {
 	int fds[2];
 
-	fprintf(stderr, "test: control multiple sockets\n");
+	if (toggle)
+		fprintf(stderr, "test: control multiple sockets with link down toggle\n");
+	else
+		fprintf(stderr, "test: control multiple sockets\n");
 
 	fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 0);
 	if (fds[0] == -1) {
 		fprintf(stderr, "ERROR: failed to open HASH socket\n");
 		exit(1);
 	}
+	if (toggle)
+		loopback_set_up_down(0);
 	if (sock_fanout_open(PACKET_FANOUT_HASH |
 			       PACKET_FANOUT_FLAG_DEFRAG, 0) != -1) {
 		fprintf(stderr, "ERROR: joined group with wrong flag defrag\n");
@@ -294,6 +362,8 @@ static void test_control_group(void)
 		fprintf(stderr, "ERROR: failed to join group\n");
 		exit(1);
 	}
+	if (toggle)
+		loopback_set_up_down(1);
 	if (close(fds[1]) || close(fds[0])) {
 		fprintf(stderr, "ERROR: closing sockets\n");
 		exit(1);
@@ -488,8 +558,10 @@ int main(int argc, char **argv)
 	const int expect_uniqueid[2][2] = { { 20, 20},  { 20, 20 } };
 	int port_off = 2, tries = 20, ret;
 
+	test_unbound_fanout();
 	test_control_single();
-	test_control_group();
+	test_control_group(0);
+	test_control_group(1);
 	test_control_group_max_num_members();
 	test_unique_fanout_group_ids();
 
diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c
index 404a2ce759ab..221270cee3ea 100644
--- a/tools/testing/selftests/net/psock_tpacket.c
+++ b/tools/testing/selftests/net/psock_tpacket.c
@@ -12,7 +12,7 @@
  *
  * Datapath:
  *   Open a pair of packet sockets and send resp. receive an a priori known
- *   packet pattern accross the sockets and check if it was received resp.
+ *   packet pattern across the sockets and check if it was received resp.
  *   sent correctly. Fanout in combination with RX_RING is currently not
  *   tested here.
  *
diff --git a/tools/testing/selftests/net/rds/.gitignore b/tools/testing/selftests/net/rds/.gitignore
new file mode 100644
index 000000000000..1c6f04e2aa11
--- /dev/null
+++ b/tools/testing/selftests/net/rds/.gitignore
@@ -0,0 +1 @@
+include.sh
diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile
new file mode 100644
index 000000000000..612a7219990e
--- /dev/null
+++ b/tools/testing/selftests/net/rds/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+all:
+	@echo mk_build_dir="$(shell pwd)" > include.sh
+
+TEST_PROGS := run.sh
+
+TEST_FILES := include.sh test.py
+
+EXTRA_CLEAN := /tmp/rds_logs include.sh
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/rds/README.txt b/tools/testing/selftests/net/rds/README.txt
new file mode 100644
index 000000000000..cbde2951ab13
--- /dev/null
+++ b/tools/testing/selftests/net/rds/README.txt
@@ -0,0 +1,41 @@
+RDS self-tests
+==============
+
+These scripts provide a coverage test for RDS-TCP by creating two
+network namespaces and running rds packets between them. A loopback
+network is provisioned with optional probability of packet loss or
+corruption. A workload of 50000 hashes, each 64 characters in size,
+are passed over an RDS socket on this test network. A passing test means
+the RDS-TCP stack was able to recover properly.  The provided config.sh
+can be used to compile the kernel with the necessary gcov options.  The
+kernel may optionally be configured to omit the coverage report as well.
+
+USAGE:
+	run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]
+	       [-u packet_duplcate]
+
+OPTIONS:
+	-d	Log directory.  Defaults to tools/testing/selftests/net/rds/rds_logs
+
+	-l	Simulates a percentage of packet loss
+
+	-c	Simulates a percentage of packet corruption
+
+	-u	Simulates a percentage of packet duplication.
+
+EXAMPLE:
+
+    # Create a suitable gcov enabled .config
+    tools/testing/selftests/net/rds/config.sh -g
+
+    # Alternatly create a gcov disabled .config
+    tools/testing/selftests/net/rds/config.sh
+
+    # build the kernel
+    vng --build  --config tools/testing/selftests/net/config
+
+    # launch the tests in a VM
+    vng -v --rwdir ./ --run . --user root --cpus 4 -- \
+        "export PYTHONPATH=tools/testing/selftests/net/; tools/testing/selftests/net/rds/run.sh"
+
+An HTML coverage report will be output in tools/testing/selftests/net/rds/rds_logs/coverage/.
diff --git a/tools/testing/selftests/net/rds/config.sh b/tools/testing/selftests/net/rds/config.sh
new file mode 100755
index 000000000000..791c8dbe1095
--- /dev/null
+++ b/tools/testing/selftests/net/rds/config.sh
@@ -0,0 +1,53 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+set -u
+set -x
+
+unset KBUILD_OUTPUT
+
+GENERATE_GCOV_REPORT=0
+while getopts "g" opt; do
+  case ${opt} in
+    g)
+      GENERATE_GCOV_REPORT=1
+      ;;
+    :)
+      echo "USAGE: config.sh [-g]"
+      exit 1
+      ;;
+    ?)
+      echo "Invalid option: -${OPTARG}."
+      exit 1
+      ;;
+  esac
+done
+
+CONF_FILE="tools/testing/selftests/net/config"
+
+# no modules
+scripts/config --file "$CONF_FILE" --disable CONFIG_MODULES
+
+# enable RDS
+scripts/config --file "$CONF_FILE" --enable CONFIG_RDS
+scripts/config --file "$CONF_FILE" --enable CONFIG_RDS_TCP
+
+if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then
+	# instrument RDS and only RDS
+	scripts/config --file "$CONF_FILE" --enable CONFIG_GCOV_KERNEL
+	scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL
+	scripts/config --file "$CONF_FILE" --enable GCOV_PROFILE_RDS
+else
+	scripts/config --file "$CONF_FILE" --disable CONFIG_GCOV_KERNEL
+	scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL
+	scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_RDS
+fi
+
+# need network namespaces to run tests with veth network interfaces
+scripts/config --file "$CONF_FILE" --enable CONFIG_NET_NS
+scripts/config --file "$CONF_FILE" --enable CONFIG_VETH
+
+# simulate packet loss
+scripts/config --file "$CONF_FILE" --enable CONFIG_NET_SCH_NETEM
+
diff --git a/tools/testing/selftests/net/rds/run.sh b/tools/testing/selftests/net/rds/run.sh
new file mode 100755
index 000000000000..8aee244f582a
--- /dev/null
+++ b/tools/testing/selftests/net/rds/run.sh
@@ -0,0 +1,224 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+set -u
+
+unset KBUILD_OUTPUT
+
+current_dir="$(realpath "$(dirname "$0")")"
+build_dir="$current_dir"
+
+build_include="$current_dir/include.sh"
+if test -f "$build_include"; then
+	# this include will define "$mk_build_dir" as the location the test was
+	# built.  We will need this if the tests are installed in a location
+	# other than the kernel source
+
+	source "$build_include"
+	build_dir="$mk_build_dir"
+fi
+
+# This test requires kernel source and the *.gcda data therein
+# Locate the top level of the kernel source, and the net/rds
+# subfolder with the appropriate *.gcno object files
+ksrc_dir="$(realpath "$build_dir"/../../../../../)"
+kconfig="$ksrc_dir/.config"
+obj_dir="$ksrc_dir/net/rds"
+
+GCOV_CMD=gcov
+
+#check to see if the host has the required packages to generate a gcov report
+check_gcov_env()
+{
+	if ! which "$GCOV_CMD" > /dev/null 2>&1; then
+		echo "Warning: Could not find gcov. "
+		GENERATE_GCOV_REPORT=0
+		return
+	fi
+
+	# the gcov version must match the gcc version
+	GCC_VER=$(gcc -dumpfullversion)
+	GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}')
+	if [ "$GCOV_VER" != "$GCC_VER" ]; then
+		#attempt to find a matching gcov version
+		GCOV_CMD=gcov-$(gcc -dumpversion)
+
+		if ! which "$GCOV_CMD" > /dev/null 2>&1; then
+			echo "Warning: Could not find an appropriate gcov installation. \
+				gcov version must match gcc version"
+			GENERATE_GCOV_REPORT=0
+			return
+		fi
+
+		#recheck version number of found gcov executable
+		GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \
+			awk 'BEGIN {FS="-"}{print $1}')
+		if [ "$GCOV_VER" != "$GCC_VER" ]; then
+			echo "Warning: Could not find an appropriate gcov installation. \
+				gcov version must match gcc version"
+			GENERATE_GCOV_REPORT=0
+		else
+			echo "Warning: Mismatched gcc and gcov detected.  Using $GCOV_CMD"
+		fi
+	fi
+}
+
+# Check to see if the kconfig has the required configs to generate a coverage report
+check_gcov_conf()
+{
+	if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then
+		echo "INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports"
+		GENERATE_GCOV_REPORT=0
+	fi
+	if ! grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then
+		echo "INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports"
+		GENERATE_GCOV_REPORT=0
+	fi
+	if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then
+		echo "INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports"
+		GENERATE_GCOV_REPORT=0
+	fi
+
+	if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then
+		echo "To enable gcov reports, please run "\
+			"\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel"
+	else
+		# if we have the required kernel configs, proceed to check the environment to
+		# ensure we have the required gcov packages
+		check_gcov_env
+	fi
+}
+
+# Kselftest framework requirement - SKIP code is 4.
+check_conf_enabled() {
+	if ! grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then
+		echo "selftests: [SKIP] This test requires $1 enabled"
+		echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel"
+		exit 4
+	fi
+}
+check_conf_disabled() {
+	if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then
+		echo "selftests: [SKIP] This test requires $1 disabled"
+		echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel"
+		exit 4
+	fi
+}
+check_conf() {
+	check_conf_enabled CONFIG_NET_SCH_NETEM
+	check_conf_enabled CONFIG_VETH
+	check_conf_enabled CONFIG_NET_NS
+	check_conf_enabled CONFIG_RDS_TCP
+	check_conf_enabled CONFIG_RDS
+	check_conf_disabled CONFIG_MODULES
+}
+
+check_env()
+{
+	if ! test -d "$obj_dir"; then
+		echo "selftests: [SKIP] This test requires a kernel source tree"
+		exit 4
+	fi
+	if ! test -e "$kconfig"; then
+		echo "selftests: [SKIP] This test requires a configured kernel source tree"
+		exit 4
+	fi
+	if ! which strace > /dev/null 2>&1; then
+		echo "selftests: [SKIP] Could not run test without strace"
+		exit 4
+	fi
+	if ! which tcpdump > /dev/null 2>&1; then
+		echo "selftests: [SKIP] Could not run test without tcpdump"
+		exit 4
+	fi
+
+	if ! which python3 > /dev/null 2>&1; then
+		echo "selftests: [SKIP] Could not run test without python3"
+		exit 4
+	fi
+
+	python_major=$(python3 -c "import sys; print(sys.version_info[0])")
+	python_minor=$(python3 -c "import sys; print(sys.version_info[1])")
+	if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then
+		echo "selftests: [SKIP] Could not run test without at least python3.9"
+		python3 -V
+		exit 4
+	fi
+}
+
+LOG_DIR="$current_dir"/rds_logs
+PLOSS=0
+PCORRUPT=0
+PDUP=0
+GENERATE_GCOV_REPORT=1
+while getopts "d:l:c:u:" opt; do
+  case ${opt} in
+    d)
+      LOG_DIR=${OPTARG}
+      ;;
+    l)
+      PLOSS=${OPTARG}
+      ;;
+    c)
+      PCORRUPT=${OPTARG}
+      ;;
+    u)
+      PDUP=${OPTARG}
+      ;;
+    :)
+      echo "USAGE: run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]" \
+           "[-u packet_duplcate] [-g]"
+      exit 1
+      ;;
+    ?)
+      echo "Invalid option: -${OPTARG}."
+      exit 1
+      ;;
+  esac
+done
+
+
+check_env
+check_conf
+check_gcov_conf
+
+
+rm -fr "$LOG_DIR"
+TRACE_FILE="${LOG_DIR}/rds-strace.txt"
+COVR_DIR="${LOG_DIR}/coverage/"
+mkdir -p  "$LOG_DIR"
+mkdir -p "$COVR_DIR"
+
+set +e
+echo running RDS tests...
+echo Traces will be logged to "$TRACE_FILE"
+rm -f "$TRACE_FILE"
+strace -T -tt -o "$TRACE_FILE" python3 "$(dirname "$0")/test.py" --timeout 400 -d "$LOG_DIR" \
+       -l "$PLOSS" -c "$PCORRUPT" -u "$PDUP"
+
+test_rc=$?
+dmesg > "${LOG_DIR}/dmesg.out"
+
+if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then
+       echo saving coverage data...
+       (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \
+       while read -r f
+       do
+               cat < "/sys/kernel/debug/gcov/$f" > "/$f"
+       done)
+
+       echo running gcovr...
+       gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \
+             -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/"
+else
+       echo "Coverage report will be skipped"
+fi
+
+if [ "$test_rc" -eq 0 ]; then
+	echo "PASS: Test completed successfully"
+else
+	echo "FAIL: Test failed"
+fi
+
+exit "$test_rc"
diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py
new file mode 100755
index 000000000000..4a7178d11193
--- /dev/null
+++ b/tools/testing/selftests/net/rds/test.py
@@ -0,0 +1,265 @@
+#! /usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import argparse
+import ctypes
+import errno
+import hashlib
+import os
+import select
+import signal
+import socket
+import subprocess
+import sys
+import atexit
+from pwd import getpwuid
+from os import stat
+
+# Allow utils module to be imported from different directory
+this_dir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(os.path.join(this_dir, "../"))
+from lib.py.utils import ip
+
+libc = ctypes.cdll.LoadLibrary('libc.so.6')
+setns = libc.setns
+
+net0 = 'net0'
+net1 = 'net1'
+
+veth0 = 'veth0'
+veth1 = 'veth1'
+
+# Helper function for creating a socket inside a network namespace.
+# We need this because otherwise RDS will detect that the two TCP
+# sockets are on the same interface and use the loop transport instead
+# of the TCP transport.
+def netns_socket(netns, *args):
+    u0, u1 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET)
+
+    child = os.fork()
+    if child == 0:
+        # change network namespace
+        with open(f'/var/run/netns/{netns}') as f:
+            try:
+                ret = setns(f.fileno(), 0)
+            except IOError as e:
+                print(e.errno)
+                print(e)
+
+        # create socket in target namespace
+        s = socket.socket(*args)
+
+        # send resulting socket to parent
+        socket.send_fds(u0, [], [s.fileno()])
+
+        sys.exit(0)
+
+    # receive socket from child
+    _, s, _, _ = socket.recv_fds(u1, 0, 1)
+    os.waitpid(child, 0)
+    u0.close()
+    u1.close()
+    return socket.fromfd(s[0], *args)
+
+def signal_handler(sig, frame):
+    print('Test timed out')
+    sys.exit(1)
+
+#Parse out command line arguments.  We take an optional
+# timeout parameter and an optional log output folder
+parser = argparse.ArgumentParser(description="init script args",
+                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("-d", "--logdir", action="store",
+                    help="directory to store logs", default="/tmp")
+parser.add_argument('--timeout', help="timeout to terminate hung test",
+                    type=int, default=0)
+parser.add_argument('-l', '--loss', help="Simulate tcp packet loss",
+                    type=int, default=0)
+parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption",
+                    type=int, default=0)
+parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication",
+                    type=int, default=0)
+args = parser.parse_args()
+logdir=args.logdir
+packet_loss=str(args.loss)+'%'
+packet_corruption=str(args.corruption)+'%'
+packet_duplicate=str(args.duplicate)+'%'
+
+ip(f"netns add {net0}")
+ip(f"netns add {net1}")
+ip(f"link add type veth")
+
+addrs = [
+    # we technically don't need different port numbers, but this will
+    # help identify traffic in the network analyzer
+    ('10.0.0.1', 10000),
+    ('10.0.0.2', 20000),
+]
+
+# move interfaces to separate namespaces so they can no longer be
+# bound directly; this prevents rds from switching over from the tcp
+# transport to the loop transport.
+ip(f"link set {veth0} netns {net0} up")
+ip(f"link set {veth1} netns {net1} up")
+
+
+
+# add addresses
+ip(f"-n {net0} addr add {addrs[0][0]}/32 dev {veth0}")
+ip(f"-n {net1} addr add {addrs[1][0]}/32 dev {veth1}")
+
+# add routes
+ip(f"-n {net0} route add {addrs[1][0]}/32 dev {veth0}")
+ip(f"-n {net1} route add {addrs[0][0]}/32 dev {veth1}")
+
+# sanity check that our two interfaces/addresses are correctly set up
+# and communicating by doing a single ping
+ip(f"netns exec {net0} ping -c 1 {addrs[1][0]}")
+
+# Start a packet capture on each network
+for net in [net0, net1]:
+    tcpdump_pid = os.fork()
+    if tcpdump_pid == 0:
+        pcap = logdir+'/'+net+'.pcap'
+        subprocess.check_call(['touch', pcap])
+        user = getpwuid(stat(pcap).st_uid).pw_name
+        ip(f"netns exec {net} /usr/sbin/tcpdump -Z {user} -i any -w {pcap}")
+        sys.exit(0)
+
+# simulate packet loss, duplication and corruption
+for net, iface in [(net0, veth0), (net1, veth1)]:
+    ip(f"netns exec {net} /usr/sbin/tc qdisc add dev {iface} root netem  \
+         corrupt {packet_corruption} loss {packet_loss} duplicate  \
+         {packet_duplicate}")
+
+# add a timeout
+if args.timeout > 0:
+    signal.alarm(args.timeout)
+    signal.signal(signal.SIGALRM, signal_handler)
+
+sockets = [
+    netns_socket(net0, socket.AF_RDS, socket.SOCK_SEQPACKET),
+    netns_socket(net1, socket.AF_RDS, socket.SOCK_SEQPACKET),
+]
+
+for s, addr in zip(sockets, addrs):
+    s.bind(addr)
+    s.setblocking(0)
+
+fileno_to_socket = {
+    s.fileno(): s for s in sockets
+}
+
+addr_to_socket = {
+    addr: s for addr, s in zip(addrs, sockets)
+}
+
+socket_to_addr = {
+    s: addr for addr, s in zip(addrs, sockets)
+}
+
+send_hashes = {}
+recv_hashes = {}
+
+ep = select.epoll()
+
+for s in sockets:
+    ep.register(s, select.EPOLLRDNORM)
+
+n = 50000
+nr_send = 0
+nr_recv = 0
+
+while nr_send < n:
+    # Send as much as we can without blocking
+    print("sending...", nr_send, nr_recv)
+    while nr_send < n:
+        send_data = hashlib.sha256(
+            f'packet {nr_send}'.encode('utf-8')).hexdigest().encode('utf-8')
+
+        # pseudo-random send/receive pattern
+        sender = sockets[nr_send % 2]
+        receiver = sockets[1 - (nr_send % 3) % 2]
+
+        try:
+            sender.sendto(send_data, socket_to_addr[receiver])
+            send_hashes.setdefault((sender.fileno(), receiver.fileno()),
+                    hashlib.sha256()).update(f'<{send_data}>'.encode('utf-8'))
+            nr_send = nr_send + 1
+        except BlockingIOError as e:
+            break
+        except OSError as e:
+            if e.errno in [errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE]:
+                break
+            raise
+
+    # Receive as much as we can without blocking
+    print("receiving...", nr_send, nr_recv)
+    while nr_recv < nr_send:
+        for fileno, eventmask in ep.poll():
+            receiver = fileno_to_socket[fileno]
+
+            if eventmask & select.EPOLLRDNORM:
+                while True:
+                    try:
+                        recv_data, address = receiver.recvfrom(1024)
+                        sender = addr_to_socket[address]
+                        recv_hashes.setdefault((sender.fileno(),
+                            receiver.fileno()), hashlib.sha256()).update(
+                                    f'<{recv_data}>'.encode('utf-8'))
+                        nr_recv = nr_recv + 1
+                    except BlockingIOError as e:
+                        break
+
+    # exercise net/rds/tcp.c:rds_tcp_sysctl_reset()
+    for net in [net0, net1]:
+        ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000")
+        ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000")
+
+print("done", nr_send, nr_recv)
+
+# the Python socket module doesn't know these
+RDS_INFO_FIRST = 10000
+RDS_INFO_LAST = 10017
+
+nr_success = 0
+nr_error = 0
+
+for s in sockets:
+    for optname in range(RDS_INFO_FIRST, RDS_INFO_LAST + 1):
+        # Sigh, the Python socket module doesn't allow us to pass
+        # buffer lengths greater than 1024 for some reason. RDS
+        # wants multiple pages.
+        try:
+            s.getsockopt(socket.SOL_RDS, optname, 1024)
+            nr_success = nr_success + 1
+        except OSError as e:
+            nr_error = nr_error + 1
+            if e.errno == errno.ENOSPC:
+                # ignore
+                pass
+
+print(f"getsockopt(): {nr_success}/{nr_error}")
+
+print("Stopping network packet captures")
+subprocess.check_call(['killall', '-q', 'tcpdump'])
+
+# We're done sending and receiving stuff, now let's check if what
+# we received is what we sent.
+for (sender, receiver), send_hash in send_hashes.items():
+    recv_hash = recv_hashes.get((sender, receiver))
+
+    if recv_hash is None:
+        print("FAIL: No data received")
+        sys.exit(1)
+
+    if send_hash.hexdigest() != recv_hash.hexdigest():
+        print("FAIL: Send/recv mismatch")
+        print("hash expected:", send_hash.hexdigest())
+        print("hash received:", recv_hash.hexdigest())
+        sys.exit(1)
+
+    print(f"{sender}/{receiver}: ok")
+
+print("Success")
+sys.exit(0)
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
index 066efd30e294..7b9bf8a7bbe1 100644
--- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
@@ -112,7 +112,7 @@ TEST(reuseaddr_ports_exhausted_reusable_same_euid)
 		ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
 
 		if (opts->reuseport[0] && opts->reuseport[1]) {
-			EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
+			EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets successfully listened.");
 		} else {
 			EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
 		}
diff --git a/tools/testing/selftests/net/reuseport_addr_any.c b/tools/testing/selftests/net/reuseport_addr_any.c
index b8475cb29be7..1c43401a1c80 100644
--- a/tools/testing/selftests/net/reuseport_addr_any.c
+++ b/tools/testing/selftests/net/reuseport_addr_any.c
@@ -9,7 +9,6 @@
 #include <arpa/inet.h>
 #include <errno.h>
 #include <error.h>
-#include <linux/dccp.h>
 #include <linux/in.h>
 #include <linux/unistd.h>
 #include <stdbool.h>
@@ -21,10 +20,6 @@
 #include <sys/socket.h>
 #include <unistd.h>
 
-#ifndef SOL_DCCP
-#define SOL_DCCP 269
-#endif
-
 static const char *IP4_ADDR = "127.0.0.1";
 static const char *IP6_ADDR = "::1";
 static const char *IP4_MAPPED6 = "::ffff:127.0.0.1";
@@ -86,15 +81,6 @@ static void build_rcv_fd(int family, int proto, int *rcv_fds, int count,
 
 		if (proto == SOCK_STREAM && listen(rcv_fds[i], 10))
 			error(1, errno, "tcp: failed to listen on receive port");
-		else if (proto == SOCK_DCCP) {
-			if (setsockopt(rcv_fds[i], SOL_DCCP,
-					DCCP_SOCKOPT_SERVICE,
-					&(int) {htonl(42)}, sizeof(int)))
-				error(1, errno, "failed to setsockopt");
-
-			if (listen(rcv_fds[i], 10))
-				error(1, errno, "dccp: failed to listen on receive port");
-		}
 	}
 }
 
@@ -148,11 +134,6 @@ static int connect_and_send(int family, int proto)
 	if (fd < 0)
 		error(1, errno, "failed to create send socket");
 
-	if (proto == SOCK_DCCP &&
-		setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
-				&(int){htonl(42)}, sizeof(int)))
-		error(1, errno, "failed to setsockopt");
-
 	if (bind(fd, saddr, sz))
 		error(1, errno, "failed to bind send socket");
 
@@ -175,7 +156,7 @@ static int receive_once(int epfd, int proto)
 	if (i < 0)
 		error(1, errno, "epoll_wait failed");
 
-	if (proto == SOCK_STREAM || proto == SOCK_DCCP) {
+	if (proto == SOCK_STREAM) {
 		fd = accept(ev.data.fd, NULL, NULL);
 		if (fd < 0)
 			error(1, errno, "failed to accept");
@@ -243,20 +224,6 @@ static void run_one_test(int fam_send, int fam_rcv, int proto,
 
 static void test_proto(int proto, const char *proto_str)
 {
-	if (proto == SOCK_DCCP) {
-		int test_fd;
-
-		test_fd = socket(AF_INET, proto, 0);
-		if (test_fd < 0) {
-			if (errno == ESOCKTNOSUPPORT) {
-				fprintf(stderr, "DCCP not supported: skipping DCCP tests\n");
-				return;
-			} else
-				error(1, errno, "failed to create a DCCP socket");
-		}
-		close(test_fd);
-	}
-
 	fprintf(stderr, "%s IPv4 ... ", proto_str);
 	run_one_test(AF_INET, AF_INET, proto, IP4_ADDR);
 
@@ -271,7 +238,6 @@ int main(void)
 {
 	test_proto(SOCK_DGRAM, "UDP");
 	test_proto(SOCK_STREAM, "TCP");
-	test_proto(SOCK_DCCP, "DCCP");
 
 	fprintf(stderr, "SUCCESS\n");
 	return 0;
diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py
new file mode 100755
index 000000000000..e9ad5e88da97
--- /dev/null
+++ b/tools/testing/selftests/net/rtnetlink.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_exit, ksft_run, ksft_ge, RtnlAddrFamily
+import socket
+
+IPV4_ALL_HOSTS_MULTICAST = b'\xe0\x00\x00\x01'
+
+def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None:
+    """
+    Verify that at least one interface has the IPv4 all-hosts multicast address.
+    At least the loopback interface should have this address.
+    """
+
+    addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True)
+
+    all_host_multicasts = [
+        addr for addr in addresses if addr['multicast'] == IPV4_ALL_HOSTS_MULTICAST
+    ]
+
+    ksft_ge(len(all_host_multicasts), 1,
+            "No interface found with the IPv4 all-hosts multicast address")
+
+def main() -> None:
+    rtnl = RtnlAddrFamily()
+    ksft_run([dump_mcaddr_check], args=(rtnl, ))
+    ksft_exit()
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index bdf6f10d0558..2e8243a65b50 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -21,14 +21,15 @@ ALL_TESTS="
 	kci_test_vrf
 	kci_test_encap
 	kci_test_macsec
-	kci_test_macsec_offload
 	kci_test_ipsec
 	kci_test_ipsec_offload
 	kci_test_fdb_get
+	kci_test_fdb_del
 	kci_test_neigh_get
 	kci_test_bridge_parent_id
 	kci_test_address_proto
 	kci_test_enslave_bonding
+	kci_test_mngtmpaddr
 "
 
 devdummy="test-dummy0"
@@ -44,6 +45,7 @@ check_err()
 	if [ $ret -eq 0 ]; then
 		ret=$1
 	fi
+	[ -n "$2" ] && echo "$2"
 }
 
 # same but inverted -- used when command must fail for test to pass
@@ -559,73 +561,6 @@ kci_test_macsec()
 	end_test "PASS: macsec"
 }
 
-kci_test_macsec_offload()
-{
-	sysfsd=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/
-	sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
-	probed=false
-	local ret=0
-	run_cmd_grep "^Usage: ip macsec" ip macsec help
-	if [ $? -ne 0 ]; then
-		end_test "SKIP: macsec: iproute2 too old"
-		return $ksft_skip
-	fi
-
-	if ! mount | grep -q debugfs; then
-		mount -t debugfs none /sys/kernel/debug/ &> /dev/null
-	fi
-
-	# setup netdevsim since dummydev doesn't have offload support
-	if [ ! -w /sys/bus/netdevsim/new_device ] ; then
-		run_cmd modprobe -q netdevsim
-
-		if [ $ret -ne 0 ]; then
-			end_test "SKIP: macsec_offload can't load netdevsim"
-			return $ksft_skip
-		fi
-		probed=true
-	fi
-
-	echo "0" > /sys/bus/netdevsim/new_device
-	while [ ! -d $sysfsnet ] ; do :; done
-	udevadm settle
-	dev=`ls $sysfsnet`
-
-	ip link set $dev up
-	if [ ! -d $sysfsd ] ; then
-		end_test "FAIL: macsec_offload can't create device $dev"
-		return 1
-	fi
-	run_cmd_grep 'macsec-hw-offload: on' ethtool -k $dev
-	if [ $? -eq 1 ] ; then
-		end_test "FAIL: macsec_offload netdevsim doesn't support MACsec offload"
-		return 1
-	fi
-	run_cmd ip link add link $dev kci_macsec1 type macsec port 4 offload mac
-	run_cmd ip link add link $dev kci_macsec2 type macsec address "aa:bb:cc:dd:ee:ff" port 5 offload mac
-	run_cmd ip link add link $dev kci_macsec3 type macsec sci abbacdde01020304 offload mac
-	run_cmd_fail ip link add link $dev kci_macsec4 type macsec port 8 offload mac
-
-	msname=kci_macsec1
-	run_cmd ip macsec add "$msname" tx sa 0 pn 1024 on key 01 12345678901234567890123456789012
-	run_cmd ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef"
-	run_cmd ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" sa 0 pn 1 on \
-		key 00 0123456789abcdef0123456789abcdef
-	run_cmd_fail ip macsec add "$msname" rx port 1235 address "1c:ed:de:ad:be:ef"
-	# clean up any leftovers
-	for msdev in kci_macsec{1,2,3,4} ; do
-	    ip link del $msdev 2> /dev/null
-	done
-	echo 0 > /sys/bus/netdevsim/del_device
-	$probed && rmmod netdevsim
-
-	if [ $ret -ne 0 ]; then
-		end_test "FAIL: macsec_offload"
-		return 1
-	fi
-	end_test "PASS: macsec_offload"
-}
-
 #-------------------------------------------------------------------
 # Example commands
 #   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
@@ -809,10 +744,10 @@ kci_test_ipsec_offload()
 	# does driver have correct offload info
 	run_cmd diff $sysfsf - << EOF
 SA count=2 tx=3
-sa[0] tx ipaddr=0x00000000 00000000 00000000 00000000
+sa[0] tx ipaddr=$dstip
 sa[0]    spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
 sa[0]    key=0x34333231 38373635 32313039 36353433
-sa[1] rx ipaddr=0x00000000 00000000 00000000 037ba8c0
+sa[1] rx ipaddr=$srcip
 sa[1]    spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
 sa[1]    key=0x34333231 38373635 32313039 36353433
 EOF
@@ -1065,6 +1000,45 @@ kci_test_fdb_get()
 	end_test "PASS: bridge fdb get"
 }
 
+kci_test_fdb_del()
+{
+	local test_mac=de:ad:be:ef:13:37
+	local dummydev="dummy1"
+	local brdev="test-br0"
+	local ret=0
+
+	run_cmd_grep 'bridge fdb get' bridge fdb help
+	if [ $? -ne 0 ]; then
+		end_test "SKIP: fdb del tests: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	setup_ns testns
+	if [ $? -ne 0 ]; then
+		end_test "SKIP fdb del tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+	IP="ip -netns $testns"
+	BRIDGE="bridge -netns $testns"
+	run_cmd $IP link add $dummydev type dummy
+	run_cmd $IP link add name $brdev type bridge vlan_filtering 1
+	run_cmd $IP link set dev $dummydev master $brdev
+	run_cmd $BRIDGE fdb add $test_mac dev $dummydev master static vlan 1
+	run_cmd $BRIDGE vlan del vid 1 dev $dummydev
+	run_cmd $BRIDGE fdb get $test_mac br $brdev vlan 1
+	run_cmd $BRIDGE fdb del $test_mac dev $dummydev master vlan 1
+	run_cmd_fail $BRIDGE fdb get $test_mac br $brdev vlan 1
+
+	ip netns del $testns &>/dev/null
+
+	if [ $ret -ne 0 ]; then
+		end_test "FAIL: bridge fdb del"
+		return 1
+	fi
+
+	end_test "PASS: bridge fdb del"
+}
+
 kci_test_neigh_get()
 {
 	dstmac=de:ad:be:ef:13:37
@@ -1267,6 +1241,99 @@ kci_test_enslave_bonding()
 	ip netns del "$testns"
 }
 
+# Called to validate the addresses on $IFNAME:
+#
+# 1. Every `temporary` address must have a matching `mngtmpaddr`
+# 2. Every `mngtmpaddr` address must have some un`deprecated` `temporary`
+#
+# If the mngtmpaddr or tempaddr checking failed, return 0 and stop slowwait
+validate_mngtmpaddr()
+{
+	local dev=$1
+	local prefix=""
+	local addr_list=$(ip -j -n $testns addr show dev ${dev})
+	local temp_addrs=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.temporary == true) | .local')
+	local mng_prefixes=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.mngtmpaddr == true) | .local' | \
+		cut -d: -f1-4 | tr '\n' ' ')
+	local undep_prefixes=$(echo ${addr_list} | \
+		jq -r '.[].addr_info[] | select(.temporary == true and .deprecated != true) | .local' | \
+		cut -d: -f1-4 | tr '\n' ' ')
+
+	# 1. All temporary addresses (temp and dep) must have a matching mngtmpaddr
+	for address in ${temp_addrs}; do
+		prefix=$(echo ${address} | cut -d: -f1-4)
+		if [[ ! " ${mng_prefixes} " =~ " $prefix " ]]; then
+			check_err 1 "FAIL: Temporary $address with no matching mngtmpaddr!";
+			return 0
+		fi
+	done
+
+	# 2. All mngtmpaddr addresses must have a temporary address (not dep)
+	for prefix in ${mng_prefixes}; do
+		if [[ ! " ${undep_prefixes} " =~ " $prefix " ]]; then
+			check_err 1 "FAIL: No undeprecated temporary in $prefix!";
+			return 0
+		fi
+	done
+
+	return 1
+}
+
+kci_test_mngtmpaddr()
+{
+	local ret=0
+
+	setup_ns testns
+	if [ $? -ne 0 ]; then
+		end_test "SKIP mngtmpaddr tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+
+	# 1. Create a dummy Ethernet interface
+	run_cmd ip -n $testns link add ${devdummy} type dummy
+	run_cmd ip -n $testns link set ${devdummy} up
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.use_tempaddr=1
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.temp_prefered_lft=10
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.temp_valid_lft=25
+	run_cmd ip netns exec $testns sysctl -w net.ipv6.conf.${devdummy}.max_desync_factor=1
+
+	# 2. Create several mngtmpaddr addresses on that interface.
+	# with temp_*_lft configured to be pretty short (10 and 35 seconds
+	# for prefer/valid respectively)
+	for i in $(seq 1 9); do
+		run_cmd ip -n $testns addr add 2001:db8:7e57:${i}::1/64 mngtmpaddr dev ${devdummy}
+	done
+
+	# 3. Confirm that a preferred temporary address exists for each mngtmpaddr
+	# address at all times, polling once per second for 30 seconds.
+	slowwait 30 validate_mngtmpaddr ${devdummy}
+
+	# 4. Delete each mngtmpaddr address, one at a time (alternating between
+	# deleting and merely un-mngtmpaddr-ing), and confirm that the other
+	# mngtmpaddr addresses still have preferred temporaries.
+	for i in $(seq 1 9); do
+		(( $i % 4 == 0 )) && mng_flag="mngtmpaddr" || mng_flag=""
+		if (( $i % 2 == 0 )); then
+			run_cmd ip -n $testns addr del 2001:db8:7e57:${i}::1/64 $mng_flag dev ${devdummy}
+		else
+			run_cmd ip -n $testns addr change 2001:db8:7e57:${i}::1/64 dev ${devdummy}
+		fi
+		# the temp addr should be deleted
+		validate_mngtmpaddr ${devdummy}
+	done
+
+	if [ $ret -ne 0 ]; then
+		end_test "FAIL: mngtmpaddr add/remove incorrect"
+	else
+		end_test "PASS: mngtmpaddr add/remove correctly"
+	fi
+
+	ip netns del "$testns"
+	return $ret
+}
+
 kci_test_rtnl()
 {
 	local current_test
diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c
index 9eb42570294d..16ac4df55fdb 100644
--- a/tools/testing/selftests/net/rxtimestamp.c
+++ b/tools/testing/selftests/net/rxtimestamp.c
@@ -57,6 +57,8 @@ static struct sof_flag sof_flags[] = {
 	SOF_FLAG(SOF_TIMESTAMPING_SOFTWARE),
 	SOF_FLAG(SOF_TIMESTAMPING_RX_SOFTWARE),
 	SOF_FLAG(SOF_TIMESTAMPING_RX_HARDWARE),
+	SOF_FLAG(SOF_TIMESTAMPING_OPT_RX_FILTER),
+	SOF_FLAG(SOF_TIMESTAMPING_RAW_HARDWARE),
 };
 
 static struct socket_type socket_types[] = {
@@ -98,6 +100,22 @@ static struct test_case test_cases[] = {
 		{}
 	},
 	{
+		{ .so_timestamping = SOF_TIMESTAMPING_RAW_HARDWARE
+			| SOF_TIMESTAMPING_OPT_RX_FILTER },
+		{}
+	},
+	{
+		{ .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+			| SOF_TIMESTAMPING_OPT_RX_FILTER },
+		{}
+	},
+	{
+		{ .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+			| SOF_TIMESTAMPING_RX_SOFTWARE
+			| SOF_TIMESTAMPING_OPT_RX_FILTER },
+		{ .swtstamp = true }
+	},
+	{
 		{ .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
 			| SOF_TIMESTAMPING_RX_SOFTWARE },
 		{ .swtstamp = true }
diff --git a/tools/testing/selftests/net/sample_map_ret0.bpf.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c
new file mode 100644
index 000000000000..43ca92594926
--- /dev/null
+++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, __u32);
+	__type(value, long);
+	__uint(max_entries, 2);
+} htab SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, long);
+	__uint(max_entries, 2);
+} array SEC(".maps");
+
+/* Sample program which should always load for testing control paths. */
+SEC("xdp") int func()
+{
+	__u64 key64 = 0;
+	__u32 key = 0;
+	long *value;
+
+	value = bpf_map_lookup_elem(&htab, &key);
+	if (!value)
+		return 1;
+	value = bpf_map_lookup_elem(&array, &key64);
+	if (!value)
+		return 1;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/sample_ret0.bpf.c b/tools/testing/selftests/net/sample_ret0.bpf.c
new file mode 100644
index 000000000000..1df5ca98bb65
--- /dev/null
+++ b/tools/testing/selftests/net/sample_ret0.bpf.c
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+
+#define SEC(name) __attribute__((section(name), used))
+
+/* Sample program which should always load for testing control paths. */
+SEC("xdp")
+int func()
+{
+	return 0;
+}
diff --git a/tools/testing/selftests/net/setup_veth.sh b/tools/testing/selftests/net/setup_veth.sh
index 1f78a87f6f37..152bf4c65747 100644
--- a/tools/testing/selftests/net/setup_veth.sh
+++ b/tools/testing/selftests/net/setup_veth.sh
@@ -11,7 +11,8 @@ setup_veth_ns() {
 	local -r ns_mac="$4"
 
 	[[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}"
-	echo 1000000 > "/sys/class/net/${ns_dev}/gro_flush_timeout"
+	echo 200000 > "/sys/class/net/${ns_dev}/gro_flush_timeout"
+	echo 1 > "/sys/class/net/${ns_dev}/napi_defer_hard_irqs"
 	ip link set dev "${ns_dev}" netns "${ns_name}" mtu 65535
 	ip -netns "${ns_name}" link set dev "${ns_dev}" up
 
diff --git a/tools/testing/selftests/net/sk_so_peek_off.c b/tools/testing/selftests/net/sk_so_peek_off.c
new file mode 100644
index 000000000000..d87dd8d8d491
--- /dev/null
+++ b/tools/testing/selftests/net/sk_so_peek_off.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include "../kselftest.h"
+
+static char *afstr(int af, int proto)
+{
+	if (proto == IPPROTO_TCP)
+		return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6";
+	else
+		return af == AF_INET ? "UDP/IPv4" : "UDP/IPv6";
+}
+
+int sk_peek_offset_probe(sa_family_t af, int proto)
+{
+	int type = (proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM);
+	int optv = 0;
+	int ret = 0;
+	int s;
+
+	s = socket(af, type, proto);
+	if (s < 0) {
+		ksft_perror("Temporary TCP socket creation failed");
+	} else {
+		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
+			ret = 1;
+		else
+			printf("%s does not support SO_PEEK_OFF\n", afstr(af, proto));
+		close(s);
+	}
+	return ret;
+}
+
+static void sk_peek_offset_set(int s, int offset)
+{
+	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
+		ksft_perror("Failed to set SO_PEEK_OFF value\n");
+}
+
+static int sk_peek_offset_get(int s)
+{
+	int offset;
+	socklen_t len = sizeof(offset);
+
+	if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len))
+		ksft_perror("Failed to get SO_PEEK_OFF value\n");
+	return offset;
+}
+
+static int sk_peek_offset_test(sa_family_t af, int proto)
+{
+	int type = (proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM);
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in a4;
+		struct sockaddr_in6 a6;
+	} a;
+	int res = 0;
+	int s[2] = {0, 0};
+	int recv_sock = 0;
+	int offset = 0;
+	ssize_t len;
+	char buf[2];
+
+	memset(&a, 0, sizeof(a));
+	a.sa.sa_family = af;
+
+	s[0] = recv_sock = socket(af, type, proto);
+	s[1] = socket(af, type, proto);
+
+	if (s[0] < 0 || s[1] < 0) {
+		ksft_perror("Temporary socket creation failed\n");
+		goto out;
+	}
+	if (bind(s[0], &a.sa, sizeof(a)) < 0) {
+		ksft_perror("Temporary socket bind() failed\n");
+		goto out;
+	}
+	if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) {
+		ksft_perror("Temporary socket getsockname() failed\n");
+		goto out;
+	}
+	if (proto == IPPROTO_TCP && listen(s[0], 0) < 0) {
+		ksft_perror("Temporary socket listen() failed\n");
+		goto out;
+	}
+	if (connect(s[1], &a.sa, sizeof(a)) < 0) {
+		ksft_perror("Temporary socket connect() failed\n");
+		goto out;
+	}
+	if (proto == IPPROTO_TCP) {
+		recv_sock = accept(s[0], NULL, NULL);
+		if (recv_sock <= 0) {
+			ksft_perror("Temporary socket accept() failed\n");
+			goto out;
+		}
+	}
+
+	/* Some basic tests of getting/setting offset */
+	offset = sk_peek_offset_get(recv_sock);
+	if (offset != -1) {
+		ksft_perror("Initial value of socket offset not -1\n");
+		goto out;
+	}
+	sk_peek_offset_set(recv_sock, 0);
+	offset = sk_peek_offset_get(recv_sock);
+	if (offset != 0) {
+		ksft_perror("Failed to set socket offset to 0\n");
+		goto out;
+	}
+
+	/* Transfer a message */
+	if (send(s[1], (char *)("ab"), 2, 0) != 2) {
+		ksft_perror("Temporary probe socket send() failed\n");
+		goto out;
+	}
+	/* Read first byte */
+	len = recv(recv_sock, buf, 1, MSG_PEEK);
+	if (len != 1 || buf[0] != 'a') {
+		ksft_perror("Failed to read first byte of message\n");
+		goto out;
+	}
+	offset = sk_peek_offset_get(recv_sock);
+	if (offset != 1) {
+		ksft_perror("Offset not forwarded correctly at first byte\n");
+		goto out;
+	}
+	/* Try to read beyond last byte */
+	len = recv(recv_sock, buf, 2, MSG_PEEK);
+	if (len != 1 || buf[0] != 'b') {
+		ksft_perror("Failed to read last byte of message\n");
+		goto out;
+	}
+	offset = sk_peek_offset_get(recv_sock);
+	if (offset != 2) {
+		ksft_perror("Offset not forwarded correctly at last byte\n");
+		goto out;
+	}
+	/* Flush message */
+	len = recv(recv_sock, buf, 2, MSG_TRUNC);
+	if (len != 2) {
+		ksft_perror("Failed to flush message\n");
+		goto out;
+	}
+	offset = sk_peek_offset_get(recv_sock);
+	if (offset != 0) {
+		ksft_perror("Offset not reverted correctly after flush\n");
+		goto out;
+	}
+
+	printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af, proto));
+	res = 1;
+out:
+	if (proto == IPPROTO_TCP && recv_sock >= 0)
+		close(recv_sock);
+	if (s[1] >= 0)
+		close(s[1]);
+	if (s[0] >= 0)
+		close(s[0]);
+	return res;
+}
+
+static int do_test(int proto)
+{
+	int res4, res6;
+
+	res4 = sk_peek_offset_probe(AF_INET, proto);
+	res6 = sk_peek_offset_probe(AF_INET6, proto);
+
+	if (!res4 && !res6)
+		return KSFT_SKIP;
+
+	if (res4)
+		res4 = sk_peek_offset_test(AF_INET, proto);
+
+	if (res6)
+		res6 = sk_peek_offset_test(AF_INET6, proto);
+
+	if (!res4 || !res6)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
+
+int main(void)
+{
+	int restcp, resudp;
+
+	restcp = do_test(IPPROTO_TCP);
+	resudp = do_test(IPPROTO_UDP);
+	if (restcp == KSFT_FAIL || resudp == KSFT_FAIL)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/net/skf_net_off.c b/tools/testing/selftests/net/skf_net_off.c
new file mode 100644
index 000000000000..1fdf61d6cd7f
--- /dev/null
+++ b/tools/testing/selftests/net/skf_net_off.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Open a tun device.
+ *
+ * [modifications: use IFF_NAPI_FRAGS, add sk filter]
+ *
+ * Expects the device to have been configured previously, e.g.:
+ *   sudo ip tuntap add name tap1 mode tap
+ *   sudo ip link set tap1 up
+ *   sudo ip link set dev tap1 addr 02:00:00:00:00:01
+ *   sudo ip -6 addr add fdab::1 peer fdab::2 dev tap1 nodad
+ *
+ * And to avoid premature pskb_may_pull:
+ *
+ *   sudo ethtool -K tap1 gro off
+ *   sudo bash -c 'echo 0 > /proc/sys/net/ipv4/ip_early_demux'
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/filter.h>
+#include <linux/if.h>
+#include <linux/if_packet.h>
+#include <linux/if_tun.h>
+#include <linux/ipv6.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+static bool cfg_do_filter;
+static bool cfg_do_frags;
+static int cfg_dst_port = 8000;
+static char *cfg_ifname;
+
+static int tun_open(const char *tun_name)
+{
+	struct ifreq ifr = {0};
+	int fd, ret;
+
+	fd = open("/dev/net/tun", O_RDWR);
+	if (fd == -1)
+		error(1, errno, "open /dev/net/tun");
+
+	ifr.ifr_flags = IFF_TAP;
+	if (cfg_do_frags)
+		ifr.ifr_flags |= IFF_NAPI | IFF_NAPI_FRAGS;
+
+	strncpy(ifr.ifr_name, tun_name, IFNAMSIZ - 1);
+
+	ret = ioctl(fd, TUNSETIFF, &ifr);
+	if (ret)
+		error(1, ret, "ioctl TUNSETIFF");
+
+	return fd;
+}
+
+static void sk_set_filter(int fd)
+{
+	const int offset_proto = offsetof(struct ip6_hdr, ip6_nxt);
+	const int offset_dport = sizeof(struct ip6_hdr) + offsetof(struct udphdr, dest);
+
+	/* Filter UDP packets with destination port cfg_dst_port */
+	struct sock_filter filter_code[] = {
+		BPF_STMT(BPF_LD  + BPF_B   + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4),
+		BPF_STMT(BPF_LD  + BPF_B   + BPF_ABS, SKF_NET_OFF + offset_proto),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 2),
+		BPF_STMT(BPF_LD  + BPF_H   + BPF_ABS, SKF_NET_OFF + offset_dport),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_dst_port, 1, 0),
+		BPF_STMT(BPF_RET + BPF_K, 0),
+		BPF_STMT(BPF_RET + BPF_K, 0xFFFF),
+	};
+
+	struct sock_fprog filter = {
+		sizeof(filter_code) / sizeof(filter_code[0]),
+		filter_code,
+	};
+
+	if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &filter, sizeof(filter)))
+		error(1, errno, "setsockopt attach filter");
+}
+
+static int raw_open(void)
+{
+	int fd;
+
+	fd = socket(PF_INET6, SOCK_RAW, IPPROTO_UDP);
+	if (fd == -1)
+		error(1, errno, "socket raw (udp)");
+
+	if (cfg_do_filter)
+		sk_set_filter(fd);
+
+	return fd;
+}
+
+static void tun_write(int fd)
+{
+	const char eth_src[] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 };
+	const char eth_dst[] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
+	struct tun_pi pi = {0};
+	struct ipv6hdr ip6h = {0};
+	struct udphdr uh = {0};
+	struct ethhdr eth = {0};
+	uint32_t payload;
+	struct iovec iov[5];
+	int ret;
+
+	pi.proto = htons(ETH_P_IPV6);
+
+	memcpy(eth.h_source, eth_src, sizeof(eth_src));
+	memcpy(eth.h_dest, eth_dst, sizeof(eth_dst));
+	eth.h_proto = htons(ETH_P_IPV6);
+
+	ip6h.version = 6;
+	ip6h.payload_len = htons(sizeof(uh) + sizeof(uint32_t));
+	ip6h.nexthdr = IPPROTO_UDP;
+	ip6h.hop_limit = 8;
+	if (inet_pton(AF_INET6, "fdab::2", &ip6h.saddr) != 1)
+		error(1, errno, "inet_pton src");
+	if (inet_pton(AF_INET6, "fdab::1", &ip6h.daddr) != 1)
+		error(1, errno, "inet_pton src");
+
+	uh.source = htons(8000);
+	uh.dest = htons(cfg_dst_port);
+	uh.len = ip6h.payload_len;
+	uh.check = 0;
+
+	payload = htonl(0xABABABAB);		/* Covered in IPv6 length */
+
+	iov[0].iov_base = &pi;
+	iov[0].iov_len  = sizeof(pi);
+	iov[1].iov_base = &eth;
+	iov[1].iov_len  = sizeof(eth);
+	iov[2].iov_base = &ip6h;
+	iov[2].iov_len  = sizeof(ip6h);
+	iov[3].iov_base = &uh;
+	iov[3].iov_len  = sizeof(uh);
+	iov[4].iov_base = &payload;
+	iov[4].iov_len  = sizeof(payload);
+
+	ret = writev(fd, iov, sizeof(iov) / sizeof(iov[0]));
+	if (ret <= 0)
+		error(1, errno, "writev");
+}
+
+static void raw_read(int fd)
+{
+	struct timeval tv = { .tv_usec = 100 * 1000 };
+	struct msghdr msg = {0};
+	struct iovec iov[2];
+	struct udphdr uh;
+	uint32_t payload[2];
+	int ret;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+		error(1, errno, "setsockopt rcvtimeo udp");
+
+	iov[0].iov_base = &uh;
+	iov[0].iov_len = sizeof(uh);
+
+	iov[1].iov_base = payload;
+	iov[1].iov_len = sizeof(payload);
+
+	msg.msg_iov = iov;
+	msg.msg_iovlen = sizeof(iov) / sizeof(iov[0]);
+
+	ret = recvmsg(fd, &msg, 0);
+	if (ret <= 0)
+		error(1, errno, "read raw");
+	if (ret != sizeof(uh) + sizeof(payload[0]))
+		error(1, errno, "read raw: len=%d\n", ret);
+
+	fprintf(stderr, "raw recv: 0x%x\n", payload[0]);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "fFi:")) != -1) {
+		switch (c) {
+		case 'f':
+			cfg_do_filter = true;
+			printf("bpf filter enabled\n");
+			break;
+		case 'F':
+			cfg_do_frags = true;
+			printf("napi frags mode enabled\n");
+			break;
+		case 'i':
+			cfg_ifname = optarg;
+			break;
+		default:
+			error(1, 0, "unknown option %c", optopt);
+			break;
+		}
+	}
+
+	if (!cfg_ifname)
+		error(1, 0, "must specify tap interface name (-i)");
+}
+
+int main(int argc, char **argv)
+{
+	int fdt, fdr;
+
+	parse_opts(argc, argv);
+
+	fdr = raw_open();
+	fdt = tun_open(cfg_ifname);
+
+	tun_write(fdt);
+	raw_read(fdr);
+
+	if (close(fdt))
+		error(1, errno, "close tun");
+	if (close(fdr))
+		error(1, errno, "close udp");
+
+	fprintf(stderr, "OK\n");
+	return 0;
+}
+
diff --git a/tools/testing/selftests/net/skf_net_off.sh b/tools/testing/selftests/net/skf_net_off.sh
new file mode 100755
index 000000000000..5da5066fb465
--- /dev/null
+++ b/tools/testing/selftests/net/skf_net_off.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+readonly NS="ns-$(mktemp -u XXXXXX)"
+
+cleanup() {
+	ip netns del $NS
+}
+
+ip netns add $NS
+trap cleanup EXIT
+
+ip -netns $NS link set lo up
+ip -netns $NS tuntap add name tap1 mode tap
+ip -netns $NS link set tap1 up
+ip -netns $NS link set dev tap1 addr 02:00:00:00:00:01
+ip -netns $NS -6 addr add fdab::1 peer fdab::2 dev tap1 nodad
+ip netns exec $NS ethtool -K tap1 gro off
+
+# disable early demux, else udp_v6_early_demux pulls udp header into linear
+ip netns exec $NS sysctl -w net.ipv4.ip_early_demux=0
+
+echo "no filter"
+ip netns exec $NS ./skf_net_off -i tap1
+
+echo "filter, linear skb (-f)"
+ip netns exec $NS ./skf_net_off -i tap1 -f
+
+echo "filter, fragmented skb (-f) (-F)"
+ip netns exec $NS ./skf_net_off -i tap1 -f -F
diff --git a/tools/testing/selftests/net/so_rcv_listener.c b/tools/testing/selftests/net/so_rcv_listener.c
new file mode 100644
index 000000000000..bc5841192aa6
--- /dev/null
+++ b/tools/testing/selftests/net/so_rcv_listener.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#ifndef SO_RCVPRIORITY
+#define SO_RCVPRIORITY 82
+#endif
+
+struct options {
+	__u32 val;
+	int name;
+	int rcvname;
+	const char *host;
+	const char *service;
+} opt;
+
+static void __attribute__((noreturn)) usage(const char *bin)
+{
+	printf("Usage: %s [opts] <dst host> <dst port / service>\n", bin);
+	printf("Options:\n"
+		"\t\t-M val  Test SO_RCVMARK\n"
+		"\t\t-P val  Test SO_RCVPRIORITY\n"
+		"");
+	exit(EXIT_FAILURE);
+}
+
+static void parse_args(int argc, char *argv[])
+{
+	int o;
+
+	while ((o = getopt(argc, argv, "M:P:")) != -1) {
+		switch (o) {
+		case 'M':
+			opt.val = atoi(optarg);
+			opt.name = SO_MARK;
+			opt.rcvname = SO_RCVMARK;
+			break;
+		case 'P':
+			opt.val = atoi(optarg);
+			opt.name = SO_PRIORITY;
+			opt.rcvname = SO_RCVPRIORITY;
+			break;
+		default:
+			usage(argv[0]);
+			break;
+		}
+	}
+
+	if (optind != argc - 2)
+		usage(argv[0]);
+
+	opt.host = argv[optind];
+	opt.service = argv[optind + 1];
+}
+
+int main(int argc, char *argv[])
+{
+	int err = 0;
+	int recv_fd = -1;
+	int ret_value = 0;
+	__u32 recv_val;
+	struct cmsghdr *cmsg;
+	char cbuf[CMSG_SPACE(sizeof(__u32))];
+	char recv_buf[CMSG_SPACE(sizeof(__u32))];
+	struct iovec iov[1];
+	struct msghdr msg;
+	struct sockaddr_in recv_addr4;
+	struct sockaddr_in6 recv_addr6;
+
+	parse_args(argc, argv);
+
+	int family = strchr(opt.host, ':') ? AF_INET6 : AF_INET;
+
+	recv_fd = socket(family, SOCK_DGRAM, IPPROTO_UDP);
+	if (recv_fd < 0) {
+		perror("Can't open recv socket");
+		ret_value = -errno;
+		goto cleanup;
+	}
+
+	err = setsockopt(recv_fd, SOL_SOCKET, opt.rcvname, &opt.val, sizeof(opt.val));
+	if (err < 0) {
+		perror("Recv setsockopt error");
+		ret_value = -errno;
+		goto cleanup;
+	}
+
+	if (family == AF_INET) {
+		memset(&recv_addr4, 0, sizeof(recv_addr4));
+		recv_addr4.sin_family = family;
+		recv_addr4.sin_port = htons(atoi(opt.service));
+
+		if (inet_pton(family, opt.host, &recv_addr4.sin_addr) <= 0) {
+			perror("Invalid IPV4 address");
+			ret_value = -errno;
+			goto cleanup;
+		}
+
+		err = bind(recv_fd, (struct sockaddr *)&recv_addr4, sizeof(recv_addr4));
+	} else {
+		memset(&recv_addr6, 0, sizeof(recv_addr6));
+		recv_addr6.sin6_family = family;
+		recv_addr6.sin6_port = htons(atoi(opt.service));
+
+		if (inet_pton(family, opt.host, &recv_addr6.sin6_addr) <= 0) {
+			perror("Invalid IPV6 address");
+			ret_value = -errno;
+			goto cleanup;
+		}
+
+		err = bind(recv_fd, (struct sockaddr *)&recv_addr6, sizeof(recv_addr6));
+	}
+
+	if (err < 0) {
+		perror("Recv bind error");
+		ret_value = -errno;
+		goto cleanup;
+	}
+
+	iov[0].iov_base = recv_buf;
+	iov[0].iov_len = sizeof(recv_buf);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+
+	err = recvmsg(recv_fd, &msg, 0);
+	if (err < 0) {
+		perror("Message receive error");
+		ret_value = -errno;
+		goto cleanup;
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == opt.name) {
+			recv_val = *(__u32 *)CMSG_DATA(cmsg);
+			printf("Received value: %u\n", recv_val);
+
+			if (recv_val != opt.val) {
+				fprintf(stderr, "Error: expected value: %u, got: %u\n",
+					opt.val, recv_val);
+				ret_value = -EINVAL;
+			}
+			goto cleanup;
+		}
+	}
+
+	fprintf(stderr, "Error: No matching cmsg received\n");
+	ret_value = -ENOMSG;
+
+cleanup:
+	if (recv_fd >= 0)
+		close(recv_fd);
+
+	return ret_value;
+}
diff --git a/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh
index 02d617040793..a5e959a080bb 100755
--- a/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_end_dt46_l3vpn_test.sh
@@ -285,11 +285,6 @@ setup_hs()
 	ip netns exec ${hsname} sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec ${hsname} sysctl -wq net.ipv6.conf.default.accept_dad=0
 
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec ${rtname} sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec ${rtname} sysctl -wq net.ipv4.conf.default.rp_filter=0
-
 	ip -netns ${hsname} link add veth0 type veth peer name ${rtveth}
 	ip -netns ${hsname} link set ${rtveth} netns ${rtname}
 	ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hid}/64 dev veth0 nodad
diff --git a/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh
index 79fb81e63c59..a649dba3cb77 100755
--- a/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh
@@ -250,11 +250,6 @@ setup_hs()
 	eval local rtname=\${rt_${rid}}
 	local rtveth=veth-t${tid}
 
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec ${rtname} sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec ${rtname} sysctl -wq net.ipv4.conf.default.rp_filter=0
-
 	ip -netns ${hsname} link add veth0 type veth peer name ${rtveth}
 	ip -netns ${hsname} link set ${rtveth} netns ${rtname}
 	ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hid}/24 dev veth0
diff --git a/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh
new file mode 100755
index 000000000000..e23210aa547f
--- /dev/null
+++ b/tools/testing/selftests/net/srv6_end_dx4_netfilter_test.sh
@@ -0,0 +1,335 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# author: Jianguo Wu <wujianguo@chinatelecom.cn>
+#
+# Mostly copied from tools/testing/selftests/net/srv6_end_dt4_l3vpn_test.sh.
+#
+# This script is designed for testing the support of netfilter hooks for
+# SRv6 End.DX4 behavior.
+#
+# Hereafter a network diagram is shown, where one tenants (named 100) offer
+# IPv4 L3 VPN services allowing hosts to communicate with each other across
+# an IPv6 network.
+#
+# Routers rt-1 and rt-2 implement IPv4 L3 VPN services leveraging the SRv6
+# architecture. The key components for such VPNs are: a) SRv6 Encap behavior,
+# b) SRv6 End.DX4 behavior.
+#
+# To explain how an IPv4 L3 VPN based on SRv6 works, let us briefly consider an
+# example where, within the same domain of tenant 100, the host hs-1 pings
+# the host hs-2.
+#
+# First of all, L2 reachability of the host hs-2 is taken into account by
+# the router rt-1 which acts as an arp proxy.
+#
+# When the host hs-1 sends an IPv4 packet destined to hs-2, the router rt-1
+# receives the packet on the internal veth-t100 interface, rt-1 contains the
+# SRv6 Encap route for encapsulating the IPv4 packet in a IPv6 plus the Segment
+# Routing Header (SRH) packet. This packet is sent through the (IPv6) core
+# network up to the router rt-2 that receives it on veth0 interface.
+#
+# The rt-2 router uses the 'localsid' routing table to process incoming
+# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these
+# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and
+# routs the packet to the specified nexthop. Afterwards, the packet is sent to
+# the host hs-2 through the veth-t100 interface.
+#
+# The ping response follows the same processing but this time the role of rt-1
+# and rt-2 are swapped.
+#
+# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a
+# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING
+# hooks.
+#
+#
+# +-------------------+                                   +-------------------+
+# |                   |                                   |                   |
+# |    hs-1 netns     |                                   |     hs-2 netns    |
+# |                   |                                   |                   |
+# |  +-------------+  |                                   |  +-------------+  |
+# |  |    veth0    |  |                                   |  |    veth0    |  |
+# |  | 10.0.0.1/24 |  |                                   |  | 10.0.0.2/24 |  |
+# |  +-------------+  |                                   |  +-------------+  |
+# |        .          |                                   |         .         |
+# +-------------------+                                   +-------------------+
+#          .                                                        .
+#          .                                                        .
+#          .                                                        .
+# +-----------------------------------+   +-----------------------------------+
+# |        .                          |   |                         .         |
+# | +---------------+                 |   |                 +---------------- |
+# | |   veth-t100   |                 |   |                 |   veth-t100   | |
+# | | 10.0.0.11/24  |    +----------+ |   | +----------+    | 10.0.0.22/24  | |
+# | +-------+-------+   |   route   | |   | |   route  |    +-------+-------- |
+# |                     |   table   | |   | |   table  |                      |
+# |                      +----------+ |   | +----------+                      |
+# |                  +--------------+ |   | +--------------+                  |
+# |                 |      veth0    | |   | |   veth0       |                 |
+# |                 | 2001:11::1/64 |.|...|.| 2001:11::2/64 |                 |
+# |                  +--------------+ |   | +--------------+                  |
+# |                                   |   |                                   |
+# |                        rt-1 netns |   | rt-2 netns                        |
+# |                                   |   |                                   |
+# +-----------------------------------+   +-----------------------------------+
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# | Network configuration |
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# rt-1: localsid table
+# +----------------------------------------------------------------+
+# |SID              |Action                                        |
+# +----------------------------------------------------------------+
+# |fc00:21:100::6004|apply SRv6 End.DX4 nh4 10.0.0.1 dev veth-t100 |
+# +----------------------------------------------------------------+
+#
+# rt-1: route table
+# +---------------------------------------------------+
+# |host       |Action                                 |
+# +---------------------------------------------------+
+# |10.0.0.2   |apply seg6 encap segs fc00:12:100::6004|
+# +---------------------------------------------------+
+# |10.0.0.0/24|forward to dev veth_t100               |
+# +---------------------------------------------------+
+#
+#
+# rt-2: localsid table
+# +---------------------------------------------------------------+
+# |SID              |Action                                       |
+# +---------------------------------------------------------------+
+# |fc00:12:100::6004|apply SRv6 End.DX4 nh4 10.0.0.2 dev veth-t100|
+# +---------------------------------------------------------------+
+#
+# rt-2: route table
+# +---------------------------------------------------+
+# |host       |Action                                 |
+# +---------------------------------------------------+
+# |10.0.0.1   |apply seg6 encap segs fc00:21:100::6004|
+# +---------------------------------------------------+
+# |10.0.0.0/24|forward to dev veth_t100               |
+# +---------------------------------------------------+
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+readonly IPv6_RT_NETWORK=2001:11
+readonly IPv4_HS_NETWORK=10.0.0
+readonly SID_LOCATOR=fc00
+
+PING_TIMEOUT_SEC=4
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-60s  [ OK ]\n" "${msg}"
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-60s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+print_log_test_results()
+{
+	if [ "$TESTS" != "none" ]; then
+		printf "\nTests passed: %3d\n" ${nsuccess}
+		printf "Tests failed: %3d\n"   ${nfail}
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "################################################################################"
+	echo "TEST SECTION: $*"
+	echo "################################################################################"
+}
+
+cleanup()
+{
+	ip link del veth-rt-1 2>/dev/null || true
+	ip link del veth-rt-2 2>/dev/null || true
+
+	# destroy routers rt-* and hosts hs-*
+	for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do
+		ip netns del ${ns} || true
+	done
+}
+
+# Setup the basic networking for the routers
+setup_rt_networking()
+{
+	local rt=$1
+	local nsname=rt-${rt}
+
+	ip netns add ${nsname}
+
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0
+
+	ip link set veth-rt-${rt} netns ${nsname}
+	ip -netns ${nsname} link set veth-rt-${rt} name veth0
+
+	ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad
+	ip -netns ${nsname} link set veth0 up
+	ip -netns ${nsname} link set lo up
+
+	ip netns exec ${nsname} sysctl -wq net.ipv4.ip_forward=1
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1
+}
+
+setup_rt_netfilter()
+{
+	local rt=$1
+	local nsname=rt-${rt}
+
+	ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1
+	ip netns exec ${nsname} iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP
+}
+
+setup_hs()
+{
+	local hs=$1
+	local rt=$2
+	local tid=$3
+	local hsname=hs-${hs}
+	local rtname=rt-${rt}
+	local rtveth=veth-t${tid}
+
+	# set the networking for the host
+	ip netns add ${hsname}
+
+	ip -netns ${hsname} link add veth0 type veth peer name ${rtveth}
+	ip -netns ${hsname} link set ${rtveth} netns ${rtname}
+	ip -netns ${hsname} addr add ${IPv4_HS_NETWORK}.${hs}/24 dev veth0
+	ip -netns ${hsname} link set veth0 up
+	ip -netns ${hsname} link set lo up
+
+	ip -netns ${rtname} addr add ${IPv4_HS_NETWORK}.${rt}${hs}/24 dev ${rtveth}
+	ip -netns ${rtname} link set ${rtveth} up
+
+	ip netns exec ${rtname} sysctl -wq net.ipv4.conf.${rtveth}.proxy_arp=1
+}
+
+setup_vpn_config()
+{
+	local hssrc=$1
+	local rtsrc=$2
+	local hsdst=$3
+	local rtdst=$4
+	local tid=$5
+
+	local hssrc_name=hs-t${tid}-${hssrc}
+	local hsdst_name=hs-t${tid}-${hsdst}
+	local rtsrc_name=rt-${rtsrc}
+	local rtdst_name=rt-${rtdst}
+	local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004
+
+	# set the encap route for encapsulating packets which arrive from the
+	# host hssrc and destined to the access router rtsrc.
+	ip -netns ${rtsrc_name} -4 route add ${IPv4_HS_NETWORK}.${hsdst}/32 \
+		encap seg6 mode encap segs ${vpn_sid} dev veth0
+	ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \
+		via 2001:11::${rtdst} dev veth0
+
+	# set the decap route for decapsulating packets which arrive from
+	# the rtdst router and destined to the hsdst host.
+	ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \
+		encap seg6local action End.DX4 nh4 ${IPv4_HS_NETWORK}.${hsdst} dev veth-t${tid}
+}
+
+setup()
+{
+	ip link add veth-rt-1 type veth peer name veth-rt-2
+	# setup the networking for router rt-1 and router rt-2
+	setup_rt_networking 1
+	setup_rt_networking 2
+
+	# setup two hosts for the tenant 100.
+	#  - host hs-1 is directly connected to the router rt-1;
+	#  - host hs-2 is directly connected to the router rt-2.
+	setup_hs 1 1 100
+	setup_hs 2 2 100
+
+	# setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2.
+	setup_vpn_config 1 1 2 2 100  #args: src_host src_router dst_host dst_router tenant
+	setup_vpn_config 2 2 1 1 100
+}
+
+check_hs_connectivity()
+{
+	local hssrc=$1
+	local hsdst=$2
+	local tid=$3
+
+	ip netns exec hs-${hssrc} ping -c 1 -W ${PING_TIMEOUT_SEC} \
+		${IPv4_HS_NETWORK}.${hsdst} >/dev/null 2>&1
+}
+
+check_and_log_hs_connectivity()
+{
+	local hssrc=$1
+	local hsdst=$2
+	local tid=$3
+
+	check_hs_connectivity ${hssrc} ${hsdst} ${tid}
+	log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})"
+}
+
+host_tests()
+{
+	log_section "SRv6 VPN connectivity test among hosts in the same tenant"
+
+	check_and_log_hs_connectivity 1 2 100
+	check_and_log_hs_connectivity 2 1 100
+}
+
+router_netfilter_tests()
+{
+	log_section "SRv6 VPN connectivity test with netfilter enabled in routers"
+	setup_rt_netfilter 1
+	setup_rt_netfilter 2
+
+	check_and_log_hs_connectivity 1 2 100
+	check_and_log_hs_connectivity 2 1 100
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+cleanup &>/dev/null
+
+setup
+
+host_tests
+router_netfilter_tests
+
+print_log_test_results
+
+cleanup &>/dev/null
+
+exit ${ret}
diff --git a/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh
new file mode 100755
index 000000000000..9e69a2ed5bc3
--- /dev/null
+++ b/tools/testing/selftests/net/srv6_end_dx6_netfilter_test.sh
@@ -0,0 +1,340 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# author: Jianguo Wu <wujianguo@chinatelecom.cn>
+#
+# Mostly copied from tools/testing/selftests/net/srv6_end_dt6_l3vpn_test.sh.
+#
+# This script is designed for testing the support of netfilter hooks for
+# SRv6 End.DX4 behavior.
+#
+# Hereafter a network diagram is shown, where one tenants (named 100) offer
+# IPv6 L3 VPN services allowing hosts to communicate with each other across
+# an IPv6 network.
+#
+# Routers rt-1 and rt-2 implement IPv6 L3 VPN services leveraging the SRv6
+# architecture. The key components for such VPNs are: a) SRv6 Encap behavior,
+# b) SRv6 End.DX4 behavior.
+#
+# To explain how an IPv6 L3 VPN based on SRv6 works, let us briefly consider an
+# example where, within the same domain of tenant 100, the host hs-1 pings
+# the host hs-2.
+#
+# First of all, L2 reachability of the host hs-2 is taken into account by
+# the router rt-1 which acts as an arp proxy.
+#
+# When the host hs-1 sends an IPv6 packet destined to hs-2, the router rt-1
+# receives the packet on the internal veth-t100 interface, rt-1 contains the
+# SRv6 Encap route for encapsulating the IPv6 packet in a IPv6 plus the Segment
+# Routing Header (SRH) packet. This packet is sent through the (IPv6) core
+# network up to the router rt-2 that receives it on veth0 interface.
+#
+# The rt-2 router uses the 'localsid' routing table to process incoming
+# IPv6+SRH packets which belong to the VPN of the tenant 100. For each of these
+# packets, the SRv6 End.DX4 behavior removes the outer IPv6+SRH headers and
+# routs the packet to the specified nexthop. Afterwards, the packet is sent to
+# the host hs-2 through the veth-t100 interface.
+#
+# The ping response follows the same processing but this time the role of rt-1
+# and rt-2 are swapped.
+#
+# And when net.netfilter.nf_hooks_lwtunnel is set to 1 in rt-1 or rt-2, and a
+# rpfilter iptables rule is added, SRv6 packets will go through netfilter PREROUTING
+# hooks.
+#
+#
+# +-------------------+                                   +-------------------+
+# |                   |                                   |                   |
+# |    hs-1 netns     |                                   |     hs-2 netns    |
+# |                   |                                   |                   |
+# |  +-------------+  |                                   |  +-------------+  |
+# |  |    veth0    |  |                                   |  |    veth0    |  |
+# |  | cafe::1/64  |  |                                   |  | cafe::2/64  |  |
+# |  +-------------+  |                                   |  +-------------+  |
+# |        .          |                                   |         .         |
+# +-------------------+                                   +-------------------+
+#          .                                                        .
+#          .                                                        .
+#          .                                                        .
+# +-----------------------------------+   +-----------------------------------+
+# |        .                          |   |                         .         |
+# | +---------------+                 |   |                 +---------------- |
+# | |   veth-t100   |                 |   |                 |   veth-t100   | |
+# | | cafe::11/64   |    +----------+ |   | +----------+    | cafe::22/64   | |
+# | +-------+-------+   |   route   | |   | |   route  |    +-------+-------- |
+# |                     |   table   | |   | |   table  |                      |
+# |                      +----------+ |   | +----------+                      |
+# |                  +--------------+ |   | +--------------+                  |
+# |                 |      veth0    | |   | |   veth0       |                 |
+# |                 | 2001:11::1/64 |.|...|.| 2001:11::2/64 |                 |
+# |                  +--------------+ |   | +--------------+                  |
+# |                                   |   |                                   |
+# |                        rt-1 netns |   | rt-2 netns                        |
+# |                                   |   |                                   |
+# +-----------------------------------+   +-----------------------------------+
+#
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# | Network configuration |
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# rt-1: localsid table
+# +----------------------------------------------------------------+
+# |SID              |Action                                        |
+# +----------------------------------------------------------------+
+# |fc00:21:100::6004|apply SRv6 End.DX6 nh6 cafe::1 dev veth-t100  |
+# +----------------------------------------------------------------+
+#
+# rt-1: route table
+# +---------------------------------------------------+
+# |host       |Action                                 |
+# +---------------------------------------------------+
+# |cafe::2    |apply seg6 encap segs fc00:12:100::6004|
+# +---------------------------------------------------+
+# |cafe::/64  |forward to dev veth_t100               |
+# +---------------------------------------------------+
+#
+#
+# rt-2: localsid table
+# +---------------------------------------------------------------+
+# |SID              |Action                                       |
+# +---------------------------------------------------------------+
+# |fc00:12:100::6004|apply SRv6 End.DX6 nh6 cafe::2 dev veth-t100 |
+# +---------------------------------------------------------------+
+#
+# rt-2: route table
+# +---------------------------------------------------+
+# |host       |Action                                 |
+# +---------------------------------------------------+
+# |cafe::1    |apply seg6 encap segs fc00:21:100::6004|
+# +---------------------------------------------------+
+# |cafe::/64  |forward to dev veth_t100               |
+# +---------------------------------------------------+
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+readonly IPv6_RT_NETWORK=2001:11
+readonly IPv6_HS_NETWORK=cafe
+readonly SID_LOCATOR=fc00
+
+PING_TIMEOUT_SEC=4
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-60s  [ OK ]\n" "${msg}"
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-60s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+print_log_test_results()
+{
+	if [ "$TESTS" != "none" ]; then
+		printf "\nTests passed: %3d\n" ${nsuccess}
+		printf "Tests failed: %3d\n"   ${nfail}
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "################################################################################"
+	echo "TEST SECTION: $*"
+	echo "################################################################################"
+}
+
+cleanup()
+{
+	ip link del veth-rt-1 2>/dev/null || true
+	ip link del veth-rt-2 2>/dev/null || true
+
+	# destroy routers rt-* and hosts hs-*
+	for ns in $(ip netns show | grep -E 'rt-*|hs-*'); do
+		ip netns del ${ns} || true
+	done
+}
+
+# Setup the basic networking for the routers
+setup_rt_networking()
+{
+	local rt=$1
+	local nsname=rt-${rt}
+
+	ip netns add ${nsname}
+
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.accept_dad=0
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.default.accept_dad=0
+
+	ip link set veth-rt-${rt} netns ${nsname}
+	ip -netns ${nsname} link set veth-rt-${rt} name veth0
+
+	ip -netns ${nsname} addr add ${IPv6_RT_NETWORK}::${rt}/64 dev veth0 nodad
+	ip -netns ${nsname} link set veth0 up
+	ip -netns ${nsname} link set lo up
+
+	ip netns exec ${nsname} sysctl -wq net.ipv6.conf.all.forwarding=1
+}
+
+setup_rt_netfilter()
+{
+	local rt=$1
+	local nsname=rt-${rt}
+
+	ip netns exec ${nsname} sysctl -wq net.netfilter.nf_hooks_lwtunnel=1
+	ip netns exec ${nsname} ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP
+}
+
+setup_hs()
+{
+	local hs=$1
+	local rt=$2
+	local tid=$3
+	local hsname=hs-${hs}
+	local rtname=rt-${rt}
+	local rtveth=veth-t${tid}
+
+	# set the networking for the host
+	ip netns add ${hsname}
+
+	ip -netns ${hsname} link add veth0 type veth peer name ${rtveth}
+	ip -netns ${hsname} link set ${rtveth} netns ${rtname}
+	ip -netns ${hsname} addr add ${IPv6_HS_NETWORK}::${hs}/64 dev veth0 nodad
+	ip -netns ${hsname} link set veth0 up
+	ip -netns ${hsname} link set lo up
+
+	ip -netns ${rtname} addr add ${IPv6_HS_NETWORK}::${rt}${hs}/64 dev ${rtveth}
+	ip -netns ${rtname} link set ${rtveth} up
+
+	ip netns exec ${rtname} sysctl -wq net.ipv6.conf.all.accept_dad=0
+	ip netns exec ${rtname} sysctl -wq net.ipv6.conf.default.accept_dad=0
+
+	ip netns exec ${rtname} sysctl -wq net.ipv6.conf.${rtveth}.proxy_ndp=1
+}
+
+setup_vpn_config()
+{
+	local hssrc=$1
+	local rtsrc=$2
+	local hsdst=$3
+	local rtdst=$4
+	local tid=$5
+
+	local hssrc_name=hs-t${tid}-${hssrc}
+	local hsdst_name=hs-t${tid}-${hsdst}
+	local rtsrc_name=rt-${rtsrc}
+	local rtdst_name=rt-${rtdst}
+	local rtveth=veth-t${tid}
+	local vpn_sid=${SID_LOCATOR}:${hssrc}${hsdst}:${tid}::6004
+
+	ip -netns ${rtsrc_name} -6 neigh add proxy ${IPv6_HS_NETWORK}::${hsdst} dev ${rtveth}
+
+	# set the encap route for encapsulating packets which arrive from the
+	# host hssrc and destined to the access router rtsrc.
+	ip -netns ${rtsrc_name} -6 route add ${IPv6_HS_NETWORK}::${hsdst}/128 \
+		encap seg6 mode encap segs ${vpn_sid} dev veth0
+	ip -netns ${rtsrc_name} -6 route add ${vpn_sid}/128 \
+		via 2001:11::${rtdst} dev veth0
+
+	# set the decap route for decapsulating packets which arrive from
+	# the rtdst router and destined to the hsdst host.
+	ip -netns ${rtdst_name} -6 route add ${vpn_sid}/128 \
+		encap seg6local action End.DX6 nh6 ${IPv6_HS_NETWORK}::${hsdst} dev veth-t${tid}
+}
+
+setup()
+{
+	ip link add veth-rt-1 type veth peer name veth-rt-2
+	# setup the networking for router rt-1 and router rt-2
+	setup_rt_networking 1
+	setup_rt_networking 2
+
+	# setup two hosts for the tenant 100.
+	#  - host hs-1 is directly connected to the router rt-1;
+	#  - host hs-2 is directly connected to the router rt-2.
+	setup_hs 1 1 100
+	setup_hs 2 2 100
+
+	# setup the IPv4 L3 VPN which connects the host hs-1 and host hs-2.
+	setup_vpn_config 1 1 2 2 100  #args: src_host src_router dst_host dst_router tenant
+	setup_vpn_config 2 2 1 1 100
+}
+
+check_hs_connectivity()
+{
+	local hssrc=$1
+	local hsdst=$2
+	local tid=$3
+
+	ip netns exec hs-${hssrc} ping -6 -c 1 -W ${PING_TIMEOUT_SEC} \
+		${IPv6_HS_NETWORK}::${hsdst} >/dev/null 2>&1
+}
+
+check_and_log_hs_connectivity()
+{
+	local hssrc=$1
+	local hsdst=$2
+	local tid=$3
+
+	check_hs_connectivity ${hssrc} ${hsdst} ${tid}
+	log_test $? 0 "Hosts connectivity: hs-${hssrc} -> hs-${hsdst} (tenant ${tid})"
+}
+
+host_tests()
+{
+	log_section "SRv6 VPN connectivity test among hosts in the same tenant"
+
+	check_and_log_hs_connectivity 1 2 100
+	check_and_log_hs_connectivity 2 1 100
+}
+
+router_netfilter_tests()
+{
+	log_section "SRv6 VPN connectivity test with netfilter enabled in routers"
+	setup_rt_netfilter 1
+	setup_rt_netfilter 2
+
+	check_and_log_hs_connectivity 1 2 100
+	check_and_log_hs_connectivity 2 1 100
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+cleanup &>/dev/null
+
+setup
+
+host_tests
+router_netfilter_tests
+
+print_log_test_results
+
+cleanup &>/dev/null
+
+exit ${ret}
diff --git a/tools/testing/selftests/net/srv6_end_flavors_test.sh b/tools/testing/selftests/net/srv6_end_flavors_test.sh
index 50563443a4ad..318487eda671 100755
--- a/tools/testing/selftests/net/srv6_end_flavors_test.sh
+++ b/tools/testing/selftests/net/srv6_end_flavors_test.sh
@@ -399,7 +399,7 @@ __get_srv6_rtcfg_id()
 
 # Given the description of a router <id:op> as an input, the function returns
 # the <op> token which represents the operation (e.g. End behavior with or
-# withouth flavors) configured for the node.
+# without flavors) configured for the node.
 
 # Note that when the operation represents an End behavior with a list of
 # flavors, the output is the ordered version of that list.
@@ -480,7 +480,7 @@ setup_rt_local_sids()
 
 
 	# all SIDs start with a common locator. Routes and SRv6 Endpoint
-	# behavior instaces are grouped together in the 'localsid' table.
+	# behavior instances are grouped together in the 'localsid' table.
 	ip -netns "${nsname}" -6 rule \
 		add to "${LOCATOR_SERVICE}::/16" \
 		lookup "${LOCALSID_TABLE_ID}" prio 999
diff --git a/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh
index 87e414cc417c..ba730655a7bf 100755
--- a/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh
@@ -245,10 +245,8 @@
 # that adopted in the use cases already examined (of course, it is necessary to
 # consider the different SIDs/C-SIDs).
 
-# Kselftest framework requirement - SKIP code is 4.
-readonly ksft_skip=4
+source lib.sh
 
-readonly RDMSUFF="$(mktemp -u XXXXXXXX)"
 readonly DUMMY_DEVNAME="dum0"
 readonly VRF_TID=100
 readonly VRF_DEVNAME="vrf-${VRF_TID}"
@@ -376,32 +374,18 @@ test_command_or_ksft_skip()
 	fi
 }
 
-get_nodename()
-{
-	local name="$1"
-
-	echo "${name}-${RDMSUFF}"
-}
-
 get_rtname()
 {
 	local rtid="$1"
 
-	get_nodename "rt-${rtid}"
+	echo "rt_${rtid}"
 }
 
 get_hsname()
 {
 	local hsid="$1"
 
-	get_nodename "hs-${hsid}"
-}
-
-__create_namespace()
-{
-	local name="$1"
-
-	ip netns add "${name}"
+	echo "hs_${hsid}"
 }
 
 create_router()
@@ -410,8 +394,7 @@ create_router()
 	local nsname
 
 	nsname="$(get_rtname "${rtid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 create_host()
@@ -420,28 +403,12 @@ create_host()
 	local nsname
 
 	nsname="$(get_hsname "${hsid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 cleanup()
 {
-	local nsname
-	local i
-
-	# destroy routers
-	for i in ${ROUTERS}; do
-		nsname="$(get_rtname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
-	# destroy hosts
-	for i in ${HOSTS}; do
-		nsname="$(get_hsname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
+	cleanup_all_ns
 
 	# check whether the setup phase was completed successfully or not. In
 	# case of an error during the setup phase of the testing environment,
@@ -462,10 +429,10 @@ add_link_rt_pairs()
 	local nsname
 	local neigh_nsname
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
-		neigh_nsname="$(get_rtname "${neigh}")"
+		eval neigh_nsname=\${$(get_rtname "${neigh}")}
 
 		ip link add "veth-rt-${rt}-${neigh}" netns "${nsname}" \
 			type veth peer name "veth-rt-${neigh}-${rt}" \
@@ -497,7 +464,7 @@ setup_rt_networking()
 	local devname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -518,9 +485,6 @@ setup_rt_networking()
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.forwarding=1
-
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.default.rp_filter=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv4.ip_forward=1
 }
 
@@ -596,7 +560,7 @@ setup_rt_local_sids()
 	local lcnode_func_prefix
 	local lcblock_prefix
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -668,8 +632,8 @@ __setup_l3vpn()
 	local rtsrc_nsname
 	local rtdst_nsname
 
-	rtsrc_nsname="$(get_rtname "${src}")"
-	rtdst_nsname="$(get_rtname "${dst}")"
+	eval rtsrc_nsname=\${$(get_rtname "${src}")}
+	eval rtdst_nsname=\${$(get_rtname "${dst}")}
 
 	container="${LCBLOCK_ADDR}"
 
@@ -744,8 +708,8 @@ setup_hs()
 	local hsname
 	local rtname
 
-	hsname="$(get_hsname "${hs}")"
-	rtname="$(get_rtname "${rt}")"
+	eval hsname=\${$(get_hsname "${hs}")}
+	eval rtname=\${$(get_rtname "${rt}")}
 
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
@@ -791,11 +755,6 @@ setup_hs()
 	ip netns exec "${rtname}" \
 		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".proxy_arp=1
 
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec "${rtname}" \
-		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".rp_filter=0
-
 	ip netns exec "${rtname}" sh -c "echo 1 > /proc/sys/net/vrf/strict_mode"
 }
 
@@ -880,7 +839,7 @@ check_rt_connectivity()
 	local prefix
 	local rtsrc_nsname
 
-	rtsrc_nsname="$(get_rtname "${rtsrc}")"
+	eval rtsrc_nsname=\${$(get_rtname "${rtsrc}")}
 
 	prefix="$(get_network_prefix "${rtsrc}" "${rtdst}")"
 
@@ -903,7 +862,7 @@ check_hs_ipv6_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv6_HS_NETWORK}::${hsdst}" >/dev/null 2>&1
@@ -915,7 +874,7 @@ check_hs_ipv4_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv4_HS_NETWORK}.${hsdst}" >/dev/null 2>&1
@@ -1025,7 +984,7 @@ rt_x_nextcsid_end_behavior_test()
 	local nsname
 	local ret
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	__nextcsid_end_behavior_test "${nsname}" "add" "${blen}" "${flen}"
 	ret="$?"
diff --git a/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh
index c79cb8ede17f..4b86040c58c6 100755
--- a/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh
@@ -287,10 +287,8 @@
 # packet using the SRv6 End.DT46 behavior (associated with the SID fcff:1::d46)
 # and sends it to the host hs-1.
 
-# Kselftest framework requirement - SKIP code is 4.
-readonly ksft_skip=4
+source lib.sh
 
-readonly RDMSUFF="$(mktemp -u XXXXXXXX)"
 readonly DUMMY_DEVNAME="dum0"
 readonly VRF_TID=100
 readonly VRF_DEVNAME="vrf-${VRF_TID}"
@@ -418,32 +416,18 @@ test_command_or_ksft_skip()
 	fi
 }
 
-get_nodename()
-{
-	local name="$1"
-
-	echo "${name}-${RDMSUFF}"
-}
-
 get_rtname()
 {
 	local rtid="$1"
 
-	get_nodename "rt-${rtid}"
+	echo "rt_${rtid}"
 }
 
 get_hsname()
 {
 	local hsid="$1"
 
-	get_nodename "hs-${hsid}"
-}
-
-__create_namespace()
-{
-	local name="$1"
-
-	ip netns add "${name}"
+	echo "hs_${hsid}"
 }
 
 create_router()
@@ -452,15 +436,12 @@ create_router()
 	local nsname
 
 	nsname="$(get_rtname "${rtid}")"
+	setup_ns "${nsname}"
 
-	__create_namespace "${nsname}"
-
+	eval nsname=\${$(get_rtname "${rtid}")}
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.forwarding=1
-
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.default.rp_filter=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv4.ip_forward=1
 }
 
@@ -470,29 +451,12 @@ create_host()
 	local nsname
 
 	nsname="$(get_hsname "${hsid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 cleanup()
 {
-	local nsname
-	local i
-
-	# destroy routers
-	for i in ${ROUTERS}; do
-		nsname="$(get_rtname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
-	# destroy hosts
-	for i in ${HOSTS}; do
-		nsname="$(get_hsname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
+	cleanup_all_ns
 	# check whether the setup phase was completed successfully or not. In
 	# case of an error during the setup phase of the testing environment,
 	# the selftest is considered as "skipped".
@@ -512,10 +476,10 @@ add_link_rt_pairs()
 	local nsname
 	local neigh_nsname
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
-		neigh_nsname="$(get_rtname "${neigh}")"
+		eval neigh_nsname=\${$(get_rtname "${neigh}")}
 
 		ip link add "veth-rt-${rt}-${neigh}" netns "${nsname}" \
 			type veth peer name "veth-rt-${neigh}-${rt}" \
@@ -547,7 +511,7 @@ setup_rt_networking()
 	local devname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -631,7 +595,7 @@ set_end_x_nextcsid()
 	local rt="$1"
 	local adj="$2"
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 	net_prefix="$(get_network_prefix "${rt}" "${adj}")"
 	lcnode_func_prefix="$(build_lcnode_func_prefix "${rt}")"
 
@@ -650,7 +614,7 @@ set_underlay_sids_reachability()
 	local rt="$1"
 	local rt_neighs="$2"
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -685,7 +649,7 @@ setup_rt_local_sids()
 	local lcnode_func_prefix
 	local lcblock_prefix
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
         set_underlay_sids_reachability "${rt}" "${rt_neighs}"
 
@@ -728,8 +692,8 @@ __setup_l3vpn()
 	local rtsrc_nsname
 	local rtdst_nsname
 
-	rtsrc_nsname="$(get_rtname "${src}")"
-	rtdst_nsname="$(get_rtname "${dst}")"
+	eval rtsrc_nsname=\${$(get_rtname "${src}")}
+	eval rtdst_nsname=\${$(get_rtname "${dst}")}
 
 	container="${LCBLOCK_ADDR}"
 
@@ -804,8 +768,8 @@ setup_hs()
 	local hsname
 	local rtname
 
-	hsname="$(get_hsname "${hs}")"
-	rtname="$(get_rtname "${rt}")"
+	eval hsname=\${$(get_hsname "${hs}")}
+	eval rtname=\${$(get_rtname "${rt}")}
 
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
@@ -851,11 +815,6 @@ setup_hs()
 	ip netns exec "${rtname}" \
 		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".proxy_arp=1
 
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec "${rtname}" \
-		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".rp_filter=0
-
 	ip netns exec "${rtname}" sh -c "echo 1 > /proc/sys/net/vrf/strict_mode"
 }
 
@@ -947,7 +906,7 @@ check_rt_connectivity()
 	local prefix
 	local rtsrc_nsname
 
-	rtsrc_nsname="$(get_rtname "${rtsrc}")"
+	eval rtsrc_nsname=\${$(get_rtname "${rtsrc}")}
 
 	prefix="$(get_network_prefix "${rtsrc}" "${rtdst}")"
 
@@ -970,7 +929,7 @@ check_hs_ipv6_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv6_HS_NETWORK}::${hsdst}" >/dev/null 2>&1
@@ -982,7 +941,7 @@ check_hs_ipv4_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv4_HS_NETWORK}.${hsdst}" >/dev/null 2>&1
@@ -1093,7 +1052,7 @@ rt_x_nextcsid_end_x_behavior_test()
 	local nsname
 	local ret
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	__nextcsid_end_x_behavior_test "${nsname}" "add" "${blen}" "${flen}"
 	ret="$?"
diff --git a/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh b/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh
index 28a775654b92..3efce1718c5f 100755
--- a/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh
@@ -166,10 +166,8 @@
 #  hs-4->hs-3 |IPv6 DA=fcff:1::e|SRH SIDs=fcff:3::d46|IPv6|...| (i.d)
 #
 
-# Kselftest framework requirement - SKIP code is 4.
-readonly ksft_skip=4
+source lib.sh
 
-readonly RDMSUFF="$(mktemp -u XXXXXXXX)"
 readonly VRF_TID=100
 readonly VRF_DEVNAME="vrf-${VRF_TID}"
 readonly RT2HS_DEVNAME="veth-t${VRF_TID}"
@@ -248,32 +246,18 @@ test_command_or_ksft_skip()
 	fi
 }
 
-get_nodename()
-{
-	local name="$1"
-
-	echo "${name}-${RDMSUFF}"
-}
-
 get_rtname()
 {
 	local rtid="$1"
 
-	get_nodename "rt-${rtid}"
+	echo "rt_${rtid}"
 }
 
 get_hsname()
 {
 	local hsid="$1"
 
-	get_nodename "hs-${hsid}"
-}
-
-__create_namespace()
-{
-	local name="$1"
-
-	ip netns add "${name}"
+	echo "hs_${hsid}"
 }
 
 create_router()
@@ -282,8 +266,7 @@ create_router()
 	local nsname
 
 	nsname="$(get_rtname "${rtid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 create_host()
@@ -292,29 +275,12 @@ create_host()
 	local nsname
 
 	nsname="$(get_hsname "${hsid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 cleanup()
 {
-	local nsname
-	local i
-
-	# destroy routers
-	for i in ${ROUTERS}; do
-		nsname="$(get_rtname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
-	# destroy hosts
-	for i in ${HOSTS}; do
-		nsname="$(get_hsname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
+	cleanup_all_ns
 	# check whether the setup phase was completed successfully or not. In
 	# case of an error during the setup phase of the testing environment,
 	# the selftest is considered as "skipped".
@@ -334,10 +300,10 @@ add_link_rt_pairs()
 	local nsname
 	local neigh_nsname
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
-		neigh_nsname="$(get_rtname "${neigh}")"
+		eval neigh_nsname=\${$(get_rtname "${neigh}")}
 
 		ip link add "veth-rt-${rt}-${neigh}" netns "${nsname}" \
 			type veth peer name "veth-rt-${neigh}-${rt}" \
@@ -369,7 +335,7 @@ setup_rt_networking()
 	local devname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -387,9 +353,6 @@ setup_rt_networking()
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.forwarding=1
-
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.default.rp_filter=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv4.ip_forward=1
 }
 
@@ -403,7 +366,7 @@ setup_rt_local_sids()
 	local nsname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -469,7 +432,7 @@ __setup_rt_policy()
 	local policy=''
 	local n
 
-	nsname="$(get_rtname "${encap_rt}")"
+	eval nsname=\${$(get_rtname "${encap_rt}")}
 
 	for n in ${end_rts}; do
 		policy="${policy}${VPN_LOCATOR_SERVICE}:${n}::${END_FUNC},"
@@ -516,8 +479,8 @@ setup_hs()
 	local hsname
 	local rtname
 
-	hsname="$(get_hsname "${hs}")"
-	rtname="$(get_rtname "${rt}")"
+	eval hsname=\${$(get_hsname "${hs}")}
+	eval rtname=\${$(get_rtname "${rt}")}
 
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
@@ -555,11 +518,6 @@ setup_hs()
 	ip netns exec "${rtname}" \
 		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".proxy_arp=1
 
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec "${rtname}" \
-		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".rp_filter=0
-
 	ip netns exec "${rtname}" sh -c "echo 1 > /proc/sys/net/vrf/strict_mode"
 }
 
@@ -656,7 +614,7 @@ check_rt_connectivity()
 	local prefix
 	local rtsrc_nsname
 
-	rtsrc_nsname="$(get_rtname "${rtsrc}")"
+	eval rtsrc_nsname=\${$(get_rtname "${rtsrc}")}
 
 	prefix="$(get_network_prefix "${rtsrc}" "${rtdst}")"
 
@@ -679,7 +637,7 @@ check_hs_ipv6_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv6_HS_NETWORK}::${hsdst}" >/dev/null 2>&1
@@ -691,7 +649,7 @@ check_hs_ipv4_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv4_HS_NETWORK}.${hsdst}" >/dev/null 2>&1
diff --git a/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh b/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh
index cb4177d41b21..cabc70538ffe 100755
--- a/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh
+++ b/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh
@@ -116,10 +116,8 @@
 #  hs-2->hs-1 |IPv6 DA=fcff:4::e|SRH SIDs=fcff:3::e,fcff:1::d2|eth|...| (i.b)
 #
 
-# Kselftest framework requirement - SKIP code is 4.
-readonly ksft_skip=4
+source lib.sh
 
-readonly RDMSUFF="$(mktemp -u XXXXXXXX)"
 readonly DUMMY_DEVNAME="dum0"
 readonly RT2HS_DEVNAME="veth-hs"
 readonly HS_VETH_NAME="veth0"
@@ -199,32 +197,18 @@ test_command_or_ksft_skip()
 	fi
 }
 
-get_nodename()
-{
-	local name="$1"
-
-	echo "${name}-${RDMSUFF}"
-}
-
 get_rtname()
 {
 	local rtid="$1"
 
-	get_nodename "rt-${rtid}"
+	echo "rt_${rtid}"
 }
 
 get_hsname()
 {
 	local hsid="$1"
 
-	get_nodename "hs-${hsid}"
-}
-
-__create_namespace()
-{
-	local name="$1"
-
-	ip netns add "${name}"
+	echo "hs_${hsid}"
 }
 
 create_router()
@@ -233,8 +217,7 @@ create_router()
 	local nsname
 
 	nsname="$(get_rtname "${rtid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 create_host()
@@ -243,28 +226,12 @@ create_host()
 	local nsname
 
 	nsname="$(get_hsname "${hsid}")"
-
-	__create_namespace "${nsname}"
+	setup_ns "${nsname}"
 }
 
 cleanup()
 {
-	local nsname
-	local i
-
-	# destroy routers
-	for i in ${ROUTERS}; do
-		nsname="$(get_rtname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
-
-	# destroy hosts
-	for i in ${HOSTS}; do
-		nsname="$(get_hsname "${i}")"
-
-		ip netns del "${nsname}" &>/dev/null || true
-	done
+	cleanup_all_ns
 
 	# check whether the setup phase was completed successfully or not. In
 	# case of an error during the setup phase of the testing environment,
@@ -285,10 +252,10 @@ add_link_rt_pairs()
 	local nsname
 	local neigh_nsname
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
-		neigh_nsname="$(get_rtname "${neigh}")"
+		eval neigh_nsname=\${$(get_rtname "${neigh}")}
 
 		ip link add "veth-rt-${rt}-${neigh}" netns "${nsname}" \
 			type veth peer name "veth-rt-${neigh}-${rt}" \
@@ -320,7 +287,7 @@ setup_rt_networking()
 	local devname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -341,9 +308,6 @@ setup_rt_networking()
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.forwarding=1
-
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.all.rp_filter=0
-	ip netns exec "${nsname}" sysctl -wq net.ipv4.conf.default.rp_filter=0
 	ip netns exec "${nsname}" sysctl -wq net.ipv4.ip_forward=1
 }
 
@@ -357,7 +321,7 @@ setup_rt_local_sids()
 	local nsname
 	local neigh
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	for neigh in ${rt_neighs}; do
 		devname="veth-rt-${rt}-${neigh}"
@@ -407,7 +371,7 @@ __setup_rt_policy()
 	local policy=''
 	local n
 
-	nsname="$(get_rtname "${encap_rt}")"
+	eval nsname=\${$(get_rtname "${encap_rt}")}
 
 	for n in ${end_rts}; do
 		policy="${policy}${VPN_LOCATOR_SERVICE}:${n}::${END_FUNC},"
@@ -446,7 +410,7 @@ setup_decap()
 	local rt="$1"
 	local nsname
 
-	nsname="$(get_rtname "${rt}")"
+	eval nsname=\${$(get_rtname "${rt}")}
 
 	# Local End.DX2 behavior
 	ip -netns "${nsname}" -6 route \
@@ -463,8 +427,8 @@ setup_hs()
 	local hsname
 	local rtname
 
-	hsname="$(get_hsname "${hs}")"
-	rtname="$(get_rtname "${rt}")"
+	eval hsname=\${$(get_hsname "${hs}")}
+	eval rtname=\${$(get_rtname "${rt}")}
 
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
 	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
@@ -486,11 +450,6 @@ setup_hs()
 		add "${IPv4_HS_NETWORK}.254/24" dev "${RT2HS_DEVNAME}"
 
 	ip -netns "${rtname}" link set "${RT2HS_DEVNAME}" up
-
-	# disable the rp_filter otherwise the kernel gets confused about how
-	# to route decap ipv4 packets.
-	ip netns exec "${rtname}" \
-		sysctl -wq net.ipv4.conf."${RT2HS_DEVNAME}".rp_filter=0
 }
 
 # set an auto-generated mac address
@@ -508,7 +467,7 @@ set_mac_address()
 	local ifname="$4"
 	local nsname
 
-	nsname=$(get_nodename "${nodename}")
+	eval nsname=\${${nodename}}
 
 	ip -netns "${nsname}" link set dev "${ifname}" down
 
@@ -532,7 +491,7 @@ set_host_l2peer()
 	local hssrc_name
 	local ipaddr
 
-	hssrc_name="$(get_hsname "${hssrc}")"
+	eval hssrc_name=\${$(get_hsname "${hssrc}")}
 
 	if [ "${proto}" -eq 6 ]; then
 		ipaddr="${ipprefix}::${hsdst}"
@@ -562,7 +521,7 @@ setup_l2vpn()
 	local rtdst="${hsdst}"
 
 	# set fixed mac for source node and the neigh MAC address
-	set_mac_address "hs-${hssrc}" "${hssrc}" "${hssrc}" "${HS_VETH_NAME}"
+	set_mac_address "hs_${hssrc}" "${hssrc}" "${hssrc}" "${HS_VETH_NAME}"
 	set_host_l2peer "${hssrc}" "${hsdst}" "${IPv6_HS_NETWORK}" 6
 	set_host_l2peer "${hssrc}" "${hsdst}" "${IPv4_HS_NETWORK}" 4
 
@@ -570,7 +529,7 @@ setup_l2vpn()
 	# to the mac address of the remote peer (L2 VPN destination host).
 	# Otherwise, traffic coming from the source host is dropped at the
 	# ingress router.
-	set_mac_address "rt-${rtsrc}" "${hsdst}" 254 "${RT2HS_DEVNAME}"
+	set_mac_address "rt_${rtsrc}" "${hsdst}" 254 "${RT2HS_DEVNAME}"
 
 	# set the SRv6 Policies at the ingress router
 	setup_rt_policy_ipv6 "${hsdst}" "${rtsrc}" "${end_rts}" "${rtdst}" \
@@ -647,7 +606,7 @@ check_rt_connectivity()
 	local prefix
 	local rtsrc_nsname
 
-	rtsrc_nsname="$(get_rtname "${rtsrc}")"
+	eval rtsrc_nsname=\${$(get_rtname "${rtsrc}")}
 
 	prefix="$(get_network_prefix "${rtsrc}" "${rtdst}")"
 
@@ -670,7 +629,7 @@ check_hs_ipv6_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv6_HS_NETWORK}::${hsdst}" >/dev/null 2>&1
@@ -682,7 +641,7 @@ check_hs_ipv4_connectivity()
 	local hsdst="$2"
 	local hssrc_nsname
 
-	hssrc_nsname="$(get_hsname "${hssrc}")"
+	eval hssrc_nsname=\${$(get_hsname "${hssrc}")}
 
 	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
 		"${IPv4_HS_NETWORK}.${hsdst}" >/dev/null 2>&1
diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile
index 522d991e310e..5b0205c70c39 100644
--- a/tools/testing/selftests/net/tcp_ao/Makefile
+++ b/tools/testing/selftests/net/tcp_ao/Makefile
@@ -26,12 +26,13 @@ LIB	:= $(LIBDIR)/libaotst.a
 LDLIBS	+= $(LIB) -pthread
 LIBDEPS	:= lib/aolib.h Makefile
 
-CFLAGS	:= -Wall -O2 -g -D_GNU_SOURCE -fno-strict-aliasing
+CFLAGS	+= -Wall -O2 -g -fno-strict-aliasing
 CFLAGS	+= $(KHDR_INCLUDES)
 CFLAGS	+= -iquote ./lib/ -I ../../../../include/
 
 # Library
-LIBSRC	:= kconfig.c netlink.c proc.c repair.c setup.c sock.c utils.c
+LIBSRC	:= ftrace.c ftrace-tcp.c kconfig.c netlink.c
+LIBSRC	+= proc.c repair.c setup.c sock.c utils.c
 LIBOBJ	:= $(LIBSRC:%.c=$(LIBDIR)/%.o)
 EXTRA_CLEAN += $(LIBOBJ) $(LIB)
 
diff --git a/tools/testing/selftests/net/tcp_ao/bench-lookups.c b/tools/testing/selftests/net/tcp_ao/bench-lookups.c
index a1e6e007c291..6736484996a3 100644
--- a/tools/testing/selftests/net/tcp_ao/bench-lookups.c
+++ b/tools/testing/selftests/net/tcp_ao/bench-lookups.c
@@ -355,6 +355,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(30, server_fn, client_fn);
+	test_init(31, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config
index d3277a9de987..3605e38711cb 100644
--- a/tools/testing/selftests/net/tcp_ao/config
+++ b/tools/testing/selftests/net/tcp_ao/config
@@ -7,4 +7,5 @@ CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_NET_VRF=y
 CONFIG_TCP_AO=y
 CONFIG_TCP_MD5SIG=y
+CONFIG_TRACEPOINTS=y
 CONFIG_VETH=m
diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c
index 185a2f6e5ff3..93b61e9a36f1 100644
--- a/tools/testing/selftests/net/tcp_ao/connect-deny.c
+++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c
@@ -4,6 +4,7 @@
 #include "aolib.h"
 
 #define fault(type)	(inj == FAULT_ ## type)
+static volatile int sk_pair;
 
 static inline int test_add_key_maclen(int sk, const char *key, uint8_t maclen,
 				      union tcp_addr in_addr, uint8_t prefix,
@@ -34,10 +35,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd,
 		       const char *cnt_name, test_cnt cnt_expected,
 		       fault_t inj)
 {
-	struct tcp_ao_counters ao_cnt1, ao_cnt2;
+	struct tcp_counters cnt1, cnt2;
 	uint64_t before_cnt = 0, after_cnt = 0; /* silence GCC */
+	test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected;
 	int lsk, err, sk = 0;
-	time_t timeout;
 
 	lsk = test_listen_socket(this_ip_addr, port, 1);
 
@@ -46,21 +47,24 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd,
 
 	if (cnt_name)
 		before_cnt = netstat_get_one(cnt_name, NULL);
-	if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt1))
-		test_error("test_get_tcp_ao_counters()");
+	if (pwd && test_get_tcp_counters(lsk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* preparations done */
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	err = test_wait_fd(lsk, timeout, 0);
+	err = test_skpair_wait_poll(lsk, 0, poll_cnt, &sk_pair);
 	if (err == -ETIMEDOUT) {
+		sk_pair = err;
 		if (!fault(TIMEOUT))
-			test_fail("timed out for accept()");
+			test_fail("%s: timed out for accept()", tst_name);
+	} else if (err == -EKEYREJECTED) {
+		if (!fault(KEYREJECT))
+			test_fail("%s: key was rejected", tst_name);
 	} else if (err < 0) {
-		test_error("test_wait_fd()");
+		test_error("test_skpair_wait_poll()");
 	} else {
 		if (fault(TIMEOUT))
-			test_fail("ready to accept");
+			test_fail("%s: ready to accept", tst_name);
 
 		sk = accept(lsk, NULL, NULL);
 		if (sk < 0) {
@@ -71,12 +75,14 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd,
 		}
 	}
 
-	if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt2))
-		test_error("test_get_tcp_ao_counters()");
+	synchronize_threads(); /* before counter checks */
+	if (pwd && test_get_tcp_counters(lsk, &cnt2))
+		test_error("test_get_tcp_counters()");
 
 	close(lsk);
+
 	if (pwd)
-		test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected);
+		test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected);
 
 	if (!cnt_name)
 		goto out;
@@ -84,10 +90,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd,
 	after_cnt = netstat_get_one(cnt_name, NULL);
 
 	if (after_cnt <= before_cnt) {
-		test_fail("%s: %s counter did not increase: %zu <= %zu",
+		test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64,
 				tst_name, cnt_name, after_cnt, before_cnt);
 	} else {
-		test_ok("%s: counter %s increased %zu => %zu",
+		test_ok("%s: counter %s increased %" PRIu64  " => %" PRIu64,
 			tst_name, cnt_name, before_cnt, after_cnt);
 	}
 
@@ -107,7 +113,7 @@ static void *server_fn(void *arg)
 
 	try_accept("Non-AO server + AO client", port++, NULL,
 		   this_ip_dest, -1, 100, 100, 0,
-		   "TCPAOKeyNotFound", 0, FAULT_TIMEOUT);
+		   "TCPAOKeyNotFound", TEST_CNT_NS_KEY_NOT_FOUND, FAULT_TIMEOUT);
 
 	try_accept("AO server + Non-AO client", port++, DEFAULT_TEST_PASSWORD,
 		   this_ip_dest, -1, 100, 100, 0,
@@ -133,8 +139,9 @@ static void *server_fn(void *arg)
 		   wrong_addr, -1, 100, 100, 0,
 		   "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND, FAULT_TIMEOUT);
 
+	/* Key rejected by the other side, failing short through skpair */
 	try_accept("Client: Wrong addr", port++, NULL,
-		   this_ip_dest, -1, 100, 100, 0, NULL, 0, FAULT_TIMEOUT);
+		   this_ip_dest, -1, 100, 100, 0, NULL, 0, FAULT_KEYREJECT);
 
 	try_accept("rcv id != snd id", port++, DEFAULT_TEST_PASSWORD,
 		   this_ip_dest, -1, 200, 100, 0,
@@ -161,8 +168,7 @@ static void try_connect(const char *tst_name, unsigned int port,
 			uint8_t sndid, uint8_t rcvid,
 			test_cnt cnt_expected, fault_t inj)
 {
-	struct tcp_ao_counters ao_cnt1, ao_cnt2;
-	time_t timeout;
+	struct tcp_counters cnt1, cnt2;
 	int sk, ret;
 
 	sk = socket(test_family, SOCK_STREAM, IPPROTO_TCP);
@@ -172,15 +178,15 @@ static void try_connect(const char *tst_name, unsigned int port,
 	if (pwd && test_add_key(sk, pwd, addr, prefix, sndid, rcvid))
 		test_error("setsockopt(TCP_AO_ADD_KEY)");
 
-	if (pwd && test_get_tcp_ao_counters(sk, &ao_cnt1))
-		test_error("test_get_tcp_ao_counters()");
+	if (pwd && test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* preparations done */
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	ret = _test_connect_socket(sk, this_ip_dest, port, timeout);
-
+	ret = test_skpair_connect_poll(sk, this_ip_dest, port, cnt_expected, &sk_pair);
+	synchronize_threads(); /* before counter checks */
 	if (ret < 0) {
+		sk_pair = ret;
 		if (fault(KEYREJECT) && ret == -EKEYREJECTED) {
 			test_ok("%s: connect() was prevented", tst_name);
 		} else if (ret == -ETIMEDOUT && fault(TIMEOUT)) {
@@ -199,9 +205,11 @@ static void try_connect(const char *tst_name, unsigned int port,
 	else
 		test_ok("%s: connected", tst_name);
 	if (pwd && ret > 0) {
-		if (test_get_tcp_ao_counters(sk, &ao_cnt2))
-			test_error("test_get_tcp_ao_counters()");
-		test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected);
+		if (test_get_tcp_counters(sk, &cnt2))
+			test_error("test_get_tcp_counters()");
+		test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected);
+	} else if (pwd) {
+		test_tcp_counters_free(&cnt1);
 	}
 out:
 	synchronize_threads(); /* close() */
@@ -212,30 +220,49 @@ out:
 
 static void *client_fn(void *arg)
 {
-	union tcp_addr wrong_addr, network_addr;
+	union tcp_addr wrong_addr, network_addr, addr_any = {};
 	unsigned int port = test_server_port;
 
 	if (inet_pton(TEST_FAMILY, TEST_WRONG_IP, &wrong_addr) != 1)
 		test_error("Can't convert ip address %s", TEST_WRONG_IP);
 
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("Non-AO server + AO client", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, this_ip_dest,
+				-1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO server + Non-AO client", port++, NULL,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("Wrong password", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("Wrong rcv id", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	/*
+	 * XXX: The test doesn't increase any counters, see tcp_make_synack().
+	 * Potentially, it can be speed up by setting sk_pair = -ETIMEDOUT
+	 * but the price would be increased complexity of the tracer thread.
+	 */
+	trace_ao_event_sk_expect(TCP_AO_SYNACK_NO_KEY, this_ip_dest, addr_any,
+				 port, 0, 100, 100);
 	try_connect("Wrong snd id", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	trace_ao_event_expect(TCP_AO_WRONG_MACLEN, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("Different maclen", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("Server: Wrong addr", port++, DEFAULT_TEST_PASSWORD,
 			this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT);
 
@@ -259,6 +286,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(21, server_fn, client_fn);
+	test_init(22, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c
index 81653b47f303..340f00e979ea 100644
--- a/tools/testing/selftests/net/tcp_ao/connect.c
+++ b/tools/testing/selftests/net/tcp_ao/connect.c
@@ -35,7 +35,7 @@ static void *client_fn(void *arg)
 	uint64_t before_aogood, after_aogood;
 	const size_t nr_packets = 20;
 	struct netstat *ns_before, *ns_after;
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters ao1, ao2;
 
 	if (sk < 0)
 		test_error("socket()");
@@ -50,41 +50,41 @@ static void *client_fn(void *arg)
 
 	ns_before = netstat_read();
 	before_aogood = netstat_get(ns_before, "TCPAOGood", NULL);
-	if (test_get_tcp_ao_counters(sk, &ao1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &ao1))
+		test_error("test_get_tcp_counters()");
 
-	if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) {
+	if (test_client_verify(sk, 100, nr_packets)) {
 		test_fail("verify failed");
 		return NULL;
 	}
 
 	ns_after = netstat_read();
 	after_aogood = netstat_get(ns_after, "TCPAOGood", NULL);
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &ao2))
+		test_error("test_get_tcp_counters()");
 	netstat_print_diff(ns_before, ns_after);
 	netstat_free(ns_before);
 	netstat_free(ns_after);
 
 	if (nr_packets > (after_aogood - before_aogood)) {
-		test_fail("TCPAOGood counter mismatch: %zu > (%zu - %zu)",
+		test_fail("TCPAOGood counter mismatch: %zu > (%" PRIu64 " - %" PRIu64 ")",
 				nr_packets, after_aogood, before_aogood);
 		return NULL;
 	}
-	if (test_tcp_ao_counters_cmp("connect", &ao1, &ao2, TEST_CNT_GOOD))
+	if (test_assert_counters("connect", &ao1, &ao2, TEST_CNT_GOOD))
 		return NULL;
 
-	test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %" PRIu64,
-			before_aogood, ao1.ao_info_pkt_good,
-			ao1.key_cnts[0].pkt_good,
-			after_aogood, ao2.ao_info_pkt_good,
-			ao2.key_cnts[0].pkt_good,
+	test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %zu",
+			before_aogood, ao1.ao.ao_info_pkt_good,
+			ao1.ao.key_cnts[0].pkt_good,
+			after_aogood, ao2.ao.ao_info_pkt_good,
+			ao2.ao.key_cnts[0].pkt_good,
 			nr_packets);
 	return NULL;
 }
 
 int main(int argc, char *argv[])
 {
-	test_init(1, server_fn, client_fn);
+	test_init(2, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/icmps-discard.c b/tools/testing/selftests/net/tcp_ao/icmps-discard.c
index d69bcba3c929..85c1a1e958c6 100644
--- a/tools/testing/selftests/net/tcp_ao/icmps-discard.c
+++ b/tools/testing/selftests/net/tcp_ao/icmps-discard.c
@@ -53,7 +53,7 @@ static void serve_interfered(int sk)
 	ssize_t test_quota = packet_size * packets_nr * 10;
 	uint64_t dest_unreach_a, dest_unreach_b;
 	uint64_t icmp_ignored_a, icmp_ignored_b;
-	struct tcp_ao_counters ao_cnt1, ao_cnt2;
+	struct tcp_counters cnt1, cnt2;
 	bool counter_not_found;
 	struct netstat *ns_after, *ns_before;
 	ssize_t bytes;
@@ -61,16 +61,16 @@ static void serve_interfered(int sk)
 	ns_before = netstat_read();
 	dest_unreach_a = netstat_get(ns_before, dst_unreach, NULL);
 	icmp_ignored_a = netstat_get(ns_before, tcpao_icmps, NULL);
-	if (test_get_tcp_ao_counters(sk, &ao_cnt1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 	bytes = test_server_run(sk, test_quota, 0);
 	ns_after = netstat_read();
 	netstat_print_diff(ns_before, ns_after);
 	dest_unreach_b = netstat_get(ns_after, dst_unreach, NULL);
 	icmp_ignored_b = netstat_get(ns_after, tcpao_icmps,
 					&counter_not_found);
-	if (test_get_tcp_ao_counters(sk, &ao_cnt2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 
 	netstat_free(ns_before);
 	netstat_free(ns_after);
@@ -91,9 +91,9 @@ static void serve_interfered(int sk)
 		return;
 	}
 #ifdef TEST_ICMPS_ACCEPT
-	test_tcp_ao_counters_cmp(NULL, &ao_cnt1, &ao_cnt2, TEST_CNT_GOOD);
+	test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD);
 #else
-	test_tcp_ao_counters_cmp(NULL, &ao_cnt1, &ao_cnt2, TEST_CNT_GOOD | TEST_CNT_AO_DROPPED_ICMP);
+	test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD | TEST_CNT_AO_DROPPED_ICMP);
 #endif
 	if (icmp_ignored_a >= icmp_ignored_b) {
 		test_icmps_fail("%s counter didn't change: %" PRIu64 " >= %" PRIu64,
@@ -395,7 +395,6 @@ static void icmp_interfere(const size_t nr, uint32_t rcv_nxt, void *src, void *d
 
 static void send_interfered(int sk)
 {
-	const unsigned int timeout = TEST_TIMEOUT_SEC;
 	struct sockaddr_in6 src, dst;
 	socklen_t addr_sz;
 
@@ -409,7 +408,7 @@ static void send_interfered(int sk)
 	while (1) {
 		uint32_t rcv_nxt;
 
-		if (test_client_verify(sk, packet_size, packets_nr, timeout)) {
+		if (test_client_verify(sk, packet_size, packets_nr)) {
 			test_fail("client: connection is broken");
 			return;
 		}
@@ -444,6 +443,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(3, server_fn, client_fn);
+	test_init(4, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c
index 24e62120b792..69d9a7a05d5c 100644
--- a/tools/testing/selftests/net/tcp_ao/key-management.c
+++ b/tools/testing/selftests/net/tcp_ao/key-management.c
@@ -629,11 +629,11 @@ static int key_collection_socket(bool server, unsigned int port)
 }
 
 static void verify_counters(const char *tst_name, bool is_listen_sk, bool server,
-			    struct tcp_ao_counters *a, struct tcp_ao_counters *b)
+			    struct tcp_counters *a, struct tcp_counters *b)
 {
 	unsigned int i;
 
-	__test_tcp_ao_counters_cmp(tst_name, a, b, TEST_CNT_GOOD);
+	test_assert_counters_sk(tst_name, a, b, TEST_CNT_GOOD);
 
 	for (i = 0; i < collection.nr_keys; i++) {
 		struct test_key *key = &collection.keys[i];
@@ -652,12 +652,12 @@ static void verify_counters(const char *tst_name, bool is_listen_sk, bool server
 			rx_cnt_expected = key->used_on_server_tx;
 		}
 
-		test_tcp_ao_key_counters_cmp(tst_name, a, b,
-					     rx_cnt_expected ? TEST_CNT_KEY_GOOD : 0,
-					     sndid, rcvid);
+		test_assert_counters_key(tst_name, &a->ao, &b->ao,
+					 rx_cnt_expected ? TEST_CNT_KEY_GOOD : 0,
+					 sndid, rcvid);
 	}
-	test_tcp_ao_counters_free(a);
-	test_tcp_ao_counters_free(b);
+	test_tcp_counters_free(a);
+	test_tcp_counters_free(b);
 	test_ok("%s: passed counters checks", tst_name);
 }
 
@@ -791,17 +791,17 @@ out:
 }
 
 static int start_server(const char *tst_name, unsigned int port, size_t quota,
-			struct tcp_ao_counters *begin,
+			struct tcp_counters *begin,
 			unsigned int current_index, unsigned int rnext_index)
 {
-	struct tcp_ao_counters lsk_c1, lsk_c2;
+	struct tcp_counters lsk_c1, lsk_c2;
 	ssize_t bytes;
 	int sk, lsk;
 
 	synchronize_threads(); /* 1: key collection initialized */
 	lsk = key_collection_socket(true, port);
-	if (test_get_tcp_ao_counters(lsk, &lsk_c1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(lsk, &lsk_c1))
+		test_error("test_get_tcp_counters()");
 	synchronize_threads(); /* 2: MKTs added => connect() */
 	if (test_wait_fd(lsk, TEST_TIMEOUT_SEC, 0))
 		test_error("test_wait_fd()");
@@ -809,12 +809,12 @@ static int start_server(const char *tst_name, unsigned int port, size_t quota,
 	sk = accept(lsk, NULL, NULL);
 	if (sk < 0)
 		test_error("accept()");
-	if (test_get_tcp_ao_counters(sk, begin))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, begin))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* 3: accepted => send data */
-	if (test_get_tcp_ao_counters(lsk, &lsk_c2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(lsk, &lsk_c2))
+		test_error("test_get_tcp_counters()");
 	verify_keys(tst_name, lsk, true, true);
 	close(lsk);
 
@@ -830,12 +830,12 @@ static int start_server(const char *tst_name, unsigned int port, size_t quota,
 }
 
 static void end_server(const char *tst_name, int sk,
-		       struct tcp_ao_counters *begin)
+		       struct tcp_counters *begin)
 {
-	struct tcp_ao_counters end;
+	struct tcp_counters end;
 
-	if (test_get_tcp_ao_counters(sk, &end))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &end))
+		test_error("test_get_tcp_counters()");
 	verify_keys(tst_name, sk, false, true);
 
 	synchronize_threads(); /* 4: verified => closed */
@@ -848,7 +848,7 @@ static void end_server(const char *tst_name, int sk,
 static void try_server_run(const char *tst_name, unsigned int port, size_t quota,
 			   unsigned int current_index, unsigned int rnext_index)
 {
-	struct tcp_ao_counters tmp;
+	struct tcp_counters tmp;
 	int sk;
 
 	sk = start_server(tst_name, port, quota, &tmp,
@@ -860,7 +860,7 @@ static void server_rotations(const char *tst_name, unsigned int port,
 			     size_t quota, unsigned int rotations,
 			     unsigned int current_index, unsigned int rnext_index)
 {
-	struct tcp_ao_counters tmp;
+	struct tcp_counters tmp;
 	unsigned int i;
 	int sk;
 
@@ -886,7 +886,7 @@ static void server_rotations(const char *tst_name, unsigned int port,
 
 static int run_client(const char *tst_name, unsigned int port,
 		      unsigned int nr_keys, int current_index, int rnext_index,
-		      struct tcp_ao_counters *before,
+		      struct tcp_counters *before,
 		      const size_t msg_sz, const size_t msg_nr)
 {
 	int sk;
@@ -904,8 +904,8 @@ static int run_client(const char *tst_name, unsigned int port,
 		if (test_set_key(sk, sndid, rcvid))
 			test_error("failed to set current/rnext keys");
 	}
-	if (before && test_get_tcp_ao_counters(sk, before))
-		test_error("test_get_tcp_ao_counters()");
+	if (before && test_get_tcp_counters(sk, before))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* 2: MKTs added => connect() */
 	if (test_connect_socket(sk, this_ip_dest, port++) <= 0)
@@ -918,11 +918,11 @@ static int run_client(const char *tst_name, unsigned int port,
 	collection.keys[rnext_index].used_on_server_tx = 1;
 
 	synchronize_threads(); /* 3: accepted => send data */
-	if (test_client_verify(sk, msg_sz, msg_nr, TEST_TIMEOUT_SEC)) {
+	if (test_client_verify(sk, msg_sz, msg_nr)) {
 		test_fail("verify failed");
 		close(sk);
 		if (before)
-			test_tcp_ao_counters_free(before);
+			test_tcp_counters_free(before);
 		return -1;
 	}
 
@@ -931,7 +931,7 @@ static int run_client(const char *tst_name, unsigned int port,
 
 static int start_client(const char *tst_name, unsigned int port,
 			unsigned int nr_keys, int current_index, int rnext_index,
-			struct tcp_ao_counters *before,
+			struct tcp_counters *before,
 			const size_t msg_sz, const size_t msg_nr)
 {
 	if (init_default_key_collection(nr_keys, true))
@@ -943,9 +943,9 @@ static int start_client(const char *tst_name, unsigned int port,
 
 static void end_client(const char *tst_name, int sk, unsigned int nr_keys,
 		       int current_index, int rnext_index,
-		       struct tcp_ao_counters *start)
+		       struct tcp_counters *start)
 {
-	struct tcp_ao_counters end;
+	struct tcp_counters end;
 
 	/* Some application may become dependent on this kernel choice */
 	if (current_index < 0)
@@ -955,8 +955,8 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys,
 	verify_current_rnext(tst_name, sk,
 			     collection.keys[current_index].client_keyid,
 			     collection.keys[rnext_index].server_keyid);
-	if (start && test_get_tcp_ao_counters(sk, &end))
-		test_error("test_get_tcp_ao_counters()");
+	if (start && test_get_tcp_counters(sk, &end))
+		test_error("test_get_tcp_counters()");
 	verify_keys(tst_name, sk, false, false);
 	synchronize_threads(); /* 4: verify => closed */
 	close(sk);
@@ -965,7 +965,7 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys,
 	synchronize_threads(); /* 5: counters */
 }
 
-static void try_unmatched_keys(int sk, int *rnext_index)
+static void try_unmatched_keys(int sk, int *rnext_index, unsigned int port)
 {
 	struct test_key *key;
 	unsigned int i = 0;
@@ -1013,7 +1013,10 @@ static void try_unmatched_keys(int sk, int *rnext_index)
 		test_error("all keys on server match the client");
 	if (test_set_key(sk, -1, key->server_keyid))
 		test_error("Can't change the current key");
-	if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC))
+	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_addr, this_ip_dest,
+			      -1, port, 0, -1, -1, -1, -1, -1,
+			      -1, key->server_keyid, -1);
+	if (test_client_verify(sk, msg_len, nr_packets))
 		test_fail("verify failed");
 	*rnext_index = i;
 }
@@ -1045,7 +1048,7 @@ static void check_current_back(const char *tst_name, unsigned int port,
 			       unsigned int current_index, unsigned int rnext_index,
 			       unsigned int rotate_to_index)
 {
-	struct tcp_ao_counters tmp;
+	struct tcp_counters tmp;
 	int sk;
 
 	sk = start_client(tst_name, port, nr_keys, current_index, rnext_index,
@@ -1054,7 +1057,11 @@ static void check_current_back(const char *tst_name, unsigned int port,
 		return;
 	if (test_set_key(sk, collection.keys[rotate_to_index].client_keyid, -1))
 		test_error("Can't change the current key");
-	if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC))
+	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_dest, this_ip_addr,
+			      port, -1, 0, -1, -1, -1, -1, -1,
+			      collection.keys[rotate_to_index].client_keyid,
+			      collection.keys[current_index].client_keyid, -1);
+	if (test_client_verify(sk, msg_len, nr_packets))
 		test_fail("verify failed");
 	/* There is a race here: between setting the current_key with
 	 * setsockopt(TCP_AO_INFO) and starting to send some data - there
@@ -1074,7 +1081,7 @@ static void roll_over_keys(const char *tst_name, unsigned int port,
 			   unsigned int nr_keys, unsigned int rotations,
 			   unsigned int current_index, unsigned int rnext_index)
 {
-	struct tcp_ao_counters tmp;
+	struct tcp_counters tmp;
 	unsigned int i;
 	int sk;
 
@@ -1085,12 +1092,17 @@ static void roll_over_keys(const char *tst_name, unsigned int port,
 	for (i = rnext_index + 1; rotations > 0; i++, rotations--) {
 		if (i >= collection.nr_keys)
 			i = 0;
+		trace_ao_event_expect(TCP_AO_RNEXT_REQUEST,
+				this_ip_addr, this_ip_dest,
+				-1, port, 0, -1, -1, -1, -1, -1,
+				i == 0 ? -1 : collection.keys[i - 1].server_keyid,
+				collection.keys[i].server_keyid, -1);
 		if (test_set_key(sk, -1, collection.keys[i].server_keyid))
 			test_error("Can't change the Rnext key");
-		if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) {
+		if (test_client_verify(sk, msg_len, nr_packets)) {
 			test_fail("verify failed");
 			close(sk);
-			test_tcp_ao_counters_free(&tmp);
+			test_tcp_counters_free(&tmp);
 			return;
 		}
 		verify_current_rnext(tst_name, sk, -1,
@@ -1104,7 +1116,7 @@ static void roll_over_keys(const char *tst_name, unsigned int port,
 static void try_client_run(const char *tst_name, unsigned int port,
 			   unsigned int nr_keys, int current_index, int rnext_index)
 {
-	struct tcp_ao_counters tmp;
+	struct tcp_counters tmp;
 	int sk;
 
 	sk = start_client(tst_name, port, nr_keys, current_index, rnext_index,
@@ -1124,7 +1136,7 @@ static void try_client_match(const char *tst_name, unsigned int port,
 				 rnext_index, msg_len, nr_packets);
 	if (sk < 0)
 		return;
-	try_unmatched_keys(sk, &rnext_index);
+	try_unmatched_keys(sk, &rnext_index, port);
 	end_client(tst_name, sk, nr_keys, current_index, rnext_index, NULL);
 }
 
@@ -1181,6 +1193,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(120, server_fn, client_fn);
+	test_init(121, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
index fbc7f6111815..ebb2899c12fe 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h
+++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
@@ -37,17 +37,59 @@ extern void __test_xfail(const char *buf);
 extern void __test_error(const char *buf);
 extern void __test_skip(const char *buf);
 
-__attribute__((__format__(__printf__, 2, 3)))
-static inline void __test_print(void (*fn)(const char *), const char *fmt, ...)
+static inline char *test_snprintf(const char *fmt, va_list vargs)
 {
-#define TEST_MSG_BUFFER_SIZE 4096
-	char buf[TEST_MSG_BUFFER_SIZE];
-	va_list arg;
-
-	va_start(arg, fmt);
-	vsnprintf(buf, sizeof(buf), fmt, arg);
-	va_end(arg);
-	fn(buf);
+	char *ret = NULL;
+	size_t size = 0;
+	va_list tmp;
+	int n = 0;
+
+	va_copy(tmp, vargs);
+	n = vsnprintf(ret, size, fmt, tmp);
+	va_end(tmp);
+	if (n < 0)
+		return NULL;
+
+	size = n + 1;
+	ret = malloc(size);
+	if (!ret)
+		return NULL;
+
+	n = vsnprintf(ret, size, fmt, vargs);
+	if (n < 0 || n > size - 1) {
+		free(ret);
+		return NULL;
+	}
+	return ret;
+}
+
+static __printf(1, 2) inline char *test_sprintf(const char *fmt, ...)
+{
+	va_list vargs;
+	char *ret;
+
+	va_start(vargs, fmt);
+	ret = test_snprintf(fmt, vargs);
+	va_end(vargs);
+
+	return ret;
+}
+
+static __printf(2, 3) inline void __test_print(void (*fn)(const char *),
+					       const char *fmt, ...)
+{
+	va_list vargs;
+	char *msg;
+
+	va_start(vargs, fmt);
+	msg = test_snprintf(fmt, vargs);
+	va_end(vargs);
+
+	if (!msg)
+		return;
+
+	fn(msg);
+	free(msg);
 }
 
 #define test_print(fmt, ...)						\
@@ -103,6 +145,7 @@ enum test_needs_kconfig {
 	KCONFIG_TCP_AO,			/* required */
 	KCONFIG_TCP_MD5,		/* optional, for TCP-MD5 features */
 	KCONFIG_NET_VRF,		/* optional, for L3/VRF testing */
+	KCONFIG_FTRACE,			/* optional, for tracepoints checks */
 	__KCONFIG_LAST__
 };
 extern bool kernel_config_has(enum test_needs_kconfig k);
@@ -142,6 +185,8 @@ static inline void test_init2(unsigned int ntests,
 	__test_init(ntests, family, prefix, taddr1, taddr2, peer1, peer2);
 }
 extern void test_add_destructor(void (*d)(void));
+extern void test_init_ftrace(int nsfd1, int nsfd2);
+extern int test_setup_tracing(void);
 
 /* To adjust optmem socket limit, approximately estimate a number,
  * that is bigger than sizeof(struct tcp_ao_key).
@@ -216,12 +261,17 @@ static inline void test_init(unsigned int ntests,
 }
 extern void synchronize_threads(void);
 extern void switch_ns(int fd);
+extern int switch_save_ns(int fd);
+extern void switch_close_ns(int fd);
 
 extern __thread union tcp_addr this_ip_addr;
 extern __thread union tcp_addr this_ip_dest;
 extern int test_family;
 
 extern void randomize_buffer(void *buf, size_t buflen);
+extern __printf(3, 4) int test_echo(const char *fname, bool append,
+				    const char *fmt, ...);
+
 extern int open_netns(void);
 extern int unshare_open_netns(void);
 extern const char veth_name[];
@@ -239,7 +289,7 @@ extern int link_set_up(const char *intf);
 extern const unsigned int test_server_port;
 extern int test_wait_fd(int sk, time_t sec, bool write);
 extern int __test_connect_socket(int sk, const char *device,
-				 void *addr, size_t addr_sz, time_t timeout);
+				 void *addr, size_t addr_sz, bool async);
 extern int __test_listen_socket(int backlog, void *addr, size_t addr_sz);
 
 static inline int test_listen_socket(const union tcp_addr taddr,
@@ -281,25 +331,26 @@ static inline int test_listen_socket(const union tcp_addr taddr,
  * If set to 0 - kernel will try to retransmit SYN number of times, set in
  * /proc/sys/net/ipv4/tcp_syn_retries
  * By default set to 1 to make tests pass faster on non-busy machine.
+ * [in process of removal, don't use in new tests]
  */
 #ifndef TEST_RETRANSMIT_SEC
 #define TEST_RETRANSMIT_SEC	1
 #endif
 
 static inline int _test_connect_socket(int sk, const union tcp_addr taddr,
-				       unsigned int port, time_t timeout)
+				       unsigned int port, bool async)
 {
 	sockaddr_af addr;
 
 	tcp_addr_to_sockaddr_in(&addr, &taddr, htons(port));
 	return __test_connect_socket(sk, veth_name,
-				     (void *)&addr, sizeof(addr), timeout);
+				     (void *)&addr, sizeof(addr), async);
 }
 
 static inline int test_connect_socket(int sk, const union tcp_addr taddr,
 				      unsigned int port)
 {
-	return _test_connect_socket(sk, taddr, port, TEST_TIMEOUT_SEC);
+	return _test_connect_socket(sk, taddr, port, false);
 }
 
 extern int __test_set_md5(int sk, void *addr, size_t addr_sz,
@@ -433,10 +484,7 @@ static inline int test_set_ao_flags(int sk, bool ao_required, bool accept_icmps)
 }
 
 extern ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec);
-extern ssize_t test_client_loop(int sk, char *buf, size_t buf_sz,
-				const size_t msg_len, time_t timeout_sec);
-extern int test_client_verify(int sk, const size_t msg_len, const size_t nr,
-			      time_t timeout_sec);
+extern int test_client_verify(int sk, const size_t msg_len, const size_t nr);
 
 struct tcp_ao_key_counters {
 	uint8_t sndid;
@@ -462,7 +510,15 @@ struct tcp_ao_counters {
 	size_t nr_keys;
 	struct tcp_ao_key_counters *key_cnts;
 };
-extern int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out);
+
+struct tcp_counters {
+	struct tcp_ao_counters ao;
+	uint64_t netns_md5_notfound;
+	uint64_t netns_md5_unexpected;
+	uint64_t netns_md5_failure;
+};
+
+extern int test_get_tcp_counters(int sk, struct tcp_counters *out);
 
 #define TEST_CNT_KEY_GOOD		BIT(0)
 #define TEST_CNT_KEY_BAD		BIT(1)
@@ -476,8 +532,31 @@ extern int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out);
 #define TEST_CNT_NS_KEY_NOT_FOUND	BIT(9)
 #define TEST_CNT_NS_AO_REQUIRED		BIT(10)
 #define TEST_CNT_NS_DROPPED_ICMP	BIT(11)
+#define TEST_CNT_NS_MD5_NOT_FOUND	BIT(12)
+#define TEST_CNT_NS_MD5_UNEXPECTED	BIT(13)
+#define TEST_CNT_NS_MD5_FAILURE		BIT(14)
 typedef uint16_t test_cnt;
 
+#define _for_each_counter(f)						\
+do {									\
+	/* per-netns */							\
+	f(ao.netns_ao_good,		TEST_CNT_NS_GOOD);		\
+	f(ao.netns_ao_bad,		TEST_CNT_NS_BAD);		\
+	f(ao.netns_ao_key_not_found,	TEST_CNT_NS_KEY_NOT_FOUND);	\
+	f(ao.netns_ao_required,		TEST_CNT_NS_AO_REQUIRED);	\
+	f(ao.netns_ao_dropped_icmp,	TEST_CNT_NS_DROPPED_ICMP);	\
+	/* per-socket */						\
+	f(ao.ao_info_pkt_good,		TEST_CNT_SOCK_GOOD);		\
+	f(ao.ao_info_pkt_bad,		TEST_CNT_SOCK_BAD);		\
+	f(ao.ao_info_pkt_key_not_found,	TEST_CNT_SOCK_KEY_NOT_FOUND);	\
+	f(ao.ao_info_pkt_ao_required,	TEST_CNT_SOCK_AO_REQUIRED);	\
+	f(ao.ao_info_pkt_dropped_icmp,	TEST_CNT_SOCK_DROPPED_ICMP);	\
+	/* non-AO */							\
+	f(netns_md5_notfound,		TEST_CNT_NS_MD5_NOT_FOUND);	\
+	f(netns_md5_unexpected,		TEST_CNT_NS_MD5_UNEXPECTED);	\
+	f(netns_md5_failure,		TEST_CNT_NS_MD5_FAILURE);	\
+} while (0)
+
 #define TEST_CNT_AO_GOOD		(TEST_CNT_SOCK_GOOD | TEST_CNT_NS_GOOD)
 #define TEST_CNT_AO_BAD			(TEST_CNT_SOCK_BAD | TEST_CNT_NS_BAD)
 #define TEST_CNT_AO_KEY_NOT_FOUND	(TEST_CNT_SOCK_KEY_NOT_FOUND | \
@@ -489,34 +568,71 @@ typedef uint16_t test_cnt;
 #define TEST_CNT_GOOD			(TEST_CNT_KEY_GOOD | TEST_CNT_AO_GOOD)
 #define TEST_CNT_BAD			(TEST_CNT_KEY_BAD | TEST_CNT_AO_BAD)
 
-extern int __test_tcp_ao_counters_cmp(const char *tst_name,
-		struct tcp_ao_counters *before, struct tcp_ao_counters *after,
+extern test_cnt test_cmp_counters(struct tcp_counters *before,
+				  struct tcp_counters *after);
+extern int test_assert_counters_sk(const char *tst_name,
+		struct tcp_counters *before, struct tcp_counters *after,
 		test_cnt expected);
-extern int test_tcp_ao_key_counters_cmp(const char *tst_name,
+extern int test_assert_counters_key(const char *tst_name,
 		struct tcp_ao_counters *before, struct tcp_ao_counters *after,
 		test_cnt expected, int sndid, int rcvid);
-extern void test_tcp_ao_counters_free(struct tcp_ao_counters *cnts);
+extern void test_tcp_counters_free(struct tcp_counters *cnts);
+
+/*
+ * Polling for netns and socket counters during select()/connect() and also
+ * client/server messaging. Instead of constant timeout on underlying select(),
+ * check the counters and return early. This allows to pass the tests where
+ * timeout is expected without waiting for that fixing timeout (tests speed-up).
+ * Previously shorter timeouts were used for tests expecting to time out,
+ * but that leaded to sporadic false positives on counter checks failures,
+ * as one second timeouts aren't enough for TCP retransmit.
+ *
+ * Two sides of the socketpair (client/server) should synchronize failures
+ * using a shared variable *err, so that they can detect the other side's
+ * failure.
+ */
+extern int test_skpair_wait_poll(int sk, bool write, test_cnt cond,
+				 volatile int *err);
+extern int _test_skpair_connect_poll(int sk, const char *device,
+				     void *addr, size_t addr_sz,
+				     test_cnt cond, volatile int *err);
+static inline int test_skpair_connect_poll(int sk, const union tcp_addr taddr,
+					   unsigned int port,
+					   test_cnt cond, volatile int *err)
+{
+	sockaddr_af addr;
+
+	tcp_addr_to_sockaddr_in(&addr, &taddr, htons(port));
+	return _test_skpair_connect_poll(sk, veth_name,
+					 (void *)&addr, sizeof(addr), cond, err);
+}
+
+extern int test_skpair_client(int sk, const size_t msg_len, const size_t nr,
+			      test_cnt cond, volatile int *err);
+extern int test_skpair_server(int sk, ssize_t quota,
+			      test_cnt cond, volatile int *err);
+
 /*
- * Frees buffers allocated in test_get_tcp_ao_counters().
+ * Frees buffers allocated in test_get_tcp_counters().
  * The function doesn't expect new keys or keys removed between calls
- * to test_get_tcp_ao_counters(). Check key counters manually if they
+ * to test_get_tcp_counters(). Check key counters manually if they
  * may change.
  */
-static inline int test_tcp_ao_counters_cmp(const char *tst_name,
-					   struct tcp_ao_counters *before,
-					   struct tcp_ao_counters *after,
-					   test_cnt expected)
+static inline int test_assert_counters(const char *tst_name,
+				       struct tcp_counters *before,
+				       struct tcp_counters *after,
+				       test_cnt expected)
 {
 	int ret;
 
-	ret = __test_tcp_ao_counters_cmp(tst_name, before, after, expected);
+	ret = test_assert_counters_sk(tst_name, before, after, expected);
 	if (ret)
 		goto out;
-	ret = test_tcp_ao_key_counters_cmp(tst_name, before, after,
-					   expected, -1, -1);
+	ret = test_assert_counters_key(tst_name, &before->ao, &after->ao,
+				       expected, -1, -1);
 out:
-	test_tcp_ao_counters_free(before);
-	test_tcp_ao_counters_free(after);
+	test_tcp_counters_free(before);
+	test_tcp_counters_free(after);
 	return ret;
 }
 
@@ -602,4 +718,115 @@ static inline int test_add_repaired_key(int sk,
 	return test_verify_socket_key(sk, &tmp);
 }
 
+#define DEFAULT_FTRACE_BUFFER_KB	10000
+#define DEFAULT_TRACER_LINES_ARR	200
+struct test_ftracer;
+extern uint64_t ns_cookie1, ns_cookie2;
+
+enum ftracer_op {
+	FTRACER_LINE_DISCARD = 0,
+	FTRACER_LINE_PRESERVE,
+	FTRACER_EXIT,
+};
+
+extern struct test_ftracer *create_ftracer(const char *name,
+		enum ftracer_op (*process_line)(const char *line),
+		void (*destructor)(struct test_ftracer *tracer),
+		bool (*expecting_more)(void),
+		size_t lines_buf_sz, size_t buffer_size_kb);
+extern int setup_trace_event(struct test_ftracer *tracer,
+			     const char *event, const char *filter);
+extern void destroy_ftracer(struct test_ftracer *tracer);
+extern const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer);
+extern const char **tracer_get_savedlines(struct test_ftracer *tracer);
+
+enum trace_events {
+	/* TCP_HASH_EVENT */
+	TCP_HASH_BAD_HEADER = 0,
+	TCP_HASH_MD5_REQUIRED,
+	TCP_HASH_MD5_UNEXPECTED,
+	TCP_HASH_MD5_MISMATCH,
+	TCP_HASH_AO_REQUIRED,
+	/* TCP_AO_EVENT */
+	TCP_AO_HANDSHAKE_FAILURE,
+	TCP_AO_WRONG_MACLEN,
+	TCP_AO_MISMATCH,
+	TCP_AO_KEY_NOT_FOUND,
+	TCP_AO_RNEXT_REQUEST,
+	/* TCP_AO_EVENT_SK */
+	TCP_AO_SYNACK_NO_KEY,
+	/* TCP_AO_EVENT_SNE */
+	TCP_AO_SND_SNE_UPDATE,
+	TCP_AO_RCV_SNE_UPDATE,
+	__MAX_TRACE_EVENTS
+};
+
+extern int __trace_event_expect(enum trace_events type, int family,
+				union tcp_addr src, union tcp_addr dst,
+				int src_port, int dst_port, int L3index,
+				int fin, int syn, int rst, int psh, int ack,
+				int keyid, int rnext, int maclen, int sne);
+
+static inline void trace_hash_event_expect(enum trace_events type,
+				union tcp_addr src, union tcp_addr dst,
+				int src_port, int dst_port, int L3index,
+				int fin, int syn, int rst, int psh, int ack)
+{
+	int err;
+
+	err = __trace_event_expect(type, TEST_FAMILY, src, dst,
+				   src_port, dst_port, L3index,
+				   fin, syn, rst, psh, ack,
+				   -1, -1, -1, -1);
+	if (err)
+		test_error("Couldn't add a trace event: %d", err);
+}
+
+static inline void trace_ao_event_expect(enum trace_events type,
+				union tcp_addr src, union tcp_addr dst,
+				int src_port, int dst_port, int L3index,
+				int fin, int syn, int rst, int psh, int ack,
+				int keyid, int rnext, int maclen)
+{
+	int err;
+
+	err = __trace_event_expect(type, TEST_FAMILY, src, dst,
+				   src_port, dst_port, L3index,
+				   fin, syn, rst, psh, ack,
+				   keyid, rnext, maclen, -1);
+	if (err)
+		test_error("Couldn't add a trace event: %d", err);
+}
+
+static inline void trace_ao_event_sk_expect(enum trace_events type,
+				union tcp_addr src, union tcp_addr dst,
+				int src_port, int dst_port,
+				int keyid, int rnext)
+{
+	int err;
+
+	err = __trace_event_expect(type, TEST_FAMILY, src, dst,
+				   src_port, dst_port, -1,
+				   -1, -1, -1, -1, -1,
+				   keyid, rnext, -1, -1);
+	if (err)
+		test_error("Couldn't add a trace event: %d", err);
+}
+
+static inline void trace_ao_event_sne_expect(enum trace_events type,
+				union tcp_addr src, union tcp_addr dst,
+				int src_port, int dst_port, int sne)
+{
+	int err;
+
+	err = __trace_event_expect(type, TEST_FAMILY, src, dst,
+				   src_port, dst_port, -1,
+				   -1, -1, -1, -1, -1,
+				   -1, -1, -1, sne);
+	if (err)
+		test_error("Couldn't add a trace event: %d", err);
+}
+
+extern int setup_aolib_ftracer(void);
+
 #endif /* _AOLIB_H_ */
diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c
new file mode 100644
index 000000000000..27403f875054
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <inttypes.h>
+#include <pthread.h>
+#include "aolib.h"
+
+static const char *trace_event_names[__MAX_TRACE_EVENTS] = {
+	/* TCP_HASH_EVENT */
+	"tcp_hash_bad_header",
+	"tcp_hash_md5_required",
+	"tcp_hash_md5_unexpected",
+	"tcp_hash_md5_mismatch",
+	"tcp_hash_ao_required",
+	/* TCP_AO_EVENT */
+	"tcp_ao_handshake_failure",
+	"tcp_ao_wrong_maclen",
+	"tcp_ao_mismatch",
+	"tcp_ao_key_not_found",
+	"tcp_ao_rnext_request",
+	/* TCP_AO_EVENT_SK */
+	"tcp_ao_synack_no_key",
+	/* TCP_AO_EVENT_SNE */
+	"tcp_ao_snd_sne_update",
+	"tcp_ao_rcv_sne_update"
+};
+
+struct expected_trace_point {
+	/* required */
+	enum trace_events type;
+	int family;
+	union tcp_addr src;
+	union tcp_addr dst;
+
+	/* optional */
+	int src_port;
+	int dst_port;
+	int L3index;
+
+	int fin;
+	int syn;
+	int rst;
+	int psh;
+	int ack;
+
+	int keyid;
+	int rnext;
+	int maclen;
+	int sne;
+
+	size_t matched;
+};
+
+static struct expected_trace_point *exp_tps;
+static size_t exp_tps_nr;
+static size_t exp_tps_size;
+static pthread_mutex_t exp_tps_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int __trace_event_expect(enum trace_events type, int family,
+			 union tcp_addr src, union tcp_addr dst,
+			 int src_port, int dst_port, int L3index,
+			 int fin, int syn, int rst, int psh, int ack,
+			 int keyid, int rnext, int maclen, int sne)
+{
+	struct expected_trace_point new_tp = {
+		.type           = type,
+		.family         = family,
+		.src            = src,
+		.dst            = dst,
+		.src_port       = src_port,
+		.dst_port       = dst_port,
+		.L3index        = L3index,
+		.fin            = fin,
+		.syn            = syn,
+		.rst            = rst,
+		.psh            = psh,
+		.ack            = ack,
+		.keyid          = keyid,
+		.rnext          = rnext,
+		.maclen         = maclen,
+		.sne            = sne,
+		.matched        = 0,
+	};
+	int ret = 0;
+
+	if (!kernel_config_has(KCONFIG_FTRACE))
+		return 0;
+
+	pthread_mutex_lock(&exp_tps_mutex);
+	if (exp_tps_nr == exp_tps_size) {
+		struct expected_trace_point *tmp;
+
+		if (exp_tps_size == 0)
+			exp_tps_size = 10;
+		else
+			exp_tps_size = exp_tps_size * 1.6;
+
+		tmp = reallocarray(exp_tps, exp_tps_size, sizeof(exp_tps[0]));
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		exp_tps = tmp;
+	}
+	exp_tps[exp_tps_nr] = new_tp;
+	exp_tps_nr++;
+out:
+	pthread_mutex_unlock(&exp_tps_mutex);
+	return ret;
+}
+
+static void free_expected_events(void)
+{
+	/* We're from the process destructor - not taking the mutex */
+	exp_tps_size = 0;
+	exp_tps = NULL;
+	free(exp_tps);
+}
+
+struct trace_point {
+	int family;
+	union tcp_addr src;
+	union tcp_addr dst;
+	unsigned int src_port;
+	unsigned int dst_port;
+	int L3index;
+	unsigned int fin:1,
+		     syn:1,
+		     rst:1,
+		     psh:1,
+		     ack:1;
+
+	unsigned int keyid;
+	unsigned int rnext;
+	unsigned int maclen;
+
+	unsigned int sne;
+};
+
+static bool lookup_expected_event(int event_type, struct trace_point *e)
+{
+	size_t i;
+
+	pthread_mutex_lock(&exp_tps_mutex);
+	for (i = 0; i < exp_tps_nr; i++) {
+		struct expected_trace_point *p = &exp_tps[i];
+		size_t sk_size;
+
+		if (p->type != event_type)
+			continue;
+		if (p->family != e->family)
+			continue;
+		if (p->family == AF_INET)
+			sk_size = sizeof(p->src.a4);
+		else
+			sk_size = sizeof(p->src.a6);
+		if (memcmp(&p->src, &e->src, sk_size))
+			continue;
+		if (memcmp(&p->dst, &e->dst, sk_size))
+			continue;
+		if (p->src_port >= 0 && p->src_port != e->src_port)
+			continue;
+		if (p->dst_port >= 0 && p->dst_port != e->dst_port)
+			continue;
+		if (p->L3index >= 0 && p->L3index != e->L3index)
+			continue;
+
+		if (p->fin >= 0 && p->fin != e->fin)
+			continue;
+		if (p->syn >= 0 && p->syn != e->syn)
+			continue;
+		if (p->rst >= 0 && p->rst != e->rst)
+			continue;
+		if (p->psh >= 0 && p->psh != e->psh)
+			continue;
+		if (p->ack >= 0 && p->ack != e->ack)
+			continue;
+
+		if (p->keyid >= 0 && p->keyid != e->keyid)
+			continue;
+		if (p->rnext >= 0 && p->rnext != e->rnext)
+			continue;
+		if (p->maclen >= 0 && p->maclen != e->maclen)
+			continue;
+		if (p->sne >= 0 && p->sne != e->sne)
+			continue;
+		p->matched++;
+		pthread_mutex_unlock(&exp_tps_mutex);
+		return true;
+	}
+	pthread_mutex_unlock(&exp_tps_mutex);
+	return false;
+}
+
+static int check_event_type(const char *line)
+{
+	size_t i;
+
+	/*
+	 * This should have been a set or hashmap, but it's a selftest,
+	 * so... KISS.
+	 */
+	for (i = 0; i < __MAX_TRACE_EVENTS; i++) {
+		if (!strncmp(trace_event_names[i], line, strlen(trace_event_names[i])))
+			return i;
+	}
+	return -1;
+}
+
+static bool event_has_flags(enum trace_events event)
+{
+	switch (event) {
+	case TCP_HASH_BAD_HEADER:
+	case TCP_HASH_MD5_REQUIRED:
+	case TCP_HASH_MD5_UNEXPECTED:
+	case TCP_HASH_MD5_MISMATCH:
+	case TCP_HASH_AO_REQUIRED:
+	case TCP_AO_HANDSHAKE_FAILURE:
+	case TCP_AO_WRONG_MACLEN:
+	case TCP_AO_MISMATCH:
+	case TCP_AO_KEY_NOT_FOUND:
+	case TCP_AO_RNEXT_REQUEST:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int tracer_ip_split(int family, char *src, char **addr, char **port)
+{
+	char *p;
+
+	if (family == AF_INET) {
+		/* fomat is <addr>:port, i.e.: 10.0.254.1:7015 */
+		*addr = src;
+		p = strchr(src, ':');
+		if (!p) {
+			test_print("Couldn't parse trace event addr:port %s", src);
+			return -EINVAL;
+		}
+		*p++ = '\0';
+		*port = p;
+		return 0;
+	}
+	if (family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	/* format is [<addr>]:port, i.e.: [2001:db8:254::1]:7013 */
+	*addr = strchr(src, '[');
+	p = strchr(src, ']');
+
+	if (!p || !*addr) {
+		test_print("Couldn't parse trace event [addr]:port %s", src);
+		return -EINVAL;
+	}
+
+	*addr = *addr + 1;      /* '[' */
+	*p++ = '\0';            /* ']' */
+	if (*p != ':') {
+		test_print("Couldn't parse trace event :port %s", p);
+		return -EINVAL;
+	}
+	*p++ = '\0';            /* ':' */
+	*port = p;
+	return 0;
+}
+
+static int tracer_scan_address(int family, char *src,
+			       union tcp_addr *dst, unsigned int *port)
+{
+	char *addr, *port_str;
+	int ret;
+
+	ret = tracer_ip_split(family, src, &addr, &port_str);
+	if (ret)
+		return ret;
+
+	if (inet_pton(family, addr, dst) != 1) {
+		test_print("Couldn't parse trace event addr %s", addr);
+		return -EINVAL;
+	}
+	errno = 0;
+	*port = (unsigned int)strtoul(port_str, NULL, 10);
+	if (errno != 0) {
+		test_print("Couldn't parse trace event port %s", port_str);
+		return -errno;
+	}
+	return 0;
+}
+
+static int tracer_scan_event(const char *line, enum trace_events event,
+			     struct trace_point *out)
+{
+	char *src = NULL, *dst = NULL, *family = NULL;
+	char fin, syn, rst, psh, ack;
+	int nr_matched, ret = 0;
+	uint64_t netns_cookie;
+
+	switch (event) {
+	case TCP_HASH_BAD_HEADER:
+	case TCP_HASH_MD5_REQUIRED:
+	case TCP_HASH_MD5_UNEXPECTED:
+	case TCP_HASH_MD5_MISMATCH:
+	case TCP_HASH_AO_REQUIRED: {
+		nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c]",
+				    &netns_cookie, &family,
+				    &src, &dst, &out->L3index,
+				    &fin, &syn, &rst, &psh, &ack);
+		if (nr_matched != 10)
+			test_print("Couldn't parse trace event, matched = %d/10",
+				   nr_matched);
+		break;
+	}
+	case TCP_AO_HANDSHAKE_FAILURE:
+	case TCP_AO_WRONG_MACLEN:
+	case TCP_AO_MISMATCH:
+	case TCP_AO_KEY_NOT_FOUND:
+	case TCP_AO_RNEXT_REQUEST: {
+		nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u",
+				    &netns_cookie, &family,
+				    &src, &dst, &out->L3index,
+				    &fin, &syn, &rst, &psh, &ack,
+				    &out->keyid, &out->rnext, &out->maclen);
+		if (nr_matched != 13)
+			test_print("Couldn't parse trace event, matched = %d/13",
+				   nr_matched);
+		break;
+	}
+	case TCP_AO_SYNACK_NO_KEY: {
+		nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms keyid=%u rnext=%u",
+				    &netns_cookie, &family,
+				    &src, &dst, &out->keyid, &out->rnext);
+		if (nr_matched != 6)
+			test_print("Couldn't parse trace event, matched = %d/6",
+				   nr_matched);
+		break;
+	}
+	case TCP_AO_SND_SNE_UPDATE:
+	case TCP_AO_RCV_SNE_UPDATE: {
+		nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms sne=%u",
+				    &netns_cookie, &family,
+				    &src, &dst, &out->sne);
+		if (nr_matched != 5)
+			test_print("Couldn't parse trace event, matched = %d/5",
+				   nr_matched);
+		break;
+	}
+	default:
+		return -1;
+	}
+
+	if (family) {
+		if (!strcmp(family, "AF_INET")) {
+			out->family = AF_INET;
+		} else if (!strcmp(family, "AF_INET6")) {
+			out->family = AF_INET6;
+		} else {
+			test_print("Couldn't parse trace event family %s", family);
+			ret = -EINVAL;
+			goto out_free;
+		}
+	}
+
+	if (event_has_flags(event)) {
+		out->fin = (fin == 'F');
+		out->syn = (syn == 'S');
+		out->rst = (rst == 'R');
+		out->psh = (psh == 'P');
+		out->ack = (ack == '.');
+
+		if ((fin != 'F' && fin != ' ') ||
+		    (syn != 'S' && syn != ' ') ||
+		    (rst != 'R' && rst != ' ') ||
+		    (psh != 'P' && psh != ' ') ||
+		    (ack != '.' && ack != ' ')) {
+			test_print("Couldn't parse trace event flags %c%c%c%c%c",
+				   fin, syn, rst, psh, ack);
+			ret = -EINVAL;
+			goto out_free;
+		}
+	}
+
+	if (src && tracer_scan_address(out->family, src, &out->src, &out->src_port)) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	if (dst && tracer_scan_address(out->family, dst, &out->dst, &out->dst_port)) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	if (netns_cookie != ns_cookie1 && netns_cookie != ns_cookie2) {
+		test_print("Net namespace filter for trace event didn't work: %" PRIu64 " != %" PRIu64 " OR %" PRIu64,
+			   netns_cookie, ns_cookie1, ns_cookie2);
+		ret = -EINVAL;
+	}
+
+out_free:
+	free(src);
+	free(dst);
+	free(family);
+	return ret;
+}
+
+static enum ftracer_op aolib_tracer_process_event(const char *line)
+{
+	int event_type = check_event_type(line);
+	struct trace_point tmp = {};
+
+	if (event_type < 0)
+		return FTRACER_LINE_PRESERVE;
+
+	if (tracer_scan_event(line, event_type, &tmp))
+		return FTRACER_LINE_PRESERVE;
+
+	return lookup_expected_event(event_type, &tmp) ?
+		FTRACER_LINE_DISCARD : FTRACER_LINE_PRESERVE;
+}
+
+static void dump_trace_event(struct expected_trace_point *e)
+{
+	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+
+	if (!inet_ntop(e->family, &e->src, src, INET6_ADDRSTRLEN))
+		test_error("inet_ntop()");
+	if (!inet_ntop(e->family, &e->dst, dst, INET6_ADDRSTRLEN))
+		test_error("inet_ntop()");
+	test_print("trace event filter %s [%s:%d => %s:%d, L3index %d, flags: %s%s%s%s%s, keyid: %d, rnext: %d, maclen: %d, sne: %d] = %zu",
+		   trace_event_names[e->type],
+		   src, e->src_port, dst, e->dst_port, e->L3index,
+		   e->fin ? "F" : "", e->syn ? "S" : "", e->rst ? "R" : "",
+		   e->psh ? "P" : "", e->ack ? "." : "",
+		   e->keyid, e->rnext, e->maclen, e->sne, e->matched);
+}
+
+static void print_match_stats(bool unexpected_events)
+{
+	size_t matches_per_type[__MAX_TRACE_EVENTS] = {};
+	bool expected_but_none = false;
+	size_t i, total_matched = 0;
+	char *stat_line = NULL;
+
+	for (i = 0; i < exp_tps_nr; i++) {
+		struct expected_trace_point *e = &exp_tps[i];
+
+		total_matched += e->matched;
+		matches_per_type[e->type] += e->matched;
+		if (!e->matched)
+			expected_but_none = true;
+	}
+	for (i = 0; i < __MAX_TRACE_EVENTS; i++) {
+		if (!matches_per_type[i])
+			continue;
+		stat_line = test_sprintf("%s%s[%zu] ", stat_line ?: "",
+					 trace_event_names[i],
+					 matches_per_type[i]);
+		if (!stat_line)
+			test_error("test_sprintf()");
+	}
+
+	if (unexpected_events || expected_but_none) {
+		for (i = 0; i < exp_tps_nr; i++)
+			dump_trace_event(&exp_tps[i]);
+	}
+
+	if (unexpected_events)
+		return;
+
+	if (expected_but_none)
+		test_fail("Some trace events were expected, but didn't occur");
+	else if (total_matched)
+		test_ok("Trace events matched expectations: %zu %s",
+			total_matched, stat_line);
+	else
+		test_ok("No unexpected trace events during the test run");
+}
+
+#define dump_events(fmt, ...)                           \
+	__test_print(__test_msg, fmt, ##__VA_ARGS__)
+static void check_free_events(struct test_ftracer *tracer)
+{
+	const char **lines;
+	size_t nr;
+
+	if (!kernel_config_has(KCONFIG_FTRACE)) {
+		test_skip("kernel config doesn't have ftrace - no checks");
+		return;
+	}
+
+	nr = tracer_get_savedlines_nr(tracer);
+	lines = tracer_get_savedlines(tracer);
+	print_match_stats(!!nr);
+	if (!nr)
+		return;
+
+	errno = 0;
+	test_xfail("Trace events [%zu] were not expected:", nr);
+	while (nr)
+		dump_events("\t%s", lines[--nr]);
+}
+
+static int setup_tcp_trace_events(struct test_ftracer *tracer)
+{
+	char *filter;
+	size_t i;
+	int ret;
+
+	filter = test_sprintf("net_cookie == %zu || net_cookie == %zu",
+			      ns_cookie1, ns_cookie2);
+	if (!filter)
+		return -ENOMEM;
+
+	for (i = 0; i < __MAX_TRACE_EVENTS; i++) {
+		char *event_name = test_sprintf("tcp/%s", trace_event_names[i]);
+
+		if (!event_name) {
+			ret = -ENOMEM;
+			break;
+		}
+		ret = setup_trace_event(tracer, event_name, filter);
+		free(event_name);
+		if (ret)
+			break;
+	}
+
+	free(filter);
+	return ret;
+}
+
+static void aolib_tracer_destroy(struct test_ftracer *tracer)
+{
+	check_free_events(tracer);
+	free_expected_events();
+}
+
+static bool aolib_tracer_expecting_more(void)
+{
+	size_t i;
+
+	for (i = 0; i < exp_tps_nr; i++)
+		if (!exp_tps[i].matched)
+			return true;
+	return false;
+}
+
+int setup_aolib_ftracer(void)
+{
+	struct test_ftracer *f;
+
+	f = create_ftracer("aolib", aolib_tracer_process_event,
+			   aolib_tracer_destroy, aolib_tracer_expecting_more,
+			   DEFAULT_FTRACE_BUFFER_KB, DEFAULT_TRACER_LINES_ARR);
+	if (!f)
+		return -1;
+
+	return setup_tcp_trace_events(f);
+}
diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c
new file mode 100644
index 000000000000..e4d0b173bc94
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include "../../../../../include/linux/kernel.h"
+#include "aolib.h"
+
+static char ftrace_path[] = "ksft-ftrace-XXXXXX";
+static bool ftrace_mounted;
+uint64_t ns_cookie1, ns_cookie2;
+
+struct test_ftracer {
+	pthread_t tracer_thread;
+	int	error;
+	char	*instance_path;
+	FILE	*trace_pipe;
+
+	enum ftracer_op (*process_line)(const char *line);
+	void (*destructor)(struct test_ftracer *tracer);
+	bool (*expecting_more)(void);
+
+	char	**saved_lines;
+	size_t	saved_lines_size;
+	size_t	next_line_ind;
+
+	pthread_cond_t met_all_expected;
+	pthread_mutex_t met_all_expected_lock;
+
+	struct test_ftracer *next;
+};
+
+static struct test_ftracer *ftracers;
+static pthread_mutex_t ftracers_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static int mount_ftrace(void)
+{
+	if (!mkdtemp(ftrace_path))
+		test_error("Can't create temp dir");
+
+	if (mount("tracefs", ftrace_path, "tracefs", 0, "rw"))
+		return -errno;
+
+	ftrace_mounted = true;
+
+	return 0;
+}
+
+static void unmount_ftrace(void)
+{
+	if (ftrace_mounted && umount(ftrace_path))
+		test_print("Failed on cleanup: can't unmount tracefs: %m");
+
+	if (rmdir(ftrace_path))
+		test_error("Failed on cleanup: can't remove ftrace dir %s",
+			   ftrace_path);
+}
+
+struct opts_list_t {
+	char *opt_name;
+	struct opts_list_t *next;
+};
+
+static int disable_trace_options(const char *ftrace_path)
+{
+	struct opts_list_t *opts_list = NULL;
+	char *fopts, *line = NULL;
+	size_t buf_len = 0;
+	ssize_t line_len;
+	int ret = 0;
+	FILE *opts;
+
+	fopts = test_sprintf("%s/%s", ftrace_path, "trace_options");
+	if (!fopts)
+		return -ENOMEM;
+
+	opts = fopen(fopts, "r+");
+	if (!opts) {
+		ret = -errno;
+		goto out_free;
+	}
+
+	while ((line_len = getline(&line, &buf_len, opts)) != -1) {
+		struct opts_list_t *tmp;
+
+		if (!strncmp(line, "no", 2))
+			continue;
+
+		tmp = malloc(sizeof(*tmp));
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto out_free_opts_list;
+		}
+		tmp->next = opts_list;
+		tmp->opt_name = test_sprintf("no%s", line);
+		if (!tmp->opt_name) {
+			ret = -ENOMEM;
+			free(tmp);
+			goto out_free_opts_list;
+		}
+		opts_list = tmp;
+	}
+
+	while (opts_list) {
+		struct opts_list_t *tmp = opts_list;
+
+		fseek(opts, 0, SEEK_SET);
+		fwrite(tmp->opt_name, 1, strlen(tmp->opt_name), opts);
+
+		opts_list = opts_list->next;
+		free(tmp->opt_name);
+		free(tmp);
+	}
+
+out_free_opts_list:
+	while (opts_list) {
+		struct opts_list_t *tmp = opts_list;
+
+		opts_list = opts_list->next;
+		free(tmp->opt_name);
+		free(tmp);
+	}
+	free(line);
+	fclose(opts);
+out_free:
+	free(fopts);
+	return ret;
+}
+
+static int setup_buffer_size(const char *ftrace_path, size_t sz)
+{
+	char *fbuf_size = test_sprintf("%s/buffer_size_kb", ftrace_path);
+	int ret;
+
+	if (!fbuf_size)
+		return -1;
+
+	ret = test_echo(fbuf_size, 0, "%zu", sz);
+	free(fbuf_size);
+	return ret;
+}
+
+static int setup_ftrace_instance(struct test_ftracer *tracer, const char *name)
+{
+	char *tmp;
+
+	tmp = test_sprintf("%s/instances/ksft-%s-XXXXXX", ftrace_path, name);
+	if (!tmp)
+		return -ENOMEM;
+
+	tracer->instance_path = mkdtemp(tmp);
+	if (!tracer->instance_path) {
+		free(tmp);
+		return -errno;
+	}
+
+	return 0;
+}
+
+static void remove_ftrace_instance(struct test_ftracer *tracer)
+{
+	if (rmdir(tracer->instance_path))
+		test_print("Failed on cleanup: can't remove ftrace instance %s",
+			   tracer->instance_path);
+	free(tracer->instance_path);
+}
+
+static void tracer_cleanup(void *arg)
+{
+	struct test_ftracer *tracer = arg;
+
+	fclose(tracer->trace_pipe);
+}
+
+static void tracer_set_error(struct test_ftracer *tracer, int error)
+{
+	if (!tracer->error)
+		tracer->error = error;
+}
+
+const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer)
+{
+	return tracer->next_line_ind;
+}
+
+const char **tracer_get_savedlines(struct test_ftracer *tracer)
+{
+	return (const char **)tracer->saved_lines;
+}
+
+static void *tracer_thread_func(void *arg)
+{
+	struct test_ftracer *tracer = arg;
+
+	pthread_cleanup_push(tracer_cleanup, arg);
+
+	while (tracer->next_line_ind < tracer->saved_lines_size) {
+		char **lp = &tracer->saved_lines[tracer->next_line_ind];
+		enum ftracer_op op;
+		size_t buf_len = 0;
+		ssize_t line_len;
+
+		line_len = getline(lp, &buf_len, tracer->trace_pipe);
+		if (line_len == -1)
+			break;
+
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+		op = tracer->process_line(*lp);
+		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+
+		if (tracer->expecting_more) {
+			pthread_mutex_lock(&tracer->met_all_expected_lock);
+			if (!tracer->expecting_more())
+				pthread_cond_signal(&tracer->met_all_expected);
+			pthread_mutex_unlock(&tracer->met_all_expected_lock);
+		}
+
+		if (op == FTRACER_LINE_DISCARD)
+			continue;
+		if (op == FTRACER_EXIT)
+			break;
+		if (op != FTRACER_LINE_PRESERVE)
+			test_error("unexpected tracer command %d", op);
+
+		tracer->next_line_ind++;
+		buf_len = 0;
+	}
+	test_print("too many lines in ftracer buffer %zu, exiting tracer",
+		   tracer->next_line_ind);
+
+	pthread_cleanup_pop(1);
+	return NULL;
+}
+
+static int setup_trace_thread(struct test_ftracer *tracer)
+{
+	int ret = 0;
+	char *path;
+
+	path = test_sprintf("%s/trace_pipe", tracer->instance_path);
+	if (!path)
+		return -ENOMEM;
+
+	tracer->trace_pipe = fopen(path, "r");
+	if (!tracer->trace_pipe) {
+		ret = -errno;
+		goto out_free;
+	}
+
+	if (pthread_create(&tracer->tracer_thread, NULL,
+			   tracer_thread_func, (void *)tracer)) {
+		ret = -errno;
+		fclose(tracer->trace_pipe);
+	}
+
+out_free:
+	free(path);
+	return ret;
+}
+
+static void stop_trace_thread(struct test_ftracer *tracer)
+{
+	void *res;
+
+	if (pthread_cancel(tracer->tracer_thread)) {
+		test_print("Can't stop tracer pthread: %m");
+		tracer_set_error(tracer, -errno);
+	}
+	if (pthread_join(tracer->tracer_thread, &res)) {
+		test_print("Can't join tracer pthread: %m");
+		tracer_set_error(tracer, -errno);
+	}
+	if (res != PTHREAD_CANCELED) {
+		test_print("Tracer thread wasn't canceled");
+		tracer_set_error(tracer, -errno);
+	}
+	if (tracer->error)
+		test_fail("tracer errored by %s", strerror(tracer->error));
+}
+
+static void final_wait_for_events(struct test_ftracer *tracer,
+				  unsigned timeout_sec)
+{
+	struct timespec timeout;
+	struct timeval now;
+	int ret = 0;
+
+	if (!tracer->expecting_more)
+		return;
+
+	pthread_mutex_lock(&tracer->met_all_expected_lock);
+	gettimeofday(&now, NULL);
+	timeout.tv_sec = now.tv_sec + timeout_sec;
+	timeout.tv_nsec = now.tv_usec * 1000;
+
+	while (tracer->expecting_more() && ret != ETIMEDOUT)
+		ret = pthread_cond_timedwait(&tracer->met_all_expected,
+				&tracer->met_all_expected_lock, &timeout);
+	pthread_mutex_unlock(&tracer->met_all_expected_lock);
+}
+
+int setup_trace_event(struct test_ftracer *tracer,
+		      const char *event, const char *filter)
+{
+	char *enable_path, *filter_path, *instance = tracer->instance_path;
+	int ret;
+
+	enable_path = test_sprintf("%s/events/%s/enable", instance, event);
+	if (!enable_path)
+		return -ENOMEM;
+
+	filter_path = test_sprintf("%s/events/%s/filter", instance, event);
+	if (!filter_path) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	ret = test_echo(filter_path, 0, "%s", filter);
+	if (!ret)
+		ret = test_echo(enable_path, 0, "1");
+
+out_free:
+	free(filter_path);
+	free(enable_path);
+	return ret;
+}
+
+struct test_ftracer *create_ftracer(const char *name,
+				    enum ftracer_op (*process_line)(const char *line),
+				    void (*destructor)(struct test_ftracer *tracer),
+				    bool (*expecting_more)(void),
+				    size_t lines_buf_sz, size_t buffer_size_kb)
+{
+	struct test_ftracer *tracer;
+	int err;
+
+	/* XXX: separate __create_ftracer() helper and do here
+	 * if (!kernel_config_has(KCONFIG_FTRACE))
+	 *	return NULL;
+	 */
+
+	tracer = malloc(sizeof(*tracer));
+	if (!tracer) {
+		test_print("malloc()");
+		return NULL;
+	}
+
+	memset(tracer, 0, sizeof(*tracer));
+
+	err = setup_ftrace_instance(tracer, name);
+	if (err) {
+		test_print("setup_ftrace_instance(): %d", err);
+		goto err_free;
+	}
+
+	err = disable_trace_options(tracer->instance_path);
+	if (err) {
+		test_print("disable_trace_options(): %d", err);
+		goto err_remove;
+	}
+
+	err = setup_buffer_size(tracer->instance_path, buffer_size_kb);
+	if (err) {
+		test_print("disable_trace_options(): %d", err);
+		goto err_remove;
+	}
+
+	tracer->saved_lines = calloc(lines_buf_sz, sizeof(tracer->saved_lines[0]));
+	if (!tracer->saved_lines) {
+		test_print("calloc()");
+		goto err_remove;
+	}
+	tracer->saved_lines_size = lines_buf_sz;
+
+	tracer->process_line	= process_line;
+	tracer->destructor	= destructor;
+	tracer->expecting_more	= expecting_more;
+
+	err = pthread_cond_init(&tracer->met_all_expected, NULL);
+	if (err) {
+		test_print("pthread_cond_init(): %d", err);
+		goto err_free_lines;
+	}
+
+	err = pthread_mutex_init(&tracer->met_all_expected_lock, NULL);
+	if (err) {
+		test_print("pthread_mutex_init(): %d", err);
+		goto err_cond_destroy;
+	}
+
+	err = setup_trace_thread(tracer);
+	if (err) {
+		test_print("setup_trace_thread(): %d", err);
+		goto err_mutex_destroy;
+	}
+
+	pthread_mutex_lock(&ftracers_lock);
+	tracer->next = ftracers;
+	ftracers = tracer;
+	pthread_mutex_unlock(&ftracers_lock);
+
+	return tracer;
+
+err_mutex_destroy:
+	pthread_mutex_destroy(&tracer->met_all_expected_lock);
+err_cond_destroy:
+	pthread_cond_destroy(&tracer->met_all_expected);
+err_free_lines:
+	free(tracer->saved_lines);
+err_remove:
+	remove_ftrace_instance(tracer);
+err_free:
+	free(tracer);
+	return NULL;
+}
+
+static void __destroy_ftracer(struct test_ftracer *tracer)
+{
+	size_t i;
+
+	final_wait_for_events(tracer, TEST_TIMEOUT_SEC);
+	stop_trace_thread(tracer);
+	remove_ftrace_instance(tracer);
+	if (tracer->destructor)
+		tracer->destructor(tracer);
+	for (i = 0; i < tracer->saved_lines_size; i++)
+		free(tracer->saved_lines[i]);
+	pthread_cond_destroy(&tracer->met_all_expected);
+	pthread_mutex_destroy(&tracer->met_all_expected_lock);
+	free(tracer);
+}
+
+void destroy_ftracer(struct test_ftracer *tracer)
+{
+	pthread_mutex_lock(&ftracers_lock);
+	if (tracer == ftracers) {
+		ftracers = tracer->next;
+	} else {
+		struct test_ftracer *f = ftracers;
+
+		while (f->next != tracer) {
+			if (!f->next)
+				test_error("tracers list corruption or double free %p", tracer);
+			f = f->next;
+		}
+		f->next = tracer->next;
+	}
+	tracer->next = NULL;
+	pthread_mutex_unlock(&ftracers_lock);
+	__destroy_ftracer(tracer);
+}
+
+static void destroy_all_ftracers(void)
+{
+	struct test_ftracer *f;
+
+	pthread_mutex_lock(&ftracers_lock);
+	f = ftracers;
+	ftracers = NULL;
+	pthread_mutex_unlock(&ftracers_lock);
+
+	while (f) {
+		struct test_ftracer *n = f->next;
+
+		f->next = NULL;
+		__destroy_ftracer(f);
+		f = n;
+	}
+}
+
+static void test_unset_tracing(void)
+{
+	destroy_all_ftracers();
+	unmount_ftrace();
+}
+
+int test_setup_tracing(void)
+{
+	/*
+	 * Just a basic protection - this should be called only once from
+	 * lib/kconfig. Not thread safe, which is fine as it's early, before
+	 * threads are created.
+	 */
+	static int already_set;
+	int err;
+
+	if (already_set)
+		return -1;
+
+	/* Needs net-namespace cookies for filters */
+	if (ns_cookie1 == ns_cookie2) {
+		test_print("net-namespace cookies: %" PRIu64 " == %" PRIu64 ", can't set up tracing",
+			   ns_cookie1, ns_cookie2);
+		return -1;
+	}
+
+	already_set = 1;
+
+	test_add_destructor(test_unset_tracing);
+
+	err = mount_ftrace();
+	if (err) {
+		test_print("failed to mount_ftrace(): %d", err);
+		return err;
+	}
+
+	return setup_aolib_ftracer();
+}
+
+static int get_ns_cookie(int nsfd, uint64_t *out)
+{
+	int old_ns = switch_save_ns(nsfd);
+	socklen_t size = sizeof(*out);
+	int sk;
+
+	sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sk < 0) {
+		test_print("socket(): %m");
+		return -errno;
+	}
+
+	if (getsockopt(sk, SOL_SOCKET, SO_NETNS_COOKIE, out, &size)) {
+		test_print("getsockopt(SO_NETNS_COOKIE): %m");
+		close(sk);
+		return -errno;
+	}
+
+	close(sk);
+	switch_close_ns(old_ns);
+	return 0;
+}
+
+void test_init_ftrace(int nsfd1, int nsfd2)
+{
+	get_ns_cookie(nsfd1, &ns_cookie1);
+	get_ns_cookie(nsfd2, &ns_cookie2);
+	/* Populate kernel config state */
+	kernel_config_has(KCONFIG_FTRACE);
+}
diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c
index f279ffc3843b..9f1c175846f8 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c
@@ -6,7 +6,7 @@
 #include "aolib.h"
 
 struct kconfig_t {
-	int _errno;		/* the returned error if not supported */
+	int _error;		/* negative errno if not supported */
 	int (*check_kconfig)(int *error);
 };
 
@@ -62,7 +62,7 @@ static int has_tcp_ao(int *err)
 	memcpy(&tmp.addr, &addr, sizeof(addr));
 	*err = 0;
 	if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tmp, sizeof(tmp)) < 0) {
-		*err = errno;
+		*err = -errno;
 		if (errno != ENOPROTOOPT)
 			ret = -errno;
 	}
@@ -87,7 +87,7 @@ static int has_tcp_md5(int *err)
 	 */
 	*err = 0;
 	if (test_set_md5(sk, addr_any, 0, -1, DEFAULT_TEST_PASSWORD)) {
-		*err = errno;
+		*err = -errno;
 		if (errno != ENOPROTOOPT && errno == ENOMEM) {
 			test_print("setsockopt(TCP_MD5SIG_EXT): %m");
 			ret = -errno;
@@ -116,13 +116,21 @@ static int has_vrfs(int *err)
 	return ret;
 }
 
+static int has_ftrace(int *err)
+{
+	*err = test_setup_tracing();
+	return 0;
+}
+
+#define KCONFIG_UNKNOWN			1
 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER;
 static struct kconfig_t kconfig[__KCONFIG_LAST__] = {
-	{ -1, has_net_ns },
-	{ -1, has_veth },
-	{ -1, has_tcp_ao },
-	{ -1, has_tcp_md5 },
-	{ -1, has_vrfs },
+	{ KCONFIG_UNKNOWN, has_net_ns },
+	{ KCONFIG_UNKNOWN, has_veth },
+	{ KCONFIG_UNKNOWN, has_tcp_ao },
+	{ KCONFIG_UNKNOWN, has_tcp_md5 },
+	{ KCONFIG_UNKNOWN, has_vrfs },
+	{ KCONFIG_UNKNOWN, has_ftrace },
 };
 
 const char *tests_skip_reason[__KCONFIG_LAST__] = {
@@ -131,6 +139,7 @@ const char *tests_skip_reason[__KCONFIG_LAST__] = {
 	"Tests require TCP-AO support (CONFIG_TCP_AO)",
 	"setsockopt(TCP_MD5SIG_EXT) is not supported (CONFIG_TCP_MD5)",
 	"VRFs are not supported (CONFIG_NET_VRF)",
+	"Ftrace points are not supported (CONFIG_TRACEPOINTS)",
 };
 
 bool kernel_config_has(enum test_needs_kconfig k)
@@ -138,11 +147,11 @@ bool kernel_config_has(enum test_needs_kconfig k)
 	bool ret;
 
 	pthread_mutex_lock(&kconfig_lock);
-	if (kconfig[k]._errno == -1) {
-		if (kconfig[k].check_kconfig(&kconfig[k]._errno))
+	if (kconfig[k]._error == KCONFIG_UNKNOWN) {
+		if (kconfig[k].check_kconfig(&kconfig[k]._error))
 			test_error("Failed to initialize kconfig %u", k);
 	}
-	ret = kconfig[k]._errno == 0;
+	ret = kconfig[k]._error == 0;
 	pthread_mutex_unlock(&kconfig_lock);
 	return ret;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c
index e408b9243b2c..a27cc03c9fbd 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/setup.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c
@@ -111,7 +111,7 @@ static void sig_int(int signo)
 
 int open_netns(void)
 {
-	const char *netns_path = "/proc/self/ns/net";
+	const char *netns_path = "/proc/thread-self/ns/net";
 	int fd;
 
 	fd = open(netns_path, O_RDONLY);
@@ -142,6 +142,13 @@ int switch_save_ns(int new_ns)
 	return ret;
 }
 
+void switch_close_ns(int fd)
+{
+	if (setns(fd, CLONE_NEWNET))
+		test_error("setns()");
+	close(fd);
+}
+
 static int nsfd_outside	= -1;
 static int nsfd_parent	= -1;
 static int nsfd_child	= -1;
@@ -243,9 +250,9 @@ void __test_init(unsigned int ntests, int family, unsigned int prefix,
 	test_print("rand seed %u", (unsigned int)seed);
 	srand(seed);
 
-
 	ksft_print_header();
 	init_namespaces();
+	test_init_ftrace(nsfd_parent, nsfd_child);
 
 	if (add_veth(veth_name, nsfd_parent, nsfd_child))
 		test_error("Failed to add veth");
@@ -296,7 +303,7 @@ static bool is_optmem_namespaced(void)
 		int old_ns = switch_save_ns(nsfd_child);
 
 		optmem_ns = !access(optmem_file, F_OK);
-		switch_ns(old_ns);
+		switch_close_ns(old_ns);
 	}
 	return !!optmem_ns;
 }
@@ -317,7 +324,7 @@ size_t test_get_optmem(void)
 		test_error("can't read from %s", optmem_file);
 	fclose(foptmem);
 	if (!is_optmem_namespaced())
-		switch_ns(old_ns);
+		switch_close_ns(old_ns);
 	return ret;
 }
 
@@ -339,7 +346,7 @@ static void __test_set_optmem(size_t new, size_t *old)
 		test_error("can't write %zu to %s", new, optmem_file);
 	fclose(foptmem);
 	if (!is_optmem_namespaced())
-		switch_ns(old_ns);
+		switch_close_ns(old_ns);
 }
 
 static void test_revert_optmem(void)
diff --git a/tools/testing/selftests/net/tcp_ao/lib/sock.c b/tools/testing/selftests/net/tcp_ao/lib/sock.c
index 15aeb0963058..ef8e9031d47a 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/sock.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/sock.c
@@ -34,10 +34,8 @@ int __test_listen_socket(int backlog, void *addr, size_t addr_sz)
 	return sk;
 }
 
-int test_wait_fd(int sk, time_t sec, bool write)
+static int __test_wait_fd(int sk, struct timeval *tv, bool write)
 {
-	struct timeval tv = { .tv_sec = sec };
-	struct timeval *ptv = NULL;
 	fd_set fds, efds;
 	int ret;
 	socklen_t slen = sizeof(ret);
@@ -47,14 +45,11 @@ int test_wait_fd(int sk, time_t sec, bool write)
 	FD_ZERO(&efds);
 	FD_SET(sk, &efds);
 
-	if (sec)
-		ptv = &tv;
-
 	errno = 0;
 	if (write)
-		ret = select(sk + 1, NULL, &fds, &efds, ptv);
+		ret = select(sk + 1, NULL, &fds, &efds, tv);
 	else
-		ret = select(sk + 1, &fds, NULL, &efds, ptv);
+		ret = select(sk + 1, &fds, NULL, &efds, tv);
 	if (ret < 0)
 		return -errno;
 	if (ret == 0) {
@@ -69,8 +64,54 @@ int test_wait_fd(int sk, time_t sec, bool write)
 	return 0;
 }
 
+int test_wait_fd(int sk, time_t sec, bool write)
+{
+	struct timeval tv = { .tv_sec = sec, };
+
+	return __test_wait_fd(sk, sec ? &tv : NULL, write);
+}
+
+static bool __skpair_poll_should_stop(int sk, struct tcp_counters *c,
+				      test_cnt condition)
+{
+	struct tcp_counters c2;
+	test_cnt diff;
+
+	if (test_get_tcp_counters(sk, &c2))
+		test_error("test_get_tcp_counters()");
+
+	diff = test_cmp_counters(c, &c2);
+	test_tcp_counters_free(&c2);
+	return (diff & condition) == condition;
+}
+
+/* How often wake up and check netns counters & paired (*err) */
+#define POLL_USEC	150
+static int __test_skpair_poll(int sk, bool write, uint64_t timeout,
+			      struct tcp_counters *c, test_cnt cond,
+			      volatile int *err)
+{
+	uint64_t t;
+
+	for (t = 0; t <= timeout * 1000000; t += POLL_USEC) {
+		struct timeval tv = { .tv_usec = POLL_USEC, };
+		int ret;
+
+		ret = __test_wait_fd(sk, &tv, write);
+		if (ret != -ETIMEDOUT)
+			return ret;
+		if (c && cond && __skpair_poll_should_stop(sk, c, cond))
+			break;
+		if (err && *err)
+			return *err;
+	}
+	if (err)
+		*err = -ETIMEDOUT;
+	return -ETIMEDOUT;
+}
+
 int __test_connect_socket(int sk, const char *device,
-			  void *addr, size_t addr_sz, time_t timeout)
+			  void *addr, size_t addr_sz, bool async)
 {
 	long flags;
 	int err;
@@ -82,15 +123,6 @@ int __test_connect_socket(int sk, const char *device,
 			test_error("setsockopt(SO_BINDTODEVICE, %s)", device);
 	}
 
-	if (!timeout) {
-		err = connect(sk, addr, addr_sz);
-		if (err) {
-			err = -errno;
-			goto out;
-		}
-		return 0;
-	}
-
 	flags = fcntl(sk, F_GETFL);
 	if ((flags < 0) || (fcntl(sk, F_SETFL, flags | O_NONBLOCK) < 0))
 		test_error("fcntl()");
@@ -100,9 +132,9 @@ int __test_connect_socket(int sk, const char *device,
 			err = -errno;
 			goto out;
 		}
-		if (timeout < 0)
+		if (async)
 			return sk;
-		err = test_wait_fd(sk, timeout, 1);
+		err = test_wait_fd(sk, TEST_TIMEOUT_SEC, 1);
 		if (err)
 			goto out;
 	}
@@ -113,6 +145,45 @@ out:
 	return err;
 }
 
+int test_skpair_wait_poll(int sk, bool write,
+			  test_cnt cond, volatile int *err)
+{
+	struct tcp_counters c;
+	int ret;
+
+	*err = 0;
+	if (test_get_tcp_counters(sk, &c))
+		test_error("test_get_tcp_counters()");
+	synchronize_threads(); /* 1: init skpair & read nscounters */
+
+	ret = __test_skpair_poll(sk, write, TEST_TIMEOUT_SEC, &c, cond, err);
+	test_tcp_counters_free(&c);
+	return ret;
+}
+
+int _test_skpair_connect_poll(int sk, const char *device,
+			      void *addr, size_t addr_sz,
+			      test_cnt condition, volatile int *err)
+{
+	struct tcp_counters c;
+	int ret;
+
+	*err = 0;
+	if (test_get_tcp_counters(sk, &c))
+		test_error("test_get_tcp_counters()");
+	synchronize_threads(); /* 1: init skpair & read nscounters */
+	ret = __test_connect_socket(sk, device, addr, addr_sz, true);
+	if (ret < 0) {
+		test_tcp_counters_free(&c);
+		return (*err = ret);
+	}
+	ret = __test_skpair_poll(sk, 1, TEST_TIMEOUT_SEC, &c, condition, err);
+	if (ret < 0)
+		close(sk);
+	test_tcp_counters_free(&c);
+	return ret;
+}
+
 int __test_set_md5(int sk, void *addr, size_t addr_sz, uint8_t prefix,
 		   int vrf, const char *password)
 {
@@ -333,12 +404,12 @@ do {									\
 	return 0;
 }
 
-int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out)
+int test_get_tcp_counters(int sk, struct tcp_counters *out)
 {
 	struct tcp_ao_getsockopt *key_dump;
 	socklen_t key_dump_sz = sizeof(*key_dump);
 	struct tcp_ao_info_opt info = {};
-	bool c1, c2, c3, c4, c5;
+	bool c1, c2, c3, c4, c5, c6, c7, c8;
 	struct netstat *ns;
 	int err, nr_keys;
 
@@ -346,25 +417,30 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out)
 
 	/* per-netns */
 	ns = netstat_read();
-	out->netns_ao_good = netstat_get(ns, "TCPAOGood", &c1);
-	out->netns_ao_bad = netstat_get(ns, "TCPAOBad", &c2);
-	out->netns_ao_key_not_found = netstat_get(ns, "TCPAOKeyNotFound", &c3);
-	out->netns_ao_required = netstat_get(ns, "TCPAORequired", &c4);
-	out->netns_ao_dropped_icmp = netstat_get(ns, "TCPAODroppedIcmps", &c5);
+	out->ao.netns_ao_good = netstat_get(ns, "TCPAOGood", &c1);
+	out->ao.netns_ao_bad = netstat_get(ns, "TCPAOBad", &c2);
+	out->ao.netns_ao_key_not_found = netstat_get(ns, "TCPAOKeyNotFound", &c3);
+	out->ao.netns_ao_required = netstat_get(ns, "TCPAORequired", &c4);
+	out->ao.netns_ao_dropped_icmp = netstat_get(ns, "TCPAODroppedIcmps", &c5);
+	out->netns_md5_notfound = netstat_get(ns, "TCPMD5NotFound", &c6);
+	out->netns_md5_unexpected = netstat_get(ns, "TCPMD5Unexpected", &c7);
+	out->netns_md5_failure = netstat_get(ns, "TCPMD5Failure", &c8);
 	netstat_free(ns);
-	if (c1 || c2 || c3 || c4 || c5)
+	if (c1 || c2 || c3 || c4 || c5 || c6 || c7 || c8)
 		return -EOPNOTSUPP;
 
 	err = test_get_ao_info(sk, &info);
+	if (err == -ENOENT)
+		return 0;
 	if (err)
 		return err;
 
 	/* per-socket */
-	out->ao_info_pkt_good		= info.pkt_good;
-	out->ao_info_pkt_bad		= info.pkt_bad;
-	out->ao_info_pkt_key_not_found	= info.pkt_key_not_found;
-	out->ao_info_pkt_ao_required	= info.pkt_ao_required;
-	out->ao_info_pkt_dropped_icmp	= info.pkt_dropped_icmp;
+	out->ao.ao_info_pkt_good = info.pkt_good;
+	out->ao.ao_info_pkt_bad = info.pkt_bad;
+	out->ao.ao_info_pkt_key_not_found = info.pkt_key_not_found;
+	out->ao.ao_info_pkt_ao_required = info.pkt_ao_required;
+	out->ao.ao_info_pkt_dropped_icmp = info.pkt_dropped_icmp;
 
 	/* per-key */
 	nr_keys = test_get_ao_keys_nr(sk);
@@ -372,14 +448,13 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out)
 		return nr_keys;
 	if (nr_keys == 0)
 		test_error("test_get_ao_keys_nr() == 0");
-	out->nr_keys = (size_t)nr_keys;
+	out->ao.nr_keys = (size_t)nr_keys;
 	key_dump = calloc(nr_keys, key_dump_sz);
 	if (!key_dump)
 		return -errno;
 
 	key_dump[0].nkeys = nr_keys;
 	key_dump[0].get_all = 1;
-	key_dump[0].get_all = 1;
 	err = getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS,
 			 key_dump, &key_dump_sz);
 	if (err) {
@@ -387,72 +462,84 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out)
 		return -errno;
 	}
 
-	out->key_cnts = calloc(nr_keys, sizeof(out->key_cnts[0]));
-	if (!out->key_cnts) {
+	out->ao.key_cnts = calloc(nr_keys, sizeof(out->ao.key_cnts[0]));
+	if (!out->ao.key_cnts) {
 		free(key_dump);
 		return -errno;
 	}
 
 	while (nr_keys--) {
-		out->key_cnts[nr_keys].sndid = key_dump[nr_keys].sndid;
-		out->key_cnts[nr_keys].rcvid = key_dump[nr_keys].rcvid;
-		out->key_cnts[nr_keys].pkt_good = key_dump[nr_keys].pkt_good;
-		out->key_cnts[nr_keys].pkt_bad = key_dump[nr_keys].pkt_bad;
+		out->ao.key_cnts[nr_keys].sndid = key_dump[nr_keys].sndid;
+		out->ao.key_cnts[nr_keys].rcvid = key_dump[nr_keys].rcvid;
+		out->ao.key_cnts[nr_keys].pkt_good = key_dump[nr_keys].pkt_good;
+		out->ao.key_cnts[nr_keys].pkt_bad = key_dump[nr_keys].pkt_bad;
 	}
 	free(key_dump);
 
 	return 0;
 }
 
-int __test_tcp_ao_counters_cmp(const char *tst_name,
-			       struct tcp_ao_counters *before,
-			       struct tcp_ao_counters *after,
-			       test_cnt expected)
+test_cnt test_cmp_counters(struct tcp_counters *before,
+			   struct tcp_counters *after)
 {
-#define __cmp_ao(cnt, expecting_inc)					\
+#define __cmp(cnt, e_cnt)						\
+do {									\
+	if (before->cnt > after->cnt)					\
+		test_error("counter " __stringify(cnt) " decreased");	\
+	if (before->cnt != after->cnt)					\
+		ret |= e_cnt;						\
+} while (0)
+
+	test_cnt ret = 0;
+	size_t i;
+
+	if (before->ao.nr_keys != after->ao.nr_keys)
+		test_error("the number of keys has changed");
+
+	_for_each_counter(__cmp);
+
+	i = before->ao.nr_keys;
+	while (i--) {
+		__cmp(ao.key_cnts[i].pkt_good, TEST_CNT_KEY_GOOD);
+		__cmp(ao.key_cnts[i].pkt_bad, TEST_CNT_KEY_BAD);
+	}
+#undef __cmp
+	return ret;
+}
+
+int test_assert_counters_sk(const char *tst_name,
+			    struct tcp_counters *before,
+			    struct tcp_counters *after,
+			    test_cnt expected)
+{
+#define __cmp_ao(cnt, e_cnt)						\
 do {									\
 	if (before->cnt > after->cnt) {					\
 		test_fail("%s: Decreased counter " __stringify(cnt) " %" PRIu64 " > %" PRIu64, \
-			  tst_name ?: "", before->cnt, after->cnt);		\
+			  tst_name ?: "", before->cnt, after->cnt);	\
 		return -1;						\
 	}								\
-	if ((before->cnt != after->cnt) != (expecting_inc)) {		\
+	if ((before->cnt != after->cnt) != !!(expected & e_cnt)) {	\
 		test_fail("%s: Counter " __stringify(cnt) " was %sexpected to increase %" PRIu64 " => %" PRIu64, \
-			  tst_name ?: "", (expecting_inc) ? "" : "not ",	\
+			  tst_name ?: "", (expected & e_cnt) ? "" : "not ",	\
 			  before->cnt, after->cnt);			\
 		return -1;						\
 	}								\
-} while(0)
+} while (0)
 
 	errno = 0;
-	/* per-netns */
-	__cmp_ao(netns_ao_good, !!(expected & TEST_CNT_NS_GOOD));
-	__cmp_ao(netns_ao_bad, !!(expected & TEST_CNT_NS_BAD));
-	__cmp_ao(netns_ao_key_not_found,
-		 !!(expected & TEST_CNT_NS_KEY_NOT_FOUND));
-	__cmp_ao(netns_ao_required, !!(expected & TEST_CNT_NS_AO_REQUIRED));
-	__cmp_ao(netns_ao_dropped_icmp,
-		 !!(expected & TEST_CNT_NS_DROPPED_ICMP));
-	/* per-socket */
-	__cmp_ao(ao_info_pkt_good, !!(expected & TEST_CNT_SOCK_GOOD));
-	__cmp_ao(ao_info_pkt_bad, !!(expected & TEST_CNT_SOCK_BAD));
-	__cmp_ao(ao_info_pkt_key_not_found,
-		 !!(expected & TEST_CNT_SOCK_KEY_NOT_FOUND));
-	__cmp_ao(ao_info_pkt_ao_required, !!(expected & TEST_CNT_SOCK_AO_REQUIRED));
-	__cmp_ao(ao_info_pkt_dropped_icmp,
-		 !!(expected & TEST_CNT_SOCK_DROPPED_ICMP));
+	_for_each_counter(__cmp_ao);
 	return 0;
 #undef __cmp_ao
 }
 
-int test_tcp_ao_key_counters_cmp(const char *tst_name,
-				 struct tcp_ao_counters *before,
-				 struct tcp_ao_counters *after,
-				 test_cnt expected,
-				 int sndid, int rcvid)
+int test_assert_counters_key(const char *tst_name,
+			     struct tcp_ao_counters *before,
+			     struct tcp_ao_counters *after,
+			     test_cnt expected, int sndid, int rcvid)
 {
 	size_t i;
-#define __cmp_ao(i, cnt, expecting_inc)					\
+#define __cmp_ao(i, cnt, e_cnt)					\
 do {									\
 	if (before->key_cnts[i].cnt > after->key_cnts[i].cnt) {		\
 		test_fail("%s: Decreased counter " __stringify(cnt) " %" PRIu64 " > %" PRIu64 " for key %u:%u", \
@@ -462,16 +549,16 @@ do {									\
 			  before->key_cnts[i].rcvid);			\
 		return -1;						\
 	}								\
-	if ((before->key_cnts[i].cnt != after->key_cnts[i].cnt) != (expecting_inc)) {		\
+	if ((before->key_cnts[i].cnt != after->key_cnts[i].cnt) != !!(expected & e_cnt)) {		\
 		test_fail("%s: Counter " __stringify(cnt) " was %sexpected to increase %" PRIu64 " => %" PRIu64 " for key %u:%u", \
-			  tst_name ?: "", (expecting_inc) ? "" : "not ",\
+			  tst_name ?: "", (expected & e_cnt) ? "" : "not ",\
 			  before->key_cnts[i].cnt,			\
 			  after->key_cnts[i].cnt,			\
 			  before->key_cnts[i].sndid,			\
 			  before->key_cnts[i].rcvid);			\
 		return -1;						\
 	}								\
-} while(0)
+} while (0)
 
 	if (before->nr_keys != after->nr_keys) {
 		test_fail("%s: Keys changed on the socket %zu != %zu",
@@ -486,20 +573,22 @@ do {									\
 			continue;
 		if (rcvid >= 0 && before->key_cnts[i].rcvid != rcvid)
 			continue;
-		__cmp_ao(i, pkt_good, !!(expected & TEST_CNT_KEY_GOOD));
-		__cmp_ao(i, pkt_bad, !!(expected & TEST_CNT_KEY_BAD));
+		__cmp_ao(i, pkt_good, TEST_CNT_KEY_GOOD);
+		__cmp_ao(i, pkt_bad, TEST_CNT_KEY_BAD);
 	}
 	return 0;
 #undef __cmp_ao
 }
 
-void test_tcp_ao_counters_free(struct tcp_ao_counters *cnts)
+void test_tcp_counters_free(struct tcp_counters *cnts)
 {
-	free(cnts->key_cnts);
+	free(cnts->ao.key_cnts);
 }
 
 #define TEST_BUF_SIZE 4096
-ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec)
+static ssize_t _test_server_run(int sk, ssize_t quota, struct tcp_counters *c,
+				test_cnt cond, volatile int *err,
+				time_t timeout_sec)
 {
 	ssize_t total = 0;
 
@@ -508,7 +597,7 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec)
 		ssize_t bytes, sent;
 		int ret;
 
-		ret = test_wait_fd(sk, timeout_sec, 0);
+		ret = __test_skpair_poll(sk, 0, timeout_sec, c, cond, err);
 		if (ret)
 			return ret;
 
@@ -519,7 +608,7 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec)
 		if (bytes == 0)
 			break;
 
-		ret = test_wait_fd(sk, timeout_sec, 1);
+		ret = __test_skpair_poll(sk, 1, timeout_sec, c, cond, err);
 		if (ret)
 			return ret;
 
@@ -534,13 +623,41 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec)
 	return total;
 }
 
-ssize_t test_client_loop(int sk, char *buf, size_t buf_sz,
-			 const size_t msg_len, time_t timeout_sec)
+ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec)
+{
+	return _test_server_run(sk, quota, NULL, 0, NULL,
+				timeout_sec ?: TEST_TIMEOUT_SEC);
+}
+
+int test_skpair_server(int sk, ssize_t quota, test_cnt cond, volatile int *err)
+{
+	struct tcp_counters c;
+	ssize_t ret;
+
+	*err = 0;
+	if (test_get_tcp_counters(sk, &c))
+		test_error("test_get_tcp_counters()");
+	synchronize_threads(); /* 1: init skpair & read nscounters */
+
+	ret = _test_server_run(sk, quota, &c, cond, err, TEST_TIMEOUT_SEC);
+	test_tcp_counters_free(&c);
+	return ret;
+}
+
+static ssize_t test_client_loop(int sk, size_t buf_sz, const size_t msg_len,
+				struct tcp_counters *c, test_cnt cond,
+				volatile int *err)
 {
 	char msg[msg_len];
 	int nodelay = 1;
+	char *buf;
 	size_t i;
 
+	buf = alloca(buf_sz);
+	if (!buf)
+		return -ENOMEM;
+	randomize_buffer(buf, buf_sz);
+
 	if (setsockopt(sk, IPPROTO_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay)))
 		test_error("setsockopt(TCP_NODELAY)");
 
@@ -548,7 +665,7 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz,
 		size_t sent, bytes = min(msg_len, buf_sz - i);
 		int ret;
 
-		ret = test_wait_fd(sk, timeout_sec, 1);
+		ret = __test_skpair_poll(sk, 1, TEST_TIMEOUT_SEC, c, cond, err);
 		if (ret)
 			return ret;
 
@@ -562,7 +679,8 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz,
 		do {
 			ssize_t got;
 
-			ret = test_wait_fd(sk, timeout_sec, 0);
+			ret = __test_skpair_poll(sk, 0, TEST_TIMEOUT_SEC,
+						 c, cond, err);
 			if (ret)
 				return ret;
 
@@ -581,15 +699,31 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz,
 	return i;
 }
 
-int test_client_verify(int sk, const size_t msg_len, const size_t nr,
-		       time_t timeout_sec)
+int test_client_verify(int sk, const size_t msg_len, const size_t nr)
 {
 	size_t buf_sz = msg_len * nr;
-	char *buf = alloca(buf_sz);
 	ssize_t ret;
 
-	randomize_buffer(buf, buf_sz);
-	ret = test_client_loop(sk, buf, buf_sz, msg_len, timeout_sec);
+	ret = test_client_loop(sk, buf_sz, msg_len, NULL, 0, NULL);
+	if (ret < 0)
+		return (int)ret;
+	return ret != buf_sz ? -1 : 0;
+}
+
+int test_skpair_client(int sk, const size_t msg_len, const size_t nr,
+		       test_cnt cond, volatile int *err)
+{
+	struct tcp_counters c;
+	size_t buf_sz = msg_len * nr;
+	ssize_t ret;
+
+	*err = 0;
+	if (test_get_tcp_counters(sk, &c))
+		test_error("test_get_tcp_counters()");
+	synchronize_threads(); /* 1: init skpair & read nscounters */
+
+	ret = test_client_loop(sk, buf_sz, msg_len, &c, cond, err);
+	test_tcp_counters_free(&c);
 	if (ret < 0)
 		return (int)ret;
 	return ret != buf_sz ? -1 : 0;
diff --git a/tools/testing/selftests/net/tcp_ao/lib/utils.c b/tools/testing/selftests/net/tcp_ao/lib/utils.c
index 372daca525f5..bdf5522c9213 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/utils.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/utils.c
@@ -21,6 +21,32 @@ void randomize_buffer(void *buf, size_t buflen)
 	}
 }
 
+__printf(3, 4) int test_echo(const char *fname, bool append,
+			     const char *fmt, ...)
+{
+	size_t len, written;
+	va_list vargs;
+	char *msg;
+	FILE *f;
+
+	f = fopen(fname, append ? "a" : "w");
+	if (!f)
+		return -errno;
+
+	va_start(vargs, fmt);
+	msg = test_snprintf(fmt, vargs);
+	va_end(vargs);
+	if (!msg) {
+		fclose(f);
+		return -1;
+	}
+	len = strlen(msg);
+	written = fwrite(msg, 1, len, f);
+	fclose(f);
+	free(msg);
+	return written == len ? 0 : -1;
+}
+
 const struct sockaddr_in6 addr_any6 = {
 	.sin6_family	= AF_INET6,
 };
diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c
index 8fdc808df325..9a059b6c4523 100644
--- a/tools/testing/selftests/net/tcp_ao/restore.c
+++ b/tools/testing/selftests/net/tcp_ao/restore.c
@@ -16,11 +16,11 @@ const size_t quota = nr_packets * msg_len;
 static void try_server_run(const char *tst_name, unsigned int port,
 			   fault_t inj, test_cnt cnt_expected)
 {
+	test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected;
 	const char *cnt_name = "TCPAOGood";
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	uint64_t before_cnt, after_cnt;
-	int sk, lsk;
-	time_t timeout;
+	int sk, lsk, dummy;
 	ssize_t bytes;
 
 	if (fault(TIMEOUT))
@@ -48,11 +48,10 @@ static void try_server_run(const char *tst_name, unsigned int port,
 	}
 
 	before_cnt = netstat_get_one(cnt_name, NULL);
-	if (test_get_tcp_ao_counters(sk, &ao1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	bytes = test_server_run(sk, quota, timeout);
+	bytes = test_skpair_server(sk, quota, poll_cnt, &dummy);
 	if (fault(TIMEOUT)) {
 		if (bytes > 0)
 			test_fail("%s: server served: %zd", tst_name, bytes);
@@ -64,17 +63,18 @@ static void try_server_run(const char *tst_name, unsigned int port,
 		else
 			test_ok("%s: server alive", tst_name);
 	}
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	synchronize_threads(); /* 3: counters checks */
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	after_cnt = netstat_get_one(cnt_name, NULL);
 
-	test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected);
+	test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected);
 
 	if (after_cnt <= before_cnt) {
-		test_fail("%s: %s counter did not increase: %zu <= %zu",
-				tst_name, cnt_name, after_cnt, before_cnt);
+		test_fail("%s(server): %s counter did not increase: %" PRIu64 " <= %" PRIu64,
+			  tst_name, cnt_name, after_cnt, before_cnt);
 	} else {
-		test_ok("%s: counter %s increased %zu => %zu",
+		test_ok("%s(server): counter %s increased %" PRIu64 " => %" PRIu64,
 			tst_name, cnt_name, before_cnt, after_cnt);
 	}
 
@@ -82,7 +82,7 @@ static void try_server_run(const char *tst_name, unsigned int port,
 	 * Before close() as that will send FIN and move the peer in TCP_CLOSE
 	 * and that will prevent reading AO counters from the peer's socket.
 	 */
-	synchronize_threads(); /* 3: verified => closed */
+	synchronize_threads(); /* 4: verified => closed */
 out:
 	close(sk);
 }
@@ -91,16 +91,16 @@ static void *server_fn(void *arg)
 {
 	unsigned int port = test_server_port;
 
-	try_server_run("TCP-AO migrate to another socket", port++,
+	try_server_run("TCP-AO migrate to another socket (server)", port++,
 		       0, TEST_CNT_GOOD);
-	try_server_run("TCP-AO with wrong send ISN", port++,
+	try_server_run("TCP-AO with wrong send ISN (server)", port++,
 		       FAULT_TIMEOUT, TEST_CNT_BAD);
-	try_server_run("TCP-AO with wrong receive ISN", port++,
+	try_server_run("TCP-AO with wrong receive ISN (server)", port++,
 		       FAULT_TIMEOUT, TEST_CNT_BAD);
-	try_server_run("TCP-AO with wrong send SEQ ext number", port++,
+	try_server_run("TCP-AO with wrong send SEQ ext number (server)", port++,
 		       FAULT_TIMEOUT, TEST_CNT_BAD);
-	try_server_run("TCP-AO with wrong receive SEQ ext number", port++,
-		       FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD);
+	try_server_run("TCP-AO with wrong receive SEQ ext number (server)",
+		       port++, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD);
 
 	synchronize_threads(); /* don't race to exit: client exits */
 	return NULL;
@@ -124,7 +124,7 @@ static void test_get_sk_checkpoint(unsigned int server_port, sockaddr_af *saddr,
 		test_error("failed to connect()");
 
 	synchronize_threads(); /* 2: accepted => send data */
-	if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC))
+	if (test_client_verify(sk, msg_len, nr_packets))
 		test_fail("pre-migrate verify failed");
 
 	test_enable_repair(sk);
@@ -138,11 +138,11 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port,
 			    struct tcp_ao_repair *ao_img,
 			    fault_t inj, test_cnt cnt_expected)
 {
+	test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected;
 	const char *cnt_name = "TCPAOGood";
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	uint64_t before_cnt, after_cnt;
-	time_t timeout;
-	int sk;
+	int sk, dummy;
 
 	if (fault(TIMEOUT))
 		cnt_name = "TCPAOBad";
@@ -158,38 +158,39 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port,
 		test_error("setsockopt(TCP_AO_ADD_KEY)");
 	test_ao_restore(sk, ao_img);
 
-	if (test_get_tcp_ao_counters(sk, &ao1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	test_disable_repair(sk);
 	test_sock_state_free(img);
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	if (test_client_verify(sk, msg_len, nr_packets, timeout)) {
+	if (test_skpair_client(sk, msg_len, nr_packets, poll_cnt, &dummy)) {
 		if (fault(TIMEOUT))
 			test_ok("%s: post-migrate connection is broken", tst_name);
 		else
 			test_fail("%s: post-migrate connection is working", tst_name);
 	} else {
 		if (fault(TIMEOUT))
-			test_fail("%s: post-migrate connection still working", tst_name);
+			test_fail("%s: post-migrate connection is working", tst_name);
 		else
 			test_ok("%s: post-migrate connection is alive", tst_name);
 	}
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+
+	synchronize_threads(); /* 3: counters checks */
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	after_cnt = netstat_get_one(cnt_name, NULL);
 
-	test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected);
+	test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected);
 
 	if (after_cnt <= before_cnt) {
-		test_fail("%s: %s counter did not increase: %zu <= %zu",
+		test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64,
 				tst_name, cnt_name, after_cnt, before_cnt);
 	} else {
-		test_ok("%s: counter %s increased %zu => %zu",
+		test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64,
 			tst_name, cnt_name, before_cnt, after_cnt);
 	}
-	synchronize_threads(); /* 3: verified => closed */
+	synchronize_threads(); /* 4: verified => closed */
 	close(sk);
 }
 
@@ -201,29 +202,43 @@ static void *client_fn(void *arg)
 	sockaddr_af saddr;
 
 	test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img);
-	test_sk_restore("TCP-AO migrate to another socket", port++,
+	test_sk_restore("TCP-AO migrate to another socket (client)", port++,
 			&saddr, &tcp_img, &ao_img, 0, TEST_CNT_GOOD);
 
 	test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img);
 	ao_img.snt_isn += 1;
-	test_sk_restore("TCP-AO with wrong send ISN", port++,
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest,
+			      -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr,
+			      port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	test_sk_restore("TCP-AO with wrong send ISN (client)", port++,
 			&saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD);
 
 	test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img);
 	ao_img.rcv_isn += 1;
-	test_sk_restore("TCP-AO with wrong receive ISN", port++,
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest,
+			      -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr,
+			      port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	test_sk_restore("TCP-AO with wrong receive ISN (client)", port++,
 			&saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD);
 
 	test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img);
 	ao_img.snd_sne += 1;
-	test_sk_restore("TCP-AO with wrong send SEQ ext number", port++,
-			&saddr, &tcp_img, &ao_img, FAULT_TIMEOUT,
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest,
+			      -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	/* not expecting server => client mismatches as only snd sne is broken */
+	test_sk_restore("TCP-AO with wrong send SEQ ext number (client)",
+			port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT,
 			TEST_CNT_NS_BAD | TEST_CNT_GOOD);
 
 	test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img);
 	ao_img.rcv_sne += 1;
-	test_sk_restore("TCP-AO with wrong receive SEQ ext number", port++,
-			&saddr, &tcp_img, &ao_img, FAULT_TIMEOUT,
+	/* not expecting client => server mismatches as only rcv sne is broken */
+	trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr,
+			      port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1);
+	test_sk_restore("TCP-AO with wrong receive SEQ ext number (client)",
+			port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT,
 			TEST_CNT_NS_GOOD | TEST_CNT_BAD);
 
 	return NULL;
@@ -231,6 +246,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(20, server_fn, client_fn);
+	test_init(21, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c
index a2fe88d35ac0..883cddf377cf 100644
--- a/tools/testing/selftests/net/tcp_ao/rst.c
+++ b/tools/testing/selftests/net/tcp_ao/rst.c
@@ -84,15 +84,15 @@ static void close_forced(int sk)
 
 static void test_server_active_rst(unsigned int port)
 {
-	struct tcp_ao_counters cnt1, cnt2;
+	struct tcp_counters cnt1, cnt2;
 	ssize_t bytes;
 	int sk, lsk;
 
 	lsk = test_listen_socket(this_ip_addr, port, backlog);
 	if (test_add_key(lsk, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100))
 		test_error("setsockopt(TCP_AO_ADD_KEY)");
-	if (test_get_tcp_ao_counters(lsk, &cnt1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(lsk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* 1: MKT added */
 	if (test_wait_fd(lsk, TEST_TIMEOUT_SEC, 0))
@@ -103,8 +103,8 @@ static void test_server_active_rst(unsigned int port)
 		test_error("accept()");
 
 	synchronize_threads(); /* 2: connection accept()ed, another queued */
-	if (test_get_tcp_ao_counters(lsk, &cnt2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(lsk, &cnt2))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* 3: close listen socket */
 	close(lsk);
@@ -120,7 +120,7 @@ static void test_server_active_rst(unsigned int port)
 	synchronize_threads(); /* 5: closed active sk */
 
 	synchronize_threads(); /* 6: counters checks */
-	if (test_tcp_ao_counters_cmp("active RST server", &cnt1, &cnt2, TEST_CNT_GOOD))
+	if (test_assert_counters("active RST server", &cnt1, &cnt2, TEST_CNT_GOOD))
 		test_fail("MKT counters (server) have not only good packets");
 	else
 		test_ok("MKT counters are good on server");
@@ -128,7 +128,7 @@ static void test_server_active_rst(unsigned int port)
 
 static void test_server_passive_rst(unsigned int port)
 {
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	int sk, lsk;
 	ssize_t bytes;
 
@@ -147,8 +147,8 @@ static void test_server_passive_rst(unsigned int port)
 
 	synchronize_threads(); /* 2: accepted => send data */
 	close(lsk);
-	if (test_get_tcp_ao_counters(sk, &ao1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC);
 	if (bytes != quota) {
@@ -160,12 +160,12 @@ static void test_server_passive_rst(unsigned int port)
 
 	synchronize_threads(); /* 3: checkpoint the client */
 	synchronize_threads(); /* 4: close the server, creating twsk */
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	close(sk);
 
 	synchronize_threads(); /* 5: restore the socket, send more data */
-	test_tcp_ao_counters_cmp("passive RST server", &ao1, &ao2, TEST_CNT_GOOD);
+	test_assert_counters("passive RST server", &cnt1, &cnt2, TEST_CNT_GOOD);
 
 	synchronize_threads(); /* 6: server exits */
 }
@@ -271,8 +271,7 @@ static void test_client_active_rst(unsigned int port)
 
 	synchronize_threads(); /* 1: MKT added */
 	for (i = 0; i < last; i++) {
-		err = _test_connect_socket(sk[i], this_ip_dest, port,
-					       (i == 0) ? TEST_TIMEOUT_SEC : -1);
+		err = _test_connect_socket(sk[i], this_ip_dest, port, i != 0);
 		if (err < 0)
 			test_error("failed to connect()");
 	}
@@ -283,12 +282,12 @@ static void test_client_active_rst(unsigned int port)
 		test_error("test_wait_fds(): %d", err);
 
 	/* async connect() with third sk to get into request_sock_queue */
-	err = _test_connect_socket(sk[last], this_ip_dest, port, -1);
+	err = _test_connect_socket(sk[last], this_ip_dest, port, 1);
 	if (err < 0)
 		test_error("failed to connect()");
 
 	synchronize_threads(); /* 3: close listen socket */
-	if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC))
+	if (test_client_verify(sk[0], packet_sz, quota / packet_sz))
 		test_fail("Failed to send data on connected socket");
 	else
 		test_ok("Verified established tcp connection");
@@ -323,7 +322,7 @@ static void test_client_active_rst(unsigned int port)
 
 static void test_client_passive_rst(unsigned int port)
 {
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	struct tcp_ao_repair ao_img;
 	struct tcp_sock_state img;
 	sockaddr_af saddr;
@@ -341,7 +340,7 @@ static void test_client_passive_rst(unsigned int port)
 		test_error("failed to connect()");
 
 	synchronize_threads(); /* 2: accepted => send data */
-	if (test_client_verify(sk, packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC))
+	if (test_client_verify(sk, packet_sz, quota / packet_sz))
 		test_fail("Failed to send data on connected socket");
 	else
 		test_ok("Verified established tcp connection");
@@ -397,8 +396,8 @@ static void test_client_passive_rst(unsigned int port)
 		test_error("setsockopt(TCP_AO_ADD_KEY)");
 	test_ao_restore(sk, &ao_img);
 
-	if (test_get_tcp_ao_counters(sk, &ao1))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	test_disable_repair(sk);
 	test_sock_state_free(&img);
@@ -417,7 +416,7 @@ static void test_client_passive_rst(unsigned int port)
 	 * IP 10.0.254.1.7011 > 10.0.1.1.59772: Flags [R], seq 3215596252, win 0,
 	 *    options [tcp-ao keyid 100 rnextkeyid 100 mac 0x0bcfbbf497bce844312304b2], length 0
 	 */
-	err = test_client_verify(sk, packet_sz, quota / packet_sz, 2 * TEST_TIMEOUT_SEC);
+	err = test_client_verify(sk, packet_sz, quota / packet_sz);
 	/* Make sure that the connection was reset, not timeouted */
 	if (err && err == -ECONNRESET)
 		test_ok("client sock was passively reset post-seq-adjust");
@@ -426,12 +425,12 @@ static void test_client_passive_rst(unsigned int port)
 	else
 		test_fail("client sock is yet connected post-seq-adjust");
 
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* 6: server exits */
 	close(sk);
-	test_tcp_ao_counters_cmp("client passive RST", &ao1, &ao2, TEST_CNT_GOOD);
+	test_assert_counters("client passive RST", &cnt1, &cnt2, TEST_CNT_GOOD);
 }
 
 static void *client_fn(void *arg)
@@ -455,6 +454,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(14, server_fn, client_fn);
+	test_init(15, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c
index e154d9e198a9..2c73bea698a6 100644
--- a/tools/testing/selftests/net/tcp_ao/self-connect.c
+++ b/tools/testing/selftests/net/tcp_ao/self-connect.c
@@ -16,6 +16,9 @@ static void __setup_lo_intf(const char *lo_intf,
 
 	if (link_set_up(lo_intf))
 		test_error("Failed to bring %s up", lo_intf);
+
+	if (ip_route_add(lo_intf, TEST_FAMILY, local_addr, local_addr))
+		test_error("Failed to add a local route %s", lo_intf);
 }
 
 static void setup_lo_intf(const char *lo_intf)
@@ -30,9 +33,7 @@ static void setup_lo_intf(const char *lo_intf)
 static void tcp_self_connect(const char *tst, unsigned int port,
 			     bool different_keyids, bool check_restore)
 {
-	uint64_t before_challenge_ack, after_challenge_ack;
-	uint64_t before_syn_challenge, after_syn_challenge;
-	struct tcp_ao_counters before_ao, after_ao;
+	struct tcp_counters before, after;
 	uint64_t before_aogood, after_aogood;
 	struct netstat *ns_before, *ns_after;
 	const size_t nr_packets = 20;
@@ -62,19 +63,17 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 
 	ns_before = netstat_read();
 	before_aogood = netstat_get(ns_before, "TCPAOGood", NULL);
-	before_challenge_ack = netstat_get(ns_before, "TCPChallengeACK", NULL);
-	before_syn_challenge = netstat_get(ns_before, "TCPSYNChallenge", NULL);
-	if (test_get_tcp_ao_counters(sk, &before_ao))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &before))
+		test_error("test_get_tcp_counters()");
 
 	if (__test_connect_socket(sk, "lo", (struct sockaddr *)&addr,
-				  sizeof(addr), TEST_TIMEOUT_SEC) < 0) {
+				  sizeof(addr), 0) < 0) {
 		ns_after = netstat_read();
 		netstat_print_diff(ns_before, ns_after);
 		test_error("failed to connect()");
 	}
 
-	if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) {
+	if (test_client_verify(sk, 100, nr_packets)) {
 		test_fail("%s: tcp connection verify failed", tst);
 		close(sk);
 		return;
@@ -82,10 +81,8 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 
 	ns_after = netstat_read();
 	after_aogood = netstat_get(ns_after, "TCPAOGood", NULL);
-	after_challenge_ack = netstat_get(ns_after, "TCPChallengeACK", NULL);
-	after_syn_challenge = netstat_get(ns_after, "TCPSYNChallenge", NULL);
-	if (test_get_tcp_ao_counters(sk, &after_ao))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, &after))
+		test_error("test_get_tcp_counters()");
 	if (!check_restore) {
 		/* to debug: netstat_print_diff(ns_before, ns_after); */
 		netstat_free(ns_before);
@@ -93,25 +90,13 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 	netstat_free(ns_after);
 
 	if (after_aogood <= before_aogood) {
-		test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu",
+		test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64,
 			  tst, after_aogood, before_aogood);
 		close(sk);
 		return;
 	}
-	if (after_challenge_ack <= before_challenge_ack ||
-	    after_syn_challenge <= before_syn_challenge) {
-		/*
-		 * It's also meant to test simultaneous open, so check
-		 * these counters as well.
-		 */
-		test_fail("%s: Didn't challenge SYN or ACK: %zu <= %zu OR %zu <= %zu",
-			  tst, after_challenge_ack, before_challenge_ack,
-			  after_syn_challenge, before_syn_challenge);
-		close(sk);
-		return;
-	}
 
-	if (test_tcp_ao_counters_cmp(tst, &before_ao, &after_ao, TEST_CNT_GOOD)) {
+	if (test_assert_counters(tst, &before, &after, TEST_CNT_GOOD)) {
 		close(sk);
 		return;
 	}
@@ -154,7 +139,7 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 	test_ao_restore(sk, &ao_img);
 	test_disable_repair(sk);
 	test_sock_state_free(&img);
-	if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) {
+	if (test_client_verify(sk, 100, nr_packets)) {
 		test_fail("%s: tcp connection verify failed", tst);
 		close(sk);
 		return;
@@ -166,7 +151,7 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 	netstat_free(ns_after);
 	close(sk);
 	if (after_aogood <= before_aogood) {
-		test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu",
+		test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64,
 			  tst, after_aogood, before_aogood);
 		return;
 	}
@@ -181,17 +166,26 @@ static void *client_fn(void *arg)
 	setup_lo_intf("lo");
 
 	tcp_self_connect("self-connect(same keyids)", port++, false, false);
+
+	/* expecting rnext to change based on the first segment RNext != Current */
+	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
+			      port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1);
 	tcp_self_connect("self-connect(different keyids)", port++, true, false);
 	tcp_self_connect("self-connect(restore)", port, false, true);
-	port += 2;
+	port += 2; /* restore test restores over different port */
+	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
+			      port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1);
+	/* intentionally on restore they are added to the socket in different order */
+	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
+			      port + 1, port + 1, 0, -1, -1, -1, -1, -1, 5, 7, -1);
 	tcp_self_connect("self-connect(restore, different keyids)", port, true, true);
-	port += 2;
+	port += 2; /* restore test restores over different port */
 
 	return NULL;
 }
 
 int main(int argc, char *argv[])
 {
-	test_init(4, client_fn, NULL);
+	test_init(5, client_fn, NULL);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c
index ad4e77d6823e..f00245263b20 100644
--- a/tools/testing/selftests/net/tcp_ao/seq-ext.c
+++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c
@@ -40,7 +40,7 @@ static void test_adjust_seqs(struct tcp_sock_state *img,
 static int test_sk_restore(struct tcp_sock_state *img,
 			   struct tcp_ao_repair *ao_img, sockaddr_af *saddr,
 			   const union tcp_addr daddr, unsigned int dport,
-			   struct tcp_ao_counters *cnt)
+			   struct tcp_counters *cnt)
 {
 	int sk;
 
@@ -54,8 +54,8 @@ static int test_sk_restore(struct tcp_sock_state *img,
 		test_error("setsockopt(TCP_AO_ADD_KEY)");
 	test_ao_restore(sk, ao_img);
 
-	if (test_get_tcp_ao_counters(sk, cnt))
-		test_error("test_get_tcp_ao_counters()");
+	if (test_get_tcp_counters(sk, cnt))
+		test_error("test_get_tcp_counters()");
 
 	test_disable_repair(sk);
 	test_sock_state_free(img);
@@ -65,7 +65,7 @@ static int test_sk_restore(struct tcp_sock_state *img,
 static void *server_fn(void *arg)
 {
 	uint64_t before_good, after_good, after_bad;
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	struct tcp_sock_state img;
 	struct tcp_ao_repair ao_img;
 	sockaddr_af saddr;
@@ -114,9 +114,17 @@ static void *server_fn(void *arg)
 	test_adjust_seqs(&img, &ao_img, true);
 	synchronize_threads(); /* 4: dump finished */
 	sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest,
-			     client_new_port, &ao1);
-
-	synchronize_threads(); /* 5: verify counters during SEQ-number rollover */
+			     client_new_port, &cnt1);
+
+	trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_addr,
+			this_ip_dest, test_server_port + 1, client_new_port, 1);
+	trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_dest,
+			this_ip_addr, client_new_port, test_server_port + 1, 1);
+	trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_addr,
+			this_ip_dest, test_server_port + 1, client_new_port, 1);
+	trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_dest,
+			this_ip_addr, client_new_port, test_server_port + 1, 1);
+	synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */
 	bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC);
 	if (bytes != quota) {
 		if (bytes > 0)
@@ -127,22 +135,23 @@ static void *server_fn(void *arg)
 		test_ok("server alive");
 	}
 
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	synchronize_threads(); /* 6: verify counters after SEQ-number rollover */
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	after_good = netstat_get_one("TCPAOGood", NULL);
 
-	test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD);
+	test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD);
 
 	if (after_good <= before_good) {
-		test_fail("TCPAOGood counter did not increase: %zu <= %zu",
+		test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64,
 			  after_good, before_good);
 	} else {
-		test_ok("TCPAOGood counter increased %zu => %zu",
+		test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64,
 			before_good, after_good);
 	}
 	after_bad = netstat_get_one("TCPAOBad", NULL);
 	if (after_bad)
-		test_fail("TCPAOBad counter is non-zero: %zu", after_bad);
+		test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad);
 	else
 		test_ok("TCPAOBad counter didn't increase");
 	test_enable_repair(sk);
@@ -164,7 +173,7 @@ out:
 static void *client_fn(void *arg)
 {
 	uint64_t before_good, after_good, after_bad;
-	struct tcp_ao_counters ao1, ao2;
+	struct tcp_counters cnt1, cnt2;
 	struct tcp_sock_state img;
 	struct tcp_ao_repair ao_img;
 	sockaddr_af saddr;
@@ -182,7 +191,7 @@ static void *client_fn(void *arg)
 		test_error("failed to connect()");
 
 	synchronize_threads(); /* 2: accepted => send data */
-	if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) {
+	if (test_client_verify(sk, msg_len, nr_packets)) {
 		test_fail("pre-migrate verify failed");
 		return NULL;
 	}
@@ -204,30 +213,31 @@ static void *client_fn(void *arg)
 	test_adjust_seqs(&img, &ao_img, false);
 	synchronize_threads(); /* 4: dump finished */
 	sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest,
-			     test_server_port + 1, &ao1);
+			     test_server_port + 1, &cnt1);
 
-	synchronize_threads(); /* 5: verify counters during SEQ-number rollover */
-	if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC))
+	synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */
+	if (test_client_verify(sk, msg_len, nr_packets))
 		test_fail("post-migrate verify failed");
 	else
 		test_ok("post-migrate connection alive");
 
-	if (test_get_tcp_ao_counters(sk, &ao2))
-		test_error("test_get_tcp_ao_counters()");
+	synchronize_threads(); /* 5: verify counters after SEQ-number rollover */
+	if (test_get_tcp_counters(sk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	after_good = netstat_get_one("TCPAOGood", NULL);
 
-	test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD);
+	test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD);
 
 	if (after_good <= before_good) {
-		test_fail("TCPAOGood counter did not increase: %zu <= %zu",
+		test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64,
 			  after_good, before_good);
 	} else {
-		test_ok("TCPAOGood counter increased %zu => %zu",
+		test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64,
 			before_good, after_good);
 	}
 	after_bad = netstat_get_one("TCPAOBad", NULL);
 	if (after_bad)
-		test_fail("TCPAOBad counter is non-zero: %zu", after_bad);
+		test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad);
 	else
 		test_ok("TCPAOBad counter didn't increase");
 
@@ -240,6 +250,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(7, server_fn, client_fn);
+	test_init(8, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
index 517930f9721b..0abb9807d742 100644
--- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
+++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
@@ -6,6 +6,8 @@
 
 static union tcp_addr tcp_md5_client;
 
+#define FILTER_TEST_NKEYS 16
+
 static int test_port = 7788;
 static void make_listen(int sk)
 {
@@ -30,8 +32,8 @@ static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info,
 #define __cmp_ao(member)							\
 do {										\
 	if (info->member != tmp.member) {					\
-		test_fail("%s: getsockopt(): " __stringify(member) " %zu != %zu",	\
-			  tst, (size_t)info->member, (size_t)tmp.member);	\
+		test_fail("%s: getsockopt(): " __stringify(member) " %" PRIu64 " != %" PRIu64,	\
+			  tst, (uint64_t)info->member, (uint64_t)tmp.member);	\
 		return;								\
 	}									\
 } while(0)
@@ -813,23 +815,197 @@ static void duplicate_tests(void)
 	setsockopt_checked(sk, TCP_AO_ADD_KEY, &ao, EEXIST, "duplicate: SendID differs");
 }
 
+static void fetch_all_keys(int sk, struct tcp_ao_getsockopt *keys)
+{
+	socklen_t optlen = sizeof(struct tcp_ao_getsockopt);
+
+	memset(keys, 0, sizeof(struct tcp_ao_getsockopt) * FILTER_TEST_NKEYS);
+	keys[0].get_all = 1;
+	keys[0].nkeys = FILTER_TEST_NKEYS;
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, &keys[0], &optlen))
+		test_error("getsockopt");
+}
+
+static int prepare_test_keys(struct tcp_ao_getsockopt *keys)
+{
+	const char *test_password = "Test password number ";
+	struct tcp_ao_add test_ao[FILTER_TEST_NKEYS];
+	char test_password_scratch[64] = {};
+	u8 rcvid = 100, sndid = 100;
+	int sk;
+
+	sk = socket(test_family, SOCK_STREAM, IPPROTO_TCP);
+	if (sk < 0)
+		test_error("socket()");
+
+	for (int i = 0; i < FILTER_TEST_NKEYS; i++) {
+		snprintf(test_password_scratch, 64, "%s %d", test_password, i);
+		test_prepare_key(&test_ao[i], DEFAULT_TEST_ALGO, this_ip_dest,
+			  false, false, DEFAULT_TEST_PREFIX, 0, sndid++,
+			  rcvid++, 0, 0, strlen(test_password_scratch),
+			  test_password_scratch);
+	}
+	test_ao[0].set_current = 1;
+	test_ao[1].set_rnext = 1;
+	/* One key with a different addr and overlapping sndid, rcvid */
+	tcp_addr_to_sockaddr_in(&test_ao[2].addr, &this_ip_addr, 0);
+	test_ao[2].sndid = 100;
+	test_ao[2].rcvid = 100;
+
+	/* Add keys in a random order */
+	for (int i = 0; i < FILTER_TEST_NKEYS; i++) {
+		int randidx = rand() % (FILTER_TEST_NKEYS - i);
+
+		if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY,
+			       &test_ao[randidx], sizeof(struct tcp_ao_add)))
+			test_error("setsockopt()");
+		memcpy(&test_ao[randidx], &test_ao[FILTER_TEST_NKEYS - 1 - i],
+		       sizeof(struct tcp_ao_add));
+	}
+
+	fetch_all_keys(sk, keys);
+
+	return sk;
+}
+
+/* Assumes passwords are unique */
+static int compare_mkts(struct tcp_ao_getsockopt *expected, int nexpected,
+			struct tcp_ao_getsockopt *actual, int nactual)
+{
+	int matches = 0;
+
+	for (int i = 0; i < nexpected; i++) {
+		for (int j = 0; j < nactual; j++) {
+			if (memcmp(expected[i].key, actual[j].key,
+				   TCP_AO_MAXKEYLEN) == 0)
+				matches++;
+		}
+	}
+	return nexpected - matches;
+}
+
+static void filter_keys_checked(int sk, struct tcp_ao_getsockopt *filter,
+				struct tcp_ao_getsockopt *expected,
+				unsigned int nexpected, const char *tst)
+{
+	struct tcp_ao_getsockopt filtered_keys[FILTER_TEST_NKEYS] = {};
+	struct tcp_ao_getsockopt all_keys[FILTER_TEST_NKEYS] = {};
+	socklen_t len = sizeof(struct tcp_ao_getsockopt);
+
+	fetch_all_keys(sk, all_keys);
+	memcpy(&filtered_keys[0], filter, sizeof(struct tcp_ao_getsockopt));
+	filtered_keys[0].nkeys = FILTER_TEST_NKEYS;
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, filtered_keys, &len))
+		test_error("getsockopt");
+	if (filtered_keys[0].nkeys != nexpected) {
+		test_fail("wrong nr of keys, expected %u got %u", nexpected,
+			  filtered_keys[0].nkeys);
+		goto out_close;
+	}
+	if (compare_mkts(expected, nexpected, filtered_keys,
+			 filtered_keys[0].nkeys)) {
+		test_fail("got wrong keys back");
+		goto out_close;
+	}
+	test_ok("filter keys: %s", tst);
+
+out_close:
+	close(sk);
+	memset(filter, 0, sizeof(struct tcp_ao_getsockopt));
+}
+
+static void filter_tests(void)
+{
+	struct tcp_ao_getsockopt original_keys[FILTER_TEST_NKEYS];
+	struct tcp_ao_getsockopt expected_keys[FILTER_TEST_NKEYS];
+	struct tcp_ao_getsockopt filter = {};
+	int sk, f, nmatches;
+	socklen_t len;
+
+	f = 2;
+	sk = prepare_test_keys(original_keys);
+	filter.rcvid = original_keys[f].rcvid;
+	filter.sndid = original_keys[f].sndid;
+	memcpy(&filter.addr, &original_keys[f].addr,
+	       sizeof(original_keys[f].addr));
+	filter.prefix = original_keys[f].prefix;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1,
+			    "by sndid, rcvid, address");
+
+	f = -1;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].is_current) {
+			f = i;
+			break;
+		}
+	}
+	if (f < 0)
+		test_error("No current key after adding one");
+	filter.is_current = 1;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1, "by is_current");
+
+	f = -1;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].is_rnext) {
+			f = i;
+			break;
+		}
+	}
+	if (f < 0)
+		test_error("No rnext key after adding one");
+	filter.is_rnext = 1;
+	filter_keys_checked(sk, &filter, &original_keys[f], 1, "by is_rnext");
+
+	f = -1;
+	nmatches = 0;
+	sk = prepare_test_keys(original_keys);
+	for (int i = 0; i < original_keys[0].nkeys; i++) {
+		if (original_keys[i].sndid == 100) {
+			f = i;
+			memcpy(&expected_keys[nmatches], &original_keys[i],
+			       sizeof(struct tcp_ao_getsockopt));
+			nmatches++;
+		}
+	}
+	if (f < 0)
+		test_error("No key for sndid 100");
+	if (nmatches != 2)
+		test_error("Should have 2 keys with sndid 100");
+	filter.rcvid = original_keys[f].rcvid;
+	filter.sndid = original_keys[f].sndid;
+	filter.addr.ss_family = test_family;
+	filter_keys_checked(sk, &filter, expected_keys, nmatches,
+			    "by sndid, rcvid");
+
+	sk = prepare_test_keys(original_keys);
+	filter.get_all = 1;
+	filter.nkeys = FILTER_TEST_NKEYS / 2;
+	len = sizeof(struct tcp_ao_getsockopt);
+	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, &filter, &len))
+		test_error("getsockopt");
+	if (filter.nkeys == FILTER_TEST_NKEYS)
+		test_ok("filter keys: correct nkeys when in.nkeys < matches");
+	else
+		test_fail("filter keys: wrong nkeys, expected %u got %u",
+			  FILTER_TEST_NKEYS, filter.nkeys);
+}
+
 static void *client_fn(void *arg)
 {
 	if (inet_pton(TEST_FAMILY, __TEST_CLIENT_IP(2), &tcp_md5_client) != 1)
 		test_error("Can't convert ip address");
 	extend_tests();
 	einval_tests();
+	filter_tests();
 	duplicate_tests();
-	/*
-	 * TODO: check getsockopt(TCP_AO_GET_KEYS) with different filters
-	 * returning proper nr & keys;
-	 */
 
 	return NULL;
 }
 
 int main(int argc, char *argv[])
 {
-	test_init(120, client_fn, NULL);
+	test_init(126, client_fn, NULL);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c
index 6b59a652159f..a1467b64390a 100644
--- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c
+++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c
@@ -6,6 +6,7 @@
 #define fault(type)	(inj == FAULT_ ## type)
 static const char *md5_password = "Some evil genius, enemy to mankind, must have been the first contriver.";
 static const char *ao_password = DEFAULT_TEST_PASSWORD;
+static volatile int sk_pair;
 
 static union tcp_addr client2;
 static union tcp_addr client3;
@@ -41,10 +42,10 @@ static void try_accept(const char *tst_name, unsigned int port,
 		       const char *cnt_name, test_cnt cnt_expected,
 		       int needs_tcp_md5, fault_t inj)
 {
-	struct tcp_ao_counters ao_cnt1, ao_cnt2;
+	struct tcp_counters cnt1, cnt2;
 	uint64_t before_cnt = 0, after_cnt = 0; /* silence GCC */
-	int lsk, err, sk = 0;
-	time_t timeout;
+	test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected;
+	int lsk, err, sk = -1;
 
 	if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5))
 		return;
@@ -63,21 +64,25 @@ static void try_accept(const char *tst_name, unsigned int port,
 
 	if (cnt_name)
 		before_cnt = netstat_get_one(cnt_name, NULL);
-	if (ao_addr && test_get_tcp_ao_counters(lsk, &ao_cnt1))
-		test_error("test_get_tcp_ao_counters()");
+	if (ao_addr && test_get_tcp_counters(lsk, &cnt1))
+		test_error("test_get_tcp_counters()");
 
 	synchronize_threads(); /* preparations done */
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	err = test_wait_fd(lsk, timeout, 0);
+	err = test_skpair_wait_poll(lsk, 0, poll_cnt, &sk_pair);
+	synchronize_threads(); /* connect()/accept() timeouts */
 	if (err == -ETIMEDOUT) {
+		sk_pair = err;
 		if (!fault(TIMEOUT))
-			test_fail("timed out for accept()");
+			test_fail("%s: timed out for accept()", tst_name);
+	} else if (err == -EKEYREJECTED) {
+		if (!fault(KEYREJECT))
+			test_fail("%s: key was rejected", tst_name);
 	} else if (err < 0) {
-		test_error("test_wait_fd()");
+		test_error("test_skpair_wait_poll()");
 	} else {
 		if (fault(TIMEOUT))
-			test_fail("ready to accept");
+			test_fail("%s: ready to accept", tst_name);
 
 		sk = accept(lsk, NULL, NULL);
 		if (sk < 0) {
@@ -88,8 +93,8 @@ static void try_accept(const char *tst_name, unsigned int port,
 		}
 	}
 
-	if (ao_addr && test_get_tcp_ao_counters(lsk, &ao_cnt2))
-		test_error("test_get_tcp_ao_counters()");
+	if (ao_addr && test_get_tcp_counters(lsk, &cnt2))
+		test_error("test_get_tcp_counters()");
 	close(lsk);
 
 	if (!cnt_name) {
@@ -100,18 +105,18 @@ static void try_accept(const char *tst_name, unsigned int port,
 	after_cnt = netstat_get_one(cnt_name, NULL);
 
 	if (after_cnt <= before_cnt) {
-		test_fail("%s: %s counter did not increase: %zu <= %zu",
+		test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64,
 				tst_name, cnt_name, after_cnt, before_cnt);
 	} else {
-		test_ok("%s: counter %s increased %zu => %zu",
+		test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64,
 			tst_name, cnt_name, before_cnt, after_cnt);
 	}
 	if (ao_addr)
-		test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected);
+		test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected);
 
 out:
 	synchronize_threads(); /* test_kill_sk() */
-	if (sk > 0)
+	if (sk >= 0)
 		test_kill_sk(sk);
 }
 
@@ -152,78 +157,82 @@ static void *server_fn(void *arg)
 
 	server_add_routes();
 
-	try_accept("AO server (INADDR_ANY): AO client", port++, NULL, 0,
+	try_accept("[server] AO server (INADDR_ANY): AO client", port++, NULL, 0,
 		   &addr_any, 0, 0, 100, 100, 0, "TCPAOGood",
 		   TEST_CNT_GOOD, 0, 0);
-	try_accept("AO server (INADDR_ANY): MD5 client", port++, NULL, 0,
+	try_accept("[server] AO server (INADDR_ANY): MD5 client", port++, NULL, 0,
 		   &addr_any, 0, 0, 100, 100, 0, "TCPMD5Unexpected",
-		   0, 1, FAULT_TIMEOUT);
-	try_accept("AO server (INADDR_ANY): no sign client", port++, NULL, 0,
+		   TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT);
+	try_accept("[server] AO server (INADDR_ANY): no sign client", port++, NULL, 0,
 		   &addr_any, 0, 0, 100, 100, 0, "TCPAORequired",
 		   TEST_CNT_AO_REQUIRED, 0, FAULT_TIMEOUT);
-	try_accept("AO server (AO_REQUIRED): AO client", port++, NULL, 0,
+	try_accept("[server] AO server (AO_REQUIRED): AO client", port++, NULL, 0,
 		   &this_ip_dest, TEST_PREFIX, true,
 		   100, 100, 0, "TCPAOGood", TEST_CNT_GOOD, 0, 0);
-	try_accept("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0,
+	try_accept("[server] AO server (AO_REQUIRED): unsigned client", port++, NULL, 0,
 		   &this_ip_dest, TEST_PREFIX, true,
 		   100, 100, 0, "TCPAORequired",
 		   TEST_CNT_AO_REQUIRED, 0, FAULT_TIMEOUT);
 
-	try_accept("MD5 server (INADDR_ANY): AO client", port++, &addr_any, 0,
+	try_accept("[server] MD5 server (INADDR_ANY): AO client", port++, &addr_any, 0,
 		   NULL, 0, 0, 0, 0, 0, "TCPAOKeyNotFound",
-		   0, 1, FAULT_TIMEOUT);
-	try_accept("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0,
+		   TEST_CNT_NS_KEY_NOT_FOUND, 1, FAULT_TIMEOUT);
+	try_accept("[server] MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0,
 		   NULL, 0, 0, 0, 0, 0, NULL, 0, 1, 0);
-	try_accept("MD5 server (INADDR_ANY): no sign client", port++, &addr_any,
+	try_accept("[server] MD5 server (INADDR_ANY): no sign client", port++, &addr_any,
 		   0, NULL, 0, 0, 0, 0, 0, "TCPMD5NotFound",
-		   0, 1, FAULT_TIMEOUT);
+		   TEST_CNT_NS_MD5_NOT_FOUND, 1, FAULT_TIMEOUT);
 
-	try_accept("no sign server: AO client", port++, NULL, 0,
+	try_accept("[server] no sign server: AO client", port++, NULL, 0,
 		   NULL, 0, 0, 0, 0, 0, "TCPAOKeyNotFound",
-		   TEST_CNT_AO_KEY_NOT_FOUND, 0, FAULT_TIMEOUT);
-	try_accept("no sign server: MD5 client", port++, NULL, 0,
+		   TEST_CNT_NS_KEY_NOT_FOUND, 0, FAULT_TIMEOUT);
+	try_accept("[server] no sign server: MD5 client", port++, NULL, 0,
 		   NULL, 0, 0, 0, 0, 0, "TCPMD5Unexpected",
-		   0, 1, FAULT_TIMEOUT);
-	try_accept("no sign server: no sign client", port++, NULL, 0,
+		   TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT);
+	try_accept("[server] no sign server: no sign client", port++, NULL, 0,
 		   NULL, 0, 0, 0, 0, 0, "CurrEstab", 0, 0, 0);
 
-	try_accept("AO+MD5 server: AO client (matching)", port++,
+	try_accept("[server] AO+MD5 server: AO client (matching)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, "TCPAOGood", TEST_CNT_GOOD, 1, 0);
-	try_accept("AO+MD5 server: AO client (misconfig, matching MD5)", port++,
+	try_accept("[server] AO+MD5 server: AO client (misconfig, matching MD5)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND,
 		   1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: AO client (misconfig, non-matching)", port++,
+	try_accept("[server] AO+MD5 server: AO client (misconfig, non-matching)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND,
 		   1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: MD5 client (matching)", port++,
+	try_accept("[server] AO+MD5 server: MD5 client (matching)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, NULL, 0, 1, 0);
-	try_accept("AO+MD5 server: MD5 client (misconfig, matching AO)", port++,
+	try_accept("[server] AO+MD5 server: MD5 client (misconfig, matching AO)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
-		   100, 100, 0, "TCPMD5Unexpected", 0, 1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: MD5 client (misconfig, non-matching)", port++,
+		   100, 100, 0, "TCPMD5Unexpected",
+		   TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT);
+	try_accept("[server] AO+MD5 server: MD5 client (misconfig, non-matching)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
-		   100, 100, 0, "TCPMD5Unexpected", 0, 1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: no sign client (unmatched)", port++,
+		   100, 100, 0, "TCPMD5Unexpected",
+		   TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT);
+	try_accept("[server] AO+MD5 server: no sign client (unmatched)", port++,
 		   &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, "CurrEstab", 0, 1, 0);
-	try_accept("AO+MD5 server: no sign client (misconfig, matching AO)",
+	try_accept("[server] AO+MD5 server: no sign client (misconfig, matching AO)",
 		   port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
 		   100, 100, 0, "TCPAORequired",
 		   TEST_CNT_AO_REQUIRED, 1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: no sign client (misconfig, matching MD5)",
+	try_accept("[server] AO+MD5 server: no sign client (misconfig, matching MD5)",
 		   port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
-		   100, 100, 0, "TCPMD5NotFound", 0, 1, FAULT_TIMEOUT);
+		   100, 100, 0, "TCPMD5NotFound",
+		   TEST_CNT_NS_MD5_NOT_FOUND, 1, FAULT_TIMEOUT);
 
-	try_accept("AO+MD5 server: client with both [TCP-MD5] and TCP-AO keys",
+	/* Key rejected by the other side, failing short through skpair */
+	try_accept("[server] AO+MD5 server: client with both [TCP-MD5] and TCP-AO keys",
 		   port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
-		   100, 100, 0, NULL, 0, 1, FAULT_TIMEOUT);
-	try_accept("AO+MD5 server: client with both TCP-MD5 and [TCP-AO] keys",
+		   100, 100, 0, NULL, 0, 1, FAULT_KEYREJECT);
+	try_accept("[server] AO+MD5 server: client with both TCP-MD5 and [TCP-AO] keys",
 		   port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0,
-		   100, 100, 0, NULL, 0, 1, FAULT_TIMEOUT);
+		   100, 100, 0, NULL, 0, 1, FAULT_KEYREJECT);
 
 	server_add_fail_tests(&port);
 
@@ -258,7 +267,6 @@ static void try_connect(const char *tst_name, unsigned int port,
 		       uint8_t sndid, uint8_t rcvid, uint8_t vrf,
 		       fault_t inj, int needs_tcp_md5, union tcp_addr *bind_addr)
 {
-	time_t timeout;
 	int sk, ret;
 
 	if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5))
@@ -280,10 +288,10 @@ static void try_connect(const char *tst_name, unsigned int port,
 
 	synchronize_threads(); /* preparations done */
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	ret = _test_connect_socket(sk, this_ip_dest, port, timeout);
-
+	ret = test_skpair_connect_poll(sk, this_ip_dest, port, 0, &sk_pair);
+	synchronize_threads(); /* connect()/accept() timeouts */
 	if (ret < 0) {
+		sk_pair = ret;
 		if (fault(KEYREJECT) && ret == -EKEYREJECTED)
 			test_ok("%s: connect() was prevented", tst_name);
 		else if (ret == -ETIMEDOUT && fault(TIMEOUT))
@@ -303,8 +311,7 @@ static void try_connect(const char *tst_name, unsigned int port,
 
 out:
 	synchronize_threads(); /* test_kill_sk() */
-	/* _test_connect_socket() cleans up on failure */
-	if (ret > 0)
+	if (ret > 0) /* test_skpair_connect_poll() cleans up on failure */
 		test_kill_sk(sk);
 }
 
@@ -435,7 +442,6 @@ static void try_to_add(const char *tst_name, unsigned int port,
 		       int ao_vrf, uint8_t sndid, uint8_t rcvid,
 		       int needs_tcp_md5, fault_t inj)
 {
-	time_t timeout;
 	int sk, ret;
 
 	if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5))
@@ -448,10 +454,10 @@ static void try_to_add(const char *tst_name, unsigned int port,
 
 	synchronize_threads(); /* preparations done */
 
-	timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC;
-	ret = _test_connect_socket(sk, this_ip_dest, port, timeout);
+	ret = test_skpair_connect_poll(sk, this_ip_dest, port, 0, &sk_pair);
 
-	if (ret <= 0) {
+	synchronize_threads(); /* connect()/accept() timeouts */
+	if (ret < 0) {
 		test_error("%s: connect() returned %d", tst_name, ret);
 		goto out;
 	}
@@ -487,8 +493,7 @@ static void try_to_add(const char *tst_name, unsigned int port,
 
 out:
 	synchronize_threads(); /* test_kill_sk() */
-	/* _test_connect_socket() cleans up on failure */
-	if (ret > 0)
+	if (ret > 0) /* test_skpair_connect_poll() cleans up on failure */
 		test_kill_sk(sk);
 }
 
@@ -671,24 +676,38 @@ static void *client_fn(void *arg)
 
 	try_connect("AO server (INADDR_ANY): AO client", port++, NULL, 0,
 		    &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO server (INADDR_ANY): MD5 client", port++, &addr_any, 0,
 		    NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO server (INADDR_ANY): unsigned client", port++, NULL, 0,
 		    NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr);
 	try_connect("AO server (AO_REQUIRED): AO client", port++, NULL, 0,
 		    &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0,
 		    NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &client2);
 
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("MD5 server (INADDR_ANY): AO client", port++, NULL, 0,
 		   &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr);
 	try_connect("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0,
 		   NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("MD5 server (INADDR_ANY): no sign client", port++, NULL, 0,
 		   NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr);
 
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("no sign server: AO client", port++, NULL, 0,
 		   &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("no sign server: MD5 client", port++, &addr_any, 0,
 		   NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr);
 	try_connect("no sign server: no sign client", port++, NULL, 0,
@@ -696,25 +715,37 @@ static void *client_fn(void *arg)
 
 	try_connect("AO+MD5 server: AO client (matching)", port++, NULL, 0,
 		   &addr_any, 0, 100, 100, 0, 0, 1, &client2);
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("AO+MD5 server: AO client (misconfig, matching MD5)",
 		   port++, NULL, 0, &addr_any, 0, 100, 100, 0,
 		   FAULT_TIMEOUT, 1, &this_ip_addr);
+	trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, client3, this_ip_dest,
+			      -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1);
 	try_connect("AO+MD5 server: AO client (misconfig, non-matching)",
 		   port++, NULL, 0, &addr_any, 0, 100, 100, 0,
 		   FAULT_TIMEOUT, 1, &client3);
 	try_connect("AO+MD5 server: MD5 client (matching)", port++, &addr_any, 0,
 		   NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr);
+	trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client2,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO+MD5 server: MD5 client (misconfig, matching AO)",
 		   port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT,
 		   1, &client2);
+	trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client3,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO+MD5 server: MD5 client (misconfig, non-matching)",
 		   port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT,
 		   1, &client3);
 	try_connect("AO+MD5 server: no sign client (unmatched)",
 		   port++, NULL, 0, NULL, 0, 100, 100, 0, 0, 1, &client3);
+	trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO+MD5 server: no sign client (misconfig, matching AO)",
 		   port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT,
 		   1, &client2);
+	trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr,
+				this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0);
 	try_connect("AO+MD5 server: no sign client (misconfig, matching MD5)",
 		   port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT,
 		   1, &this_ip_addr);
@@ -736,6 +767,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(72, server_fn, client_fn);
+	test_init(73, server_fn, client_fn);
 	return 0;
 }
diff --git a/tools/testing/selftests/net/test_blackhole_dev.sh b/tools/testing/selftests/net/test_blackhole_dev.sh
deleted file mode 100755
index 3119b80e711f..000000000000
--- a/tools/testing/selftests/net/test_blackhole_dev.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-# Runs blackhole-dev test using blackhole-dev kernel module
-
-if /sbin/modprobe -q test_blackhole_dev ; then
-	/sbin/modprobe -q -r test_blackhole_dev;
-	echo "test_blackhole_dev: ok";
-else
-	echo "test_blackhole_dev: [FAIL]";
-	exit 1;
-fi
diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
index 8533393a4f18..9067197c9055 100755
--- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
+++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
@@ -51,7 +51,9 @@ ret=0
 # All tests in this script. Can be overridden with -t option.
 TESTS="
 	neigh_suppress_arp
+	neigh_suppress_uc_arp
 	neigh_suppress_ns
+	neigh_suppress_uc_ns
 	neigh_vlan_suppress_arp
 	neigh_vlan_suppress_ns
 "
@@ -154,17 +156,9 @@ setup_topo()
 		setup_topo_ns $ns
 	done
 
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $h1 name eth0
-	ip link set dev veth1 netns $sw1 name swp1
-
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $sw1 name veth0
-	ip link set dev veth1 netns $sw2 name veth0
-
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $h2 name eth0
-	ip link set dev veth1 netns $sw2 name swp1
+	ip -n $h1 link add name eth0 type veth peer name swp1 netns $sw1
+	ip -n $sw1 link add name veth0 type veth peer name veth0 netns $sw2
+	ip -n $h2 link add name eth0 type veth peer name swp1 netns $sw2
 }
 
 setup_host_common()
@@ -396,6 +390,52 @@ neigh_suppress_arp()
 	neigh_suppress_arp_common $vid $sip $tip
 }
 
+neigh_suppress_uc_arp_common()
+{
+	local vid=$1; shift
+	local sip=$1; shift
+	local tip=$1; shift
+	local tmac
+
+	echo
+	echo "Unicast ARP, per-port ARP suppression - VLAN $vid"
+	echo "-----------------------------------------------"
+
+	run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on"
+	run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\""
+	log_test $? 0 "\"neigh_suppress\" is on"
+
+	tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]')
+	run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid"
+	run_cmd "ip -n $sw1 neigh replace $tip lladdr $tmac nud permanent dev br0.$vid"
+
+	run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact"
+	run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto arp flower arp_sip $tip arp_op reply action pass"
+
+	run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact"
+	run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto arp flower arp_tip $sip arp_op reply action pass"
+
+	run_cmd "ip netns exec $h1 mausezahn eth0.$vid -c 1 -a own -b $tmac -t arp 'request sip=$sip, tip=$tip, tmac=$tmac' -q"
+	tc_check_packets $h1 "dev eth0.$vid ingress" 101 1
+	log_test $? 0 "Unicast ARP, suppression on, h1 filter"
+	tc_check_packets $h2 "dev eth0.$vid egress" 101 1
+	log_test $? 0 "Unicast ARP, suppression on, h2 filter"
+}
+
+neigh_suppress_uc_arp()
+{
+	local vid=10
+	local sip=192.0.2.1
+	local tip=192.0.2.2
+
+	neigh_suppress_uc_arp_common $vid $sip $tip
+
+	vid=20
+	sip=192.0.2.17
+	tip=192.0.2.18
+	neigh_suppress_uc_arp_common $vid $sip $tip
+}
+
 neigh_suppress_ns_common()
 {
 	local vid=$1; shift
@@ -502,6 +542,78 @@ neigh_suppress_ns()
 	neigh_suppress_ns_common $vid $saddr $daddr $maddr
 }
 
+icmpv6_header_get()
+{
+	local csum=$1; shift
+	local tip=$1; shift
+	local type
+	local p
+
+	# Type 135 (Neighbor Solicitation), hex format
+	type="87"
+	p=$(:
+		)"$type:"$(                     : ICMPv6.type
+		)"00:"$(                        : ICMPv6.code
+		)"$csum:"$(                     : ICMPv6.checksum
+		)"00:00:00:00:"$(               : Reserved
+	        )"$tip:"$(	                : Target Address
+		)
+	echo $p
+}
+
+neigh_suppress_uc_ns_common()
+{
+	local vid=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local full_dip=$1; shift
+	local csum=$1; shift
+	local tmac
+
+	echo
+	echo "Unicast NS, per-port NS suppression - VLAN $vid"
+	echo "---------------------------------------------"
+
+	run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on"
+	run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\""
+	log_test $? 0 "\"neigh_suppress\" is on"
+
+	tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]')
+	run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid"
+	run_cmd "ip -n $sw1 -6 neigh replace $dip lladdr $tmac nud permanent dev br0.$vid"
+
+	run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact"
+	run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 src_ip $dip type 136 code 0 action pass"
+
+	run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact"
+	run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 dst_ip $sip type 136 code 0 action pass"
+
+	run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a own -b $tmac -A $sip -B $dip -t ip hop=255,next=58,payload=$(icmpv6_header_get $csum $full_dip) -q"
+	tc_check_packets $h1 "dev eth0.$vid ingress" 101 1
+	log_test $? 0 "Unicast NS, suppression on, h1 filter"
+	tc_check_packets $h2 "dev eth0.$vid egress" 101 1
+	log_test $? 0 "Unicast NS, suppression on, h2 filter"
+}
+
+neigh_suppress_uc_ns()
+{
+	local vid=10
+	local saddr=2001:db8:1::1
+	local daddr=2001:db8:1::2
+	local full_daddr=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02
+	local csum="ef:79"
+
+	neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum
+
+	vid=20
+	saddr=2001:db8:2::1
+	daddr=2001:db8:2::2
+	full_daddr=20:01:0d:b8:00:02:00:00:00:00:00:00:00:00:00:02
+	csum="ef:76"
+
+	neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum
+}
+
 neigh_vlan_suppress_arp()
 {
 	local vid1=10
@@ -833,6 +945,11 @@ if [ ! -x "$(command -v jq)" ]; then
 	exit $ksft_skip
 fi
 
+if [ ! -x "$(command -v mausezahn)" ]; then
+	echo "SKIP: Could not run test without mausezahn tool"
+	exit $ksft_skip
+fi
+
 bridge link help 2>&1 | grep -q "neigh_vlan_suppress"
 if [ $? -ne 0 ]; then
    echo "SKIP: iproute2 bridge too old, missing per-VLAN neighbor suppression support"
diff --git a/tools/testing/selftests/net/test_so_rcv.sh b/tools/testing/selftests/net/test_so_rcv.sh
new file mode 100755
index 000000000000..d8aa4362879d
--- /dev/null
+++ b/tools/testing/selftests/net/test_so_rcv.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+HOSTS=("127.0.0.1" "::1")
+PORT=1234
+TOTAL_TESTS=0
+FAILED_TESTS=0
+
+declare -A TESTS=(
+	["SO_RCVPRIORITY"]="-P 2"
+	["SO_RCVMARK"]="-M 3"
+)
+
+check_result() {
+	((TOTAL_TESTS++))
+	if [ "$1" -ne 0 ]; then
+		((FAILED_TESTS++))
+	fi
+}
+
+cleanup()
+{
+	cleanup_ns $NS
+}
+
+trap cleanup EXIT
+
+setup_ns NS
+
+for HOST in "${HOSTS[@]}"; do
+	PROTOCOL="IPv4"
+	if [[ "$HOST" == "::1" ]]; then
+		PROTOCOL="IPv6"
+	fi
+	for test_name in "${!TESTS[@]}"; do
+		echo "Running $test_name test, $PROTOCOL"
+		arg=${TESTS[$test_name]}
+
+		ip netns exec $NS ./so_rcv_listener $arg $HOST $PORT &
+		LISTENER_PID=$!
+
+		sleep 0.5
+
+		if ! ip netns exec $NS ./cmsg_sender $arg $HOST $PORT; then
+			echo "Sender failed for $test_name, $PROTOCOL"
+			kill "$LISTENER_PID" 2>/dev/null
+			wait "$LISTENER_PID"
+			check_result 1
+			continue
+		fi
+
+		wait "$LISTENER_PID"
+		LISTENER_EXIT_CODE=$?
+
+		if [ "$LISTENER_EXIT_CODE" -eq 0 ]; then
+			echo "Rcv test OK for $test_name, $PROTOCOL"
+			check_result 0
+		else
+			echo "Rcv test FAILED for $test_name, $PROTOCOL"
+			check_result 1
+		fi
+	done
+done
+
+if [ "$FAILED_TESTS" -ne 0 ]; then
+	echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed"
+	exit ${KSFT_FAIL}
+else
+	echo "OK - All $TOTAL_TESTS tests passed"
+	exit ${KSFT_PASS}
+fi
diff --git a/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh
index 2d442cdab11e..062f957950af 100755
--- a/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh
+++ b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh
@@ -1,29 +1,114 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-# Check FDB default-remote handling across "ip link set".
+ALL_TESTS="
+	test_set_remote
+	test_change_mc_remote
+"
+source lib.sh
 
 check_remotes()
 {
 	local what=$1; shift
 	local N=$(bridge fdb sh dev vx | grep 00:00:00:00:00:00 | wc -l)
 
-	echo -ne "expected two remotes after $what\t"
-	if [[ $N != 2 ]]; then
-		echo "[FAIL]"
-		EXIT_STATUS=1
+	((N == 2))
+	check_err $? "expected 2 remotes after $what, got $N"
+}
+
+# Check FDB default-remote handling across "ip link set".
+test_set_remote()
+{
+	RET=0
+
+	ip_link_add vx up type vxlan id 2000 dstport 4789
+	bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent
+	bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent
+	check_remotes "fdb append"
+
+	ip link set dev vx type vxlan remote 192.0.2.30
+	check_remotes "link set"
+
+	log_test 'FDB default-remote handling across "ip link set"'
+}
+
+fmt_remote()
+{
+	local addr=$1; shift
+
+	if [[ $addr == 224.* ]]; then
+		echo "group $addr"
 	else
-		echo "[ OK ]"
+		echo "remote $addr"
 	fi
 }
 
-ip link add name vx up type vxlan id 2000 dstport 4789
-bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent
-bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent
-check_remotes "fdb append"
+change_remote()
+{
+	local remote=$1; shift
+
+	ip link set dev vx type vxlan $(fmt_remote $remote) dev v1
+}
+
+check_membership()
+{
+	local check_vec=("$@")
+
+	local memberships
+	memberships=$(
+	    netstat -n --groups |
+		sed -n '/^v1\b/p' |
+		grep -o '[^ ]*$'
+	)
+	check_err $? "Couldn't obtain group memberships"
+
+	local item
+	for item in "${check_vec[@]}"; do
+		eval "local $item"
+		echo "$memberships" | grep -q "\b$group\b"
+		check_err_fail $fail $? "$group is_ex reported in IGMP query response"
+	done
+}
+
+test_change_mc_remote()
+{
+	check_command netstat || return
+
+	ip_link_add v1 up type veth peer name v2
+	ip_link_set_up v2
+
+	RET=0
+
+	ip_link_add vx up type vxlan dstport 4789 \
+		local 192.0.2.1 $(fmt_remote 224.1.1.1) dev v1 vni 1000
+
+	check_membership "group=224.1.1.1 fail=0" \
+			 "group=224.1.1.2 fail=1" \
+			 "group=224.1.1.3 fail=1"
+
+	log_test "MC group report after VXLAN creation"
+
+	RET=0
+
+	change_remote 224.1.1.2
+	check_membership "group=224.1.1.1 fail=1" \
+			 "group=224.1.1.2 fail=0" \
+			 "group=224.1.1.3 fail=1"
+
+	log_test "MC group report after changing VXLAN remote MC->MC"
+
+	RET=0
+
+	change_remote 192.0.2.2
+	check_membership "group=224.1.1.1 fail=1" \
+			 "group=224.1.1.2 fail=1" \
+			 "group=224.1.1.3 fail=1"
+
+	log_test "MC group report after changing VXLAN remote MC->UC"
+}
+
+trap defer_scopes_cleanup EXIT
 
-ip link set dev vx type vxlan remote 192.0.2.30
-check_remotes "link set"
+tests_run
 
-ip link del dev vx
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index f27a12d2a2c9..5ded3b3a7538 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -44,9 +44,11 @@ struct tls_crypto_info_keys {
 };
 
 static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type,
-				 struct tls_crypto_info_keys *tls12)
+				 struct tls_crypto_info_keys *tls12,
+				 char key_generation)
 {
-	memset(tls12, 0, sizeof(*tls12));
+	memset(tls12, key_generation, sizeof(*tls12));
+	memset(tls12, 0, sizeof(struct tls_crypto_info));
 
 	switch (cipher_type) {
 	case TLS_CIPHER_CHACHA20_POLY1305:
@@ -266,6 +268,25 @@ TEST_F(tls_basic, bad_cipher)
 	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, sizeof(struct tls12_crypto_info_aes_gcm_128)), -1);
 }
 
+TEST_F(tls_basic, recseq_wrap)
+{
+	struct tls_crypto_info_keys tls12;
+	char const *test_str = "test_read";
+	int send_len = 10;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0);
+	memset(&tls12.aes128.rec_seq, 0xff, sizeof(tls12.aes128.rec_seq));
+
+	ASSERT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+	ASSERT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+}
+
 FIXTURE(tls)
 {
 	int fd, cfd;
@@ -372,7 +393,7 @@ FIXTURE_SETUP(tls)
 		SKIP(return, "Unsupported cipher in FIPS mode");
 
 	tls_crypto_info_init(variant->tls_version, variant->cipher_type,
-			     &tls12);
+			     &tls12, 0);
 
 	ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
 
@@ -1156,7 +1177,7 @@ TEST_F(tls, bidir)
 		struct tls_crypto_info_keys tls12;
 
 		tls_crypto_info_init(variant->tls_version, variant->cipher_type,
-				     &tls12);
+				     &tls12, 0);
 
 		ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12,
 				 tls12.len);
@@ -1595,7 +1616,7 @@ TEST_F(tls, getsockopt)
 	EXPECT_EQ(get.crypto_info.cipher_type, variant->cipher_type);
 
 	/* get the full crypto_info */
-	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect, 0);
 	len = expect.len;
 	memrnd(&get, sizeof(get));
 	EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &get, &len), 0);
@@ -1649,6 +1670,500 @@ TEST_F(tls, recv_efault)
 		EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0);
 }
 
+#define TLS_RECORD_TYPE_HANDSHAKE      0x16
+/* key_update, length 1, update_not_requested */
+static const char key_update_msg[] = "\x18\x00\x00\x01\x00";
+static void tls_send_keyupdate(struct __test_metadata *_metadata, int fd)
+{
+	size_t len = sizeof(key_update_msg);
+
+	EXPECT_EQ(tls_send_cmsg(fd, TLS_RECORD_TYPE_HANDSHAKE,
+				(char *)key_update_msg, len, 0),
+		  len);
+}
+
+static void tls_recv_keyupdate(struct __test_metadata *_metadata, int fd, int flags)
+{
+	char buf[100];
+
+	EXPECT_EQ(tls_recv_cmsg(_metadata, fd, TLS_RECORD_TYPE_HANDSHAKE, buf, sizeof(buf), flags),
+		  sizeof(key_update_msg));
+	EXPECT_EQ(memcmp(buf, key_update_msg, sizeof(key_update_msg)), 0);
+}
+
+/* set the key to 0 then 1 for RX, immediately to 1 for TX */
+TEST_F(tls_basic, rekey_rx)
+{
+	struct tls_crypto_info_keys tls12_0, tls12_1;
+	char const *test_str = "test_message";
+	int send_len = strlen(test_str) + 1;
+	char buf[20];
+	int ret;
+
+	if (self->notls)
+		return;
+
+	tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128,
+			     &tls12_0, 0);
+	tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128,
+			     &tls12_1, 1);
+
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_0, tls12_0.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len);
+	EXPECT_EQ(ret, 0);
+
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/* set the key to 0 then 1 for TX, immediately to 1 for RX */
+TEST_F(tls_basic, rekey_tx)
+{
+	struct tls_crypto_info_keys tls12_0, tls12_1;
+	char const *test_str = "test_message";
+	int send_len = strlen(test_str) + 1;
+	char buf[20];
+	int ret;
+
+	if (self->notls)
+		return;
+
+	tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128,
+			     &tls12_0, 0);
+	tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128,
+			     &tls12_1, 1);
+
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_0, tls12_0.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len);
+	EXPECT_EQ(ret, 0);
+
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+TEST_F(tls_basic, disconnect)
+{
+	char const *test_str = "test_message";
+	int send_len = strlen(test_str) + 1;
+	struct tls_crypto_info_keys key;
+	struct sockaddr_in addr;
+	char buf[20];
+	int ret;
+
+	if (self->notls)
+		return;
+
+	tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128,
+			     &key, 0);
+
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &key, key.len);
+	ASSERT_EQ(ret, 0);
+
+	/* Pre-queue the data so that setsockopt parses it but doesn't
+	 * dequeue it from the TCP socket. recvmsg would dequeue.
+	 */
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+
+	ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &key, key.len);
+	ASSERT_EQ(ret, 0);
+
+	addr.sin_family = AF_UNSPEC;
+	addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	addr.sin_port = 0;
+	ret = connect(self->cfd, &addr, sizeof(addr));
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, EOPNOTSUPP);
+
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+}
+
+TEST_F(tls, rekey)
+{
+	char const *test_str_1 = "test_message_before_rekey";
+	char const *test_str_2 = "test_message_after_rekey";
+	struct tls_crypto_info_keys tls12;
+	int send_len;
+	char buf[100];
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	/* initial send/recv */
+	send_len = strlen(test_str_1) + 1;
+	EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+	EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0);
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	/* send after rekey */
+	send_len = strlen(test_str_2) + 1;
+	EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len);
+
+	/* can't receive the KeyUpdate without a control message */
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	/* recv blocking -> -EKEYEXPIRED */
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EKEYEXPIRED);
+
+	/* recv non-blocking -> -EKEYEXPIRED */
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1);
+	EXPECT_EQ(errno, EKEYEXPIRED);
+
+	/* update RX key */
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	/* recv after rekey */
+	EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1);
+	EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0);
+}
+
+TEST_F(tls, rekey_fail)
+{
+	char const *test_str_1 = "test_message_before_rekey";
+	char const *test_str_2 = "test_message_after_rekey";
+	struct tls_crypto_info_keys tls12;
+	int send_len;
+	char buf[100];
+
+	/* initial send/recv */
+	send_len = strlen(test_str_1) + 1;
+	EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+	EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0);
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+
+	if (variant->tls_version != TLS_1_3_VERSION) {
+		/* just check that rekey is not supported and return */
+		tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+		EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1);
+		EXPECT_EQ(errno, EBUSY);
+		return;
+	}
+
+	/* successful update */
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	/* invalid update: change of version */
+	tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* invalid update (RX socket): change of version */
+	tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* invalid update: change of cipher */
+	if (variant->cipher_type == TLS_CIPHER_AES_GCM_256)
+		tls_crypto_info_init(variant->tls_version, TLS_CIPHER_CHACHA20_POLY1305, &tls12, 1);
+	else
+		tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_256, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* send after rekey, the invalid updates shouldn't have an effect */
+	send_len = strlen(test_str_2) + 1;
+	EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len);
+
+	/* can't receive the KeyUpdate without a control message */
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	/* recv blocking -> -EKEYEXPIRED */
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EKEYEXPIRED);
+
+	/* recv non-blocking -> -EKEYEXPIRED */
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1);
+	EXPECT_EQ(errno, EKEYEXPIRED);
+
+	/* update RX key */
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	/* recv after rekey */
+	EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1);
+	EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0);
+}
+
+TEST_F(tls, rekey_peek)
+{
+	char const *test_str_1 = "test_message_before_rekey";
+	struct tls_crypto_info_keys tls12;
+	int send_len;
+	char buf[100];
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	send_len = strlen(test_str_1) + 1;
+	EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len);
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len);
+	EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0);
+
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len);
+	EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0);
+
+	/* can't receive the KeyUpdate without a control message */
+	EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_PEEK), -1);
+
+	/* peek KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	/* update RX key */
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+}
+
+TEST_F(tls, splice_rekey)
+{
+	int send_len = TLS_PAYLOAD_MAX_LEN / 2;
+	char mem_send[TLS_PAYLOAD_MAX_LEN];
+	char mem_recv[TLS_PAYLOAD_MAX_LEN];
+	struct tls_crypto_info_keys tls12;
+	int p[2];
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	memrnd(mem_send, sizeof(mem_send));
+
+	ASSERT_GE(pipe(p), 0);
+	EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len);
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len);
+
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len);
+	EXPECT_EQ(read(p[0], mem_recv, send_len), send_len);
+	EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+
+	/* can't splice the KeyUpdate */
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* peek KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	/* can't splice before updating the key */
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1);
+	EXPECT_EQ(errno, EKEYEXPIRED);
+
+	/* update RX key */
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len);
+	EXPECT_EQ(read(p[0], mem_recv, send_len), send_len);
+	EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+TEST_F(tls, rekey_peek_splice)
+{
+	char const *test_str_1 = "test_message_before_rekey";
+	struct tls_crypto_info_keys tls12;
+	int send_len;
+	char buf[100];
+	char mem_recv[TLS_PAYLOAD_MAX_LEN];
+	int p[2];
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	ASSERT_GE(pipe(p), 0);
+
+	send_len = strlen(test_str_1) + 1;
+	EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len);
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len);
+	EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0);
+
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len);
+	EXPECT_EQ(read(p[0], mem_recv, send_len), send_len);
+	EXPECT_EQ(memcmp(mem_recv, test_str_1, send_len), 0);
+}
+
+TEST_F(tls, rekey_getsockopt)
+{
+	struct tls_crypto_info_keys tls12;
+	struct tls_crypto_info_keys tls12_get;
+	socklen_t len;
+
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 0);
+
+	len = tls12.len;
+	EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0);
+	EXPECT_EQ(len, tls12.len);
+	EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0);
+
+	len = tls12.len;
+	EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0);
+	EXPECT_EQ(len, tls12.len);
+	EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0);
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+	EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+	len = tls12.len;
+	EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0);
+	EXPECT_EQ(len, tls12.len);
+	EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0);
+
+	len = tls12.len;
+	EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0);
+	EXPECT_EQ(len, tls12.len);
+	EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0);
+}
+
+TEST_F(tls, rekey_poll_pending)
+{
+	char const *test_str = "test_message_after_rekey";
+	struct tls_crypto_info_keys tls12;
+	struct pollfd pfd = { };
+	int send_len;
+	int ret;
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	/* send immediately after rekey */
+	send_len = strlen(test_str) + 1;
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+
+	/* key hasn't been updated, expect cfd to be non-readable */
+	pfd.fd = self->cfd;
+	pfd.events = POLLIN;
+	EXPECT_EQ(poll(&pfd, 1, 0), 0);
+
+	ret = fork();
+	ASSERT_GE(ret, 0);
+
+	if (ret) {
+		int pid2, status;
+
+		/* wait before installing the new key */
+		sleep(1);
+
+		/* update RX key while poll() is sleeping */
+		EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+		pid2 = wait(&status);
+		EXPECT_EQ(pid2, ret);
+		EXPECT_EQ(status, 0);
+	} else {
+		pfd.fd = self->cfd;
+		pfd.events = POLLIN;
+		EXPECT_EQ(poll(&pfd, 1, 5000), 1);
+
+		exit(!__test_passed(_metadata));
+	}
+}
+
+TEST_F(tls, rekey_poll_delay)
+{
+	char const *test_str = "test_message_after_rekey";
+	struct tls_crypto_info_keys tls12;
+	struct pollfd pfd = { };
+	int send_len;
+	int ret;
+
+	if (variant->tls_version != TLS_1_3_VERSION)
+		return;
+
+	/* update TX key */
+	tls_send_keyupdate(_metadata, self->fd);
+	tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1);
+	EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0);
+
+	/* get KeyUpdate */
+	tls_recv_keyupdate(_metadata, self->cfd, 0);
+
+	ret = fork();
+	ASSERT_GE(ret, 0);
+
+	if (ret) {
+		int pid2, status;
+
+		/* wait before installing the new key */
+		sleep(1);
+
+		/* update RX key while poll() is sleeping */
+		EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0);
+
+		sleep(1);
+		send_len = strlen(test_str) + 1;
+		EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+
+		pid2 = wait(&status);
+		EXPECT_EQ(pid2, ret);
+		EXPECT_EQ(status, 0);
+	} else {
+		pfd.fd = self->cfd;
+		pfd.events = POLLIN;
+		EXPECT_EQ(poll(&pfd, 1, 5000), 1);
+		exit(!__test_passed(_metadata));
+	}
+}
+
 FIXTURE(tls_err)
 {
 	int fd, cfd;
@@ -1677,7 +2192,7 @@ FIXTURE_SETUP(tls_err)
 	int ret;
 
 	tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_128,
-			     &tls12);
+			     &tls12, 0);
 
 	ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
 	ulp_sock_pair(_metadata, &self->fd2, &self->cfd2, &self->notls);
@@ -2099,7 +2614,7 @@ TEST(tls_v6ops) {
 	int sfd, ret, fd;
 	socklen_t len, len2;
 
-	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12);
+	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0);
 
 	addr.sin6_family = AF_INET6;
 	addr.sin6_addr = in6addr_any;
@@ -2158,7 +2673,7 @@ TEST(prequeue) {
 	len = sizeof(addr);
 	memrnd(buf, sizeof(buf));
 
-	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12);
+	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12, 0);
 
 	addr.sin_family = AF_INET;
 	addr.sin_addr.s_addr = htonl(INADDR_ANY);
diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
index ec60a16c9307..dae91eb97d69 100644
--- a/tools/testing/selftests/net/txtimestamp.c
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -77,6 +77,8 @@ static bool cfg_epollet;
 static bool cfg_do_listen;
 static uint16_t dest_port = 9000;
 static bool cfg_print_nsec;
+static uint32_t ts_opt_id;
+static bool cfg_use_cmsg_opt_id;
 
 static struct sockaddr_in daddr;
 static struct sockaddr_in6 daddr6;
@@ -136,12 +138,13 @@ static void validate_key(int tskey, int tstype)
 	/* compare key for each subsequent request
 	 * must only test for one type, the first one requested
 	 */
-	if (saved_tskey == -1)
+	if (saved_tskey == -1 || cfg_use_cmsg_opt_id)
 		saved_tskey_type = tstype;
 	else if (saved_tskey_type != tstype)
 		return;
 
 	stepsize = cfg_proto == SOCK_STREAM ? cfg_payload_len : 1;
+	stepsize = cfg_use_cmsg_opt_id ? 0 : stepsize;
 	if (tskey != saved_tskey + stepsize) {
 		fprintf(stderr, "ERROR: key %d, expected %d\n",
 				tskey, saved_tskey + stepsize);
@@ -356,8 +359,12 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
 		}
 	}
 
-	if (batch > 1)
+	if (batch > 1) {
 		fprintf(stderr, "batched %d timestamps\n", batch);
+	} else if (!batch) {
+		fprintf(stderr, "Failed to report timestamps\n");
+		test_failed = true;
+	}
 }
 
 static int recv_errmsg(int fd)
@@ -480,7 +487,7 @@ static void fill_header_udp(void *p, bool is_ipv4)
 
 static void do_test(int family, unsigned int report_opt)
 {
-	char control[CMSG_SPACE(sizeof(uint32_t))];
+	char control[2 * CMSG_SPACE(sizeof(uint32_t))];
 	struct sockaddr_ll laddr;
 	unsigned int sock_opt;
 	struct cmsghdr *cmsg;
@@ -620,18 +627,32 @@ static void do_test(int family, unsigned int report_opt)
 		msg.msg_iov = &iov;
 		msg.msg_iovlen = 1;
 
-		if (cfg_use_cmsg) {
+		if (cfg_use_cmsg || cfg_use_cmsg_opt_id) {
 			memset(control, 0, sizeof(control));
 
 			msg.msg_control = control;
-			msg.msg_controllen = sizeof(control);
+			msg.msg_controllen = cfg_use_cmsg * CMSG_SPACE(sizeof(uint32_t));
+			msg.msg_controllen += cfg_use_cmsg_opt_id * CMSG_SPACE(sizeof(uint32_t));
+
+			cmsg = NULL;
+			if (cfg_use_cmsg) {
+				cmsg = CMSG_FIRSTHDR(&msg);
+				cmsg->cmsg_level = SOL_SOCKET;
+				cmsg->cmsg_type = SO_TIMESTAMPING;
+				cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 
-			cmsg = CMSG_FIRSTHDR(&msg);
-			cmsg->cmsg_level = SOL_SOCKET;
-			cmsg->cmsg_type = SO_TIMESTAMPING;
-			cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+				*((uint32_t *)CMSG_DATA(cmsg)) = report_opt;
+			}
+			if (cfg_use_cmsg_opt_id) {
+				cmsg = cmsg ? CMSG_NXTHDR(&msg, cmsg) : CMSG_FIRSTHDR(&msg);
+				cmsg->cmsg_level = SOL_SOCKET;
+				cmsg->cmsg_type = SCM_TS_OPT_ID;
+				cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+
+				*((uint32_t *)CMSG_DATA(cmsg)) = ts_opt_id;
+				saved_tskey = ts_opt_id;
+			}
 
-			*((uint32_t *) CMSG_DATA(cmsg)) = report_opt;
 		}
 
 		val = sendmsg(fd, &msg, 0);
@@ -681,6 +702,7 @@ static void __attribute__((noreturn)) usage(const char *filepath)
 			"  -L    listen on hostname and port\n"
 			"  -n:   set no-payload option\n"
 			"  -N:   print timestamps and durations in nsec (instead of usec)\n"
+			"  -o N: use SCM_TS_OPT_ID control message to provide N as tskey\n"
 			"  -p N: connect to port N\n"
 			"  -P:   use PF_PACKET\n"
 			"  -r:   use raw\n"
@@ -701,7 +723,7 @@ static void parse_opt(int argc, char **argv)
 	int c;
 
 	while ((c = getopt(argc, argv,
-				"46bc:CeEFhIl:LnNp:PrRS:t:uv:V:x")) != -1) {
+				"46bc:CeEFhIl:LnNo:p:PrRS:t:uv:V:x")) != -1) {
 		switch (c) {
 		case '4':
 			do_ipv6 = 0;
@@ -742,6 +764,10 @@ static void parse_opt(int argc, char **argv)
 		case 'N':
 			cfg_print_nsec = true;
 			break;
+		case 'o':
+			ts_opt_id = strtoul(optarg, NULL, 10);
+			cfg_use_cmsg_opt_id = true;
+			break;
 		case 'p':
 			dest_port = strtoul(optarg, NULL, 10);
 			break;
@@ -799,6 +825,8 @@ static void parse_opt(int argc, char **argv)
 		error(1, 0, "cannot ask for pktinfo over pf_packet");
 	if (cfg_busy_poll && cfg_use_epoll)
 		error(1, 0, "pass epoll or busy_poll, not both");
+	if (cfg_proto == SOCK_STREAM && cfg_use_cmsg_opt_id)
+		error(1, 0, "TCP sockets don't support SCM_TS_OPT_ID");
 
 	if (optind != argc - 1)
 		error(1, 0, "missing required hostname argument");
diff --git a/tools/testing/selftests/net/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh
index 25baca4b148e..fe4649bb8786 100755
--- a/tools/testing/selftests/net/txtimestamp.sh
+++ b/tools/testing/selftests/net/txtimestamp.sh
@@ -37,11 +37,13 @@ run_test_v4v6() {
 run_test_tcpudpraw() {
 	local -r args=$@
 
-	run_test_v4v6 ${args}		# tcp
-	run_test_v4v6 ${args} -u	# udp
-	run_test_v4v6 ${args} -r	# raw
-	run_test_v4v6 ${args} -R	# raw (IPPROTO_RAW)
-	run_test_v4v6 ${args} -P	# pf_packet
+	run_test_v4v6 ${args}		  # tcp
+	run_test_v4v6 ${args} -u	  # udp
+	run_test_v4v6 ${args} -u -o 42	  # udp with fixed tskey
+	run_test_v4v6 ${args} -r	  # raw
+	run_test_v4v6 ${args} -r -o 42	  # raw
+	run_test_v4v6 ${args} -R	  # raw (IPPROTO_RAW)
+	run_test_v4v6 ${args} -P	  # pf_packet
 }
 
 run_test_all() {
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
index 8802604148dd..1dc337c709f8 100755
--- a/tools/testing/selftests/net/udpgro.sh
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -3,12 +3,10 @@
 #
 # Run a series of udpgro functional tests.
 
-source net_helper.sh
+source lib.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
-
 # set global exit status, but never reset nonzero one.
 check_err()
 {
@@ -38,7 +36,7 @@ cfg_veth() {
 	ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
 	ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
 	ip -netns "${PEER_NS}" link set dev veth1 up
-	ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp
+	ip netns exec "${PEER_NS}" ethtool -K veth1 gro on
 }
 
 run_one() {
@@ -46,17 +44,19 @@ run_one() {
 	local -r all="$@"
 	local -r tx_args=${all%rx*}
 	local -r rx_args=${all#*rx}
+	local ret=0
 
 	cfg_veth
 
-	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \
-		echo "ok" || \
-		echo "failed" &
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} &
+	local PID1=$!
 
 	wait_local_port_listen ${PEER_NS} 8000 udp
 	./udpgso_bench_tx ${tx_args}
-	ret=$?
-	wait $(jobs -p)
+	check_err $?
+	wait ${PID1}
+	check_err $?
+	[ "$ret" -eq 0 ] && echo "ok" || echo "failed"
 	return $ret
 }
 
@@ -73,6 +73,7 @@ run_one_nat() {
 	local -r all="$@"
 	local -r tx_args=${all%rx*}
 	local -r rx_args=${all#*rx}
+	local ret=0
 
 	if [[ ${tx_args} = *-4* ]]; then
 		ipt_cmd=iptables
@@ -93,16 +94,17 @@ run_one_nat() {
 	# ... so that GRO will match the UDP_GRO enabled socket, but packets
 	# will land on the 'plain' one
 	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 &
-	pid=$!
-	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \
-		echo "ok" || \
-		echo "failed"&
+	local PID1=$!
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} &
+	local PID2=$!
 
 	wait_local_port_listen "${PEER_NS}" 8000 udp
 	./udpgso_bench_tx ${tx_args}
-	ret=$?
-	kill -INT $pid
-	wait $(jobs -p)
+	check_err $?
+	kill -INT ${PID1}
+	wait ${PID2}
+	check_err $?
+	[ "$ret" -eq 0 ] && echo "ok" || echo "failed"
 	return $ret
 }
 
@@ -111,20 +113,26 @@ run_one_2sock() {
 	local -r all="$@"
 	local -r tx_args=${all%rx*}
 	local -r rx_args=${all#*rx}
+	local ret=0
 
 	cfg_veth
 
 	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 &
-	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \
-		echo "ok" || \
-		echo "failed" &
+	local PID1=$!
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} &
+	local PID2=$!
 
 	wait_local_port_listen "${PEER_NS}" 12345 udp
 	./udpgso_bench_tx ${tx_args} -p 12345
+	check_err $?
 	wait_local_port_listen "${PEER_NS}" 8000 udp
 	./udpgso_bench_tx ${tx_args}
-	ret=$?
-	wait $(jobs -p)
+	check_err $?
+	wait ${PID1}
+	check_err $?
+	wait ${PID2}
+	check_err $?
+	[ "$ret" -eq 0 ] && echo "ok" || echo "failed"
 	return $ret
 }
 
@@ -196,11 +204,6 @@ run_all() {
 	return $ret
 }
 
-if [ ! -f ${BPF_FILE} ]; then
-	echo "Missing ${BPF_FILE}. Run 'make' first"
-	exit -1
-fi
-
 if [[ $# -eq 0 ]]; then
 	run_all
 elif [[ $1 == "__subprocess" ]]; then
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
index 7080eae5312b..54fa4821bc5e 100755
--- a/tools/testing/selftests/net/udpgro_bench.sh
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -3,11 +3,11 @@
 #
 # Run a series of udpgro benchmarks
 
-source net_helper.sh
+source lib.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="lib/xdp_dummy.bpf.o"
 
 cleanup() {
 	local -r jobs="$(jobs -p)"
diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh
index e1ff645bd3d1..9a2cfec1153e 100755
--- a/tools/testing/selftests/net/udpgro_frglist.sh
+++ b/tools/testing/selftests/net/udpgro_frglist.sh
@@ -3,11 +3,11 @@
 #
 # Run a series of udpgro benchmarks
 
-source net_helper.sh
+source lib.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="lib/xdp_dummy.bpf.o"
 
 cleanup() {
 	local -r jobs="$(jobs -p)"
@@ -42,8 +42,8 @@ run_one() {
 
 	ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp
 	tc -n "${PEER_NS}" qdisc add dev veth1 clsact
-	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6  direct-action
-	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.bpf.o section schedcls/ingress6/nat_6  direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action
         echo ${rx_args}
 	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
 
@@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then
 	exit -1
 fi
 
-if [ ! -f nat6to4.o ]; then
+if [ ! -f nat6to4.bpf.o ]; then
 	echo "Missing nat6to4 helper. Run 'make' first"
 	exit -1
 fi
diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index 83ed987cff34..a39fdc4aa2ff 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-source net_helper.sh
+source lib.sh
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="lib/xdp_dummy.bpf.o"
 readonly BASE="ns-$(mktemp -u XXXXXX)"
 readonly SRC=2
 readonly DST=1
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 85b3baa3f7f3..36ff28af4b19 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -53,6 +53,7 @@ static bool		cfg_do_ipv6;
 static bool		cfg_do_connected;
 static bool		cfg_do_connectionless;
 static bool		cfg_do_msgmore;
+static bool		cfg_do_recv = true;
 static bool		cfg_do_setsockopt;
 static int		cfg_specific_test_id = -1;
 
@@ -66,6 +67,7 @@ struct testcase {
 	int gso_len;		/* mss after applying gso */
 	int r_num_mss;		/* recv(): number of calls of full mss */
 	int r_len_last;		/* recv(): size of last non-mss dgram, if any */
+	bool v6_ext_hdr;	/* send() dgrams with IPv6 extension headers */
 };
 
 const struct in6_addr addr6 = {
@@ -76,6 +78,8 @@ const struct in_addr addr4 = {
 	__constant_htonl(0x0a000001), /* 10.0.0.1 */
 };
 
+static const char ipv6_hopopts_pad1[8] = { 0 };
+
 struct testcase testcases_v4[] = {
 	{
 		/* no GSO: send a single byte */
@@ -99,6 +103,19 @@ struct testcase testcases_v4[] = {
 		.r_num_mss = 1,
 	},
 	{
+		/* datalen <= MSS < gso_len: will fall back to no GSO */
+		.tlen = CONST_MSS_V4,
+		.gso_len = CONST_MSS_V4 + 1,
+		.r_num_mss = 0,
+		.r_len_last = CONST_MSS_V4,
+	},
+	{
+		/* MSS < datalen < gso_len: fail */
+		.tlen = CONST_MSS_V4 + 1,
+		.gso_len = CONST_MSS_V4 + 2,
+		.tfail = true,
+	},
+	{
 		/* send a single MSS + 1B */
 		.tlen = CONST_MSS_V4 + 1,
 		.gso_len = CONST_MSS_V4,
@@ -202,6 +219,19 @@ struct testcase testcases_v6[] = {
 		.r_num_mss = 1,
 	},
 	{
+		/* datalen <= MSS < gso_len: will fall back to no GSO */
+		.tlen = CONST_MSS_V6,
+		.gso_len = CONST_MSS_V6 + 1,
+		.r_num_mss = 0,
+		.r_len_last = CONST_MSS_V6,
+	},
+	{
+		/* MSS < datalen < gso_len: fail */
+		.tlen = CONST_MSS_V6 + 1,
+		.gso_len = CONST_MSS_V6 + 2,
+		.tfail = true
+	},
+	{
 		/* send a single MSS + 1B */
 		.tlen = CONST_MSS_V6 + 1,
 		.gso_len = CONST_MSS_V6,
@@ -255,6 +285,13 @@ struct testcase testcases_v6[] = {
 		.r_num_mss = 2,
 	},
 	{
+		/* send 2 1B segments with extension headers */
+		.tlen = 2,
+		.gso_len = 1,
+		.r_num_mss = 2,
+		.v6_ext_hdr = true,
+	},
+	{
 		/* send 2B + 2B + 1B segments */
 		.tlen = 5,
 		.gso_len = 2,
@@ -395,11 +432,18 @@ static void run_one(struct testcase *test, int fdt, int fdr,
 	int i, ret, val, mss;
 	bool sent;
 
-	fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
+	fprintf(stderr, "ipv%d tx:%d gso:%d %s%s\n",
 			addr->sa_family == AF_INET ? 4 : 6,
 			test->tlen, test->gso_len,
+			test->v6_ext_hdr ? "ext-hdr " : "",
 			test->tfail ? "(fail)" : "");
 
+	if (test->v6_ext_hdr) {
+		if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS,
+			       ipv6_hopopts_pad1, sizeof(ipv6_hopopts_pad1)))
+			error(1, errno, "setsockopt ipv6 hopopts");
+	}
+
 	val = test->gso_len;
 	if (cfg_do_setsockopt) {
 		if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
@@ -411,9 +455,18 @@ static void run_one(struct testcase *test, int fdt, int fdr,
 		error(1, 0, "send succeeded while expecting failure");
 	if (!sent && !test->tfail)
 		error(1, 0, "send failed while expecting success");
+
+	if (test->v6_ext_hdr) {
+		if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0))
+			error(1, errno, "setsockopt ipv6 hopopts clear");
+	}
+
 	if (!sent)
 		return;
 
+	if (!cfg_do_recv)
+		return;
+
 	if (test->gso_len)
 		mss = test->gso_len;
 	else
@@ -464,8 +517,10 @@ static void run_test(struct sockaddr *addr, socklen_t alen)
 	if (fdr == -1)
 		error(1, errno, "socket r");
 
-	if (bind(fdr, addr, alen))
-		error(1, errno, "bind");
+	if (cfg_do_recv) {
+		if (bind(fdr, addr, alen))
+			error(1, errno, "bind");
+	}
 
 	/* Have tests fail quickly instead of hang */
 	if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
@@ -524,7 +579,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "46cCmst:")) != -1) {
+	while ((c = getopt(argc, argv, "46cCmRst:")) != -1) {
 		switch (c) {
 		case '4':
 			cfg_do_ipv4 = true;
@@ -541,6 +596,9 @@ static void parse_opts(int argc, char **argv)
 		case 'm':
 			cfg_do_msgmore = true;
 			break;
+		case 'R':
+			cfg_do_recv = false;
+			break;
 		case 's':
 			cfg_do_setsockopt = true;
 			break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 6c63178086b0..85d1fa3c1ff7 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -27,6 +27,31 @@ test_route_mtu() {
 	ip route add local fd00::1/128 table local dev lo mtu 1500
 }
 
+setup_dummy_sink() {
+	ip link add name sink mtu 1500 type dummy
+	ip addr add dev sink 10.0.0.0/24
+	ip addr add dev sink fd00::2/64 nodad
+	ip link set dev sink up
+}
+
+test_hw_gso_hw_csum() {
+	setup_dummy_sink
+	ethtool -K sink tx-checksum-ip-generic on >/dev/null
+	ethtool -K sink tx-udp-segmentation on >/dev/null
+}
+
+test_sw_gso_hw_csum() {
+	setup_dummy_sink
+	ethtool -K sink tx-checksum-ip-generic on >/dev/null
+	ethtool -K sink tx-udp-segmentation off >/dev/null
+}
+
+test_sw_gso_sw_csum() {
+	setup_dummy_sink
+	ethtool -K sink tx-checksum-ip-generic off >/dev/null
+	ethtool -K sink tx-udp-segmentation off >/dev/null
+}
+
 if [ "$#" -gt 0 ]; then
 	"$1"
 	shift 2 # pop "test_*" arg and "--" delimiter
@@ -56,3 +81,21 @@ echo "ipv4 msg_more"
 
 echo "ipv6 msg_more"
 ./in_netns.sh "$0" test_dev_mtu -- ./udpgso -6 -C -m
+
+echo "ipv4 hw-gso hw-csum"
+./in_netns.sh "$0" test_hw_gso_hw_csum -- ./udpgso -4 -C -R
+
+echo "ipv6 hw-gso hw-csum"
+./in_netns.sh "$0" test_hw_gso_hw_csum -- ./udpgso -6 -C -R
+
+echo "ipv4 sw-gso hw-csum"
+./in_netns.sh "$0" test_sw_gso_hw_csum -- ./udpgso -4 -C -R
+
+echo "ipv6 sw-gso hw-csum"
+./in_netns.sh "$0" test_sw_gso_hw_csum -- ./udpgso -6 -C -R
+
+echo "ipv4 sw-gso sw-csum"
+./in_netns.sh "$0" test_sw_gso_sw_csum -- ./udpgso -4 -C -R
+
+echo "ipv6 sw-gso sw-csum"
+./in_netns.sh "$0" test_sw_gso_sw_csum -- ./udpgso -6 -C -R
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
index 640bc43452fa..88fa1d53ba2b 100755
--- a/tools/testing/selftests/net/udpgso_bench.sh
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -92,6 +92,9 @@ run_udp() {
 	echo "udp"
 	run_in_netns ${args}
 
+	echo "udp sendmmsg"
+	run_in_netns ${args} -m
+
 	echo "udp gso"
 	run_in_netns ${args} -S 0
 
diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh
index f52aa5f7da52..3e751234ccfe 100755
--- a/tools/testing/selftests/net/unicast_extensions.sh
+++ b/tools/testing/selftests/net/unicast_extensions.sh
@@ -30,14 +30,7 @@
 
 source lib.sh
 
-# nettest can be run from PATH or from same directory as this selftest
-if ! which nettest >/dev/null; then
-	PATH=$PWD:$PATH
-	if ! which nettest >/dev/null; then
-		echo "'nettest' command not found; skipping tests"
-		exit $ksft_skip
-	fi
-fi
+check_gen_prog "nettest"
 
 result=0
 
diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
index 3a394b43e274..9709dd067c72 100755
--- a/tools/testing/selftests/net/veth.sh
+++ b/tools/testing/selftests/net/veth.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="lib/xdp_dummy.bpf.o"
 readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
 readonly BASE=`basename $STATS`
 readonly SRC=2
@@ -46,8 +46,6 @@ create_ns() {
 		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
 		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
 	done
-	echo "#kernel" > $BASE
-	chmod go-rw $BASE
 }
 
 __chk_flag() {
diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh
new file mode 100755
index 000000000000..e7cb8c678bde
--- /dev/null
+++ b/tools/testing/selftests/net/vlan_bridge_binding.sh
@@ -0,0 +1,256 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+ALL_TESTS="
+	test_binding_on
+	test_binding_off
+	test_binding_toggle_on
+	test_binding_toggle_off
+	test_binding_toggle_on_when_upper_down
+	test_binding_toggle_off_when_upper_down
+	test_binding_toggle_on_when_lower_down
+	test_binding_toggle_off_when_lower_down
+"
+
+setup_prepare()
+{
+	local port
+
+	ip_link_add br up type bridge vlan_filtering 1
+
+	for port in d1 d2 d3; do
+		ip_link_add $port type veth peer name r$port
+		ip_link_set_up $port
+		ip_link_set_up r$port
+		ip_link_set_master $port br
+	done
+
+	bridge_vlan_add vid 11 dev br self
+	bridge_vlan_add vid 11 dev d1 master
+
+	bridge_vlan_add vid 12 dev br self
+	bridge_vlan_add vid 12 dev d2 master
+
+	bridge_vlan_add vid 13 dev br self
+	bridge_vlan_add vid 13 dev d1 master
+	bridge_vlan_add vid 13 dev d2 master
+
+	bridge_vlan_add vid 14 dev br self
+	bridge_vlan_add vid 14 dev d1 master
+	bridge_vlan_add vid 14 dev d2 master
+	bridge_vlan_add vid 14 dev d3 master
+}
+
+operstate_is()
+{
+	local dev=$1; shift
+	local expect=$1; shift
+
+	local operstate=$(ip -j link show $dev | jq -r .[].operstate)
+	if [[ $operstate == UP ]]; then
+		operstate=1
+	elif [[ $operstate == DOWN || $operstate == LOWERLAYERDOWN ]]; then
+		operstate=0
+	fi
+	echo -n $operstate
+	[[ $operstate == $expect ]]
+}
+
+check_operstate()
+{
+	local dev=$1; shift
+	local expect=$1; shift
+	local operstate
+
+	operstate=$(busywait 1000 \
+			operstate_is "$dev" "$expect")
+	check_err $? "Got operstate of $operstate, expected $expect"
+}
+
+add_one_vlan()
+{
+	local link=$1; shift
+	local id=$1; shift
+
+	ip_link_add $link.$id link $link type vlan id $id "$@"
+}
+
+add_vlans()
+{
+	add_one_vlan br 11 "$@"
+	add_one_vlan br 12 "$@"
+	add_one_vlan br 13 "$@"
+	add_one_vlan br 14 "$@"
+}
+
+set_vlans()
+{
+	ip link set dev br.11 "$@"
+	ip link set dev br.12 "$@"
+	ip link set dev br.13 "$@"
+	ip link set dev br.14 "$@"
+}
+
+down_netdevs()
+{
+	local dev
+
+	for dev in "$@"; do
+		ip_link_set_down $dev
+	done
+}
+
+check_operstates()
+{
+	local opst_11=$1; shift
+	local opst_12=$1; shift
+	local opst_13=$1; shift
+	local opst_14=$1; shift
+
+	check_operstate br.11 $opst_11
+	check_operstate br.12 $opst_12
+	check_operstate br.13 $opst_13
+	check_operstate br.14 $opst_14
+}
+
+do_test_binding()
+{
+	local inject=$1; shift
+	local what=$1; shift
+	local opsts_d1=$1; shift
+	local opsts_d2=$1; shift
+	local opsts_d12=$1; shift
+	local opsts_d123=$1; shift
+
+	RET=0
+
+	defer_scope_push
+		down_netdevs d1
+		$inject
+		check_operstates $opsts_d1
+	defer_scope_pop
+
+	defer_scope_push
+		down_netdevs d2
+		$inject
+		check_operstates $opsts_d2
+	defer_scope_pop
+
+	defer_scope_push
+		down_netdevs d1 d2
+		$inject
+		check_operstates $opsts_d12
+	defer_scope_pop
+
+	defer_scope_push
+		down_netdevs d1 d2 d3
+		$inject
+		check_operstates $opsts_d123
+	defer_scope_pop
+
+	log_test "Test bridge_binding $what"
+}
+
+do_test_binding_on()
+{
+	local inject=$1; shift
+	local what=$1; shift
+
+	do_test_binding "$inject" "$what"	\
+			"0 1 1 1"		\
+			"1 0 1 1"		\
+			"0 0 0 1"		\
+			"0 0 0 0"
+}
+
+do_test_binding_off()
+{
+	local inject=$1; shift
+	local what=$1; shift
+
+	do_test_binding "$inject" "$what"	\
+			"1 1 1 1"		\
+			"1 1 1 1"		\
+			"1 1 1 1"		\
+			"0 0 0 0"
+}
+
+test_binding_on()
+{
+	add_vlans bridge_binding on
+	set_vlans up
+	do_test_binding_on : "on"
+}
+
+test_binding_off()
+{
+	add_vlans bridge_binding off
+	set_vlans up
+	do_test_binding_off : "off"
+}
+
+test_binding_toggle_on()
+{
+	add_vlans bridge_binding off
+	set_vlans up
+	set_vlans type vlan bridge_binding on
+	do_test_binding_on : "off->on"
+}
+
+test_binding_toggle_off()
+{
+	add_vlans bridge_binding on
+	set_vlans up
+	set_vlans type vlan bridge_binding off
+	do_test_binding_off : "on->off"
+}
+
+dfr_set_binding_on()
+{
+	set_vlans type vlan bridge_binding on
+	defer set_vlans type vlan bridge_binding off
+}
+
+dfr_set_binding_off()
+{
+	set_vlans type vlan bridge_binding off
+	defer set_vlans type vlan bridge_binding on
+}
+
+test_binding_toggle_on_when_lower_down()
+{
+	add_vlans bridge_binding off
+	set_vlans up
+	do_test_binding_on dfr_set_binding_on "off->on when lower down"
+}
+
+test_binding_toggle_off_when_lower_down()
+{
+	add_vlans bridge_binding on
+	set_vlans up
+	do_test_binding_off dfr_set_binding_off "on->off when lower down"
+}
+
+test_binding_toggle_on_when_upper_down()
+{
+	add_vlans bridge_binding off
+	set_vlans type vlan bridge_binding on
+	set_vlans up
+	do_test_binding_on : "off->on when upper down"
+}
+
+test_binding_toggle_off_when_upper_down()
+{
+	add_vlans bridge_binding on
+	set_vlans type vlan bridge_binding off
+	set_vlans up
+	do_test_binding_off : "on->off when upper down"
+}
+
+trap defer_scopes_cleanup EXIT
+setup_prepare
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh
index 2da32f4c479b..e9c2f71da207 100755
--- a/tools/testing/selftests/net/vrf_route_leaking.sh
+++ b/tools/testing/selftests/net/vrf_route_leaking.sh
@@ -533,6 +533,86 @@ ipv6_ping_frag_asym()
 	ipv6_ping_frag asym
 }
 
+ipv4_ping_local()
+{
+	log_section "IPv4 (sym route): VRF ICMP local error route lookup ping"
+
+	setup_sym
+
+	check_connectivity || return
+
+	run_cmd ip netns exec $r1 ip vrf exec blue ping -c1 -w1 ${H2_N2_IP}
+	log_test $? 0 "VRF ICMP local IPv4"
+}
+
+ipv4_tcp_local()
+{
+	log_section "IPv4 (sym route): VRF tcp local connection"
+
+	setup_sym
+
+	check_connectivity || return
+
+	run_cmd nettest -s -O "$h2" -l ${H2_N2_IP} -I eth0 -3 eth0 &
+	sleep 1
+	run_cmd nettest -N "$r1" -d blue -r ${H2_N2_IP}
+	log_test $? 0 "VRF tcp local connection IPv4"
+}
+
+ipv4_udp_local()
+{
+	log_section "IPv4 (sym route): VRF udp local connection"
+
+	setup_sym
+
+	check_connectivity || return
+
+	run_cmd nettest -s -D -O "$h2" -l ${H2_N2_IP} -I eth0 -3 eth0 &
+	sleep 1
+	run_cmd nettest -D -N "$r1" -d blue -r ${H2_N2_IP}
+	log_test $? 0 "VRF udp local connection IPv4"
+}
+
+ipv6_ping_local()
+{
+	log_section "IPv6 (sym route): VRF ICMP local error route lookup ping"
+
+	setup_sym
+
+	check_connectivity6 || return
+
+	run_cmd ip netns exec $r1 ip vrf exec blue ${ping6} -c1 -w1 ${H2_N2_IP6}
+	log_test $? 0 "VRF ICMP local IPv6"
+}
+
+ipv6_tcp_local()
+{
+	log_section "IPv6 (sym route): VRF tcp local connection"
+
+	setup_sym
+
+	check_connectivity6 || return
+
+	run_cmd nettest -s -6 -O "$h2" -l ${H2_N2_IP6} -I eth0 -3 eth0 &
+	sleep 1
+	run_cmd nettest -6 -N "$r1" -d blue -r ${H2_N2_IP6}
+	log_test $? 0 "VRF tcp local connection IPv6"
+}
+
+ipv6_udp_local()
+{
+	log_section "IPv6 (sym route): VRF udp local connection"
+
+	setup_sym
+
+	check_connectivity6 || return
+
+	run_cmd nettest -s -6 -D -O "$h2" -l ${H2_N2_IP6} -I eth0 -3 eth0 &
+	sleep 1
+	run_cmd nettest -6 -D -N "$r1" -d blue -r ${H2_N2_IP6}
+	log_test $? 0 "VRF udp local connection IPv6"
+}
+
 ################################################################################
 # usage
 
@@ -555,8 +635,12 @@ EOF
 # Some systems don't have a ping6 binary anymore
 command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping)
 
-TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym"
-TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_ttl_asym ipv6_traceroute_asym"
+check_gen_prog "nettest"
+
+TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_local ipv4_tcp_local
+ipv4_udp_local ipv4_ping_ttl_asym ipv4_traceroute_asym"
+TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_local ipv6_tcp_local ipv6_udp_local
+ipv6_ping_ttl_asym ipv6_traceroute_asym"
 
 ret=0
 nsuccess=0
@@ -594,12 +678,18 @@ do
 	ipv4_traceroute|traceroute)      ipv4_traceroute;;&
 	ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;&
 	ipv4_ping_frag|ping)             ipv4_ping_frag;;&
+	ipv4_ping_local|ping)            ipv4_ping_local;;&
+	ipv4_tcp_local)                  ipv4_tcp_local;;&
+	ipv4_udp_local)                  ipv4_udp_local;;&
 
 	ipv6_ping_ttl|ping)              ipv6_ping_ttl;;&
 	ipv6_ping_ttl_asym|ping)         ipv6_ping_ttl_asym;;&
 	ipv6_traceroute|traceroute)      ipv6_traceroute;;&
 	ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;&
 	ipv6_ping_frag|ping)             ipv6_ping_frag;;&
+	ipv6_ping_local|ping)            ipv6_ping_local;;&
+	ipv6_tcp_local)                  ipv6_tcp_local;;&
+	ipv6_udp_local)                  ipv6_udp_local;;&
 
 	# setup namespaces and config, but do not run any tests
 	setup_sym|setup)                 setup_sym; exit 0;;
diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh
index 457789530645..3eeeeffb4005 100755
--- a/tools/testing/selftests/net/xfrm_policy.sh
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -293,7 +293,7 @@ check_random_order()
 	local ns=$1
 	local log=$2
 
-	for i in $(seq 100); do
+	for i in $(seq 50); do
 		ip -net $ns xfrm policy flush
 		for j in $(seq 0 16 255 | sort -R); do
 			ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow
@@ -306,7 +306,7 @@ check_random_order()
 		done
 	done
 
-	for i in $(seq 100); do
+	for i in $(seq 50); do
 		ip -net $ns xfrm policy flush
 		for j in $(seq 0 16 255 | sort -R); do
 			local addr=$(printf "e000:0000:%02x00::/56" $j)
diff --git a/tools/testing/selftests/net/xfrm_policy_add_speed.sh b/tools/testing/selftests/net/xfrm_policy_add_speed.sh
new file mode 100755
index 000000000000..2fab29d3cb91
--- /dev/null
+++ b/tools/testing/selftests/net/xfrm_policy_add_speed.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+source lib.sh
+
+timeout=4m
+ret=0
+tmp=$(mktemp)
+cleanup() {
+	cleanup_all_ns
+	rm -f "$tmp"
+}
+
+trap cleanup EXIT
+
+maxpolicies=100000
+[ "$KSFT_MACHINE_SLOW" = "yes" ] && maxpolicies=10000
+
+do_dummies4() {
+	local dir="$1"
+	local max="$2"
+
+	local policies
+	local pfx
+	pfx=30
+	policies=0
+
+	ip netns exec "$ns" ip xfrm policy flush
+
+	for i in $(seq 1 100);do
+		local s
+		local d
+		for j in $(seq 1 255);do
+			s=$((i+0))
+			d=$((i+100))
+
+			for a in $(seq 1 8 255); do
+				policies=$((policies+1))
+				[ "$policies" -gt "$max" ] && return
+				echo xfrm policy add src 10.$s.$j.0/30 dst 10.$d.$j.$a/$pfx dir $dir action block
+			done
+			for a in $(seq 1 8 255); do
+				policies=$((policies+1))
+				[ "$policies" -gt "$max" ] && return
+				echo xfrm policy add src 10.$s.$j.$a/30 dst 10.$d.$j.0/$pfx dir $dir action block
+			done
+		done
+	done
+}
+
+setup_ns ns
+
+do_bench()
+{
+	local max="$1"
+
+	start=$(date +%s%3N)
+	do_dummies4 "out" "$max" > "$tmp"
+	if ! timeout "$timeout" ip netns exec "$ns" ip -batch "$tmp";then
+		echo "WARNING: policy insertion cancelled after $timeout"
+		ret=1
+	fi
+	stop=$(date +%s%3N)
+
+	result=$((stop-start))
+
+	policies=$(wc -l < "$tmp")
+	printf "Inserted %-06s policies in $result ms\n" $policies
+
+	have=$(ip netns exec "$ns" ip xfrm policy show | grep "action block" | wc -l)
+	if [ "$have" -ne "$policies" ]; then
+		echo "WARNING: mismatch, have $have policies, expected $policies"
+		ret=1
+	fi
+}
+
+p=100
+while [ $p -le "$maxpolicies" ]; do
+	do_bench "$p"
+	p="${p}0"
+done
+
+exit $ret
diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk
new file mode 100644
index 000000000000..e907c2751956
--- /dev/null
+++ b/tools/testing/selftests/net/ynl.mk
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# YNL selftest build snippet
+
+# Inputs:
+#
+# YNL_GENS:      families we need in the selftests
+# YNL_PROGS:     TEST_PROGS which need YNL (TODO, none exist, yet)
+# YNL_GEN_FILES: TEST_GEN_FILES which need YNL
+
+YNL_OUTPUTS := $(patsubst %,$(OUTPUT)/%,$(YNL_GEN_FILES))
+YNL_SPECS := \
+	$(patsubst %,$(top_srcdir)/Documentation/netlink/specs/%.yaml,$(YNL_GENS))
+
+$(YNL_OUTPUTS): $(OUTPUT)/libynl.a
+$(YNL_OUTPUTS): CFLAGS += \
+	-I$(top_srcdir)/usr/include/ $(KHDR_INCLUDES) \
+	-I$(top_srcdir)/tools/net/ynl/lib/ \
+	-I$(top_srcdir)/tools/net/ynl/generated/
+
+# Make sure we rebuild libynl if user added a new family. We can't easily
+# depend on the contents of a variable so create a fake file with a hash.
+YNL_GENS_HASH := $(shell echo $(YNL_GENS) | sha1sum | cut -c1-8)
+$(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig:
+	$(Q)rm -f $(OUTPUT)/.libynl-*.sig
+	$(Q)touch $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
+
+$(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
+	$(Q)rm -f $(top_srcdir)/tools/net/ynl/libynl.a
+	$(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl \
+		GENS="$(YNL_GENS)" RSTS="" libynl.a
+	$(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a
+
+EXTRA_CLEAN += \
+	$(top_srcdir)/tools/net/ynl/pyynl/__pycache__ \
+	$(top_srcdir)/tools/net/ynl/pyynl/lib/__pycache__ \
+	$(top_srcdir)/tools/net/ynl/lib/*.[ado] \
+	$(OUTPUT)/.libynl-*.sig \
+	$(OUTPUT)/libynl.a