diff options
Diffstat (limited to 'tools/testing/selftests/drivers/net')
28 files changed, 3477 insertions, 265 deletions
diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore index 585ecb4d5dc4..3633c7a3ed65 100644 --- a/tools/testing/selftests/drivers/net/.gitignore +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only +gro napi_id_helper psp_responder diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 6e41635bd55a..f5c71d993750 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -6,10 +6,12 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ ../../net/lib.sh \ TEST_GEN_FILES := \ + gro \ napi_id_helper \ # end of TEST_GEN_FILES TEST_PROGS := \ + gro.py \ hds.py \ napi_id.py \ napi_threaded.py \ @@ -18,10 +20,12 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ queues.py \ + ring_reconfig.py \ shaper.py \ stats.py \ xdp.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 402d4ee84f2e..6c5c60adb5e8 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -14,6 +14,7 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ + netcons_over_bonding.sh \ # end of TEST_PROGS TEST_FILES := \ @@ -24,6 +25,7 @@ TEST_FILES := \ TEST_INCLUDES := \ ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ ../../../net/forwarding/lib.sh \ # end of TEST_INCLUDES diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh index c4711272fe45..559f300f965a 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh @@ -30,6 +30,7 @@ check_connection() local message=${3} RET=0 + sleep 0.25 ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null check_err $? "ping failed" log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}" diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 6bb290abd48b..991494376223 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,6 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y +CONFIG_CONFIGFS_FS=y CONFIG_DUMMY=y CONFIG_INET_ESP=y CONFIG_INET_ESP_OFFLOAD=y @@ -9,6 +10,9 @@ CONFIG_MACVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_CLS_MATCHALL=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh new file mode 100755 index 000000000000..477cc9379500 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh @@ -0,0 +1,361 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# This selftest exercises trying to have multiple netpoll users at the same +# time. +# +# This selftest has multiple smalls test inside, and the goal is to +# get interfaces with bonding and netconsole in different orders in order +# to catch any possible issue. +# +# The main test composes of four interfaces being created using netdevsim; two +# of them are bonded to serve as the netconsole's transmit interface. The +# remaining two interfaces are similarly bonded and assigned to a separate +# network namespace, which acts as the receive interface, where socat monitors +# for incoming messages. +# +# A netconsole message is then sent to ensure it is properly received across +# this configuration. +# +# Later, run a few other tests, to make sure that bonding and netconsole +# cannot coexist. +# +# The test's objective is to exercise netpoll usage when managed simultaneously +# by multiple subsystems (netconsole and bonding). +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true +modprobe bonding 2> /dev/null || true +modprobe veth 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup_bond EXIT + +FORMAT="extended" +IP_VERSION="ipv4" +VETH0="veth"$(( RANDOM % 256)) +VETH1="veth"$((256 + RANDOM % 256)) +TXNS="" +RXNS="" + +# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with +# the bonding interfaces +function setup_bonding_ifaces() { + local RAND=$(( RANDOM % 100 )) + BOND_TX_MAIN_IF="bond_tx_$RAND" + BOND_RX_MAIN_IF="bond_rx_$RAND" + + # Setup TX + if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + then + echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2 + # only clean nsim ifaces and namespace. Nothing else has been + # initialized + cleanup_bond_nsim + trap - EXIT + exit "${ksft_skip}" + fi + + # create_netdevsim() got the interface up, but it needs to be down + # before being enslaved. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # Setup RX + ip -n "${RXNS}" \ + link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX_MAIN_IF}" up + + export DSTIF="${BOND_RX_MAIN_IF}" + export SRCIF="${BOND_TX_MAIN_IF}" +} + +# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface +# and the other two will be bond to the RX interface (on the other namespace) +function create_ifaces_bond() { + BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}") + BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}") + BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}") + BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}") +} + +# netdevsim link BOND_TX to BOND_RX interfaces +function link_ifaces_bond() { + local BOND_TX1_SLAVE_IFIDX + local BOND_TX2_SLAVE_IFIDX + local BOND_RX1_SLAVE_IFIDX + local BOND_RX2_SLAVE_IFIDX + local TXNS_FD + local RXNS_FD + + BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex) + BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex) + BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex) + BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex) + + exec {TXNS_FD}</var/run/netns/"${TXNS}" + exec {RXNS_FD}</var/run/netns/"${RXNS}" + + # Linking TX ifaces to the RX ones (on the other namespace) + echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + + exec {TXNS_FD}<&- + exec {RXNS_FD}<&- +} + +function create_all_ifaces() { + # setup_ns function is coming from lib.sh + setup_ns TXNS RXNS + export NAMESPACE="${RXNS}" + + # Create two interfaces for RX and two for TX + create_ifaces_bond + # Link netlink ifaces + link_ifaces_bond +} + +# configure DSTIF and SRCIF IPs +function configure_ifaces_ips() { + local IP_VERSION=${1:-"ipv4"} + select_ipv4_or_ipv6 "${IP_VERSION}" + + ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}" + ip -n "${RXNS}" link set "${DSTIF}" up + + ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}" + ip -n "${TXNS}" link set "${SRCIF}" up +} + +function test_enable_netpoll_on_enslaved_iface() { + echo 0 > "${NETCONS_PATH}"/enabled + + # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and + # linked to BOND_RX1_SLAVE_IF inside the namespace. + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # This should fail with the following message in dmesg: + # netpoll: netconsole: ethX is a slave device, aborting + set +e + enable_netcons_ns 2> /dev/null + set -e + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Bonding and netpoll cannot co-exists." >&2 + exit "${ksft_fail}" + fi +} + +function test_delete_bond_and_reenable_target() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond + + # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore + # netpoll can be plugged in there + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # this should work, since the interface is not enslaved + enable_netcons_ns + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Unable to start netpoll on an unbond iface." >&2 + exit "${ksft_fail}" + fi +} + +# Send a netconsole message to the netconsole target +function test_send_netcons_msg_through_bond_iface() { + # Listen for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat +} + +# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF. +# Given BOND_TX_MAIN_IF was deleted, recreate it first +function test_enslave_netcons_enabled_iface { + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2 + exit "${ksft_fail}" + fi + + # recreate the bonding iface. it got deleted by previous + # test (test_delete_bond_and_reenable_target) + ip -n "${TXNS}" \ + link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + + # sub-interface need to be down before attaching to bonding + # This will also disable netconsole. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2 + exit "${ksft_fail}" + fi +} + +# Get netconsole enabled on a bonding interface and attach a second +# sub-interface. +function test_enslave_iface_to_bond { + # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now + echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name + enable_netcons_ns + + # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is + # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF. + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2 + exit "${ksft_fail}" + fi +} + +function test_enslave_iff_disabled_netpoll_iface { + local ret + + # Create two interfaces. veth interfaces it known to have + # IFF_DISABLE_NETPOLL set + if ! ip link add "${VETH0}" type veth peer name "${VETH1}" + then + echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2 + exit "${ksft_skip}" + fi + set +e + # This will print RTNETLINK answers: Device or resource busy + ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null + ret=$? + set -e + if [[ $ret -eq 0 ]] + then + echo "test failed: veth interface could not be enslaved" + exit "${ksft_fail}" + fi +} + +# Given that netconsole picks the current net namespace, we need to enable it +# from inside the TXNS namespace +function enable_netcons_ns() { + ip netns exec "${TXNS}" sh -c \ + "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled" +} + +#################### +# Tests start here # +#################### + +# Create regular interfaces using netdevsim and link them +create_all_ifaces + +# Setup the bonding interfaces +# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF +# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF +setup_bonding_ifaces + +# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF +configure_ifaces_ips "${IP_VERSION}" + +_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}" +enable_netcons_ns +set_user_data + +# Test #1 : Create an bonding interface and attach netpoll into +# the bonding interface. Netconsole/netpoll should work on +# the bonding interface. +test_send_netcons_msg_through_bond_iface +echo "test #1: netpoll on bonding interface worked. Test passed" >&2 + +# Test #2: Attach netpoll to an enslaved interface +# Try to attach netpoll to an enslaved sub-interface (while still being part of +# a bonding interface), which shouldn't be allowed +test_enable_netpoll_on_enslaved_iface +echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2 + +# Test #3: Unplug the sub-interface from bond and enable netconsole +# Detach the interface from a bonding interface and attach netpoll again +test_delete_bond_and_reenable_target +echo "test #3: Able to attach to an unbound interface. Test passed." >&2 + +# Test #4: Enslave a sub-interface that had netconsole enabled +# Try to enslave an interface that has netconsole/netpoll enabled. +# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it +test_enslave_netcons_enabled_iface +echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2 + +# Test #5: Enslave a sub-interface to a bonding interface +# Enslave an interface to a bond interface that has netpoll attached +# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of +# it. Netconsole is currently disabled +test_enslave_iface_to_bond +echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2 + +# Test #6: Enslave a IFF_DISABLE_NETPOLL sub-interface to a bonding interface +# At this stage, BOND_TX_MAIN_IF has both sub interface and netconsole is +# enabled. This test will try to enslave an a veth (IFF_DISABLE_NETPOLL) interface +# and it should fail, with netpoll: veth0 doesn't support polling +test_enslave_iff_disabled_netpoll_iface +echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2 + +cleanup_bond +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c new file mode 100644 index 000000000000..995b492f5bcb --- /dev/null +++ b/tools/testing/selftests/drivers/net/gro.c @@ -0,0 +1,1369 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This testsuite provides conformance testing for GRO coalescing. + * + * Test cases: + * 1.data + * Data packets of the same size and same header setup with correct + * sequence numbers coalesce. The one exception being the last data + * packet coalesced: it can be smaller than the rest and coalesced + * as long as it is in the same flow. + * 2.ack + * Pure ACK does not coalesce. + * 3.flags + * Specific test cases: no packets with PSH, SYN, URG, RST set will + * be coalesced. + * 4.tcp + * Packets with incorrect checksum, non-consecutive seqno and + * different TCP header options shouldn't coalesce. Nit: given that + * some extension headers have paddings, such as timestamp, headers + * that are padding differently would not be coalesced. + * 5.ip: + * Packets with different (ECN, TTL, TOS) header, ip options or + * ip fragments (ipv6) shouldn't coalesce. + * 6.large: + * Packets larger than GRO_MAX_SIZE packets shouldn't coalesce. + * + * MSS is defined as 4096 - header because if it is too small + * (i.e. 1500 MTU - header), it will result in many packets, + * increasing the "large" test case's flakiness. This is because + * due to time sensitivity in the coalescing window, the receiver + * may not coalesce all of the packets. + * + * Note the timing issue applies to all of the test cases, so some + * flakiness is to be expected. + * + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <getopt.h> +#include <linux/filter.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> + +#include "../../kselftest.h" +#include "../../net/lib/ksft.h" + +#define DPORT 8000 +#define SPORT 1500 +#define PAYLOAD_LEN 100 +#define NUM_PACKETS 4 +#define START_SEQ 100 +#define START_ACK 100 +#define ETH_P_NONE 0 +#define TOTAL_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) +#define MSS (4096 - sizeof(struct tcphdr) - sizeof(struct ipv6hdr)) +#define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr)) +#define NUM_LARGE_PKT (MAX_PAYLOAD / MSS) +#define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) +#define MIN_EXTHDR_SIZE 8 +#define EXT_PAYLOAD_1 "\x00\x00\x00\x00\x00\x00" +#define EXT_PAYLOAD_2 "\x11\x11\x11\x11\x11\x11" + +#define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +static const char *addr6_src = "fdaa::2"; +static const char *addr6_dst = "fdaa::1"; +static const char *addr4_src = "192.168.1.200"; +static const char *addr4_dst = "192.168.1.100"; +static int proto = -1; +static uint8_t src_mac[ETH_ALEN], dst_mac[ETH_ALEN]; +static char *testname = "data"; +static char *ifname = "eth0"; +static char *smac = "aa:00:00:00:00:02"; +static char *dmac = "aa:00:00:00:00:01"; +static bool verbose; +static bool tx_socket = true; +static int tcp_offset = -1; +static int total_hdr_len = -1; +static int ethhdr_proto = -1; +static bool ipip; +static const int num_flush_id_cases = 6; + +static void vlog(const char *fmt, ...) +{ + va_list args; + + if (verbose) { + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + } +} + +static void setup_sock_filter(int fd) +{ + const int dport_off = tcp_offset + offsetof(struct tcphdr, dest); + const int ethproto_off = offsetof(struct ethhdr, h_proto); + int optlen = 0; + int ipproto_off, opt_ipproto_off; + int next_off; + + if (ipip) + next_off = sizeof(struct iphdr) + offsetof(struct iphdr, protocol); + else if (proto == PF_INET) + next_off = offsetof(struct iphdr, protocol); + else + next_off = offsetof(struct ipv6hdr, nexthdr); + ipproto_off = ETH_HLEN + next_off; + + /* Overridden later if exthdrs are used: */ + opt_ipproto_off = ipproto_off; + + if (strcmp(testname, "ip") == 0) { + if (proto == PF_INET) + optlen = sizeof(struct ip_timestamp); + else { + BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); + + /* same size for HBH and Fragment extension header types */ + optlen = MIN_EXTHDR_SIZE; + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) + + offsetof(struct ip6_ext, ip6e_nxt); + } + } + + /* this filter validates the following: + * - packet is IPv4/IPv6 according to the running test. + * - packet is TCP. Also handles the case of one extension header and then TCP. + * - checks the packet tcp dport equals to DPORT. Also handles the case of one + * extension header and then TCP. + */ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 9), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 2, 0), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, opt_ipproto_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off + optlen), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 0, 1), + BPF_STMT(BPF_RET + BPF_K, 0xFFFFFFFF), + BPF_STMT(BPF_RET + BPF_K, 0), + }; + + struct sock_fprog bpf = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)) < 0) + error(1, errno, "error setting filter"); +} + +static uint32_t checksum_nofold(void *data, size_t len, uint32_t sum) +{ + uint16_t *words = data; + int i; + + for (i = 0; i < len / 2; i++) + sum += words[i]; + if (len & 1) + sum += ((char *)data)[len - 1]; + return sum; +} + +static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) +{ + sum = checksum_nofold(data, len, sum); + while (sum > 0xFFFF) + sum = (sum & 0xFFFF) + (sum >> 16); + return ~sum; +} + +static uint16_t tcp_checksum(void *buf, int payload_len) +{ + struct pseudo_header6 { + struct in6_addr saddr; + struct in6_addr daddr; + uint16_t protocol; + uint16_t payload_len; + } ph6; + struct pseudo_header4 { + struct in_addr saddr; + struct in_addr daddr; + uint16_t protocol; + uint16_t payload_len; + } ph4; + uint32_t sum = 0; + + if (proto == PF_INET6) { + if (inet_pton(AF_INET6, addr6_src, &ph6.saddr) != 1) + error(1, errno, "inet_pton6 source ip pseudo"); + if (inet_pton(AF_INET6, addr6_dst, &ph6.daddr) != 1) + error(1, errno, "inet_pton6 dest ip pseudo"); + ph6.protocol = htons(IPPROTO_TCP); + ph6.payload_len = htons(sizeof(struct tcphdr) + payload_len); + + sum = checksum_nofold(&ph6, sizeof(ph6), 0); + } else if (proto == PF_INET) { + if (inet_pton(AF_INET, addr4_src, &ph4.saddr) != 1) + error(1, errno, "inet_pton source ip pseudo"); + if (inet_pton(AF_INET, addr4_dst, &ph4.daddr) != 1) + error(1, errno, "inet_pton dest ip pseudo"); + ph4.protocol = htons(IPPROTO_TCP); + ph4.payload_len = htons(sizeof(struct tcphdr) + payload_len); + + sum = checksum_nofold(&ph4, sizeof(ph4), 0); + } + + return checksum_fold(buf, sizeof(struct tcphdr) + payload_len, sum); +} + +static void read_MAC(uint8_t *mac_addr, char *mac) +{ + if (sscanf(mac, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac_addr[0], &mac_addr[1], &mac_addr[2], + &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6) + error(1, 0, "sscanf"); +} + +static void fill_datalinklayer(void *buf) +{ + struct ethhdr *eth = buf; + + memcpy(eth->h_dest, dst_mac, ETH_ALEN); + memcpy(eth->h_source, src_mac, ETH_ALEN); + eth->h_proto = ethhdr_proto; +} + +static void fill_networklayer(void *buf, int payload_len, int protocol) +{ + struct ipv6hdr *ip6h = buf; + struct iphdr *iph = buf; + + if (proto == PF_INET6) { + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(sizeof(struct tcphdr) + payload_len); + ip6h->nexthdr = protocol; + ip6h->hop_limit = 8; + if (inet_pton(AF_INET6, addr6_src, &ip6h->saddr) != 1) + error(1, errno, "inet_pton source ip6"); + if (inet_pton(AF_INET6, addr6_dst, &ip6h->daddr) != 1) + error(1, errno, "inet_pton dest ip6"); + } else if (proto == PF_INET) { + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->ttl = 8; + iph->protocol = protocol; + iph->tot_len = htons(sizeof(struct tcphdr) + + payload_len + sizeof(struct iphdr)); + iph->frag_off = htons(0x4000); /* DF = 1, MF = 0 */ + if (inet_pton(AF_INET, addr4_src, &iph->saddr) != 1) + error(1, errno, "inet_pton source ip"); + if (inet_pton(AF_INET, addr4_dst, &iph->daddr) != 1) + error(1, errno, "inet_pton dest ip"); + iph->check = checksum_fold(buf, sizeof(struct iphdr), 0); + } +} + +static void fill_transportlayer(void *buf, int seq_offset, int ack_offset, + int payload_len, int fin) +{ + struct tcphdr *tcph = buf; + + memset(tcph, 0, sizeof(*tcph)); + + tcph->source = htons(SPORT); + tcph->dest = htons(DPORT); + tcph->seq = ntohl(START_SEQ + seq_offset); + tcph->ack_seq = ntohl(START_ACK + ack_offset); + tcph->ack = 1; + tcph->fin = fin; + tcph->doff = 5; + tcph->window = htons(TCP_MAXWIN); + tcph->urg_ptr = 0; + tcph->check = tcp_checksum(tcph, payload_len); +} + +static void write_packet(int fd, char *buf, int len, struct sockaddr_ll *daddr) +{ + int ret = -1; + + ret = sendto(fd, buf, len, 0, (struct sockaddr *)daddr, sizeof(*daddr)); + if (ret == -1) + error(1, errno, "sendto failure"); + if (ret != len) + error(1, errno, "sendto wrong length"); +} + +static void create_packet(void *buf, int seq_offset, int ack_offset, + int payload_len, int fin) +{ + memset(buf, 0, total_hdr_len); + memset(buf + total_hdr_len, 'a', payload_len); + + fill_transportlayer(buf + tcp_offset, seq_offset, ack_offset, + payload_len, fin); + + if (ipip) { + fill_networklayer(buf + ETH_HLEN, payload_len + sizeof(struct iphdr), + IPPROTO_IPIP); + fill_networklayer(buf + ETH_HLEN + sizeof(struct iphdr), + payload_len, IPPROTO_TCP); + } else { + fill_networklayer(buf + ETH_HLEN, payload_len, IPPROTO_TCP); + } + + fill_datalinklayer(buf); +} + +/* send one extra flag, not first and not last pkt */ +static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, + int rst, int urg) +{ + static char flag_buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + int payload_len, pkt_size, flag, i; + struct tcphdr *tcph; + + payload_len = PAYLOAD_LEN * psh; + pkt_size = total_hdr_len + payload_len; + flag = NUM_PACKETS / 2; + + create_packet(flag_buf, flag * payload_len, 0, payload_len, 0); + + tcph = (struct tcphdr *)(flag_buf + tcp_offset); + tcph->psh = psh; + tcph->syn = syn; + tcph->rst = rst; + tcph->urg = urg; + tcph->check = 0; + tcph->check = tcp_checksum(tcph, payload_len); + + for (i = 0; i < NUM_PACKETS + 1; i++) { + if (i == flag) { + write_packet(fd, flag_buf, pkt_size, daddr); + continue; + } + create_packet(buf, i * PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +/* Test for data of same length, smaller than previous + * and of different lengths + */ +static void send_data_pkts(int fd, struct sockaddr_ll *daddr, + int payload_len1, int payload_len2) +{ + static char buf[ETH_HLEN + IP_MAXPACKET]; + + create_packet(buf, 0, 0, payload_len1, 0); + write_packet(fd, buf, total_hdr_len + payload_len1, daddr); + create_packet(buf, payload_len1, 0, payload_len2, 0); + write_packet(fd, buf, total_hdr_len + payload_len2, daddr); +} + +/* If incoming segments make tracked segment length exceed + * legal IP datagram length, do not coalesce + */ +static void send_large(int fd, struct sockaddr_ll *daddr, int remainder) +{ + static char pkts[NUM_LARGE_PKT][TOTAL_HDR_LEN + MSS]; + static char last[TOTAL_HDR_LEN + MSS]; + static char new_seg[TOTAL_HDR_LEN + MSS]; + int i; + + for (i = 0; i < NUM_LARGE_PKT; i++) + create_packet(pkts[i], i * MSS, 0, MSS, 0); + create_packet(last, NUM_LARGE_PKT * MSS, 0, remainder, 0); + create_packet(new_seg, (NUM_LARGE_PKT + 1) * MSS, 0, remainder, 0); + + for (i = 0; i < NUM_LARGE_PKT; i++) + write_packet(fd, pkts[i], total_hdr_len + MSS, daddr); + write_packet(fd, last, total_hdr_len + remainder, daddr); + write_packet(fd, new_seg, total_hdr_len + remainder, daddr); +} + +/* Pure acks and dup acks don't coalesce */ +static void send_ack(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN]; + + create_packet(buf, 0, 0, 0, 0); + write_packet(fd, buf, total_hdr_len, daddr); + write_packet(fd, buf, total_hdr_len, daddr); + create_packet(buf, 0, 1, 0, 0); + write_packet(fd, buf, total_hdr_len, daddr); +} + +static void recompute_packet(char *buf, char *no_ext, int extlen) +{ + struct tcphdr *tcphdr = (struct tcphdr *)(buf + tcp_offset); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + memmove(buf, no_ext, total_hdr_len); + memmove(buf + total_hdr_len + extlen, + no_ext + total_hdr_len, PAYLOAD_LEN); + + tcphdr->doff = tcphdr->doff + (extlen / 4); + tcphdr->check = 0; + tcphdr->check = tcp_checksum(tcphdr, PAYLOAD_LEN + extlen); + if (proto == PF_INET) { + iph->tot_len = htons(ntohs(iph->tot_len) + extlen); + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + + if (ipip) { + iph += 1; + iph->tot_len = htons(ntohs(iph->tot_len) + extlen); + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } + } else { + ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen); + } +} + +static void tcp_write_options(char *buf, int kind, int ts) +{ + struct tcp_option_ts { + uint8_t kind; + uint8_t len; + uint32_t tsval; + uint32_t tsecr; + } *opt_ts = (void *)buf; + struct tcp_option_window { + uint8_t kind; + uint8_t len; + uint8_t shift; + } *opt_window = (void *)buf; + + switch (kind) { + case TCPOPT_NOP: + buf[0] = TCPOPT_NOP; + break; + case TCPOPT_WINDOW: + memset(opt_window, 0, sizeof(struct tcp_option_window)); + opt_window->kind = TCPOPT_WINDOW; + opt_window->len = TCPOLEN_WINDOW; + opt_window->shift = 0; + break; + case TCPOPT_TIMESTAMP: + memset(opt_ts, 0, sizeof(struct tcp_option_ts)); + opt_ts->kind = TCPOPT_TIMESTAMP; + opt_ts->len = TCPOLEN_TIMESTAMP; + opt_ts->tsval = ts; + opt_ts->tsecr = 0; + break; + default: + error(1, 0, "unimplemented TCP option"); + break; + } +} + +/* TCP with options is always a permutation of {TS, NOP, NOP}. + * Implement different orders to verify coalescing stops. + */ +static void add_standard_tcp_options(char *buf, char *no_ext, int ts, int order) +{ + switch (order) { + case 0: + tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 1, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 2 /* two NOP opts */, + TCPOPT_TIMESTAMP, ts); + break; + case 1: + tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + 1, + TCPOPT_TIMESTAMP, ts); + tcp_write_options(buf + total_hdr_len + 1 + TCPOLEN_TIMESTAMP, + TCPOPT_NOP, 0); + break; + case 2: + tcp_write_options(buf + total_hdr_len, TCPOPT_TIMESTAMP, ts); + tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 1, + TCPOPT_NOP, 0); + tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 2, + TCPOPT_NOP, 0); + break; + default: + error(1, 0, "unknown order"); + break; + } + recompute_packet(buf, no_ext, TCPOLEN_TSTAMP_APPA); +} + +/* Packets with invalid checksum don't coalesce. */ +static void send_changed_checksum(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + tcph->check = tcph->check - 1; + write_packet(fd, buf, pkt_size, daddr); +} + + /* Packets with non-consecutive sequence number don't coalesce.*/ +static void send_changed_seq(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + tcph->seq = ntohl(htonl(tcph->seq) + 1); + tcph->check = 0; + tcph->check = tcp_checksum(tcph, PAYLOAD_LEN); + write_packet(fd, buf, pkt_size, daddr); +} + + /* Packet with different timestamp option or different timestamps + * don't coalesce. + */ +static void send_changed_ts(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt[sizeof(buf) + TCPOLEN_TSTAMP_APPA]; + int pkt_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 0, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 0, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 0); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 1); + write_packet(fd, extpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 4, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt, buf, 100, 2); + write_packet(fd, extpkt, pkt_size, daddr); +} + +/* Packet with different tcp options don't coalesce. */ +static void send_diff_opt(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt1[sizeof(buf) + TCPOLEN_TSTAMP_APPA]; + static char extpkt2[sizeof(buf) + TCPOLEN_MAXSEG]; + int extpkt1_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA; + int extpkt2_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_MAXSEG; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt1, buf, 0, 0); + write_packet(fd, extpkt1, extpkt1_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + add_standard_tcp_options(extpkt1, buf, 0, 0); + write_packet(fd, extpkt1, extpkt1_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + tcp_write_options(extpkt2 + MAX_HDR_LEN, TCPOPT_NOP, 0); + tcp_write_options(extpkt2 + MAX_HDR_LEN + 1, TCPOPT_WINDOW, 0); + recompute_packet(extpkt2, buf, TCPOLEN_WINDOW + 1); + write_packet(fd, extpkt2, extpkt2_size, daddr); +} + +static void add_ipv4_ts_option(void *buf, void *optpkt) +{ + struct ip_timestamp *ts = (struct ip_timestamp *)(optpkt + tcp_offset); + int optlen = sizeof(struct ip_timestamp); + struct iphdr *iph; + + if (optlen % 4) + error(1, 0, "ipv4 timestamp length is not a multiple of 4B"); + + ts->ipt_code = IPOPT_TS; + ts->ipt_len = optlen; + ts->ipt_ptr = 5; + ts->ipt_flg = IPOPT_TS_TSONLY; + + memcpy(optpkt, buf, tcp_offset); + memcpy(optpkt + tcp_offset + optlen, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + + iph = (struct iphdr *)(optpkt + ETH_HLEN); + iph->ihl = 5 + (optlen / 4); + iph->tot_len = htons(ntohs(iph->tot_len) + optlen); + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0); +} + +static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext_payload) +{ + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(optpkt + tcp_offset); + struct ipv6hdr *iph = (struct ipv6hdr *)(optpkt + ETH_HLEN); + char *exthdr_payload_start = (char *)(exthdr + 1); + + exthdr->hdrlen = 0; + exthdr->nexthdr = IPPROTO_TCP; + + memcpy(exthdr_payload_start, ext_payload, MIN_EXTHDR_SIZE - sizeof(*exthdr)); + + memcpy(optpkt, buf, tcp_offset); + memcpy(optpkt + tcp_offset + MIN_EXTHDR_SIZE, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + + iph->nexthdr = exthdr_type; + iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE); +} + +static void fix_ip4_checksum(struct iphdr *iph) +{ + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); +} + +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +{ + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN]; + bool send_three = false; + struct iphdr *iph1; + struct iphdr *iph2; + struct iphdr *iph3; + + iph1 = (struct iphdr *)(buf1 + ETH_HLEN); + iph2 = (struct iphdr *)(buf2 + ETH_HLEN); + iph3 = (struct iphdr *)(buf3 + ETH_HLEN); + + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0); + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + + switch (tcase) { + case 0: /* DF=1, Incrementing - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + break; + + case 1: /* DF=1, Fixed - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + break; + + case 2: /* DF=0, Incrementing - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(9); + break; + + case 3: /* DF=0, Fixed - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(8); + break; + + case 4: /* DF=1, two packets incrementing, and one fixed - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + + case 5: /* DF=1, two packets fixed, and one incrementing - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + } + + fix_ip4_checksum(iph1); + fix_ip4_checksum(iph2); + write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr); + write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr); + + if (send_three) { + fix_ip4_checksum(iph3); + write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) +{ + for (int i = 0; i < num_flush_id_cases; i++) { + sleep(1); + send_flush_id_case(fd, daddr, i); + sleep(1); + write_packet(fd, fin_pkt, total_hdr_len, daddr); + } +} + +static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE]; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data1); + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); + + create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data2); + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); +} + +/* IPv4 options shouldn't coalesce */ +static void send_ip_options(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char optpkt[sizeof(buf) + sizeof(struct ip_timestamp)]; + int optlen = sizeof(struct ip_timestamp); + int pkt_size = total_hdr_len + PAYLOAD_LEN + optlen; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); + + create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0); + add_ipv4_ts_option(buf, optpkt); + write_packet(fd, optpkt, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr); +} + +/* IPv4 fragments shouldn't coalesce */ +static void send_fragment4(int fd, struct sockaddr_ll *daddr) +{ + static char buf[IP_MAXPACKET]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + int pkt_size = total_hdr_len + PAYLOAD_LEN; + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + /* Once fragmented, packet would retain the total_len. + * Tcp header is prepared as if rest of data is in follow-up frags, + * but follow up frags aren't actually sent. + */ + memset(buf + total_hdr_len, 'a', PAYLOAD_LEN * 2); + fill_transportlayer(buf + tcp_offset, PAYLOAD_LEN, 0, PAYLOAD_LEN * 2, 0); + fill_networklayer(buf + ETH_HLEN, PAYLOAD_LEN, IPPROTO_TCP); + fill_datalinklayer(buf); + + iph->frag_off = htons(0x6000); // DF = 1, MF = 1 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv4 packets with different ttl don't coalesce.*/ +static void send_changed_ttl(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + iph->ttl = 7; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different tos don't coalesce.*/ +static void send_changed_tos(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + iph->tos = 1; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else if (proto == PF_INET6) { + ip6h->priority = 0xf; + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different ECN don't coalesce.*/ +static void send_changed_ECN(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + buf[ETH_HLEN + 1] ^= 0x2; // ECN set to 10 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else { + buf[ETH_HLEN + 1] ^= 0x20; // ECN set to 10 + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv6 fragments and packets with extensions don't coalesce.*/ +static void send_fragment6(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt[MAX_HDR_LEN + PAYLOAD_LEN + + sizeof(struct ip6_frag)]; + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + struct ip6_frag *frag = (void *)(extpkt + tcp_offset); + int extlen = sizeof(struct ip6_frag); + int bufpkt_len = total_hdr_len + PAYLOAD_LEN; + int extpkt_len = bufpkt_len + extlen; + int i; + + for (i = 0; i < 2; i++) { + create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); + } + sleep(1); + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + memset(extpkt, 0, extpkt_len); + + ip6h->nexthdr = IPPROTO_FRAGMENT; + ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen); + frag->ip6f_nxt = IPPROTO_TCP; + + memcpy(extpkt, buf, tcp_offset); + memcpy(extpkt + tcp_offset + extlen, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + write_packet(fd, extpkt, extpkt_len, daddr); + + create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); +} + +static void bind_packetsocket(int fd) +{ + struct sockaddr_ll daddr = {}; + + daddr.sll_family = AF_PACKET; + daddr.sll_protocol = ethhdr_proto; + daddr.sll_ifindex = if_nametoindex(ifname); + if (daddr.sll_ifindex == 0) + error(1, errno, "if_nametoindex"); + + if (bind(fd, (void *)&daddr, sizeof(daddr)) < 0) + error(1, errno, "could not bind socket"); +} + +static void set_timeout(int fd) +{ + struct timeval timeout; + + timeout.tv_sec = 3; + timeout.tv_usec = 0; + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, + sizeof(timeout)) < 0) + error(1, errno, "cannot set timeout, setsockopt failed"); +} + +static void check_recv_pkts(int fd, int *correct_payload, + int correct_num_pkts) +{ + static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; + struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + struct tcphdr *tcph; + bool bad_packet = false; + int tcp_ext_len = 0; + int ip_ext_len = 0; + int pkt_size = -1; + int data_len = 0; + int num_pkt = 0; + int i; + + vlog("Expected {"); + for (i = 0; i < correct_num_pkts; i++) + vlog("%d ", correct_payload[i]); + vlog("}, Total %d packets\nReceived {", correct_num_pkts); + + while (1) { + ip_ext_len = 0; + pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + if (pkt_size < 0) + error(1, errno, "could not receive"); + + if (iph->version == 4) + ip_ext_len = (iph->ihl - 5) * 4; + else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP) + ip_ext_len = MIN_EXTHDR_SIZE; + + tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len); + + if (tcph->fin) + break; + + tcp_ext_len = (tcph->doff - 5) * 4; + data_len = pkt_size - total_hdr_len - tcp_ext_len - ip_ext_len; + /* Min ethernet frame payload is 46(ETH_ZLEN - ETH_HLEN) by RFC 802.3. + * Ipv4/tcp packets without at least 6 bytes of data will be padded. + * Packet sockets are protocol agnostic, and will not trim the padding. + */ + if (pkt_size == ETH_ZLEN && iph->version == 4) { + data_len = ntohs(iph->tot_len) + - sizeof(struct tcphdr) - sizeof(struct iphdr); + } + vlog("%d ", data_len); + if (data_len != correct_payload[num_pkt]) { + vlog("[!=%d]", correct_payload[num_pkt]); + bad_packet = true; + } + num_pkt++; + } + vlog("}, Total %d packets.\n", num_pkt); + if (num_pkt != correct_num_pkts) + error(1, 0, "incorrect number of packets"); + if (bad_packet) + error(1, 0, "incorrect packet geometry"); + + printf("Test succeeded\n\n"); +} + +static void gro_sender(void) +{ + const int fin_delay_us = 100 * 1000; + static char fin_pkt[MAX_HDR_LEN]; + struct sockaddr_ll daddr = {}; + int txfd = -1; + + txfd = socket(PF_PACKET, SOCK_RAW, IPPROTO_RAW); + if (txfd < 0) + error(1, errno, "socket creation"); + + memset(&daddr, 0, sizeof(daddr)); + daddr.sll_ifindex = if_nametoindex(ifname); + if (daddr.sll_ifindex == 0) + error(1, errno, "if_nametoindex"); + daddr.sll_family = AF_PACKET; + memcpy(daddr.sll_addr, dst_mac, ETH_ALEN); + daddr.sll_halen = ETH_ALEN; + create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1); + + if (strcmp(testname, "data") == 0) { + send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ack") == 0) { + send_ack(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "flags") == 0) { + send_flags(txfd, &daddr, 1, 0, 0, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 1, 0, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 0, 1, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_flags(txfd, &daddr, 0, 0, 0, 1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "tcp") == 0) { + send_changed_checksum(txfd, &daddr); + /* Adding sleep before sending FIN so that it is not + * received prior to other packets. + */ + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_seq(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_ts(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_diff_opt(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip") == 0) { + send_changed_ECN(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_changed_tos(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + if (proto == PF_INET) { + /* Modified packets may be received out of order. + * Sleep function added to enforce test boundaries + * so that fin pkts are not received prior to other pkts. + */ + sleep(1); + send_changed_ttl(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + send_ip_options(txfd, &daddr); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + send_fragment4(txfd, &daddr); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + test_flush_id(txfd, &daddr, fin_pkt); + } else if (proto == PF_INET6) { + sleep(1); + send_fragment6(txfd, &daddr); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + /* send IPv6 packets with ext header with same payload */ + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + sleep(1); + /* send IPv6 packets with ext header with different payload */ + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); + sleep(1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } + } else if (strcmp(testname, "large") == 0) { + /* 20 is the difference between min iphdr size + * and min ipv6hdr size. Like MAX_HDR_SIZE, + * MAX_PAYLOAD is defined with the larger header of the two. + */ + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; + + send_large(txfd, &daddr, remainder); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + send_large(txfd, &daddr, remainder + 1); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else { + error(1, 0, "Unknown testcase"); + } + + if (close(txfd)) + error(1, errno, "socket close"); +} + +static void gro_receiver(void) +{ + static int correct_payload[NUM_PACKETS]; + int rxfd = -1; + + rxfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_NONE)); + if (rxfd < 0) + error(1, 0, "socket creation"); + setup_sock_filter(rxfd); + set_timeout(rxfd); + bind_packetsocket(rxfd); + + ksft_ready(); + + memset(correct_payload, 0, sizeof(correct_payload)); + + if (strcmp(testname, "data") == 0) { + printf("pure data packet of same size: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("large data packets followed by a smaller one: "); + correct_payload[0] = PAYLOAD_LEN * 1.5; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("small data packets followed by a larger one: "); + correct_payload[0] = PAYLOAD_LEN / 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ack") == 0) { + printf("duplicate ack and pure ack: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "flags") == 0) { + correct_payload[0] = PAYLOAD_LEN * 3; + correct_payload[1] = PAYLOAD_LEN * 2; + + printf("psh flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 2); + + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; + printf("syn flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + + printf("rst flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + + printf("urg flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "tcp") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + correct_payload[3] = PAYLOAD_LEN; + + printf("changed checksum does not coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("Wrong Seq number doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("Different timestamp doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 4); + + printf("Different options doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + + printf("different ECN doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("different tos doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + if (proto == PF_INET) { + printf("different ttl doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + printf("ip options doesn't coalesce: "); + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + + printf("fragmented ip4 doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + + /* is_atomic checks */ + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (proto == PF_INET6) { + /* GRO doesn't check for ipv6 hop limit when flushing. + * Hence no corresponding test to the ipv4 case. + */ + printf("fragmented ip6 doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + + printf("ipv6 with ext header does coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("ipv6 with ext header with different payloads doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } + } else if (strcmp(testname, "large") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; + + correct_payload[0] = (MAX_PAYLOAD + offset); + correct_payload[1] = remainder; + printf("Shouldn't coalesce if exceed IP max pkt size: "); + check_recv_pkts(rxfd, correct_payload, 2); + + /* last segment sent individually, doesn't start new segment */ + correct_payload[0] = correct_payload[0] - remainder; + correct_payload[1] = remainder + 1; + correct_payload[2] = remainder + 1; + check_recv_pkts(rxfd, correct_payload, 3); + } else { + error(1, 0, "Test case error, should never trigger"); + } + + if (close(rxfd)) + error(1, 0, "socket close"); +} + +static void parse_args(int argc, char **argv) +{ + static const struct option opts[] = { + { "daddr", required_argument, NULL, 'd' }, + { "dmac", required_argument, NULL, 'D' }, + { "iface", required_argument, NULL, 'i' }, + { "ipv4", no_argument, NULL, '4' }, + { "ipv6", no_argument, NULL, '6' }, + { "ipip", no_argument, NULL, 'e' }, + { "rx", no_argument, NULL, 'r' }, + { "saddr", required_argument, NULL, 's' }, + { "smac", required_argument, NULL, 'S' }, + { "test", required_argument, NULL, 't' }, + { "verbose", no_argument, NULL, 'v' }, + { 0, 0, 0, 0 } + }; + int c; + + while ((c = getopt_long(argc, argv, "46d:D:ei:rs:S:t:v", opts, NULL)) != -1) { + switch (c) { + case '4': + proto = PF_INET; + ethhdr_proto = htons(ETH_P_IP); + break; + case '6': + proto = PF_INET6; + ethhdr_proto = htons(ETH_P_IPV6); + break; + case 'e': + ipip = true; + proto = PF_INET; + ethhdr_proto = htons(ETH_P_IP); + break; + case 'd': + addr4_dst = addr6_dst = optarg; + break; + case 'D': + dmac = optarg; + break; + case 'i': + ifname = optarg; + break; + case 'r': + tx_socket = false; + break; + case 's': + addr4_src = addr6_src = optarg; + break; + case 'S': + smac = optarg; + break; + case 't': + testname = optarg; + break; + case 'v': + verbose = true; + break; + default: + error(1, 0, "%s invalid option %c\n", __func__, c); + break; + } + } +} + +int main(int argc, char **argv) +{ + parse_args(argc, argv); + + if (ipip) { + tcp_offset = ETH_HLEN + sizeof(struct iphdr) * 2; + total_hdr_len = tcp_offset + sizeof(struct tcphdr); + } else if (proto == PF_INET) { + tcp_offset = ETH_HLEN + sizeof(struct iphdr); + total_hdr_len = tcp_offset + sizeof(struct tcphdr); + } else if (proto == PF_INET6) { + tcp_offset = ETH_HLEN + sizeof(struct ipv6hdr); + total_hdr_len = MAX_HDR_LEN; + } else { + error(1, 0, "Protocol family is not ipv4 or ipv6"); + } + + read_MAC(src_mac, smac); + read_MAC(dst_mac, dmac); + + if (tx_socket) { + gro_sender(); + } else { + /* Only the receiver exit status determines test success. */ + gro_receiver(); + fprintf(stderr, "Gro::%s test passed.\n", testname); + } + + return 0; +} diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py new file mode 100755 index 000000000000..ba83713bf7b5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/gro.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +GRO (Generic Receive Offload) conformance tests. + +Validates that GRO coalescing works correctly by running the gro +binary in different configurations and checking for correct packet +coalescing behavior. + +Test cases: + - data: Data packets with same size/headers and correct seq numbers coalesce + - ack: Pure ACK packets do not coalesce + - flags: Packets with PSH, SYN, URG, RST flags do not coalesce + - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce + - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce + - large: Packets larger than GRO_MAX_SIZE don't coalesce +""" + +import os +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import NetDrvEpEnv, KsftXfailEx +from lib.py import cmd, defer, bkg, ip +from lib.py import ksft_variants + + +def _resolve_dmac(cfg, ipver): + """ + Find the destination MAC address remote host should use to send packets + towards the local host. It may be a router / gateway address. + """ + + attr = "dmac" + ipver + # Cache the response across test cases + if hasattr(cfg, attr): + return getattr(cfg, attr) + + route = ip(f"-{ipver} route get {cfg.addr_v[ipver]}", + json=True, host=cfg.remote)[0] + gw = route.get("gateway") + # Local L2 segment, address directly + if not gw: + setattr(cfg, attr, cfg.dev['address']) + return getattr(cfg, attr) + + # ping to make sure neighbor is resolved, + # bind to an interface, for v6 the GW is likely link local + cmd(f"ping -c1 -W0 -I{cfg.remote_ifname} {gw}", host=cfg.remote) + + neigh = ip(f"neigh get {gw} dev {cfg.remote_ifname}", + json=True, host=cfg.remote)[0] + setattr(cfg, attr, neigh['lladdr']) + return getattr(cfg, attr) + + +def _write_defer_restore(cfg, path, val, defer_undo=False): + with open(path, "r", encoding="utf-8") as fp: + orig_val = fp.read().strip() + if str(val) == orig_val: + return + with open(path, "w", encoding="utf-8") as fp: + fp.write(val) + if defer_undo: + defer(_write_defer_restore, cfg, path, orig_val) + + +def _set_mtu_restore(dev, mtu, host): + if dev['mtu'] < mtu: + ip(f"link set dev {dev['ifname']} mtu {mtu}", host=host) + defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host) + + +def _setup(cfg, test_name): + """ Setup hardware loopback mode for GRO testing. """ + + if not hasattr(cfg, "bin_remote"): + cfg.bin_local = cfg.test_dir / "gro" + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + # "large" test needs at least 4k MTU + if test_name == "large": + _set_mtu_restore(cfg.dev, 4096, None) + _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) + + flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" + irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" + + _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) + _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + + try: + # Disable TSO for local tests + cfg.require_nsim() # will raise KsftXfailEx if not running on nsim + + cmd(f"ethtool -K {cfg.ifname} gro on tso off") + cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote) + except KsftXfailEx: + pass + +def _gro_variants(): + """Generator that yields all combinations of protocol and test types.""" + + for protocol in ["ipv4", "ipv6", "ipip"]: + for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + yield protocol, test_name + + +@ksft_variants(_gro_variants()) +def test(cfg, protocol, test_name): + """Run a single GRO test with retries.""" + + ipver = "6" if protocol[-1] == "6" else "4" + cfg.require_ipver(ipver) + + _setup(cfg, test_name) + + base_cmd_args = [ + f"--{protocol}", + f"--dmac {_resolve_dmac(cfg, ipver)}", + f"--smac {cfg.remote_dev['address']}", + f"--daddr {cfg.addr_v[ipver]}", + f"--saddr {cfg.remote_addr_v[ipver]}", + f"--test {test_name}", + "--verbose" + ] + base_args = " ".join(base_cmd_args) + + # Each test is run 6 times to deflake, because given the receive timing, + # not all packets that should coalesce will be considered in the same flow + # on every try. + max_retries = 6 + for attempt in range(max_retries): + rx_cmd = f"{cfg.bin_local} {base_args} --rx --iface {cfg.ifname}" + tx_cmd = f"{cfg.bin_remote} {base_args} --iface {cfg.remote_ifname}" + + fail_now = attempt >= max_retries - 1 + + with bkg(rx_cmd, ksft_ready=True, exit_wait=True, + fail=fail_now) as rx_proc: + cmd(tx_cmd, host=cfg.remote) + + if rx_proc.ret == 0: + return + + ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) + ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + + if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): + ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") + return + + ksft_pr(f"Attempt {attempt + 1}/{max_retries} failed, retrying...") + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEpEnv(__file__) as cfg: + ksft_run(cases=[test], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore index 6942bf575497..46540468a775 100644 --- a/tools/testing/selftests/drivers/net/hw/.gitignore +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only iou-zcrx ncdevmem +toeplitz diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 8133d1a0051c..9c163ba6feee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,10 +1,26 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT -TEST_GEN_FILES = iou-zcrx +# Check if io_uring supports zero-copy receive +HAS_IOURING_ZCRX := $(shell \ + echo -e '#include <liburing.h>\n' \ + 'void *func = (void *)io_uring_register_ifq;\n' \ + 'int main() {return 0;}' | \ + $(CC) -luring -x c - -o /dev/null 2>&1 && echo y) + +ifeq ($(HAS_IOURING_ZCRX),y) +COND_GEN_FILES += iou-zcrx +else +$(warning excluding iouring tests, liburing not installed or too old) +endif + +TEST_GEN_FILES := \ + $(COND_GEN_FILES) \ +# end of TEST_GEN_FILES TEST_PROGS = \ csum.py \ devlink_port_split.py \ + devlink_rate_tc_bw.py \ devmem.py \ ethtool.sh \ ethtool_extended_state.sh \ @@ -21,6 +37,7 @@ TEST_PROGS = \ rss_ctx.py \ rss_flow_label.py \ rss_input_xfrm.py \ + toeplitz.py \ tso.py \ xsk_reconfig.py \ # @@ -38,7 +55,10 @@ TEST_INCLUDES := \ # # YNL files, must be before "include ..lib.mk" -YNL_GEN_FILES := ncdevmem +YNL_GEN_FILES := \ + ncdevmem \ + toeplitz \ +# end of YNL_GEN_FILES TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) @@ -54,4 +74,6 @@ include ../../../net/ynl.mk include ../../../net/bpf.mk +ifeq ($(HAS_IOURING_ZCRX),y) $(OUTPUT)/iou-zcrx: LDLIBS += -luring +endif diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py index ead6784d1910..4e4faa9275bb 100755 --- a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py +++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py @@ -21,21 +21,21 @@ Test Cases: ---------- 1. test_no_tc_mapping_bandwidth: - Verifies that without TC mapping, bandwidth is NOT distributed according to - the configured 80/20 split between TC4 and TC3 - - This test should fail if bandwidth matches the 80/20 split without TC + the configured 20/80 split between TC3 and TC4 + - This test should fail if bandwidth matches the 20/80 split without TC mapping - - Expected: Bandwidth should NOT be distributed as 80/20 + - Expected: Bandwidth should NOT be distributed as 20/80 2. test_tc_mapping_bandwidth: - Configures TC mapping using mqprio qdisc - Verifies that with TC mapping, bandwidth IS distributed according to the - configured 80/20 split between TC3 and TC4 - - Expected: Bandwidth should be distributed as 80/20 + configured 20/80 split between TC3 and TC4 + - Expected: Bandwidth should be distributed as 20/80 Bandwidth Distribution: ---------------------- -- TC3 (VLAN 101): Configured for 80% of total bandwidth -- TC4 (VLAN 102): Configured for 20% of total bandwidth +- TC3 (VLAN 101): Configured for 20% of total bandwidth +- TC4 (VLAN 102): Configured for 80% of total bandwidth - Total bandwidth: 1Gbps - Tolerance: +-12% @@ -64,43 +64,40 @@ from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from lib.py import NetDrvEpEnv, DevlinkFamily from lib.py import NlError from lib.py import cmd, defer, ethtool, ip +from lib.py import Iperf3Runner class BandwidthValidator: """ - Validates bandwidth totals and per-TC shares against expected values - with a tolerance. + Validates total bandwidth and individual shares with tolerance + relative to the overall total. """ - def __init__(self): + def __init__(self, shares): self.tolerance_percent = 12 - self.expected_total_gbps = 1.0 - self.total_min_expected = self.min_expected(self.expected_total_gbps) - self.total_max_expected = self.max_expected(self.expected_total_gbps) - self.tc_expected_percent = { - 3: 20.0, - 4: 80.0, - } + self.expected_total = sum(shares.values()) + self.bounds = {} + + for name, exp in shares.items(): + self.bounds[name] = (self.min_expected(exp), self.max_expected(exp)) def min_expected(self, value): """Calculates the minimum acceptable value based on tolerance.""" - return value - (value * self.tolerance_percent / 100) + return value - (self.expected_total * self.tolerance_percent / 100) def max_expected(self, value): """Calculates the maximum acceptable value based on tolerance.""" - return value + (value * self.tolerance_percent / 100) - - def bound(self, expected, value): - """Returns True if value is within expected tolerance.""" - return self.min_expected(expected) <= value <= self.max_expected(expected) + return value + (self.expected_total * self.tolerance_percent / 100) - def tc_bandwidth_bound(self, value, tc_ix): + def bound(self, values): """ - Returns True if the given bandwidth value is within tolerance - for the TC's expected bandwidth. + Return True if all given values fall within tolerance. """ - expected = self.tc_expected_percent[tc_ix] - return self.bound(expected, value) + for name, value in values.items(): + low, high = self.bounds[name] + if not low <= value <= high: + return False + return True def setup_vf(cfg, set_tc_mapping=True): @@ -116,8 +113,8 @@ def setup_vf(cfg, set_tc_mapping=True): except Exception as exc: raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc try: - cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") - defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") + cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True) + defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True) except Exception as exc: raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc @@ -139,8 +136,8 @@ def setup_vlans_on_vf(vf_ifc): Sets up two VLAN interfaces on the given VF, each mapped to a different TC. """ vlan_configs = [ - {"vlan_id": 101, "tc": 3, "ip": "198.51.100.2"}, - {"vlan_id": 102, "tc": 4, "ip": "198.51.100.10"}, + {"vlan_id": 101, "tc": 3, "ip": "198.51.100.1"}, + {"vlan_id": 102, "tc": 4, "ip": "198.51.100.9"}, ] for config in vlan_configs: @@ -224,13 +221,13 @@ def setup_devlink_rate(cfg): raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc -def setup_remote_server(cfg): +def setup_remote_vlans(cfg): """ - Sets up VLAN interfaces and starts iperf3 servers on the remote side. + Sets up VLAN interfaces on the remote side. """ remote_dev = cfg.remote_ifname vlan_ids = [101, 102] - remote_ips = ["198.51.100.1", "198.51.100.9"] + remote_ips = ["198.51.100.2", "198.51.100.10"] for vlan_id, ip_addr in zip(vlan_ids, remote_ips): vlan_dev = f"{remote_dev}.{vlan_id}" @@ -238,14 +235,13 @@ def setup_remote_server(cfg): f"type vlan id {vlan_id}", host=cfg.remote) cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote) cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote) - cmd(f"iperf3 -s -1 -B {ip_addr}",background=True, host=cfg.remote) defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote) def setup_test_environment(cfg, set_tc_mapping=True): """ Sets up the complete test environment including VF creation, VLANs, - bridge configuration, devlink rate setup, and the remote server. + bridge configuration and devlink rate setup. """ vf_ifc = setup_vf(cfg, set_tc_mapping) ksft_pr(f"Created VF interface: {vf_ifc}") @@ -256,51 +252,39 @@ def setup_test_environment(cfg, set_tc_mapping=True): setup_bridge(cfg) setup_devlink_rate(cfg) - setup_remote_server(cfg) - time.sleep(2) + setup_remote_vlans(cfg) -def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1): +def measure_bandwidth(cfg, server_ip, client_ip, barrier): """ - Runs a single iperf3 client instance, binding to the given local IP. - Waits on a barrier to synchronize with other threads. + Synchronizes with peers and runs an iperf3-based bandwidth measurement + between the given endpoints. Returns average Gbps. """ + runner = Iperf3Runner(cfg, server_ip=server_ip, client_ip=client_ip) try: barrier.wait(timeout=10) except Exception as exc: raise KsftFailEx("iperf3 barrier wait timed") from exc - iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"] - result = subprocess.run(iperf_cmd, capture_output=True, text=True, - check=True) - try: - output = json.loads(result.stdout) - bits_per_second = output["end"]["sum_received"]["bits_per_second"] - gbps = bits_per_second / 1e9 - if gbps < min_expected_gbps: - ksft_pr( - f"iperf3 bandwidth too low: {gbps:.2f} Gbps " - f"(expected ≥ {min_expected_gbps} Gbps)" - ) - return None - return gbps - except json.JSONDecodeError as exc: - ksft_pr(f"Failed to parse iperf3 JSON output: {exc}") - return None + bw_gbps = runner.measure_bandwidth(reverse=True) + except Exception as exc: + raise KsftFailEx("iperf3 bandwidth measurement failed") from exc + + return bw_gbps -def run_bandwidth_test(): +def run_bandwidth_test(cfg): """ - Launches iperf3 client threads for each VLAN/TC pair and collects results. + Runs parallel bandwidth measurements for each VLAN/TC pair and collects results. """ - def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix): - results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier) + def _run_measure_bandwidth_thread(local_ip, remote_ip, results, barrier, tc_ix): + results[tc_ix] = measure_bandwidth(cfg, local_ip, remote_ip, barrier) vf_vlan_data = [ # (local_ip, remote_ip, TC) - ("198.51.100.2", "198.51.100.1", 3), - ("198.51.100.10", "198.51.100.9", 4), + ("198.51.100.1", "198.51.100.2", 3), + ("198.51.100.9", "198.51.100.10", 4), ] results = {} @@ -309,8 +293,8 @@ def run_bandwidth_test(): for local_ip, remote_ip, tc_ix in vf_vlan_data: thread = threading.Thread( - target=_run_iperf_client_thread, - args=(remote_ip, local_ip, results, start_barrier, tc_ix) + target=_run_measure_bandwidth_thread, + args=(local_ip, remote_ip, results, start_barrier, tc_ix) ) thread.start() threads.append(thread) @@ -320,10 +304,11 @@ def run_bandwidth_test(): for tc_ix, tc_bw in results.items(): if tc_bw is None: - raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth") + raise KsftFailEx("iperf3 failed; cannot evaluate bandwidth") return results + def calculate_bandwidth_percentages(results): """ Calculates the percentage of total bandwidth received by TC3 and TC4. @@ -364,59 +349,48 @@ def verify_total_bandwidth(bw_data, validator): """ total = bw_data['total_bw'] - if validator.bound(validator.expected_total_gbps, total): + if validator.bound({"total": total}): return - if total < validator.total_min_expected: + low, high = validator.bounds["total"] + + if total < low: raise KsftSkipEx( f"Total bandwidth {total:.2f} Gbps < minimum " - f"{validator.total_min_expected:.2f} Gbps; " - f"parent tx_max ({validator.expected_total_gbps:.1f} G) " + f"{low:.2f} Gbps; " + f"parent tx_max ({validator.expected_total:.1f} G) " f"not reached, cannot validate share" ) raise KsftFailEx( f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling " - f"{validator.total_max_expected:.2f} Gbps " - f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)" + f"{high:.2f} Gbps " + f"(VF tx_max set to {validator.expected_total:.1f} G)" ) -def check_bandwidth_distribution(bw_data, validator): - """ - Checks whether the measured TC3 and TC4 bandwidth percentages - fall within their expected tolerance ranges. - - Returns: - bool: True if both TC3 and TC4 percentages are within bounds. - """ - tc3_valid = validator.tc_bandwidth_bound(bw_data['tc3_percentage'], 3) - tc4_valid = validator.tc_bandwidth_bound(bw_data['tc4_percentage'], 4) - - return tc3_valid and tc4_valid - - def run_bandwidth_distribution_test(cfg, set_tc_mapping): """ - Runs parallel iperf3 tests for both TCs and collects results. + Runs parallel bandwidth measurements for both TCs and collects results. """ setup_test_environment(cfg, set_tc_mapping) - bandwidths = run_bandwidth_test() + bandwidths = run_bandwidth_test(cfg) bw_data = calculate_bandwidth_percentages(bandwidths) test_name = "with TC mapping" if set_tc_mapping else "without TC mapping" print_bandwidth_results(bw_data, test_name) - verify_total_bandwidth(bw_data, cfg.bw_validator) + verify_total_bandwidth(bw_data, cfg.traffic_bw_validator) - return check_bandwidth_distribution(bw_data, cfg.bw_validator) + return cfg.tc_bw_validator.bound({"tc3": bw_data['tc3_percentage'], + "tc4": bw_data['tc4_percentage']}) def test_no_tc_mapping_bandwidth(cfg): """ - Verifies that bandwidth is not split 80/20 without traffic class mapping. + Verifies that bandwidth is not split 20/80 without traffic class mapping. """ - pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping" - fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping" + pass_bw_msg = "Bandwidth is NOT distributed as 20/80 without TC mapping" + fail_bw_msg = "Bandwidth matched 20/80 split without TC mapping" is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout if run_bandwidth_distribution_test(cfg, set_tc_mapping=False): @@ -430,13 +404,13 @@ def test_no_tc_mapping_bandwidth(cfg): def test_tc_mapping_bandwidth(cfg): """ - Verifies that bandwidth is correctly split 80/20 between TC3 and TC4 + Verifies that bandwidth is correctly split 20/80 between TC3 and TC4 when traffic class mapping is set. """ if run_bandwidth_distribution_test(cfg, set_tc_mapping=True): - ksft_pr("Bandwidth is distributed as 80/20 with TC mapping") + ksft_pr("Bandwidth is distributed as 20/80 with TC mapping") else: - raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping") + raise KsftFailEx("Bandwidth did not match 20/80 split with TC mapping") def main() -> None: @@ -451,9 +425,9 @@ def main() -> None: ) if not cfg.pci: raise KsftSkipEx("Could not get PCI address of the interface") - cfg.require_cmd("iperf3", local=True, remote=True) - cfg.bw_validator = BandwidthValidator() + cfg.traffic_bw_validator = BandwidthValidator({"total": 1}) + cfg.tc_bw_validator = BandwidthValidator({"tc3": 20, "tc4": 80}) cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth] diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index fb010a48a5a1..766bfc4ad842 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -25,10 +25,10 @@ try: fd_read_timeout, ip, rand_port, wait_port_listen, wait_file from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ - ksft_setup + ksft_setup, ksft_variants, KsftNamedVariant from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none - from drivers.net.lib.py import GenerateTraffic, Remote + from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", @@ -40,11 +40,12 @@ try: "wait_port_listen", "wait_file", "KsftSkipEx", "KsftFailEx", "KsftXfailEx", "ksft_disruptive", "ksft_exit", "ksft_pr", "ksft_run", - "ksft_setup", + "ksft_setup", "ksft_variants", "KsftNamedVariant", "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote"] + "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c new file mode 100644 index 000000000000..a4d04438c313 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Toeplitz test + * + * 1. Read packets and their rx_hash using PF_PACKET/TPACKET_V3 + * 2. Compute the rx_hash in software based on the packet contents + * 3. Compare the two + * + * Optionally, either '-C $rx_irq_cpu_list' or '-r $rps_bitmap' may be given. + * + * If '-C $rx_irq_cpu_list' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. Compute the rxqueue that RSS would select based on this rx_hash + * 6. Using the $rx_irq_cpu_list map, identify the arriving cpu based on rxq irq + * 7. Compare the cpus from 4 and 6 + * + * Else if '-r $rps_bitmap' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. Compute the cpu that RPS should select based on rx_hash and $rps_bitmap + * 6. Compare the cpus from 4 and 5 + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/filter.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <net/if.h> +#include <netdb.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <poll.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/sysinfo.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#include <ynl.h> +#include "ethtool-user.h" + +#include "../../../kselftest.h" +#include "../../../net/lib/ksft.h" + +#define TOEPLITZ_KEY_MIN_LEN 40 +#define TOEPLITZ_KEY_MAX_LEN 60 + +#define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ +#define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) +#define TOEPLITZ_STR_MAX_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MAX_LEN) + +#define FOUR_TUPLE_MAX_LEN ((sizeof(struct in6_addr) * 2) + (sizeof(uint16_t) * 2)) + +#define RSS_MAX_CPUS (1 << 16) /* real constraint is PACKET_FANOUT_MAX */ +#define RSS_MAX_INDIR (1 << 16) + +#define RPS_MAX_CPUS 16UL /* must be a power of 2 */ + +/* configuration options (cmdline arguments) */ +static uint16_t cfg_dport = 8000; +static int cfg_family = AF_INET6; +static char *cfg_ifname = "eth0"; +static int cfg_num_queues; +static int cfg_num_rps_cpus; +static bool cfg_sink; +static int cfg_type = SOCK_STREAM; +static int cfg_timeout_msec = 1000; +static bool cfg_verbose; + +/* global vars */ +static int num_cpus; +static int ring_block_nr; +static int ring_block_sz; + +/* stats */ +static int frames_received; +static int frames_nohash; +static int frames_error; + +#define log_verbose(args...) do { if (cfg_verbose) fprintf(stderr, args); } while (0) + +/* tpacket ring */ +struct ring_state { + int fd; + char *mmap; + int idx; + int cpu; +}; + +static unsigned int rx_irq_cpus[RSS_MAX_CPUS]; /* map from rxq to cpu */ +static int rps_silo_to_cpu[RPS_MAX_CPUS]; +static unsigned char toeplitz_key[TOEPLITZ_KEY_MAX_LEN]; +static unsigned int rss_indir_tbl[RSS_MAX_INDIR]; +static unsigned int rss_indir_tbl_size; +static struct ring_state rings[RSS_MAX_CPUS]; + +static inline uint32_t toeplitz(const unsigned char *four_tuple, + const unsigned char *key) +{ + int i, bit, ret = 0; + uint32_t key32; + + key32 = ntohl(*((uint32_t *)key)); + key += 4; + + for (i = 0; i < FOUR_TUPLE_MAX_LEN; i++) { + for (bit = 7; bit >= 0; bit--) { + if (four_tuple[i] & (1 << bit)) + ret ^= key32; + + key32 <<= 1; + key32 |= !!(key[0] & (1 << bit)); + } + key++; + } + + return ret; +} + +/* Compare computed cpu with arrival cpu from packet_fanout_cpu */ +static void verify_rss(uint32_t rx_hash, int cpu) +{ + int queue; + + if (rss_indir_tbl_size) + queue = rss_indir_tbl[rx_hash % rss_indir_tbl_size]; + else + queue = rx_hash % cfg_num_queues; + + log_verbose(" rxq %d (cpu %d)", queue, rx_irq_cpus[queue]); + if (rx_irq_cpus[queue] != cpu) { + log_verbose(". error: rss cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void verify_rps(uint64_t rx_hash, int cpu) +{ + int silo = (rx_hash * cfg_num_rps_cpus) >> 32; + + log_verbose(" silo %d (cpu %d)", silo, rps_silo_to_cpu[silo]); + if (rps_silo_to_cpu[silo] != cpu) { + log_verbose(". error: rps cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void log_rxhash(int cpu, uint32_t rx_hash, + const char *addrs, int addr_len) +{ + char saddr[INET6_ADDRSTRLEN], daddr[INET6_ADDRSTRLEN]; + uint16_t *ports; + + if (!inet_ntop(cfg_family, addrs, saddr, sizeof(saddr)) || + !inet_ntop(cfg_family, addrs + addr_len, daddr, sizeof(daddr))) + error(1, 0, "address parse error"); + + ports = (void *)addrs + (addr_len * 2); + log_verbose("cpu %d: rx_hash 0x%08x [saddr %s daddr %s sport %02hu dport %02hu]", + cpu, rx_hash, saddr, daddr, + ntohs(ports[0]), ntohs(ports[1])); +} + +/* Compare computed rxhash with rxhash received from tpacket_v3 */ +static void verify_rxhash(const char *pkt, uint32_t rx_hash, int cpu) +{ + unsigned char four_tuple[FOUR_TUPLE_MAX_LEN] = {0}; + uint32_t rx_hash_sw; + const char *addrs; + int addr_len; + + if (cfg_family == AF_INET) { + addr_len = sizeof(struct in_addr); + addrs = pkt + offsetof(struct iphdr, saddr); + } else { + addr_len = sizeof(struct in6_addr); + addrs = pkt + offsetof(struct ip6_hdr, ip6_src); + } + + memcpy(four_tuple, addrs, (addr_len * 2) + (sizeof(uint16_t) * 2)); + rx_hash_sw = toeplitz(four_tuple, toeplitz_key); + + if (cfg_verbose) + log_rxhash(cpu, rx_hash, addrs, addr_len); + + if (rx_hash != rx_hash_sw) { + log_verbose(" != expected 0x%x\n", rx_hash_sw); + frames_error++; + return; + } + + log_verbose(" OK"); + if (cfg_num_queues) + verify_rss(rx_hash, cpu); + else if (cfg_num_rps_cpus) + verify_rps(rx_hash, cpu); + log_verbose("\n"); +} + +static char *recv_frame(const struct ring_state *ring, char *frame) +{ + struct tpacket3_hdr *hdr = (void *)frame; + + if (hdr->hv1.tp_rxhash) + verify_rxhash(frame + hdr->tp_net, hdr->hv1.tp_rxhash, + ring->cpu); + else + frames_nohash++; + + return frame + hdr->tp_next_offset; +} + +/* A single TPACKET_V3 block can hold multiple frames */ +static bool recv_block(struct ring_state *ring) +{ + struct tpacket_block_desc *block; + char *frame; + int i; + + block = (void *)(ring->mmap + ring->idx * ring_block_sz); + if (!(block->hdr.bh1.block_status & TP_STATUS_USER)) + return false; + + frame = (char *)block; + frame += block->hdr.bh1.offset_to_first_pkt; + + for (i = 0; i < block->hdr.bh1.num_pkts; i++) { + frame = recv_frame(ring, frame); + frames_received++; + } + + block->hdr.bh1.block_status = TP_STATUS_KERNEL; + ring->idx = (ring->idx + 1) % ring_block_nr; + + return true; +} + +/* simple test: sleep once unconditionally and then process all rings */ +static void process_rings(void) +{ + int i; + + usleep(1000 * cfg_timeout_msec); + + for (i = 0; i < num_cpus; i++) + do {} while (recv_block(&rings[i])); + + fprintf(stderr, "count: pass=%u nohash=%u fail=%u\n", + frames_received - frames_nohash - frames_error, + frames_nohash, frames_error); +} + +static char *setup_ring(int fd) +{ + struct tpacket_req3 req3 = {0}; + void *ring; + + req3.tp_retire_blk_tov = cfg_timeout_msec / 8; + req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; + + req3.tp_frame_size = 2048; + req3.tp_frame_nr = 1 << 10; + req3.tp_block_nr = 16; + + req3.tp_block_size = req3.tp_frame_size * req3.tp_frame_nr; + req3.tp_block_size /= req3.tp_block_nr; + + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3))) + error(1, errno, "setsockopt PACKET_RX_RING"); + + ring_block_sz = req3.tp_block_size; + ring_block_nr = req3.tp_block_nr; + + ring = mmap(0, req3.tp_block_size * req3.tp_block_nr, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, fd, 0); + if (ring == MAP_FAILED) + error(1, 0, "mmap failed"); + + return ring; +} + +static void __set_filter(int fd, int off_proto, uint8_t proto, int off_dport) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_proto), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, proto, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_dport, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = ARRAY_SIZE(filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +/* filter on transport protocol and destination port */ +static void set_filter(int fd) +{ + const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ + uint8_t proto; + + proto = cfg_type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + if (cfg_family == AF_INET) + __set_filter(fd, offsetof(struct iphdr, protocol), proto, + sizeof(struct iphdr) + off_dport); + else + __set_filter(fd, offsetof(struct ip6_hdr, ip6_nxt), proto, + sizeof(struct ip6_hdr) + off_dport); +} + +/* drop everything: used temporarily during setup */ +static void set_filter_null(int fd) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = ARRAY_SIZE(filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +static int create_ring(char **ring) +{ + struct fanout_args args = { + .id = 1, + .type_flags = PACKET_FANOUT_CPU, + .max_num_members = RSS_MAX_CPUS + }; + struct sockaddr_ll ll = { 0 }; + int fd, val; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket creation failed"); + + val = TPACKET_V3; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt PACKET_VERSION"); + *ring = setup_ring(fd); + + /* block packets until all rings are added to the fanout group: + * else packets can arrive during setup and get misclassified + */ + set_filter_null(fd); + + ll.sll_family = AF_PACKET; + ll.sll_ifindex = if_nametoindex(cfg_ifname); + ll.sll_protocol = cfg_family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + if (bind(fd, (void *)&ll, sizeof(ll))) + error(1, errno, "bind"); + + /* must come after bind: verifies all programs in group match */ + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &args, sizeof(args))) { + /* on failure, retry using old API if that is sufficient: + * it has a hard limit of 256 sockets, so only try if + * (a) only testing rxhash, not RSS or (b) <= 256 cpus. + * in this API, the third argument is left implicit. + */ + if (cfg_num_queues || num_cpus > 256 || + setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + &args, sizeof(uint32_t))) + error(1, errno, "setsockopt PACKET_FANOUT cpu"); + } + + return fd; +} + +/* setup inet(6) socket to blackhole the test traffic, if arg '-s' */ +static int setup_sink(void) +{ + int fd, val; + + fd = socket(cfg_family, cfg_type, 0); + if (fd == -1) + error(1, errno, "socket %d.%d", cfg_family, cfg_type); + + val = 1 << 20; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val))) + error(1, errno, "setsockopt rcvbuf"); + + return fd; +} + +static void setup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + rings[i].cpu = i; + rings[i].fd = create_ring(&rings[i].mmap); + } + + /* accept packets once all rings in the fanout group are up */ + for (i = 0; i < num_cpus; i++) + set_filter(rings[i].fd); +} + +static void cleanup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + if (munmap(rings[i].mmap, ring_block_nr * ring_block_sz)) + error(1, errno, "munmap"); + if (close(rings[i].fd)) + error(1, errno, "close"); + } +} + +static void parse_cpulist(const char *arg) +{ + do { + rx_irq_cpus[cfg_num_queues++] = strtol(arg, NULL, 10); + + arg = strchr(arg, ','); + if (!arg) + break; + arg++; // skip ',' + } while (1); +} + +static void show_cpulist(void) +{ + int i; + + for (i = 0; i < cfg_num_queues; i++) + fprintf(stderr, "rxq %d: cpu %d\n", i, rx_irq_cpus[i]); +} + +static void show_silos(void) +{ + int i; + + for (i = 0; i < cfg_num_rps_cpus; i++) + fprintf(stderr, "silo %d: cpu %d\n", i, rps_silo_to_cpu[i]); +} + +static void parse_toeplitz_key(const char *str, int slen, unsigned char *key) +{ + int i, ret, off; + + if (slen < TOEPLITZ_STR_MIN_LEN || + slen > TOEPLITZ_STR_MAX_LEN + 1) + error(1, 0, "invalid toeplitz key"); + + for (i = 0, off = 0; off < slen; i++, off += 3) { + ret = sscanf(str + off, "%hhx", &key[i]); + if (ret != 1) + error(1, 0, "key parse error at %d off %d len %d", + i, off, slen); + } +} + +static void parse_rps_bitmap(const char *arg) +{ + unsigned long bitmap; + int i; + + bitmap = strtoul(arg, NULL, 0); + + if (bitmap & ~(RPS_MAX_CPUS - 1)) + error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu", + bitmap, RPS_MAX_CPUS - 1); + + for (i = 0; i < RPS_MAX_CPUS; i++) + if (bitmap & 1UL << i) + rps_silo_to_cpu[cfg_num_rps_cpus++] = i; +} + +static void read_rss_dev_info_ynl(void) +{ + struct ethtool_rss_get_req *req; + struct ethtool_rss_get_rsp *rsp; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_ethtool_family, NULL); + if (!ys) + error(1, errno, "ynl_sock_create failed"); + + req = ethtool_rss_get_req_alloc(); + if (!req) + error(1, errno, "ethtool_rss_get_req_alloc failed"); + + ethtool_rss_get_req_set_header_dev_name(req, cfg_ifname); + + rsp = ethtool_rss_get(ys, req); + if (!rsp) + error(1, ys->err.code, "YNL: %s", ys->err.msg); + + if (!rsp->_len.hkey) + error(1, 0, "RSS key not available for %s", cfg_ifname); + + if (rsp->_len.hkey < TOEPLITZ_KEY_MIN_LEN || + rsp->_len.hkey > TOEPLITZ_KEY_MAX_LEN) + error(1, 0, "RSS key length %u out of bounds [%u, %u]", + rsp->_len.hkey, TOEPLITZ_KEY_MIN_LEN, + TOEPLITZ_KEY_MAX_LEN); + + memcpy(toeplitz_key, rsp->hkey, rsp->_len.hkey); + + if (rsp->_count.indir > RSS_MAX_INDIR) + error(1, 0, "RSS indirection table too large (%u > %u)", + rsp->_count.indir, RSS_MAX_INDIR); + + /* If indir table not available we'll fallback to simple modulo math */ + if (rsp->_count.indir) { + memcpy(rss_indir_tbl, rsp->indir, + rsp->_count.indir * sizeof(rss_indir_tbl[0])); + rss_indir_tbl_size = rsp->_count.indir; + + log_verbose("RSS indirection table size: %u\n", + rss_indir_tbl_size); + } + + ethtool_rss_get_rsp_free(rsp); + ethtool_rss_get_req_free(req); + ynl_sock_destroy(ys); +} + +static void parse_opts(int argc, char **argv) +{ + static struct option long_options[] = { + {"dport", required_argument, 0, 'd'}, + {"cpus", required_argument, 0, 'C'}, + {"key", required_argument, 0, 'k'}, + {"iface", required_argument, 0, 'i'}, + {"ipv4", no_argument, 0, '4'}, + {"ipv6", no_argument, 0, '6'}, + {"sink", no_argument, 0, 's'}, + {"tcp", no_argument, 0, 't'}, + {"timeout", required_argument, 0, 'T'}, + {"udp", no_argument, 0, 'u'}, + {"verbose", no_argument, 0, 'v'}, + {"rps", required_argument, 0, 'r'}, + {0, 0, 0, 0} + }; + bool have_toeplitz = false; + int index, c; + + while ((c = getopt_long(argc, argv, "46C:d:i:k:r:stT:uv", long_options, &index)) != -1) { + switch (c) { + case '4': + cfg_family = AF_INET; + break; + case '6': + cfg_family = AF_INET6; + break; + case 'C': + parse_cpulist(optarg); + break; + case 'd': + cfg_dport = strtol(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'k': + parse_toeplitz_key(optarg, strlen(optarg), + toeplitz_key); + have_toeplitz = true; + break; + case 'r': + parse_rps_bitmap(optarg); + break; + case 's': + cfg_sink = true; + break; + case 't': + cfg_type = SOCK_STREAM; + break; + case 'T': + cfg_timeout_msec = strtol(optarg, NULL, 0); + break; + case 'u': + cfg_type = SOCK_DGRAM; + break; + case 'v': + cfg_verbose = true; + break; + + default: + error(1, 0, "unknown option %c", optopt); + break; + } + } + + if (!have_toeplitz) + read_rss_dev_info_ynl(); + + num_cpus = get_nprocs(); + if (num_cpus > RSS_MAX_CPUS) + error(1, 0, "increase RSS_MAX_CPUS"); + + if (cfg_num_queues && cfg_num_rps_cpus) + error(1, 0, + "Can't supply both RSS cpus ('-C') and RPS map ('-r')"); + if (cfg_verbose) { + show_cpulist(); + show_silos(); + } +} + +int main(int argc, char **argv) +{ + const int min_tests = 10; + int fd_sink = -1; + + parse_opts(argc, argv); + + if (cfg_sink) + fd_sink = setup_sink(); + + setup_rings(); + + /* Signal to test framework that we're ready to receive */ + ksft_ready(); + + process_rings(); + cleanup_rings(); + + if (cfg_sink && close(fd_sink)) + error(1, errno, "close sink"); + + if (frames_received - frames_nohash < min_tests) + error(1, 0, "too few frames for verification"); + + return frames_error; +} diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py new file mode 100755 index 000000000000..d2db5ee9e358 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Toeplitz Rx hashing test: + - rxhash (the hash value calculation itself); + - RSS mapping from rxhash to rx queue; + - RPS mapping from rxhash to cpu. +""" + +import glob +import os +import socket +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import NetDrvEpEnv, EthtoolFamily, NetdevFamily +from lib.py import cmd, bkg, rand_port, defer +from lib.py import ksft_in +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx, KsftFailEx + +# "define" for the ID of the Toeplitz hash function +ETH_RSS_HASH_TOP = 1 + + +def _check_rps_and_rfs_not_configured(cfg): + """Verify that RPS is not already configured.""" + + for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): + with open(rps_file, "r", encoding="utf-8") as fp: + val = fp.read().strip() + if set(val) - {"0", ","}: + raise KsftSkipEx(f"RPS already configured on {rps_file}: {val}") + + rfs_file = "/proc/sys/net/core/rps_sock_flow_entries" + with open(rfs_file, "r", encoding="utf-8") as fp: + val = fp.read().strip() + if val != "0": + raise KsftSkipEx(f"RFS already configured {rfs_file}: {val}") + + +def _get_cpu_for_irq(irq): + with open(f"/proc/irq/{irq}/smp_affinity_list", "r", + encoding="utf-8") as fp: + data = fp.read().strip() + if "," in data or "-" in data: + raise KsftFailEx(f"IRQ{irq} is not mapped to a single core: {data}") + return int(data) + + +def _get_irq_cpus(cfg): + """ + Read the list of IRQs for the device Rx queues. + """ + queues = cfg.netnl.queue_get({"ifindex": cfg.ifindex}, dump=True) + napis = cfg.netnl.napi_get({"ifindex": cfg.ifindex}, dump=True) + + # Remap into ID-based dicts + napis = {n["id"]: n for n in napis} + queues = {f"{q['type']}{q['id']}": q for q in queues} + + cpus = [] + for rx in range(9999): + name = f"rx{rx}" + if name not in queues: + break + cpus.append(_get_cpu_for_irq(napis[queues[name]["napi-id"]]["irq"])) + + return cpus + + +def _get_unused_cpus(cfg, count=2): + """ + Get CPUs that are not used by Rx queues. + Returns a list of at least 'count' CPU numbers. + """ + + # Get CPUs used by Rx queues + rx_cpus = set(_get_irq_cpus(cfg)) + + # Get total number of CPUs + num_cpus = os.cpu_count() + + # Find unused CPUs + unused_cpus = [cpu for cpu in range(num_cpus) if cpu not in rx_cpus] + + if len(unused_cpus) < count: + raise KsftSkipEx(f"Need at {count} CPUs not used by Rx queues, found {len(unused_cpus)}") + + return unused_cpus[:count] + + +def _configure_rps(cfg, rps_cpus): + """Configure RPS for all Rx queues.""" + + mask = 0 + for cpu in rps_cpus: + mask |= (1 << cpu) + mask = hex(mask)[2:] + + # Set RPS bitmap for all rx queues + for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): + with open(rps_file, "w", encoding="utf-8") as fp: + fp.write(mask) + + return mask + + +def _send_traffic(cfg, proto_flag, ipver, port): + """Send 20 packets of requested type.""" + + # Determine protocol and IP version for socat + if proto_flag == "-u": + proto = "UDP" + else: + proto = "TCP" + + baddr = f"[{cfg.addr_v['6']}]" if ipver == "6" else cfg.addr_v["4"] + + # Run socat in a loop to send traffic periodically + # Use sh -c with a loop similar to toeplitz_client.sh + socat_cmd = f""" + for i in `seq 20`; do + echo "msg $i" | socat -{ipver} -t 0.1 - {proto}:{baddr}:{port}; + sleep 0.001; + done + """ + + cmd(socat_cmd, shell=True, host=cfg.remote) + + +def _test_variants(): + for grp in ["", "rss", "rps"]: + for l4 in ["tcp", "udp"]: + for l3 in ["4", "6"]: + name = f"{l4}_ipv{l3}" + if grp: + name = f"{grp}_{name}" + yield KsftNamedVariant(name, "-" + l4[0], l3, grp) + + +@ksft_variants(_test_variants()) +def test(cfg, proto_flag, ipver, grp): + """Run a single toeplitz test.""" + + cfg.require_ipver(ipver) + + # Check that rxhash is enabled + ksft_in("receive-hashing: on", cmd(f"ethtool -k {cfg.ifname}").stdout) + + rss = cfg.ethnl.rss_get({"header": {"dev-index": cfg.ifindex}}) + # Make sure NIC is configured to use Toeplitz hash, and no key xfrm. + if rss.get('hfunc') != ETH_RSS_HASH_TOP or rss.get('input-xfrm'): + cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex}, + "hfunc": ETH_RSS_HASH_TOP, + "input-xfrm": {}}) + defer(cfg.ethnl.rss_set, {"header": {"dev-index": cfg.ifindex}, + "hfunc": rss.get('hfunc'), + "input-xfrm": rss.get('input-xfrm', {}) + }) + + port = rand_port(socket.SOCK_DGRAM) + + toeplitz_path = cfg.test_dir / "toeplitz" + rx_cmd = [ + str(toeplitz_path), + "-" + ipver, + proto_flag, + "-d", str(port), + "-i", cfg.ifname, + "-T", "4000", + "-s", + "-v" + ] + + if grp: + _check_rps_and_rfs_not_configured(cfg) + if grp == "rss": + irq_cpus = ",".join([str(x) for x in _get_irq_cpus(cfg)]) + rx_cmd += ["-C", irq_cpus] + ksft_pr(f"RSS using CPUs: {irq_cpus}") + elif grp == "rps": + # Get CPUs not used by Rx queues and configure them for RPS + rps_cpus = _get_unused_cpus(cfg, count=2) + rps_mask = _configure_rps(cfg, rps_cpus) + defer(_configure_rps, cfg, []) + rx_cmd += ["-r", rps_mask] + ksft_pr(f"RPS using CPUs: {rps_cpus}, mask: {rps_mask}") + + # Run rx in background, it will exit once it has seen enough packets + with bkg(" ".join(rx_cmd), ksft_ready=True, exit_wait=True) as rx_proc: + while rx_proc.proc.poll() is None: + _send_traffic(cfg, proto_flag, ipver, port) + + # Check rx result + ksft_pr("Receiver output:") + ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) + if rx_proc.stderr: + ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + + +def main() -> None: + """Ksft boilerplate main.""" + + with NetDrvEpEnv(__file__) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netnl = NetdevFamily() + ksft_run(cases=[test], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index b0c6300150fb..8b75faa9af6d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -25,7 +25,7 @@ try: fd_read_timeout, ip, rand_port, wait_port_listen, wait_file from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ - ksft_setup + ksft_setup, ksft_variants, KsftNamedVariant from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none @@ -38,16 +38,17 @@ try: "wait_port_listen", "wait_file", "KsftSkipEx", "KsftFailEx", "KsftXfailEx", "ksft_disruptive", "ksft_exit", "ksft_pr", "ksft_run", - "ksft_setup", + "ksft_setup", "ksft_variants", "KsftNamedVariant", "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] from .env import NetDrvEnv, NetDrvEpEnv - from .load import GenerateTraffic + from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 01be3d9b9720..8b644fd84ff2 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -168,6 +168,8 @@ class NetDrvEpEnv(NetDrvEnvBase): # resolve remote interface name self.remote_ifname = self.resolve_remote_ifc() + self.remote_dev = ip("-d link show dev " + self.remote_ifname, + host=self.remote, json=True)[0] self._required_cmd = {} diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index c4e808407cc4..f181fa2d38fc 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -2,21 +2,89 @@ import re import time +import json from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen -class GenerateTraffic: - def __init__(self, env, port=None): - env.require_cmd("iperf3", local=True, remote=True) +class Iperf3Runner: + """ + Sets up and runs iperf3 traffic. + """ + def __init__(self, env, port=None, server_ip=None, client_ip=None): + env.require_cmd("iperf3", local=True, remote=True) self.env = env - self.port = rand_port() if port is None else port - self._iperf_server = cmd(f"iperf3 -s -1 -p {self.port}", background=True) + self.server_ip = server_ip + self.client_ip = client_ip + + def _build_server(self): + cmdline = f"iperf3 -s -1 -p {self.port}" + if self.server_ip: + cmdline += f" -B {self.server_ip}" + return cmdline + + def _build_client(self, streams, duration, reverse): + host = self.env.addr if self.server_ip is None else self.server_ip + cmdline = f"iperf3 -c {host} -p {self.port} -P {streams} -t {duration} -J" + if self.client_ip: + cmdline += f" -B {self.client_ip}" + if reverse: + cmdline += " --reverse" + return cmdline + + def start_server(self): + """ + Starts an iperf3 server with optional bind IP. + """ + cmdline = self._build_server() + proc = cmd(cmdline, background=True) wait_port_listen(self.port) time.sleep(0.1) - self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {self.port} -t 86400", - background=True, host=env.remote) + return proc + + def start_client(self, background=False, streams=1, duration=10, reverse=False): + """ + Starts the iperf3 client with the configured options. + """ + cmdline = self._build_client(streams, duration, reverse) + return cmd(cmdline, background=background, host=self.env.remote) + + def measure_bandwidth(self, reverse=False): + """ + Runs an iperf3 measurement and returns the average bandwidth (Gbps). + Discards the first and last few reporting intervals and uses only the + middle part of the run where throughput is typically stable. + """ + self.start_server() + result = self.start_client(duration=10, reverse=reverse) + + if result.ret != 0: + raise RuntimeError("iperf3 failed to run successfully") + try: + out = json.loads(result.stdout) + except json.JSONDecodeError as exc: + raise ValueError("Failed to parse iperf3 JSON output") from exc + + intervals = out.get("intervals", []) + samples = [i["sum"]["bits_per_second"] / 1e9 for i in intervals] + if len(samples) < 10: + raise ValueError(f"iperf3 returned too few intervals: {len(samples)}") + # Discard potentially unstable first and last 3 seconds. + stable = samples[3:-3] + + avg = sum(stable) / len(stable) + + return avg + + +class GenerateTraffic: + def __init__(self, env, port=None): + self.env = env + self.runner = Iperf3Runner(env, port) + + self._iperf_server = self.runner.start_server() + self._iperf_client = self.runner.start_client(background=True, streams=16, duration=86400) # Wait for traffic to ramp up if not self._wait_pkts(pps=1000): @@ -61,7 +129,7 @@ class GenerateTraffic: def _wait_client_stopped(self, sleep=0.005, timeout=5): end = time.monotonic() + timeout - live_port_pattern = re.compile(fr":{self.port:04X} 0[^6] ") + live_port_pattern = re.compile(fr":{self.runner.port:04X} 0[^6] ") while time.monotonic() < end: data = cmd("cat /proc/net/tcp*", host=self.env.remote).stdout diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 8e1085e89647..ae8abff4be40 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -11,9 +11,11 @@ set -euo pipefail LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") SRCIF="" # to be populated later +SRCIP="" # to be populated later SRCIP4="192.0.2.1" SRCIP6="fc00::1" DSTIF="" # to be populated later +DSTIP="" # to be populated later DSTIP4="192.0.2.2" DSTIP6="fc00::2" @@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" # NAMESPACE will be populated by setup_ns with a random value NAMESPACE="" -# IDs for netdevsim +# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test +# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the +# same time. NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_BOND_TX_1=$((768 + RANDOM % 256)) +NSIM_BOND_TX_2=$((1024 + RANDOM % 256)) +NSIM_BOND_RX_1=$((1280 + RANDOM % 256)) +NSIM_BOND_RX_2=$((1536 + RANDOM % 256)) NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" +NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" # Used to create and delete namespaces source "${LIBDIR}"/../../../../net/lib.sh # Create netdevsim interfaces create_ifaces() { - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" udevadm settle 2> /dev/null || true @@ -113,31 +121,38 @@ function set_network() { configure_ip } -function create_dynamic_target() { - local FORMAT=${1:-"extended"} +function _create_dynamic_target() { + local FORMAT="${1:?FORMAT parameter required}" + local NCPATH="${2:?NCPATH parameter required}" DSTMAC=$(ip netns exec "${NAMESPACE}" \ ip link show "${DSTIF}" | awk '/ether/ {print $2}') # Create a dynamic target - mkdir "${NETCONS_PATH}" + mkdir "${NCPATH}" - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + echo "${DSTIP}" > "${NCPATH}"/remote_ip + echo "${SRCIP}" > "${NCPATH}"/local_ip + echo "${DSTMAC}" > "${NCPATH}"/remote_mac + echo "${SRCIF}" > "${NCPATH}"/dev_name if [ "${FORMAT}" == "basic" ] then # Basic target does not support release - echo 0 > "${NETCONS_PATH}"/release - echo 0 > "${NETCONS_PATH}"/extended + echo 0 > "${NCPATH}"/release + echo 0 > "${NCPATH}"/extended elif [ "${FORMAT}" == "extended" ] then - echo 1 > "${NETCONS_PATH}"/extended + echo 1 > "${NCPATH}"/extended fi +} - echo 1 > "${NETCONS_PATH}"/enabled +function create_dynamic_target() { + local FORMAT=${1:-"extended"} + local NCPATH=${2:-"$NETCONS_PATH"} + _create_dynamic_target "${FORMAT}" "${NCPATH}" + + echo 1 > "${NCPATH}"/enabled # This will make sure that the kernel was able to # load the netconsole driver configuration. The console message @@ -185,14 +200,26 @@ function do_cleanup() { echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk } -function cleanup() { +function cleanup_netcons() { # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled + # do not fail if the target is already disabled + if [[ ! -d "${NETCONS_PATH}" ]] + then + # in some cases this is called before netcons path is created + return + fi + if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + then + echo 0 > "${NETCONS_PATH}"/enabled || true + fi # Remove all the keys that got created during the selftest find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry rmdir "${NETCONS_PATH}" +} +function cleanup() { + cleanup_netcons do_cleanup } @@ -222,7 +249,7 @@ function listen_port_and_save_to() { # Just wait for 2 seconds timeout 2 ip netns exec "${NAMESPACE}" \ - socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" + socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null } # Only validate that the message arrived properly @@ -369,3 +396,24 @@ function wait_for_port() { # more frequently on IPv6 sleep 1 } + +# Clean up netdevsim ifaces created for bonding test +function cleanup_bond_nsim() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond || true + ip -n "${RXNS}" \ + link delete "${BOND_RX_MAIN_IF}" type bond || true + + cleanup_netdevsim "$NSIM_BOND_TX_1" + cleanup_netdevsim "$NSIM_BOND_TX_2" + cleanup_netdevsim "$NSIM_BOND_RX_1" + cleanup_netdevsim "$NSIM_BOND_RX_2" +} + +# cleanup tests that use bonding interfaces +function cleanup_bond() { + cleanup_netcons + cleanup_bond_nsim + cleanup_all_ns + ip link delete "${VETH0}" || true +} diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh index a3446b569976..2022f3061738 100755 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -28,8 +28,6 @@ OUTPUT_FILE="/tmp/${TARGET}" # Check for basic system dependency and exit if not found check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk # Remove the namespace, interfaces and netconsole target on exit trap cleanup EXIT @@ -39,6 +37,9 @@ do for IP_VERSION in "ipv6" "ipv4" do echo "Running with target mode: ${FORMAT} (${IP_VERSION})" + # Set current loglevel to KERN_INFO(6), and default to + # KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk # Create one namespace and two interfaces set_network "${IP_VERSION}" # Create a dynamic target for netconsole diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh index 29bad56448a2..06089643b771 100755 --- a/tools/testing/selftests/drivers/net/netcons_overflow.sh +++ b/tools/testing/selftests/drivers/net/netcons_overflow.sh @@ -15,7 +15,7 @@ SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh # This is coming from netconsole code. Check for it in drivers/net/netconsole.c -MAX_USERDATA_ITEMS=16 +MAX_USERDATA_ITEMS=256 # Function to create userdata entries function create_userdata_max_entries() { diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh new file mode 100755 index 000000000000..2ce9ee3719d1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! -d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile index daf51113c827..1a228c5430f5 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/Makefile +++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile @@ -8,7 +8,6 @@ TEST_PROGS := \ ethtool-features.sh \ ethtool-fec.sh \ ethtool-pause.sh \ - ethtool-ring.sh \ fib.sh \ fib_notifications.sh \ hw_stats_l3.sh \ @@ -20,4 +19,8 @@ TEST_PROGS := \ udp_tunnel_nic.sh \ # end of TEST_PROGS +TEST_FILES := \ + ethtool-common.sh +# end of TEST_FILES + include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 030762b203d7..1b529ccaf050 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -3,7 +3,8 @@ lib_dir=$(dirname $0)/../../../net/forwarding -ALL_TESTS="fw_flash_test params_test regions_test reload_test \ +ALL_TESTS="fw_flash_test params_test \ + params_default_test regions_test reload_test \ netns_reload_test resource_test dev_info_test \ empty_reporter_test dummy_reporter_test rate_test" NUM_NETIFS=0 @@ -78,17 +79,28 @@ fw_flash_test() param_get() { local name=$1 + local attr=${2:-value} + local cmode=${3:-driverinit} cmd_jq "devlink dev param show $DL_HANDLE name $name -j" \ - '.[][][].values[] | select(.cmode == "driverinit").value' + '.[][][].values[] | select(.cmode == "'"$cmode"'").'"$attr" } param_set() { local name=$1 local value=$2 + local cmode=${3:-driverinit} - devlink dev param set $DL_HANDLE name $name cmode driverinit value $value + devlink dev param set $DL_HANDLE name $name cmode $cmode value $value +} + +param_set_default() +{ + local name=$1 + local cmode=${2:-driverinit} + + devlink dev param set $DL_HANDLE name $name default cmode $cmode } check_value() @@ -97,12 +109,18 @@ check_value() local phase_name=$2 local expected_param_value=$3 local expected_debugfs_value=$4 + local cmode=${5:-driverinit} local value + local attr="value" - value=$(param_get $name) - check_err $? "Failed to get $name param value" + if [[ "$phase_name" == *"default"* ]]; then + attr="default" + fi + + value=$(param_get $name $attr $cmode) + check_err $? "Failed to get $name param $attr" [ "$value" == "$expected_param_value" ] - check_err $? "Unexpected $phase_name $name param value" + check_err $? "Unexpected $phase_name $name param $attr" value=$(<$DEBUGFS_DIR/$name) check_err $? "Failed to get $name debugfs value" [ "$value" == "$expected_debugfs_value" ] @@ -135,6 +153,92 @@ params_test() log_test "params test" } +value_to_debugfs() +{ + local value=$1 + + case "$value" in + true) + echo "Y" + ;; + false) + echo "N" + ;; + *) + echo "$value" + ;; + esac +} + +test_default() +{ + local param_name=$1 + local new_value=$2 + local expected_default=$3 + local cmode=${4:-driverinit} + local default_debugfs + local new_debugfs + local expected_debugfs + + default_debugfs=$(value_to_debugfs $expected_default) + new_debugfs=$(value_to_debugfs $new_value) + + expected_debugfs=$default_debugfs + check_value $param_name initial-default $expected_default $expected_debugfs $cmode + + param_set $param_name $new_value $cmode + check_err $? "Failed to set $param_name to $new_value" + + expected_debugfs=$([ "$cmode" == "runtime" ] && echo "$new_debugfs" || echo "$default_debugfs") + check_value $param_name post-set $new_value $expected_debugfs $cmode + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device" + + expected_debugfs=$new_debugfs + check_value $param_name post-reload-new-value $new_value $expected_debugfs $cmode + + param_set_default $param_name $cmode + check_err $? "Failed to set $param_name to default" + + expected_debugfs=$([ "$cmode" == "runtime" ] && echo "$default_debugfs" || echo "$new_debugfs") + check_value $param_name post-set-default $expected_default $expected_debugfs $cmode + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device" + + expected_debugfs=$default_debugfs + check_value $param_name post-reload-default $expected_default $expected_debugfs $cmode +} + +params_default_test() +{ + RET=0 + + if ! devlink dev param help 2>&1 | grep -q "value VALUE | default"; then + echo "SKIP: devlink cli missing default feature" + return + fi + + # Remove side effects of previous tests. Use plain param_set, because + # param_set_default is a feature under test here. + param_set max_macs 32 driverinit + check_err $? "Failed to reset max_macs to default value" + param_set test1 true driverinit + check_err $? "Failed to reset test1 to default value" + param_set test2 1234 runtime + check_err $? "Failed to reset test2 to default value" + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device for clean state" + + test_default max_macs 16 32 driverinit + test_default test1 false true driverinit + test_default test2 100 1234 runtime + + log_test "params default test" +} + check_region_size() { local name=$1 diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh deleted file mode 100755 index c969559ffa7a..000000000000 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only - -source ethtool-common.sh - -function get_value { - local query="${SETTINGS_MAP[$1]}" - - echo $(ethtool -g $NSIM_NETDEV | \ - tail -n +$CURR_SETT_LINE | \ - awk -F':' -v pattern="$query:" '$0 ~ pattern {gsub(/[\t ]/, "", $2); print $2}') -} - -function update_current_settings { - for key in ${!SETTINGS_MAP[@]}; do - CURRENT_SETTINGS[$key]=$(get_value $key) - done - echo ${CURRENT_SETTINGS[@]} -} - -if ! ethtool -h | grep -q set-ring >/dev/null; then - echo "SKIP: No --set-ring support in ethtool" - exit 4 -fi - -NSIM_NETDEV=$(make_netdev) - -set -o pipefail - -declare -A SETTINGS_MAP=( - ["rx"]="RX" - ["rx-mini"]="RX Mini" - ["rx-jumbo"]="RX Jumbo" - ["tx"]="TX" -) - -declare -A EXPECTED_SETTINGS=( - ["rx"]="" - ["rx-mini"]="" - ["rx-jumbo"]="" - ["tx"]="" -) - -declare -A CURRENT_SETTINGS=( - ["rx"]="" - ["rx-mini"]="" - ["rx-jumbo"]="" - ["tx"]="" -) - -MAX_VALUE=$((RANDOM % $((2**32-1)))) -RING_MAX_LIST=$(ls $NSIM_DEV_DFS/ethtool/ring/) - -for ring_max_entry in $RING_MAX_LIST; do - echo $MAX_VALUE > $NSIM_DEV_DFS/ethtool/ring/$ring_max_entry -done - -CURR_SETT_LINE=$(ethtool -g $NSIM_NETDEV | grep -i -m1 -n 'Current hardware settings' | cut -f1 -d:) - -# populate the expected settings map -for key in ${!SETTINGS_MAP[@]}; do - EXPECTED_SETTINGS[$key]=$(get_value $key) -done - -# test -for key in ${!SETTINGS_MAP[@]}; do - value=$((RANDOM % $MAX_VALUE)) - - ethtool -G $NSIM_NETDEV "$key" "$value" - - EXPECTED_SETTINGS[$key]="$value" - expected=${EXPECTED_SETTINGS[@]} - current=$(update_current_settings) - - check $? "$current" "$expected" - set +x -done - -if [ $num_errors -eq 0 ]; then - echo "PASSED all $((num_passes)) checks" - exit 0 -else - echo "FAILED $num_errors/$((num_errors+num_passes)) checks" - exit 1 -fi diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 4ae7a785ff10..06559ef49b9a 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -109,6 +109,10 @@ def _check_data_outq(s, exp_len, force_wait=False): time.sleep(0.01) ksft_eq(outq, exp_len) + +def _get_stat(cfg, key): + return cfg.pspnl.get_stats({'dev-id': cfg.psp_dev_id})[key] + # # Test case boiler plate # @@ -171,11 +175,16 @@ def dev_rotate(cfg): """ Test key rotation """ _init_psp_dev(cfg) + prev_rotations = _get_stat(cfg, 'key-rotations') + rot = cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) ksft_eq(rot['id'], cfg.psp_dev_id) rot = cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) ksft_eq(rot['id'], cfg.psp_dev_id) + cur_rotations = _get_stat(cfg, 'key-rotations') + ksft_eq(cur_rotations, prev_rotations + 2) + def dev_rotate_spi(cfg): """ Test key rotation and SPI check """ @@ -475,6 +484,7 @@ def data_stale_key(cfg): """ Test send on a double-rotated key """ _init_psp_dev(cfg) + prev_stale = _get_stat(cfg, 'stale-events') s = _make_psp_conn(cfg) try: rx_assoc = cfg.pspnl.rx_assoc({"version": 0, @@ -495,6 +505,9 @@ def data_stale_key(cfg): cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) + cur_stale = _get_stat(cfg, 'stale-events') + ksft_gt(cur_stale, prev_stale) + s.send(b'0123456789' * 200) _check_data_outq(s, 2000, force_wait=True) finally: diff --git a/tools/testing/selftests/drivers/net/ring_reconfig.py b/tools/testing/selftests/drivers/net/ring_reconfig.py new file mode 100755 index 000000000000..f9530a8b0856 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ring_reconfig.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Test channel and ring size configuration via ethtool (-L / -G). +""" + +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_eq +from lib.py import NetDrvEpEnv, EthtoolFamily, GenerateTraffic +from lib.py import defer, NlError + + +def channels(cfg) -> None: + """ + Twiddle channel counts in various combinations of parameters. + We're only looking for driver adhering to the requested config + if the config is accepted and crashes. + """ + ehdr = {'header':{'dev-index': cfg.ifindex}} + chans = cfg.eth.channels_get(ehdr) + + all_keys = ["rx", "tx", "combined"] + mixes = [{"combined"}, {"rx", "tx"}, {"rx", "combined"}, {"tx", "combined"}, + {"rx", "tx", "combined"},] + + # Get the set of keys that device actually supports + restore = {} + supported = set() + for key in all_keys: + if key + "-max" in chans: + supported.add(key) + restore |= {key + "-count": chans[key + "-count"]} + + defer(cfg.eth.channels_set, ehdr | restore) + + def test_config(config): + try: + cfg.eth.channels_set(ehdr | config) + get = cfg.eth.channels_get(ehdr) + for k, v in config.items(): + ksft_eq(get.get(k, 0), v) + except NlError as e: + failed.append(mix) + ksft_pr("Can't set", config, e) + else: + ksft_pr("Okay", config) + + failed = [] + for mix in mixes: + if not mix.issubset(supported): + continue + + # Set all the values in the mix to 1, other supported to 0 + config = {} + for key in all_keys: + config[key + "-count"] = 1 if key in mix else 0 + test_config(config) + + for mix in mixes: + if not mix.issubset(supported): + continue + if mix in failed: + continue + + # Set all the values in the mix to max, other supported to 0 + config = {} + for key in all_keys: + config[key + "-count"] = chans[key + '-max'] if key in mix else 0 + test_config(config) + + +def _configure_min_ring_cnt(cfg) -> None: + """ Try to configure a single Rx/Tx ring. """ + ehdr = {'header':{'dev-index': cfg.ifindex}} + chans = cfg.eth.channels_get(ehdr) + + all_keys = ["rx-count", "tx-count", "combined-count"] + restore = {} + config = {} + for key in all_keys: + if key in chans: + restore[key] = chans[key] + config[key] = 0 + + if chans.get('combined-count', 0) > 1: + config['combined-count'] = 1 + elif chans.get('rx-count', 0) > 1 and chans.get('tx-count', 0) > 1: + config['tx-count'] = 1 + config['rx-count'] = 1 + else: + # looks like we're already on 1 channel + return + + cfg.eth.channels_set(ehdr | config) + defer(cfg.eth.channels_set, ehdr | restore) + + +def ringparam(cfg) -> None: + """ + Tweak the ringparam configuration. Try to run some traffic over min + ring size to make sure it actually functions. + """ + ehdr = {'header':{'dev-index': cfg.ifindex}} + rings = cfg.eth.rings_get(ehdr) + + restore = {} + maxes = {} + params = set() + for key in rings.keys(): + if 'max' in key: + param = key[:-4] + maxes[param] = rings[key] + params.add(param) + restore[param] = rings[param] + + defer(cfg.eth.rings_set, ehdr | restore) + + # Speed up the reconfig by configuring just one ring + _configure_min_ring_cnt(cfg) + + # Try to reach min on all settings + for param in params: + val = rings[param] + while True: + try: + cfg.eth.rings_set({'header':{'dev-index': cfg.ifindex}, + param: val // 2}) + if val == 0: + break + val //= 2 + except NlError: + break + + get = cfg.eth.rings_get(ehdr) + ksft_eq(get[param], val) + + ksft_pr(f"Reached min for '{param}' at {val} (max {rings[param]})") + + GenerateTraffic(cfg).wait_pkts_and_stop(10000) + + # Try max across all params, if the driver supports large rings + # this may OOM so we ignore errors + try: + ksft_pr("Applying max settings") + config = {p: maxes[p] for p in params} + cfg.eth.rings_set(ehdr | config) + except NlError as e: + ksft_pr("Can't set max params", config, e) + else: + GenerateTraffic(cfg).wait_pkts_and_stop(10000) + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEpEnv(__file__) as cfg: + cfg.eth = EthtoolFamily() + + ksft_run([channels, + ringparam], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 04d0a2a13e73..b08e4d48b15c 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -263,14 +263,15 @@ def procfs_downup_hammer(cfg) -> None: Reading stats via procfs only holds the RCU lock, drivers often try to sleep when reading the stats, or don't protect against races. """ - # Max out the queues, we'll flip between max and 1 + # Set a large number of queues, + # we'll flip between min(max_queues, 64) and 1 channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) if channels['combined-count'] == 0: rx_type = 'rx' else: rx_type = 'combined' cur_queue_cnt = channels[f'{rx_type}-count'] - max_queue_cnt = channels[f'{rx_type}-max'] + max_queue_cnt = min(channels[f'{rx_type}-max'], 64) cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") diff --git a/tools/testing/selftests/drivers/net/xdp.py b/tools/testing/selftests/drivers/net/xdp.py index a148004e1c36..e54df158dfe9 100755 --- a/tools/testing/selftests/drivers/net/xdp.py +++ b/tools/testing/selftests/drivers/net/xdp.py @@ -12,6 +12,7 @@ from dataclasses import dataclass from enum import Enum from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, ksft_ne, ksft_pr +from lib.py import KsftNamedVariant, ksft_variants from lib.py import KsftFailEx, NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import bkg, cmd, rand_port, wait_port_listen @@ -672,7 +673,18 @@ def test_xdp_native_adjst_head_shrnk_data(cfg): _validate_res(res, offset_lst, pkt_sz_lst) -def _test_xdp_native_ifc_stats(cfg, act): +@ksft_variants([ + KsftNamedVariant("pass", XDPAction.PASS), + KsftNamedVariant("drop", XDPAction.DROP), + KsftNamedVariant("tx", XDPAction.TX), +]) +def test_xdp_native_qstats(cfg, act): + """ + Send 1000 messages. Expect XDP action specified in @act. + Make sure the packets were counted to interface level qstats + (Rx, and Tx if act is TX). + """ + cfg.require_cmd("socat") bpf_info = BPFProgInfo("xdp_prog", "xdp_native.bpf.o", "xdp", 1500) @@ -687,9 +699,12 @@ def _test_xdp_native_ifc_stats(cfg, act): "/dev/null" # Listener runs on "remote" in case of XDP_TX rx_host = cfg.remote if act == XDPAction.TX else None - # We want to spew 2000 packets quickly, bash seems to do a good enough job - tx_udp = f"exec 5<>/dev/udp/{cfg.addr}/{port}; " \ - "for i in `seq 2000`; do echo a >&5; done; exec 5>&-" + # We want to spew 1000 packets quickly, bash seems to do a good enough job + # Each reopening of the socket gives us a differenot local port (for RSS) + tx_udp = "for _ in `seq 20`; do " \ + f"exec 5<>/dev/udp/{cfg.addr}/{port}; " \ + "for i in `seq 50`; do echo a >&5; done; " \ + "exec 5>&-; done" cfg.wait_hw_stats_settle() # Qstats have more clearly defined semantics than rtnetlink. @@ -704,11 +719,11 @@ def _test_xdp_native_ifc_stats(cfg, act): cfg.wait_hw_stats_settle() after = cfg.netnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] - ksft_ge(after['rx-packets'] - before['rx-packets'], 2000) + expected_pkts = 1000 + ksft_ge(after['rx-packets'] - before['rx-packets'], expected_pkts) if act == XDPAction.TX: - ksft_ge(after['tx-packets'] - before['tx-packets'], 2000) + ksft_ge(after['tx-packets'] - before['tx-packets'], expected_pkts) - expected_pkts = 2000 stats = _get_stats(prog_info["maps"]["map_xdp_stats"]) ksft_eq(stats[XDPStats.RX.value], expected_pkts, "XDP RX stats mismatch") if act == XDPAction.TX: @@ -730,30 +745,6 @@ def _test_xdp_native_ifc_stats(cfg, act): ksft_ge(after['tx-packets'], before['tx-packets']) -def test_xdp_native_qstats_pass(cfg): - """ - Send 2000 messages, expect XDP_PASS, make sure the packets were counted - to interface level qstats (Rx). - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.PASS) - - -def test_xdp_native_qstats_drop(cfg): - """ - Send 2000 messages, expect XDP_DROP, make sure the packets were counted - to interface level qstats (Rx). - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.DROP) - - -def test_xdp_native_qstats_tx(cfg): - """ - Send 2000 messages, expect XDP_TX, make sure the packets were counted - to interface level qstats (Rx and Tx) - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.TX) - - def main(): """ Main function to execute the XDP tests. @@ -778,9 +769,7 @@ def main(): test_xdp_native_adjst_tail_shrnk_data, test_xdp_native_adjst_head_grow_data, test_xdp_native_adjst_head_shrnk_data, - test_xdp_native_qstats_pass, - test_xdp_native_qstats_drop, - test_xdp_native_qstats_tx, + test_xdp_native_qstats, ], args=(cfg,)) ksft_exit() |
