summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2022-08-24 19:30:19 -0700
committerJakub Kicinski <kuba@kernel.org>2022-08-24 19:30:19 -0700
commitc21e1bf4d8104fb77bbd99f3d1276c0d7c9b28d9 (patch)
tree73ff51aa3bc284c611e346b14f4ebe8b90f251b8 /tools
parent0bf73255d3a3cf3b0416e95f2c9f7c53095c2e1a (diff)
parent1be9ac87a75a4fc0e2cc254e412d2d67a58a7191 (diff)
Merge branch 'add-a-second-bind-table-hashed-by-port-and-address'
Joanne Koong says: ==================== Add a second bind table hashed by port and address Currently, there is one bind hashtable (bhash) that hashes by port only. This patchset adds a second bind table (bhash2) that hashes by port and address. The motivation for adding bhash2 is to expedite bind requests in situations where the port has many sockets in its bhash table entry (eg a large number of sockets bound to different addresses on the same port), which makes checking bind conflicts costly especially given that we acquire the table entry spinlock while doing so, which can cause softirq cpu lockups and can prevent new tcp connections. We ran into this problem at Meta where the traffic team binds a large number of IPs to port 443 and the bind() call took a significant amount of time which led to cpu softirq lockups, which caused packet drops and other failures on the machine. When experimentally testing this on a local server for ~24k sockets bound to the port, the results seen were: ipv4: before - 0.002317 seconds with bhash2 - 0.000020 seconds ipv6: before - 0.002431 seconds with bhash2 - 0.000021 seconds The additions to the initial bhash2 submission [0] are: * Updating bhash2 in the cases where a socket's rcv saddr changes after it has * been bound * Adding locks for bhash2 hashbuckets [0] https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/ v3: https://lore.kernel.org/netdev/20220722195406.1304948-2-joannelkoong@gmail.com/ v2: https://lore.kernel.org/netdev/20220712235310.1935121-1-joannelkoong@gmail.com/ v1: https://lore.kernel.org/netdev/20220623234242.2083895-2-joannelkoong@gmail.com/ ==================== Link: https://lore.kernel.org/r/20220822181023.3979645-1-joannelkoong@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/selftests/net/.gitignore5
-rw-r--r--tools/testing/selftests/net/Makefile5
-rw-r--r--tools/testing/selftests/net/bind_bhash.c144
-rwxr-xr-xtools/testing/selftests/net/bind_bhash.sh66
-rw-r--r--tools/testing/selftests/net/sk_bind_sendto_listen.c80
-rw-r--r--tools/testing/selftests/net/sk_connect_zero_addr.c62
6 files changed, 361 insertions, 1 deletions
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 0e5751af6247..bec5cf96984c 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -39,4 +39,7 @@ toeplitz
tun
cmsg_sender
unix_connect
-tap \ No newline at end of file
+tap
+bind_bhash
+sk_bind_sendto_listen
+sk_connect_zero_addr
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 11a288b67e2f..e6a951ba5ba0 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -43,6 +43,7 @@ TEST_PROGS += ndisc_unsolicited_na_test.sh
TEST_PROGS += arp_ndisc_untracked_subnets.sh
TEST_PROGS += stress_reuseport_listen.sh
TEST_PROGS := l2_tos_ttl_inherit.sh
+TEST_PROGS += bind_bhash.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest
@@ -64,6 +65,9 @@ TEST_GEN_FILES += cmsg_sender
TEST_GEN_FILES += stress_reuseport_listen
TEST_PROGS += test_vxlan_vnifiltering.sh
TEST_GEN_FILES += io_uring_zerocopy_tx
+TEST_GEN_FILES += bind_bhash
+TEST_GEN_PROGS += sk_bind_sendto_listen
+TEST_GEN_PROGS += sk_connect_zero_addr
TEST_FILES := settings
@@ -74,3 +78,4 @@ include bpf/Makefile
$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
$(OUTPUT)/tcp_inq: LDLIBS += -lpthread
+$(OUTPUT)/bind_bhash: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c
new file mode 100644
index 000000000000..57ff67a3751e
--- /dev/null
+++ b/tools/testing/selftests/net/bind_bhash.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This times how long it takes to bind to a port when the port already
+ * has multiple sockets in its bhash table.
+ *
+ * In the setup(), we populate the port's bhash table with
+ * MAX_THREADS * MAX_CONNECTIONS number of entries.
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <netdb.h>
+#include <pthread.h>
+#include <string.h>
+#include <stdbool.h>
+
+#define MAX_THREADS 600
+#define MAX_CONNECTIONS 40
+
+static const char *setup_addr_v6 = "::1";
+static const char *setup_addr_v4 = "127.0.0.1";
+static const char *setup_addr;
+static const char *bind_addr;
+static const char *port;
+bool use_v6;
+int ret;
+
+static int fd_array[MAX_THREADS][MAX_CONNECTIONS];
+
+static int bind_socket(int opt, const char *addr)
+{
+ struct addrinfo *res, hint = {};
+ int sock_fd, reuse = 1, err;
+ int domain = use_v6 ? AF_INET6 : AF_INET;
+
+ sock_fd = socket(domain, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ perror("socket fd err");
+ return sock_fd;
+ }
+
+ hint.ai_family = domain;
+ hint.ai_socktype = SOCK_STREAM;
+
+ err = getaddrinfo(addr, port, &hint, &res);
+ if (err) {
+ perror("getaddrinfo failed");
+ goto cleanup;
+ }
+
+ if (opt) {
+ err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse));
+ if (err) {
+ perror("setsockopt failed");
+ goto cleanup;
+ }
+ }
+
+ err = bind(sock_fd, res->ai_addr, res->ai_addrlen);
+ if (err) {
+ perror("failed to bind to port");
+ goto cleanup;
+ }
+
+ return sock_fd;
+
+cleanup:
+ close(sock_fd);
+ return err;
+}
+
+static void *setup(void *arg)
+{
+ int sock_fd, i;
+ int *array = (int *)arg;
+
+ for (i = 0; i < MAX_CONNECTIONS; i++) {
+ sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr);
+ if (sock_fd < 0) {
+ ret = sock_fd;
+ pthread_exit(&ret);
+ }
+ array[i] = sock_fd;
+ }
+
+ return NULL;
+}
+
+int main(int argc, const char *argv[])
+{
+ int listener_fd, sock_fd, i, j;
+ pthread_t tid[MAX_THREADS];
+ clock_t begin, end;
+
+ if (argc != 4) {
+ printf("Usage: listener <port> <ipv6 | ipv4> <bind-addr>\n");
+ return -1;
+ }
+
+ port = argv[1];
+ use_v6 = strcmp(argv[2], "ipv6") == 0;
+ bind_addr = argv[3];
+
+ setup_addr = use_v6 ? setup_addr_v6 : setup_addr_v4;
+
+ listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr);
+ if (listen(listener_fd, 100) < 0) {
+ perror("listen failed");
+ return -1;
+ }
+
+ /* Set up threads to populate the bhash table entry for the port */
+ for (i = 0; i < MAX_THREADS; i++)
+ pthread_create(&tid[i], NULL, setup, fd_array[i]);
+
+ for (i = 0; i < MAX_THREADS; i++)
+ pthread_join(tid[i], NULL);
+
+ if (ret)
+ goto done;
+
+ begin = clock();
+
+ /* Bind to the same port on a different address */
+ sock_fd = bind_socket(0, bind_addr);
+ if (sock_fd < 0)
+ goto done;
+
+ end = clock();
+
+ printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
+
+ /* clean up */
+ close(sock_fd);
+
+done:
+ close(listener_fd);
+ for (i = 0; i < MAX_THREADS; i++) {
+ for (j = 0; i < MAX_THREADS; i++)
+ close(fd_array[i][j]);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/bind_bhash.sh b/tools/testing/selftests/net/bind_bhash.sh
new file mode 100755
index 000000000000..ca0292d4b441
--- /dev/null
+++ b/tools/testing/selftests/net/bind_bhash.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NR_FILES=32768
+SAVED_NR_FILES=$(ulimit -n)
+
+# default values
+port=443
+addr_v6="2001:0db8:0:f101::1"
+addr_v4="10.8.8.8"
+use_v6=true
+addr=""
+
+usage() {
+ echo "Usage: $0 [-6 | -4] [-p port] [-a address]"
+ echo -e "\t6: use ipv6"
+ echo -e "\t4: use ipv4"
+ echo -e "\tport: Port number"
+ echo -e "\taddress: ip address"
+}
+
+while getopts "ha:p:64" opt; do
+ case ${opt} in
+ h)
+ usage $0
+ exit 0
+ ;;
+ a) addr=$OPTARG;;
+ p)
+ port=$OPTARG;;
+ 6)
+ use_v6=true;;
+ 4)
+ use_v6=false;;
+ esac
+done
+
+setup() {
+ if [[ "$use_v6" == true ]]; then
+ ip addr add $addr_v6 nodad dev eth0
+ else
+ ip addr add $addr_v4 dev lo
+ fi
+ ulimit -n $NR_FILES
+}
+
+cleanup() {
+ if [[ "$use_v6" == true ]]; then
+ ip addr del $addr_v6 dev eth0
+ else
+ ip addr del $addr_v4/32 dev lo
+ fi
+ ulimit -n $SAVED_NR_FILES
+}
+
+if [[ "$addr" != "" ]]; then
+ addr_v4=$addr;
+ addr_v6=$addr;
+fi
+setup
+if [[ "$use_v6" == true ]] ; then
+ ./bind_bhash $port "ipv6" $addr_v6
+else
+ ./bind_bhash $port "ipv4" $addr_v4
+fi
+cleanup
diff --git a/tools/testing/selftests/net/sk_bind_sendto_listen.c b/tools/testing/selftests/net/sk_bind_sendto_listen.c
new file mode 100644
index 000000000000..b420d830f72c
--- /dev/null
+++ b/tools/testing/selftests/net/sk_bind_sendto_listen.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <unistd.h>
+
+int main(void)
+{
+ int fd1, fd2, one = 1;
+ struct sockaddr_in6 bind_addr = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(20000),
+ .sin6_flowinfo = htonl(0),
+ .sin6_addr = {},
+ .sin6_scope_id = 0,
+ };
+
+ inet_pton(AF_INET6, "::", &bind_addr.sin6_addr);
+
+ fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
+ if (fd1 < 0) {
+ error(1, errno, "socket fd1");
+ return -1;
+ }
+
+ if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) {
+ error(1, errno, "setsockopt(SO_REUSEADDR) fd1");
+ goto out_err1;
+ }
+
+ if (bind(fd1, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
+ error(1, errno, "bind fd1");
+ goto out_err1;
+ }
+
+ if (sendto(fd1, NULL, 0, MSG_FASTOPEN, (struct sockaddr *)&bind_addr,
+ sizeof(bind_addr))) {
+ error(1, errno, "sendto fd1");
+ goto out_err1;
+ }
+
+ fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
+ if (fd2 < 0) {
+ error(1, errno, "socket fd2");
+ goto out_err1;
+ }
+
+ if (setsockopt(fd2, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) {
+ error(1, errno, "setsockopt(SO_REUSEADDR) fd2");
+ goto out_err2;
+ }
+
+ if (bind(fd2, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
+ error(1, errno, "bind fd2");
+ goto out_err2;
+ }
+
+ if (sendto(fd2, NULL, 0, MSG_FASTOPEN, (struct sockaddr *)&bind_addr,
+ sizeof(bind_addr)) != -1) {
+ error(1, errno, "sendto fd2");
+ goto out_err2;
+ }
+
+ if (listen(fd2, 0)) {
+ error(1, errno, "listen");
+ goto out_err2;
+ }
+
+ close(fd2);
+ close(fd1);
+ return 0;
+
+out_err2:
+ close(fd2);
+
+out_err1:
+ close(fd1);
+ return -1;
+}
diff --git a/tools/testing/selftests/net/sk_connect_zero_addr.c b/tools/testing/selftests/net/sk_connect_zero_addr.c
new file mode 100644
index 000000000000..4be418aefd9f
--- /dev/null
+++ b/tools/testing/selftests/net/sk_connect_zero_addr.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <unistd.h>
+
+int main(void)
+{
+ int fd1, fd2, one = 1;
+ struct sockaddr_in6 bind_addr = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(20000),
+ .sin6_flowinfo = htonl(0),
+ .sin6_addr = {},
+ .sin6_scope_id = 0,
+ };
+
+ inet_pton(AF_INET6, "::", &bind_addr.sin6_addr);
+
+ fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
+ if (fd1 < 0) {
+ error(1, errno, "socket fd1");
+ return -1;
+ }
+
+ if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) {
+ error(1, errno, "setsockopt(SO_REUSEADDR) fd1");
+ goto out_err1;
+ }
+
+ if (bind(fd1, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
+ error(1, errno, "bind fd1");
+ goto out_err1;
+ }
+
+ if (listen(fd1, 0)) {
+ error(1, errno, "listen");
+ goto out_err1;
+ }
+
+ fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
+ if (fd2 < 0) {
+ error(1, errno, "socket fd2");
+ goto out_err1;
+ }
+
+ if (connect(fd2, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
+ error(1, errno, "bind fd2");
+ goto out_err2;
+ }
+
+ close(fd2);
+ close(fd1);
+ return 0;
+
+out_err2:
+ close(fd2);
+out_err1:
+ close(fd1);
+ return -1;
+}