diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2022-05-20 18:16:31 -0700 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2022-05-20 18:16:31 -0700 |
| commit | aa5334b1f96801cd09775217a72ff252ef614d7a (patch) | |
| tree | 464be1977af3e8290dd0f519c5c3849299b20c3a /tools | |
| parent | eac67d83bf2553f98cfddd42cfbcfd6f9ccfc287 (diff) | |
| parent | 538aaf9b2383701094a47797b4554c6a21c83eed (diff) | |
Merge branch 'add-a-bhash2-table-hashed-by-port-address'
Joanne Koong says:
====================
Add a bhash2 table hashed by port + address
This patchset proposes adding a bhash2 table that hashes by port and address.
The motivation behind bhash2 is to expedite bind requests in situations where
the port has many sockets in its bhash table entry, which makes checking bind
conflicts costly especially given that we acquire the table entry spinlock
while doing so, which can cause softirq cpu lockups and can prevent new tcp
connections.
We ran into this problem at Meta where the traffic team binds a large number
of IPs to port 443 and the bind() call took a significant amount of time
which led to cpu softirq lockups, which caused packet drops and other failures
on the machine
The patches are as follows:
1/2 - Adds a second bhash table (bhash2) hashed by port and address
2/2 - Adds a test for timing how long an additional bind request takes when
the bhash entry is populated
When experimentally testing this on a local server for ~24k sockets bound to
the port, the results seen were:
ipv4:
before - 0.002317 seconds
with bhash2 - 0.000018 seconds
ipv6:
before - 0.002431 seconds
with bhash2 - 0.000021 seconds
====================
Link: https://lore.kernel.org/r/20220520001834.2247810-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'tools')
| -rw-r--r-- | tools/testing/selftests/net/.gitignore | 1 | ||||
| -rw-r--r-- | tools/testing/selftests/net/Makefile | 2 | ||||
| -rw-r--r-- | tools/testing/selftests/net/bind_bhash_test.c | 119 |
3 files changed, 122 insertions, 0 deletions
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 21a411b04890..735423136bc4 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -36,3 +36,4 @@ gro ioam6_parser toeplitz cmsg_sender +bind_bhash_test diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7ea54af55490..464df13831f2 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -59,6 +59,7 @@ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh +TEST_GEN_FILES += bind_bhash_test TEST_FILES := settings @@ -69,4 +70,5 @@ include bpf/Makefile $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread +$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/bind_bhash_test.c b/tools/testing/selftests/net/bind_bhash_test.c new file mode 100644 index 000000000000..252e73754e76 --- /dev/null +++ b/tools/testing/selftests/net/bind_bhash_test.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This times how long it takes to bind to a port when the port already + * has multiple sockets in its bhash table. + * + * In the setup(), we populate the port's bhash table with + * MAX_THREADS * MAX_CONNECTIONS number of entries. + */ + +#include <unistd.h> +#include <stdio.h> +#include <netdb.h> +#include <pthread.h> + +#define MAX_THREADS 600 +#define MAX_CONNECTIONS 40 + +static const char *bind_addr = "::1"; +static const char *port; + +static int fd_array[MAX_THREADS][MAX_CONNECTIONS]; + +static int bind_socket(int opt, const char *addr) +{ + struct addrinfo *res, hint = {}; + int sock_fd, reuse = 1, err; + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (sock_fd < 0) { + perror("socket fd err"); + return -1; + } + + hint.ai_family = AF_INET6; + hint.ai_socktype = SOCK_STREAM; + + err = getaddrinfo(addr, port, &hint, &res); + if (err) { + perror("getaddrinfo failed"); + return -1; + } + + if (opt) { + err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse)); + if (err) { + perror("setsockopt failed"); + return -1; + } + } + + err = bind(sock_fd, res->ai_addr, res->ai_addrlen); + if (err) { + perror("failed to bind to port"); + return -1; + } + + return sock_fd; +} + +static void *setup(void *arg) +{ + int sock_fd, i; + int *array = (int *)arg; + + for (i = 0; i < MAX_CONNECTIONS; i++) { + sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); + if (sock_fd < 0) + return NULL; + array[i] = sock_fd; + } + + return NULL; +} + +int main(int argc, const char *argv[]) +{ + int listener_fd, sock_fd, i, j; + pthread_t tid[MAX_THREADS]; + clock_t begin, end; + + if (argc != 2) { + printf("Usage: listener <port>\n"); + return -1; + } + + port = argv[1]; + + listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); + if (listen(listener_fd, 100) < 0) { + perror("listen failed"); + return -1; + } + + /* Set up threads to populate the bhash table entry for the port */ + for (i = 0; i < MAX_THREADS; i++) + pthread_create(&tid[i], NULL, setup, fd_array[i]); + + for (i = 0; i < MAX_THREADS; i++) + pthread_join(tid[i], NULL); + + begin = clock(); + + /* Bind to the same port on a different address */ + sock_fd = bind_socket(0, "2001:0db8:0:f101::1"); + + end = clock(); + + printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC); + + /* clean up */ + close(sock_fd); + close(listener_fd); + for (i = 0; i < MAX_THREADS; i++) { + for (j = 0; i < MAX_THREADS; i++) + close(fd_array[i][j]); + } + + return 0; +} |
