From 5d4cc87414c5d11345c4b11d61377d351b5c28a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Feb 2024 16:20:06 +0000 Subject: net: reorganize "struct sock" fields Last major reorg happened in commit 9115e8cd2a0c ("net: reorganize struct sock for better data locality") Since then, many changes have been done. Before SO_PEEK_OFF support is added to TCP, we need to move sk_peek_off to a better location. It is time to make another pass, and add six groups, without explicit alignment. - sock_write_rx (following sk_refcnt) read-write fields in rx path. - sock_read_rx read-mostly fields in rx path. - sock_read_rxtx read-mostly fields in both rx and tx paths. - sock_write_rxtx read-write fields in both rx and tx paths. - sock_write_tx read-write fields in tx paths. - sock_read_tx read-mostly fields in tx paths. Results on TCP_RR benchmarks seem to show a gain (4 to 5 %). It is possible UDP needs a change, because sk_peek_off shares a cache line with sk_receive_queue. If this the case, we can exchange roles of sk->sk_receive and up->reader_queue queues. After this change, we have the following layout: struct sock { struct sock_common __sk_common; /* 0 0x88 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __u8 __cacheline_group_begin__sock_write_rx[0]; /* 0x88 0 */ atomic_t sk_drops; /* 0x88 0x4 */ __s32 sk_peek_off; /* 0x8c 0x4 */ struct sk_buff_head sk_error_queue; /* 0x90 0x18 */ struct sk_buff_head sk_receive_queue; /* 0xa8 0x18 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct { atomic_t rmem_alloc; /* 0xc0 0x4 */ int len; /* 0xc4 0x4 */ struct sk_buff * head; /* 0xc8 0x8 */ struct sk_buff * tail; /* 0xd0 0x8 */ } sk_backlog; /* 0xc0 0x18 */ struct { atomic_t rmem_alloc; /* 0 0x4 */ int len; /* 0x4 0x4 */ struct sk_buff * head; /* 0x8 0x8 */ struct sk_buff * tail; /* 0x10 0x8 */ /* size: 24, cachelines: 1, members: 4 */ /* last cacheline: 24 bytes */ }; __u8 __cacheline_group_end__sock_write_rx[0]; /* 0xd8 0 */ __u8 __cacheline_group_begin__sock_read_rx[0]; /* 0xd8 0 */ rcu * sk_rx_dst; /* 0xd8 0x8 */ int sk_rx_dst_ifindex; /* 0xe0 0x4 */ u32 sk_rx_dst_cookie; /* 0xe4 0x4 */ unsigned int sk_ll_usec; /* 0xe8 0x4 */ unsigned int sk_napi_id; /* 0xec 0x4 */ u16 sk_busy_poll_budget; /* 0xf0 0x2 */ u8 sk_prefer_busy_poll; /* 0xf2 0x1 */ u8 sk_userlocks; /* 0xf3 0x1 */ int sk_rcvbuf; /* 0xf4 0x4 */ rcu * sk_filter; /* 0xf8 0x8 */ /* --- cacheline 4 boundary (256 bytes) --- */ union { rcu * sk_wq; /* 0x100 0x8 */ struct socket_wq * sk_wq_raw; /* 0x100 0x8 */ }; /* 0x100 0x8 */ union { rcu * sk_wq; /* 0 0x8 */ struct socket_wq * sk_wq_raw; /* 0 0x8 */ }; void (*sk_data_ready)(struct sock *); /* 0x108 0x8 */ long sk_rcvtimeo; /* 0x110 0x8 */ int sk_rcvlowat; /* 0x118 0x4 */ __u8 __cacheline_group_end__sock_read_rx[0]; /* 0x11c 0 */ __u8 __cacheline_group_begin__sock_read_rxtx[0]; /* 0x11c 0 */ int sk_err; /* 0x11c 0x4 */ struct socket * sk_socket; /* 0x120 0x8 */ struct mem_cgroup * sk_memcg; /* 0x128 0x8 */ rcu * sk_policy[2]; /* 0x130 0x10 */ /* --- cacheline 5 boundary (320 bytes) --- */ __u8 __cacheline_group_end__sock_read_rxtx[0]; /* 0x140 0 */ __u8 __cacheline_group_begin__sock_write_rxtx[0]; /* 0x140 0 */ socket_lock_t sk_lock; /* 0x140 0x20 */ u32 sk_reserved_mem; /* 0x160 0x4 */ int sk_forward_alloc; /* 0x164 0x4 */ u32 sk_tsflags; /* 0x168 0x4 */ __u8 __cacheline_group_end__sock_write_rxtx[0]; /* 0x16c 0 */ __u8 __cacheline_group_begin__sock_write_tx[0]; /* 0x16c 0 */ int sk_write_pending; /* 0x16c 0x4 */ atomic_t sk_omem_alloc; /* 0x170 0x4 */ int sk_sndbuf; /* 0x174 0x4 */ int sk_wmem_queued; /* 0x178 0x4 */ refcount_t sk_wmem_alloc; /* 0x17c 0x4 */ /* --- cacheline 6 boundary (384 bytes) --- */ unsigned long sk_tsq_flags; /* 0x180 0x8 */ union { struct sk_buff * sk_send_head; /* 0x188 0x8 */ struct rb_root tcp_rtx_queue; /* 0x188 0x8 */ }; /* 0x188 0x8 */ union { struct sk_buff * sk_send_head; /* 0 0x8 */ struct rb_root tcp_rtx_queue; /* 0 0x8 */ }; struct sk_buff_head sk_write_queue; /* 0x190 0x18 */ u32 sk_dst_pending_confirm; /* 0x1a8 0x4 */ u32 sk_pacing_status; /* 0x1ac 0x4 */ struct page_frag sk_frag; /* 0x1b0 0x10 */ /* --- cacheline 7 boundary (448 bytes) --- */ struct timer_list sk_timer; /* 0x1c0 0x28 */ /* XXX last struct has 4 bytes of padding */ unsigned long sk_pacing_rate; /* 0x1e8 0x8 */ atomic_t sk_zckey; /* 0x1f0 0x4 */ atomic_t sk_tskey; /* 0x1f4 0x4 */ __u8 __cacheline_group_end__sock_write_tx[0]; /* 0x1f8 0 */ __u8 __cacheline_group_begin__sock_read_tx[0]; /* 0x1f8 0 */ unsigned long sk_max_pacing_rate; /* 0x1f8 0x8 */ /* --- cacheline 8 boundary (512 bytes) --- */ long sk_sndtimeo; /* 0x200 0x8 */ u32 sk_priority; /* 0x208 0x4 */ u32 sk_mark; /* 0x20c 0x4 */ rcu * sk_dst_cache; /* 0x210 0x8 */ netdev_features_t sk_route_caps; /* 0x218 0x8 */ u16 sk_gso_type; /* 0x220 0x2 */ u16 sk_gso_max_segs; /* 0x222 0x2 */ unsigned int sk_gso_max_size; /* 0x224 0x4 */ gfp_t sk_allocation; /* 0x228 0x4 */ u32 sk_txhash; /* 0x22c 0x4 */ u8 sk_pacing_shift; /* 0x230 0x1 */ bool sk_use_task_frag; /* 0x231 0x1 */ __u8 __cacheline_group_end__sock_read_tx[0]; /* 0x232 0 */ u8 sk_gso_disabled:1; /* 0x232: 0 0x1 */ u8 sk_kern_sock:1; /* 0x232:0x1 0x1 */ u8 sk_no_check_tx:1; /* 0x232:0x2 0x1 */ u8 sk_no_check_rx:1; /* 0x232:0x3 0x1 */ /* XXX 4 bits hole, try to pack */ u8 sk_shutdown; /* 0x233 0x1 */ u16 sk_type; /* 0x234 0x2 */ u16 sk_protocol; /* 0x236 0x2 */ unsigned long sk_lingertime; /* 0x238 0x8 */ /* --- cacheline 9 boundary (576 bytes) --- */ struct proto * sk_prot_creator; /* 0x240 0x8 */ rwlock_t sk_callback_lock; /* 0x248 0x8 */ int sk_err_soft; /* 0x250 0x4 */ u32 sk_ack_backlog; /* 0x254 0x4 */ u32 sk_max_ack_backlog; /* 0x258 0x4 */ kuid_t sk_uid; /* 0x25c 0x4 */ spinlock_t sk_peer_lock; /* 0x260 0x4 */ int sk_bind_phc; /* 0x264 0x4 */ struct pid * sk_peer_pid; /* 0x268 0x8 */ const struct cred * sk_peer_cred; /* 0x270 0x8 */ ktime_t sk_stamp; /* 0x278 0x8 */ /* --- cacheline 10 boundary (640 bytes) --- */ int sk_disconnects; /* 0x280 0x4 */ u8 sk_txrehash; /* 0x284 0x1 */ u8 sk_clockid; /* 0x285 0x1 */ u8 sk_txtime_deadline_mode:1; /* 0x286: 0 0x1 */ u8 sk_txtime_report_errors:1; /* 0x286:0x1 0x1 */ u8 sk_txtime_unused:6; /* 0x286:0x2 0x1 */ /* XXX 1 byte hole, try to pack */ void * sk_user_data; /* 0x288 0x8 */ void * sk_security; /* 0x290 0x8 */ struct sock_cgroup_data sk_cgrp_data; /* 0x298 0x8 */ void (*sk_state_change)(struct sock *); /* 0x2a0 0x8 */ void (*sk_write_space)(struct sock *); /* 0x2a8 0x8 */ void (*sk_error_report)(struct sock *); /* 0x2b0 0x8 */ int (*sk_backlog_rcv)(struct sock *, struct sk_buff *); /* 0x2b8 0x8 */ /* --- cacheline 11 boundary (704 bytes) --- */ void (*sk_destruct)(struct sock *); /* 0x2c0 0x8 */ rcu * sk_reuseport_cb; /* 0x2c8 0x8 */ rcu * sk_bpf_storage; /* 0x2d0 0x8 */ struct callback_head sk_rcu __attribute__((__aligned__(8))); /* 0x2d8 0x10 */ netns_tracker ns_tracker; /* 0x2e8 0x8 */ /* size: 752, cachelines: 12, members: 105 */ /* sum members: 749, holes: 1, sum holes: 1 */ /* sum bitfield members: 12 bits, bit holes: 1, sum bit holes: 4 bits */ /* paddings: 1, sum paddings: 4 */ /* forced alignments: 1 */ /* last cacheline: 48 bytes */ }; Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240216162006.2342759-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/sock.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 88bf810394a5..3fef3407383e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4234,3 +4234,65 @@ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); + +static int __init sock_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); + return 0; +} + +core_initcall(sock_struct_check); -- cgit