From 9b42c1f179a614e11893ae4619f0304a38f481ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 12:44:26 +0200 Subject: xfrm: Extend the output_mark to support input direction and masking. We already support setting an output mark at the xfrm_state, unfortunately this does not support the input direction and masking the marks that will be applied to the skb. This change adds support applying a masked value in both directions. The existing XFRMA_OUTPUT_MARK number is reused for this purpose and as it is now bi-directional, it is renamed to XFRMA_SET_MARK. An additional XFRMA_SET_MARK_MASK attribute is added for setting the mask. If the attribute mask not provided, it is set to 0xffffffff, keeping the XFRMA_OUTPUT_MARK existing 'full mask' semantics. Co-developed-by: Tobias Brunner Co-developed-by: Eyal Birger Co-developed-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Signed-off-by: Tobias Brunner Signed-off-by: Eyal Birger Signed-off-by: Lorenzo Colitti --- include/net/xfrm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 557122846e0e..3dc83ba26f62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -166,7 +166,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; - u32 output_mark; + struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; @@ -2012,6 +2012,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) +{ + struct xfrm_mark *m = &x->props.smark; + + return (m->v & m->m) | (mark & ~m->m); +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { -- cgit From d159ce7957eec306eacda672e5909e26675ca8ef Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:06:57 +0200 Subject: flow: Extend flow informations with xfrm interface id. Add a new flowi_xfrm structure with informations needed to do a xfrm lookup. At the moment it keeps the informations about the new xfrm interface id needed to lookup xfrm interfaces that are introduced with a followup patch. We need this new lookup key as other possible keys, like the ifindex is already part of the xfrm selector and used as a key to enforce the output device after the transformation in the policy/state lookup. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/flow.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 8ce21793094e..187c9bef672f 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,6 +26,10 @@ struct flowi_tunnel { __be64 tun_id; }; +struct flowi_xfrm { + __u32 if_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -39,6 +43,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -78,6 +83,7 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid +#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -109,6 +115,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -138,6 +145,7 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid +#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -185,6 +193,7 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid +#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) -- cgit From 7e6526404adedf079279aa7aa11722deaca8fe2e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:07 +0200 Subject: xfrm: Add a new lookup key to match xfrm interfaces. This patch adds the xfrm interface id as a lookup key for xfrm states and policies. With this we can assign states and policies to virtual xfrm interfaces. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3dc83ba26f62..e8bada4d2a45 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,6 +147,7 @@ struct xfrm_state { struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; + u32 if_id; u32 tfcpad; u32 genid; @@ -574,6 +575,7 @@ struct xfrm_policy { atomic_t genid; u32 priority; u32 index; + u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; @@ -1533,7 +1535,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); -struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, +struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, @@ -1690,20 +1692,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, - u32 id, int delete, int *err); +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8, + int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u8 proto, + u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -2019,6 +2021,15 @@ static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) return (m->v & m->m) | (mark & ~m->m); } +static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) +{ + int ret = 0; + + if (if_id) + ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); + return ret; +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { -- cgit From f203b76d78092faf248db3f851840fbecf80b40e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:12 +0200 Subject: xfrm: Add virtual xfrm interfaces This patch adds support for virtual xfrm interfaces. Packets that are routed through such an interface are guaranteed to be IPsec transformed or dropped. It is a generic virtual interface that ensures IPsec transformation, no need to know what happens behind the interface. This means that we can tunnel IPv4 and IPv6 through the same interface and support all xfrm modes (tunnel, transport and beet) on it. Co-developed-by: Lorenzo Colitti Co-developed-by: Benedict Wong Signed-off-by: Lorenzo Colitti Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e8bada4d2a45..3fa578a6a819 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -293,6 +294,13 @@ struct xfrm_replay { int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; +struct xfrm_if_cb { + struct xfrm_if *(*decode_session)(struct sk_buff *skb); +}; + +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); +void xfrm_if_unregister_cb(void); + struct net_device; struct xfrm_type; struct xfrm_dst; @@ -1039,6 +1047,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); +struct xfrm_if_parms { + char name[IFNAMSIZ]; /* name of XFRM device */ + int link; /* ifindex of underlying L2 interface */ + u32 if_id; /* interface identifyer */ +}; + +struct xfrm_if { + struct xfrm_if __rcu *next; /* next interface in list */ + struct net_device *dev; /* virtual device associated with interface */ + struct net_device *phydev; /* physical device */ + struct net *net; /* netns for packet i/o */ + struct xfrm_if_parms p; /* interface parms */ + + struct gro_cells gro_cells; +}; + struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { -- cgit From e4db5b61c572475bbbcf63e3c8a2606bfccf2c9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:26:02 +0200 Subject: xfrm: policy: remove pcpu policy cache Kristian Evensen says: In a project I am involved in, we are running ipsec (Strongswan) on different mt7621-based routers. Each router is configured as an initiator and has around ~30 tunnels to different responders (running on misc. devices). Before the flow cache was removed (kernel 4.9), we got a combined throughput of around 70Mbit/s for all tunnels on one router. However, we recently switched to kernel 4.14 (4.14.48), and the total throughput is somewhere around 57Mbit/s (best-case). I.e., a drop of around 20%. Reverting the flow cache removal restores, as expected, performance levels to that of kernel 4.9. When pcpu xdst exists, it has to be validated first before it can be used. A negative hit thus increases cost vs. no-cache. As number of tunnels increases, hit rate decreases so this pcpu caching isn't a viable strategy. Furthermore, the xdst cache also needs to run with BH off, so when removing this the bh disable/enable pairs can be removed too. Kristian tested a 4.14.y backport of this change and reported increased performance: In our tests, the throughput reduction has been reduced from around -20% to -5%. We also see that the overall throughput is independent of the number of tunnels, while before the throughput was reduced as the number of tunnels increased. Reported-by: Kristian Evensen Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3fa578a6a819..a5378613a49c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -332,7 +332,6 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); -void xfrm_policy_cache_flush(void); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; -- cgit From a9744f7ca200c756e6f8c65b633770a2da711651 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:20 +0200 Subject: xsk: fix potential race in SKB TX completion code There is a potential race in the TX completion code for the SKB case. One process enters the sendmsg code of an AF_XDP socket in order to send a frame. The execution eventually trickles down to the driver that is told to send the packet. However, it decides to drop the packet due to some error condition (e.g., rings full) and frees the SKB. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. At the same time a TX interrupt has fired on another core and it dispatches the TX completion code in the driver. It does its HW specific things and ends up freeing the SKB associated with the transmitted packet. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. With a pseudo call stack, it would look like this: Core 1: sendmsg() being called in the application netdev_start_xmit() Driver entered through ndo_start_xmit Driver decides to free the SKB for some reason (e.g., rings full) Destructor of SKB called xskq_produce_addr() is called to signal completion to user space Core 2: TX completion irq NAPI loop Driver irq handler for TX completions Frees the SKB Destructor of SKB called xskq_produce_addr() is called to signal completion to user space We now have a violation of the single-producer/single-consumer principle for our queues as there are two threads trying to produce at the same time on the same queue. Fixed by introducing a spin_lock in the destructor. In regards to the performance, I get around 1.74 Mpps for txonly before and after the introduction of the spinlock. There is of course some impact due to the spin lock but it is in the less significant digits that are too noisy for me to measure. But let us say that the version without the spin lock got 1.745 Mpps in the best case and the version with 1.735 Mpps in the worst case, then that would mean a maximum drop in performance of 0.5%. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9fe472f2ac95..7161856bcf9c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -60,6 +60,10 @@ struct xdp_sock { bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + /* Mutual exclusion of NAPI TX thread and sendmsg error paths + * in the SKB destructor callback. + */ + spinlock_t tx_completion_lock; u64 rx_dropped; }; -- cgit From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 3 Jul 2018 14:36:21 -0700 Subject: net/ipv6: Revert attempt to simplify route replace and append NetworkManager likes to manage linklocal prefix routes and does so with the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route code and by extension enable multipath routes with device only nexthops. Revert f34436a43092 and these followup patches: 6eba08c3626b ("ipv6: Only emit append events for appended routes"). ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic") 53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes") Update the fib_tests cases to reflect the old behavior. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: David Ahern --- include/net/ip6_route.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59656fc580df..7b9c82de11cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) +{ + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, -- cgit From a9ba23d48dbc6ffd08426bb10f05720e0b9f5c14 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 4 Jul 2018 09:58:05 -0400 Subject: ipv6: make ipv6_renew_options() interrupt/kernel safe At present the ipv6_renew_options_kern() function ends up calling into access_ok() which is problematic if done from inside an interrupt as access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures (x86-64 is affected). Example warning/backtrace is shown below: WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90 ... Call Trace: ipv6_renew_option+0xb2/0xf0 ipv6_renew_options+0x26a/0x340 ipv6_renew_options_kern+0x2c/0x40 calipso_req_setattr+0x72/0xe0 netlbl_req_setattr+0x126/0x1b0 selinux_netlbl_inet_conn_request+0x80/0x100 selinux_inet_conn_request+0x6d/0xb0 security_inet_conn_request+0x32/0x50 tcp_conn_request+0x35f/0xe00 ? __lock_acquire+0x250/0x16c0 ? selinux_socket_sock_rcv_skb+0x1ae/0x210 ? tcp_rcv_state_process+0x289/0x106b tcp_rcv_state_process+0x289/0x106b ? tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_rcv+0xc82/0xcf0 ip6_input_finish+0x10d/0x690 ip6_input+0x45/0x1e0 ? ip6_rcv_finish+0x1d0/0x1d0 ipv6_rcv+0x32b/0x880 ? ip6_make_skb+0x1e0/0x1e0 __netif_receive_skb_core+0x6f2/0xdf0 ? process_backlog+0x85/0x250 ? process_backlog+0x85/0x250 ? process_backlog+0xec/0x250 process_backlog+0xec/0x250 net_rx_action+0x153/0x480 __do_softirq+0xd9/0x4f7 do_softirq_own_stack+0x2a/0x40 ... While not present in the backtrace, ipv6_renew_option() ends up calling access_ok() via the following chain: access_ok() _copy_from_user() copy_from_user() ipv6_renew_option() The fix presented in this patch is to perform the userspace copy earlier in the call chain such that it is only called when the option data is actually coming from userspace; that place is do_ipv6_setsockopt(). Not only does this solve the problem seen in the backtrace above, it also allows us to simplify the code quite a bit by removing ipv6_renew_options_kern() completely. We also take this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option() a small amount as well. This patch is heavily based on a rough patch by Al Viro. I've taken his original patch, converted a kmemdup() call in do_ipv6_setsockopt() to a memdup_user() call, made better use of the e_inval jump target in the same function, and cleaned up the use ipv6_renew_option() by ipv6_renew_options(). CC: Al Viro Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- include/net/ipv6.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 16475c269749..d02881e4ad1f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, - struct ipv6_opt_hdr __user *newopt, - int newoptlen); -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, - struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr *newopt, - int newoptlen); + struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -- cgit From cfdb0c2d095ac5d7f09cac1317b7d0a9e8178134 Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:12:50 +0530 Subject: Bluetooth: Store Resolv list size When the controller supports the Read LE Resolv List size feature, the maximum list size are read and now stored. Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 17.979791 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 17.980629 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 17.980786 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 17.981627 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #59 [hci0] 17.981786 > HCI Event: Command Complete (0x0e) plen 12 #60 [hci0] 17.982636 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++++++ include/net/bluetooth/hci_core.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 1668211297a9..484f24c7a415 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,12 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a +struct hci_rp_le_read_resolv_list_size { + __u8 status; + __u8 size; +} __packed; + #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 893bbbb5d2fa..409f49bd8338 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -221,6 +221,7 @@ struct hci_dev { __u8 features[HCI_MAX_PAGES][8]; __u8 le_features[8]; __u8 le_white_list_size; + __u8 le_resolv_list_size; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -367,6 +368,7 @@ struct hci_dev { struct list_head identity_resolving_keys; struct list_head remote_oob_data; struct list_head le_white_list; + struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; -- cgit From 545f2596b907f0747170c7cb71edc74cecf68c5c Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:13:20 +0530 Subject: Bluetooth: Add HCI command for clear Resolv list Check for Resolv list supported by controller. So check the supported commmand first before issuing this command i.e.,HCI_OP_LE_CLEAR_RESOLV_LIST Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 28.919131 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 28.920016 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 28.920164 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 28.920873 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 28.921109 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 28.922016 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear Resolving... (0x08|0x0029) plen 0 #61 [hci0] 28.922166 > HCI Event: Command Complete (0x0e) plen 4 #62 [hci0] 28.922872 LE Clear Resolving List (0x08|0x0029) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #63 [hci0] 28.923117 > HCI Event: Command Complete (0x0e) plen 12 #64 [hci0] 28.924030 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 484f24c7a415..4af1a3a4d9b1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,8 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_CLEAR_RESOLV_LIST 0x2029 + #define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a struct hci_rp_le_read_resolv_list_size { __u8 status; -- cgit From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 5 Jul 2018 12:01:53 +0200 Subject: netfilter: nf_tproxy: fix possible non-linear access to transport header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a silent out-of-bound read possibility that was present because of the misuse of this function. Mostly it was called with a struct udphdr *hp which had only the udphdr part linearized by the skb_header_pointer, however nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for tcp specific attributes may be invalid. Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..4cc64c8446eb 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * belonging to established connections going through that one. */ struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, struct sock *sk); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, -- cgit From a948f713842ad5c23f125efc61dee6951893219c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Tue, 3 Jul 2018 15:05:48 -0500 Subject: nl80211/mac80211: allow non-linear skb in rx_control_port The current implementation of cfg80211_rx_control_port assumed that the caller could provide a contiguous region of memory for the control port frame to be sent up to userspace. Unfortunately, many drivers produce non-linear skbs, especially for data frames. This resulted in userspace getting notified of control port frames with correct metadata (from address, port, etc) yet garbage / nonsense contents, resulting in bad handshakes, disconnections, etc. mac80211 linearizes skbs containing management frames. But it didn't seem worthwhile to do this for control port frames. Thus the signature of cfg80211_rx_control_port was changed to take the skb directly. nl80211 then takes care of obtaining control port frame data directly from the (linear | non-linear) skb. The caller is still responsible for freeing the skb, cfg80211_rx_control_port does not take ownership of it. Fixes: 6a671a50f819 ("nl80211: Add CMD_CONTROL_PORT_FRAME API") Signed-off-by: Denis Kenzior [fix some kernel-doc formatting, add fixes tag] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5fbfe61f41c6..1beb3ead0385 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5835,10 +5835,11 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, /** * cfg80211_rx_control_port - notification about a received control port frame * @dev: The device the frame matched to - * @buf: control port frame - * @len: length of the frame data - * @addr: The peer from which the frame was received - * @proto: frame protocol, typically PAE or Pre-authentication + * @skb: The skbuf with the control port frame. It is assumed that the skbuf + * is 802.3 formatted (with 802.3 header). The skb can be non-linear. + * This function does not take ownership of the skb, so the caller is + * responsible for any cleanup. The caller must also ensure that + * skb->protocol is set appropriately. * @unencrypted: Whether the frame was received unencrypted * * This function is used to inform userspace about a received control port @@ -5851,8 +5852,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, * Return: %true if the frame was passed to userspace */ bool cfg80211_rx_control_port(struct net_device *dev, - const u8 *buf, size_t len, - const u8 *addr, u16 proto, bool unencrypted); + struct sk_buff *skb, bool unencrypted); /** * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event -- cgit From e240cd0df48185a28c153f83a39ba3940e3e9b86 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Jul 2018 19:06:43 +0200 Subject: netfilter: nf_tables: place all set backends in one single module This patch disallows rbtree with single elements, which is causing problems with the recent timeout support. Before this patch, you could opt out individual set representations per module, which is just adding extra complexity. Fixes: 8d8540c4f5e0("netfilter: nft_set_rbtree: add timeout support") Reported-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index e0c0c2558ec4..a05134507e7b 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -65,4 +65,10 @@ extern const struct nft_expr_ops nft_payload_fast_ops; extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; +extern struct nft_set_type nft_set_rhash_type; +extern struct nft_set_type nft_set_hash_type; +extern struct nft_set_type nft_set_hash_fast_type; +extern struct nft_set_type nft_set_rbtree_type; +extern struct nft_set_type nft_set_bitmap_type; + #endif /* _NET_NF_TABLES_CORE_H */ -- cgit From a2344b9e3a8c5c2064306b0d99b0e9a6c4813c08 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:28 +0530 Subject: Bluetooth: Use extended scanning if controller supports This implements Set extended scan param and set extended scan enable commands and use it for start LE scan based on controller support. The new features added in these commands are setting of new PHY for scanning and setting of scan duration. Both features are disabled for now, meaning only 1M PHY is set and scan duration is set to 0 which means that scanning will be done untill scan disable is called. < HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 8 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x01 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 24 ++++++++++++++++++++++++ include/net/bluetooth/hci_core.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4af1a3a4d9b1..8c2868f439e7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,30 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 +struct hci_cp_le_set_ext_scan_params { + __u8 own_addr_type; + __u8 filter_policy; + __u8 scanning_phys; + __u8 data[0]; +} __packed; + +#define LE_SCAN_PHY_1M 0x01 + +struct hci_cp_le_scan_phy_params { + __u8 type; + __le16 interval; + __le16 window; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_ENABLE 0x2042 +struct hci_cp_le_set_ext_scan_enable { + __u8 enable; + __u8 filter_dup; + __le16 duration; + __le16 period; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 409f49bd8338..cc0bde74dd45 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1158,6 +1158,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +/* Use ext scanning if set ext scan param and ext scan enable is supported */ +#define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ + ((dev)->commands[37] & 0x40)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit From c215e9397b00b3045a668120ed7dbd89f2866e74 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:29 +0530 Subject: Bluetooth: Process extended ADV report event This patch enables Extended ADV report event if extended scanning is supported in the controller and process the same. The new features are not handled and for now its as good as legacy ADV report. > HCI Event: LE Meta Event (0x3e) plen 53 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x0013 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Legacy PDU Type: ADV_IND (0x0013) Address type: Random (0x01) Address: DB:7E:2E:1A:85:E8 (Static) Primary PHY: LE 1M Secondary PHY: LE 1M SID: 0x00 TX power: 0 dBm RSSI: -90 dBm (0xa6) Periodic advertising invteral: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1b 0f 09 44 65 73 69 67 6e 65 72 20 4d 6f 75 73 65 ..Designer Mouse 03 19 c2 03 02 01 05 03 03 12 18 ........... Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8c2868f439e7..0ec51eb14810 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1925,6 +1925,15 @@ struct hci_ev_le_conn_complete { #define LE_ADV_SCAN_IND 0x02 #define LE_ADV_NONCONN_IND 0x03 #define LE_ADV_SCAN_RSP 0x04 +#define LE_ADV_INVALID 0x05 + +/* Legacy event types in extended adv report */ +#define LE_LEGACY_ADV_IND 0x0013 +#define LE_LEGACY_ADV_DIRECT_IND 0x0015 +#define LE_LEGACY_ADV_SCAN_IND 0x0012 +#define LE_LEGACY_NONCONN_IND 0x0010 +#define LE_LEGACY_SCAN_RSP_ADV 0x001b +#define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 @@ -1989,6 +1998,23 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +#define HCI_EV_LE_EXT_ADV_REPORT 0x0d +struct hci_ev_le_ext_adv_report { + __le16 evt_type; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 primary_phy; + __u8 secondary_phy; + __u8 sid; + __u8 tx_power; + __s8 rssi; + __le16 interval; + __u8 direct_addr_type; + bdaddr_t direct_addr; + __u8 length; + __u8 data[0]; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { -- cgit From 4d94f95d30c8fbfe86068e9abed110974d697cf5 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 22:50:32 +0200 Subject: Bluetooth: Use extended LE Connection if supported This implements extended LE craete connection and enhanced LE conn complete event if the controller supports. For now it is as good as legacy LE connection and event as no new features in the extended connection is handled. < HCI Command: LE Extended Create Connection (0x08|0x0043) plen 26 Filter policy: White list is not used (0x00) Own address type: Public (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Initiating PHYs: 0x01 Entry 0: LE 1M Scan interval: 60.000 msec (0x0060) Scan window: 60.000 msec (0x0060) Min connection interval: 50.00 msec (0x0028) Max connection interval: 70.00 msec (0x0038) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Min connection length: 0.000 msec (0x0000) Max connection length: 0.000 msec (0x0000) > HCI Event: Command Status (0x0f) plen 4 LE Extended Create Connection (0x08|0x0043) ncmd 2 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 31 LE Enhanced Connection Complete (0x0a) Status: Success (0x00) Handle: 3585 Role: Master (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Local resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Peer resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Connection interval: 67.50 msec (0x0036) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Master clock accuracy: 0x00 @ MGMT Event: Device Connected (0x000b) plen 40 LE Address: DB:7E:2E:1D:85:E8 (Static) Flags: 0x00000000 Data length: 27 Name (complete): Designer Mouse Appearance: Mouse (0x03c2) Flags: 0x05 LE Limited Discoverable Mode BR/EDR Not Supported 16-bit Service UUIDs (complete): 1 entry Human Interface Device (0x1812) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 36 ++++++++++++++++++++++++++++++++++++ include/net/bluetooth/hci_core.h | 2 ++ 2 files changed, 38 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0ec51eb14810..73e48be5bbb3 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1538,6 +1538,27 @@ struct hci_cp_le_set_ext_scan_enable { __le16 period; } __packed; +#define HCI_OP_LE_EXT_CREATE_CONN 0x2043 +struct hci_cp_le_ext_create_conn { + __u8 filter_policy; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 phys; + __u8 data[0]; +} __packed; + +struct hci_cp_le_ext_conn_param { + __le16 scan_interval; + __le16 scan_window; + __le16 conn_interval_min; + __le16 conn_interval_max; + __le16 conn_latency; + __le16 supervision_timeout; + __le16 min_ce_len; + __le16 max_ce_len; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 @@ -2015,6 +2036,21 @@ struct hci_ev_le_ext_adv_report { __u8 data[0]; } __packed; +#define HCI_EV_LE_ENHANCED_CONN_COMPLETE 0x0a +struct hci_ev_le_enh_conn_complete { + __u8 status; + __le16 handle; + __u8 role; + __u8 bdaddr_type; + bdaddr_t bdaddr; + bdaddr_t local_rpa; + bdaddr_t peer_rpa; + __le16 interval; + __le16 latency; + __le16 supervision_timeout; + __u8 clk_accurancy; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc0bde74dd45..a74453571264 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1161,6 +1161,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) +/* Use ext create connection if command is supported */ +#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit From 11a245e2f7bf25fc21f47e4c9c8491841b128890 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:05 +0200 Subject: net/sched: act_csum: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_csum must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_CSUM, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action csum ip or tcp or udp or sctp goto chain 1 from triggering a NULL pointer dereference when a matching packet is received. BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000010416b067 P4D 800000010416b067 PUD 1041be067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3072 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? arp_rcv+0x121/0x1b0 ? __x2apic_send_IPI_dest+0x40/0x40 ? smp_reschedule_interrupt+0x1c/0xd0 ? reschedule_interrupt+0xf/0x20 ? reschedule_interrupt+0xa/0x20 ? device_is_rmrr_locked+0xe/0x50 ? iommu_should_identity_map+0x49/0xd0 ? __intel_map_single+0x30/0x140 ? e1000e_update_rdt_wa.isra.52+0x22/0xb0 [e1000e] ? e1000_alloc_rx_buffers+0x233/0x250 [e1000e] ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_task_fair+0xb6/0x7d0 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f5a45cbec93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffd0ee6d748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000001161010 RCX: 00007f5a45cbec93 RDX: 0000000000000062 RSI: 0000000001161322 RDI: 0000000000000003 RBP: 00007ffd0ee6d780 R08: 00007ffd0ee6d760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 0000000001161322 R14: 00007ffd0ee6d760 R15: 0000000000000003 Modules linked in: act_csum act_gact cls_flower sch_ingress vrf veth act_tunnel_key(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_hdmi snd_hda_codec_realtek kvm snd_hda_codec_generic hp_wmi iTCO_wdt sparse_keymap rfkill mei_wdt iTCO_vendor_support wmi_bmof gpio_ich irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel snd_hda_intel crypto_simd cryptd snd_hda_codec glue_helper snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm pcspkr i2c_i801 snd_timer snd sg lpc_ich soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom sd_mod ahci libahci crc32c_intel i915 ixgbe serio_raw libata video dca i2c_algo_bit sfc drm_kms_helper syscopyarea mtd sysfillrect mdio sysimgblt fb_sys_fops drm e1000e i2c_core CR2: 0000000000000000 ---[ end trace 3c9e9d1a77df4026 ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x26400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: 9c5f69bbd75a ("net/sched: act_csum: don't use spinlock in the fast path") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_csum.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 9470fd7e4350..32d2454c0479 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -7,7 +7,6 @@ #include struct tcf_csum_params { - int action; u32 update_flags; struct rcu_head rcu; }; -- cgit From 38230a3e0e0933bbcf5df6fa469ba0667f667568 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:06 +0200 Subject: net/sched: act_tunnel_key: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_tunnel_key must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_TUNNEL_KEY, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action tunnel_key unset goto chain 1 from causing a NULL dereference when a matching packet is received: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 80000001097ac067 P4D 80000001097ac067 PUD 103b0a067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3491 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? update_load_avg+0x665/0x690 ? update_load_avg+0x665/0x690 ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_entity+0xd0/0x270 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fd67e18dc93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffe0189b748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000020ca010 RCX: 00007fd67e18dc93 RDX: 0000000000000062 RSI: 00000000020ca322 RDI: 0000000000000003 RBP: 00007ffe0189b780 R08: 00007ffe0189b760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 00000000020ca322 R14: 00007ffe0189b760 R15: 0000000000000003 Modules linked in: act_tunnel_key act_gact cls_flower sch_ingress vrf veth act_csum(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl snd_hda_codec_hdmi x86_pkg_temp_thermal intel_powerclamp snd_hda_codec_realtek coretemp snd_hda_codec_generic kvm_intel kvm irqbypass snd_hda_intel crct10dif_pclmul crc32_pclmul hp_wmi ghash_clmulni_intel pcbc snd_hda_codec aesni_intel sparse_keymap rfkill snd_hda_core snd_hwdep snd_seq crypto_simd iTCO_wdt gpio_ich iTCO_vendor_support wmi_bmof cryptd mei_wdt glue_helper snd_seq_device snd_pcm pcspkr snd_timer snd i2c_i801 lpc_ich sg soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod sr_mod cdrom i915 video i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ahci crc32c_intel libahci serio_raw sfc libata mtd drm ixgbe mdio i2c_core e1000e dca CR2: 0000000000000000 ---[ end trace 1ab8b5b5d4639dfc ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x11400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index efef0b4b1b2b..46b8c7f1c8d5 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -18,7 +18,6 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; - int action; struct metadata_dst *tcft_enc_metadata; }; -- cgit From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in it entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This was done here, e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") When it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case. Even though it happens to work here because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later this has potential to create some subtle issues. But, even more concretely the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offset_of() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use correct structures its unclear how long this might actually work for until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 800582b5dd54..af3ec72d5d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -828,6 +828,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, -- cgit From 03dc7a35fcc83a199121a5156c4a7a976b836682 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:19:14 +0200 Subject: ipv6: xfrm: use 64-bit timestamps get_seconds() is deprecated because it can overflow on 32-bit architectures. For the xfrm_state->lastused member, we treat the data as a 64-bit number already, so we just need to use the right accessor that works on both 32-bit and 64-bit machines. Signed-off-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a5378613a49c..1350e2cf0749 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -227,7 +227,7 @@ struct xfrm_state { long saved_tmo; /* Last used time */ - unsigned long lastused; + time64_t lastused; struct page_frag xfrag; -- cgit From 05296620f6d14dce0030b87e1e57891a770fb65c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:40 -0700 Subject: xdp: factor out common program/flags handling from drivers Basic operations drivers perform during xdp setup and query can be moved to helpers in the core. Encapsulate program and flags into a structure and add helpers. Note that the structure is intended as the "main" program information source in the driver. Most drivers will additionally place the program pointer in their fast path or ring structures. The helpers don't have a huge impact now, but they will decrease the code duplication when programs can be installed in HW and driver at the same time. Encapsulating the basic operations in helpers will hopefully also reduce the number of changes to drivers which adopt them. Helpers could really be static inline, but they depend on definition of struct netdev_bpf which means they'd have to be placed in netdevice.h, an already 4500 line header. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 2deea7166a34..fcb033f51d8c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -144,4 +144,17 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +struct xdp_attachment_info { + struct bpf_prog *prog; + u32 flags; +}; + +struct netdev_bpf; +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); + #endif /* __LINUX_NET_XDP_H__ */ -- cgit From 01683a1469995cc7aaf833d6f8b3f1c1d2fc3b92 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 13:29:11 +0300 Subject: net: sched: refactor flower walk to iterate over idr Extend struct tcf_walker with additional 'cookie' field. It is intended to be used by classifier walk implementations to continue iteration directly from particular filter, instead of iterating 'skip' number of times. Change flower walk implementation to save filter handle in 'cookie'. Each time flower walk is called, it looks up filter with saved handle directly with idr, instead of iterating over filter linked list 'skip' number of times. This change improves complexity of dumping flower classifier from quadratic to linearithmic. (assuming idr lookup has logarithmic complexity) Reviewed-by: Jiri Pirko Signed-off-by: Vlad Buslov Reported-by: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2081e4219f81..e4252a176eec 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -13,6 +13,7 @@ struct tcf_walker { int stop; int skip; int count; + unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; -- cgit From a69258f7aa2623e0930212f09c586fd06674ad79 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jul 2018 06:04:53 -0700 Subject: tcp: remove DELAYED ACK events in DCTCP After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK related callbacks are no longer needed Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index af3ec72d5d41..3482d13d655b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -912,8 +912,6 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ - CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ - CA_EVENT_NON_DELAYED_ACK, }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -- cgit From d80a1b9d186057ddb0d384ba601cf2b7d214539c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:39 +0300 Subject: tls: Refactor tls_offload variable names For symmetry, we rename tls_offload_context to tls_offload_context_tx before we add tls_offload_context_rx. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 70c273777fe9..5dcd808236a7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -128,7 +128,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -struct tls_offload_context { +struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; @@ -147,8 +147,8 @@ struct tls_offload_context { #define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) }; -#define TLS_OFFLOAD_CONTEXT_SIZE \ - (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ +#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ + (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) enum { @@ -239,7 +239,7 @@ void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) @@ -380,10 +380,10 @@ static inline struct tls_sw_context_tx *tls_sw_ctx_tx( return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } -static inline struct tls_offload_context *tls_offload_ctx( - const struct tls_context *tls_ctx) +static inline struct tls_offload_context_tx * +tls_offload_ctx_tx(const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; + return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, @@ -396,7 +396,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct sk_buff *skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); #endif /* _TLS_OFFLOAD_H */ -- cgit From dafb67f3bb4a58a45fe92c1e362ea6429831688a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:40 +0300 Subject: tls: Split decrypt_skb to two functions Previously, decrypt_skb also updated the TLS context. Now, decrypt_skb only decrypts the payload using the current context, while decrypt_skb_update also updates the state. Later, in the tls_device Rx flow, we will use decrypt_skb directly. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 5dcd808236a7..49b89221db43 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,6 +390,8 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout); struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, -- cgit From 39f56e1a78d647316db330c3b6f4c5637a895e3b Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:41 +0300 Subject: tls: Split tls_sw_release_resources_rx This patch splits tls_sw_release_resources_rx into two functions one which releases all inner software tls structures and another that also frees the containing structure. In TLS_DEVICE we will need to release the software structures without freeeing the containing structure, which contains other information. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 49b89221db43..7a485de25646 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -223,6 +223,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, void tls_sw_close(struct sock *sk, long timeout); void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); +void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, -- cgit From 4799ac81e52a72a6404827bf2738337bb581a174 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:43 +0300 Subject: tls: Add rx inline crypto offload This patch completes the generic infrastructure to offload TLS crypto to a network device. It enables the kernel to skip decryption and authentication of some skbs marked as decrypted by the NIC. In the fast path, all packets received are decrypted by the NIC and the performance is comparable to plain TCP. This infrastructure doesn't require a TCP offload engine. Instead, the NIC only decrypts packets that contain the expected TCP sequence number. Out-Of-Order TCP packets are provided unmodified. As a result, at the worst case a received TLS record consists of both plaintext and ciphertext packets. These partially decrypted records must be reencrypted, only to be decrypted. The notable differences between SW KTLS Rx and this offload are as follows: 1. Partial decryption - Software must handle the case of a TLS record that was only partially decrypted by HW. This can happen due to packet reordering. 2. Resynchronization - tls_read_size calls the device driver to resynchronize HW after HW lost track of TLS record framing in the TCP stream. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 7a485de25646..d8b3b6578c01 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,16 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; +enum { + TLS_BASE, + TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif + TLS_HW_RECORD, + TLS_NUM_CONFIG, +}; + struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; @@ -197,6 +207,7 @@ struct tls_context { int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); + void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); int (*setsockopt)(struct sock *sk, int level, @@ -209,13 +220,27 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +struct tls_offload_context_rx { + /* sw must be the first member of tls_offload_context_rx */ + struct tls_sw_context_rx sw; + atomic64_t resync_req; + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ + (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); - int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, @@ -290,11 +315,19 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +struct sk_buff * +tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { - return sk_fullsock(sk) && - /* matches smp_store_release in tls_set_device_offload */ - smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + return sk_fullsock(sk) & + (smp_load_acquire(&sk->sk_validate_xmit_skb) == + &tls_validate_xmit_skb); +#else + return false; +#endif } static inline void tls_err_abort(struct sock *sk, int err) @@ -387,10 +420,27 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline struct tls_offload_context_rx * +tls_offload_ctx_rx(const struct tls_context *tls_ctx) +{ + return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; +} + +/* The TLS context is valid until sk_destruct is called */ +static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); +} + + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); @@ -402,4 +452,9 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); + +void tls_device_offload_cleanup_rx(struct sock *sk); +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn); + #endif /* _TLS_OFFLOAD_H */ -- cgit From f286586df68e7733a8e651098401f139dc2e17f4 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Mon, 18 Jun 2018 15:12:52 +0200 Subject: netfilter: nft_tproxy: Move nf_tproxy_assign_sock() to nf_tproxy.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is also necessary to implement nft tproxy support Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..d5a80888cbe4 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -17,6 +17,14 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) return false; } +/* assign a socket to the skb -- consumes sk */ +static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); /** -- cgit From 60e3be94e6a1c5162a0763c9aafb5190b2b1fdce Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:55:32 +0200 Subject: openvswitch: use nf_ct_get_tuplepr, invert_tuplepr These versions deal with the l3proto/l4proto details internally. It removes only caller of nf_ct_get_tuple, so make it static. After this, l3proto->get_l4proto() can be removed in a followup patch. Signed-off-by: Florian Westphal Acked-by: Pravin B Shelar Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 9b5e7634713e..90df45022c51 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -40,13 +40,6 @@ void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); -bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, - unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - struct net *net, - struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *l4proto); - bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_l3proto *l3proto, -- cgit From f957be9d349a3800940f823b16e12b0405cc305b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:44 +0200 Subject: netfilter: conntrack: remove ctnetlink callbacks from l3 protocol trackers handle everything from ctnetlink directly. After all these years we still only support ipv4 and ipv6, so it seems reasonable to remove l3 protocol tracker support and instead handle ipv4/ipv6 from a common, always builtin inet tracker. Step 1: Get rid of all the l3proto->func() calls. Start with ctnetlink, then move on to packet-path ones. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 6 ++---- include/net/netfilter/nf_conntrack_l3proto.h | 8 -------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 90df45022c51..d454a53ba646 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -68,10 +68,8 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -void -print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *proto); +void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l4proto *proto); #define CONNTRACK_LOCKS 1024 diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d5808f3e2715..d07b5216a925 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -46,14 +46,6 @@ struct nf_conntrack_l3proto { int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - int (*tuple_to_nlattr)(struct sk_buff *skb, - const struct nf_conntrack_tuple *t); - int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); - const struct nla_policy *nla_policy; -#endif - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); -- cgit From 47a91b14de62e35d1466820cbb4c024b6c02dff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:45 +0200 Subject: netfilter: conntrack: remove pkt_to_tuple indirection from l3 protocol trackers Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d07b5216a925..ece231450f30 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Try to fill in the third arg: nhoff is offset of l3 proto - * hdr. Return true if possible. - */ - bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple); - /* * Invert the per-proto part of the tuple: ie. turn xmit into reply. * Some packets can't be inverted: return 0 in that case. -- cgit From d1b6fe94941f43e4743d5fea953d16b0a001c2c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:46 +0200 Subject: netfilter: conntrack: remove invert_tuple indirection from l3 protocol trackers Its simpler to just handle it directly in nf_ct_invert_tuple(). Also gets rid of need to pass l3proto pointer to resolve_conntrack(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 7 ------- 2 files changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index d454a53ba646..35461b2d3462 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -42,7 +42,6 @@ void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); /* Find a connection corresponding to a tuple. */ diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index ece231450f30..164641c743a5 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. - */ - bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig); - /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) in skb -- cgit From 6816d931cab009024b68c11c4cf752f8bf9a1e32 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:47 +0200 Subject: netfilter: conntrack: remove get_l4proto indirection from l3 protocol trackers Handle it in the core instead. ipv6_skip_exthdr() is built-in even if ipv6 is a module, i.e. this doesn't create an ipv6 dependency. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 164641c743a5..5f160375c93a 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,14 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Called before tracking. - * *dataoff: offset of protocol header (TCP, UDP,...) in skb - * *protonum: protocol number - */ - int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum); - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); -- cgit From 8b3892ea8718920d29432328fe9544d89a429614 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:48 +0200 Subject: netfilter: conntrack: avoid calls to l4proto invert_tuple Handle the common cases (tcp, udp, etc). in the core and only do the indirect call for the protocols that need it (GRE for instance). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index a7220eef9aee..6a55e337a161 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -36,7 +36,7 @@ struct nf_conntrack_l4proto { struct net *net, struct nf_conntrack_tuple *tuple); /* Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. + * Only used by icmp, most protocols use a generic version. */ bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); -- cgit From c779e849608a875448f6ffc2a5c2a15523bdcd00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:50 +0200 Subject: netfilter: conntrack: remove get_timeout() indirection Not needed, we can have the l4trackers fetch it themselvs. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 8 ++------ include/net/netfilter/nf_conntrack_timeout.h | 18 ++++-------------- 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6a55e337a161..c7a0075d96df 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -45,13 +45,12 @@ struct nf_conntrack_l4proto { int (*packet)(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts); + enum ip_conntrack_info ctinfo); /* Called when a new connection for this protocol found; * returns TRUE if it's OK. If so, packet() called next. */ bool (*new)(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts); + unsigned int dataoff); /* Called when a conntrack entry is destroyed */ void (*destroy)(struct nf_conn *ct); @@ -63,9 +62,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Return the array of timeouts for this protocol. */ - unsigned int *(*get_timeouts)(struct net *net); - /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9468ab4ad12d..80ceb3d0291d 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -67,27 +67,17 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, #endif }; -static inline unsigned int * -nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, - const struct nf_conntrack_l4proto *l4proto) +static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { + unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; - unsigned int *timeouts; timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) { + if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - - return timeouts; -#else - return l4proto->get_timeouts(net); #endif + return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -- cgit From c7ea20c9da5b94e400c8dcc0adb99411f2e430a6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:27 +0800 Subject: ipv6/mcast: init as INCLUDE when join SSM INCLUDE group This an IPv6 version patch of "ipv4/igmp: init group mode as INCLUDE when join source group". From RFC3810, part 6.1: If no per-interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have an INCLUDE filter mode and an empty source list. Which means a new multicast group should start with state IN(). Currently, for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(), then ip6_mc_source(), which will trigger a TO_IN() message instead of ALLOW(). The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A). Fix it by adding a new parameter to init group mode. Also add some wrapper functions to avoid changing too much code. v1 -> v2: In the first version I only cleared the group change record. But this is not enough. Because when a new group join, it will init as EXCLUDE and trigger a filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses sf_crcount. This will prevent early joined address sending state change records if multi source addressed joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separated patches for IPv4 and IPv6. There is also a difference between v4 and v6 version. For IPv6, when the interface goes down and up, we will send correct state change record with unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is completed, we resend the change record TO_IN() in mld_send_initial_cr(). Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr(). Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d02881e4ad1f..7528632bcf2a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1100,6 +1100,8 @@ void ipv6_sysctl_unregister(void); int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); #endif /* _NET_IPV6_H */ -- cgit From a0ae2562c6c4b2721d9fddba63b7286c13517d9f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:51 +0200 Subject: netfilter: conntrack: remove l3proto abstraction This unifies ipv4 and ipv6 protocol trackers and removes the l3proto abstraction. This gets rid of all l3proto indirect calls and the need to do a lookup on the function to call for l3 demux. It increases module size by only a small amount (12kbyte), so this reduces size because nf_conntrack.ko is useless without either nf_conntrack_ipv4 or nf_conntrack_ipv6 module. before: text data bss dec hex filename 7357 1088 0 8445 20fd nf_conntrack_ipv4.ko 7405 1084 4 8493 212d nf_conntrack_ipv6.ko 72614 13689 236 86539 1520b nf_conntrack.ko 19K nf_conntrack_ipv4.ko 19K nf_conntrack_ipv6.ko 179K nf_conntrack.ko after: text data bss dec hex filename 79277 13937 236 93450 16d0a nf_conntrack.ko 191K nf_conntrack.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 -- include/net/netfilter/nf_conntrack.h | 5 +++ include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 54 -------------------------- include/net/netfilter/nf_conntrack_l4proto.h | 4 -- 5 files changed, 5 insertions(+), 62 deletions(-) delete mode 100644 include/net/netfilter/nf_conntrack_l3proto.h (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 73f825732326..c84b51682f08 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,9 +10,6 @@ #ifndef _NF_CONNTRACK_IPV4_H #define _NF_CONNTRACK_IPV4_H - -const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; - extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 062dc19b5840..a2b0ed025908 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -41,6 +41,11 @@ union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; +struct nf_conntrack_net { + unsigned int users4; + unsigned int users6; +}; + #include #include diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 35461b2d3462..2a3e0974a6af 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -14,7 +14,6 @@ #define _NF_CONNTRACK_CORE_H #include -#include #include #include diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h deleted file mode 100644 index 5f160375c93a..000000000000 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C)2003,2004 USAGI/WIDE Project - * - * Header for use in defining a given L3 protocol for connection tracking. - * - * Author: - * Yasuyuki Kozakai @USAGI - * - * Derived from include/netfilter_ipv4/ip_conntrack_protocol.h - */ - -#ifndef _NF_CONNTRACK_L3PROTO_H -#define _NF_CONNTRACK_L3PROTO_H -#include -#include -#include -#include - -struct nf_conntrack_l3proto { - /* L3 Protocol Family number. ex) PF_INET */ - u_int16_t l3proto; - - /* size of tuple nlattr, fills a hole */ - u16 nla_size; - - /* Called when netns wants to use connection tracking */ - int (*net_ns_get)(struct net *); - void (*net_ns_put)(struct net *); - - /* Module (if any) which this is connected to. */ - struct module *me; -}; - -extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; - -/* Protocol global registration. */ -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto); -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto); - -const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); - -/* Existing built-in protocols */ -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; - -static inline struct nf_conntrack_l3proto * -__nf_ct_l3proto_find(u_int16_t l3proto) -{ - if (unlikely(l3proto >= NFPROTO_NUMPROTO)) - return &nf_conntrack_l3proto_generic; - return rcu_dereference(nf_ct_l3protos[l3proto]); -} - -#endif /*_NF_CONNTRACK_L3PROTO_H*/ diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index c7a0075d96df..6068c6da3eac 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -130,10 +130,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, /* Protocol global registration. */ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); -void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, -- cgit From 26b2f552525cf98fad08515bd6faa427f2f22038 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Jul 2018 01:38:08 +0900 Subject: netfilter: nf_tables: fix jumpstack depth validation The level of struct nft_ctx is updated by nf_tables_check_loops(). That is used to validate jumpstack depth. But jumpstack validation routine doesn't update and validate recursively. So, in some cases, chain depth can be bigger than the NFT_JUMP_STACK_SIZE. After this patch, The jumpstack validation routine is located in the nft_chain_validate(). When new rules or new set elements are added, the nft_table_validate() is called by the nf_tables_newrule and the nf_tables_newsetelem. The nft_table_validate() calls the nft_chain_validate() that visit all their children chains recursively. So it can update depth of chain certainly. Reproducer: %cat ./test.sh #!/bin/bash nft add table ip filter nft add chain ip filter input { type filter hook input priority 0\; } for ((i=0;i<20;i++)); do nft add chain ip filter a$i done nft add rule ip filter input jump a1 for ((i=0;i<10;i++)); do nft add rule ip filter a$i jump a$((i+1)) done for ((i=11;i<19;i++)); do nft add rule ip filter a$i jump a$((i+1)) done nft add rule ip filter a10 jump a11 Result: [ 253.931782] WARNING: CPU: 1 PID: 0 at net/netfilter/nf_tables_core.c:186 nft_do_chain+0xacc/0xdf0 [nf_tables] [ 253.931915] Modules linked in: nf_tables nfnetlink ip_tables x_tables [ 253.932153] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.18.0-rc3+ #48 [ 253.932153] RIP: 0010:nft_do_chain+0xacc/0xdf0 [nf_tables] [ 253.932153] Code: 83 f8 fb 0f 84 c7 00 00 00 e9 d0 00 00 00 83 f8 fd 74 0e 83 f8 ff 0f 84 b4 00 00 00 e9 bd 00 00 00 83 bd 64 fd ff ff 0f 76 09 <0f> 0b 31 c0 e9 bc 02 00 00 44 8b ad 64 fd [ 253.933807] RSP: 0018:ffff88011b807570 EFLAGS: 00010212 [ 253.933807] RAX: 00000000fffffffd RBX: ffff88011b807660 RCX: 0000000000000000 [ 253.933807] RDX: 0000000000000010 RSI: ffff880112b39d78 RDI: ffff88011b807670 [ 253.933807] RBP: ffff88011b807850 R08: ffffed0023700ece R09: ffffed0023700ecd [ 253.933807] R10: ffff88011b80766f R11: ffffed0023700ece R12: ffff88011b807898 [ 253.933807] R13: ffff880112b39d80 R14: ffff880112b39d60 R15: dffffc0000000000 [ 253.933807] FS: 0000000000000000(0000) GS:ffff88011b800000(0000) knlGS:0000000000000000 [ 253.933807] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 253.933807] CR2: 00000000014f1008 CR3: 000000006b216000 CR4: 00000000001006e0 [ 253.933807] Call Trace: [ 253.933807] [ 253.933807] ? sched_clock_cpu+0x132/0x170 [ 253.933807] ? __nft_trace_packet+0x180/0x180 [nf_tables] [ 253.933807] ? sched_clock_cpu+0x132/0x170 [ 253.933807] ? debug_show_all_locks+0x290/0x290 [ 253.933807] ? __lock_acquire+0x4835/0x4af0 [ 253.933807] ? inet_ehash_locks_alloc+0x1a0/0x1a0 [ 253.933807] ? unwind_next_frame+0x159e/0x1840 [ 253.933807] ? __read_once_size_nocheck.constprop.4+0x5/0x10 [ 253.933807] ? nft_do_chain_ipv4+0x197/0x1e0 [nf_tables] [ 253.933807] ? nft_do_chain+0x5/0xdf0 [nf_tables] [ 253.933807] nft_do_chain_ipv4+0x197/0x1e0 [nf_tables] [ 253.933807] ? nft_do_chain_arp+0xb0/0xb0 [nf_tables] [ 253.933807] ? __lock_is_held+0x9d/0x130 [ 253.933807] nf_hook_slow+0xc4/0x150 [ 253.933807] ip_local_deliver+0x28b/0x380 [ 253.933807] ? ip_call_ra_chain+0x3e0/0x3e0 [ 253.933807] ? ip_rcv_finish+0x1610/0x1610 [ 253.933807] ip_rcv+0xbcc/0xcc0 [ 253.933807] ? debug_show_all_locks+0x290/0x290 [ 253.933807] ? ip_local_deliver+0x380/0x380 [ 253.933807] ? __lock_is_held+0x9d/0x130 [ 253.933807] ? ip_local_deliver+0x380/0x380 [ 253.933807] __netif_receive_skb_core+0x1c9c/0x2240 Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 08c005ce56e9..4e82a4c49912 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -150,6 +150,7 @@ static inline void nft_data_debug(const struct nft_data *data) * @portid: netlink portID of the original message * @seq: netlink sequence number * @family: protocol family + * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { @@ -160,6 +161,7 @@ struct nft_ctx { u32 portid; u32 seq; u8 family; + u8 level; bool report; }; @@ -865,7 +867,6 @@ enum nft_chain_flags { * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @level: length of longest path to this chain * @flags: bitmask of enum nft_chain_flags * @name: name of the chain */ @@ -878,7 +879,6 @@ struct nft_chain { struct nft_table *table; u64 handle; u32 use; - u16 level; u8 flags:6, genmask:2; char *name; -- cgit From cb2b36f5a97df76f547fcc4ab444a02522fb6c96 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:40 -0700 Subject: netfilter: nf_conncount: Switch to plain list Original patch is from Florian Westphal. This patch switches from hlist to plain list to store the list of connections with the same filtering key in nf_conncount. With the plain list, we can insert new connections at the tail, so over time the beginning of list holds long-running connections and those are expired, while the newly creates ones are at the end. Later on, we could probably move checked ones to the end of the list, so the next run has higher chance to reclaim stale entries in the front. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 3a188a0923a3..e4884e0e4f69 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -1,8 +1,15 @@ #ifndef _NF_CONNTRACK_COUNT_H #define _NF_CONNTRACK_COUNT_H +#include + struct nf_conncount_data; +struct nf_conncount_list { + struct list_head head; /* connections with the same filtering key */ + unsigned int count; /* length of list */ +}; + struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, unsigned int keylen); void nf_conncount_destroy(struct net *net, unsigned int family, @@ -14,15 +21,17 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, +unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, bool *addit); -bool nf_conncount_add(struct hlist_head *head, +void nf_conncount_list_init(struct nf_conncount_list *list); + +bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_cache_free(struct hlist_head *hhead); +void nf_conncount_cache_free(struct nf_conncount_list *list); #endif -- cgit From 976afca1ceba53df6f4a543014e15d1c7a962571 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:41 -0700 Subject: netfilter: nf_conncount: Early exit in nf_conncount_lookup() and cleanup This patch is originally from Florian Westphal. This patch does the following three tasks. It applies the same early exit technique for nf_conncount_lookup(). Since now we keep the number of connections in 'struct nf_conncount_list', we no longer need to return the count in nf_conncount_lookup(). Moreover, we expose the garbage collection function nf_conncount_gc_list() for nft_connlimit. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e4884e0e4f69..dbec17f674b7 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -21,10 +21,10 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit); void nf_conncount_list_init(struct nf_conncount_list *list); @@ -32,6 +32,9 @@ bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); +void nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list); + void nf_conncount_cache_free(struct nf_conncount_list *list); #endif -- cgit From 5c789e131cbb997a528451564ea4613e812fc718 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:44 -0700 Subject: netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search This patch is originally from Florian Westphal. This patch does the following 3 main tasks. 1) Add list lock to 'struct nf_conncount_list' so that we can alter the lists containing the individual connections without holding the main tree lock. It would be useful when we only need to add/remove to/from a list without allocate/remove a node in the tree. With this change, we update nft_connlimit accordingly since we longer need to maintain a list lock in nft_connlimit now. 2) Use RCU for the initial tree search to improve tree look up performance. 3) Add a garbage collection worker. This worker is schedule when there are excessive tree node that needed to be recycled. Moreover,the rbnode reclaim logic is moved from search tree to insert tree to avoid race condition. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index dbec17f674b7..4b2b2baf8ab4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,9 +5,17 @@ struct nf_conncount_data; +enum nf_conncount_list_add { + NF_CONNCOUNT_ADDED, /* list add was ok */ + NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ + NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ +}; + struct nf_conncount_list { + spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ + bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, @@ -28,11 +36,12 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, void nf_conncount_list_init(struct nf_conncount_list *list); -bool nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +enum nf_conncount_list_add +nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); -void nf_conncount_gc_list(struct net *net, +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); void nf_conncount_cache_free(struct nf_conncount_list *list); -- cgit From ec1b28ca9674def4a158808a6493bdb87b993d81 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:52 +0300 Subject: ipvs: provide just conn to ip_vs_state_name In preparation for followup patches, provide just the cp ptr to ip_vs_state_name. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a0bec23c6d5e..4d76abcf1c41 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1221,7 +1221,7 @@ struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); -const char *ip_vs_state_name(__u16 proto, int state); +const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); -- cgit From 275411430f892407b885be1de2548b2e632892c3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:53 +0300 Subject: ipvs: add assured state for conn templates cp->state was not used for templates. Add support for state bits and for the first "assured" bit which indicates that some connection controlled by this template was established or assured by the real server. In a followup patch we will use it to drop templates under SYN attack. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4d76abcf1c41..a0d2e0bb9a94 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -335,6 +335,11 @@ enum ip_vs_sctp_states { IP_VS_SCTP_S_LAST }; +/* Connection templates use bits from state */ +#define IP_VS_CTPL_S_NONE 0x0000 +#define IP_VS_CTPL_S_ASSURED 0x0001 +#define IP_VS_CTPL_S_LAST 0x0002 + /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. @@ -1289,6 +1294,17 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* Mark our template as assured */ +static inline void +ip_vs_control_assure_ct(struct ip_vs_conn *cp) +{ + struct ip_vs_conn *ct = cp->control; + + if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && + (ct->flags & IP_VS_CONN_F_TEMPLATE)) + ct->state |= IP_VS_CTPL_S_ASSURED; +} + /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); -- cgit From 440534d3c56be04abfb26850ee882d19d223557a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 9 Jul 2018 18:06:33 +0800 Subject: netfilter: Remove useless param helper of nf_ct_helper_ext_add The param helper of nf_ct_helper_ext_add is useless now, then remove it now. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 32c2a94a219d..2492120b8097 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -103,9 +103,7 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); -struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, - gfp_t gfp); +struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); -- cgit From f102d66b335a417d4848da9441f585695a838934 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:14 +0200 Subject: netfilter: nf_tables: use dedicated mutex to guard transactions Continue to use nftnl subsys mutex to protect (un)registration of hook types, expressions and so on, but force batch operations to do their own locking. This allows distinct net namespaces to perform transactions in parallel. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 94767ea3a490..286fd960896f 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,6 +7,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; + struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; u8 validate_state; -- cgit From 70b095c84326640eeacfd69a411db8fc36e8ab1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 14 Jul 2018 01:14:01 +0200 Subject: ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module IPV6=m DEFRAG_IPV6=m CONNTRACK=y yields: net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get': net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable' net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6' Setting DEFRAG_IPV6=y causes undefined references to ip6_rhash_params ip6_frag_init and ip6_expire_frag_queue so it would be needed to force IPV6=y too. This patch gets rid of the 'followup linker error' by removing the dependency of ipv6.ko symbols from netfilter ipv6 defrag. Shared code is placed into a header, then used from both. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/ipv6.h | 28 ------------- include/net/ipv6_frag.h | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 28 deletions(-) create mode 100644 include/net/ipv6_frag.h (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index aa6fd11a887c..3720958cd4e1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -581,34 +581,6 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, } #endif -struct inet_frag_queue; - -enum ip6_defrag_users { - IP6_DEFRAG_LOCAL_DELIVER, - IP6_DEFRAG_CONNTRACK_IN, - __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_OUT, - __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_BRIDGE_IN, - __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, -}; - -void ip6_frag_init(struct inet_frag_queue *q, const void *a); -extern const struct rhashtable_params ip6_rhash_params; - -/* - * Equivalent of ipv4 struct ip - */ -struct frag_queue { - struct inet_frag_queue q; - - int iif; - __u16 nhoffset; - u8 ecn; -}; - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); - static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h new file mode 100644 index 000000000000..6ced1e6899b6 --- /dev/null +++ b/include/net/ipv6_frag.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_FRAG_H +#define _IPV6_FRAG_H +#include +#include +#include +#include + +enum ip6_defrag_users { + IP6_DEFRAG_LOCAL_DELIVER, + IP6_DEFRAG_CONNTRACK_IN, + __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_OUT, + __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, +}; + +/* + * Equivalent of ipv4 struct ip + */ +struct frag_queue { + struct inet_frag_queue q; + + int iif; + __u16 nhoffset; + u8 ecn; +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + const struct frag_v6_compare_key *key = a; + + q->key.v6 = *key; + fq->ecn = 0; +} + +static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline int +ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static inline void +ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) +{ + struct net_device *dev = NULL; + struct sk_buff *head; + + rcu_read_lock(); + spin_lock(&fq->q.lock); + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto out; + + inet_frag_kill(&fq->q); + + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out; + + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + + /* Don't send error if the first segment did not arrive. */ + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; + + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + +out: + spin_unlock(&fq->q.lock); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); +} +#endif +#endif -- cgit From 0015b80abccecca82622d9e9d48eb210572a0c3b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 16 Jul 2018 21:10:34 -0700 Subject: net: dsa: Remove VLA usage We avoid 2 VLAs by using a pre-allocated field in dsa_switch. We also try to avoid dynamic allocation whenever possible (when using fewer than bits-per-long ports, which is the common case). Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch Signed-off-by: Salvatore Mesoraca [kees: tweak commit subject and message slightly] Signed-off-by: Kees Cook Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fdbd6082945d..461e8a7661b7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -259,6 +259,9 @@ struct dsa_switch { /* Number of switch port queues */ unsigned int num_tx_queues; + unsigned long *bitmap; + unsigned long _bitmap; + /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; -- cgit From 169dc027fb02492ea37a0575db6a658cf922b854 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 17:12:39 +0100 Subject: ipv6: fix useless rol32 call on hash The rol32 call is currently rotating hash but the rol'd value is being discarded. I believe the current code is incorrect and hash should be assigned the rotated value returned from rol32. Thanks to David Lebrun for spotting this. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7528632bcf2a..8f73be494503 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -823,7 +823,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ - rol32(hash, 16); + hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; -- cgit From 5544adb9707fda5d54494c37940701894c16b9a0 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:17 +0300 Subject: flow_dissector: Dissect tos and ttl from the tunnel info Add dissection of the tos and ttl from the ip tunnel headers fields in case a match is needed on them. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index c64406717eee..2a17f041f7a1 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -207,7 +207,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ - + FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_MAX, }; -- cgit From bc56b33404599edc412b91933d74b36873e8ea25 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Thu, 19 Jul 2018 10:50:44 -0700 Subject: xfrm: Remove xfrmi interface ID from flowi In order to remove performance impact of having the extra u32 in every single flowi, this change removes the flowi_xfrm struct, prefering to take the if_id as a method parameter where needed. In the inbound direction, if_id is only needed during the __xfrm_check_policy() function, and the if_id can be determined at that point based on the skb. As such, xfrmi_decode_session() is only called with the skb in __xfrm_check_policy(). In the outbound direction, the only place where if_id is needed is the xfrm_lookup() call in xfrmi_xmit2(). With this change, the if_id is directly passed into the xfrm_lookup_with_ifid() call. All existing callers can still call xfrm_lookup(), which uses a default if_id of 0. This change does not change any behavior of XFRMIs except for improving overall system performance via flowi size reduction. This change has been tested against the Android Kernel Networking Tests: https://android.googlesource.com/kernel/tests/+/master/net/test Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- include/net/dst.h | 14 ++++++++++++++ include/net/flow.h | 9 --------- include/net/xfrm.h | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index b3219cd8a5a1..7f735e76ca73 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -475,6 +475,14 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, return dst_orig; } +static inline struct dst_entry * +xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags, u32 if_id) +{ + return dst_orig; +} + static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, @@ -494,6 +502,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, int flags, + u32 if_id); + struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); diff --git a/include/net/flow.h b/include/net/flow.h index 187c9bef672f..8ce21793094e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,10 +26,6 @@ struct flowi_tunnel { __be64 tun_id; }; -struct flowi_xfrm { - __u32 if_id; -}; - struct flowi_common { int flowic_oif; int flowic_iif; @@ -43,7 +39,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -83,7 +78,6 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid -#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -115,7 +109,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -145,7 +138,6 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid -#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -193,7 +185,6 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid -#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1350e2cf0749..ca820945f30c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1557,7 +1557,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family); + unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, -- cgit From b8088dda98b9064a2b3007fe54b03ede70a15602 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 17 Jul 2018 07:17:53 +0200 Subject: netfilter: nf_tables: use dev->name directly no need to store the name in separate area. Furthermore, it uses kmalloc but not kfree and most accesses seem to treat it as char[IFNAMSIZ] not char *. Remove this and use dev->name instead. In case event zeroed dev, just omit the name in the dump. Fixes: d92191aa84e5f1 ("netfilter: nf_tables: cache device name in flowtable object") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4e82a4c49912..dc417ef0a0c5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1124,7 +1124,6 @@ struct nft_flowtable { u32 genmask:2, use:30; u64 handle; - char *dev_name[NFT_FLOWTABLE_DEVICE_MAX]; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; -- cgit From 27cde44a259c380a3c09066fc4b42de7dde9b1ad Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Jul 2018 13:56:35 -0700 Subject: tcp: do not cancel delay-AcK on DCTCP special ACK Currently when a DCTCP receiver delays an ACK and receive a data packet with a different CE mark from the previous one's, it sends two immediate ACKs acking previous and latest sequences respectly (for ECN accounting). Previously sending the first ACK may mark off the delayed ACK timer (tcp_event_ack_sent). This may subsequently prevent sending the second ACK to acknowledge the latest sequence (tcp_ack_snd_check). The culprit is that tcp_send_ack() assumes it always acknowleges the latest sequence, which is not true for the first special ACK. The fix is to not make the assumption in tcp_send_ack and check the actual ack sequence before cancelling the delayed ACK. Further it's safer to pass the ack sequence number as a local variable into tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid future bugs like this. Reported-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3482d13d655b..a08de496d1b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -539,6 +539,7 @@ void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); -- cgit From a0496ef2c23b3b180902dd185d0d63ccbc624cf8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Jul 2018 13:56:36 -0700 Subject: tcp: do not delay ACK in DCTCP upon CE status change Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change has to be sent immediately so the sender can respond quickly: """ When receiving packets, the CE codepoint MUST be processed as follows: 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to true and send an immediate ACK. 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE to false and send an immediate ACK. """ Previously DCTCP implementation may continue to delay the ACK. This patch fixes that to implement the RFC by forcing an immediate ACK. Tested with this packetdrill script provided by Larry Brakmo 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < [ect0] SEW 0:0(0) win 32792 0.100 > SE. 0:0(0) ack 1 0.110 < [ect0] . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0 0.200 < [ect0] . 1:1001(1000) ack 1 win 257 0.200 > [ect01] . 1:1(0) ack 1001 0.200 write(4, ..., 1) = 1 0.200 > [ect01] P. 1:2(1) ack 1001 0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.005 < [ce] . 2001:3001(1000) ack 2 win 257 +0.000 > [ect01] . 2:2(0) ack 2001 // Previously the ACK below would be delayed by 40ms +0.000 > [ect01] E. 2:2(0) ack 3001 +0.500 < F. 9501:9501(0) ack 4 win 257 Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a08de496d1b2..25116ec02087 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { -- cgit From fbdeaed408cf2728c62640c10848ddb1b67e63d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:53 +0000 Subject: net: create reusable function for getting ownership info of sysfs inodes Make net_ns_get_ownership() reusable by networking code outside of core. This is useful, for example, to allow bridge related sysfs files to be owned by container root. Add a function comment since this is a potentially dangerous function to use given the way that kobject_get_ownership() works by initializing uid and gid before calling .get_ownership(). Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a71264d75d7f..9b5fdc50519a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ extern struct net init_net; struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); + void net_ns_barrier(void); #else /* CONFIG_NET_NS */ #include @@ -182,6 +185,13 @@ static inline struct net *copy_net_ns(unsigned long flags, return old_net; } +static inline void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; +} + static inline void net_ns_barrier(void) {} #endif /* CONFIG_NET_NS */ -- cgit From 24b711edfc34bc45777a3f068812b7d1ed004a5d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Jul 2018 12:41:18 -0700 Subject: net/ipv6: Fix linklocal to global address with VRF Example setup: host: ip -6 addr add dev eth1 2001:db8:104::4 where eth1 is enslaved to a VRF switch: ip -6 ro add 2001:db8:104::4/128 dev br1 where br1 only has an LLA ping6 2001:db8:104::4 ssh 2001:db8:104::4 (NOTE: UDP works fine if the PKTINFO has the address set to the global address and ifindex is set to the index of eth1 with a destination an LLA). For ICMP, icmp6_iif needs to be updated to check if skb->dev is an L3 master. If it is then return the ifindex from rt6i_idev similar to what is done for loopback. For TCP, restore the original tcp_v6_iif definition which is needed in most places and add a new tcp_v6_iif_l3_slave that considers the l3_slave variability. This latter check is only needed for socket lookups. Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 25116ec02087..cd3ecda9386a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -840,6 +840,11 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->header.h6.iif; +} + +static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); -- cgit From 042f8825569d628517784d558aefe23c212f0fb2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 20 Jul 2018 21:14:38 -0700 Subject: nfp: bring back support for offloading shared blocks Now that we have offload replay infrastructure added by commit 326367427cc0 ("net: sched: call reoffload op on block callback reg") and flows are guaranteed to be removed correctly, we can revert commit 951a8ee6def3 ("nfp: reject binding to shared blocks"). Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e4252a176eec..4f405ca8346f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -114,11 +114,6 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { } -static inline bool tcf_block_shared(struct tcf_block *block) -{ - return false; -} - static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; -- cgit From e873e4b9cc7e8ce79e5c5627b32b107035bb3f5d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 21 Jul 2018 20:56:32 -0700 Subject: ipv6: use fib6_info_hold_safe() when necessary In the code path where only rcu read lock is held, e.g. in the route lookup code path, it is not safe to directly call fib6_info_hold() because the fib6_info may already have been deleted but still exists in the rcu grace period. Holding reference to it could cause double free and crash the kernel. This patch adds a new function fib6_info_hold_safe() and replace fib6_info_hold() in all necessary places. Syzbot reported 3 crash traces because of this. One of them is: 8021q: adding VLAN 0 to HW filter on device team0 IPv6: ADDRCONF(NETDEV_CHANGE): team0: link becomes ready dst_release: dst:(____ptrval____) refcnt:-1 dst_release: dst:(____ptrval____) refcnt:-2 WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 dst_hold include/net/dst.h:239 [inline] WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204 dst_release: dst:(____ptrval____) refcnt:-1 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4845 Comm: syz-executor493 Not tainted 4.18.0-rc3+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 panic+0x238/0x4e7 kernel/panic.c:184 dst_release: dst:(____ptrval____) refcnt:-2 dst_release: dst:(____ptrval____) refcnt:-3 __warn.cold.8+0x163/0x1ba kernel/panic.c:536 dst_release: dst:(____ptrval____) refcnt:-4 report_bug+0x252/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x1fc/0x4d0 arch/x86/kernel/traps.c:296 dst_release: dst:(____ptrval____) refcnt:-5 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:316 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 RIP: 0010:dst_hold include/net/dst.h:239 [inline] RIP: 0010:ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204 Code: c1 ed 03 89 9d 18 ff ff ff 48 b8 00 00 00 00 00 fc ff df 41 c6 44 05 00 f8 e9 2d 01 00 00 4c 8b a5 c8 fe ff ff e8 1a f6 e6 fa <0f> 0b e9 6a fc ff ff e8 0e f6 e6 fa 48 8b 85 d0 fe ff ff 48 8d 78 RSP: 0018:ffff8801a8fcf178 EFLAGS: 00010293 RAX: ffff8801a8eba5c0 RBX: 0000000000000000 RCX: ffffffff869511e6 RDX: 0000000000000000 RSI: ffffffff869515b6 RDI: 0000000000000005 RBP: ffff8801a8fcf2c8 R08: ffff8801a8eba5c0 R09: ffffed0035ac8338 R10: ffffed0035ac8338 R11: ffff8801ad6419c3 R12: ffff8801a8fcf720 R13: ffff8801a8fcf6a0 R14: ffff8801ad6419c0 R15: ffff8801ad641980 ip6_make_skb+0x2c8/0x600 net/ipv6/ip6_output.c:1768 udpv6_sendmsg+0x2c90/0x35f0 net/ipv6/udp.c:1376 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:641 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:651 ___sys_sendmsg+0x51d/0x930 net/socket.c:2125 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2220 __do_sys_sendmmsg net/socket.c:2249 [inline] __se_sys_sendmmsg net/socket.c:2246 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2246 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446ba9 Code: e8 cc bb 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fb39a469da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00000000006dcc54 RCX: 0000000000446ba9 RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003 RBP: 00000000006dcc50 R08: 00007fb39a46a700 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 45c828efc7a64843 R13: e6eeb815b9d8a477 R14: 5068caf6f713c6fc R15: 0000000000000001 Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled Rebooting in 86400 seconds.. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: syzbot+902e2a1bcd4f7808cef5@syzkaller.appspotmail.com Reported-by: syzbot+8ae62d67f647abeeceb9@syzkaller.appspotmail.com Reported-by: syzbot+3f08feb14086930677d0@syzkaller.appspotmail.com Signed-off-by: Wei Wang Acked-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 71b9043aa0e7..3d4930528db0 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -281,6 +281,11 @@ static inline void fib6_info_hold(struct fib6_info *f6i) atomic_inc(&f6i->fib6_ref); } +static inline bool fib6_info_hold_safe(struct fib6_info *f6i) +{ + return atomic_inc_not_zero(&f6i->fib6_ref); +} + static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) -- cgit From f71e0ca4db187af7c44987e9d21e9042c3046070 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:05 +0200 Subject: net: sched: Avoid implicit chain 0 creation Currently, chain 0 is implicitly created during block creation. However that does not align with chain object exposure, creation and destruction api introduced later on. So make the chain 0 behave the same way as any other chain and only create it when it is needed. Since chain 0 is somehow special as the qdiscs need to hold pointer to the first chain tp, this requires to move the chain head change callback infra to the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7432100027b7..86f4651784e8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -300,7 +300,6 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct list_head filter_chain_list; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ @@ -318,6 +317,10 @@ struct tcf_block { bool keep_dst; unsigned int offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ + struct { + struct tcf_chain *chain; + struct list_head filter_chain_list; + } chain0; }; static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) -- cgit From 32a4f5ecd7381f30ae3bb36dea77a150ba68af2e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:06 +0200 Subject: net: sched: introduce chain object to uapi Allow user to create, destroy, get and dump chain objects. Do that by extending rtnl commands by the chain-specific ones. User will now be able to explicitly create or destroy chains (so far this was done only automatically according the filter/act needs and refcounting). Also, the user will receive notification about any chain creation or destuction. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 86f4651784e8..81ec8276db9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -304,6 +304,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + bool explicitly_created; }; struct tcf_block { -- cgit From 9f407f1768d3e1a5ddd7bd49fa4d1f5a26e10ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:07 +0200 Subject: net: sched: introduce chain templates Allow user to set a template for newly created chains. Template lock down the chain for particular classifier type/options combinations. The classifier needs to support templates, otherwise kernel would reply with error. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 81ec8276db9c..085c509c8674 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -238,6 +238,8 @@ struct tcf_result { }; }; +struct tcf_chain; + struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -263,10 +265,18 @@ struct tcf_proto_ops { tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); + void * (*tmplt_create)(struct net *net, + struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*); + int (*tmplt_dump)(struct sk_buff *skb, + struct net *net, + void *tmplt_priv); struct module *owner; }; @@ -305,6 +315,8 @@ struct tcf_chain { u32 index; /* chain index */ unsigned int refcnt; bool explicitly_created; + const struct tcf_proto_ops *tmplt_ops; + void *tmplt_priv; }; struct tcf_block { -- cgit From 34738452739069947e528123810533f28dd8332b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:11 +0200 Subject: net: sched: cls_flower: propagate chain teplate creation and destruction to drivers Introduce a couple of flower offload commands in order to propagate template creation/destruction events down to device drivers. Drivers may use this information to prepare HW in an optimal way for future filter insertions. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4f405ca8346f..a3101582f642 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -721,6 +721,8 @@ enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, TC_CLSFLOWER_STATS, + TC_CLSFLOWER_TMPLT_CREATE, + TC_CLSFLOWER_TMPLT_DESTROY, }; struct tc_cls_flower_offload { -- cgit From 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 27 Jul 2018 09:45:05 +0200 Subject: net: sched: don't dump chains only held by actions In case a chain is empty and not explicitly created by a user, such chain should not exist. The only exception is if there is an action "goto chain" pointing to it. In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in chain dump. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; -- cgit From b67c540b8a987e365dc548e5b2ddf023946e3d63 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 27 Jul 2018 15:26:56 +0300 Subject: net: dcb: Add priority-to-DSCP map getters On ingress, a network device such as a switch assigns to packets priority based on various criteria. Common options include interpreting PCP and DSCP fields according to user configuration. When a packet egresses the switch, a reverse process may rewrite PCP and/or DSCP values according to packet priority. The following three functions support a) obtaining a DSCP-to-priority map or vice versa, and b) finding default-priority entries in APP database. The DCB subsystem supports for APP entries a very generous M:N mapping between priorities and protocol identifiers. Understandably, several (say) DSCP values can map to the same priority. But this asymmetry holds the other way around as well--one priority can map to several DSCP values. For this reason, the following functions operate in terms of bitmaps, with ones in positions that match some APP entry. - dcb_ieee_getapp_dscp_prio_mask_map() to compute for a given netdevice a map of DSCP-to-priority-mask, which gives for each DSCP value a bitmap of priorities related to that DSCP value by APP, along the lines of dcb_ieee_getapp_mask(). - dcb_ieee_getapp_prio_dscp_mask_map() similarly to compute for a given netdevice a map from priorities to a bitmap of DSCPs. - dcb_ieee_getapp_default_prio_mask() which finds all default-priority rules for a given port in APP database, and returns a mask of priorities allowed by these default-priority rules. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/dcbnl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 0e5e91be2d30..e22a8a3c089b 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -34,6 +34,19 @@ int dcb_ieee_setapp(struct net_device *, struct dcb_app *); int dcb_ieee_delapp(struct net_device *, struct dcb_app *); u8 dcb_ieee_getapp_mask(struct net_device *, struct dcb_app *); +struct dcb_ieee_app_prio_map { + u64 map[IEEE_8021QAZ_MAX_TCS]; +}; +void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map); + +struct dcb_ieee_app_dscp_map { + u8 map[64]; +}; +void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, + struct dcb_ieee_app_dscp_map *p_map); +u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev); + int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 pid); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, -- cgit From 222440b4e832059c0ddf18d1e409f0552ab53a7d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Jul 2018 12:48:04 +0200 Subject: netfilter: nf_tables: handle meta/lookup with direct call Currently nft uses inlined variants for common operations such as 'ip saddr 1.2.3.4' instead of an indirect call. Also handle meta get operations and lookups without indirect call, both are builtin. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a05134507e7b..8da837d2aaf9 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -71,4 +71,11 @@ extern struct nft_set_type nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +struct nft_expr; +struct nft_regs; +struct nft_pktinfo; +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_lookup_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); #endif /* _NET_NF_TABLES_CORE_H */ -- cgit From 6decb5b45e70d6ffff6488cc8e8bad6b9ac7f99b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:32 +0530 Subject: Bluetooth: Define PHY flags in hdev and set 1M as default 1M is mandatory to be supported by LE controllers and the same would be set in power on. This patch defines hdev flags for LE PHYs and set 1M to default. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/hci_core.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 73e48be5bbb3..664fe1ebf2c7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,10 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_LE_SET_PHY_1M 0x01 +#define HCI_LE_SET_PHY_2M 0x02 +#define HCI_LE_SET_PHY_CODED 0x04 + #define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 struct hci_cp_le_set_ext_scan_params { __u8 own_addr_type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a74453571264..71f79df9ee05 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -315,6 +315,9 @@ struct hci_dev { unsigned long sco_last_tx; unsigned long le_last_tx; + __u8 le_tx_def_phys; + __u8 le_rx_def_phys; + struct workqueue_struct *workqueue; struct workqueue_struct *req_workqueue; -- cgit From 5075b972f20ddad5bb19542ea4f5794d06673375 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:33 +0530 Subject: Bluetooth: Add defines for BREDR pkt_type and LE PHYs This also add macros for checking LMP support for different pkt_types Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 14 ++++++++++++++ include/net/bluetooth/hci_core.h | 4 ++++ 2 files changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 664fe1ebf2c7..89bf800f6eb1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -291,6 +291,14 @@ enum { #define HCI_DH3 0x0800 #define HCI_DH5 0x8000 +/* HCI packet types inverted masks */ +#define HCI_2DH1 0x0002 +#define HCI_3DH1 0x0004 +#define HCI_2DH3 0x0100 +#define HCI_3DH3 0x0200 +#define HCI_2DH5 0x1000 +#define HCI_3DH5 0x2000 + #define HCI_HV1 0x0020 #define HCI_HV2 0x0040 #define HCI_HV3 0x0080 @@ -354,6 +362,8 @@ enum { #define LMP_PCONTROL 0x04 #define LMP_TRANSPARENT 0x08 +#define LMP_EDR_2M 0x02 +#define LMP_EDR_3M 0x04 #define LMP_RSSI_INQ 0x40 #define LMP_ESCO 0x80 @@ -361,7 +371,9 @@ enum { #define LMP_EV5 0x02 #define LMP_NO_BREDR 0x20 #define LMP_LE 0x40 +#define LMP_EDR_3SLOT 0x80 +#define LMP_EDR_5SLOT 0x01 #define LMP_SNIFF_SUBR 0x02 #define LMP_PAUSE_ENC 0x04 #define LMP_EDR_ESCO_2M 0x20 @@ -399,6 +411,8 @@ enum { #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_EXT_SCAN_POLICY 0x80 +#define HCI_LE_PHY_2M 0x01 +#define HCI_LE_PHY_CODED 0x08 #define HCI_LE_CHAN_SEL_ALG2 0x40 /* Connection modes */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 71f79df9ee05..a64d13f91d09 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1141,6 +1141,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_inq_tx_pwr_capable(dev) ((dev)->features[0][7] & LMP_INQ_TX_PWR) #define lmp_ext_feat_capable(dev) ((dev)->features[0][7] & LMP_EXTFEATURES) #define lmp_transp_capable(dev) ((dev)->features[0][2] & LMP_TRANSPARENT) +#define lmp_edr_2m_capable(dev) ((dev)->features[0][3] & LMP_EDR_2M) +#define lmp_edr_3m_capable(dev) ((dev)->features[0][3] & LMP_EDR_3M) +#define lmp_edr_3slot_capable(dev) ((dev)->features[0][4] & LMP_EDR_3SLOT) +#define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT) /* ----- Extended LMP capabilities ----- */ #define lmp_csb_master_capable(dev) ((dev)->features[2][0] & LMP_CSB_MASTER) -- cgit From 6244691fec4dd0adebca255e60e0ed7ac8155b2e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:34 +0530 Subject: Bluetooth: Implement Get PHY Configuration mgmt command This commands basically retrieve the supported packet types of BREDR and supported PHYs of the controller. BR_1M_1SLOT, LE_1M_TX and LE_1M_RX would be supported by default. Other PHYs are supported based on the local features. Also this sets PHY_CONFIGURATION bit in supported settings. @ MGMT Command: Get PHY Configuration (0x0044) plen 0 @ MGMT Event: Command Complete (0x0001) plen 15 Get PHY Configuration (0x0044) plen 12 Status: Success (0x00) Supported PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Configurable PHYs: 0x79fe BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 2M TX LE 2M RX LE CODED TX LE CODED RX Selected PHYs: 0x07ff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index e7303eee65cd..1c93d6e83a6c 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -101,6 +101,7 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_PRIVACY 0x00002000 #define MGMT_SETTING_CONFIGURATION 0x00004000 #define MGMT_SETTING_STATIC_ADDRESS 0x00008000 +#define MGMT_SETTING_PHY_CONFIGURATION 0x00010000 #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 @@ -604,6 +605,30 @@ struct mgmt_cp_set_appearance { } __packed; #define MGMT_SET_APPEARANCE_SIZE 2 +#define MGMT_OP_GET_PHY_CONFIGURATION 0x0044 +struct mgmt_rp_get_phy_confguration { + __le32 supported_phys; + __le32 configurable_phys; + __le32 selected_phys; +} __packed; +#define MGMT_GET_PHY_CONFIGURATION_SIZE 0 + +#define MGMT_PHY_BR_1M_1SLOT 0x00000001 +#define MGMT_PHY_BR_1M_3SLOT 0x00000002 +#define MGMT_PHY_BR_1M_5SLOT 0x00000004 +#define MGMT_PHY_EDR_2M_1SLOT 0x00000008 +#define MGMT_PHY_EDR_2M_3SLOT 0x00000010 +#define MGMT_PHY_EDR_2M_5SLOT 0x00000020 +#define MGMT_PHY_EDR_3M_1SLOT 0x00000040 +#define MGMT_PHY_EDR_3M_3SLOT 0x00000080 +#define MGMT_PHY_EDR_3M_5SLOT 0x00000100 +#define MGMT_PHY_LE_1M_TX 0x00000200 +#define MGMT_PHY_LE_1M_RX 0x00000400 +#define MGMT_PHY_LE_2M_TX 0x00000800 +#define MGMT_PHY_LE_2M_RX 0x00001000 +#define MGMT_PHY_LE_CODED_TX 0x00002000 +#define MGMT_PHY_LE_CODED_RX 0x00004000 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit From 0314f2867fa0c46d0fc1c23c80e7fab9435079df Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:35 +0530 Subject: Bluetooth: Implement Set PHY Confguration command This enables user to set phys which will be used in all subsequent connections. Also host will use the same in LE scanning as well. @ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 1c93d6e83a6c..0916e203e5d9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -629,6 +629,25 @@ struct mgmt_rp_get_phy_confguration { #define MGMT_PHY_LE_CODED_TX 0x00002000 #define MGMT_PHY_LE_CODED_RX 0x00004000 +#define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \ + MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \ + MGMT_PHY_EDR_2M_3SLOT | MGMT_PHY_EDR_2M_5SLOT | \ + MGMT_PHY_EDR_3M_1SLOT | MGMT_PHY_EDR_3M_3SLOT | \ + MGMT_PHY_EDR_3M_5SLOT) +#define MGMT_PHY_LE_MASK (MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_1M_RX | \ + MGMT_PHY_LE_2M_TX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_TX | MGMT_PHY_LE_CODED_RX) +#define MGMT_PHY_LE_TX_MASK (MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_2M_TX | \ + MGMT_PHY_LE_CODED_TX) +#define MGMT_PHY_LE_RX_MASK (MGMT_PHY_LE_1M_RX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_RX) + +#define MGMT_OP_SET_PHY_CONFIGURATION 0x0045 +struct mgmt_cp_set_phy_confguration { + __le32 selected_phys; +} __packed; +#define MGMT_SET_PHY_CONFIGURATION_SIZE 4 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit From b7c23df85b6a1c3bcfb591cfa938d341fc3a556e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:36 +0530 Subject: Bluetooth: Implement PHY changed event This defines and implement phy changed event and send it to user whenever selected PHYs changes using SET_PHY_CONFIGURATION. This will be also trigerred when BREDR pkt_type is changed using the legacy ioctl HCISETPTYPE. @ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a64d13f91d09..ab5d494a545a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1544,6 +1544,7 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); +int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 0916e203e5d9..7f372e9067c9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -868,3 +868,8 @@ struct mgmt_ev_ext_info_changed { __le16 eir_len; __u8 eir[0]; } __packed; + +#define MGMT_EV_PHY_CONFIGURATION_CHANGED 0x0026 +struct mgmt_ev_phy_configuration_changed { + __le32 selected_phys; +} __packed; -- cgit From 45bdd86eafc7d29e0b4b6681bec9c6ab8eddc6bf Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:37 +0530 Subject: Bluetooth: Set Scan PHYs based on selected PHYs by user Use the PHYs selected in Set Phy Configuration management command while scanning. < HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 13 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x05 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) Entry 1: LE Coded Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 +++- include/net/bluetooth/hci_core.h | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 89bf800f6eb1..04211457367a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1540,7 +1540,9 @@ struct hci_cp_le_set_ext_scan_params { __u8 data[0]; } __packed; -#define LE_SCAN_PHY_1M 0x01 +#define LE_SCAN_PHY_1M 0x01 +#define LE_SCAN_PHY_2M 0x02 +#define LE_SCAN_PHY_CODED 0x04 struct hci_cp_le_scan_phy_params { __u8 type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ab5d494a545a..113c9bb609c7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1165,6 +1165,15 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) + +#define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) + +#define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) + /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) -- cgit From b2cc9761f144e8ef714be8c590603073b80ddc13 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:38 +0530 Subject: Bluetooth: Handle extended ADV PDU types This patch defines the extended ADV types and handle it in ADV report. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 04211457367a..83a1593a128e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1976,6 +1976,14 @@ struct hci_ev_le_conn_complete { #define LE_LEGACY_SCAN_RSP_ADV 0x001b #define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a +/* Extended Advertising event types */ +#define LE_EXT_ADV_NON_CONN_IND 0x0000 +#define LE_EXT_ADV_CONN_IND 0x0001 +#define LE_EXT_ADV_SCAN_IND 0x0002 +#define LE_EXT_ADV_DIRECT_IND 0x0004 +#define LE_EXT_ADV_SCAN_RSP 0x0008 +#define LE_EXT_ADV_LEGACY_PDU 0x0010 + #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 -- cgit From 6b49bcb4bce2ed0f0aefe8e304a8b9cbaeeaa3f0 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:40 +0530 Subject: Bluetooth: Read no of adv sets during init This patch reads the number of advertising sets in the controller during init and save it in hdev. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 83a1593a128e..3f93ae9765a4 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 @@ -1579,6 +1580,12 @@ struct hci_cp_le_ext_conn_param { __le16 max_ce_len; } __packed; +#define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS 0x203b +struct hci_rp_le_read_num_supported_adv_sets { + __u8 status; + __u8 num_of_sets; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 113c9bb609c7..2aad4a863176 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -222,6 +222,7 @@ struct hci_dev { __u8 le_features[8]; __u8 le_white_list_size; __u8 le_resolv_list_size; + __u8 le_num_of_adv_sets; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -1180,6 +1181,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext create connection if command is supported */ #define use_ext_conn(dev) ((dev)->commands[37] & 0x80) +/* Extended advertising support */ +#define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit From de181e887ac27dadda127c7d4c3e89c6da8fb6d2 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:41 +0530 Subject: Bluetooth: Impmlement extended adv enable This patch basically replaces legacy adv with extended adv based on the controller support. Currently there is no design change. ie only one adv set will be enabled at a time. This also adds tx_power in instance and store whatever returns from Set_ext_parameter, use the same in adv data as well. For instance 0 tx_power is stored in hdev only. < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0010 Use legacy advertising PDUs: ADV_NONCONN_IND Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE 1M (0x01) Secondary max skip: 0x00 Secondary PHY: LE 1M (0x01) SID: 0x00 Scan request notifications: Disabled (0x00) > HCI Event: Command Complete (0x0e) plen 5 LE Set Extended Advertising Parameters (0x08|0x0036) ncmd 1 Status: Success (0x00) TX power (selected): 7 dbm (0x07) < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 6 Extended advertising: Enabled (0x01) Number of sets: 1 (0x01) Entry 0 Handle: 0x00 Duration: 0 ms (0x00) Max ext adv events: 0 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 39 +++++++++++++++++++++++++++++++++++++++ include/net/bluetooth/hci_core.h | 1 + 2 files changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3f93ae9765a4..b447b127879e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1586,6 +1586,45 @@ struct hci_rp_le_read_num_supported_adv_sets { __u8 num_of_sets; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_PARAMS 0x2036 +struct hci_cp_le_set_ext_adv_params { + __u8 handle; + __le16 evt_properties; + __u8 min_interval[3]; + __u8 max_interval[3]; + __u8 channel_map; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 filter_policy; + __u8 tx_power; + __u8 primary_phy; + __u8 secondary_max_skip; + __u8 secondary_phy; + __u8 sid; + __u8 notif_enable; +} __packed; + +#define HCI_ADV_PHY_1M 0X01 + +struct hci_rp_le_set_ext_adv_params { + __u8 status; + __u8 tx_power; +} __packed; + +#define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 +struct hci_cp_le_set_ext_adv_enable { + __u8 enable; + __u8 num_of_sets; + __u8 data[0]; +} __packed; + +struct hci_cp_ext_adv_set { + __u8 handle; + __le16 duration; + __u8 max_events; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2aad4a863176..ad3518303a0c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -171,6 +171,7 @@ struct adv_info { __u8 adv_data[HCI_MAX_AD_LENGTH]; __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __s8 tx_power; }; #define HCI_MAX_ADV_INSTANCES 5 -- cgit From a0fb3726ba55138ef6fdd5dc67da6d9a70360696 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:42 +0530 Subject: Bluetooth: Use Set ext adv/scan rsp data if controller supports This patch implements Set Ext Adv data and Set Ext Scan rsp data if controller support extended advertising. Currently the operation is set as Complete data and fragment preference is set as no fragment < HCI Command: LE Set Extended Advertising Data (0x08|0x0037) plen 35 Handle: 0x00 Operation: Complete extended advertising data (0x03) Fragment preference: Minimize fragmentation (0x01) Data length: 0x15 16-bit Service UUIDs (complete): 2 entries Heart Rate (0x180d) Battery Service (0x180f) Name (complete): Test LE Company: Google (224) Data: 0102 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Data (0x08|0x0037) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index b447b127879e..aace97099ead 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1625,6 +1625,28 @@ struct hci_cp_ext_adv_set { __u8 max_events; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 +struct hci_cp_le_set_ext_adv_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 +struct hci_cp_le_set_ext_scan_rsp_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define LE_SET_ADV_DATA_OP_COMPLETE 0x03 + +#define LE_SET_ADV_DATA_NO_FRAG 0x01 + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 -- cgit From 45b7749f16aacd9ffab8e958caa77e2aa2358c0b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:43 +0530 Subject: Bluetooth: Implement disable and removal of adv instance If ext adv is enabled then use ext adv to disable as well. Also remove the adv set during LE disable. < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aace97099ead..faa2922a69fd 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1647,6 +1647,8 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define LE_SET_ADV_DATA_NO_FRAG 0x01 +#define HCI_OP_LE_CLEAR_ADV_SETS 0x203d + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 -- cgit From a73c046a2869048430c332a871a5b169f192c6c3 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:45 +0530 Subject: Bluetooth: Implement Set ADV set random address This basically sets the random address for the adv instance Random address can be set only if the instance is created which is done in Set ext adv param. Random address and rpa expire timer and flags have been added to adv instance which will be used when the respective instance is scheduled. This introduces a hci_get_random_address() which returns the own address type and random address (rpa or nrpa) based on the instance flags and hdev flags. New function is required since own address type should be known before setting adv params but address can be set only after setting params. < HCI Command: LE Set Advertising Set Random Address (0x08|0x0035) plen 7 Advertising handle: 0x00 Advertising random address: 3C:8E:56:9B:77:84 (OUI 3C-8E-56) > HCI Event: Command Complete (0x0e) plen 4 LE Set Advertising Set Random Address (0x08|0x0035) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++++++ include/net/bluetooth/hci_core.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index faa2922a69fd..8d348d0d3eea 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1649,6 +1649,12 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define HCI_OP_LE_CLEAR_ADV_SETS 0x203d +#define HCI_OP_LE_SET_ADV_SET_RAND_ADDR 0x2035 +struct hci_cp_le_set_adv_set_rand_addr { + __u8 handle; + bdaddr_t bdaddr; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ad3518303a0c..0db1b9b428b7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -172,6 +172,9 @@ struct adv_info { __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; __s8 tx_power; + bdaddr_t random_addr; + bool rpa_expired; + struct delayed_work rpa_expired_cb; }; #define HCI_MAX_ADV_INSTANCES 5 @@ -1113,6 +1116,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration); int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); +void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); -- cgit From acf0aeae431a0f1723385cd1cb50177e4cc10edd Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:46 +0530 Subject: Bluetooth: Handle ADv set terminated event This event comes after connection complete event for incoming connections. Since we now have different random address for each instance, conn resp address is assigned from this event. As of now only connection part is handled as we are not enabling duration or max num of events while starting ext adv. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d348d0d3eea..57e3e3675d66 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2155,6 +2155,14 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 +struct hci_evt_le_ext_adv_set_term { + __u8 status; + __u8 handle; + __le16 conn_handle; + __u8 num_evts; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { -- cgit From 85a721a8b0b6880d8cf6b9def70404ade8563225 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:47 +0530 Subject: Bluetooth: Implement secondary advertising on different PHYs This patch adds support for advertising in primary and secondary channel on different PHYs. User can add the phy preference in the flag based on which phy type will be added in extended advertising parameter would be set. @ MGMT Command: Add Advertising (0x003e) plen 11 Instance: 1 Flags: 0x00000200 Advertise in CODED on Secondary channel Duration: 0 Timeout: 0 Advertising data length: 0 Scan response length: 0 < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0000 Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE Coded (0x03) Secondary max skip: 0x00 Secondary PHY: LE Coded (0x03) SID: 0x00 Scan request notifications: Disabled (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/mgmt.h | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 57e3e3675d66..8ff36463719f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,8 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_PHY_2M 0x01 +#define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 @@ -1606,6 +1608,8 @@ struct hci_cp_le_set_ext_adv_params { } __packed; #define HCI_ADV_PHY_1M 0X01 +#define HCI_ADV_PHY_2M 0x02 +#define HCI_ADV_PHY_CODED 0x03 struct hci_rp_le_set_ext_adv_params { __u8 status; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7f372e9067c9..9cee7ddc6741 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -562,6 +562,12 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_FLAG_TX_POWER BIT(4) #define MGMT_ADV_FLAG_APPEARANCE BIT(5) #define MGMT_ADV_FLAG_LOCAL_NAME BIT(6) +#define MGMT_ADV_FLAG_SEC_1M BIT(7) +#define MGMT_ADV_FLAG_SEC_2M BIT(8) +#define MGMT_ADV_FLAG_SEC_CODED BIT(9) + +#define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ + MGMT_ADV_FLAG_SEC_CODED) #define MGMT_OP_REMOVE_ADVERTISING 0x003F struct mgmt_cp_remove_advertising { -- cgit From 740011cfe94859df8d05f5400d589a8693b095e7 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 20 Jul 2018 13:12:28 +0800 Subject: Bluetooth: Add new quirk for non-persistent setup settings Add a new quirk HCI_QUIRK_NON_PERSISTENT_SETUP allowing that a quirk that runs setup() after every open() and not just after the first open(). Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8ff36463719f..7f008097552e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -183,6 +183,15 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_NON_PERSISTENT_DIAG, + + /* When this quirk is set, setup() would be run after every + * open() and not just after the first open(). + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + * + */ + HCI_QUIRK_NON_PERSISTENT_SETUP, }; /* HCI device flags */ -- cgit From dd979b4df817e9976f18fb6f9d134d6bc4a3c317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:10 +0200 Subject: net: simplify sock_poll_wait The wait_address argument is always directly derived from the filp argument, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 83b747538bd0..0518f61926ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2057,16 +2057,17 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file - * @wait_address: socket wait queue * @p: poll_table * * See the comments in the wq_has_sleeper function. */ -static inline void sock_poll_wait(struct file *filp, - wait_queue_head_t *wait_address, poll_table *p) +static inline void sock_poll_wait(struct file *filp, poll_table *p) { - if (!poll_does_not_wait(p) && wait_address) { - poll_wait(filp, wait_address, p); + struct socket *sock = filp->private_data; + wait_queue_head_t *wq = sk_sleep(sock->sk); + + if (!poll_does_not_wait(p) && wq) { + poll_wait(filp, wq, p); /* We need to be sure we are in sync with the * socket flags modification. * -- cgit From d8bbd13beeaacd6494954bf5b945b54ccb2af309 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:11 +0200 Subject: net: don not detour through struct sock to find the poll waitqueue For any open socket file descriptor sock->sk->sk_wq->wait will always point to sock->wq->wait. That means we can do the shorter dereference and removal a NULL check and don't have to not worry about any RCU protection. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 0518f61926ec..2afea5d1bdfe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2064,10 +2064,9 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, poll_table *p) { struct socket *sock = filp->private_data; - wait_queue_head_t *wq = sk_sleep(sock->sk); - if (!poll_does_not_wait(p) && wq) { - poll_wait(filp, wq, p); + if (!poll_does_not_wait(p)) { + poll_wait(filp, &sock->wq->wait, p); /* We need to be sure we are in sync with the * socket flags modification. * -- cgit From f641f13b992979b97e595b761a9ba1a64fed7c4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:12 +0200 Subject: net: remove sock_poll_busy_loop There is no point in hiding this logic in a helper. Also remove the useless events != 0 check and only busy loop once we know we actually have a poll method. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/busy_poll.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 9e36fda652b7..85777e68f738 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,15 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events) -{ - if (sk_can_busy_loop(sock->sk) && - events && (events & POLL_BUSY_LOOP)) { - /* once, only if requested by syscall */ - sk_busy_loop(sock->sk, 1); - } -} - /* if this socket can poll_ll, tell the system call */ static inline __poll_t sock_poll_busy_flag(struct socket *sock) { -- cgit From a331de3bf0e66ab2437fc8c5b99bd3c0d9da3088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:13 +0200 Subject: net: remove sock_poll_busy_flag Fold it into the only caller to make the code simpler and easier to read. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/busy_poll.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 85777e68f738..ba61cdd09eaa 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,12 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -/* if this socket can poll_ll, tell the system call */ -static inline __poll_t sock_poll_busy_flag(struct socket *sock) -{ - return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0; -} - /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) -- cgit From 7fd4b288ea6a3e45ad8afbcd5ec39554d57f1ae0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:43 +0200 Subject: tc/act: remove unneeded RCU lock in action callback Each lockless action currently does its own RCU locking in ->act(). This allows using plain RCU accessor, even if the context is really RCU BH. This change drops the per action RCU lock, replace the accessors with the _bh variant, cleans up a bit the surrounding code and documents the RCU status in the relevant header. No functional nor performance change is intended. The goal of this patch is clarifying that the RCU critical section used by the tc actions extends up to the classifier's caller. v1 -> v2: - preserve rcu lock in act_bpf: it's needed by eBPF helpers, as pointed out by Daniel v3 -> v4: - fixed some typos in the commit message (JiriP) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/sch_generic.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 683ce41053d9..8c9bc02d05e1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -85,7 +85,7 @@ struct tc_action_ops { size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, - struct tcf_result *); + struct tcf_result *); /* called under RCU BH lock*/ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c5432362dc26..bcae181c1857 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -285,6 +285,8 @@ struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; + + /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); -- cgit From cd11b164073b719203318227918f9510809d5e10 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:44 +0200 Subject: net/tc: introduce TC_ACT_REINSERT. This is similar TC_ACT_REDIRECT, but with a slightly different semantic: - on ingress the mirred skbs are passed to the target device network stack without any additional check not scrubbing. - the rcu-protected stats provided via the tcf_result struct are updated on error conditions. This new tcfa_action value is not exposed to the user-space and can be used only internally by clsact. v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce a new action type instead v2 -> v3: - rename the new action value TC_ACT_REINJECT, update the helper accordingly - take care of uncloned reinjected packets in XDP generic hook v3 -> v4: - renamed again the new action value (JiriP) v4 -> v5: - fix build error with !NET_CLS_ACT (kbuild bot) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6d02f31abba8..22bfc3a13c25 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -7,6 +7,9 @@ #include #include +/* TC action not accessible from user space */ +#define TC_ACT_REINSERT (TC_ACT_VALUE_MAX + 1) + /* Basic packet classifier frontend definitions. */ struct tcf_walker { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bcae181c1857..a6d00093f35e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -235,6 +235,12 @@ struct tcf_result { u32 classid; }; const struct tcf_proto *goto_tp; + + /* used by the TC_ACT_REINSERT action */ + struct { + bool ingress; + struct gnet_stats_queue *qstats; + }; }; }; @@ -569,6 +575,15 @@ static inline void skb_reset_tc(struct sk_buff *skb) #endif } +static inline bool skb_is_tc_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + return skb->tc_redirected; +#else + return false; +#endif +} + static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT @@ -1108,4 +1123,17 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); +static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +{ + struct gnet_stats_queue *stats = res->qstats; + int ret; + + if (res->ingress) + ret = netif_receive_skb(skb); + else + ret = dev_queue_xmit(skb); + if (ret && stats) + qstats_overlimit_inc(res->qstats); +} + #endif -- cgit From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:45:07 +0200 Subject: net: remove bogus RCU annotations on socket.wq We never use RCU protection for it, just a lot of cargo-cult rcu_deference_protects calls. Note that we do keep the kfree_rcu call for it, as the references through struct sock are RCU protected and thus might require a grace period before freeing. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 2afea5d1bdfe..433f45fc2d68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1788,7 +1788,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); - sk->sk_wq = parent->wq; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; -- cgit From 83ba4645152d1177c161750e1064e3a8e7cee19b Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Jul 2018 21:18:11 +0200 Subject: net: add helpers checking if socket can be bound to nonlocal address The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent" is present three times and its IPv6 counterpart is also present three times. We introduce two small helpers to characterize these tests uniformly. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/inet_sock.h | 8 ++++++++ include/net/ipv6.h | 7 +++++++ 2 files changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 314be484c696..e03b93360f33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk) return !!inet_sk(sk)->convert_csum; } + +static inline bool inet_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv4.sysctl_ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a44509f4e985..82deb684ba73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; -- cgit From 432e05d328921c68c35bfdeff7d7b7400b8e3d1a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:03 +0200 Subject: net: ipv4: Control SKB reprioritization after forwarding After IPv4 packets are forwarded, the priority of the corresponding SKB is updated according to the TOS field of IPv4 header. This overrides any prioritization done earlier by e.g. an skbedit action or ingress-qos-map defined at a vlan device. Such overriding may not always be desirable. Even if the packet ends up being routed, which implies this is an L3 network node, an administrator may wish to preserve whatever prioritization was done earlier on in the pipeline. Therefore introduce a sysctl that controls this behavior. Keep the default value at 1 to maintain backward-compatible behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 661348f23ea5..e47503b4e4d1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,7 @@ struct netns_ipv4 { int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_fwd_update_priority; int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; -- cgit From d18c5d1995aa322b722fa731397e28ebcd00b3c6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:42 +0200 Subject: net: ipv4: Notify about changes to ip_forward_update_priority Drivers may make offloading decision based on whether ip_forward_update_priority is enabled or not. Therefore distribute netevent notifications to give them a chance to react to a change. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netevent.h b/include/net/netevent.h index d9918261701c..4107016c3bb4 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -28,6 +28,7 @@ enum netevent_notif_type { NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); -- cgit From 290b1c8b1a902c0902df9ec05577ab209296f345 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:57 +0200 Subject: net: sched: make tcf_chain_{get,put}() static These are no longer used outside of cls_api.c so make them static. Move tcf_chain_flush() to avoid fwd declaration of tcf_chain_put(). Signed-off-by: Jiri Pirko v1->v2: - new patch Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22bfc3a13c25..ef727f71336e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -40,11 +40,8 @@ struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); -void tcf_chain_put(struct tcf_chain *chain); void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, -- cgit From db57dc7c7a5c42bb653425a01b6d73c49514b5db Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Wed, 1 Aug 2018 22:05:10 +0200 Subject: net: don't declare IPv6 non-local bind helper if CONFIG_IPV6 undefined Fixes: 83ba4645152d ("net: add helpers checking if socket can be bound to nonlocal address") Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ipv6.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 82deb684ba73..ff33f498c137 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,13 +766,6 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } -static inline bool ipv6_can_nonlocal_bind(struct net *net, - struct inet_sock *inet) -{ - return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; -} - /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; @@ -789,6 +782,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 -- cgit From 285189c78eeb6f684a024b86fb5997d10c6aa564 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 25 Jul 2018 15:52:13 +0800 Subject: netfilter: use kvmalloc_array to allocate memory for hashtable nf_ct_alloc_hashtable is used to allocate memory for conntrack, NAT bysrc and expectation hashtable. Assuming 64k bucket size, which means 7th order page allocation, __get_free_pages, called by nf_ct_alloc_hashtable, will trigger the direct memory reclaim and stall for a long time, when system has lots of memory stress so replace combination of __get_free_pages and vzalloc with kvmalloc_array, which provides a overflow check and a fallback if no high order memory is available, and do not retry to reclaim memory, reduce stall and remove nf_ct_free_hashtable, since it is just a kvfree Signed-off-by: Zhang Yu Signed-off-by: Wang Li Signed-off-by: Li RongQing Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a2b0ed025908..7e012312cd61 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -176,8 +176,6 @@ void nf_ct_netns_put(struct net *net, u8 nfproto); */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); -void nf_ct_free_hashtable(void *hash, unsigned int size); - int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); -- cgit From eb9950eb31f56e57582a61c92073336d04a26542 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Aug 2018 17:06:56 +0100 Subject: rxrpc: Push iov_iter up from rxrpc_kernel_recv_data() to caller Push iov_iter up from rxrpc_kernel_recv_data() to its caller to allow non-contiguous iovs to be passed down, thereby permitting file reading to be simplified in the AFS filesystem in a future patch. Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/net/af_rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 8ae8ee004258..f53edb3754bc 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,7 +61,7 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *, u16 *); + struct iov_iter *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -- cgit From 31ba191bf5ab2975c1955f1adf771a5a0b57afaa Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 3 Aug 2018 14:53:15 +0800 Subject: include/net/bond_3ad: Simplify the code by using the ARRAY_SIZE We prefer to ARRAY_SIZE rather than the open code to calculate size. Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- include/net/bond_3ad.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index f358ad5e4214..fc3111515f5c 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -283,7 +283,7 @@ static inline const char *bond_3ad_churn_desc(churn_state_t state) "none", "unknown" }; - int max_size = sizeof(churn_description) / sizeof(churn_description[0]); + int max_size = ARRAY_SIZE(churn_description); if (state >= max_size) state = max_size - 1; -- cgit From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index f4272a29dc44..b86d14528188 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -75,7 +75,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; + struct sk_buff *fragments; /* Used in IPv6. */ + struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; ktime_t stamp; int len; -- cgit