summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-02-21 11:59:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-02-21 11:59:51 -0800
commit3dc55dba67231fc22352483f5ca737df96cdc1e6 (patch)
tree5eec1f5ce42822cb744313ec394b761fab0f7ed6 /include
parentb0dd1eb220c06892e0a4098378c4296650f3f8db (diff)
parent36a44bcdd8df092d76c11bc213e81c5817d4e302 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from David Miller: 1) Limit xt_hashlimit hash table size to avoid OOM or hung tasks, from Cong Wang. 2) Fix deadlock in xsk by publishing global consumer pointers when NAPI is finished, from Magnus Karlsson. 3) Set table field properly to RT_TABLE_COMPAT when necessary, from Jethro Beekman. 4) NLA_STRING attributes are not necessary NULL terminated, deal wiht that in IFLA_ALT_IFNAME. From Eric Dumazet. 5) Fix checksum handling in atlantic driver, from Dmitry Bezrukov. 6) Handle mtu==0 devices properly in wireguard, from Jason A. Donenfeld. 7) Fix several lockdep warnings in bonding, from Taehee Yoo. 8) Fix cls_flower port blocking, from Jason Baron. 9) Sanitize internal map names in libbpf, from Toke Høiland-Jørgensen. 10) Fix RDMA race in qede driver, from Michal Kalderon. 11) Fix several false lockdep warnings by adding conditions to list_for_each_entry_rcu(), from Madhuparna Bhowmik. 12) Fix sleep in atomic in mlx5 driver, from Huy Nguyen. 13) Fix potential deadlock in bpf_map_do_batch(), from Yonghong Song. 14) Hey, variables declared in switch statement before any case statements are not initialized. I learn something every day. Get rids of this stuff in several parts of the networking, from Kees Cook. * git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (99 commits) bnxt_en: Issue PCIe FLR in kdump kernel to cleanup pending DMAs. bnxt_en: Improve device shutdown method. net: netlink: cap max groups which will be considered in netlink_bind() net: thunderx: workaround BGX TX Underflow issue ionic: fix fw_status read net: disable BRIDGE_NETFILTER by default net: macb: Properly handle phylink on at91rm9200 s390/qeth: fix off-by-one in RX copybreak check s390/qeth: don't warn for napi with 0 budget s390/qeth: vnicc Fix EOPNOTSUPP precedence openvswitch: Distribute switch variables for initialization net: ip6_gre: Distribute switch variables for initialization net: core: Distribute switch variables for initialization udp: rehash on disconnect net/tls: Fix to avoid gettig invalid tls record bpf: Fix a potential deadlock with bpf_map_do_batch bpf: Do not grab the bucket spinlock by default on htab batch ops ice: Wait for VF to be reset/ready before configuration ice: Don't tell the OS that link is going down ice: Don't reject odd values of usecs set by user ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/mlx5/mlx5_ifc.h5
-rw-r--r--include/linux/netdevice.h7
-rw-r--r--include/linux/rculist_nulls.h7
-rw-r--r--include/linux/skbuff.h30
-rw-r--r--include/net/flow_dissector.h9
-rw-r--r--include/net/sock.h38
-rw-r--r--include/uapi/linux/bpf.h16
-rw-r--r--include/uapi/linux/netfilter/nf_conntrack_common.h12
8 files changed, 104 insertions, 20 deletions
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ff8c9d527bb4..bfdf41537cf1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -688,7 +688,10 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
u8 nic_rx_multi_path_tirs[0x1];
u8 nic_rx_multi_path_tirs_fts[0x1];
u8 allow_sniffer_and_nic_rx_shared_tir[0x1];
- u8 reserved_at_3[0x1d];
+ u8 reserved_at_3[0x4];
+ u8 sw_owner_reformat_supported[0x1];
+ u8 reserved_at_8[0x18];
+
u8 encap_general_header[0x1];
u8 reserved_at_21[0xa];
u8 log_max_packet_reformat_context[0x5];
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9f1f633235f6..6c3f7032e8d9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -72,6 +72,8 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
#define NET_RX_SUCCESS 0 /* keep 'em coming, baby */
#define NET_RX_DROP 1 /* packet dropped */
+#define MAX_NEST_DEV 8
+
/*
* Transmit return codes: transmit return codes originate from three different
* namespaces:
@@ -4389,11 +4391,8 @@ void *netdev_lower_get_next(struct net_device *dev,
ldev; \
ldev = netdev_lower_get_next(dev, &(iter)))
-struct net_device *netdev_all_lower_get_next(struct net_device *dev,
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
struct list_head **iter);
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
- struct list_head **iter);
-
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *lower_dev,
void *data),
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index e5b752027a03..9670b54b484a 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -145,6 +145,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
}
}
+/* after that hlist_nulls_del will work */
+static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
+{
+ n->pprev = &n->next;
+ n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
+}
+
/**
* hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
* @tpos: the type * to use as a loop cursor.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ca8806b69388..5b50278c4bc8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -611,9 +611,15 @@ typedef unsigned char *sk_buff_data_t;
* @next: Next buffer in list
* @prev: Previous buffer in list
* @tstamp: Time we arrived/left
+ * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
+ * for retransmit timer
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
+ * @list: queue head
* @sk: Socket we are owned by
+ * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
+ * fragmentation management
* @dev: Device we arrived on/are leaving by
+ * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
@@ -632,6 +638,9 @@ typedef unsigned char *sk_buff_data_t;
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs
+ * @inner_protocol_type: whether the inner protocol is
+ * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
+ * @remcsum_offload: remote checksum offload is enabled
* @offload_fwd_mark: Packet was L2-forwarded in hardware
* @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
* @tc_skip_classify: do not classify packet. set by IFB device
@@ -650,6 +659,8 @@ typedef unsigned char *sk_buff_data_t;
* @tc_index: Traffic control index
* @hash: the packet hash
* @queue_mapping: Queue mapping for multiqueue devices
+ * @head_frag: skb was allocated from page fragments,
+ * not allocated by kmalloc() or vmalloc().
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
* @active_extensions: active extensions (skb_ext_id types)
* @ndisc_nodetype: router type (from link layer)
@@ -660,15 +671,28 @@ typedef unsigned char *sk_buff_data_t;
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
+ * @encapsulation: indicates the inner headers in the skbuff are valid
+ * @encap_hdr_csum: software checksum is needed
+ * @csum_valid: checksum is already valid
* @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
+ * @csum_complete_sw: checksum was completed by software
+ * @csum_level: indicates the number of consecutive checksums found in
+ * the packet minus one that have been verified as
+ * CHECKSUM_UNNECESSARY (max 3)
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @napi_id: id of the NAPI struct this skb came from
+ * @sender_cpu: (aka @napi_id) source CPU in XPS
* @secmark: security marking
* @mark: Generic packet mark
+ * @reserved_tailroom: (aka @mark) number of bytes of free space available
+ * at the tail of an sk_buff
+ * @vlan_present: VLAN tag is present
* @vlan_proto: vlan encapsulation protocol
* @vlan_tci: vlan tag control information
* @inner_protocol: Protocol (encapsulation)
+ * @inner_ipproto: (aka @inner_protocol) stores ipproto when
+ * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
* @inner_transport_header: Inner transport layer header (encapsulation)
* @inner_network_header: Network layer header (encapsulation)
* @inner_mac_header: Link layer header (encapsulation)
@@ -750,7 +774,9 @@ struct sk_buff {
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
+ /* private: */
__u8 __cloned_offset[0];
+ /* public: */
__u8 cloned:1,
nohdr:1,
fclone:2,
@@ -775,7 +801,9 @@ struct sk_buff {
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
+ /* private: */
__u8 __pkt_type_offset[0];
+ /* public: */
__u8 pkt_type:3;
__u8 ignore_df:1;
__u8 nf_trace:1;
@@ -798,7 +826,9 @@ struct sk_buff {
#define PKT_VLAN_PRESENT_BIT 0
#endif
#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset)
+ /* private: */
__u8 __pkt_vlan_present_offset[0];
+ /* public: */
__u8 vlan_present:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index e9391e877f9a..628383915827 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/in6.h>
#include <linux/siphash.h>
+#include <linux/string.h>
#include <uapi/linux/if_ether.h>
struct sk_buff;
@@ -348,4 +349,12 @@ struct bpf_flow_dissector {
void *data_end;
};
+static inline void
+flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
+ struct flow_dissector_key_basic *key_basic)
+{
+ memset(key_control, 0, sizeof(*key_control));
+ memset(key_basic, 0, sizeof(*key_basic));
+}
+
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 02162b0378f7..328564525526 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -117,19 +117,26 @@ typedef __u64 __bitwise __addrpair;
* struct sock_common - minimal network layer representation of sockets
* @skc_daddr: Foreign IPv4 addr
* @skc_rcv_saddr: Bound local IPv4 addr
+ * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
* @skc_hash: hash value used with various protocol lookup tables
* @skc_u16hashes: two u16 hash values used by UDP lookup tables
* @skc_dport: placeholder for inet_dport/tw_dport
* @skc_num: placeholder for inet_num/tw_num
+ * @skc_portpair: __u32 union of @skc_dport & @skc_num
* @skc_family: network address family
* @skc_state: Connection state
* @skc_reuse: %SO_REUSEADDR setting
* @skc_reuseport: %SO_REUSEPORT setting
+ * @skc_ipv6only: socket is IPV6 only
+ * @skc_net_refcnt: socket is using net ref counting
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
* @skc_prot: protocol handlers inside a network family
* @skc_net: reference to the network namespace of this socket
+ * @skc_v6_daddr: IPV6 destination address
+ * @skc_v6_rcv_saddr: IPV6 source address
+ * @skc_cookie: socket's cookie value
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
* @skc_tx_queue_mapping: tx queue number for this connection
@@ -137,7 +144,15 @@ typedef __u64 __bitwise __addrpair;
* @skc_flags: place holder for sk_flags
* %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
+ * @skc_listener: connection request listener socket (aka rsk_listener)
+ * [union with @skc_flags]
+ * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
+ * [union with @skc_flags]
* @skc_incoming_cpu: record/match cpu processing incoming packets
+ * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
+ * [union with @skc_incoming_cpu]
+ * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
+ * [union with @skc_incoming_cpu]
* @skc_refcnt: reference count
*
* This is the minimal network layer representation of sockets, the header
@@ -245,6 +260,7 @@ struct bpf_sk_storage;
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
+ * @sk_rx_skb_cache: cache copy of recently accessed RX skb
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -265,6 +281,8 @@ struct bpf_sk_storage;
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
* @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
+ * @sk_route_forced_caps: static, forced route capabilities
+ * (set in tcp_init_sock())
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
* @sk_gso_max_size: Maximum GSO segment size to build
* @sk_gso_max_segs: Maximum number of GSO segments
@@ -303,6 +321,8 @@ struct bpf_sk_storage;
* @sk_frag: cached page frag
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
+ * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
+ * @sk_tx_skb_cache: cache copy of recently accessed TX skb
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
@@ -313,11 +333,14 @@ struct bpf_sk_storage;
* @sk_write_space: callback to indicate there is bf sending space available
* @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
* @sk_backlog_rcv: callback to process the backlog
+ * @sk_validate_xmit_skb: ptr to an optional validate function
* @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
* @sk_reuseport_cb: reuseport group container
+ * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
+ * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
*/
struct sock {
@@ -393,7 +416,9 @@ struct sock {
struct sk_filter __rcu *sk_filter;
union {
struct socket_wq __rcu *sk_wq;
+ /* private: */
struct socket_wq *sk_wq_raw;
+ /* public: */
};
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
@@ -2017,7 +2042,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
* sk_wmem_alloc_get - returns write allocations
* @sk: socket
*
- * Returns sk_wmem_alloc minus initial offset of one
+ * Return: sk_wmem_alloc minus initial offset of one
*/
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
@@ -2028,7 +2053,7 @@ static inline int sk_wmem_alloc_get(const struct sock *sk)
* sk_rmem_alloc_get - returns read allocations
* @sk: socket
*
- * Returns sk_rmem_alloc
+ * Return: sk_rmem_alloc
*/
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
@@ -2039,7 +2064,7 @@ static inline int sk_rmem_alloc_get(const struct sock *sk)
* sk_has_allocations - check if allocations are outstanding
* @sk: socket
*
- * Returns true if socket has write or read allocations
+ * Return: true if socket has write or read allocations
*/
static inline bool sk_has_allocations(const struct sock *sk)
{
@@ -2050,7 +2075,7 @@ static inline bool sk_has_allocations(const struct sock *sk)
* skwq_has_sleeper - check if there are any waiting processes
* @wq: struct socket_wq
*
- * Returns true if socket_wq has waiting processes
+ * Return: true if socket_wq has waiting processes
*
* The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
* barrier call. They were added due to the race found within the tcp code.
@@ -2238,6 +2263,9 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
* gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
* inside other socket operations and end up recursing into sk_page_frag()
* while it's already in use.
+ *
+ * Return: a per task page_frag if context allows that,
+ * otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
@@ -2432,6 +2460,7 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
&skb_shinfo(skb)->tskey);
}
+DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2440,7 +2469,6 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
* This routine must be called with interrupts disabled or with the socket
* locked so that the sk_buff queue operation is ok.
*/
-DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f1d74a2bd234..22f235260a3a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1045,9 +1045,9 @@ union bpf_attr {
* supports redirection to the egress interface, and accepts no
* flag at all.
*
- * The same effect can be attained with the more generic
- * **bpf_redirect_map**\ (), which requires specific maps to be
- * used but offers better performance.
+ * The same effect can also be attained with the more generic
+ * **bpf_redirect_map**\ (), which uses a BPF map to store the
+ * redirect target instead of providing it directly to the helper.
* Return
* For XDP, the helper returns **XDP_REDIRECT** on success or
* **XDP_ABORTED** on error. For other program types, the values
@@ -1611,13 +1611,11 @@ union bpf_attr {
* the caller. Any higher bits in the *flags* argument must be
* unset.
*
- * When used to redirect packets to net devices, this helper
- * provides a high performance increase over **bpf_redirect**\ ().
- * This is due to various implementation details of the underlying
- * mechanisms, one of which is the fact that **bpf_redirect_map**\
- * () tries to send packet as a "bulk" to the device.
+ * See also bpf_redirect(), which only supports redirecting to an
+ * ifindex, but doesn't require a map to do so.
* Return
- * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ * **XDP_REDIRECT** on success, or the value of the two lower bits
+ * of the **flags* argument on error.
*
* int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
* Description
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 336014bf8868..b6f0bb1dc799 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -97,6 +97,15 @@ enum ip_conntrack_status {
IPS_UNTRACKED_BIT = 12,
IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),
+#ifdef __KERNEL__
+ /* Re-purposed for in-kernel use:
+ * Tags a conntrack entry that clashed with an existing entry
+ * on insert.
+ */
+ IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT,
+ IPS_NAT_CLASH = IPS_UNTRACKED,
+#endif
+
/* Conntrack got a helper explicitly attached via CT target. */
IPS_HELPER_BIT = 13,
IPS_HELPER = (1 << IPS_HELPER_BIT),
@@ -110,7 +119,8 @@ enum ip_conntrack_status {
*/
IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
- IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
+ IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED |
+ IPS_OFFLOAD),
__IPS_MAX_BIT = 15,
};