From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001 From: Roee Zamir Date: Sun, 6 Aug 2017 11:38:22 +0300 Subject: nl80211: add OCE scan and capability flags Add Optimized Connectivity Experience (OCE) scan and capability flags. Some of them unique to OCE and some are stand alone. And add scan flags to enable/disable them. Signed-off-by: Roee Zamir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 51626b4175c0..76404d8a8863 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4914,6 +4914,15 @@ enum nl80211_feature_flags { * handshake with 802.1X in station mode (will pass EAP frames to the host * and accept the set_pmk/del_pmk commands), doing it in the host might not * be supported. + * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding + * the max channel attribute in the FILS request params IE with the + * actual dwell time. + * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe + * response + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending + * the first probe request in each channel at rate of at least 5.5Mbps. + * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports + * probe request tx deferral and suppression * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4936,6 +4945,10 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5012,12 +5025,28 @@ enum nl80211_timeout_reason { * locally administered 1, multicast 0) is assumed. * This flag must not be requested when the feature isn't supported, check * the nl80211 feature flags for the device. + * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS + * request parameters IE in the probe request + * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at + * rate of at least 5.5M. In case non OCE AP is dicovered in the channel, + * only the first probe req in the channel will be sent in high rate. + * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request + * tx deferral (dot11FILSProbeDelay shall be set to 15ms) + * and suppression (if it has received a broadcast Probe Response frame, + * Beacon frame or FILS Discovery frame from an AP that the STA considers + * a suitable candidate for (re-)association - suitable in terms of + * SSID and/or RSSI */ enum nl80211_scan_flags { - NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, - NL80211_SCAN_FLAG_FLUSH = 1<<1, - NL80211_SCAN_FLAG_AP = 1<<2, - NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, + NL80211_SCAN_FLAG_FLUSH = 1<<1, + NL80211_SCAN_FLAG_AP = 1<<2, + NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME = 1<<4, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP = 1<<5, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE = 1<<6, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION = 1<<7, }; /** -- cgit From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 18 Aug 2017 15:31:41 +0300 Subject: nl80211: add an option to allow MFP without requiring it The user space can now allow the kernel to associate to an AP that requires MFP or that doesn't have MFP enabled in the same NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag. The driver / firmware will decide whether to use it or not. Include a feature bit to advertise support for NL80211_MFP_OPTIONAL. This allows new user space to run on old kernels and know that it cannot use the new attribute if it isn't supported. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 76404d8a8863..59ba6ca66a0d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1407,8 +1407,12 @@ enum nl80211_commands { * * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is * used for the association (&enum nl80211_mfp, represented as a u32); - * this attribute can be used - * with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests + * this attribute can be used with %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for + * %NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it + * must have decided whether to use management frame protection or not. + * Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will + * let the driver (or the firmware) decide whether to use MFP or not. * * @NL80211_ATTR_STA_FLAGS2: Attribute containing a * &struct nl80211_sta_flag_update. @@ -3947,10 +3951,12 @@ enum nl80211_key_type { * enum nl80211_mfp - Management frame protection state * @NL80211_MFP_NO: Management frame protection not used * @NL80211_MFP_REQUIRED: Management frame protection required + * @NL80211_MFP_OPTIONAL: Management frame protection is optional */ enum nl80211_mfp { NL80211_MFP_NO, NL80211_MFP_REQUIRED, + NL80211_MFP_OPTIONAL, }; enum nl80211_wpa_versions { @@ -4923,6 +4929,8 @@ enum nl80211_feature_flags { * the first probe request in each channel at rate of at least 5.5Mbps. * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports * probe request tx deferral and suppression + * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL + * value in %NL80211_ATTR_USE_MFP. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4949,6 +4957,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP, NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE, NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_MFP_OPTIONAL, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:14 -0700 Subject: tun: enable NAPI for TUN/TAP driver Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb(): 7.20s Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 3cb5e1d85ddd..30b6184884eb 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 +#define IFF_NAPI 0x0010 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:15 -0700 Subject: tun: enable napi_gro_frags() for TUN/TAP driver Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 30b6184884eb..365ade5685c9 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,7 @@ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..e43491ac4823 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { -- cgit From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 27 Sep 2017 16:12:44 +0300 Subject: net: bridge: add per-port group_fwd_mask with less restrictions We need to be able to transparently forward most link-local frames via tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a mask which restricts the forwarding of STP and LACP, but we need to be able to forward these over tunnels and control that forwarding on a per-port basis thus add a new per-port group_fwd_mask option which only disallows mac pause frames to be forwarded (they're always dropped anyway). The patch does not change the current default situation - all of the others are still restricted unless configured for forwarding. We have successfully tested this patch with LACP and STP forwarding over VxLAN and qinq tunnels. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d062c58d5cb..ea87bd708ee9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -325,6 +325,7 @@ enum { IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject name that contains characters other than isalnum() and '_'. It will also reject name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact user_map_cnt and it will only copy up to the userspace's map_ids buffer is allowed. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e43491ac4823..bd6348269bf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -210,6 +212,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +815,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd6348269bf5..6d2137b4cf38 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -829,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 46444f8fbee4..4f8f3eb0699f 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -284,6 +284,7 @@ struct in6_flowlabel_req { #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 #define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 /* * Multicast Routing: -- cgit From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:49 +0200 Subject: cfg80211/nl80211: add a port authorized event Add an event that indicates that a connection is authorized (i.e. the 4 way handshake was performed by the driver). This event should be sent by the driver after sending a connect/roamed event. This is useful for networks that require 802.1X authentication. In cases that the driver supports 4 way handshake offload, but the 802.1X authentication is managed by user space, the driver needs to inform user space right after the 802.11 association was completed so user space can initialize its 802.1X state machine etc. However, it is also possible that the AP will choose to skip the 802.1X authentication (e.g. when PMKSA caching is used) and proceed with the 4 way handshake immediately. In this case the driver needs to inform user space that 802.1X authentication is no longer required (e.g. to prevent user space from disconnecting since it did not get any EAPOLs from the AP). This is also useful for roaming, in which case it is possible that the driver used the Fast Transition protocol so 802.1X is not required. Since there will now be a dedicated notification indicating that the connection is authorized, the authorized flag can be removed from the roamed event. Drivers can send the new port authorized event right after sending the roamed event to indicate the new AP is already authorized. This therefore reserves the old PORT_AUTHORIZED attribute. Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 59ba6ca66a0d..95832ce03a44 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -569,13 +569,14 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself. - * When the driver roamed in a network that requires 802.1X authentication, - * %NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication - * was done by the driver or if roaming was done using Fast Transition - * protocol (in which case 802.1X authentication is not needed). If - * %NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for - * the 802.1X authentication. + * When establishing a security association, drivers that support 4 way + * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when + * the 4 way handshake is completed successfully. + * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. + * When a security association was established with the new AP (e.g. if + * the FT protocol was used for roaming or the driver completed the 4 way + * handshake), this event should be followed by an + * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and @@ -982,6 +983,12 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * &NL80211_ATTR_MAC. + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way + * handshake was completed successfully by the driver. The BSSID is + * specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake + * offload should send this event after indicating 802.11 association with + * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed + * &NL80211_CMD_DISCONNECT should be indicated instead. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1185,6 +1192,8 @@ enum nl80211_commands { NL80211_CMD_SET_PMK, NL80211_CMD_DEL_PMK, + NL80211_CMD_PORT_AUTHORIZED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2138,10 +2147,7 @@ enum nl80211_commands { * in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it * wants to use the supported offload of the 4-way handshake. * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. - * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED - * notification indicating that that 802.1X authentication was done by - * the driver or is not needed (because roaming used the Fast Transition - * protocol). + * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined -- cgit From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8500a1..4487e7625ddb 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1088,4 +1088,10 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +/* SCTP Stream schedulers */ +enum sctp_sched_type { + SCTP_SS_FCFS, + SCTP_SS_MAX = SCTP_SS_FCFS +}; + #endif /* _UAPI_SCTP_H */ -- cgit From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:14 -0300 Subject: sctp: add sockopt to get/set stream scheduler As defined per RFC Draft ndata Section 4.3.2, named as SCTP_STREAM_SCHEDULER. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4487e7625ddb..0050f10087d2 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RESET_ASSOC 120 #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 +#define SCTP_STREAM_SCHEDULER 123 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:15 -0300 Subject: sctp: add sockopt to get/set stream scheduler parameters As defined per RFC Draft ndata Section 4.3.3, named as SCTP_STREAM_SCHEDULER_VALUE. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0050f10087d2..00ac417d2c4f 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 #define SCTP_STREAM_SCHEDULER 123 +#define SCTP_STREAM_SCHEDULER_VALUE 124 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -815,6 +816,12 @@ struct sctp_assoc_value { uint32_t assoc_value; }; +struct sctp_stream_value { + sctp_assoc_t assoc_id; + uint16_t stream_id; + uint16_t stream_value; +}; + /* * 7.2.2 Peer Address Information * -- cgit From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 00ac417d2c4f..850fa8b29d7e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1099,7 +1099,8 @@ struct sctp_add_streams { /* SCTP Stream schedulers */ enum sctp_sched_type { SCTP_SS_FCFS, - SCTP_SS_MAX = SCTP_SS_FCFS + SCTP_SS_PRIO, + SCTP_SS_MAX = SCTP_SS_PRIO }; #endif /* _UAPI_SCTP_H */ -- cgit From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Priority Based Scheduler (SCTP_SS_RR). Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 850fa8b29d7e..6cd7d416ca40 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1100,7 +1100,8 @@ struct sctp_add_streams { enum sctp_sched_type { SCTP_SS_FCFS, SCTP_SS_PRIO, - SCTP_SS_MAX = SCTP_SS_PRIO + SCTP_SS_RR, + SCTP_SS_MAX = SCTP_SS_RR }; #endif /* _UAPI_SCTP_H */ -- cgit From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d2137b4cf38..762f74bc6c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,11 +143,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel -- cgit From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74bc6c47..cb2b9f95160a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: -- cgit From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 Oct 2017 13:53:23 +0200 Subject: dev: advertise the new nsid when the netns iface changes x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, this kind of interfaces is created in the link netns and then moved to the upper netns. At the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, thus the user always knows where is the link part. There is no such mechanism in the link netns. When the interface is moved to another netns, the user cannot "follow" it. This patch adds a new netlink attribute which helps to follow an interface which moves to another netns. When the interface is unregistered, the new nsid is advertised. If the interface is a x-netns interface (ie rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed. CC: Jason A. Donenfeld Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea87bd708ee9..cd580fc0e58f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -158,6 +158,7 @@ enum { IFLA_PAD, IFLA_XDP, IFLA_EVENT, + IFLA_NEW_NETNSID, __IFLA_MAX }; -- cgit From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:53 -0400 Subject: VSOCK: add sock_diag interface This patch adds the sock_diag interface for querying sockets from userspace. Tools like ss(8) and netstat(8) can use this interface to list open sockets. The userspace ABI is defined in and includes netlink request and response structs. The request can query sockets based on their sk_state (e.g. listening sockets only) and the response contains socket information fields including the local/remote addresses, inode number, etc. This patch does not dump VMCI pending sockets because I have only tested the virtio transport, which does not use pending sockets. Support can be added later by extending vsock_diag_dump() if needed by VMCI users. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/uapi/linux/vm_sockets_diag.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/uapi/linux/vm_sockets_diag.h (limited to 'include/uapi') diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h new file mode 100644 index 000000000000..14cd7dc5a187 --- /dev/null +++ b/include/uapi/linux/vm_sockets_diag.h @@ -0,0 +1,33 @@ +/* AF_VSOCK sock_diag(7) interface for querying open sockets */ + +#ifndef _UAPI__VM_SOCKETS_DIAG_H__ +#define _UAPI__VM_SOCKETS_DIAG_H__ + +#include + +/* Request */ +struct vsock_diag_req { + __u8 sdiag_family; /* must be AF_VSOCK */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 vdiag_states; /* query bitmap (e.g. 1 << TCP_LISTEN) */ + __u32 vdiag_ino; /* must be 0 (reserved) */ + __u32 vdiag_show; /* must be 0 (reserved) */ + __u32 vdiag_cookie[2]; +}; + +/* Response */ +struct vsock_diag_msg { + __u8 vdiag_family; /* AF_VSOCK */ + __u8 vdiag_type; /* SOCK_STREAM or SOCK_DGRAM */ + __u8 vdiag_state; /* sk_state (e.g. TCP_LISTEN) */ + __u8 vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */ + __u32 vdiag_src_cid; + __u32 vdiag_src_port; + __u32 vdiag_dst_cid; + __u32 vdiag_dst_port; + __u32 vdiag_ino; + __u32 vdiag_cookie[2]; +}; + +#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */ -- cgit From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Wed, 4 Oct 2017 19:35:57 +0200 Subject: ip_tunnel: add mpls over gre support This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new tunnel type TUNNEL_ENCAP_MPLS. Signed-off-by: Amine Kherbouche Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 2e520883c054..a2f48c01365e 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -84,6 +84,7 @@ enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, TUNNEL_ENCAP_GUE, + TUNNEL_ENCAP_MPLS, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) -- cgit From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, kernel will multiplex these events so each event gets certain percentage (but not 100%) of the pmu time. In case that multiplexing happens, the number of samples or counter value will not reflect the case compared to no multiplexing. This makes comparison between different runs difficult. Typically, the number of samples or counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for event and time_running is the time running for event since last normalization. This patch adds helper bpf_perf_event_read_value for kprobed based perf event array map, to read perf counter and enabled/running time. The enabled/running time is accumulated since the perf event open. To achieve scaling factor between two bpf invocations, users can can use cpu_id as the key (which is typical for perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6082faf5fd2a..7b57a212c7d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -641,6 +641,14 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data_meta * Return: 0 on success or negative on error + * + * int bpf_perf_event_read_value(map, flags, buf, buf_size) + * read perf event counter value and perf event enabled/running time + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -697,7 +705,8 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ - FN(xdp_adjust_meta), + FN(xdp_adjust_meta), \ + FN(perf_event_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -741,7 +750,9 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK /* BPF_FUNC_perf_event_output for sk_buff input context. */ @@ -934,4 +945,10 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: bpf: add helper bpf_perf_prog_read_value This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf programs, to read event counter and enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for perf event based bpf program is to attach itself to a single event. In such cases, if it is desirable to get scaling factor between two bpf invocations, users can can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b57a212c7d7..5bbbec17aa5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -649,6 +649,13 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return: 0 on success or negative error code + * + * int bpf_perf_prog_read_value(ctx, buf, buf_size) + * read perf prog attached perf event counter and enabled/running time + * @ctx: pointer to ctx + * @buf: buf to fill + * @buf_size: size of the buf + * Return : 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -706,7 +713,8 @@ union bpf_attr { FN(sk_redirect_map), \ FN(sock_map_update), \ FN(xdp_adjust_meta), \ - FN(perf_event_read_value), + FN(perf_event_read_value), \ + FN(perf_prog_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:12 -0700 Subject: bpf: Use char in prog and map name Instead of u8, use char for prog and map name. It can avoid the userspace tool getting compiler's signess warning. The bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and bpf_map_info are changed. Signed-off-by: Martin KaFai Lau Cc: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5bbbec17aa5a..6db9e1d679cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -888,7 +888,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -898,7 +898,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 6 Oct 2017 22:12:37 -0700 Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to suppress arp and nd flood on bridge ports. It implements rfc7432, section 10. https://tools.ietf.org/html/rfc7432#section-10 for ethernet VPN deployments. It is similar to the existing BR_PROXYARP* flags but has a few semantic differences to conform to EVPN standard. Unlike the existing flags, this new flag suppresses flood of all neigh discovery packets (arp and nd) to tunnel ports. Supports both vlan filtering and non-vlan filtering bridges. In case of EVPN, it is mainly used to avoid flooding of arp and nd packets to tunnel ports like vxlan. This patch adds netlink and sysfs support to set this bridge port flag. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cd580fc0e58f..b037e0ab1975 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -327,6 +327,7 @@ enum { IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 4 Oct 2017 17:03:12 -0700 Subject: openvswitch: Add erspan tunnel support. Add erspan netlink interface for OVS. Signed-off-by: William Tu Cc: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 156ee4cab82e..efdbfbfd3ee2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; -- cgit From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Tue, 10 Oct 2017 16:54:44 -0400 Subject: openvswitch: add ct_clear action This adds a ct_clear action for clearing conntrack state. ct_clear is currently implemented in OVS userspace, but is not backed by an action in the kernel datapath. This is useful for flows that may modify a packet tuple after a ct lookup has already occurred. Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index efdbfbfd3ee2..0cd6f8833147 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -807,6 +807,7 @@ struct ovs_action_push_eth { * packet. * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -836,6 +837,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */ OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Sep 2017 16:07:22 +0200 Subject: cfg80211: support reloading regulatory database If the regulatory database is loaded, and then updated, it may be necessary to reload it. Add an nl80211 command to do this. Note that this just reloads the database, it doesn't re-apply the rules from it immediately. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 95832ce03a44..f882fe1f9709 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -990,6 +990,8 @@ * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed * &NL80211_CMD_DISCONNECT should be indicated instead. * + * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1194,6 +1196,8 @@ enum nl80211_commands { NL80211_CMD_PORT_AUTHORIZED, + NL80211_CMD_RELOAD_REGDB, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:18 -0700 Subject: net: qrtr: Move constants to header file The constants are used by both the name server and clients, so clarify their value and move them to the uapi header. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- include/uapi/linux/qrtr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 9d76c566f66e..63e8803e4d90 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -4,6 +4,9 @@ #include #include +#define QRTR_NODE_BCAST 0xffffffffu +#define QRTR_PORT_CTRL 0xfffffffeu + struct sockaddr_qrtr { __kernel_sa_family_t sq_family; __u32 sq_node; -- cgit From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 10 Oct 2017 23:45:19 -0700 Subject: net: qrtr: Add control packet definition to uapi The QMUX protocol specification defines structure of the special control packet messages being sent between handlers of the control port. Add these to the uapi header, as this structure and the associated types are shared between the kernel and all userspace handlers of control messages. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- include/uapi/linux/qrtr.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 63e8803e4d90..179af64846e0 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -13,4 +13,36 @@ struct sockaddr_qrtr { __u32 sq_port; }; +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, + QRTR_TYPE_NEW_LOOKUP = 10, + QRTR_TYPE_DEL_LOOKUP = 11, +}; + +struct qrtr_ctrl_pkt { + __le32 cmd; + + union { + struct { + __le32 service; + __le32 instance; + __le32 node; + __le32 port; + } server; + + struct { + __le32 node; + __le32 port; + } client; + }; +} __packed; + #endif /* _LINUX_QRTR_H */ -- cgit From ad2d116c5242875bba27522682ec5ba7f0df75f0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 11:14:49 -0700 Subject: sched: tc_mirred: Remove whitespaces This file contains unnecessary whitespaces as newlines, remove them, found by looking at what struct tc_mirred looks like. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_mirred.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 3d7a2b352a62..69038c29e8a9 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -9,13 +9,13 @@ #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */ #define TCA_INGRESS_REDIR 3 /* packet redirect to INGRESS*/ #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */ - + struct tc_mirred { tc_gen; int eaction; /* one of IN/EGRESS_MIRROR/REDIR */ __u32 ifindex; /* ifindex of egress port */ }; - + enum { TCA_MIRRED_UNSPEC, TCA_MIRRED_TM, @@ -24,5 +24,5 @@ enum { __TCA_MIRRED_MAX }; #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1) - + #endif -- cgit From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:23 +0200 Subject: tipc: introduce communication groups As a preparation for introducing flow control for multicast and datagram messaging we need a more strictly defined framework than we have now. A socket must be able keep track of exactly how many and which other sockets it is allowed to communicate with at any moment, and keep the necessary state for those. We therefore introduce a new concept we have named Communication Group. Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN. The call takes four parameters: 'type' serves as group identifier, 'instance' serves as an logical member identifier, and 'scope' indicates the visibility of the group (node/cluster/zone). Finally, 'flags' makes it possible to set certain properties for the member. For now, there is only one flag, indicating if the creator of the socket wants to receive a copy of broadcast or multicast messages it is sending via the socket, and if wants to be eligible as destination for its own anycasts. A group is closed, i.e., sockets which have not joined a group will not be able to send messages to or receive messages from members of the group, and vice versa. Any member of a group can send multicast ('group broadcast') messages to all group members, optionally including itself, using the primitive send(). The messages are received via the recvmsg() primitive. A socket can only be member of one group at a time. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5351b08c897a..5f7b2c4a09ab 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -231,6 +231,20 @@ struct sockaddr_tipc { #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ +#define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ +#define TIPC_GROUP_LEAVE 136 /* No argument */ + +/* + * Flag values + */ +#define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ + +struct tipc_group_req { + __u32 type; /* group id */ + __u32 instance; /* member id */ + __u32 scope; /* zone/cluster/node */ + __u32 flags; +}; /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) -- cgit From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 13 Oct 2017 11:04:25 +0200 Subject: tipc: receive group membership events via member socket Like with any other service, group members' availability can be subscribed for by connecting to be topology server. However, because the events arrive via a different socket than the member socket, there is a real risk that membership events my arrive out of synch with the actual JOIN/LEAVE action. I.e., it is possible to receive the first messages from a new member before the corresponding JOIN event arrives, just as it is possible to receive the last messages from a leaving member after the LEAVE event has already been received. Since each member socket is internally also subscribing for membership events, we now fix this problem by passing those events on to the user via the member socket. We leverage the already present member synch- ronization protocol to guarantee correct message/event order. An event is delivered to the user as an empty message where the two source addresses identify the new/lost member. Furthermore, we set the MSG_OOB bit in the message flags to mark it as an event. If the event is an indication about a member loss we also set the MSG_EOR bit, so it can be distinguished from a member addition event. Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5f7b2c4a09ab..ef41c11a7f38 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -238,6 +238,7 @@ struct sockaddr_tipc { * Flag values */ #define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ +#define TIPC_GROUP_MEMBER_EVTS 0x2 /* Receive membership events in socket */ struct tipc_group_req { __u32 type; /* group id */ -- cgit From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 7 Sep 2017 04:00:06 -0700 Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/uapi/linux/pkt_sched.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 099bf5528fed..e95b5c9b9fad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -625,6 +625,22 @@ enum { #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; @@ -633,6 +649,22 @@ struct tc_mqprio_qopt { __u16 offset[TC_QOPT_MAX_QUEUE]; }; +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + /* SFB */ enum { -- cgit From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Oct 2017 11:38:45 -0700 Subject: mqprio: Reserve last 32 classid values for HW traffic classes and misc IDs This patch makes a slight tweak to mqprio in order to bring the classid values used back in line with what is used for mq. The general idea is to reserve values :ffe0 - :ffef to identify hardware traffic classes normally reported via dev->num_tc. By doing this we can maintain a consistent behavior with mq for classid where :1 - :ffdf will represent a physical qdisc mapped onto a Tx queue represented by classid - 1, and the traffic classes will be mapped onto a known subset of classid values reserved for our virtual qdiscs. Note I reserved the range from :fff0 - :ffff since this way we might be able to reuse these classid values with clsact and ingress which would mean that for mq, mqprio, ingress, and clsact we should be able to maintain a similar classid layout. Signed-off-by: Alexander Duyck Tested-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e95b5c9b9fad..e7cc3d3c7421 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -74,6 +74,7 @@ struct tc_estimator { #define TC_H_INGRESS (0xFFFFFFF1U) #define TC_H_CLSACT TC_H_INGRESS +#define TC_H_MIN_PRIORITY 0xFFE0U #define TC_H_MIN_INGRESS 0xFFF2U #define TC_H_MIN_EGRESS 0xFFF3U -- cgit From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6db9e1d679cd..4303fb6c3817 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { -- cgit From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. The listener by default uses the global key until the socket option is set. The key is a 16 bytes long binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 15c25eccab2b..69c7493e42f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -119,6 +119,7 @@ enum { #define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ struct tcp_repair_opt { __u32 opt_code; -- cgit From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that returns the map fd. The flags is used to set up the file mode when construct a new file descriptor for bpf maps. To not break the backward capability, the f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY. When the userspace want to modify or read the map content, it will check the file mode to see if it is allowed to make the change. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4303fb6c3817..d83f95ea6a1b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,10 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U +/* Flags for accessing BPF object */ +#define BPF_F_RDONLY (1U << 3) +#define BPF_F_WRONLY (1U << 4) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -260,6 +264,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; + __u32 file_flags; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ @@ -287,6 +292,7 @@ union bpf_attr { __u32 map_id; }; __u32 next_id; + __u32 open_flags; }; struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ -- cgit From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:39 -0700 Subject: bpf: add support for BPF_SOCK_OPS_BASE_RTT A congestion control algorithm can make a call to the BPF socket_ops program to request the base RTT. The base RTT can be congestion control dependent and is meant to represent a congestion threshold such that RTTs above it indicate congestion. This is especially useful for flows within a DC where the base RTT is easy to obtain. Being provided a base RTT solves a basic problem in RTT based congestion avoidance algorithms (such as Vegas, NV and BBR). Although it is easy to get the base RTT when the network is not congested, it is very diffcult to do when it is very congested. Newer connections get an inflated value of the base RTT leading to unfariness (newer flows with a larger base RTT get more bandwidth). As a result, RTT based congestion avoidance algorithms tend to update their base RTTs to improve fairness. In very congested networks this can lead to base RTT inflation, reducing the ability of these RTT based congestion control algorithms to prevent congestion. Note that in my experiments with TCP-NV, the base RTT provided can be much larger than the actual hardware RTT. For example, experimenting with hosts within a rack where the hardware RTT is 16-20us, I've used base RTTs up to 150us. The effect of using a larger base RTT is that the congestion avoidance algorithm will allow more queueing. When there are only a few flows the main effect is larger measured RTTs and RPC latencies due to the increased queueing. When there are a lot of flows, a larger base RTT can lead to more congestion and more packet drops. For this case, where the hardware RTT is 20us, a base RTT of 80us produces good results. This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this set adds support for using it in TCP-NV. Further study and testing is needed before support can be added to other delay based congestion avoidance algorithms. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83f95ea6a1b..1aca744c220f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -955,6 +955,13 @@ enum { BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -- cgit From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: bpf: Adding helper function bpf_getsockops Adding support for helper function bpf_getsockops to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1aca744c220f..f650346aaa1a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -613,12 +613,22 @@ union bpf_attr { * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) * Calls setsockopt. Not all opts are available, only those with * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPROTO_TCP + * Supported levels: SOL_SOCKET and IPPROTO_TCP * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPROTO_TCP + * @level: SOL_SOCKET or IPPROTO_TCP * @optname: option name * @optval: pointer to option value - * @optlen: length of optval in byes + * @optlen: length of optval in bytes + * Return: 0 or negative error + * + * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) + * Calls getsockopt. Not all opts are available. + * Supported levels: IPPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: IPPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in bytes * Return: 0 or negative error * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) @@ -721,7 +731,8 @@ union bpf_attr { FN(sock_map_update), \ FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ - FN(perf_prog_read_value), + FN(perf_prog_read_value), \ + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit From 40b16b9be5773a314948656c96adf7bf7cfdbd0b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 21 Oct 2017 11:45:46 +0200 Subject: batman-adv: use inline kernel-doc for uapi constants The enums of constants for netlink tends to become rather large over time. Documenting them is easier when the kernel-doc is actually next to constant and not in a different block above the enum. Also inline kernel-doc allows multi-paragraph description. This could be required to better document the netlink command types and the expected return values. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 369 +++++++++++++++++++++++++++++++--------- 1 file changed, 290 insertions(+), 79 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a83ddb7b63db..efd641c8a5d6 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -24,20 +24,6 @@ /** * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it * * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. * Bits from 8 to 15 are called _local flags_ because they are used for local @@ -48,160 +34,385 @@ * in the TT CRC computation. */ enum batadv_tt_client_flags { + /** + * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table + */ BATADV_TT_CLIENT_DEL = (1 << 0), + + /** + * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and + * the new update telling its new real location has not been + * received/sent yet + */ BATADV_TT_CLIENT_ROAM = (1 << 1), + + /** + * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi + * interface. This information is used by the "AP Isolation" feature + */ BATADV_TT_CLIENT_WIFI = (1 << 4), + + /** + * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This + * information is used by the Extended Isolation feature + */ BATADV_TT_CLIENT_ISOLA = (1 << 5), + + /** + * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from + * the table + */ BATADV_TT_CLIENT_NOPURGE = (1 << 8), + + /** + * @BATADV_TT_CLIENT_NEW: this client has been added to the local table + * but has not been announced yet + */ BATADV_TT_CLIENT_NEW = (1 << 9), + + /** + * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it + * is kept in the table for one more originator interval for consistency + * purposes + */ BATADV_TT_CLIENT_PENDING = (1 << 10), + + /** + * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be + * part of the network but no nnode has already announced it + */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; /** * enum batadv_nl_attrs - batman-adv netlink attributes - * - * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors - * @BATADV_ATTR_VERSION: batman-adv version string - * @BATADV_ATTR_ALGO_NAME: name of routing algorithm - * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface - * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface - * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface - * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface - * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface - * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv interface - * @BATADV_ATTR_ORIG_ADDRESS: originator mac address - * @BATADV_ATTR_TPMETER_RESULT: result of run (see batadv_tp_meter_status) - * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took - * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run - * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session - * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment - * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active - * @BATADV_ATTR_TT_ADDRESS: Client MAC address - * @BATADV_ATTR_TT_TTVN: Translation table version - * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version - * @BATADV_ATTR_TT_CRC32: CRC32 over translation table - * @BATADV_ATTR_TT_VID: VLAN ID - * @BATADV_ATTR_TT_FLAGS: Translation table client flags - * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best - * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen - * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address - * @BATADV_ATTR_TQ: TQ to neighbour - * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour - * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth - * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth - * @BATADV_ATTR_ROUTER: Gateway router MAC address - * @BATADV_ATTR_BLA_OWN: Flag indicating own originator - * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address - * @BATADV_ATTR_BLA_VID: BLA VLAN ID - * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address - * @BATADV_ATTR_BLA_CRC: BLA CRC - * @__BATADV_ATTR_AFTER_LAST: internal use - * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available - * @BATADV_ATTR_MAX: highest attribute number currently defined */ enum batadv_nl_attrs { + /** + * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors + */ BATADV_ATTR_UNSPEC, + + /** + * @BATADV_ATTR_VERSION: batman-adv version string + */ BATADV_ATTR_VERSION, + + /** + * @BATADV_ATTR_ALGO_NAME: name of routing algorithm + */ BATADV_ATTR_ALGO_NAME, + + /** + * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface + */ BATADV_ATTR_MESH_IFINDEX, + + /** + * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface + */ BATADV_ATTR_MESH_IFNAME, + + /** + * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface + */ BATADV_ATTR_MESH_ADDRESS, + + /** + * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFINDEX, + + /** + * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface + */ BATADV_ATTR_HARD_IFNAME, + + /** + * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv + * interface + */ BATADV_ATTR_HARD_ADDRESS, + + /** + * @BATADV_ATTR_ORIG_ADDRESS: originator mac address + */ BATADV_ATTR_ORIG_ADDRESS, + + /** + * @BATADV_ATTR_TPMETER_RESULT: result of run (see + * batadv_tp_meter_status) + */ BATADV_ATTR_TPMETER_RESULT, + + /** + * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took + */ BATADV_ATTR_TPMETER_TEST_TIME, + + /** + * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run + */ BATADV_ATTR_TPMETER_BYTES, + + /** + * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session + */ BATADV_ATTR_TPMETER_COOKIE, + + /** + * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment + */ BATADV_ATTR_PAD, + + /** + * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active + */ BATADV_ATTR_ACTIVE, + + /** + * @BATADV_ATTR_TT_ADDRESS: Client MAC address + */ BATADV_ATTR_TT_ADDRESS, + + /** + * @BATADV_ATTR_TT_TTVN: Translation table version + */ BATADV_ATTR_TT_TTVN, + + /** + * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version + */ BATADV_ATTR_TT_LAST_TTVN, + + /** + * @BATADV_ATTR_TT_CRC32: CRC32 over translation table + */ BATADV_ATTR_TT_CRC32, + + /** + * @BATADV_ATTR_TT_VID: VLAN ID + */ BATADV_ATTR_TT_VID, + + /** + * @BATADV_ATTR_TT_FLAGS: Translation table client flags + */ BATADV_ATTR_TT_FLAGS, + + /** + * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best + */ BATADV_ATTR_FLAG_BEST, + + /** + * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen + */ BATADV_ATTR_LAST_SEEN_MSECS, + + /** + * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address + */ BATADV_ATTR_NEIGH_ADDRESS, + + /** + * @BATADV_ATTR_TQ: TQ to neighbour + */ BATADV_ATTR_TQ, + + /** + * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour + */ BATADV_ATTR_THROUGHPUT, + + /** + * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth + */ BATADV_ATTR_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth + */ BATADV_ATTR_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_ROUTER: Gateway router MAC address + */ BATADV_ATTR_ROUTER, + + /** + * @BATADV_ATTR_BLA_OWN: Flag indicating own originator + */ BATADV_ATTR_BLA_OWN, + + /** + * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address + */ BATADV_ATTR_BLA_ADDRESS, + + /** + * @BATADV_ATTR_BLA_VID: BLA VLAN ID + */ BATADV_ATTR_BLA_VID, + + /** + * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address + */ BATADV_ATTR_BLA_BACKBONE, + + /** + * @BATADV_ATTR_BLA_CRC: BLA CRC + */ BATADV_ATTR_BLA_CRC, + /* add attributes above here, update the policy in netlink.c */ + + /** + * @__BATADV_ATTR_AFTER_LAST: internal use + */ __BATADV_ATTR_AFTER_LAST, + + /** + * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available + */ NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST, + + /** + * @BATADV_ATTR_MAX: highest attribute number currently defined + */ BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1 }; /** * enum batadv_nl_commands - supported batman-adv netlink commands - * - * @BATADV_CMD_UNSPEC: unspecified command to catch errors - * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device - * @BATADV_CMD_TP_METER: Start a tp meter session - * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session - * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. - * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces - * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations - * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations - * @BATADV_CMD_GET_ORIGINATORS: Query list of originators - * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours - * @BATADV_CMD_GET_GATEWAYS: Query list of gateways - * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims - * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones - * @__BATADV_CMD_AFTER_LAST: internal use - * @BATADV_CMD_MAX: highest used command number */ enum batadv_nl_commands { + /** + * @BATADV_CMD_UNSPEC: unspecified command to catch errors + */ BATADV_CMD_UNSPEC, + + /** + * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv + * device + */ BATADV_CMD_GET_MESH_INFO, + + /** + * @BATADV_CMD_TP_METER: Start a tp meter session + */ BATADV_CMD_TP_METER, + + /** + * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session + */ BATADV_CMD_TP_METER_CANCEL, + + /** + * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms. + */ BATADV_CMD_GET_ROUTING_ALGOS, + + /** + * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces + */ BATADV_CMD_GET_HARDIFS, + + /** + * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations + */ BATADV_CMD_GET_TRANSTABLE_LOCAL, + + /** + * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations + */ BATADV_CMD_GET_TRANSTABLE_GLOBAL, + + /** + * @BATADV_CMD_GET_ORIGINATORS: Query list of originators + */ BATADV_CMD_GET_ORIGINATORS, + + /** + * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours + */ BATADV_CMD_GET_NEIGHBORS, + + /** + * @BATADV_CMD_GET_GATEWAYS: Query list of gateways + */ BATADV_CMD_GET_GATEWAYS, + + /** + * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims + */ BATADV_CMD_GET_BLA_CLAIM, + + /** + * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance + * backbones + */ BATADV_CMD_GET_BLA_BACKBONE, + /* add new commands above here */ + + /** + * @__BATADV_CMD_AFTER_LAST: internal use + */ __BATADV_CMD_AFTER_LAST, + + /** + * @BATADV_CMD_MAX: highest used command number + */ BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1 }; /** * enum batadv_tp_meter_reason - reason of a tp meter test run stop - * @BATADV_TP_REASON_COMPLETE: sender finished tp run - * @BATADV_TP_REASON_CANCEL: sender was stopped during run - * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or didn't - * answer - * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit - * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node already - * ongoing - * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory - * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface - * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions */ enum batadv_tp_meter_reason { + /** + * @BATADV_TP_REASON_COMPLETE: sender finished tp run + */ BATADV_TP_REASON_COMPLETE = 3, + + /** + * @BATADV_TP_REASON_CANCEL: sender was stopped during run + */ BATADV_TP_REASON_CANCEL = 4, + /* error status >= 128 */ + + /** + * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or + * didn't answer + */ BATADV_TP_REASON_DST_UNREACHABLE = 128, + + /** + * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit + */ BATADV_TP_REASON_RESEND_LIMIT = 129, + + /** + * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node + * already ongoing + */ BATADV_TP_REASON_ALREADY_ONGOING = 130, + + /** + * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory + */ BATADV_TP_REASON_MEMORY_ERROR = 131, + + /** + * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface + */ BATADV_TP_REASON_CANT_SEND = 132, + + /** + * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions + */ BATADV_TP_REASON_TOO_MANY = 133, }; -- cgit From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow to enable TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicous host (aka., data-centers) or when the application-protocol already provides an authentication mechanism in the first flight of data. A server however might be providing multiple services or talking to both sides (public Internet and data-center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data-center. This patch exposes a socket-option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ include/uapi/linux/tcp.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index dab7dad9e01a..fe6679268901 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -430,6 +430,8 @@ enum { #define RTAX_QUICKACK RTAX_QUICKACK RTAX_CC_ALGO, #define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE __RTAX_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 69c7493e42f8..d67e1d40c6d6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -120,6 +120,7 @@ enum { #define TCP_ULP 31 /* Attach a ULP to a TCP connection */ #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ struct tcp_repair_opt { __u32 opt_code; -- cgit From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sat, 21 Oct 2017 00:25:15 +0300 Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote address (outer v6 destination) is one of host's locally configured addresses. Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address (outer v6 source) is a local address. This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such configurations. An example where this proves useful is a system where entities are identified by their unique v6 addresses, and use tunnels to encapsulate traffic between them. The limitation prevents placing several entities on same host. Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE which allows to bypass this restriction. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/uapi/linux/ip6_tunnel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 425926c467d7..ffebbe365478 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -20,6 +20,8 @@ #define IP6_TNL_F_RCV_DSCP_COPY 0x10 /* copy fwmark from inner packet */ #define IP6_TNL_F_USE_ORIG_FWMARK 0x20 +/* allow remote endpoint on the local node */ +#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40 struct ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ -- cgit From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:26 -0700 Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc This queueing discipline implements the shaper algorithm defined by the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L. It's primary usage is to apply some bandwidth reservation to user defined traffic classes, which are mapped to different queues via the mqprio qdisc. Only a simple software implementation is added for now. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/uapi/linux/pkt_sched.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e7cc3d3c7421..0e88cc262ca0 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -904,4 +904,23 @@ struct tc_pie_xstats { __u32 maxq; /* maximum queue size */ __u32 ecn_mark; /* packets marked with ecn*/ }; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + #endif -- cgit From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving pmtu discovery or ip redirect msg. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into exception table, user added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attemp to add cached routes from userspace with returning EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..c15d8054905c 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -28,7 +28,7 @@ #define RTF_ROUTEINFO 0x00800000 /* route information - RA */ -#define RTF_CACHE 0x01000000 /* cache entry */ +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ -- cgit From a190d04db93710ae166749055b6985397c6d13f5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:21 -0700 Subject: ipvlan: introduce 'private' attribute for all existing modes. IPvlan has always operated in bridge mode. However there are scenarios where each slave should be able to talk through the master device but not necessarily across each other. Think of an environment where each of a namespace is a private and independant customer. In this scenario the machine which is hosting these namespaces neither want to tell who their neighbor is nor the individual namespaces care to talk to neighbor on short-circuited network path. This patch implements the mode that is very similar to the 'private' mode in macvlan where individual slaves can send and receive traffic through the master device, just that they can not talk among slave devices. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b037e0ab1975..052e32cd584c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -465,6 +465,7 @@ enum macsec_validation_type { enum { IFLA_IPVLAN_UNSPEC, IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, __IFLA_IPVLAN_MAX }; @@ -477,6 +478,8 @@ enum ipvlan_mode { IPVLAN_MODE_MAX }; +#define IPVLAN_F_PRIVATE 0x01 + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, -- cgit From fe89aa6b250c1011ccf425fbb7998e96bd54263f Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 26 Oct 2017 15:09:25 -0700 Subject: ipvlan: implement VEPA mode This is very similar to the Macvlan VEPA mode, however, there is some difference. IPvlan uses the mac-address of the lower device, so the VEPA mode has implications of ICMP-redirects for packets destined for its immediate neighbors sharing same master since the packets will have same source and dest mac. The external switch/router will send redirect msg. Having said that, this will be useful tool in terms of debugging since IPvlan will not switch packets within its slaves and rely completely on the external entity as intended in 802.1Qbg. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 052e32cd584c..81f26473d728 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum ipvlan_mode { }; #define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 /* VXLAN section */ enum { -- cgit From ee20598194500e82c477cf13e52b58e569446ed0 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 18 Jul 2017 15:42:15 -0500 Subject: net/dcb: Add dscp to priority selector type IEEE specification P802.1Qcd/D2.1 defines priority selector 5. This APP TLV selector defines DSCP to priority map. This patch defines such DSCP selector. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/uapi/linux/dcbnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index b6170a6af7c2..2c0c6453c3f4 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -206,6 +206,7 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_STREAM 2 #define IEEE_8021QAZ_APP_SEL_DGRAM 3 #define IEEE_8021QAZ_APP_SEL_ANY 4 +#define IEEE_8021QAZ_APP_SEL_DSCP 5 /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. -- cgit From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:37 -0200 Subject: openvswitch: reliable interface indentification in port dumps This patch allows reliable identification of netdevice interfaces connected to openvswitch bridges. In particular, user space queries the netdev interfaces belonging to the ports for statistics, up/down state, etc. Datapath dump needs to provide enough information for the user space to be able to do that. Currently, only interface names are returned. This is not sufficient, as openvswitch allows its ports to be in different name spaces and the interface name is valid only in its name space. What is needed and generally used in other netlink APIs, is the pair ifindex+netnsid. The solution is addition of the ifindex+netnsid pair (or only ifindex if in the same name space) to vport get/dump operation. On request side, ideally the ifindex+netnsid pair could be used to get/set/del the corresponding vport. This is not implemented by this patch and can be added later if needed. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ffe397daad49..501e4c4e2a03 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -258,6 +258,8 @@ enum ovs_vport_attr { /* receiving upcalls */ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ OVS_VPORT_ATTR_PAD, + OVS_VPORT_ATTR_IFINDEX, + OVS_VPORT_ATTR_NETNSID, __OVS_VPORT_ATTR_MAX }; -- cgit From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 2 Nov 2017 17:04:38 -0200 Subject: rtnetlink: use netnsid to query interface Currently, when an application gets netnsid from the kernel (for example as the result of RTM_GETLINK call on one end of the veth pair), it's not much useful. There's no reliable way to get to the netns fd from the netnsid, nor does any kernel API accept netnsid. Extend the RTM_GETLINK call to also accept netnsid. It will operate on the netns with the given netnsid in such case. Of course, the calling process needs to have enough capabilities in the target name space; for now, require CAP_NET_ADMIN. This can be relaxed in the future. To signal to the calling process that the kernel understood the new IFLA_IF_NETNSID attribute in the query, it will include it in the response. This is needed to detect older kernels, as they will just ignore IFLA_IF_NETNSID and query in the current name space. This patch implemetns IFLA_IF_NETNSID only for get and dump. For set operations, this can be extended later. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b3cf5639ac8f..19fc02660e0c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -160,6 +160,7 @@ enum { IFLA_XDP, IFLA_EVENT, IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, __IFLA_MAX }; -- cgit From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifer log in device JIT). It allows device JITs to perform translation on the original bytecode. __bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9820677c2ff..80d191a93fb0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d191a93fb0..4455dd195201 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -895,6 +895,10 @@ enum sk_action { #define BPF_TAG_SIZE 8 +enum bpf_prog_status { + BPF_PROG_STATUS_DEV_BOUND = (1 << 0), +}; + struct bpf_prog_info { __u32 type; __u32 id; @@ -908,6 +912,8 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type, which in combination of previously added ability to attach multiple eBPF programs to a cgroup, will provide a similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines if the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4455dd195201..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -991,4 +993,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:23 -0800 Subject: ila: add checksum neutral map auto Add checksum neutral auto that performs checksum neutral mapping without using the C-bit. This is enabled by configuration of a mapping. The checksum neutral function has been split into ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former handles the C-bit and includes it in the adjustment value. The latter just sets the adjustment value on the locator diff only. Added configuration for checksum neutral map aut in ila_lwt and ila_xlat. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index f54853288f99..0744881dcef3 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -41,6 +41,7 @@ enum { ILA_CSUM_ADJUST_TRANSPORT, ILA_CSUM_NEUTRAL_MAP, ILA_CSUM_NO_ACTION, + ILA_CSUM_NEUTRAL_MAP_AUTO, }; #endif /* _UAPI_LINUX_ILA_H */ -- cgit From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:24 -0800 Subject: ila: allow configuration of identifier type Allow identifier to be explicitly configured for a mapping. This can either be one of the identifier types specified in the ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the identifier type is inferred from the identifier type field. If a value other than ILA_ATYPE_USE_FORMAT is set for a mapping then it is assumed that the identifier type field is not present in an identifier. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 0744881dcef3..8353c78a7781 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -17,6 +17,7 @@ enum { ILA_ATTR_DIR, /* u32 */ ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ + ILA_ATTR_IDENT_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -44,4 +45,16 @@ enum { ILA_CSUM_NEUTRAL_MAP_AUTO, }; +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_NONLOCAL_ADDR, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + + ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ +}; #endif /* _UAPI_LINUX_ILA_H */ -- cgit From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:25 -0800 Subject: ila: Add a hook type for LWT routes In LWT tunnels both an input and output route method is defined. If both of these are executed in the same path then double translation happens and the effect is not correct. This patch adds a new attribute that indicates the hook type. Two values are defined for route output and route output. ILA translation is only done for the one that is set. The default is to enable ILA on route output. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 8353c78a7781..483b77af4eb8 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -18,6 +18,7 @@ enum { ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ ILA_ATTR_IDENT_TYPE, /* u8 */ + ILA_ATTR_HOOK_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -57,4 +58,10 @@ enum { ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ }; + +enum { + ILA_HOOK_ROUTE_OUTPUT, + ILA_HOOK_ROUTE_INPUT, +}; + #endif /* _UAPI_LINUX_ILA_H */ -- cgit From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_SET: handles set and change. * TC_RED_DESTROY: handle qdisc destroy. * TC_RED_STATS: update the qdiscs counters (given as reference) * TC_RED_XSTAT: returns red xstats. Whether RED is being offloaded is being determined every time dump action is being called because parent change of this qdisc could change its offload state but doesn't require any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5002562868cc..6a2c5ea7e9c4 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,6 +256,7 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 +#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { -- cgit From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 7 Nov 2017 21:07:02 +0800 Subject: openvswitch: enable NSH support v16->17 - Fixed disputed check code: keep them in nsh_push and nsh_pop but also add them in __ovs_nla_copy_actions v15->v16 - Add csum recalculation for nsh_push, nsh_pop and set_nsh pointed out by Pravin - Move nsh key into the union with ipv4 and ipv6 and add check for nsh key in match_validate pointed out by Pravin - Add nsh check in validate_set and __ovs_nla_copy_actions v14->v15 - Check size in nsh_hdr_from_nlattr - Fixed four small issues pointed out By Jiri and Eric v13->v14 - Rename skb_push_nsh to nsh_push per Dave's comment - Rename skb_pop_nsh to nsh_pop per Dave's comment v12->v13 - Fix NSH header length check in set_nsh v11->v12 - Fix missing changes old comments pointed out - Fix new comments for v11 v10->v11 - Fix the left three disputable comments for v9 but not fixed in v10. v9->v10 - Change struct ovs_key_nsh to struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; - Fix new comments for v9 v8->v9 - Fix build error reported by daily intel build because nsh module isn't selected by openvswitch v7->v8 - Rework nested value and mask for OVS_KEY_ATTR_NSH - Change pop_nsh to adapt to nsh kernel module - Fix many issues per comments from Jiri Benc v6->v7 - Remove NSH GSO patches in v6 because Jiri Benc reworked it as another patch series and they have been merged. - Change it to adapt to nsh kernel module added by NSH GSO patch series v5->v6 - Fix the rest comments for v4. - Add NSH GSO support for VxLAN-gpe + NSH and Eth + NSH. v4->v5 - Fix many comments by Jiri Benc and Eric Garver for v4. v3->v4 - Add new NSH match field ttl - Update NSH header to the latest format which will be final format and won't change per its author's confirmation. - Fix comments for v3. v2->v3 - Change OVS_KEY_ATTR_NSH to nested key to handle length-fixed attributes and length-variable attriubte more flexibly. - Remove struct ovs_action_push_nsh completely - Add code to handle nested attribute for SET_MASKED - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH to transfer NSH header data. - Fix comments and coding style issues by Jiri and Eric v1->v2 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh - Dynamically allocate struct ovs_action_push_nsh for length-variable metadata. OVS master and 2.8 branch has merged NSH userspace patch series, this patch is to enable NSH support in kernel data path in order that OVS can support NSH in compat mode by porting this. Signed-off-by: Yi Yang Acked-by: Jiri Benc Acked-by: Eric Garver Acked-by: Pravin Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 501e4c4e2a03..ec75a685f1dd 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -336,6 +336,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_LABELS, /* 16-octet connection tracking label */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ + OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -495,6 +496,30 @@ struct ovs_key_ct_tuple_ipv6 { __u8 ipv6_proto; }; +enum ovs_nsh_key_attr { + OVS_NSH_KEY_ATTR_UNSPEC, + OVS_NSH_KEY_ATTR_BASE, /* struct ovs_nsh_key_base. */ + OVS_NSH_KEY_ATTR_MD1, /* struct ovs_nsh_key_md1. */ + OVS_NSH_KEY_ATTR_MD2, /* variable-length octets for MD type 2. */ + __OVS_NSH_KEY_ATTR_MAX +}; + +#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1) + +struct ovs_nsh_key_base { + __u8 flags; + __u8 ttl; + __u8 mdtype; + __u8 np; + __be32 path_hdr; +}; + +#define NSH_MD1_CONTEXT_SIZE 4 + +struct ovs_nsh_key_md1 { + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow @@ -811,6 +836,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. + * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. + * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -841,6 +868,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ + OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ + OVS_ACTION_ATTR_POP_NSH, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001 From: Mark Greer Date: Thu, 15 Jun 2017 20:34:22 -0700 Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support Once an NFC target (i.e., a tag) is found, it remains active until there is a failure reading or writing it (often caused by the target moving out of range). While the target is active, the NFC adapter and antenna must remain powered. This wastes power when the target remains in range but the client application no longer cares whether it is there or not. To mitigate this, add a new netlink command that allows userspace to deactivate an active target. When issued, this command will cause the NFC subsystem to act as though the target was moved out of range. Once the command has been executed, the client application can power off the NFC adapter to reduce power consumption. Signed-off-by: Mark Greer Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 399f39ff8048..f6e3c8c9c744 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -89,6 +89,7 @@ * @NFC_CMD_ACTIVATE_TARGET: Request NFC controller to reactivate target. * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly * from the driver in order to support hardware specific operations. + * @NFC_CMD_DEACTIVATE_TARGET: Request NFC controller to deactivate target. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -121,6 +122,7 @@ enum nfc_commands { NFC_CMD_SE_IO, NFC_CMD_ACTIVATE_TARGET, NFC_CMD_VENDOR, + NFC_CMD_DEACTIVATE_TARGET, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; -- cgit From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e880ae6434ee..adb66f78b674 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/uapi/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b22a9c4e1b12..9c0f4a92bcff 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -186,6 +186,7 @@ enum { DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, + DEVCONF_NDISC_TCLASS, DEVCONF_MAX }; -- cgit From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index adb66f78b674..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,10 +677,6 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -740,8 +736,7 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default and the successor RACK subsumed FACK and can handle reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 0d941cdd8e8c..33a70ece462f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -191,7 +191,6 @@ enum LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ - LINUX_MIB_TCPFACKREORDER, /* TCPFACKReorder */ LINUX_MIB_TCPSACKREORDER, /* TCPSACKReorder */ LINUX_MIB_TCPRENOREORDER, /* TCPRenoReorder */ LINUX_MIB_TCPTSREORDER, /* TCPTSReorder */ -- cgit From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:27 -0800 Subject: netem: add uapi to express delay and jitter in nanoseconds netem userspace has long relied on a horrible /proc/net/psched hack to translate the current notion of "ticks" to nanoseconds. Expressing latency and jitter instead, in well defined nanoseconds, increases the dynamic range of emulated delays and jitter in netem. It will also ease a transition where reducing a tick to nsec equivalence would constrain the max delay in prior versions of netem to only 4.3 seconds. Signed-off-by: Dave Taht Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 6a2c5ea7e9c4..8fe6d1842bee 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -537,6 +537,8 @@ enum { TCA_NETEM_ECN, TCA_NETEM_RATE64, TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, __TCA_NETEM_MAX, }; -- cgit From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Wed, 8 Nov 2017 15:12:28 -0800 Subject: netem: support delivering packets in delayed time slots Slotting is a crude approximation of the behaviors of shared media such as cable, wifi, and LTE, which gather up a bunch of packets within a varying delay window and deliver them, relative to that, nearly all at once. It works within the existing loss, duplication, jitter and delay parameters of netem. Some amount of inherent latency must be specified, regardless. The new "slot" parameter specifies a minimum and maximum delay between transmission attempts. The "bytes" and "packets" parameters can be used to limit the amount of information transferred per slot. Examples of use: tc qdisc add dev eth0 root netem delay 200us \ slot 800us 10ms bytes 64k packets 42 A more correct example, using stacked netem instances and a packet limit to emulate a tail drop wifi queue with slots and variable packet delivery, with a 200Mbit isochronous underlying rate, and 20ms path delay: tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \ limit 10000 tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \ slot 800us 10ms bytes 64k packets 42 limit 512 Signed-off-by: Dave Taht Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8fe6d1842bee..af3cc2f4e1ad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -539,6 +539,7 @@ enum { TCA_NETEM_PAD, TCA_NETEM_LATENCY64, TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, __TCA_NETEM_MAX, }; @@ -576,6 +577,13 @@ struct tc_netem_rate { __s32 cell_overhead; }; +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; +}; + enum { NETEM_LOSS_UNSPEC, NETEM_LOSS_GI, /* General Intuitive - 4 state model */ -- cgit From 5794040647de4011598a6d005fdad95d24fd385b Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:40 -0800 Subject: openvswitch: Add meter netlink definitions Meter has its own netlink family. Define netlink messages and attributes for communicating with the user space programs. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ec75a685f1dd..d60b9a4cf3d1 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -883,4 +883,55 @@ enum ovs_action_attr { #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) +/* Meters. */ +#define OVS_METER_FAMILY "ovs_meter" +#define OVS_METER_MCGROUP "ovs_meter" +#define OVS_METER_VERSION 0x1 + +enum ovs_meter_cmd { + OVS_METER_CMD_UNSPEC, + OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */ + OVS_METER_CMD_SET, /* Add or modify a meter. */ + OVS_METER_CMD_DEL, /* Delete a meter. */ + OVS_METER_CMD_GET /* Get meter stats. */ +}; + +enum ovs_meter_attr { + OVS_METER_ATTR_UNSPEC, + OVS_METER_ATTR_ID, /* u32 meter ID within datapath. */ + OVS_METER_ATTR_KBPS, /* No argument. If set, units in kilobits + * per second. Otherwise, units in + * packets per second. + */ + OVS_METER_ATTR_STATS, /* struct ovs_flow_stats for the meter. */ + OVS_METER_ATTR_BANDS, /* Nested attributes for meter bands. */ + OVS_METER_ATTR_USED, /* u64 msecs last used in monotonic time. */ + OVS_METER_ATTR_CLEAR, /* Flag to clear stats, used. */ + OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */ + OVS_METER_ATTR_MAX_BANDS, /* u32 max number of bands per meter. */ + OVS_METER_ATTR_PAD, + __OVS_METER_ATTR_MAX +}; + +#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1) + +enum ovs_band_attr { + OVS_BAND_ATTR_UNSPEC, + OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */ + OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */ + OVS_BAND_ATTR_BURST, /* u32 burst size in meter units. */ + OVS_BAND_ATTR_STATS, /* struct ovs_flow_stats for the band. */ + __OVS_BAND_ATTR_MAX +}; + +#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1) + +enum ovs_meter_band_type { + OVS_METER_BAND_TYPE_UNSPEC, + OVS_METER_BAND_TYPE_DROP, /* Drop exceeding packets. */ + __OVS_METER_BAND_TYPE_MAX +}; + +#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1) + #endif /* _LINUX_OPENVSWITCH_H */ -- cgit From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Fri, 10 Nov 2017 12:09:43 -0800 Subject: openvswitch: Add meter action support Implements OVS kernel meter action support. Signed-off-by: Andy Zhou Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index d60b9a4cf3d1..4265d7f9e1f2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -838,6 +838,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet. * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. + * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the + * packet, or modify the packet (e.g., change the DSCP field). * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -870,6 +872,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ OVS_ACTION_ATTR_POP_NSH, /* No argument. */ + OVS_ACTION_ATTR_METER, /* u32 meter ID. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit From 0eef304bc9f7d079a1165e8cd2f24b078e9e1f2a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 13 Nov 2017 03:37:06 +0300 Subject: uapi: fix linux/rxrpc.h userspace compilation errors Consistently use types provided by to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:24:2: error: unknown type name 'u16' u16 srx_service; /* service desired */ /usr/include/linux/rxrpc.h:25:2: error: unknown type name 'u16' u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ /usr/include/linux/rxrpc.h:26:2: error: unknown type name 'u16' u16 transport_len; /* length of transport address */ Use __kernel_sa_family_t instead of sa_family_t the same way as uapi/linux/in.h does, to fix the following linux/rxrpc.h userspace compilation errors: /usr/include/linux/rxrpc.h:23:2: error: unknown type name 'sa_family_t' sa_family_t srx_family; /* address family */ /usr/include/linux/rxrpc.h:28:3: error: unknown type name 'sa_family_t' sa_family_t family; /* transport address family */ Fixes: 727f8914477e ("rxrpc: Expose UAPI definitions to userspace") Cc: # v4.14 Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/rxrpc.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 9656aad8f8f7..9d4afea308a4 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -20,12 +20,12 @@ * RxRPC socket address */ struct sockaddr_rxrpc { - sa_family_t srx_family; /* address family */ - u16 srx_service; /* service desired */ - u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ - u16 transport_len; /* length of transport address */ + __kernel_sa_family_t srx_family; /* address family */ + __u16 srx_service; /* service desired */ + __u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + __u16 transport_len; /* length of transport address */ union { - sa_family_t family; /* transport address family */ + __kernel_sa_family_t family; /* transport address family */ struct sockaddr_in sin; /* IPv4 transport address */ struct sockaddr_in6 sin6; /* IPv6 transport address */ } transport; -- cgit From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 14 Nov 2017 06:30:11 +0300 Subject: uapi: fix linux/tls.h userspace compilation error Move inclusion of a private kernel header from uapi/linux/tls.h to its only user - net/tls.h, to fix the following linux/tls.h userspace compilation error: /usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory As to this point uapi/linux/tls.h was totaly unusuable for userspace, cleanup this header file further by moving other redundant includes to net/tls.h. Fixes: 3c4d7559159b ("tls: kernel TLS support") Cc: # v4.13+ Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/tls.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index d5e0682ab837..293b2cdad88d 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -35,10 +35,6 @@ #define _UAPI_LINUX_TLS_H #include -#include -#include -#include -#include /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ -- cgit