From 2d23d0736e3a4a0fdb92b8e46ea476639f16aae8 Mon Sep 17 00:00:00 2001
From: Roee Zamir <roee.zamir@intel.com>
Date: Sun, 6 Aug 2017 11:38:22 +0300
Subject: nl80211: add OCE scan and capability flags

Add Optimized Connectivity Experience (OCE) scan and capability flags.
Some of them unique to OCE and some are stand alone.
And add scan flags to enable/disable them.

Signed-off-by: Roee Zamir <roee.zamir@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 51626b4175c0..76404d8a8863 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4914,6 +4914,15 @@ enum nl80211_feature_flags {
  *	handshake with 802.1X in station mode (will pass EAP frames to the host
  *	and accept the set_pmk/del_pmk commands), doing it in the host might not
  *	be supported.
+ * @NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME: Driver is capable of overriding
+ *	the max channel attribute in the FILS request params IE with the
+ *	actual dwell time.
+ * @NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP: Driver accepts broadcast probe
+ *	response
+ * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE: Driver supports sending
+ *	the first probe request in each channel at rate of at least 5.5Mbps.
+ * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports
+ *	probe request tx deferral and suppression
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4936,6 +4945,10 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_FILS_SK_OFFLOAD,
 	NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK,
 	NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X,
+	NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME,
+	NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP,
+	NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE,
+	NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
@@ -5012,12 +5025,28 @@ enum nl80211_timeout_reason {
  *	locally administered 1, multicast 0) is assumed.
  *	This flag must not be requested when the feature isn't supported, check
  *	the nl80211 feature flags for the device.
+ * @NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME: fill the dwell time in the FILS
+ *	request parameters IE in the probe request
+ * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses
+ * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at
+ *	rate of at least 5.5M. In case non OCE AP is dicovered in the channel,
+ *	only the first probe req in the channel will be sent in high rate.
+ * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request
+ *	tx deferral (dot11FILSProbeDelay shall be set to 15ms)
+ *	and suppression (if it has received a broadcast Probe Response frame,
+ *	Beacon frame or FILS Discovery frame from an AP that the STA considers
+ *	a suitable candidate for (re-)association - suitable in terms of
+ *	SSID and/or RSSI
  */
 enum nl80211_scan_flags {
-	NL80211_SCAN_FLAG_LOW_PRIORITY			= 1<<0,
-	NL80211_SCAN_FLAG_FLUSH				= 1<<1,
-	NL80211_SCAN_FLAG_AP				= 1<<2,
-	NL80211_SCAN_FLAG_RANDOM_ADDR			= 1<<3,
+	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
+	NL80211_SCAN_FLAG_FLUSH					= 1<<1,
+	NL80211_SCAN_FLAG_AP					= 1<<2,
+	NL80211_SCAN_FLAG_RANDOM_ADDR				= 1<<3,
+	NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME			= 1<<4,
+	NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP		= 1<<5,
+	NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE		= 1<<6,
+	NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION	= 1<<7,
 };
 
 /**
-- 
cgit 


From 65026002d69de006e273749bb799d3b01b757eb0 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Fri, 18 Aug 2017 15:31:41 +0300
Subject: nl80211: add an option to allow MFP without requiring it

The user space can now allow the kernel to associate to an AP that
requires MFP or that doesn't have MFP enabled in the same
NL80211_CMD_CONNECT command, by using a new NL80211_MFP_OPTIONAL flag.
The driver / firmware will decide whether to use it or not.

Include a feature bit to advertise support for NL80211_MFP_OPTIONAL.
This allows new user space to run on old kernels and know that it
cannot use the new attribute if it isn't supported.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 76404d8a8863..59ba6ca66a0d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1407,8 +1407,12 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_USE_MFP: Whether management frame protection (IEEE 802.11w) is
  *	used for the association (&enum nl80211_mfp, represented as a u32);
- *	this attribute can be used
- *	with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests
+ *	this attribute can be used with %NL80211_CMD_ASSOCIATE and
+ *	%NL80211_CMD_CONNECT requests. %NL80211_MFP_OPTIONAL is not allowed for
+ *	%NL80211_CMD_ASSOCIATE since user space SME is expected and hence, it
+ *	must have decided whether to use management frame protection or not.
+ *	Setting %NL80211_MFP_OPTIONAL with a %NL80211_CMD_CONNECT request will
+ *	let the driver (or the firmware) decide whether to use MFP or not.
  *
  * @NL80211_ATTR_STA_FLAGS2: Attribute containing a
  *	&struct nl80211_sta_flag_update.
@@ -3947,10 +3951,12 @@ enum nl80211_key_type {
  * enum nl80211_mfp - Management frame protection state
  * @NL80211_MFP_NO: Management frame protection not used
  * @NL80211_MFP_REQUIRED: Management frame protection required
+ * @NL80211_MFP_OPTIONAL: Management frame protection is optional
  */
 enum nl80211_mfp {
 	NL80211_MFP_NO,
 	NL80211_MFP_REQUIRED,
+	NL80211_MFP_OPTIONAL,
 };
 
 enum nl80211_wpa_versions {
@@ -4923,6 +4929,8 @@ enum nl80211_feature_flags {
  *	the first probe request in each channel at rate of at least 5.5Mbps.
  * @NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: Driver supports
  *	probe request tx deferral and suppression
+ * @NL80211_EXT_FEATURE_MFP_OPTIONAL: Driver supports the %NL80211_MFP_OPTIONAL
+ *	value in %NL80211_ATTR_USE_MFP.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4949,6 +4957,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP,
 	NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE,
 	NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
+	NL80211_EXT_FEATURE_MFP_OPTIONAL,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit 


From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001
From: Petar Penkov <peterpenkov96@gmail.com>
Date: Fri, 22 Sep 2017 13:49:14 -0700
Subject: tun: enable NAPI for TUN/TAP driver

Changes TUN driver to use napi_gro_receive() upon receiving packets
rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these
changes and operation is not affected if the flag is disabled.  SKBs
are constructed upon packet arrival and are queued to be processed
later.

The new path was evaluated with a benchmark with the following setup:
Open two tap devices and a receiver thread that reads in a loop for
each device. Start one sender thread and pin all threads to different
CPUs. Send 1M minimum UDP packets to each device and measure sending
time for each of the sending methods:
	napi_gro_receive():	4.90s
	netif_rx_ni():		4.90s
	netif_receive_skb():	7.20s

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tun.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..30b6184884eb 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
+#define IFF_NAPI	0x0010
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
cgit 


From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001
From: Petar Penkov <peterpenkov96@gmail.com>
Date: Fri, 22 Sep 2017 13:49:15 -0700
Subject: tun: enable napi_gro_frags() for TUN/TAP driver

Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iovec_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
Acked-by: Mahesh Bandewar <maheshb@google,com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tun.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 30b6184884eb..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
cgit 


From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from it's
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..e43491ac4823 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +644,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +722,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +730,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
-- 
cgit 


From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 27 Sep 2017 16:12:44 +0300
Subject: net: bridge: add per-port group_fwd_mask with less restrictions

We need to be able to transparently forward most link-local frames via
tunnels (e.g. vxlan, qinq). Currently the bridge's group_fwd_mask has a
mask which restricts the forwarding of STP and LACP, but we need to be able
to forward these over tunnels and control that forwarding on a per-port
basis thus add a new per-port group_fwd_mask option which only disallows
mac pause frames to be forwarded (they're always dropped anyway).
The patch does not change the current default situation - all of the others
are still restricted unless configured for forwarding.
We have successfully tested this patch with LACP and STP forwarding over
VxLAN and qinq tunnels.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..ea87bd708ee9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -325,6 +325,7 @@ enum {
 	IFLA_BRPORT_MCAST_TO_UCAST,
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
+	IFLA_BRPORT_GROUP_FWD_MASK,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
-- 
cgit 


From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:52 -0700
Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject name that contains characters other than
isalnum() and '_'.  It will also reject name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On the input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On the output,
nr_map_ids tells the exact user_map_cnt and it will only
copy up to the userspace's map_ids buffer is allowed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e43491ac4823..bd6348269bf5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -175,6 +175,8 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -210,6 +212,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -812,6 +815,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit 


From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:53 -0700
Subject: bpf: Add map_name to bpf_map_info

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd6348269bf5..6d2137b4cf38 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -190,6 +190,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -829,6 +830,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit 


From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 26 Sep 2017 21:32:42 -0700
Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable
even on IPv6 sockets.

However, it turns out it is perfectly reasonable to want to set freebind
on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP
socket option on such a socket (they're all blindly errored out).

One use case for this is to allow spoofing src ip on a raw socket
via sendmsg cmsg.

Tested:
  built, and booted
  # python
  >>> import socket
  >>> SOL_IP = socket.SOL_IP
  >>> SOL_IPV6 = socket.IPPROTO_IPV6
  >>> IP_FREEBIND = 15
  >>> IPV6_FREEBIND = 78
  >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0)
  >>> s.getsockopt(SOL_IP, IP_FREEBIND)
  0
  >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
  0
  >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1)
  >>> s.getsockopt(SOL_IP, IP_FREEBIND)
  1
  >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND)
  1

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 46444f8fbee4..4f8f3eb0699f 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -284,6 +284,7 @@ struct in6_flowlabel_req {
 #define IPV6_TRANSPARENT        75
 #define IPV6_UNICAST_IF         76
 #define IPV6_RECVFRAGSIZE	77
+#define IPV6_FREEBIND		78
 
 /*
  * Multicast Routing:
-- 
cgit 


From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Fri, 29 Sep 2017 14:21:49 +0200
Subject: cfg80211/nl80211: add a port authorized event

Add an event that indicates that a connection is authorized
(i.e. the 4 way handshake was performed by the driver). This event
should be sent by the driver after sending a connect/roamed event.

This is useful for networks that require 802.1X authentication.
In cases that the driver supports 4 way handshake offload, but the
802.1X authentication is managed by user space, the driver needs to
inform user space right after the 802.11 association was completed
so user space can initialize its 802.1X state machine etc.
However, it is also possible that the AP will choose to skip the
802.1X authentication (e.g. when PMKSA caching is used) and proceed
with the 4 way handshake immediately. In this case the driver needs
to inform user space that 802.1X authentication is no longer required
(e.g. to prevent user space from disconnecting since it did not get
any EAPOLs from the AP).

This is also useful for roaming, in which case it is possible that
the driver used the Fast Transition protocol so 802.1X is not
required.

Since there will now be a dedicated notification indicating that the
connection is authorized, the authorized flag can be removed from the
roamed event. Drivers can send the new port authorized event right
after sending the roamed event to indicate the new AP is already
authorized. This therefore reserves the old PORT_AUTHORIZED attribute.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 59ba6ca66a0d..95832ce03a44 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -569,13 +569,14 @@
  *	authentication/association or not receiving a response from the AP.
  *	Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as
  *	well to remain backwards compatible.
- * @NL80211_CMD_ROAM: notifcation indicating the card/driver roamed by itself.
- *	When the driver roamed in a network that requires 802.1X authentication,
- *	%NL80211_ATTR_PORT_AUTHORIZED should be set if the 802.1X authentication
- *	was done by the driver or if roaming was done using Fast Transition
- *	protocol (in which case 802.1X authentication is not needed). If
- *	%NL80211_ATTR_PORT_AUTHORIZED is not set, user space is responsible for
- *	the 802.1X authentication.
+ *	When establishing a security association, drivers that support 4 way
+ *	handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when
+ *	the 4 way handshake is completed successfully.
+ * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself.
+ *	When a security association was established with the new AP (e.g. if
+ *	the FT protocol was used for roaming or the driver completed the 4 way
+ *	handshake), this event should be followed by an
+ *	%NL80211_CMD_PORT_AUTHORIZED event.
  * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify
  *	userspace that a connection was dropped by the AP or due to other
  *	reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and
@@ -982,6 +983,12 @@
  * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously
  *	configured PMK for the authenticator address identified by
  *	&NL80211_ATTR_MAC.
+ * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way
+ *	handshake was completed successfully by the driver. The BSSID is
+ *	specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake
+ *	offload should send this event after indicating 802.11 association with
+ *	&NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed
+ *	&NL80211_CMD_DISCONNECT should be indicated instead.
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -1185,6 +1192,8 @@ enum nl80211_commands {
 	NL80211_CMD_SET_PMK,
 	NL80211_CMD_DEL_PMK,
 
+	NL80211_CMD_PORT_AUTHORIZED,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2138,10 +2147,7 @@ enum nl80211_commands {
  *	in %NL80211_CMD_CONNECT to indicate that for 802.1X authentication it
  *	wants to use the supported offload of the 4-way handshake.
  * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT.
- * @NL80211_ATTR_PORT_AUTHORIZED: flag attribute used in %NL80211_CMD_ROAMED
- *	notification indicating that that 802.1X authentication was done by
- *	the driver or is not needed (because roaming used the Fast Transition
- *	protocol).
+ * @NL80211_ATTR_PORT_AUTHORIZED: (reserved)
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
-- 
cgit 


From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:13 -0300
Subject: sctp: introduce stream scheduler foundations

This patch introduces the hooks necessary to do stream scheduling, as
per RFC Draft ndata.  It also introduces the first scheduler, which is
what we do today but now factored out: first come first served (FCFS).

With stream scheduling now we have to track which chunk was enqueued on
which stream and be able to select another other than the in front of
the main outqueue. So we introduce a list on sctp_stream_out_ext
structure for this purpose.

We reuse sctp_chunk->transmitted_list space for the list above, as the
chunk cannot belong to the two lists at the same time. By using the
union in there, we can have distinct names for these moments.

sctp_sched_ops are the operations expected to be implemented by each
scheduler. The dequeueing is a bit particular to this implementation but
it is to match how we dequeue packets today. We first dequeue and then
check if it fits the packet and if not, we requeue it at head. Thus why
we don't have a peek operation but have dequeue_done instead, which is
called once the chunk can be safely considered as transmitted.

The check removed from sctp_outq_flush is now performed by
sctp_stream_outq_migrate, which is only called during assoc setup.
(sctp_sendmsg() also checks for it)

The only operation that is foreseen but not yet added here is a way to
signalize that a new packet is starting or that the packet is done, for
round robin scheduler per packet, but is intentionally left to the
patch that actually implements it.

Support for I-DATA chunks, also described in this RFC, with user message
interleaving is straightforward as it just requires the schedulers to
probe for the feature and ignore datamsg boundaries when dequeueing.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6217ff8500a1..4487e7625ddb 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1088,4 +1088,10 @@ struct sctp_add_streams {
 	uint16_t sas_outstrms;
 };
 
+/* SCTP Stream schedulers */
+enum sctp_sched_type {
+	SCTP_SS_FCFS,
+	SCTP_SS_MAX = SCTP_SS_FCFS
+};
+
 #endif /* _UAPI_SCTP_H */
-- 
cgit 


From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:14 -0300
Subject: sctp: add sockopt to get/set stream scheduler

As defined per RFC Draft ndata Section 4.3.2, named as
SCTP_STREAM_SCHEDULER.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 4487e7625ddb..0050f10087d2 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_RESET_ASSOC	120
 #define SCTP_ADD_STREAMS	121
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
+#define SCTP_STREAM_SCHEDULER	123
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
-- 
cgit 


From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:15 -0300
Subject: sctp: add sockopt to get/set stream scheduler parameters

As defined per RFC Draft ndata Section 4.3.3, named as
SCTP_STREAM_SCHEDULER_VALUE.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 0050f10087d2..00ac417d2c4f 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ADD_STREAMS	121
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
 #define SCTP_STREAM_SCHEDULER	123
+#define SCTP_STREAM_SCHEDULER_VALUE	124
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -815,6 +816,12 @@ struct sctp_assoc_value {
     uint32_t                assoc_value;
 };
 
+struct sctp_stream_value {
+	sctp_assoc_t assoc_id;
+	uint16_t stream_id;
+	uint16_t stream_value;
+};
+
 /*
  * 7.2.2 Peer Address Information
  *
-- 
cgit 


From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:16 -0300
Subject: sctp: introduce priority based stream scheduler

This patch introduces RFC Draft ndata section 3.4 Priority Based
Scheduler (SCTP_SS_PRIO).

It works by having a struct sctp_stream_priority for each priority
configured. This struct is then enlisted on a queue ordered per priority
if, and only if, there is a stream with data queued, so that dequeueing
is very straightforward: either finish current datamsg or simply dequeue
from the highest priority queued, which is the next stream pointed, and
that's it.

If there are multiple streams assigned with the same priority and with
data queued, it will do round robin amongst them while respecting
datamsgs boundaries (when not using idata chunks), to be reasonably
fair.

We intentionally don't maintain a list of priorities nor a list of all
streams with the same priority to save memory. The first would mean at
least 2 other pointers per priority (which, for 1000 priorities, that
can mean 16kB) and the second would also mean 2 other pointers but per
stream. As SCTP supports up to 65535 streams on a given asoc, that's
1MB. This impacts when giving a priority to some stream, as we have to
find out if the new priority is already being used and if we can free
the old one, and also when tearing down.

The new fields in struct sctp_stream_out_ext and sctp_stream are added
under a union because that memory is to be shared with other schedulers.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 00ac417d2c4f..850fa8b29d7e 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1099,7 +1099,8 @@ struct sctp_add_streams {
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
-	SCTP_SS_MAX = SCTP_SS_FCFS
+	SCTP_SS_PRIO,
+	SCTP_SS_MAX = SCTP_SS_PRIO
 };
 
 #endif /* _UAPI_SCTP_H */
-- 
cgit 


From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 3 Oct 2017 19:20:17 -0300
Subject: sctp: introduce round robin stream scheduler

This patch introduces RFC Draft ndata section 3.2 Priority Based
Scheduler (SCTP_SS_RR).

Works by maintaining a list of enqueued streams and tracking the last
one used to send data. When the datamsg is done, it switches to the next
stream.

See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 850fa8b29d7e..6cd7d416ca40 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1100,7 +1100,8 @@ struct sctp_add_streams {
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
 	SCTP_SS_PRIO,
-	SCTP_SS_MAX = SCTP_SS_PRIO
+	SCTP_SS_RR,
+	SCTP_SS_MAX = SCTP_SS_RR
 };
 
 #endif /* _UAPI_SCTP_H */
-- 
cgit 


From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:21 -0700
Subject: bpf: multi program support for cgroup+bpf

introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple
bpf programs to a cgroup.

The difference between three possible flags for BPF_PROG_ATTACH command:
- NONE(default): No further bpf programs allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
  the program in this cgroup yields to sub-cgroup program.
- BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
  that cgroup program gets run in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior. It only clarifies the semantics in relation
to new flag.

Only one program is allowed to be attached to a cgroup with
NONE or BPF_F_ALLOW_OVERRIDE flag.
Multiple programs are allowed to be attached to a cgroup with
BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
(those that were attached first, run first)
The programs of sub-cgroup are executed first, then programs of
this cgroup and then programs of parent cgroup.
All eligible programs are executed regardless of return code from
earlier programs.

To allow efficient execution of multiple programs attached to a cgroup
and to avoid penalizing cgroups without any programs attached
introduce 'struct bpf_prog_array' which is RCU protected array
of pointers to bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6d2137b4cf38..762f74bc6c47 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,11 +143,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
-- 
cgit 


From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

introduce BPF_PROG_QUERY command to retrieve a set of either
attached programs to given cgroup or a set of effective programs
that will execute for events within a cgroup

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 762f74bc6c47..cb2b9f95160a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -211,6 +212,9 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 #define BPF_OBJ_NAME_LEN 16U
 
 union bpf_attr {
@@ -289,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
-- 
cgit 


From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 3 Oct 2017 13:53:23 +0200
Subject: dev: advertise the new nsid when the netns iface changes

x-netns interfaces are bound to two netns: the link netns and the upper
netns. Usually, this kind of interfaces is created in the link netns and
then moved to the upper netns. At the end, the interface is visible only
in the upper netns. The link nsid is advertised via netlink in the upper
netns, thus the user always knows where is the link part.

There is no such mechanism in the link netns. When the interface is moved
to another netns, the user cannot "follow" it.
This patch adds a new netlink attribute which helps to follow an interface
which moves to another netns. When the interface is unregistered, the new
nsid is advertised. If the interface is a x-netns interface (ie
rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed.

CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ea87bd708ee9..cd580fc0e58f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -158,6 +158,7 @@ enum {
 	IFLA_PAD,
 	IFLA_XDP,
 	IFLA_EVENT,
+	IFLA_NEW_NETNSID,
 	__IFLA_MAX
 };
 
-- 
cgit 


From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 5 Oct 2017 16:46:53 -0400
Subject: VSOCK: add sock_diag interface

This patch adds the sock_diag interface for querying sockets from
userspace.  Tools like ss(8) and netstat(8) can use this interface to
list open sockets.

The userspace ABI is defined in <linux/vm_sockets_diag.h> and includes
netlink request and response structs.  The request can query sockets
based on their sk_state (e.g. listening sockets only) and the response
contains socket information fields including the local/remote addresses,
inode number, etc.

This patch does not dump VMCI pending sockets because I have only tested
the virtio transport, which does not use pending sockets.  Support can
be added later by extending vsock_diag_dump() if needed by VMCI users.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/vm_sockets_diag.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 include/uapi/linux/vm_sockets_diag.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h
new file mode 100644
index 000000000000..14cd7dc5a187
--- /dev/null
+++ b/include/uapi/linux/vm_sockets_diag.h
@@ -0,0 +1,33 @@
+/* AF_VSOCK sock_diag(7) interface for querying open sockets */
+
+#ifndef _UAPI__VM_SOCKETS_DIAG_H__
+#define _UAPI__VM_SOCKETS_DIAG_H__
+
+#include <linux/types.h>
+
+/* Request */
+struct vsock_diag_req {
+	__u8	sdiag_family;	/* must be AF_VSOCK */
+	__u8	sdiag_protocol;	/* must be 0 */
+	__u16	pad;		/* must be 0 */
+	__u32	vdiag_states;	/* query bitmap (e.g. 1 << TCP_LISTEN) */
+	__u32	vdiag_ino;	/* must be 0 (reserved) */
+	__u32	vdiag_show;	/* must be 0 (reserved) */
+	__u32	vdiag_cookie[2];
+};
+
+/* Response */
+struct vsock_diag_msg {
+	__u8	vdiag_family;	/* AF_VSOCK */
+	__u8	vdiag_type;	/* SOCK_STREAM or SOCK_DGRAM */
+	__u8	vdiag_state;	/* sk_state (e.g. TCP_LISTEN) */
+	__u8	vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */
+	__u32   vdiag_src_cid;
+	__u32   vdiag_src_port;
+	__u32   vdiag_dst_cid;
+	__u32   vdiag_dst_port;
+	__u32	vdiag_ino;
+	__u32	vdiag_cookie[2];
+};
+
+#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */
-- 
cgit 


From bdc476413dcdb5c38a7dec90fb2bca327021273a Mon Sep 17 00:00:00 2001
From: Amine Kherbouche <amine.kherbouche@6wind.com>
Date: Wed, 4 Oct 2017 19:35:57 +0200
Subject: ip_tunnel: add mpls over gre support

This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel
API by simply adding ipgre_tunnel_encap_(add|del)_mpls_ops() and the new
tunnel type TUNNEL_ENCAP_MPLS.

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tunnel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 2e520883c054..a2f48c01365e 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
 	TUNNEL_ENCAP_NONE,
 	TUNNEL_ENCAP_FOU,
 	TUNNEL_ENCAP_GUE,
+	TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
-- 
cgit 


From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:20 -0700
Subject: bpf: add helper bpf_perf_event_read_value for perf event array map

Hardware pmu counters are limited resources. When there are more
pmu based perf events opened than available counters, kernel will
multiplex these events so each event gets certain percentage
(but not 100%) of the pmu time. In case that multiplexing happens,
the number of samples or counter value will not reflect the
case compared to no multiplexing. This makes comparison between
different runs difficult.

Typically, the number of samples or counter value should be
normalized before comparing to other experiments. The typical
normalization is done like:
  normalized_num_samples = num_samples * time_enabled / time_running
  normalized_counter_value = counter_value * time_enabled / time_running
where time_enabled is the time enabled for event and time_running is
the time running for event since last normalization.

This patch adds helper bpf_perf_event_read_value for kprobed based perf
event array map, to read perf counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.
To achieve scaling factor between two bpf invocations, users
can can use cpu_id as the key (which is typical for perf array usage model)
to remember the previous value and do the calculation inside the
bpf program.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6082faf5fd2a..7b57a212c7d7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -641,6 +641,14 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: An positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -697,7 +705,8 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
-	FN(xdp_adjust_meta),
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +750,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -934,4 +945,10 @@ enum {
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit 


From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:22 -0700
Subject: bpf: add helper bpf_perf_prog_read_value

This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf
programs, to read event counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.

The typical use case for perf event based bpf program is to attach itself
to a single event. In such cases, if it is desirable to get scaling factor
between two bpf invocations, users can can save the time values in a map,
and use the value from the map and the current value to calculate
the scaling factor.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7b57a212c7d7..5bbbec17aa5a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -649,6 +649,13 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return : 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -706,7 +713,8 @@ union bpf_attr {
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
-	FN(perf_event_read_value),
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit 


From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:12 -0700
Subject: bpf: Use char in prog and map name

Instead of u8, use char for prog and map name.  It can avoid the
userspace tool getting compiler's signess warning.  The
bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and
bpf_map_info are changed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5bbbec17aa5a..6db9e1d679cd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -230,7 +230,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
-		__u8	map_name[BPF_OBJ_NAME_LEN];
+		char	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
-		__u8		prog_name[BPF_OBJ_NAME_LEN];
+		char		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -888,7 +888,7 @@ struct bpf_prog_info {
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -898,7 +898,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
-	__u8  name[BPF_OBJ_NAME_LEN];
+	char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit 


From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 6 Oct 2017 22:12:37 -0700
Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd
 flood

This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to
suppress arp and nd flood on bridge ports. It implements
rfc7432, section 10.
https://tools.ietf.org/html/rfc7432#section-10
for ethernet VPN deployments. It is similar to the existing
BR_PROXYARP* flags but has a few semantic differences to conform
to EVPN standard. Unlike the existing flags, this new flag suppresses
flood of all neigh discovery packets (arp and nd) to tunnel ports.
Supports both vlan filtering and non-vlan filtering bridges.

In case of EVPN, it is mainly used to avoid flooding
of arp and nd packets to tunnel ports like vxlan.

This patch adds netlink and sysfs support to set this bridge port
flag.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cd580fc0e58f..b037e0ab1975 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -327,6 +327,7 @@ enum {
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
 	IFLA_BRPORT_GROUP_FWD_MASK,
+	IFLA_BRPORT_NEIGH_SUPPRESS,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
-- 
cgit 


From ceaa001a170e43608854d5290a48064f57b565ed Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 4 Oct 2017 17:03:12 -0700
Subject: openvswitch: Add erspan tunnel support.

Add erspan netlink interface for OVS.

Signed-off-by: William Tu <u9012063@gmail.com>
Cc: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4cab82e..efdbfbfd3ee2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_SRC,		/* struct in6_addr src IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
+	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* be32 ERSPAN index. */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
-- 
cgit 


From b8226962b1c49c784aeddb9d2fafbf53dfdc2190 Mon Sep 17 00:00:00 2001
From: Eric Garver <e@erig.me>
Date: Tue, 10 Oct 2017 16:54:44 -0400
Subject: openvswitch: add ct_clear action

This adds a ct_clear action for clearing conntrack state. ct_clear is
currently implemented in OVS userspace, but is not backed by an action
in the kernel datapath. This is useful for flows that may modify a
packet tuple after a ct lookup has already occurred.

Signed-off-by: Eric Garver <e@erig.me>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index efdbfbfd3ee2..0cd6f8833147 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -807,6 +807,7 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -836,6 +837,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit 


From 1ea4ff3e9f0b8d53e680a2bb9e8e644bf03aeb4d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 13 Sep 2017 16:07:22 +0200
Subject: cfg80211: support reloading regulatory database

If the regulatory database is loaded, and then updated, it may
be necessary to reload it. Add an nl80211 command to do this.

Note that this just reloads the database, it doesn't re-apply
the rules from it immediately.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 95832ce03a44..f882fe1f9709 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -990,6 +990,8 @@
  *	&NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed
  *	&NL80211_CMD_DISCONNECT should be indicated instead.
  *
+ * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1194,6 +1196,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_PORT_AUTHORIZED,
 
+	NL80211_CMD_RELOAD_REGDB,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
-- 
cgit 


From 28978713c51b0a70acf748f76f9d6d2d20dcf980 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 10 Oct 2017 23:45:18 -0700
Subject: net: qrtr: Move constants to header file

The constants are used by both the name server and clients, so clarify
their value and move them to the uapi header.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/qrtr.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
index 9d76c566f66e..63e8803e4d90 100644
--- a/include/uapi/linux/qrtr.h
+++ b/include/uapi/linux/qrtr.h
@@ -4,6 +4,9 @@
 #include <linux/socket.h>
 #include <linux/types.h>
 
+#define QRTR_NODE_BCAST	0xffffffffu
+#define QRTR_PORT_CTRL	0xfffffffeu
+
 struct sockaddr_qrtr {
 	__kernel_sa_family_t sq_family;
 	__u32 sq_node;
-- 
cgit 


From da7653f0faabbe45eb2d3fd6e4b400fe003e81ae Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 10 Oct 2017 23:45:19 -0700
Subject: net: qrtr: Add control packet definition to uapi

The QMUX protocol specification defines structure of the special control
packet messages being sent between handlers of the control port.

Add these to the uapi header, as this structure and the associated types
are shared between the kernel and all userspace handlers of control
messages.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/qrtr.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
index 63e8803e4d90..179af64846e0 100644
--- a/include/uapi/linux/qrtr.h
+++ b/include/uapi/linux/qrtr.h
@@ -13,4 +13,36 @@ struct sockaddr_qrtr {
 	__u32 sq_port;
 };
 
+enum qrtr_pkt_type {
+	QRTR_TYPE_DATA		= 1,
+	QRTR_TYPE_HELLO		= 2,
+	QRTR_TYPE_BYE		= 3,
+	QRTR_TYPE_NEW_SERVER	= 4,
+	QRTR_TYPE_DEL_SERVER	= 5,
+	QRTR_TYPE_DEL_CLIENT	= 6,
+	QRTR_TYPE_RESUME_TX	= 7,
+	QRTR_TYPE_EXIT          = 8,
+	QRTR_TYPE_PING          = 9,
+	QRTR_TYPE_NEW_LOOKUP	= 10,
+	QRTR_TYPE_DEL_LOOKUP	= 11,
+};
+
+struct qrtr_ctrl_pkt {
+	__le32 cmd;
+
+	union {
+		struct {
+			__le32 service;
+			__le32 instance;
+			__le32 node;
+			__le32 port;
+		} server;
+
+		struct {
+			__le32 node;
+			__le32 port;
+		} client;
+	};
+} __packed;
+
 #endif /* _LINUX_QRTR_H */
-- 
cgit 


From ad2d116c5242875bba27522682ec5ba7f0df75f0 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 11 Oct 2017 11:14:49 -0700
Subject: sched: tc_mirred: Remove whitespaces

This file contains unnecessary whitespaces as newlines, remove them,
found by looking at what struct tc_mirred looks like.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_mirred.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h
index 3d7a2b352a62..69038c29e8a9 100644
--- a/include/uapi/linux/tc_act/tc_mirred.h
+++ b/include/uapi/linux/tc_act/tc_mirred.h
@@ -9,13 +9,13 @@
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
-                                                                                
+
 struct tc_mirred {
 	tc_gen;
 	int                     eaction;   /* one of IN/EGRESS_MIRROR/REDIR */
 	__u32                   ifindex;  /* ifindex of egress port */
 };
-                                                                                
+
 enum {
 	TCA_MIRRED_UNSPEC,
 	TCA_MIRRED_TM,
@@ -24,5 +24,5 @@ enum {
 	__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
-                                                                                
+
 #endif
-- 
cgit 


From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 13 Oct 2017 11:04:23 +0200
Subject: tipc: introduce communication groups

As a preparation for introducing flow control for multicast and datagram
messaging we need a more strictly defined framework than we have now. A
socket must be able keep track of exactly how many and which other
sockets it is allowed to communicate with at any moment, and keep the
necessary state for those.

We therefore introduce a new concept we have named Communication Group.
Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN.
The call takes four parameters: 'type' serves as group identifier,
'instance' serves as an logical member identifier, and 'scope' indicates
the visibility of the group (node/cluster/zone). Finally, 'flags' makes
it possible to set certain properties for the member. For now, there is
only one flag, indicating if the creator of the socket wants to receive
a copy of broadcast or multicast messages it is sending via the socket,
and if wants to be eligible as destination for its own anycasts.

A group is closed, i.e., sockets which have not joined a group will
not be able to send messages to or receive messages from members of
the group, and vice versa.

Any member of a group can send multicast ('group broadcast') messages
to all group members, optionally including itself, using the primitive
send(). The messages are received via the recvmsg() primitive. A socket
can only be member of one group at a time.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 5351b08c897a..5f7b2c4a09ab 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -231,6 +231,20 @@ struct sockaddr_tipc {
 #define TIPC_SOCK_RECVQ_DEPTH	132	/* Default: none (read only) */
 #define TIPC_MCAST_BROADCAST    133     /* Default: TIPC selects. No arg */
 #define TIPC_MCAST_REPLICAST    134     /* Default: TIPC selects. No arg */
+#define TIPC_GROUP_JOIN         135     /* Takes struct tipc_group_req* */
+#define TIPC_GROUP_LEAVE        136     /* No argument */
+
+/*
+ * Flag values
+ */
+#define TIPC_GROUP_LOOPBACK     0x1  /* Receive copy of sent msg when match */
+
+struct tipc_group_req {
+	__u32 type;      /* group id */
+	__u32 instance;  /* member id */
+	__u32 scope;     /* zone/cluster/node */
+	__u32 flags;
+};
 
 /*
  * Maximum sizes of TIPC bearer-related names (including terminating NULL)
-- 
cgit 


From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 13 Oct 2017 11:04:25 +0200
Subject: tipc: receive group membership events via member socket

Like with any other service, group members' availability can be
subscribed for by connecting to be topology server. However, because
the events arrive via a different socket than the member socket, there
is a real risk that membership events my arrive out of synch with the
actual JOIN/LEAVE action. I.e., it is possible to receive the first
messages from a new member before the corresponding JOIN event arrives,
just as it is possible to receive the last messages from a leaving
member after the LEAVE event has already been received.

Since each member socket is internally also subscribing for membership
events, we now fix this problem by passing those events on to the user
via the member socket. We leverage the already present member synch-
ronization protocol to guarantee correct message/event order. An event
is delivered to the user as an empty message where the two source
addresses identify the new/lost member. Furthermore, we set the MSG_OOB
bit in the message flags to mark it as an event. If the event is an
indication about a member loss we also set the MSG_EOR bit, so it can
be distinguished from a member addition event.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 5f7b2c4a09ab..ef41c11a7f38 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -238,6 +238,7 @@ struct sockaddr_tipc {
  * Flag values
  */
 #define TIPC_GROUP_LOOPBACK     0x1  /* Receive copy of sent msg when match */
+#define TIPC_GROUP_MEMBER_EVTS  0x2  /* Receive membership events in socket */
 
 struct tipc_group_req {
 	__u32 type;      /* group id */
-- 
cgit 


From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 7 Sep 2017 04:00:06 -0700
Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio

The offload types currently supported in mqprio are 0 (no offload) and
1 (offload only TCs) by setting these values for the 'hw' option. If
offloads are supported by setting the 'hw' option to 1, the default
offload mode is 'dcb' where only the TC values are offloaded to the
device. This patch introduces a new hardware offload mode called
'channel' with 'hw' set to 1 in mqprio which makes full use of the
mqprio options, the TCs, the queue configurations and the QoS parameters
for the TCs. This is achieved through a new netlink attribute for the
'mode' option which takes values such as 'dcb' (default) and 'channel'.
The 'channel' mode also supports QoS attributes for traffic class such as
minimum and maximum values for bandwidth rate limits.

This patch enables configuring additional HW shaper attributes associated
with a traffic class. Currently the shaper for bandwidth rate limiting is
supported which takes options such as minimum and maximum bandwidth rates
and are offloaded to the hardware in the 'channel' mode. The min and max
limits for bandwidth rates are provided by the user along with the TCs
and the queue configurations when creating the mqprio qdisc. The interface
can be extended to support new HW shapers in future through the 'shaper'
attribute.

Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading
mqprio queue options and use this to be shared between the kernel and
device driver. This contains a copy of the existing data structure
for mqprio queue options. This new data structure can be extended when
adding new attributes for traffic class such as mode, shaper, shaper
parameters (bandwidth rate limits). The existing data structure for mqprio
queue options will be shared between the kernel and userspace.

Example:
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

To dump the bandwidth rates:

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
             queues:(0:3) (4:7)
             mode:channel
             shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/pkt_sched.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..e95b5c9b9fad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+	TC_MQPRIO_MODE_DCB,
+	TC_MQPRIO_MODE_CHANNEL,
+	__TC_MQPRIO_MODE_MAX
+};
+
+#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+	TC_MQPRIO_SHAPER_DCB,
+	TC_MQPRIO_SHAPER_BW_RATE,	/* Add new shapers below */
+	__TC_MQPRIO_SHAPER_MAX
+};
+
+#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
 struct tc_mqprio_qopt {
 	__u8	num_tc;
 	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
 	__u16	offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE		0x1
+#define TC_MQPRIO_F_SHAPER		0x2
+#define TC_MQPRIO_F_MIN_RATE		0x4
+#define TC_MQPRIO_F_MAX_RATE		0x8
+
+enum {
+	TCA_MQPRIO_UNSPEC,
+	TCA_MQPRIO_MODE,
+	TCA_MQPRIO_SHAPER,
+	TCA_MQPRIO_MIN_RATE64,
+	TCA_MQPRIO_MAX_RATE64,
+	__TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
-- 
cgit 


From 32302902ff093891d8e64439cbb8ceae83e21ef8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 12 Oct 2017 11:38:45 -0700
Subject: mqprio: Reserve last 32 classid values for HW traffic classes and
 misc IDs

This patch makes a slight tweak to mqprio in order to bring the
classid values used back in line with what is used for mq. The general idea
is to reserve values :ffe0 - :ffef to identify hardware traffic classes
normally reported via dev->num_tc. By doing this we can maintain a
consistent behavior with mq for classid where :1 - :ffdf will represent a
physical qdisc mapped onto a Tx queue represented by classid - 1, and the
traffic classes will be mapped onto a known subset of classid values
reserved for our virtual qdiscs.

Note I reserved the range from :fff0 - :ffff since this way we might be
able to reuse these classid values with clsact and ingress which would mean
that for mq, mqprio, ingress, and clsact we should be able to maintain a
similar classid layout.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e95b5c9b9fad..e7cc3d3c7421 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -74,6 +74,7 @@ struct tc_estimator {
 #define TC_H_INGRESS    (0xFFFFFFF1U)
 #define TC_H_CLSACT	TC_H_INGRESS
 
+#define TC_H_MIN_PRIORITY	0xFFE0U
 #define TC_H_MIN_INGRESS	0xFFF2U
 #define TC_H_MIN_EGRESS		0xFFF3U
 
-- 
cgit 


From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:28 +0200
Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP

The 'cpumap' is primarily used as a backend map for XDP BPF helper
call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.

This patch implement the main part of the map.  It is not connected to
the XDP redirect system yet, and no SKB allocation are done yet.

The main concern in this patch is to ensure the datapath can run
without any locking.  This adds complexity to the setup and tear-down
procedure, which assumptions are extra carefully documented in the
code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure CPUs added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all notice by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, involved use of
   rcu_barrier() and kthread_run only exit after queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and gramma by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
-- 
cgit 


From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Oct 2017 11:22:51 -0700
Subject: tcp: socket option to set TCP fast open key

New socket option TCP_FASTOPEN_KEY to allow different keys per
listener.  The listener by default uses the global key until the
socket option is set.  The key is a 16 bytes long binary data. This
option has no effect on regular non-listener TCP sockets.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 15c25eccab2b..69c7493e42f8 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -119,6 +119,7 @@ enum {
 #define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
 #define TCP_ULP			31	/* Attach a ULP to a TCP connection */
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
+#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
-- 
cgit 


From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce the map read/write flags to the eBPF syscalls that returns the
map fd. The flags is used to set up the file mode when construct a new
file descriptor for bpf maps. To not break the backward capability, the
f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise
it should be O_RDONLY or O_WRONLY. When the userspace want to modify or
read the map content, it will check the file mode to see if it is
allowed to make the change.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4303fb6c3817..d83f95ea6a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,10 @@ enum bpf_attach_type {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -260,6 +264,7 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
+		__u32		file_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -287,6 +292,7 @@ union bpf_attr {
 			__u32		map_id;
 		};
 		__u32		next_id;
+		__u32		open_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
-- 
cgit 


From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 20 Oct 2017 11:05:39 -0700
Subject: bpf: add support for BPF_SOCK_OPS_BASE_RTT

A congestion control algorithm can make a call to the BPF socket_ops
program to request the base RTT. The base RTT can be congestion control
dependent and is meant to represent a congestion threshold such that
RTTs above it indicate congestion. This is especially useful for flows
within a DC where the base RTT is easy to obtain.

Being provided a base RTT solves a basic problem in RTT based congestion
avoidance algorithms (such as Vegas, NV and BBR). Although it is easy
to get the base RTT when the network is not congested, it is very
diffcult to do when it is very congested. Newer connections get an
inflated value of the base RTT leading to unfariness (newer flows with a
larger base RTT get more bandwidth). As a result, RTT based congestion
avoidance algorithms tend to update their base RTTs to improve fairness.
In very congested networks this can lead to base RTT inflation, reducing
the ability of these RTT based congestion control algorithms to prevent
congestion.

Note that in my experiments with TCP-NV, the base RTT provided can be
much larger than the actual hardware RTT. For example, experimenting
with hosts within a rack where the hardware RTT is 16-20us, I've used
base RTTs up to 150us. The effect of using a larger base RTT is that the
congestion avoidance algorithm will allow more queueing. When there are
only a few flows the main effect is larger measured RTTs and RPC
latencies due to the increased queueing. When there are a lot of flows,
a larger base RTT can lead to more congestion and more packet drops.
For this case, where the hardware RTT is 20us, a base RTT of 80us
produces good results.

This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this
set adds support for using it in TCP-NV. Further study and testing is
needed before support can be added to other delay based congestion
avoidance algorithms.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d83f95ea6a1b..1aca744c220f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -955,6 +955,13 @@ enum {
 	BPF_SOCK_OPS_NEEDS_ECN,		/* If connection's congestion control
 					 * needs ECN
 					 */
+	BPF_SOCK_OPS_BASE_RTT,		/* Get base RTT. The correct value is
+					 * based on the path and may be
+					 * dependent on the congestion control
+					 * algorithm. In general it indicates
+					 * a congestion threshold. RTTs above
+					 * this indicate congestion
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
-- 
cgit 


From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 20 Oct 2017 11:05:40 -0700
Subject: bpf: Adding helper function bpf_getsockops

Adding support for helper function bpf_getsockops to socket_ops BPF
programs. This patch only supports TCP_CONGESTION.

Signed-off-by: Vlad Vysotsky <vlad@cs.ucla.edu>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1aca744c220f..f650346aaa1a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -613,12 +613,22 @@ union bpf_attr {
  * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
  *     Calls setsockopt. Not all opts are available, only those with
  *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPROTO_TCP
+ *     Supported levels: SOL_SOCKET and IPPROTO_TCP
  *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPROTO_TCP
+ *     @level: SOL_SOCKET or IPPROTO_TCP
  *     @optname: option name
  *     @optval: pointer to option value
- *     @optlen: length of optval in byes
+ *     @optlen: length of optval in bytes
+ *     Return: 0 or negative error
+ *
+ * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
+ *     Calls getsockopt. Not all opts are available.
+ *     Supported levels: IPPROTO_TCP
+ *     @bpf_socket: pointer to bpf_socket
+ *     @level: IPPROTO_TCP
+ *     @optname: option name
+ *     @optval: pointer to option value
+ *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
@@ -721,7 +731,8 @@ union bpf_attr {
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
-	FN(perf_prog_read_value),
+	FN(perf_prog_read_value),	\
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit 


From 40b16b9be5773a314948656c96adf7bf7cfdbd0b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 21 Oct 2017 11:45:46 +0200
Subject: batman-adv: use inline kernel-doc for uapi constants

The enums of constants for netlink tends to become rather large over time.
Documenting them is easier when the kernel-doc is actually next to constant
and not in a different block above the enum.

Also inline kernel-doc allows multi-paragraph description. This could be
required to better document the netlink command types and the expected
return values.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 369 +++++++++++++++++++++++++++++++---------
 1 file changed, 290 insertions(+), 79 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a83ddb7b63db..efd641c8a5d6 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -24,20 +24,6 @@
 
 /**
  * enum batadv_tt_client_flags - TT client specific flags
- * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
- * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
- *  update telling its new real location has not been received/sent yet
- * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
- *  This information is used by the "AP Isolation" feature
- * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
- *  information is used by the Extended Isolation feature
- * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
- * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
- *  not been announced yet
- * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
- *  in the table for one more originator interval for consistency purposes
- * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
- *  the network but no nnode has already announced it
  *
  * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
  * Bits from 8 to 15 are called _local flags_ because they are used for local
@@ -48,160 +34,385 @@
  * in the TT CRC computation.
  */
 enum batadv_tt_client_flags {
+	/**
+	 * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
+	 */
 	BATADV_TT_CLIENT_DEL     = (1 << 0),
+
+	/**
+	 * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and
+	 * the new update telling its new real location has not been
+	 * received/sent yet
+	 */
 	BATADV_TT_CLIENT_ROAM    = (1 << 1),
+
+	/**
+	 * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi
+	 * interface. This information is used by the "AP Isolation" feature
+	 */
 	BATADV_TT_CLIENT_WIFI    = (1 << 4),
+
+	/**
+	 * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
+	 * information is used by the Extended Isolation feature
+	 */
 	BATADV_TT_CLIENT_ISOLA	 = (1 << 5),
+
+	/**
+	 * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from
+	 * the table
+	 */
 	BATADV_TT_CLIENT_NOPURGE = (1 << 8),
+
+	/**
+	 * @BATADV_TT_CLIENT_NEW: this client has been added to the local table
+	 * but has not been announced yet
+	 */
 	BATADV_TT_CLIENT_NEW     = (1 << 9),
+
+	/**
+	 * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it
+	 * is kept in the table for one more originator interval for consistency
+	 * purposes
+	 */
 	BATADV_TT_CLIENT_PENDING = (1 << 10),
+
+	/**
+	 * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be
+	 * part of the network but no nnode has already announced it
+	 */
 	BATADV_TT_CLIENT_TEMP	 = (1 << 11),
 };
 
 /**
  * enum batadv_nl_attrs - batman-adv netlink attributes
- *
- * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors
- * @BATADV_ATTR_VERSION: batman-adv version string
- * @BATADV_ATTR_ALGO_NAME: name of routing algorithm
- * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface
- * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface
- * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface
- * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface
- * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface
- * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv interface
- * @BATADV_ATTR_ORIG_ADDRESS: originator mac address
- * @BATADV_ATTR_TPMETER_RESULT: result of run (see batadv_tp_meter_status)
- * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took
- * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run
- * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
- * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
- * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
- * @BATADV_ATTR_TT_ADDRESS: Client MAC address
- * @BATADV_ATTR_TT_TTVN: Translation table version
- * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version
- * @BATADV_ATTR_TT_CRC32: CRC32 over translation table
- * @BATADV_ATTR_TT_VID: VLAN ID
- * @BATADV_ATTR_TT_FLAGS: Translation table client flags
- * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
- * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
- * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
- * @BATADV_ATTR_TQ: TQ to neighbour
- * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
- * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
- * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
- * @BATADV_ATTR_ROUTER: Gateway router MAC address
- * @BATADV_ATTR_BLA_OWN: Flag indicating own originator
- * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address
- * @BATADV_ATTR_BLA_VID: BLA VLAN ID
- * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address
- * @BATADV_ATTR_BLA_CRC: BLA CRC
- * @__BATADV_ATTR_AFTER_LAST: internal use
- * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
- * @BATADV_ATTR_MAX: highest attribute number currently defined
  */
 enum batadv_nl_attrs {
+	/**
+	 * @BATADV_ATTR_UNSPEC: unspecified attribute to catch errors
+	 */
 	BATADV_ATTR_UNSPEC,
+
+	/**
+	 * @BATADV_ATTR_VERSION: batman-adv version string
+	 */
 	BATADV_ATTR_VERSION,
+
+	/**
+	 * @BATADV_ATTR_ALGO_NAME: name of routing algorithm
+	 */
 	BATADV_ATTR_ALGO_NAME,
+
+	/**
+	 * @BATADV_ATTR_MESH_IFINDEX: index of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_IFINDEX,
+
+	/**
+	 * @BATADV_ATTR_MESH_IFNAME: name of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_IFNAME,
+
+	/**
+	 * @BATADV_ATTR_MESH_ADDRESS: mac address of the batman-adv interface
+	 */
 	BATADV_ATTR_MESH_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_HARD_IFINDEX: index of the non-batman-adv interface
+	 */
 	BATADV_ATTR_HARD_IFINDEX,
+
+	/**
+	 * @BATADV_ATTR_HARD_IFNAME: name of the non-batman-adv interface
+	 */
 	BATADV_ATTR_HARD_IFNAME,
+
+	/**
+	 * @BATADV_ATTR_HARD_ADDRESS: mac address of the non-batman-adv
+	 * interface
+	 */
 	BATADV_ATTR_HARD_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_ORIG_ADDRESS: originator mac address
+	 */
 	BATADV_ATTR_ORIG_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_RESULT: result of run (see
+	 * batadv_tp_meter_status)
+	 */
 	BATADV_ATTR_TPMETER_RESULT,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_TEST_TIME: time (msec) the run took
+	 */
 	BATADV_ATTR_TPMETER_TEST_TIME,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run
+	 */
 	BATADV_ATTR_TPMETER_BYTES,
+
+	/**
+	 * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
+	 */
 	BATADV_ATTR_TPMETER_COOKIE,
+
+	/**
+	 * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
+	 */
 	BATADV_ATTR_PAD,
+
+	/**
+	 * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
+	 */
 	BATADV_ATTR_ACTIVE,
+
+	/**
+	 * @BATADV_ATTR_TT_ADDRESS: Client MAC address
+	 */
 	BATADV_ATTR_TT_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TT_TTVN: Translation table version
+	 */
 	BATADV_ATTR_TT_TTVN,
+
+	/**
+	 * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version
+	 */
 	BATADV_ATTR_TT_LAST_TTVN,
+
+	/**
+	 * @BATADV_ATTR_TT_CRC32: CRC32 over translation table
+	 */
 	BATADV_ATTR_TT_CRC32,
+
+	/**
+	 * @BATADV_ATTR_TT_VID: VLAN ID
+	 */
 	BATADV_ATTR_TT_VID,
+
+	/**
+	 * @BATADV_ATTR_TT_FLAGS: Translation table client flags
+	 */
 	BATADV_ATTR_TT_FLAGS,
+
+	/**
+	 * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
+	 */
 	BATADV_ATTR_FLAG_BEST,
+
+	/**
+	 * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
+	 */
 	BATADV_ATTR_LAST_SEEN_MSECS,
+
+	/**
+	 * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
+	 */
 	BATADV_ATTR_NEIGH_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_TQ: TQ to neighbour
+	 */
 	BATADV_ATTR_TQ,
+
+	/**
+	 * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
+	 */
 	BATADV_ATTR_THROUGHPUT,
+
+	/**
+	 * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
+	 */
 	BATADV_ATTR_BANDWIDTH_UP,
+
+	/**
+	 * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
+	 */
 	BATADV_ATTR_BANDWIDTH_DOWN,
+
+	/**
+	 * @BATADV_ATTR_ROUTER: Gateway router MAC address
+	 */
 	BATADV_ATTR_ROUTER,
+
+	/**
+	 * @BATADV_ATTR_BLA_OWN: Flag indicating own originator
+	 */
 	BATADV_ATTR_BLA_OWN,
+
+	/**
+	 * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address
+	 */
 	BATADV_ATTR_BLA_ADDRESS,
+
+	/**
+	 * @BATADV_ATTR_BLA_VID: BLA VLAN ID
+	 */
 	BATADV_ATTR_BLA_VID,
+
+	/**
+	 * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address
+	 */
 	BATADV_ATTR_BLA_BACKBONE,
+
+	/**
+	 * @BATADV_ATTR_BLA_CRC: BLA CRC
+	 */
 	BATADV_ATTR_BLA_CRC,
+
 	/* add attributes above here, update the policy in netlink.c */
+
+	/**
+	 * @__BATADV_ATTR_AFTER_LAST: internal use
+	 */
 	__BATADV_ATTR_AFTER_LAST,
+
+	/**
+	 * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
+	 */
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
+
+	/**
+	 * @BATADV_ATTR_MAX: highest attribute number currently defined
+	 */
 	BATADV_ATTR_MAX = __BATADV_ATTR_AFTER_LAST - 1
 };
 
 /**
  * enum batadv_nl_commands - supported batman-adv netlink commands
- *
- * @BATADV_CMD_UNSPEC: unspecified command to catch errors
- * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device
- * @BATADV_CMD_TP_METER: Start a tp meter session
- * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
- * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
- * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
- * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
- * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations
- * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
- * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
- * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
- * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
- * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones
- * @__BATADV_CMD_AFTER_LAST: internal use
- * @BATADV_CMD_MAX: highest used command number
  */
 enum batadv_nl_commands {
+	/**
+	 * @BATADV_CMD_UNSPEC: unspecified command to catch errors
+	 */
 	BATADV_CMD_UNSPEC,
+
+	/**
+	 * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv
+	 * device
+	 */
 	BATADV_CMD_GET_MESH_INFO,
+
+	/**
+	 * @BATADV_CMD_TP_METER: Start a tp meter session
+	 */
 	BATADV_CMD_TP_METER,
+
+	/**
+	 * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
+	 */
 	BATADV_CMD_TP_METER_CANCEL,
+
+	/**
+	 * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
+	 */
 	BATADV_CMD_GET_ROUTING_ALGOS,
+
+	/**
+	 * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
+	 */
 	BATADV_CMD_GET_HARDIFS,
+
+	/**
+	 * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
+	 */
 	BATADV_CMD_GET_TRANSTABLE_LOCAL,
+
+	/**
+	 * @BATADV_CMD_GET_TRANSTABLE_GLOBAL: Query list of global translations
+	 */
 	BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+
+	/**
+	 * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
+	 */
 	BATADV_CMD_GET_ORIGINATORS,
+
+	/**
+	 * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
+	 */
 	BATADV_CMD_GET_NEIGHBORS,
+
+	/**
+	 * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
+	 */
 	BATADV_CMD_GET_GATEWAYS,
+
+	/**
+	 * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
+	 */
 	BATADV_CMD_GET_BLA_CLAIM,
+
+	/**
+	 * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance
+	 * backbones
+	 */
 	BATADV_CMD_GET_BLA_BACKBONE,
+
 	/* add new commands above here */
+
+	/**
+	 * @__BATADV_CMD_AFTER_LAST: internal use
+	 */
 	__BATADV_CMD_AFTER_LAST,
+
+	/**
+	 * @BATADV_CMD_MAX: highest used command number
+	 */
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
 };
 
 /**
  * enum batadv_tp_meter_reason - reason of a tp meter test run stop
- * @BATADV_TP_REASON_COMPLETE: sender finished tp run
- * @BATADV_TP_REASON_CANCEL: sender was stopped during run
- * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or didn't
- *  answer
- * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit
- * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node already
- *  ongoing
- * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory
- * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface
- * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions
  */
 enum batadv_tp_meter_reason {
+	/**
+	 * @BATADV_TP_REASON_COMPLETE: sender finished tp run
+	 */
 	BATADV_TP_REASON_COMPLETE		= 3,
+
+	/**
+	 * @BATADV_TP_REASON_CANCEL: sender was stopped during run
+	 */
 	BATADV_TP_REASON_CANCEL			= 4,
+
 	/* error status >= 128 */
+
+	/**
+	 * @BATADV_TP_REASON_DST_UNREACHABLE: receiver could not be reached or
+	 * didn't answer
+	 */
 	BATADV_TP_REASON_DST_UNREACHABLE	= 128,
+
+	/**
+	 * @BATADV_TP_REASON_RESEND_LIMIT: (unused) sender retry reached limit
+	 */
 	BATADV_TP_REASON_RESEND_LIMIT		= 129,
+
+	/**
+	 * @BATADV_TP_REASON_ALREADY_ONGOING: test to or from the same node
+	 * already ongoing
+	 */
 	BATADV_TP_REASON_ALREADY_ONGOING	= 130,
+
+	/**
+	 * @BATADV_TP_REASON_MEMORY_ERROR: test was stopped due to low memory
+	 */
 	BATADV_TP_REASON_MEMORY_ERROR		= 131,
+
+	/**
+	 * @BATADV_TP_REASON_CANT_SEND: failed to send via outgoing interface
+	 */
 	BATADV_TP_REASON_CANT_SEND		= 132,
+
+	/**
+	 * @BATADV_TP_REASON_TOO_MANY: too many ongoing sessions
+	 */
 	BATADV_TP_REASON_TOO_MANY		= 133,
 };
 
-- 
cgit 


From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001
From: Christoph Paasch <cpaasch@apple.com>
Date: Mon, 23 Oct 2017 13:22:23 -0700
Subject: tcp: Configure TFO without cookie per socket and/or per route

We already allow to enable TFO without a cookie by using the
fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or
TFO_CLIENT_NO_COOKIE).
This is safe to do in certain environments where we know that there
isn't a malicous host (aka., data-centers) or when the
application-protocol already provides an authentication mechanism in the
first flight of data.

A server however might be providing multiple services or talking to both
sides (public Internet and data-center). So, this server would want to
enable cookie-less TFO for certain services and/or for connections that
go to the data-center.

This patch exposes a socket-option and a per-route attribute to enable such
fine-grained configurations.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 2 ++
 include/uapi/linux/tcp.h       | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index dab7dad9e01a..fe6679268901 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -430,6 +430,8 @@ enum {
 #define RTAX_QUICKACK RTAX_QUICKACK
 	RTAX_CC_ALGO,
 #define RTAX_CC_ALGO RTAX_CC_ALGO
+	RTAX_FASTOPEN_NO_COOKIE,
+#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE
 	__RTAX_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 69c7493e42f8..d67e1d40c6d6 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -120,6 +120,7 @@ enum {
 #define TCP_ULP			31	/* Attach a ULP to a TCP connection */
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
+#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
-- 
cgit 


From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Sat, 21 Oct 2017 00:25:15 +0300
Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address

Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote
address (outer v6 destination) is one of host's locally configured
addresses.
Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address
(outer v6 source) is a local address.

This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote
endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such
configurations.

An example where this proves useful is a system where entities are
identified by their unique v6 addresses, and use tunnels to encapsulate
traffic between them. The limitation prevents placing several entities
on same host.

Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE which allows to bypass this
restriction.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ip6_tunnel.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h
index 425926c467d7..ffebbe365478 100644
--- a/include/uapi/linux/ip6_tunnel.h
+++ b/include/uapi/linux/ip6_tunnel.h
@@ -20,6 +20,8 @@
 #define IP6_TNL_F_RCV_DSCP_COPY 0x10
 /* copy fwmark from inner packet */
 #define IP6_TNL_F_USE_ORIG_FWMARK 0x20
+/* allow remote endpoint on the local node */
+#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40
 
 struct ip6_tnl_parm {
 	char name[IFNAMSIZ];	/* name of tunnel device */
-- 
cgit 


From 585d763af09cc21daf48ecc873604ccdb70f6014 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 16 Oct 2017 18:01:26 -0700
Subject: net/sched: Introduce Credit Based Shaper (CBS) qdisc

This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

It's primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Only a simple software implementation is added for now.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Tested-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/pkt_sched.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e7cc3d3c7421..0e88cc262ca0 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -904,4 +904,23 @@ struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__u8 offload;
+	__u8 _pad[3];
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
-- 
cgit 


From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 27 Oct 2017 17:30:12 -0700
Subject: ipv6: prevent user from adding cached routes

Cached routes should only be created by the system when receiving pmtu
discovery or ip redirect msg. Users should not be allowed to create
cached routes.

Furthermore, after the patch series to move cached routes into exception
table, user added cached routes will trigger the following warning in
fib6_add():

WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137
fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 panic+0x1e4/0x417 kernel/panic.c:181
 __warn+0x1c4/0x1d9 kernel/panic.c:542
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137
RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297
RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814
RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780
RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360
R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0
 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011
 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782
 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291
 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521
 sock_do_ioctl+0x65/0xb0 net/socket.c:961
 sock_ioctl+0x2c2/0x440 net/socket.c:1058
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

So we fix this by failing the attemp to add cached routes from userspace
with returning EINVAL error.

Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6_route.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index d496c02e14bc..c15d8054905c 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -28,7 +28,7 @@
 
 #define RTF_ROUTEINFO	0x00800000	/* route information - RA	*/
 
-#define RTF_CACHE	0x01000000	/* cache entry			*/
+#define RTF_CACHE	0x01000000	/* read-only: can not be set by user */
 #define RTF_FLOW	0x02000000	/* flow significant route	*/
 #define RTF_POLICY	0x04000000	/* policy route			*/
 
-- 
cgit 


From a190d04db93710ae166749055b6985397c6d13f5 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 26 Oct 2017 15:09:21 -0700
Subject: ipvlan: introduce 'private' attribute for all existing modes.

IPvlan has always operated in bridge mode. However there are scenarios
where each slave should be able to talk through the master device but
not necessarily across each other. Think of an environment where each
of a namespace is a private and independant customer. In this scenario
the machine which is hosting these namespaces neither want to tell who
their neighbor is nor the individual namespaces care to talk to neighbor
on short-circuited network path.

This patch implements the mode that is very similar to the 'private' mode
in macvlan where individual slaves can send and receive traffic through
the master device, just that they can not talk among slave devices.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b037e0ab1975..052e32cd584c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -465,6 +465,7 @@ enum macsec_validation_type {
 enum {
 	IFLA_IPVLAN_UNSPEC,
 	IFLA_IPVLAN_MODE,
+	IFLA_IPVLAN_FLAGS,
 	__IFLA_IPVLAN_MAX
 };
 
@@ -477,6 +478,8 @@ enum ipvlan_mode {
 	IPVLAN_MODE_MAX
 };
 
+#define IPVLAN_F_PRIVATE	0x01
+
 /* VXLAN section */
 enum {
 	IFLA_VXLAN_UNSPEC,
-- 
cgit 


From fe89aa6b250c1011ccf425fbb7998e96bd54263f Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 26 Oct 2017 15:09:25 -0700
Subject: ipvlan: implement VEPA mode

This is very similar to the Macvlan VEPA mode, however, there is some
difference. IPvlan uses the mac-address of the lower device, so the VEPA
mode has implications of ICMP-redirects for packets destined for its
immediate neighbors sharing same master since the packets will have same
source and dest mac. The external switch/router will send redirect msg.

Having said that, this will be useful tool in terms of debugging
since IPvlan will not switch packets within its slaves and rely completely
on the external entity as intended in 802.1Qbg.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 052e32cd584c..81f26473d728 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum ipvlan_mode {
 };
 
 #define IPVLAN_F_PRIVATE	0x01
+#define IPVLAN_F_VEPA		0x02
 
 /* VXLAN section */
 enum {
-- 
cgit 


From ee20598194500e82c477cf13e52b58e569446ed0 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Tue, 18 Jul 2017 15:42:15 -0500
Subject: net/dcb: Add dscp to priority selector type

IEEE specification P802.1Qcd/D2.1 defines priority selector 5.
This APP TLV selector defines DSCP to priority map.
This patch defines such DSCP selector.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/uapi/linux/dcbnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index b6170a6af7c2..2c0c6453c3f4 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -206,6 +206,7 @@ struct cee_pfc {
 #define IEEE_8021QAZ_APP_SEL_STREAM	2
 #define IEEE_8021QAZ_APP_SEL_DGRAM	3
 #define IEEE_8021QAZ_APP_SEL_ANY	4
+#define IEEE_8021QAZ_APP_SEL_DSCP       5
 
 /* This structure contains the IEEE 802.1Qaz APP managed object. This
  * object is also used for the CEE std as well.
-- 
cgit 


From 9354d452034273a50a4fd703bea31e5d6b1fc20b Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 2 Nov 2017 17:04:37 -0200
Subject: openvswitch: reliable interface indentification in port dumps

This patch allows reliable identification of netdevice interfaces connected
to openvswitch bridges. In particular, user space queries the netdev
interfaces belonging to the ports for statistics, up/down state, etc.
Datapath dump needs to provide enough information for the user space to be
able to do that.

Currently, only interface names are returned. This is not sufficient, as
openvswitch allows its ports to be in different name spaces and the
interface name is valid only in its name space. What is needed and generally
used in other netlink APIs, is the pair ifindex+netnsid.

The solution is addition of the ifindex+netnsid pair (or only ifindex if in
the same name space) to vport get/dump operation.

On request side, ideally the ifindex+netnsid pair could be used to
get/set/del the corresponding vport. This is not implemented by this patch
and can be added later if needed.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index ffe397daad49..501e4c4e2a03 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -258,6 +258,8 @@ enum ovs_vport_attr {
 				/* receiving upcalls */
 	OVS_VPORT_ATTR_STATS,	/* struct ovs_vport_stats */
 	OVS_VPORT_ATTR_PAD,
+	OVS_VPORT_ATTR_IFINDEX,
+	OVS_VPORT_ATTR_NETNSID,
 	__OVS_VPORT_ATTR_MAX
 };
 
-- 
cgit 


From 79e1ad148c844f5c8b9d76b36b26e3886dca95ae Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 2 Nov 2017 17:04:38 -0200
Subject: rtnetlink: use netnsid to query interface

Currently, when an application gets netnsid from the kernel (for example as
the result of RTM_GETLINK call on one end of the veth pair), it's not much
useful. There's no reliable way to get to the netns fd from the netnsid, nor
does any kernel API accept netnsid.

Extend the RTM_GETLINK call to also accept netnsid. It will operate on the
netns with the given netnsid in such case. Of course, the calling process
needs to have enough capabilities in the target name space; for now, require
CAP_NET_ADMIN. This can be relaxed in the future.

To signal to the calling process that the kernel understood the new
IFLA_IF_NETNSID attribute in the query, it will include it in the response.
This is needed to detect older kernels, as they will just ignore
IFLA_IF_NETNSID and query in the current name space.

This patch implemetns IFLA_IF_NETNSID only for get and dump. For set
operations, this can be extended later.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b3cf5639ac8f..19fc02660e0c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -160,6 +160,7 @@ enum {
 	IFLA_XDP,
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
+	IFLA_IF_NETNSID,
 	__IFLA_MAX
 };
 
-- 
cgit 


From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:17 -0700
Subject: bpf: offload: add infrastructure for loading programs for a specific
 netdev

The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifer
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9820677c2ff..80d191a93fb0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -260,6 +260,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
cgit 


From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:18 -0700
Subject: bpf: report offload info to user space

Extend struct bpf_prog_info to contain information about program
being bound to a device.  Since the netdev may get destroyed while
program still exists we need a flag to indicate the program is
loaded for a device, even if the device is gone.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d191a93fb0..4455dd195201 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -895,6 +895,10 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
+enum bpf_prog_status {
+	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
+};
+
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -908,6 +912,8 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit 


From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:32 -0500
Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2

Cgroup v2 lacks the device controller, provided by cgroup v1.
This patch adds a new eBPF program type, which in combination
of previously added ability to attach multiple eBPF programs
to a cgroup, will provide a similar functionality, but with some
additional flexibility.

This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type.
A program takes major and minor device numbers, device type
(block/character) and access type (mknod/read/write) as parameters
and returns an integer which defines if the operation should be
allowed or terminated with -EPERM.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4455dd195201..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -141,6 +142,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -991,4 +993,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit 


From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:23 -0800
Subject: ila: add checksum neutral map auto

Add checksum neutral auto that performs checksum neutral mapping
without using the C-bit. This is enabled by configuration of
a mapping.

The checksum neutral function has been split into
ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former
handles the C-bit and includes it in the adjustment value. The latter
just sets the adjustment value on the locator diff only.

Added configuration for checksum neutral map aut in ila_lwt
and ila_xlat.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index f54853288f99..0744881dcef3 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -41,6 +41,7 @@ enum {
 	ILA_CSUM_ADJUST_TRANSPORT,
 	ILA_CSUM_NEUTRAL_MAP,
 	ILA_CSUM_NO_ACTION,
+	ILA_CSUM_NEUTRAL_MAP_AUTO,
 };
 
 #endif /* _UAPI_LINUX_ILA_H */
-- 
cgit 


From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:24 -0800
Subject: ila: allow configuration of identifier type

Allow identifier to be explicitly configured for a mapping.
This can either be one of the identifier types specified in the
ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the
identifier type is inferred from the identifier type field.
If a value other than ILA_ATYPE_USE_FORMAT is set for a
mapping then it is assumed that the identifier type field is
not present in an identifier.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 0744881dcef3..8353c78a7781 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -17,6 +17,7 @@ enum {
 	ILA_ATTR_DIR,				/* u32 */
 	ILA_ATTR_PAD,
 	ILA_ATTR_CSUM_MODE,			/* u8 */
+	ILA_ATTR_IDENT_TYPE,			/* u8 */
 
 	__ILA_ATTR_MAX,
 };
@@ -44,4 +45,16 @@ enum {
 	ILA_CSUM_NEUTRAL_MAP_AUTO,
 };
 
+enum {
+	ILA_ATYPE_IID = 0,
+	ILA_ATYPE_LUID,
+	ILA_ATYPE_VIRT_V4,
+	ILA_ATYPE_VIRT_UNI_V6,
+	ILA_ATYPE_VIRT_MULTI_V6,
+	ILA_ATYPE_NONLOCAL_ADDR,
+	ILA_ATYPE_RSVD_1,
+	ILA_ATYPE_RSVD_2,
+
+	ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */
+};
 #endif /* _UAPI_LINUX_ILA_H */
-- 
cgit 


From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Sun, 5 Nov 2017 15:58:25 -0800
Subject: ila: Add a hook type for LWT routes

In LWT tunnels both an input and output route method is defined.
If both of these are executed in the same path then double translation
happens and the effect is not correct.

This patch adds a new attribute that indicates the hook type. Two
values are defined for route output and route output. ILA
translation is only done for the one that is set. The default is
to enable ILA on route output.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 8353c78a7781..483b77af4eb8 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -18,6 +18,7 @@ enum {
 	ILA_ATTR_PAD,
 	ILA_ATTR_CSUM_MODE,			/* u8 */
 	ILA_ATTR_IDENT_TYPE,			/* u8 */
+	ILA_ATTR_HOOK_TYPE,			/* u8 */
 
 	__ILA_ATTR_MAX,
 };
@@ -57,4 +58,10 @@ enum {
 
 	ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */
 };
+
+enum {
+	ILA_HOOK_ROUTE_OUTPUT,
+	ILA_HOOK_ROUTE_INPUT,
+};
+
 #endif /* _UAPI_LINUX_ILA_H */
-- 
cgit 


From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 6 Nov 2017 07:23:41 +0100
Subject: net_sch: red: Add offload ability to RED qdisc

Add the ability to offload RED qdisc by using ndo_setup_tc.
There are four commands for RED offloading:
* TC_RED_SET: handles set and change.
* TC_RED_DESTROY: handle qdisc destroy.
* TC_RED_STATS: update the qdiscs counters (given as reference)
* TC_RED_XSTAT: returns red xstats.

Whether RED is being offloaded is being determined every time dump action
is being called because parent change of this qdisc could change its
offload state but doesn't require any RED function to be called.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 5002562868cc..6a2c5ea7e9c4 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,6 +256,7 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
+#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {
-- 
cgit 


From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001
From: Yi Yang <yi.y.yang@intel.com>
Date: Tue, 7 Nov 2017 21:07:02 +0800
Subject: openvswitch: enable NSH support

v16->17
 - Fixed disputed check code: keep them in nsh_push and nsh_pop
   but also add them in __ovs_nla_copy_actions

v15->v16
 - Add csum recalculation for nsh_push, nsh_pop and set_nsh
   pointed out by Pravin
 - Move nsh key into the union with ipv4 and ipv6 and add
   check for nsh key in match_validate pointed out by Pravin
 - Add nsh check in validate_set and __ovs_nla_copy_actions

v14->v15
 - Check size in nsh_hdr_from_nlattr
 - Fixed four small issues pointed out By Jiri and Eric

v13->v14
 - Rename skb_push_nsh to nsh_push per Dave's comment
 - Rename skb_pop_nsh to nsh_pop per Dave's comment

v12->v13
 - Fix NSH header length check in set_nsh

v11->v12
 - Fix missing changes old comments pointed out
 - Fix new comments for v11

v10->v11
 - Fix the left three disputable comments for v9
   but not fixed in v10.

v9->v10
 - Change struct ovs_key_nsh to
       struct ovs_nsh_key_base base;
       __be32 context[NSH_MD1_CONTEXT_SIZE];
 - Fix new comments for v9

v8->v9
 - Fix build error reported by daily intel build
   because nsh module isn't selected by openvswitch

v7->v8
 - Rework nested value and mask for OVS_KEY_ATTR_NSH
 - Change pop_nsh to adapt to nsh kernel module
 - Fix many issues per comments from Jiri Benc

v6->v7
 - Remove NSH GSO patches in v6 because Jiri Benc
   reworked it as another patch series and they have
   been merged.
 - Change it to adapt to nsh kernel module added by NSH
   GSO patch series

v5->v6
 - Fix the rest comments for v4.
 - Add NSH GSO support for VxLAN-gpe + NSH and
   Eth + NSH.

v4->v5
 - Fix many comments by Jiri Benc and Eric Garver
   for v4.

v3->v4
 - Add new NSH match field ttl
 - Update NSH header to the latest format
   which will be final format and won't change
   per its author's confirmation.
 - Fix comments for v3.

v2->v3
 - Change OVS_KEY_ATTR_NSH to nested key to handle
   length-fixed attributes and length-variable
   attriubte more flexibly.
 - Remove struct ovs_action_push_nsh completely
 - Add code to handle nested attribute for SET_MASKED
 - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
   to transfer NSH header data.
 - Fix comments and coding style issues by Jiri and Eric

v1->v2
 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
 - Dynamically allocate struct ovs_action_push_nsh for
   length-variable metadata.

OVS master and 2.8 branch has merged NSH userspace
patch series, this patch is to enable NSH support
in kernel data path in order that OVS can support
NSH in compat mode by porting this.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Eric Garver <e@erig.me>
Acked-by: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 501e4c4e2a03..ec75a685f1dd 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -336,6 +336,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
+	OVS_KEY_ATTR_NSH,       /* Nested set of ovs_nsh_key_* */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -495,6 +496,30 @@ struct ovs_key_ct_tuple_ipv6 {
 	__u8   ipv6_proto;
 };
 
+enum ovs_nsh_key_attr {
+	OVS_NSH_KEY_ATTR_UNSPEC,
+	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
+	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
+	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
+	__OVS_NSH_KEY_ATTR_MAX
+};
+
+#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1)
+
+struct ovs_nsh_key_base {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+};
+
+#define NSH_MD1_CONTEXT_SIZE 4
+
+struct ovs_nsh_key_md1 {
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -811,6 +836,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
+ * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
+ * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -841,6 +868,8 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
 	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
+	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
+	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit 


From 4d63adfe12dd9cb61ed8badb4d798955399048c2 Mon Sep 17 00:00:00 2001
From: Mark Greer <mgreer@animalcreek.com>
Date: Thu, 15 Jun 2017 20:34:22 -0700
Subject: NFC: Add NFC_CMD_DEACTIVATE_TARGET support

Once an NFC target (i.e., a tag) is found, it remains active until
there is a failure reading or writing it (often caused by the target
moving out of range).  While the target is active, the NFC adapter
and antenna must remain powered.  This wastes power when the target
remains in range but the client application no longer cares whether
it is there or not.

To mitigate this, add a new netlink command that allows userspace
to deactivate an active target.  When issued, this command will cause
the NFC subsystem to act as though the target was moved out of range.
Once the command has been executed, the client application can power
off the NFC adapter to reduce power consumption.

Signed-off-by: Mark Greer <mgreer@animalcreek.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 399f39ff8048..f6e3c8c9c744 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -89,6 +89,7 @@
  * @NFC_CMD_ACTIVATE_TARGET: Request NFC controller to reactivate target.
  * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly
  *	from the driver in order to support hardware specific operations.
+ * @NFC_CMD_DEACTIVATE_TARGET: Request NFC controller to deactivate target.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -121,6 +122,7 @@ enum nfc_commands {
 	NFC_CMD_SE_IO,
 	NFC_CMD_ACTIVATE_TARGET,
 	NFC_CMD_VENDOR,
+	NFC_CMD_DEACTIVATE_TARGET,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
-- 
cgit 


From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..adb66f78b674 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit 


From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 7 Nov 2017 21:52:09 -0800
Subject: net: ipv6: sysctl to specify IPv6 ND traffic class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a per-device sysctl to specify the default traffic class to use for
kernel originated IPv6 Neighbour Discovery packets.

Currently this includes:

  - Router Solicitation (ICMPv6 type 133)
    ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Solicitation (ICMPv6 type 135)
    ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Advertisement (ICMPv6 type 136)
    ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Redirect (ICMPv6 type 137)
    ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr()

and if the kernel ever gets around to generating RA's,
it would presumably also include:

  - Router Advertisement (ICMPv6 type 134)
    (radvd daemon could pick up on the kernel setting and use it)

Interface drivers may examine the Traffic Class value and translate
the DiffServ Code Point into a link-layer appropriate traffic
prioritization scheme.  An example of mapping IETF DSCP values to
IEEE 802.11 User Priority values can be found here:

    https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11

The expected primary use case is to properly prioritize ND over wifi.

Testing:
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  0
  jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  255
  jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  34

  jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1
  tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
  IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24)
  jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement,
  length 24, tgt is jzem22.pgc, Flags [solicited]

(based on original change written by Erik Kline, with minor changes)

v2: fix 'suspicious rcu_dereference_check() usage'
    by explicitly grabbing the rcu_read_lock.

Cc: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index b22a9c4e1b12..9c0f4a92bcff 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -186,6 +186,7 @@ enum {
 	DEVCONF_ADDR_GEN_MODE,
 	DEVCONF_DISABLE_POLICY,
 	DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
+	DEVCONF_NDISC_TCLASS,
 	DEVCONF_MAX
 };
 
-- 
cgit 


From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Nov 2017 18:24:55 +0900
Subject: bpf: Revert bpf_overrid_function() helper changes.

NACK'd by x86 maintainer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit 


From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 8 Nov 2017 13:01:26 -0800
Subject: tcp: retire FACK loss detection

FACK loss detection has been disabled by default and the
successor RACK subsumed FACK and can handle reordering better.
This patch removes FACK to simplify TCP loss recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 0d941cdd8e8c..33a70ece462f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -191,7 +191,6 @@ enum
 	LINUX_MIB_TCPRENORECOVERY,		/* TCPRenoRecovery */
 	LINUX_MIB_TCPSACKRECOVERY,		/* TCPSackRecovery */
 	LINUX_MIB_TCPSACKRENEGING,		/* TCPSACKReneging */
-	LINUX_MIB_TCPFACKREORDER,		/* TCPFACKReorder */
 	LINUX_MIB_TCPSACKREORDER,		/* TCPSACKReorder */
 	LINUX_MIB_TCPRENOREORDER,		/* TCPRenoReorder */
 	LINUX_MIB_TCPTSREORDER,			/* TCPTSReorder */
-- 
cgit 


From 99803171ef04037092bf5eb29ae801e8b4d49a75 Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Wed, 8 Nov 2017 15:12:27 -0800
Subject: netem: add uapi to express delay and jitter in nanoseconds

netem userspace has long relied on a horrible /proc/net/psched hack
to translate the current notion of "ticks" to nanoseconds.

Expressing latency and jitter instead, in well defined nanoseconds,
increases the dynamic range of emulated delays and jitter in netem.

It will also ease a transition where reducing a tick to nsec
equivalence would constrain the max delay in prior versions of
netem to only 4.3 seconds.

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 6a2c5ea7e9c4..8fe6d1842bee 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -537,6 +537,8 @@ enum {
 	TCA_NETEM_ECN,
 	TCA_NETEM_RATE64,
 	TCA_NETEM_PAD,
+	TCA_NETEM_LATENCY64,
+	TCA_NETEM_JITTER64,
 	__TCA_NETEM_MAX,
 };
 
-- 
cgit 


From 836af83b54e3e285c4a0cc06c24aeb737d3e0e18 Mon Sep 17 00:00:00 2001
From: Dave Taht <dave.taht@gmail.com>
Date: Wed, 8 Nov 2017 15:12:28 -0800
Subject: netem: support delivering packets in delayed time slots

Slotting is a crude approximation of the behaviors of shared media such
as cable, wifi, and LTE, which gather up a bunch of packets within a
varying delay window and deliver them, relative to that, nearly all at
once.

It works within the existing loss, duplication, jitter and delay
parameters of netem. Some amount of inherent latency must be specified,
regardless.

The new "slot" parameter specifies a minimum and maximum delay between
transmission attempts.

The "bytes" and "packets" parameters can be used to limit the amount of
information transferred per slot.

Examples of use:

tc qdisc add dev eth0 root netem delay 200us \
         slot 800us 10ms bytes 64k packets 42

A more correct example, using stacked netem instances and a packet limit
to emulate a tail drop wifi queue with slots and variable packet
delivery, with a 200Mbit isochronous underlying rate, and 20ms path
delay:

tc qdisc add dev eth0 root handle 1: netem delay 20ms rate 200mbit \
         limit 10000
tc qdisc add dev eth0 parent 1:1 handle 10:1 netem delay 200us \
         slot 800us 10ms bytes 64k packets 42 limit 512

Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 8fe6d1842bee..af3cc2f4e1ad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -539,6 +539,7 @@ enum {
 	TCA_NETEM_PAD,
 	TCA_NETEM_LATENCY64,
 	TCA_NETEM_JITTER64,
+	TCA_NETEM_SLOT,
 	__TCA_NETEM_MAX,
 };
 
@@ -576,6 +577,13 @@ struct tc_netem_rate {
 	__s32	cell_overhead;
 };
 
+struct tc_netem_slot {
+	__s64   min_delay; /* nsec */
+	__s64   max_delay;
+	__s32   max_packets;
+	__s32   max_bytes;
+};
+
 enum {
 	NETEM_LOSS_UNSPEC,
 	NETEM_LOSS_GI,		/* General Intuitive - 4 state model */
-- 
cgit 


From 5794040647de4011598a6d005fdad95d24fd385b Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@ovn.org>
Date: Fri, 10 Nov 2017 12:09:40 -0800
Subject: openvswitch: Add meter netlink definitions

Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 51 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index ec75a685f1dd..d60b9a4cf3d1 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -883,4 +883,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+	OVS_METER_CMD_UNSPEC,
+	OVS_METER_CMD_FEATURES,	/* Get features supported by the datapath. */
+	OVS_METER_CMD_SET,	/* Add or modify a meter. */
+	OVS_METER_CMD_DEL,	/* Delete a meter. */
+	OVS_METER_CMD_GET	/* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+	OVS_METER_ATTR_UNSPEC,
+	OVS_METER_ATTR_ID,	/* u32 meter ID within datapath. */
+	OVS_METER_ATTR_KBPS,	/* No argument. If set, units in kilobits
+				 * per second. Otherwise, units in
+				 * packets per second.
+				 */
+	OVS_METER_ATTR_STATS,	/* struct ovs_flow_stats for the meter. */
+	OVS_METER_ATTR_BANDS,	/* Nested attributes for meter bands. */
+	OVS_METER_ATTR_USED,	/* u64 msecs last used in monotonic time. */
+	OVS_METER_ATTR_CLEAR,	/* Flag to clear stats, used. */
+	OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+	OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+	OVS_METER_ATTR_PAD,
+	__OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+	OVS_BAND_ATTR_UNSPEC,
+	OVS_BAND_ATTR_TYPE,	/* u32 OVS_METER_BAND_TYPE_* constant. */
+	OVS_BAND_ATTR_RATE,	/* u32 band rate in meter units (see above). */
+	OVS_BAND_ATTR_BURST,	/* u32 burst size in meter units. */
+	OVS_BAND_ATTR_STATS,	/* struct ovs_flow_stats for the band. */
+	__OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+	OVS_METER_BAND_TYPE_UNSPEC,
+	OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+	__OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
cgit 


From cd8a6c33693c1b89d2737ffdbf9611564e9ac907 Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@ovn.org>
Date: Fri, 10 Nov 2017 12:09:43 -0800
Subject: openvswitch: Add meter action support

Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <azhou@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d60b9a4cf3d1..4265d7f9e1f2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -838,6 +838,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
  * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -870,6 +872,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
 	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
+	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit 


From 0eef304bc9f7d079a1165e8cd2f24b078e9e1f2a Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 13 Nov 2017 03:37:06 +0300
Subject: uapi: fix linux/rxrpc.h userspace compilation errors

Consistently use types provided by <linux/types.h> to fix the following
linux/rxrpc.h userspace compilation errors:

/usr/include/linux/rxrpc.h:24:2: error: unknown type name 'u16'
  u16  srx_service; /* service desired */
/usr/include/linux/rxrpc.h:25:2: error: unknown type name 'u16'
  u16  transport_type; /* type of transport socket (SOCK_DGRAM) */
/usr/include/linux/rxrpc.h:26:2: error: unknown type name 'u16'
  u16  transport_len; /* length of transport address */

Use __kernel_sa_family_t instead of sa_family_t the same way
as uapi/linux/in.h does, to fix the following
linux/rxrpc.h userspace compilation errors:

/usr/include/linux/rxrpc.h:23:2: error: unknown type name 'sa_family_t'
  sa_family_t srx_family; /* address family */
/usr/include/linux/rxrpc.h:28:3: error: unknown type name 'sa_family_t'
  sa_family_t family;  /* transport address family */

Fixes: 727f8914477e ("rxrpc: Expose UAPI definitions to userspace")
Cc: <stable@vger.kernel.org> # v4.14
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rxrpc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 9656aad8f8f7..9d4afea308a4 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -20,12 +20,12 @@
  * RxRPC socket address
  */
 struct sockaddr_rxrpc {
-	sa_family_t	srx_family;	/* address family */
-	u16		srx_service;	/* service desired */
-	u16		transport_type;	/* type of transport socket (SOCK_DGRAM) */
-	u16		transport_len;	/* length of transport address */
+	__kernel_sa_family_t	srx_family;	/* address family */
+	__u16			srx_service;	/* service desired */
+	__u16			transport_type;	/* type of transport socket (SOCK_DGRAM) */
+	__u16			transport_len;	/* length of transport address */
 	union {
-		sa_family_t family;		/* transport address family */
+		__kernel_sa_family_t family;	/* transport address family */
 		struct sockaddr_in sin;		/* IPv4 transport address */
 		struct sockaddr_in6 sin6;	/* IPv6 transport address */
 	} transport;
-- 
cgit 


From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 14 Nov 2017 06:30:11 +0300
Subject: uapi: fix linux/tls.h userspace compilation error

Move inclusion of a private kernel header <net/tcp.h>
from uapi/linux/tls.h to its only user - net/tls.h,
to fix the following linux/tls.h userspace compilation error:

/usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory

As to this point uapi/linux/tls.h was totaly unusuable for userspace,
cleanup this header file further by moving other redundant includes
to net/tls.h.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Cc: <stable@vger.kernel.org> # v4.13+
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tls.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index d5e0682ab837..293b2cdad88d 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -35,10 +35,6 @@
 #define _UAPI_LINUX_TLS_H
 
 #include <linux/types.h>
-#include <asm/byteorder.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <net/tcp.h>
 
 /* TLS socket options */
 #define TLS_TX			1	/* Set transmit parameters */
-- 
cgit