Diffstat (limited to 'Documentation')
-rw-r--r--  Documentation/admin-guide/sysctl/net.rst                    2
-rw-r--r--  Documentation/devicetree/bindings/net/xilinx_axienet.txt   25
-rw-r--r--  Documentation/networking/ethtool-netlink.rst               62
-rw-r--r--  Documentation/networking/index.rst                          1
-rw-r--r--  Documentation/networking/ip-sysctl.rst                     10
-rw-r--r--  Documentation/networking/nexthop-group-resilient.rst      293
-rw-r--r--  Documentation/networking/nf_flowtable.rst                   6
7 files changed, 387 insertions, 12 deletions
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 2090bfc69aa5..c941b214e0b7 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -320,7 +320,7 @@ waiting for a network device refcount to drop to 0 during device
 unregistration. A lower value may be useful during bisection to detect
 a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
-Default value is 10, minimum 0, maximum 3600.
+Default value is 10, minimum 1, maximum 3600.
 
 optmem_max
 ----------
diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
index 2cd452419ed0..b8e4894bc634 100644
--- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt
+++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
@@ -42,11 +42,23 @@ Optional properties:
   support both 1000BaseX and SGMII modes. If set, the phy-mode should be
   set to match the mode selected on core reset (i.e. by the basex_or_sgmii
   core input line).
-- clocks     : AXI bus clock for the device. Refer to common clock bindings.
-               Used to calculate MDIO clock divisor. If not specified, it is
-               auto-detected from the CPU clock (but only on platforms where
-               this is possible). New device trees should specify this - the
-               auto detection is only for backward compatibility.
+- clock-names: Tuple listing input clock names. Possible clocks:
+               s_axi_lite_clk: Clock for AXI register slave interface
+               axis_clk: AXI4-Stream clock for TXD RXD TXC and RXS interfaces
+               ref_clk: Ethernet reference clock, used by signal delay
+                        primitives and transceivers
+               mgt_clk: MGT reference clock (used by optional internal
+                        PCS/PMA PHY)
+
+               Note that if s_axi_lite_clk is not specified by name, the
+               first clock of any name is used for this. If that is also not
+               specified, the clock rate is auto-detected from the CPU clock
+               (but only on platforms where this is possible). New device
+               trees should specify all applicable clocks by name - the
+               fallbacks to an unnamed clock or to CPU clock are only for
+               backward compatibility.
+- clocks:      Phandles to input clocks matching clock-names. Refer to common
+               clock bindings.
 - axistream-connected: Reference to another node which contains the resources
                        for the AXI DMA controller used by this device.
                        If this is specified, the DMA-related resources from that
@@ -62,7 +74,8 @@ Example:
         device_type = "network";
         interrupt-parent = <&microblaze_0_axi_intc>;
         interrupts = <2 0 1>;
-        clocks = <&axi_clk>;
+        clock-names = "s_axi_lite_clk", "axis_clk", "ref_clk", "mgt_clk";
+        clocks = <&axi_clk>, <&axi_clk>, <&pl_enet_ref_clk>, <&mgt_clk>;
         phy-mode = "mii";
         reg = <0x40c00000 0x40000 0x50c00000 0x40000>;
         xlnx,rxcsum = <0x2>;
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 05073482db05..4bdb4298f178 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -208,6 +208,8 @@ Userspace to kernel:
   ``ETHTOOL_MSG_CABLE_TEST_ACT``        action start cable test
   ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``    action start raw TDR cable test
   ``ETHTOOL_MSG_TUNNEL_INFO_GET``       get tunnel offload info
+  ``ETHTOOL_MSG_FEC_GET``               get FEC settings
+  ``ETHTOOL_MSG_FEC_SET``               set FEC settings
   ===================================== ================================
 
 Kernel to userspace:
@@ -242,6 +244,8 @@ Kernel to userspace:
   ``ETHTOOL_MSG_CABLE_TEST_NTF``        Cable test results
   ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF``    Cable test TDR results
   ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info
+  ``ETHTOOL_MSG_FEC_GET_REPLY``         FEC settings
+  ``ETHTOOL_MSG_FEC_NTF``               FEC settings
   ===================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1280,6 +1284,60 @@ Kernel response contents:
 For UDP tunnel table empty ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` indicates
 that the table contains static entries, hard-coded by the NIC.
 
+FEC_GET
+=======
+
+Gets FEC configuration and state like the ``ETHTOOL_GFECPARAM`` ioctl request.
+
+Request contents:
+
+  ===================================== ====== ==========================
+  ``ETHTOOL_A_FEC_HEADER``              nested request header
+  ===================================== ====== ==========================
+
+Kernel response contents:
+
+  ===================================== ====== ==========================
+  ``ETHTOOL_A_FEC_HEADER``              nested request header
+  ``ETHTOOL_A_FEC_MODES``               bitset configured modes
+  ``ETHTOOL_A_FEC_AUTO``                bool   FEC mode auto selection
+  ``ETHTOOL_A_FEC_ACTIVE``              u32    index of active FEC mode
+  ===================================== ====== ==========================
+
+``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently
+active on the interface. This attribute may not be present if the device
+does not support FEC.
+
+``ETHTOOL_A_FEC_MODES`` and ``ETHTOOL_A_FEC_AUTO`` are only meaningful when
+autonegotiation is disabled. If ``ETHTOOL_A_FEC_AUTO`` is non-zero, the driver
+will select the FEC mode automatically based on the parameters of the SFP
+module. This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl
+interface. ``ETHTOOL_A_FEC_MODES`` carries the current FEC configuration using
+link mode bits (rather than the old ``ETHTOOL_FEC_*`` bits).
+
+FEC_SET
+=======
+
+Sets FEC parameters like the ``ETHTOOL_SFECPARAM`` ioctl request.
+
+Request contents:
+
+  ===================================== ====== ==========================
+  ``ETHTOOL_A_FEC_HEADER``              nested request header
+  ``ETHTOOL_A_FEC_MODES``               bitset configured modes
+  ``ETHTOOL_A_FEC_AUTO``                bool   FEC mode auto selection
+  ===================================== ====== ==========================
+
+``FEC_SET`` is only meaningful when autonegotiation is disabled. Otherwise
+FEC mode is selected as part of autonegotiation.
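In practice, these two messages back the ``--show-fec`` and ``--set-fec``
options of the ethtool utility; on ethtool builds new enough to speak the
netlink interface they map to ``ETHTOOL_MSG_FEC_GET``/``ETHTOOL_MSG_FEC_SET``,
while older builds fall back to the ioctl requests named above. A minimal
sketch of the corresponding commands, with ``eth0`` as a placeholder
interface name::

   # ethtool --show-fec eth0
   # ethtool --set-fec eth0 encoding rs
   # ethtool --set-fec eth0 encoding auto

``encoding auto`` corresponds to the ``ETHTOOL_A_FEC_AUTO`` request, while
``rs``, ``baser`` and ``off`` select individual modes via
``ETHTOOL_A_FEC_MODES``.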
+
+``ETHTOOL_A_FEC_MODES`` selects which FEC mode should be used. It is
+recommended to set only one bit; if multiple bits are set, the driver may
+choose between them in an implementation-specific way.
+
+``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
+module parameters. This does not mean autonegotiation.
+
 Request translation
 ===================
 
@@ -1373,8 +1431,8 @@ are netlink only.
                                       ``ETHTOOL_MSG_LINKMODES_SET``
   ``ETHTOOL_PHY_GTUNABLE``            n/a
   ``ETHTOOL_PHY_STUNABLE``            n/a
-  ``ETHTOOL_GFECPARAM``               n/a
-  ``ETHTOOL_SFECPARAM``               n/a
+  ``ETHTOOL_GFECPARAM``               ``ETHTOOL_MSG_FEC_GET``
+  ``ETHTOOL_SFECPARAM``               ``ETHTOOL_MSG_FEC_SET``
   n/a                                 ''ETHTOOL_MSG_CABLE_TEST_ACT''
   n/a                                 ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT''
   n/a                                 ``ETHTOOL_MSG_TUNNEL_INFO_GET``
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index b8a29997d433..e9ce55992aa9 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -76,6 +76,7 @@ Contents:
    netdevices
    netfilter-sysctl
    netif-msg
+   nexthop-group-resilient
    nf_conntrack-sysctl
    nf_flowtable
    openvswitch
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index c7952ac5bd2f..9701906f63f9 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1073,7 +1073,9 @@ ip_local_reserved_ports - list of comma separated ranges
         although this is redundant. However such a setting is useful
         if later the port range is changed to a value that will
-        include the reserved ports.
+        include the reserved ports. Also keep in mind that overlap
+        between these ranges may affect the probability of selecting
+        ephemeral ports that come right after the block of reserved ports.
 
         Default: Empty
 
@@ -1143,6 +1145,12 @@ icmp_echo_ignore_all - BOOLEAN
 
         Default: 0
 
+icmp_echo_enable_probe - BOOLEAN
+        If set to one, then the kernel will respond to RFC 8335 PROBE
+        requests sent to it.
+
+        Default: 0
+
 icmp_echo_ignore_broadcasts - BOOLEAN
         If set non-zero, then the kernel will ignore all ICMP ECHO and
         TIMESTAMP requests sent to it via broadcast/multicast.
diff --git a/Documentation/networking/nexthop-group-resilient.rst b/Documentation/networking/nexthop-group-resilient.rst
new file mode 100644
index 000000000000..fabecee24d85
--- /dev/null
+++ b/Documentation/networking/nexthop-group-resilient.rst
@@ -0,0 +1,293 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+Resilient Next-hop Groups
+=========================
+
+Resilient groups are a type of next-hop group that is aimed at minimizing
+disruption in flow routing across changes to the group composition and
+weights of constituent next hops.
+
+The idea behind resilient hashing groups is best explained in contrast to
+the legacy multipath next-hop group, which uses the hash-threshold
+algorithm, described in RFC 2992.
+
+To select a next hop, the hash-threshold algorithm first assigns a range of
+hashes to each next hop in the group, and then selects the next hop by
+comparing the SKB hash with the individual ranges. When a next hop is
+removed from the group, the ranges are recomputed, which leads to
+reassignment of parts of the hash space from one next hop to another. RFC
+2992 illustrates it thus::
+
+             +-------+-------+-------+-------+-------+
+             |   1   |   2   |   3   |   4   |   5   |
+             +-------+-+-----+---+---+-----+-+-------+
+             |    1    |    2    |    4    |    5    |
+             +---------+---------+---------+---------+
+
+              Before and after deletion of next hop 3
+              under the hash-threshold algorithm.
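To make the range assignment concrete, consider an illustrative 16-bit hash
space split among five next hops of equal weight; the boundaries below are
made up for this example and are not taken from the RFC::

   before:  1: 0x0000-0x3332   2: 0x3333-0x6665   3: 0x6666-0x9998
            4: 0x9999-0xcccb   5: 0xcccc-0xffff

   after:   1: 0x0000-0x3fff   2: 0x4000-0x7fff
            4: 0x8000-0xbfff   5: 0xc000-0xffff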
+
+Note how next hop 2 gave up part of the hash space in favor of next hop 1,
+and 4 in favor of 5. While there will usually be some overlap between the
+previous and the new distribution, some traffic flows change the next hop
+that they resolve to.
+
+If a multipath group is used for load-balancing between multiple servers,
+this hash space reassignment causes packets from a single flow to suddenly
+end up arriving at a server that does not expect them. This can result in
+TCP connections being reset.
+
+If a multipath group is used for load-balancing among available paths to
+the same server, the issue is that different latencies and reordering along
+the way cause the packets to arrive in the wrong order, resulting in
+degraded application performance.
+
+To mitigate the above-mentioned flow redirection, resilient next-hop groups
+insert another layer of indirection between the hash space and its
+constituent next hops: a hash table. The selection algorithm uses the SKB
+hash to choose a hash table bucket, then reads the next hop that this
+bucket contains, and forwards traffic there.
+
+This indirection brings an important feature. In the hash-threshold
+algorithm, the range of hashes associated with a next hop must be
+continuous. With a hash table, the mapping between the hash table buckets
+and the individual next hops is arbitrary. Therefore when a next hop is
+deleted, the buckets that held it are simply reassigned to other next
+hops::
+
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |1|1|1|1|2|2|2|2|3|3|3|3|4|4|4|4|5|5|5|5|
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+                       v v v v
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |1|1|1|1|2|2|2|2|1|2|4|5|4|4|4|4|5|5|5|5|
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+       Before and after deletion of next hop 3
+       under the resilient hashing algorithm.
+
+When weights of next hops in a group are altered, it may be possible to
+choose a subset of buckets that are currently not used for forwarding
+traffic, and use those to satisfy the new next-hop distribution demands,
+keeping the "busy" buckets intact. This way, established flows are ideally
+still forwarded to the same endpoints through the same paths as before the
+next-hop group change.
+
+Algorithm
+---------
+
+In a nutshell, the algorithm works as follows. Each next hop deserves a
+certain number of buckets, according to its weight and the number of
+buckets in the hash table. In accordance with the source code, we will call
+this number a "wants count" of a next hop. In case of an event that might
+cause bucket allocation change, the wants counts for individual next hops
+are updated.
+
+Next hops that have fewer buckets than their wants count are called
+"underweight". Those that have more are "overweight". If there are no
+overweight (and therefore no underweight) next hops in the group, it is
+said to be "balanced".
+
+Each bucket maintains a last-used timer. Every time a packet is forwarded
+through a bucket, this timer is updated to the current jiffies value. One
+attribute of a resilient group is then the "idle timer", which is the
+amount of time that a bucket must not be hit by traffic in order for it to
+be considered "idle". Buckets that are not idle are busy.
+
+After assigning wants counts to next hops, an "upkeep" algorithm runs. For
+buckets:
+
+1) that have no assigned next hop, or
+2) whose next hop has been removed, or
+3) that are idle and their next hop is overweight,
+
+upkeep changes the next hop that the bucket references to one of the
+underweight next hops. If, after considering all buckets in this manner,
+there are still underweight next hops, another upkeep run is scheduled for
+a future time.
+
+There may not be enough "idle" buckets to satisfy the updated wants counts
+of all next hops. Another attribute of a resilient group is the "unbalanced
+timer". This timer can be set to 0, in which case the table will stay out
+of balance until idle buckets do appear, possibly never. If set to a
+non-zero value, the value represents the period of time that the table is
+permitted to stay out of balance.
+
+With this in mind, we update the above list of conditions with one more
+item. Thus buckets:
+
+4) whose next hop is overweight, and the amount of time that the table has
+   been out of balance exceeds the unbalanced timer, if that is non-zero,
+
+\... are migrated as well.
+
+Offloading & Driver Feedback
+----------------------------
+
+When offloading resilient groups, the algorithm that distributes buckets
+among next hops is still the one in SW. Drivers are notified of updates to
+next hop groups in the following three ways:
+
+- Full group notification with the type
+  ``NH_NOTIFIER_INFO_TYPE_RES_TABLE``. This is used just after the group is
+  created and buckets populated for the first time.
+
+- Single-bucket notifications of the type
+  ``NH_NOTIFIER_INFO_TYPE_RES_BUCKET``, which are used for notifications of
+  individual migrations within an already-established group.
+
+- Pre-replace notification, ``NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE``. This
+  is sent before the group is replaced, and is a way for the driver to veto
+  the group before committing anything to the HW.
+
+Some single-bucket notifications are forced, as indicated by the "force"
+flag in the notification. Those are used for the cases where e.g. the next
+hop associated with the bucket was removed, and the bucket really must be
+migrated.
+
+Non-forced notifications can be overridden by the driver by returning an
+error code. The use case for this is that the driver notifies the HW that a
+bucket should be migrated, but the HW discovers that the bucket has in fact
+been hit by traffic.
+
+A second way for the HW to report that a bucket is busy is through the
+``nexthop_res_grp_activity_update()`` API. The buckets identified this way
+as busy are treated as if traffic hit them.
+
+Offloaded buckets should be flagged as either "offload" or "trap". This is
+done through the ``nexthop_bucket_set_hw_flags()`` API.
+
+Netlink UAPI
+------------
+
+Resilient Group Replacement
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Resilient groups are configured using the ``RTM_NEWNEXTHOP`` message in the
+same manner as other multipath groups. The following changes apply to the
+attributes passed in the netlink message:
+
+  =================== =========================================================
+  ``NHA_GROUP_TYPE``  Should be ``NEXTHOP_GRP_TYPE_RES`` for resilient group.
+  ``NHA_RES_GROUP``   A nest that contains attributes specific to resilient
+                      groups.
+  =================== =========================================================
+
+``NHA_RES_GROUP`` payload:
+
+  =================================== =========================================
+  ``NHA_RES_GROUP_BUCKETS``           Number of buckets in the hash table.
+  ``NHA_RES_GROUP_IDLE_TIMER``        Idle timer in units of clock_t.
+  ``NHA_RES_GROUP_UNBALANCED_TIMER``  Unbalanced timer in units of clock_t.
+  =================================== =========================================
+
+Next Hop Get
+^^^^^^^^^^^^
+
+Requests to get resilient next-hop groups use the ``RTM_GETNEXTHOP``
+message in exactly the same way as other next hop get requests. The
+response attributes match the replacement attributes cited above, except
+``NHA_RES_GROUP`` payload will include the following attribute:
+
+  =================================== =========================================
+  ``NHA_RES_GROUP_UNBALANCED_TIME``   How long the resilient group has been out
+                                      of balance, in units of clock_t.
+  =================================== =========================================
+
+Bucket Get
+^^^^^^^^^^
+
+The message ``RTM_GETNEXTHOPBUCKET`` without the ``NLM_F_DUMP`` flag is
+used to request a single bucket. The attributes recognized at get requests
+are:
+
+  =================== =========================================================
+  ``NHA_ID``          ID of the next-hop group that the bucket belongs to.
+  ``NHA_RES_BUCKET``  A nest that contains attributes specific to the bucket.
+  =================== =========================================================
+
+``NHA_RES_BUCKET`` payload:
+
+  ======================== ====================================================
+  ``NHA_RES_BUCKET_INDEX`` Index of the bucket in the resilient table.
+  ======================== ====================================================
+
+Bucket Dumps
+^^^^^^^^^^^^
+
+The message ``RTM_GETNEXTHOPBUCKET`` with the ``NLM_F_DUMP`` flag is used
+to request a dump of matching buckets. The attributes recognized at dump
+requests are:
+
+  =================== =========================================================
+  ``NHA_ID``          If specified, limits the dump to just the next-hop group
+                      with this ID.
+  ``NHA_OIF``         If specified, limits the dump to buckets that contain
+                      next hops that use the device with this ifindex.
+  ``NHA_MASTER``      If specified, limits the dump to buckets that contain
+                      next hops that use a device in the VRF with this ifindex.
+  ``NHA_RES_BUCKET``  A nest that contains attributes specific to the bucket.
+  =================== =========================================================
+
+``NHA_RES_BUCKET`` payload:
+
+  ======================== ====================================================
+  ``NHA_RES_BUCKET_NH_ID`` If specified, limits the dump to just the buckets
+                           that contain the next hop with this ID.
+  ======================== ====================================================
+
+Usage
+-----
+
+To illustrate the usage, consider the following commands::
+
+        # ip nexthop add id 1 via 192.0.2.2 dev eth0
+        # ip nexthop add id 2 via 192.0.2.3 dev eth0
+        # ip nexthop add id 10 group 1/2 type resilient \
+                buckets 8 idle_timer 60 unbalanced_timer 300
+
+The last command creates a resilient next-hop group. It will have 8 buckets
+(which is an unusually low number, used here for demonstration purposes
+only); each bucket will be considered idle when no traffic hits it for at
+least 60 seconds, and if the table remains out of balance for 300 seconds,
+it will be forcefully brought into balance.
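The new group and its initial bucket allocation can be inspected right away;
a sketch of typical commands, assuming an iproute2 build that already
understands resilient groups and bucket dumps::

        # ip nexthop show id 10
        # ip nexthop bucket show id 10
        # ip nexthop bucket show id 10 nhid 1

The last form narrows the dump to buckets holding next hop 1, which maps to
the ``NHA_RES_BUCKET_NH_ID`` filter described above.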
+
+Changing next-hop weights leads to a change in bucket allocation::
+
+        # ip nexthop replace id 10 group 1,3/2 type resilient
+
+This can be confirmed by looking at individual buckets::
+
+        # ip nexthop bucket show id 10
+        id 10 index 0 idle_time 5.59 nhid 1
+        id 10 index 1 idle_time 5.59 nhid 1
+        id 10 index 2 idle_time 8.74 nhid 2
+        id 10 index 3 idle_time 8.74 nhid 2
+        id 10 index 4 idle_time 8.74 nhid 1
+        id 10 index 5 idle_time 8.74 nhid 1
+        id 10 index 6 idle_time 8.74 nhid 1
+        id 10 index 7 idle_time 8.74 nhid 1
+
+Note the two buckets that have a shorter idle time. Those are the ones that
+were migrated after the next-hop replace command to satisfy the new demand
+that next hop 1 be given 6 buckets instead of 4.
+
+Netdevsim
+---------
+
+The netdevsim driver implements a mock offload of resilient groups, and
+exposes a debugfs interface that allows marking individual buckets as busy.
+For example, the following will mark bucket 23 in next-hop group 10 as
+active::
+
+        # echo 10 23 > /sys/kernel/debug/netdevsim/netdevsim10/fib/nexthop_bucket_activity
+
+In addition, another debugfs interface can be used to configure that the
+next attempt to migrate a bucket should fail::
+
+        # echo 1 > /sys/kernel/debug/netdevsim/netdevsim10/fib/fail_nexthop_bucket_replace
+
+Besides serving as an example, the interfaces that netdevsim exposes are
+useful in automated testing, and
+``tools/testing/selftests/drivers/net/netdevsim/nexthop.sh`` makes use of
+them to test the algorithm.
diff --git a/Documentation/networking/nf_flowtable.rst b/Documentation/networking/nf_flowtable.rst
index d87f253b9d39..d757c21c10f2 100644
--- a/Documentation/networking/nf_flowtable.rst
+++ b/Documentation/networking/nf_flowtable.rst
@@ -112,6 +112,7 @@ You can identify offloaded flows through the [OFFLOAD] tag when listing your
 connection tracking table.
 
 ::
+
         # conntrack -L
         tcp      6 src=10.141.10.2 dst=192.168.10.2 sport=52728 dport=5201 src=192.168.10.2 dst=192.168.10.1 sport=5201 dport=52728 [OFFLOAD] mark=0 use=2
 
@@ -138,6 +139,7 @@ allows the flowtable to define a fastpath bypass between the bridge ports
 device (represented as eth0) in your switch/router.
 
 ::
+
                          fastpath bypass
                 .-------------------------.
                /                           \
@@ -168,12 +170,12 @@ connection tracking entry by specifying the counter statement in your flowtable
 definition, e.g.
 
 ::
+
         table inet x {
                 flowtable f {
                         hook ingress priority 0; devices = { eth0, eth1 };
                         counter
                 }
-        ...
         }
 
 Counter support is available since Linux kernel 5.7.
@@ -185,12 +187,12 @@ If your network device provides hardware offload support, you can turn it on by
 means of the 'offload' flag in your flowtable definition, e.g.
 
 ::
+
         table inet x {
                 flowtable f {
                         hook ingress priority 0; devices = { eth0, eth1 };
                         flags offload;
                 }
-        ...
         }
 
 There is a workqueue that adds the flows to the hardware. Note that a few
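For context, a flowtable definition such as the ones above only sees traffic
once a forward chain rule explicitly hands flows over to it. A minimal
ruleset along these lines is typically used; the device names and the
matched protocol here are placeholders::

        table inet x {
                flowtable f {
                        hook ingress priority 0; devices = { eth0, eth1 };
                        flags offload;
                }
                chain y {
                        type filter hook forward priority 0; policy accept;
                        ip protocol tcp flow add @f
                }
        }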