diff options
1717 files changed, 59779 insertions, 19602 deletions
@@ -99,6 +99,7 @@ Jacob Shin <Jacob.Shin@amd.com> Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com> Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@motorola.com> Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com> +Jakub Kicinski <kuba@kernel.org> <jakub.kicinski@netronome.com> James Bottomley <jejb@mulgrave.(none)> James Bottomley <jejb@titanic.il.steeleye.com> James E Wilson <wilson@specifix.com> diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io index 8ca498447aeb..05601a90a9b6 100644 --- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io +++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io @@ -29,13 +29,13 @@ Description: This file shows the system fans direction: The files are read only. -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld3_version Date: November 2018 KernelVersion: 5.0 Contact: Vadim Pasternak <vadimpmellanox.com> Description: These files show with which CPLD versions have been burned - on LED board. + on LED or Gearbox board. The files are read only. @@ -121,6 +121,15 @@ Description: These files show the system reset cause, as following: ComEx The files are read only. +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld4_version +Date: November 2018 +KernelVersion: 5.0 +Contact: Vadim Pasternak <vadimpmellanox.com> +Description: These files show with which CPLD versions have been burned + on LED board. + + The files are read only. + Date: June 2019 KernelVersion: 5.3 Contact: Vadim Pasternak <vadimpmellanox.com> diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio new file mode 100644 index 000000000000..da86efc7781b --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-mdio @@ -0,0 +1,63 @@ +What: /sys/bus/mdio_bus/devices/.../statistics/ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + This folder contains statistics about global and per + MDIO bus address statistics. + +What: /sys/bus/mdio_bus/devices/.../statistics/transfers +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfers for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/errors +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfer errors for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/writes +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of write transactions for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/reads +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of read transactions for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr> +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfers for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr> +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfer errors for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr> +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of write transactions for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr> +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of read transactions for this MDIO bus address. diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 1c5d2281efc9..2a97aaec8b12 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -319,7 +319,7 @@ 182 = /dev/perfctr Performance-monitoring counters 183 = /dev/hwrng Generic random number generator 184 = /dev/cpu/microcode CPU microcode update interface - 186 = /dev/atomicps Atomic shapshot of process state data + 186 = /dev/atomicps Atomic snapshot of process state data 187 = /dev/irnet IrNET device 188 = /dev/smbusbios SMBus BIOS 189 = /dev/ussp_ctl User space serial port control diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index fcedc5349ace..640934b6f7b4 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -25,10 +25,6 @@ good performance with large indices. If your index can be larger than ``ULONG_MAX`` then the XArray is not the data type for you. The most important user of the XArray is the page cache. -Each non-``NULL`` entry in the array has three bits associated with -it called marks. Each mark may be set or cleared independently of -the others. You can iterate over entries which are marked. - Normal pointers may be stored in the XArray directly. They must be 4-byte aligned, which is true for any pointer returned from kmalloc() and alloc_page(). It isn't true for arbitrary user-space pointers, @@ -41,12 +37,11 @@ When you retrieve an entry from the XArray, you can check whether it is a value entry by calling xa_is_value(), and convert it back to an integer by calling xa_to_value(). -Some users want to store tagged pointers instead of using the marks -described above. They can call xa_tag_pointer() to create an -entry with a tag, xa_untag_pointer() to turn a tagged entry -back into an untagged pointer and xa_pointer_tag() to retrieve -the tag of an entry. Tagged pointers use the same bits that are used -to distinguish value entries from normal pointers, so each user must +Some users want to tag the pointers they store in the XArray. You can +call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer() +to turn a tagged entry back into an untagged pointer and xa_pointer_tag() +to retrieve the tag of an entry. Tagged pointers use the same bits that +are used to distinguish value entries from normal pointers, so you must decide whether they want to store value entries or tagged pointers in any particular XArray. @@ -56,10 +51,9 @@ conflict with value entries or internal entries. An unusual feature of the XArray is the ability to create entries which occupy a range of indices. Once stored to, looking up any index in the range will return the same entry as looking up any other index in -the range. Setting a mark on one index will set it on all of them. -Storing to any index will store to all of them. Multi-index entries can -be explicitly split into smaller entries, or storing ``NULL`` into any -entry will cause the XArray to forget about the range. +the range. Storing to any index will store to all of them. Multi-index +entries can be explicitly split into smaller entries, or storing ``NULL`` +into any entry will cause the XArray to forget about the range. Normal API ========== @@ -87,17 +81,11 @@ If you want to only store a new entry to an index if the current entry at that index is ``NULL``, you can use xa_insert() which returns ``-EBUSY`` if the entry is not empty. -You can enquire whether a mark is set on an entry by using -xa_get_mark(). If the entry is not ``NULL``, you can set a mark -on it by using xa_set_mark() and remove the mark from an entry by -calling xa_clear_mark(). You can ask whether any entry in the -XArray has a particular mark set by calling xa_marked(). - You can copy entries out of the XArray into a plain array by calling -xa_extract(). Or you can iterate over the present entries in -the XArray by calling xa_for_each(). You may prefer to use -xa_find() or xa_find_after() to move to the next present -entry in the XArray. +xa_extract(). Or you can iterate over the present entries in the XArray +by calling xa_for_each(), xa_for_each_start() or xa_for_each_range(). +You may prefer to use xa_find() or xa_find_after() to move to the next +present entry in the XArray. Calling xa_store_range() stores the same entry in a range of indices. If you do this, some of the other operations will behave @@ -124,6 +112,31 @@ xa_destroy(). If the XArray entries are pointers, you may wish to free the entries first. You can do this by iterating over all present entries in the XArray using the xa_for_each() iterator. +Search Marks +------------ + +Each entry in the array has three bits associated with it called marks. +Each mark may be set or cleared independently of the others. You can +iterate over marked entries by using the xa_for_each_marked() iterator. + +You can enquire whether a mark is set on an entry by using +xa_get_mark(). If the entry is not ``NULL``, you can set a mark on it +by using xa_set_mark() and remove the mark from an entry by calling +xa_clear_mark(). You can ask whether any entry in the XArray has a +particular mark set by calling xa_marked(). Erasing an entry from the +XArray causes all marks associated with that entry to be cleared. + +Setting or clearing a mark on any index of a multi-index entry will +affect all indices covered by that entry. Querying the mark on any +index will return the same result. + +There is no way to iterate over entries which are not marked; the data +structure does not allow this to be implemented efficiently. There are +not currently iterators to search for logical combinations of bits (eg +iterate over all entries which have both ``XA_MARK_1`` and ``XA_MARK_2`` +set, or iterate over all entries which have ``XA_MARK_0`` or ``XA_MARK_2`` +set). It would be possible to add these if a user arises. + Allocating XArrays ------------------ @@ -180,6 +193,8 @@ No lock needed: Takes RCU read lock: * xa_load() * xa_for_each() + * xa_for_each_start() + * xa_for_each_range() * xa_find() * xa_find_after() * xa_extract() @@ -419,10 +434,9 @@ you last processed. If you have interrupts disabled while iterating, then it is good manners to pause the iteration and reenable interrupts every ``XA_CHECK_SCHED`` entries. -The xas_get_mark(), xas_set_mark() and -xas_clear_mark() functions require the xa_state cursor to have -been moved to the appropriate location in the xarray; they will do -nothing if you have called xas_pause() or xas_set() +The xas_get_mark(), xas_set_mark() and xas_clear_mark() functions require +the xa_state cursor to have been moved to the appropriate location in the +XArray; they will do nothing if you have called xas_pause() or xas_set() immediately before. You can call xas_set_update() to have a callback function diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 36890b026e77..1c4e1825d769 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -251,11 +251,11 @@ selectively from different subsystems. .. code-block:: c struct kcov_remote_arg { - unsigned trace_mode; - unsigned area_size; - unsigned num_handles; - uint64_t common_handle; - uint64_t handles[0]; + __u32 trace_mode; + __u32 area_size; + __u32 num_handles; + __aligned_u64 common_handle; + __aligned_u64 handles[0]; }; #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) diff --git a/Documentation/devicetree/bindings/i2c/i2c-at91.txt b/Documentation/devicetree/bindings/i2c/i2c-at91.txt index 2210f4359c45..8347b1e7c080 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-at91.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-at91.txt @@ -18,8 +18,10 @@ Optional properties: - dma-names: should contain "tx" and "rx". - atmel,fifo-size: maximum number of data the RX and TX FIFOs can store for FIFO capable I2C controllers. -- i2c-sda-hold-time-ns: TWD hold time, only available for "atmel,sama5d4-i2c" - and "atmel,sama5d2-i2c". +- i2c-sda-hold-time-ns: TWD hold time, only available for: + "atmel,sama5d4-i2c", + "atmel,sama5d2-i2c", + "microchip,sam9x60-i2c". - Child nodes conforming to i2c bus binding Examples : diff --git a/Documentation/devicetree/bindings/net/fsl-fman.txt b/Documentation/devicetree/bindings/net/fsl-fman.txt index 299c0dcd67db..250f8d8cdce4 100644 --- a/Documentation/devicetree/bindings/net/fsl-fman.txt +++ b/Documentation/devicetree/bindings/net/fsl-fman.txt @@ -403,6 +403,19 @@ PROPERTIES The settings and programming routines for internal/external MDIO are different. Must be included for internal MDIO. +- fsl,erratum-a011043 + Usage: optional + Value type: <boolean> + Definition: Indicates the presence of the A011043 erratum + describing that the MDIO_CFG[MDIO_RD_ER] bit may be falsely + set when reading internal PCS registers. MDIO reads to + internal PCS registers may result in having the + MDIO_CFG[MDIO_RD_ER] bit set, even when there is no error and + read data (MDIO_DATA[MDIO_DATA]) is correct. + Software may get false read error when reading internal + PCS registers through MDIO. As a workaround, all internal + MDIO accesses should ignore the MDIO_CFG[MDIO_RD_ER] bit. + For internal PHY device on internal mdio bus, a PHY node should be created. See the definition of the PHY node in booting-without-of.txt for an example of how to define a PHY (Internal PHY has no interrupt line). diff --git a/Documentation/devicetree/bindings/spi/spi-controller.yaml b/Documentation/devicetree/bindings/spi/spi-controller.yaml index 732339275848..1e0ca6ccf64b 100644 --- a/Documentation/devicetree/bindings/spi/spi-controller.yaml +++ b/Documentation/devicetree/bindings/spi/spi-controller.yaml @@ -111,7 +111,7 @@ patternProperties: spi-rx-bus-width: allOf: - $ref: /schemas/types.yaml#/definitions/uint32 - - enum: [ 1, 2, 4 ] + - enum: [ 1, 2, 4, 8 ] - default: 1 description: Bus width to the SPI bus used for MISO. @@ -123,7 +123,7 @@ patternProperties: spi-tx-bus-width: allOf: - $ref: /schemas/types.yaml#/definitions/uint32 - - enum: [ 1, 2, 4 ] + - enum: [ 1, 2, 4, 8 ] - default: 1 description: Bus width to the SPI bus used for MOSI. diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 059d58a549c7..6fb2b0671994 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -23,7 +23,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | ok | | sparc: | TODO | diff --git a/Documentation/media/v4l-drivers/meye.rst b/Documentation/media/v4l-drivers/meye.rst index a572996cdbf6..dc57a6a91b43 100644 --- a/Documentation/media/v4l-drivers/meye.rst +++ b/Documentation/media/v4l-drivers/meye.rst @@ -95,7 +95,7 @@ so all video4linux tools (like xawtv) should work with this driver. Besides the video4linux interface, the driver has a private interface for accessing the Motion Eye extended parameters (camera sharpness, -agc, video framerate), the shapshot and the MJPEG capture facilities. +agc, video framerate), the snapshot and the MJPEG capture facilities. This interface consists of several ioctls (prototypes and structures can be found in include/linux/meye.h): diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index c1f7f75e5fd9..4bc6ff29976a 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -25,6 +25,7 @@ Contents: mellanox/mlx5 netronome/nfp pensando/ionic + stmicro/stmmac .. only:: subproject and html diff --git a/Documentation/networking/device_drivers/microsoft/netvsc.txt b/Documentation/networking/device_drivers/microsoft/netvsc.txt index 3bfa635bbbd5..cd63556b27a0 100644 --- a/Documentation/networking/device_drivers/microsoft/netvsc.txt +++ b/Documentation/networking/device_drivers/microsoft/netvsc.txt @@ -82,3 +82,24 @@ Features contain one or more packets. The send buffer is an optimization, the driver will use slower method to handle very large packets or if the send buffer area is exhausted. + + XDP support + ----------- + XDP (eXpress Data Path) is a feature that runs eBPF bytecode at the early + stage when packets arrive at a NIC card. The goal is to increase performance + for packet processing, reducing the overhead of SKB allocation and other + upper network layers. + + hv_netvsc supports XDP in native mode, and transparently sets the XDP + program on the associated VF NIC as well. + + Setting / unsetting XDP program on synthetic NIC (netvsc) propagates to + VF NIC automatically. Setting / unsetting XDP program on VF NIC directly + is not recommended, also not propagated to synthetic NIC, and may be + overwritten by setting of synthetic NIC. + + XDP program cannot run with LRO (RSC) enabled, so you need to disable LRO + before running XDP: + ethtool -K eth0 lro off + + XDP_REDIRECT action is not yet supported. diff --git a/Documentation/networking/device_drivers/stmicro/stmmac.rst b/Documentation/networking/device_drivers/stmicro/stmmac.rst new file mode 100644 index 000000000000..c34bab3d2df0 --- /dev/null +++ b/Documentation/networking/device_drivers/stmicro/stmmac.rst @@ -0,0 +1,697 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +============================================================== +Linux Driver for the Synopsys(R) Ethernet Controllers "stmmac" +============================================================== + +Authors: Giuseppe Cavallaro <peppe.cavallaro@st.com>, +Alexandre Torgue <alexandre.torgue@st.com>, Jose Abreu <joabreu@synopsys.com> + +Contents +======== + +- In This Release +- Feature List +- Kernel Configuration +- Command Line Parameters +- Driver Information and Notes +- Debug Information +- Support + +In This Release +=============== + +This file describes the stmmac Linux Driver for all the Synopsys(R) Ethernet +Controllers. + +Currently, this network device driver is for all STi embedded MAC/GMAC +(i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XILINX XC2V3000 +FF1152AMT0221 D1215994A VIRTEX FPGA board. The Synopsys Ethernet QoS 5.0 IPK +is also supported. + +DesignWare(R) Cores Ethernet MAC 10/100/1000 Universal version 3.70a +(and older) and DesignWare(R) Cores Ethernet Quality-of-Service version 4.0 +(and upper) have been used for developing this driver as well as +DesignWare(R) Cores XGMAC - 10G Ethernet MAC. + +This driver supports both the platform bus and PCI. + +This driver includes support for the following Synopsys(R) DesignWare(R) +Cores Ethernet Controllers and corresponding minimum and maximum versions: + ++-------------------------------+--------------+--------------+--------------+ +| Controller Name | Min. Version | Max. Version | Abbrev. Name | ++===============================+==============+==============+==============+ +| Ethernet MAC Universal | N/A | 3.73a | GMAC | ++-------------------------------+--------------+--------------+--------------+ +| Ethernet Quality-of-Service | 4.00a | N/A | GMAC4+ | ++-------------------------------+--------------+--------------+--------------+ +| XGMAC - 10G Ethernet MAC | 2.10a | N/A | XGMAC2+ | ++-------------------------------+--------------+--------------+--------------+ + +For questions related to hardware requirements, refer to the documentation +supplied with your Ethernet adapter. All hardware requirements listed apply +to use with Linux. + +Feature List +============ + +The following features are available in this driver: + - GMII/MII/RGMII/SGMII/RMII/XGMII Interface + - Half-Duplex / Full-Duplex Operation + - Energy Efficient Ethernet (EEE) + - IEEE 802.3x PAUSE Packets (Flow Control) + - RMON/MIB Counters + - IEEE 1588 Timestamping (PTP) + - Pulse-Per-Second Output (PPS) + - MDIO Clause 22 / Clause 45 Interface + - MAC Loopback + - ARP Offloading + - Automatic CRC / PAD Insertion and Checking + - Checksum Offload for Received and Transmitted Packets + - Standard or Jumbo Ethernet Packets + - Source Address Insertion / Replacement + - VLAN TAG Insertion / Replacement / Deletion / Filtering (HASH and PERFECT) + - Programmable TX and RX Watchdog and Coalesce Settings + - Destination Address Filtering (PERFECT) + - HASH Filtering (Multicast) + - Layer 3 / Layer 4 Filtering + - Remote Wake-Up Detection + - Receive Side Scaling (RSS) + - Frame Preemption for TX and RX + - Programmable Burst Length, Threshold, Queue Size + - Multiple Queues (up to 8) + - Multiple Scheduling Algorithms (TX: WRR, DWRR, WFQ, SP, CBS, EST, TBS; + RX: WRR, SP) + - Flexible RX Parser + - TCP / UDP Segmentation Offload (TSO, USO) + - Split Header (SPH) + - Safety Features (ECC Protection, Data Parity Protection) + - Selftests using Ethtool + +Kernel Configuration +==================== + +The kernel configuration option is ``CONFIG_STMMAC_ETH``: + - ``CONFIG_STMMAC_PLATFORM``: is to enable the platform driver. + - ``CONFIG_STMMAC_PCI``: is to enable the pci driver. + +Command Line Parameters +======================= + +If the driver is built as a module the following optional parameters are used +by entering them on the command line with the modprobe command using this +syntax (e.g. for PCI module):: + + modprobe stmmac_pci [<option>=<VAL1>,<VAL2>,...] + +Driver parameters can be also passed in command line by using:: + + stmmaceth=watchdog:100,chain_mode=1 + +The default value for each parameter is generally the recommended setting, +unless otherwise noted. + +watchdog +-------- +:Valid Range: 5000-None +:Default Value: 5000 + +This parameter overrides the transmit timeout in milliseconds. + +debug +----- +:Valid Range: 0-16 (0=none,...,16=all) +:Default Value: 0 + +This parameter adjusts the level of debug messages displayed in the system +logs. + +phyaddr +------- +:Valid Range: 0-31 +:Default Value: -1 + +This parameter overrides the physical address of the PHY device. + +flow_ctrl +--------- +:Valid Range: 0-3 (0=off,1=rx,2=tx,3=rx/tx) +:Default Value: 3 + +This parameter changes the default Flow Control ability. + +pause +----- +:Valid Range: 0-65535 +:Default Value: 65535 + +This parameter changes the default Flow Control Pause time. + +tc +-- +:Valid Range: 64-256 +:Default Value: 64 + +This parameter changes the default HW FIFO Threshold control value. + +buf_sz +------ +:Valid Range: 1536-16384 +:Default Value: 1536 + +This parameter changes the default RX DMA packet buffer size. + +eee_timer +--------- +:Valid Range: 0-None +:Default Value: 1000 + +This parameter changes the default LPI TX Expiration time in milliseconds. + +chain_mode +---------- +:Valid Range: 0-1 (0=off,1=on) +:Default Value: 0 + +This parameter changes the default mode of operation from Ring Mode to +Chain Mode. + +Driver Information and Notes +============================ + +Transmit Process +---------------- + +The xmit method is invoked when the kernel needs to transmit a packet; it sets +the descriptors in the ring and informs the DMA engine that there is a packet +ready to be transmitted. + +By default, the driver sets the ``NETIF_F_SG`` bit in the features field of +the ``net_device`` structure, enabling the scatter-gather feature. This is +true on chips and configurations where the checksum can be done in hardware. + +Once the controller has finished transmitting the packet, timer will be +scheduled to release the transmit resources. + +Receive Process +--------------- + +When one or more packets are received, an interrupt happens. The interrupts +are not queued, so the driver has to scan all the descriptors in the ring +during the receive process. + +This is based on NAPI, so the interrupt handler signals only if there is work +to be done, and it exits. Then the poll method will be scheduled at some +future point. + +The incoming packets are stored, by the DMA, in a list of pre-allocated socket +buffers in order to avoid the memcpy (zero-copy). + +Interrupt Mitigation +-------------------- + +The driver is able to mitigate the number of its DMA interrupts using NAPI for +the reception on chips older than the 3.50. New chips have an HW RX Watchdog +used for this mitigation. + +Mitigation parameters can be tuned by ethtool. + +WoL +--- + +Wake up on Lan feature through Magic and Unicast frames are supported for the +GMAC, GMAC4/5 and XGMAC core. + +DMA Descriptors +--------------- + +Driver handles both normal and alternate descriptors. The latter has been only +tested on DesignWare(R) Cores Ethernet MAC Universal version 3.41a and later. + +stmmac supports DMA descriptor to operate both in dual buffer (RING) and +linked-list(CHAINED) mode. In RING each descriptor points to two data buffer +pointers whereas in CHAINED mode they point to only one data buffer pointer. +RING mode is the default. + +In CHAINED mode each descriptor will have pointer to next descriptor in the +list, hence creating the explicit chaining in the descriptor itself, whereas +such explicit chaining is not possible in RING mode. + +Extended Descriptors +-------------------- + +The extended descriptors give us information about the Ethernet payload when +it is carrying PTP packets or TCP/UDP/ICMP over IP. These are not available on +GMAC Synopsys(R) chips older than the 3.50. At probe time the driver will +decide if these can be actually used. This support also is mandatory for PTPv2 +because the extra descriptors are used for saving the hardware timestamps and +Extended Status. + +Ethtool Support +--------------- + +Ethtool is supported. For example, driver statistics (including RMON), +internal errors can be taken using:: + + ethtool -S ethX + +Ethtool selftests are also supported. This allows to do some early sanity +checks to the HW using MAC and PHY loopback mechanisms:: + + ethtool -t ethX + +Jumbo and Segmentation Offloading +--------------------------------- + +Jumbo frames are supported and tested for the GMAC. The GSO has been also +added but it's performed in software. LRO is not supported. + +TSO Support +----------- + +TSO (TCP Segmentation Offload) feature is supported by GMAC > 4.x and XGMAC +chip family. When a packet is sent through TCP protocol, the TCP stack ensures +that the SKB provided to the low level driver (stmmac in our case) matches +with the maximum frame len (IP header + TCP header + payload <= 1500 bytes +(for MTU set to 1500)). It means that if an application using TCP want to send +a packet which will have a length (after adding headers) > 1514 the packet +will be split in several TCP packets: The data payload is split and headers +(TCP/IP ..) are added. It is done by software. + +When TSO is enabled, the TCP stack doesn't care about the maximum frame length +and provide SKB packet to stmmac as it is. The GMAC IP will have to perform +the segmentation by it self to match with maximum frame length. + +This feature can be enabled in device tree through ``snps,tso`` entry. + +Energy Efficient Ethernet +------------------------- + +Energy Efficient Ethernet (EEE) enables IEEE 802.3 MAC sublayer along with a +family of Physical layer to operate in the Low Power Idle (LPI) mode. The EEE +mode supports the IEEE 802.3 MAC operation at 100Mbps, 1000Mbps and 1Gbps. + +The LPI mode allows power saving by switching off parts of the communication +device functionality when there is no data to be transmitted & received. +The system on both the side of the link can disable some functionalities and +save power during the period of low-link utilization. The MAC controls whether +the system should enter or exit the LPI mode and communicate this to PHY. + +As soon as the interface is opened, the driver verifies if the EEE can be +supported. This is done by looking at both the DMA HW capability register and +the PHY devices MCD registers. + +To enter in TX LPI mode the driver needs to have a software timer that enable +and disable the LPI mode when there is nothing to be transmitted. + +Precision Time Protocol (PTP) +----------------------------- + +The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP), which +enables precise synchronization of clocks in measurement and control systems +implemented with technologies such as network communication. + +In addition to the basic timestamp features mentioned in IEEE 1588-2002 +Timestamps, new GMAC cores support the advanced timestamp features. +IEEE 1588-2008 can be enabled when configuring the Kernel. + +SGMII/RGMII Support +------------------- + +New GMAC devices provide own way to manage RGMII/SGMII. This information is +available at run-time by looking at the HW capability register. This means +that the stmmac can manage auto-negotiation and link status w/o using the +PHYLIB stuff. In fact, the HW provides a subset of extended registers to +restart the ANE, verify Full/Half duplex mode and Speed. Thanks to these +registers, it is possible to look at the Auto-negotiated Link Parter Ability. + +Physical +-------- + +The driver is compatible with Physical Abstraction Layer to be connected with +PHY and GPHY devices. + +Platform Information +-------------------- + +Several information can be passed through the platform and device-tree. + +:: + + struct plat_stmmacenet_data { + +1) Bus identifier:: + + int bus_id; + +2) PHY Physical Address. If set to -1 the driver will pick the first PHY it +finds:: + + int phy_addr; + +3) PHY Device Interface:: + + int interface; + +4) Specific platform fields for the MDIO bus:: + + struct stmmac_mdio_bus_data *mdio_bus_data; + +5) Internal DMA parameters:: + + struct stmmac_dma_cfg *dma_cfg; + +6) Fixed CSR Clock Range selection:: + + int clk_csr; + +7) HW uses the GMAC core:: + + int has_gmac; + +8) If set the MAC will use Enhanced Descriptors:: + + int enh_desc; + +9) Core is able to perform TX Checksum and/or RX Checksum in HW:: + + int tx_coe; + int rx_coe; + +11) Some HWs are not able to perform the csum in HW for over-sized frames due +to limited buffer sizes. Setting this flag the csum will be done in SW on +JUMBO frames:: + + int bugged_jumbo; + +12) Core has the embedded power module:: + + int pmt; + +13) Force DMA to use the Store and Forward mode or Threshold mode:: + + int force_sf_dma_mode; + int force_thresh_dma_mode; + +15) Force to disable the RX Watchdog feature and switch to NAPI mode:: + + int riwt_off; + +16) Limit the maximum operating speed and MTU:: + + int max_speed; + int maxmtu; + +18) Number of Multicast/Unicast filters:: + + int multicast_filter_bins; + int unicast_filter_entries; + +20) Limit the maximum TX and RX FIFO size:: + + int tx_fifo_size; + int rx_fifo_size; + +21) Use the specified number of TX and RX Queues:: + + u32 rx_queues_to_use; + u32 tx_queues_to_use; + +22) Use the specified TX and RX scheduling algorithm:: + + u8 rx_sched_algorithm; + u8 tx_sched_algorithm; + +23) Internal TX and RX Queue parameters:: + + struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; + struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + +24) This callback is used for modifying some syscfg registers (on ST SoCs) +according to the link speed negotiated by the physical layer:: + + void (*fix_mac_speed)(void *priv, unsigned int speed); + +25) Callbacks used for calling a custom initialization; This is sometimes +necessary on some platforms (e.g. ST boxes) where the HW needs to have set +some PIO lines or system cfg registers. init/exit callbacks should not use +or modify platform data:: + + int (*init)(struct platform_device *pdev, void *priv); + void (*exit)(struct platform_device *pdev, void *priv); + +26) Perform HW setup of the bus. For example, on some ST platforms this field +is used to configure the AMBA bridge to generate more efficient STBus traffic:: + + struct mac_device_info *(*setup)(void *priv); + void *bsp_priv; + +27) Internal clocks and rates:: + + struct clk *stmmac_clk; + struct clk *pclk; + struct clk *clk_ptp_ref; + unsigned int clk_ptp_rate; + unsigned int clk_ref_rate; + s32 ptp_max_adj; + +28) Main reset:: + + struct reset_control *stmmac_rst; + +29) AXI Internal Parameters:: + + struct stmmac_axi *axi; + +30) HW uses GMAC>4 cores:: + + int has_gmac4; + +31) HW is sun8i based:: + + bool has_sun8i; + +32) Enables TSO feature:: + + bool tso_en; + +33) Enables Receive Side Scaling (RSS) feature:: + + int rss_en; + +34) MAC Port selection:: + + int mac_port_sel_speed; + +35) Enables TX LPI Clock Gating:: + + bool en_tx_lpi_clockgating; + +36) HW uses XGMAC>2.10 cores:: + + int has_xgmac; + +:: + + } + +For MDIO bus data, we have: + +:: + + struct stmmac_mdio_bus_data { + +1) PHY mask passed when MDIO bus is registered:: + + unsigned int phy_mask; + +2) List of IRQs, one per PHY:: + + int *irqs; + +3) If IRQs is NULL, use this for probed PHY:: + + int probed_phy_irq; + +4) Set to true if PHY needs reset:: + + bool needs_reset; + +:: + + } + +For DMA engine configuration, we have: + +:: + + struct stmmac_dma_cfg { + +1) Programmable Burst Length (TX and RX):: + + int pbl; + +2) If set, DMA TX / RX will use this value rather than pbl:: + + int txpbl; + int rxpbl; + +3) Enable 8xPBL:: + + bool pblx8; + +4) Enable Fixed or Mixed burst:: + + int fixed_burst; + int mixed_burst; + +5) Enable Address Aligned Beats:: + + bool aal; + +6) Enable Enhanced Addressing (> 32 bits):: + + bool eame; + +:: + + } + +For DMA AXI parameters, we have: + +:: + + struct stmmac_axi { + +1) Enable AXI LPI:: + + bool axi_lpi_en; + bool axi_xit_frm; + +2) Set AXI Write / Read maximum outstanding requests:: + + u32 axi_wr_osr_lmt; + u32 axi_rd_osr_lmt; + +3) Set AXI 4KB bursts:: + + bool axi_kbbe; + +4) Set AXI maximum burst length map:: + + u32 axi_blen[AXI_BLEN]; + +5) Set AXI Fixed burst / mixed burst:: + + bool axi_fb; + bool axi_mb; + +6) Set AXI rebuild incrx mode:: + + bool axi_rb; + +:: + + } + +For the RX Queues configuration, we have: + +:: + + struct stmmac_rxq_cfg { + +1) Mode to use (DCB or AVB):: + + u8 mode_to_use; + +2) DMA channel to use:: + + u32 chan; + +3) Packet routing, if applicable:: + + u8 pkt_route; + +4) Use priority routing, and priority to route:: + + bool use_prio; + u32 prio; + +:: + + } + +For the TX Queues configuration, we have: + +:: + + struct stmmac_txq_cfg { + +1) Queue weight in scheduler:: + + u32 weight; + +2) Mode to use (DCB or AVB):: + + u8 mode_to_use; + +3) Credit Base Shaper Parameters:: + + u32 send_slope; + u32 idle_slope; + u32 high_credit; + u32 low_credit; + +4) Use priority scheduling, and priority:: + + bool use_prio; + u32 prio; + +:: + + } + +Device Tree Information +----------------------- + +Please refer to the following document: +Documentation/devicetree/bindings/net/snps,dwmac.yaml + +HW Capabilities +--------------- + +Note that, starting from new chips, where it is available the HW capability +register, many configurations are discovered at run-time for example to +understand if EEE, HW csum, PTP, enhanced descriptor etc are actually +available. As strategy adopted in this driver, the information from the HW +capability register can replace what has been passed from the platform. + +Debug Information +================= + +The driver exports many information i.e. internal statistics, debug +information, MAC and DMA registers etc. + +These can be read in several ways depending on the type of the information +actually needed. + +For example a user can be use the ethtool support to get statistics: e.g. +using: ``ethtool -S ethX`` (that shows the Management counters (MMC) if +supported) or sees the MAC/DMA registers: e.g. using: ``ethtool -d ethX`` + +Compiling the Kernel with ``CONFIG_DEBUG_FS`` the driver will export the +following debugfs entries: + + - ``descriptors_status``: To show the DMA TX/RX descriptor rings + - ``dma_cap``: To show the HW Capabilities + +Developer can also use the ``debug`` module parameter to get further debug +information (please see: NETIF Msg Level). + +Support +======= + +If an issue is identified with the released source code on a supported kernel +with a supported adapter, email the specific information related to the +issue to netdev@vger.kernel.org diff --git a/Documentation/networking/device_drivers/stmicro/stmmac.txt b/Documentation/networking/device_drivers/stmicro/stmmac.txt deleted file mode 100644 index 1ae979fd90d2..000000000000 --- a/Documentation/networking/device_drivers/stmicro/stmmac.txt +++ /dev/null @@ -1,401 +0,0 @@ - STMicroelectronics 10/100/1000 Synopsys Ethernet driver - -Copyright (C) 2007-2015 STMicroelectronics Ltd -Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> - -This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers -(Synopsys IP blocks). - -Currently this network device driver is for all STi embedded MAC/GMAC -(i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XLINX XC2V3000 -FF1152AMT0221 D1215994A VIRTEX FPGA board. - -DWC Ether MAC 10/100/1000 Universal version 3.70a (and older) and DWC Ether -MAC 10/100 Universal version 4.0 have been used for developing this driver. - -This driver supports both the platform bus and PCI. - -Please, for more information also visit: www.stlinux.com - -1) Kernel Configuration -The kernel configuration option is STMMAC_ETH: - Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) ---> - STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH) - -CONFIG_STMMAC_PLATFORM: is to enable the platform driver. -CONFIG_STMMAC_PCI: is to enable the pci driver. - -2) Driver parameters list: - debug: message level (0: no output, 16: all); - phyaddr: to manually provide the physical address to the PHY device; - buf_sz: DMA buffer size; - tc: control the HW FIFO threshold; - watchdog: transmit timeout (in milliseconds); - flow_ctrl: Flow control ability [on/off]; - pause: Flow Control Pause Time; - eee_timer: tx EEE timer; - chain_mode: select chain mode instead of ring. - -3) Command line options -Driver parameters can be also passed in command line by using: - stmmaceth=watchdog:100,chain_mode=1 - -4) Driver information and notes - -4.1) Transmit process -The xmit method is invoked when the kernel needs to transmit a packet; it sets -the descriptors in the ring and informs the DMA engine, that there is a packet -ready to be transmitted. -By default, the driver sets the NETIF_F_SG bit in the features field of the -net_device structure, enabling the scatter-gather feature. This is true on -chips and configurations where the checksum can be done in hardware. -Once the controller has finished transmitting the packet, timer will be -scheduled to release the transmit resources. - -4.2) Receive process -When one or more packets are received, an interrupt happens. The interrupts -are not queued, so the driver has to scan all the descriptors in the ring during -the receive process. -This is based on NAPI, so the interrupt handler signals only if there is work -to be done, and it exits. -Then the poll method will be scheduled at some future point. -The incoming packets are stored, by the DMA, in a list of pre-allocated socket -buffers in order to avoid the memcpy (zero-copy). - -4.3) Interrupt mitigation -The driver is able to mitigate the number of its DMA interrupts -using NAPI for the reception on chips older than the 3.50. -New chips have an HW RX-Watchdog used for this mitigation. -Mitigation parameters can be tuned by ethtool. - -4.4) WOL -Wake up on Lan feature through Magic and Unicast frames are supported for the -GMAC core. - -4.5) DMA descriptors -Driver handles both normal and alternate descriptors. The latter has been only -tested on DWC Ether MAC 10/100/1000 Universal version 3.41a and later. - -STMMAC supports DMA descriptor to operate both in dual buffer (RING) -and linked-list(CHAINED) mode. In RING each descriptor points to two -data buffer pointers whereas in CHAINED mode they point to only one data -buffer pointer. RING mode is the default. - -In CHAINED mode each descriptor will have pointer to next descriptor in -the list, hence creating the explicit chaining in the descriptor itself, -whereas such explicit chaining is not possible in RING mode. - -4.5.1) Extended descriptors -The extended descriptors give us information about the Ethernet payload -when it is carrying PTP packets or TCP/UDP/ICMP over IP. -These are not available on GMAC Synopsys chips older than the 3.50. -At probe time the driver will decide if these can be actually used. -This support also is mandatory for PTPv2 because the extra descriptors -are used for saving the hardware timestamps and Extended Status. - -4.6) Ethtool support -Ethtool is supported. - -For example, driver statistics (including RMON), internal errors can be taken -using: - # ethtool -S ethX -command - -4.7) Jumbo and Segmentation Offloading -Jumbo frames are supported and tested for the GMAC. -The GSO has been also added but it's performed in software. -LRO is not supported. - -4.8) Physical -The driver is compatible with Physical Abstraction Layer to be connected with -PHY and GPHY devices. - -4.9) Platform information -Several information can be passed through the platform and device-tree. - -struct plat_stmmacenet_data { - char *phy_bus_name; - int bus_id; - int phy_addr; - int interface; - struct stmmac_mdio_bus_data *mdio_bus_data; - struct stmmac_dma_cfg *dma_cfg; - int clk_csr; - int has_gmac; - int enh_desc; - int tx_coe; - int rx_coe; - int bugged_jumbo; - int pmt; - int force_sf_dma_mode; - int force_thresh_dma_mode; - int riwt_off; - int max_speed; - int maxmtu; - void (*fix_mac_speed)(void *priv, unsigned int speed); - void (*bus_setup)(void __iomem *ioaddr); - int (*init)(struct platform_device *pdev, void *priv); - void (*exit)(struct platform_device *pdev, void *priv); - void *bsp_priv; - int has_gmac4; - bool tso_en; -}; - -Where: - o phy_bus_name: phy bus name to attach to the stmmac. - o bus_id: bus identifier. - o phy_addr: the physical address can be passed from the platform. - If it is set to -1 the driver will automatically - detect it at run-time by probing all the 32 addresses. - o interface: PHY device's interface. - o mdio_bus_data: specific platform fields for the MDIO bus. - o dma_cfg: internal DMA parameters - o pbl: the Programmable Burst Length is maximum number of beats to - be transferred in one DMA transaction. - GMAC also enables the 4xPBL by default. (8xPBL for GMAC 3.50 and newer) - o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx. - o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default. - o fixed_burst/mixed_burst/aal - o clk_csr: fixed CSR Clock range selection. - o has_gmac: uses the GMAC core. - o enh_desc: if sets the MAC will use the enhanced descriptor structure. - o tx_coe: core is able to perform the tx csum in HW. - o rx_coe: the supports three check sum offloading engine types: - type_1, type_2 (full csum) and no RX coe. - o bugged_jumbo: some HWs are not able to perform the csum in HW for - over-sized frames due to limited buffer sizes. - Setting this flag the csum will be done in SW on - JUMBO frames. - o pmt: core has the embedded power module (optional). - o force_sf_dma_mode: force DMA to use the Store and Forward mode - instead of the Threshold. - o force_thresh_dma_mode: force DMA to use the Threshold mode other than - the Store and Forward mode. - o riwt_off: force to disable the RX watchdog feature and switch to NAPI mode. - o fix_mac_speed: this callback is used for modifying some syscfg registers - (on ST SoCs) according to the link speed negotiated by the - physical layer . - o bus_setup: perform HW setup of the bus. For example, on some ST platforms - this field is used to configure the AMBA bridge to generate more - efficient STBus traffic. - o init/exit: callbacks used for calling a custom initialization; - this is sometime necessary on some platforms (e.g. ST boxes) - where the HW needs to have set some PIO lines or system cfg - registers. init/exit callbacks should not use or modify - platform data. - o bsp_priv: another private pointer. - o has_gmac4: uses GMAC4 core. - o tso_en: Enables TSO (TCP Segmentation Offload) feature. - -For MDIO bus The we have: - - struct stmmac_mdio_bus_data { - int (*phy_reset)(void *priv); - unsigned int phy_mask; - int *irqs; - int probed_phy_irq; - }; - -Where: - o phy_reset: hook to reset the phy device attached to the bus. - o phy_mask: phy mask passed when register the MDIO bus within the driver. - o irqs: list of IRQs, one per PHY. - o probed_phy_irq: if irqs is NULL, use this for probed PHY. - -For DMA engine we have the following internal fields that should be -tuned according to the HW capabilities. - -struct stmmac_dma_cfg { - int pbl; - int txpbl; - int rxpbl; - bool pblx8; - int fixed_burst; - int mixed_burst; - bool aal; -}; - -Where: - o pbl: Programmable Burst Length (tx and rx) - o txpbl: Transmit Programmable Burst Length. Only for GMAC and newer. - If set, DMA tx will use this value rather than pbl. - o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer. - If set, DMA rx will use this value rather than pbl. - o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default. - o fixed_burst: program the DMA to use the fixed burst mode - o mixed_burst: program the DMA to use the mixed burst mode - o aal: Address-Aligned Beats - ---- - -Below an example how the structures above are using on ST platforms. - - static struct plat_stmmacenet_data stxYYY_ethernet_platform_data = { - .has_gmac = 0, - .enh_desc = 0, - .fix_mac_speed = stxYYY_ethernet_fix_mac_speed, - | - |-> to write an internal syscfg - | on this platform when the - | link speed changes from 10 to - | 100 and viceversa - .init = &stmmac_claim_resource, - | - |-> On ST SoC this calls own "PAD" - | manager framework to claim - | all the resources necessary - | (GPIO ...). The .custom_cfg field - | is used to pass a custom config. -}; - -Below the usage of the stmmac_mdio_bus_data: on this SoC, in fact, -there are two MAC cores: one MAC is for MDIO Bus/PHY emulation -with fixed_link support. - -static struct stmmac_mdio_bus_data stmmac1_mdio_bus = { - .phy_reset = phy_reset; - | - |-> function to provide the phy_reset on this board - .phy_mask = 0, -}; - -static struct fixed_phy_status stmmac0_fixed_phy_status = { - .link = 1, - .speed = 100, - .duplex = 1, -}; - -During the board's device_init we can configure the first -MAC for fixed_link by calling: - fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status); -and the second one, with a real PHY device attached to the bus, -by using the stmmac_mdio_bus_data structure (to provide the id, the -reset procedure etc). - -Note that, starting from new chips, where it is available the HW capability -register, many configurations are discovered at run-time for example to -understand if EEE, HW csum, PTP, enhanced descriptor etc are actually -available. As strategy adopted in this driver, the information from the HW -capability register can replace what has been passed from the platform. - -4.10) Device-tree support. - -Please see the following document: - Documentation/devicetree/bindings/net/stmmac.txt - -4.11) This is a summary of the content of some relevant files: - o stmmac_main.c: implements the main network device driver; - o stmmac_mdio.c: provides MDIO functions; - o stmmac_pci: this is the PCI driver; - o stmmac_platform.c: this the platform driver (OF supported); - o stmmac_ethtool.c: implements the ethtool support; - o stmmac.h: private driver structure; - o common.h: common definitions and VFTs; - o mmc_core.c/mmc.h: Management MAC Counters; - o stmmac_hwtstamp.c: HW timestamp support for PTP; - o stmmac_ptp.c: PTP 1588 clock; - o stmmac_pcs.h: Physical Coding Sublayer common implementation; - o dwmac-<XXX>.c: these are for the platform glue-logic file; e.g. dwmac-sti.c - for STMicroelectronics SoCs. - -- GMAC 3.x - o descs.h: descriptor structure definitions; - o dwmac1000_core.c: dwmac GiGa core functions; - o dwmac1000_dma.c: dma functions for the GMAC chip; - o dwmac1000.h: specific header file for the dwmac GiGa; - o dwmac100_core: dwmac 100 core code; - o dwmac100_dma.c: dma functions for the dwmac 100 chip; - o dwmac1000.h: specific header file for the MAC; - o dwmac_lib.c: generic DMA functions; - o enh_desc.c: functions for handling enhanced descriptors; - o norm_desc.c: functions for handling normal descriptors; - o chain_mode.c/ring_mode.c:: functions to manage RING/CHAINED modes; - -- GMAC4.x generation - o dwmac4_core.c: dwmac GMAC4.x core functions; - o dwmac4_desc.c: functions for handling GMAC4.x descriptors; - o dwmac4_descs.h: descriptor definitions; - o dwmac4_dma.c: dma functions for the GMAC4.x chip; - o dwmac4_dma.h: dma definitions for the GMAC4.x chip; - o dwmac4.h: core definitions for the GMAC4.x chip; - o dwmac4_lib.c: generic GMAC4.x functions; - -4.12) TSO support (GMAC4.x) - -TSO (Tcp Segmentation Offload) feature is supported by GMAC 4.x chip family. -When a packet is sent through TCP protocol, the TCP stack ensures that -the SKB provided to the low level driver (stmmac in our case) matches with -the maximum frame len (IP header + TCP header + payload <= 1500 bytes (for -MTU set to 1500)). It means that if an application using TCP want to send a -packet which will have a length (after adding headers) > 1514 the packet -will be split in several TCP packets: The data payload is split and headers -(TCP/IP ..) are added. It is done by software. - -When TSO is enabled, the TCP stack doesn't care about the maximum frame -length and provide SKB packet to stmmac as it is. The GMAC IP will have to -perform the segmentation by it self to match with maximum frame length. - -This feature can be enabled in device tree through "snps,tso" entry. - -5) Debug Information - -The driver exports many information i.e. internal statistics, -debug information, MAC and DMA registers etc. - -These can be read in several ways depending on the -type of the information actually needed. - -For example a user can be use the ethtool support -to get statistics: e.g. using: ethtool -S ethX -(that shows the Management counters (MMC) if supported) -or sees the MAC/DMA registers: e.g. using: ethtool -d ethX - -Compiling the Kernel with CONFIG_DEBUG_FS the driver will export the following -debugfs entries: - -/sys/kernel/debug/stmmaceth/descriptors_status - To show the DMA TX/RX descriptor rings - -Developer can also use the "debug" module parameter to get further debug -information (please see: NETIF Msg Level). - -6) Energy Efficient Ethernet - -Energy Efficient Ethernet(EEE) enables IEEE 802.3 MAC sublayer along -with a family of Physical layer to operate in the Low power Idle(LPI) -mode. The EEE mode supports the IEEE 802.3 MAC operation at 100Mbps, -1000Mbps & 10Gbps. - -The LPI mode allows power saving by switching off parts of the -communication device functionality when there is no data to be -transmitted & received. The system on both the side of the link can -disable some functionalities & save power during the period of low-link -utilization. The MAC controls whether the system should enter or exit -the LPI mode & communicate this to PHY. - -As soon as the interface is opened, the driver verifies if the EEE can -be supported. This is done by looking at both the DMA HW capability -register and the PHY devices MCD registers. -To enter in Tx LPI mode the driver needs to have a software timer -that enable and disable the LPI mode when there is nothing to be -transmitted. - -7) Precision Time Protocol (PTP) -The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP), -which enables precise synchronization of clocks in measurement and -control systems implemented with technologies such as network -communication. - -In addition to the basic timestamp features mentioned in IEEE 1588-2002 -Timestamps, new GMAC cores support the advanced timestamp features. -IEEE 1588-2008 that can be enabled when configure the Kernel. - -8) SGMII/RGMII support -New GMAC devices provide own way to manage RGMII/SGMII. -This information is available at run-time by looking at the -HW capability register. This means that the stmmac can manage -auto-negotiation and link status w/o using the PHYLIB stuff. -In fact, the HW provides a subset of extended registers to -restart the ANE, verify Full/Half duplex mode and Speed. -Thanks to these registers, it is possible to look at the -Auto-negotiated Link Parter Ability. diff --git a/Documentation/networking/device_drivers/ti/cpsw_switchdev.txt b/Documentation/networking/device_drivers/ti/cpsw_switchdev.txt index 5c8cee17fca9..12855ab268b8 100644 --- a/Documentation/networking/device_drivers/ti/cpsw_switchdev.txt +++ b/Documentation/networking/device_drivers/ti/cpsw_switchdev.txt @@ -39,7 +39,7 @@ but without enabling "switch" mode, or to different bridges. Devlink configuration parameters ==================== -See Documentation/networking/devlink-params-ti-cpsw-switch.txt +See Documentation/networking/devlink/ti-cpsw-switch.rst ==================== # Bridging in dual mac mode diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt deleted file mode 100644 index 1db3fbea0831..000000000000 --- a/Documentation/networking/devlink-health.txt +++ /dev/null @@ -1,86 +0,0 @@ -The health mechanism is targeted for Real Time Alerting, in order to know when -something bad had happened to a PCI device -- Provide alert debug information -- Self healing -- If problem needs vendor support, provide a way to gather all needed debugging - information. - -The main idea is to unify and centralize driver health reports in the -generic devlink instance and allow the user to set different -attributes of the health reporting and recovery procedures. - -The devlink health reporter: -Device driver creates a "health reporter" per each error/health type. -Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error) -or unknown (driver specific). -For each registered health reporter a driver can issue error/health reports -asynchronously. All health reports handling is done by devlink. -Device driver can provide specific callbacks for each "health reporter", e.g. - - Recovery procedures - - Diagnostics and object dump procedures - - OOB initial parameters -Different parts of the driver can register different types of health reporters -with different handlers. - -Once an error is reported, devlink health will do the following actions: - * A log is being send to the kernel trace events buffer - * Health status and statistics are being updated for the reporter instance - * Object dump is being taken and saved at the reporter instance (as long as - there is no other dump which is already stored) - * Auto recovery attempt is being done. Depends on: - - Auto-recovery configuration - - Grace period vs. time passed since last recover - -The user interface: -User can access/change each reporter's parameters and driver specific callbacks -via devlink, e.g per error type (per health reporter) - - Configure reporter's generic parameters (like: disable/enable auto recovery) - - Invoke recovery procedure - - Run diagnostics - - Object dump - -The devlink health interface (via netlink): -DEVLINK_CMD_HEALTH_REPORTER_GET - Retrieves status and configuration info per DEV and reporter. -DEVLINK_CMD_HEALTH_REPORTER_SET - Allows reporter-related configuration setting. -DEVLINK_CMD_HEALTH_REPORTER_RECOVER - Triggers a reporter's recovery procedure. -DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE - Retrieves diagnostics data from a reporter on a device. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET - Retrieves the last stored dump. Devlink health - saves a single dump. If an dump is not already stored by the devlink - for this reporter, devlink generates a new dump. - dump output is defined by the reporter. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR - Clears the last saved dump file for the specified reporter. - - - netlink - +--------------------------+ - | | - | + | - | | | - +--------------------------+ - |request for ops - |(diagnose, - mlx5_core devlink |recover, - |dump) -+--------+ +--------------------------+ -| | | reporter| | -| | | +---------v----------+ | -| | ops execution | | | | -| <----------------------------------+ | | -| | | | | | -| | | + ^------------------+ | -| | | | request for ops | -| | | | (recover, dump) | -| | | | | -| | | +-+------------------+ | -| | health report | | health handler | | -| +-------------------------------> | | -| | | +--------------------+ | -| | health reporter create | | -| +----------------------------> | -+--------+ +--------------------------+ diff --git a/Documentation/networking/devlink-info-versions.rst b/Documentation/networking/devlink-info-versions.rst deleted file mode 100644 index 4914f581b1fd..000000000000 --- a/Documentation/networking/devlink-info-versions.rst +++ /dev/null @@ -1,64 +0,0 @@ -.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) - -===================== -Devlink info versions -===================== - -board.id -======== - -Unique identifier of the board design. - -board.rev -========= - -Board design revision. - -asic.id -======= - -ASIC design identifier. - -asic.rev -======== - -ASIC design revision. - -board.manufacture -================= - -An identifier of the company or the facility which produced the part. - -fw -== - -Overall firmware version, often representing the collection of -fw.mgmt, fw.app, etc. - -fw.mgmt -======= - -Control unit firmware version. This firmware is responsible for house -keeping tasks, PHY control etc. but not the packet-by-packet data path -operation. - -fw.app -====== - -Data path microcode controlling high-speed packet processing. - -fw.undi -======= - -UNDI software, may include the UEFI driver, firmware or both. - -fw.ncsi -======= - -Version of the software responsible for supporting/handling the -Network Controller Sideband Interface. - -fw.psid -======= - -Unique identifier of the firmware parameter set. diff --git a/Documentation/networking/devlink-params-bnxt.txt b/Documentation/networking/devlink-params-bnxt.txt deleted file mode 100644 index 481aa303d5b4..000000000000 --- a/Documentation/networking/devlink-params-bnxt.txt +++ /dev/null @@ -1,18 +0,0 @@ -enable_sriov [DEVICE, GENERIC] - Configuration mode: Permanent - -ignore_ari [DEVICE, GENERIC] - Configuration mode: Permanent - -msix_vec_per_pf_max [DEVICE, GENERIC] - Configuration mode: Permanent - -msix_vec_per_pf_min [DEVICE, GENERIC] - Configuration mode: Permanent - -gre_ver_check [DEVICE, DRIVER-SPECIFIC] - Generic Routing Encapsulation (GRE) version check will - be enabled in the device. If disabled, device skips - version checking for incoming packets. - Type: Boolean - Configuration mode: Permanent diff --git a/Documentation/networking/devlink-params-mlx5.txt b/Documentation/networking/devlink-params-mlx5.txt deleted file mode 100644 index 5071467118bd..000000000000 --- a/Documentation/networking/devlink-params-mlx5.txt +++ /dev/null @@ -1,17 +0,0 @@ -flow_steering_mode [DEVICE, DRIVER-SPECIFIC] - Controls the flow steering mode of the driver. - Two modes are supported: - 1. 'dmfs' - Device managed flow steering. - 2. 'smfs - Software/Driver managed flow steering. - In DMFS mode, the HW steering entities are created and - managed through the Firmware. - In SMFS mode, the HW steering entities are created and - managed though by the driver directly into Hardware - without firmware intervention. - Type: String - Configuration mode: runtime - -enable_roce [DEVICE, GENERIC] - Enable handling of RoCE traffic in the device. - Defaultly enabled. - Configuration mode: driverinit diff --git a/Documentation/networking/devlink-params-mlxsw.txt b/Documentation/networking/devlink-params-mlxsw.txt deleted file mode 100644 index c63ea9fc7009..000000000000 --- a/Documentation/networking/devlink-params-mlxsw.txt +++ /dev/null @@ -1,10 +0,0 @@ -fw_load_policy [DEVICE, GENERIC] - Configuration mode: driverinit - -acl_region_rehash_interval [DEVICE, DRIVER-SPECIFIC] - Sets an interval for periodic ACL region rehashes. - The value is in milliseconds, minimal value is "3000". - Value "0" disables the periodic work. - The first rehash will be run right after value is set. - Type: u32 - Configuration mode: runtime diff --git a/Documentation/networking/devlink-params-mv88e6xxx.txt b/Documentation/networking/devlink-params-mv88e6xxx.txt deleted file mode 100644 index 21c4b3556ef2..000000000000 --- a/Documentation/networking/devlink-params-mv88e6xxx.txt +++ /dev/null @@ -1,7 +0,0 @@ -ATU_hash [DEVICE, DRIVER-SPECIFIC] - Select one of four possible hashing algorithms for - MAC addresses in the Address Translation Unit. - A value of 3 seems to work better than the default of - 1 when many MAC addresses have the same OUI. - Configuration mode: runtime - Type: u8. 0-3 valid. diff --git a/Documentation/networking/devlink-params-nfp.txt b/Documentation/networking/devlink-params-nfp.txt deleted file mode 100644 index 43e4d4034865..000000000000 --- a/Documentation/networking/devlink-params-nfp.txt +++ /dev/null @@ -1,5 +0,0 @@ -fw_load_policy [DEVICE, GENERIC] - Configuration mode: permanent - -reset_dev_on_drv_probe [DEVICE, GENERIC] - Configuration mode: permanent diff --git a/Documentation/networking/devlink-params-ti-cpsw-switch.txt b/Documentation/networking/devlink-params-ti-cpsw-switch.txt deleted file mode 100644 index 4037458499f7..000000000000 --- a/Documentation/networking/devlink-params-ti-cpsw-switch.txt +++ /dev/null @@ -1,10 +0,0 @@ -ale_bypass [DEVICE, DRIVER-SPECIFIC] - Allows to enable ALE_CONTROL(4).BYPASS mode for debug purposes. - All packets will be sent to the Host port only if enabled. - Type: bool - Configuration mode: runtime - -switch_mode [DEVICE, DRIVER-SPECIFIC] - Enable switch mode - Type: bool - Configuration mode: runtime diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt deleted file mode 100644 index 04e234e9acc9..000000000000 --- a/Documentation/networking/devlink-params.txt +++ /dev/null @@ -1,71 +0,0 @@ -Devlink configuration parameters -================================ -Following is the list of configuration parameters via devlink interface. -Each parameter can be generic or driver specific and are device level -parameters. - -Note that the driver-specific files should contain the generic params -they support to, with supported config modes. - -Each parameter can be set in different configuration modes: - runtime - set while driver is running, no reset required. - driverinit - applied while driver initializes, requires restart - driver by devlink reload command. - permanent - written to device's non-volatile memory, hard reset - required. - -Following is the list of parameters: -==================================== -enable_sriov [DEVICE, GENERIC] - Enable Single Root I/O Virtualisation (SRIOV) in - the device. - Type: Boolean - -ignore_ari [DEVICE, GENERIC] - Ignore Alternative Routing-ID Interpretation (ARI) - capability. If enabled, adapter will ignore ARI - capability even when platforms has the support - enabled and creates same number of partitions when - platform does not support ARI. - Type: Boolean - -msix_vec_per_pf_max [DEVICE, GENERIC] - Provides the maximum number of MSIX interrupts that - a device can create. Value is same across all - physical functions (PFs) in the device. - Type: u32 - -msix_vec_per_pf_min [DEVICE, GENERIC] - Provides the minimum number of MSIX interrupts required - for the device initialization. Value is same across all - physical functions (PFs) in the device. - Type: u32 - -fw_load_policy [DEVICE, GENERIC] - Controls the device's firmware loading policy. - Valid values: - * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER (0) - Load firmware version preferred by the driver. - * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH (1) - Load firmware currently stored in flash. - * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK (2) - Load firmware currently available on host's disk. - Type: u8 - -reset_dev_on_drv_probe [DEVICE, GENERIC] - Controls the device's reset policy on driver probe. - Valid values: - * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN (0) - Unknown or invalid value. - * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS (1) - Always reset device on driver probe. - * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER (2) - Never reset device on driver probe. - * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK (3) - Reset only if device firmware can be found in the - filesystem. - Type: u8 - -enable_roce [DEVICE, GENERIC] - Enable handling of RoCE traffic in the device. - Type: Boolean diff --git a/Documentation/networking/devlink-trap-netdevsim.rst b/Documentation/networking/devlink-trap-netdevsim.rst deleted file mode 100644 index b721c9415473..000000000000 --- a/Documentation/networking/devlink-trap-netdevsim.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -Devlink Trap netdevsim -====================== - -Driver-specific Traps -===================== - -.. list-table:: List of Driver-specific Traps Registered by ``netdevsim`` - :widths: 5 5 90 - - * - Name - - Type - - Description - * - ``fid_miss`` - - ``exception`` - - When a packet enters the device it is classified to a filtering - indentifier (FID) based on the ingress port and VLAN. This trap is used - to trap packets for which a FID could not be found diff --git a/Documentation/networking/devlink/bnxt.rst b/Documentation/networking/devlink/bnxt.rst new file mode 100644 index 000000000000..79e746d22a53 --- /dev/null +++ b/Documentation/networking/devlink/bnxt.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +bnxt devlink support +==================== + +This document describes the devlink features implemented by the ``bnxt`` +device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``enable_sriov`` + - Permanent + * - ``ignore_ari`` + - Permanent + * - ``msix_vec_per_pf_max`` + - Permanent + * - ``msix_vec_per_pf_min`` + - Permanent + +The ``bnxt`` driver also implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``gre_ver_check`` + - Boolean + - Permanent + - Generic Routing Encapsulation (GRE) version check will be enabled in + the device. If disabled, the device will skip the version check for + incoming packets. diff --git a/Documentation/networking/devlink/devlink-dpipe.rst b/Documentation/networking/devlink/devlink-dpipe.rst new file mode 100644 index 000000000000..468fe1001b74 --- /dev/null +++ b/Documentation/networking/devlink/devlink-dpipe.rst @@ -0,0 +1,252 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +Devlink DPIPE +============= + +Background +========== + +While performing the hardware offloading process, much of the hardware +specifics cannot be presented. These details are useful for debugging, and +``devlink-dpipe`` provides a standardized way to provide visibility into the +offloading process. + +For example, the routing longest prefix match (LPM) algorithm used by the +Linux kernel may differ from the hardware implementation. The pipeline debug +API (DPIPE) is aimed at providing the user visibility into the ASIC's +pipeline in a generic way. + +The hardware offload process is expected to be done in a way that the user +should not be able to distinguish between the hardware vs. software +implementation. In this process, hardware specifics are neglected. In +reality those details can have lots of meaning and should be exposed in some +standard way. + +This problem is made even more complex when one wishes to offload the +control path of the whole networking stack to a switch ASIC. Due to +differences in the hardware and software models some processes cannot be +represented correctly. + +One example is the kernel's LPM algorithm which in many cases differs +greatly to the hardware implementation. The configuration API is the same, +but one cannot rely on the Forward Information Base (FIB) to look like the +Level Path Compression trie (LPC-trie) in hardware. + +In many situations trying to analyze systems failure solely based on the +kernel's dump may not be enough. By combining this data with complementary +information about the underlying hardware, this debugging can be made +easier; additionally, the information can be useful when debugging +performance issues. + +Overview +======== + +The ``devlink-dpipe`` interface closes this gap. The hardware's pipeline is +modeled as a graph of match/action tables. Each table represents a specific +hardware block. This model is not new, first being used by the P4 language. + +Traditionally it has been used as an alternative model for hardware +configuration, but the ``devlink-dpipe`` interface uses it for visibility +purposes as a standard complementary tool. The system's view from +``devlink-dpipe`` should change according to the changes done by the +standard configuration tools. + +For example, it’s quiet common to implement Access Control Lists (ACL) +using Ternary Content Addressable Memory (TCAM). The TCAM memory can be +divided into TCAM regions. Complex TC filters can have multiple rules with +different priorities and different lookup keys. On the other hand hardware +TCAM regions have a predefined lookup key. Offloading the TC filter rules +using TCAM engine can result in multiple TCAM regions being interconnected +in a chain (which may affect the data path latency). In response to a new TC +filter new tables should be created describing those regions. + +Model +===== + +The ``DPIPE`` model introduces several objects: + + * headers + * tables + * entries + +A ``header`` describes packet formats and provides names for fields within +the packet. A ``table`` describes hardware blocks. An ``entry`` describes +the actual content of a specific table. + +The hardware pipeline is not port specific, but rather describes the whole +ASIC. Thus it is tied to the top of the ``devlink`` infrastructure. + +Drivers can register and unregister tables at run time, in order to support +dynamic behavior. This dynamic behavior is mandatory for describing hardware +blocks like TCAM regions which can be allocated and freed dynamically. + +``devlink-dpipe`` generally is not intended for configuration. The exception +is hardware counting for a specific table. + +The following commands are used to obtain the ``dpipe`` objects from +userspace: + + * ``table_get``: Receive a table's description. + * ``headers_get``: Receive a device's supported headers. + * ``entries_get``: Receive a table's current entries. + * ``counters_set``: Enable or disable counters on a table. + +Table +----- + +The driver should implement the following operations for each table: + + * ``matches_dump``: Dump the supported matches. + * ``actions_dump``: Dump the supported actions. + * ``entries_dump``: Dump the actual content of the table. + * ``counters_set_update``: Synchronize hardware with counters enabled or + disabled. + +Header/Field +------------ + +In a similar way to P4 headers and fields are used to describe a table's +behavior. There is a slight difference between the standard protocol headers +and specific ASIC metadata. The protocol headers should be declared in the +``devlink`` core API. On the other hand ASIC meta data is driver specific +and should be defined in the driver. Additionally, each driver-specific +devlink documentation file should document the driver-specific ``dpipe`` +headers it implements. The headers and fields are identified by enumeration. + +In order to provide further visibility some ASIC metadata fields could be +mapped to kernel objects. For example, internal router interface indexes can +be directly mapped to the net device ifindex. FIB table indexes used by +different Virtual Routing and Forwarding (VRF) tables can be mapped to +internal routing table indexes. + +Match +----- + +Matches are kept primitive and close to hardware operation. Match types like +LPM are not supported due to the fact that this is exactly a process we wish +to describe in full detail. Example of matches: + + * ``field_exact``: Exact match on a specific field. + * ``field_exact_mask``: Exact match on a specific field after masking. + * ``field_range``: Match on a specific range. + +The id's of the header and the field should be specified in order to +identify the specific field. Furthermore, the header index should be +specified in order to distinguish multiple headers of the same type in a +packet (tunneling). + +Action +------ + +Similar to match, the actions are kept primitive and close to hardware +operation. For example: + + * ``field_modify``: Modify the field value. + * ``field_inc``: Increment the field value. + * ``push_header``: Add a header. + * ``pop_header``: Remove a header. + +Entry +----- + +Entries of a specific table can be dumped on demand. Each eentry is +identified with an index and its properties are described by a list of +match/action values and specific counter. By dumping the tables content the +interactions between tables can be resolved. + +Abstraction Example +=================== + +The following is an example of the abstraction model of the L3 part of +Mellanox Spectrum ASIC. The blocks are described in the order they appear in +the pipeline. The table sizes in the following examples are not real +hardware sizes and are provided for demonstration purposes. + +LPM +--- + +The LPM algorithm can be implemented as a list of hash tables. Each hash +table contains routes with the same prefix length. The root of the list is +/32, and in case of a miss the hardware will continue to the next hash +table. The depth of the search will affect the data path latency. + +In case of a hit the entry contains information about the next stage of the +pipeline which resolves the MAC address. The next stage can be either local +host table for directly connected routes, or adjacency table for next-hops. +The ``meta.lpm_prefix`` field is used to connect two LPM tables. + +.. code:: + + table lpm_prefix_16 { + size: 4096, + counters_enabled: true, + match: { meta.vr_id: exact, + ipv4.dst_addr: exact_mask, + ipv6.dst_addr: exact_mask, + meta.lpm_prefix: exact }, + action: { meta.adj_index: set, + meta.adj_group_size: set, + meta.rif_port: set, + meta.lpm_prefix: set }, + } + +Local Host +---------- + +In the case of local routes the LPM lookup already resolves the egress +router interface (RIF), yet the exact MAC address is not known. The local +host table is a hash table combining the output interface id with +destination IP address as a key. The result is the MAC address. + +.. code:: + + table local_host { + size: 4096, + counters_enabled: true, + match: { meta.rif_port: exact, + ipv4.dst_addr: exact}, + action: { ethernet.daddr: set } + } + +Adjacency +--------- + +In case of remote routes this table does the ECMP. The LPM lookup results in +ECMP group size and index that serves as a global offset into this table. +Concurrently a hash of the packet is generated. Based on the ECMP group size +and the packet's hash a local offset is generated. Multiple LPM entries can +point to the same adjacency group. + +.. code:: + + table adjacency { + size: 4096, + counters_enabled: true, + match: { meta.adj_index: exact, + meta.adj_group_size: exact, + meta.packet_hash_index: exact }, + action: { ethernet.daddr: set, + meta.erif: set } + } + +ERIF +---- + +In case the egress RIF and destination MAC have been resolved by previous +tables this table does multiple operations like TTL decrease and MTU check. +Then the decision of forward/drop is taken and the port L3 statistics are +updated based on the packet's type (broadcast, unicast, multicast). + +.. code:: + + table erif { + size: 800, + counters_enabled: true, + match: { meta.rif_port: exact, + meta.is_l3_unicast: exact, + meta.is_l3_broadcast: exact, + meta.is_l3_multicast, exact }, + action: { meta.l3_drop: set, + meta.l3_forward: set } + } diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst new file mode 100644 index 000000000000..0c99b11f05f9 --- /dev/null +++ b/Documentation/networking/devlink/devlink-health.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +Devlink Health +============== + +Background +========== + +The ``devlink`` health mechanism is targeted for Real Time Alerting, in +order to know when something bad happened to a PCI device. + + * Provide alert debug information. + * Self healing. + * If problem needs vendor support, provide a way to gather all needed + debugging information. + +Overview +======== + +The main idea is to unify and centralize driver health reports in the +generic ``devlink`` instance and allow the user to set different +attributes of the health reporting and recovery procedures. + +The ``devlink`` health reporter: +Device driver creates a "health reporter" per each error/health type. +Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error) +or unknown (driver specific). +For each registered health reporter a driver can issue error/health reports +asynchronously. All health reports handling is done by ``devlink``. +Device driver can provide specific callbacks for each "health reporter", e.g.: + + * Recovery procedures + * Diagnostics procedures + * Object dump procedures + * OOB initial parameters + +Different parts of the driver can register different types of health reporters +with different handlers. + +Actions +======= + +Once an error is reported, devlink health will perform the following actions: + + * A log is being send to the kernel trace events buffer + * Health status and statistics are being updated for the reporter instance + * Object dump is being taken and saved at the reporter instance (as long as + there is no other dump which is already stored) + * Auto recovery attempt is being done. Depends on: + - Auto-recovery configuration + - Grace period vs. time passed since last recover + +User Interface +============== + +User can access/change each reporter's parameters and driver specific callbacks +via ``devlink``, e.g per error type (per health reporter): + + * Configure reporter's generic parameters (like: disable/enable auto recovery) + * Invoke recovery procedure + * Run diagnostics + * Object dump + +.. list-table:: List of devlink health interfaces + :widths: 10 90 + + * - Name + - Description + * - ``DEVLINK_CMD_HEALTH_REPORTER_GET`` + - Retrieves status and configuration info per DEV and reporter. + * - ``DEVLINK_CMD_HEALTH_REPORTER_SET`` + - Allows reporter-related configuration setting. + * - ``DEVLINK_CMD_HEALTH_REPORTER_RECOVER`` + - Triggers a reporter's recovery procedure. + * - ``DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE`` + - Retrieves diagnostics data from a reporter on a device. + * - ``DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET`` + - Retrieves the last stored dump. Devlink health + saves a single dump. If an dump is not already stored by the devlink + for this reporter, devlink generates a new dump. + dump output is defined by the reporter. + * - ``DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR`` + - Clears the last saved dump file for the specified reporter. + +The following diagram provides a general overview of ``devlink-health``:: + + netlink + +--------------------------+ + | | + | + | + | | | + +--------------------------+ + |request for ops + |(diagnose, + mlx5_core devlink |recover, + |dump) + +--------+ +--------------------------+ + | | | reporter| | + | | | +---------v----------+ | + | | ops execution | | | | + | <----------------------------------+ | | + | | | | | | + | | | + ^------------------+ | + | | | | request for ops | + | | | | (recover, dump) | + | | | | | + | | | +-+------------------+ | + | | health report | | health handler | | + | +-------------------------------> | | + | | | +--------------------+ | + | | health reporter create | | + | +----------------------------> | + +--------+ +--------------------------+ diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst new file mode 100644 index 000000000000..0385f15028b1 --- /dev/null +++ b/Documentation/networking/devlink/devlink-info.rst @@ -0,0 +1,94 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +============ +Devlink Info +============ + +The ``devlink-info`` mechanism enables device drivers to report device +information in a generic fashion. It is extensible, and enables exporting +even device or driver specific information. + +devlink supports representing the following types of versions + +.. list-table:: List of version types + :widths: 5 95 + + * - Type + - Description + * - ``fixed`` + - Represents fixed versions, which cannot change. For example, + component identifiers or the board version reported in the PCI VPD. + * - ``running`` + - Represents the version of the currently running component. For + example the running version of firmware. These versions generally + only update after a reboot. + * - ``stored`` + - Represents the version of a component as stored, such as after a + flash update. Stored values should update to reflect changes in the + flash even if a reboot has not yet occurred. + +Generic Versions +================ + +It is expected that drivers use the following generic names for exporting +version information. Other information may be exposed using driver-specific +names, but these should be documented in the driver-specific file. + +board.id +-------- + +Unique identifier of the board design. + +board.rev +--------- + +Board design revision. + +asic.id +------- + +ASIC design identifier. + +asic.rev +-------- + +ASIC design revision. + +board.manufacture +----------------- + +An identifier of the company or the facility which produced the part. + +fw +-- + +Overall firmware version, often representing the collection of +fw.mgmt, fw.app, etc. + +fw.mgmt +------- + +Control unit firmware version. This firmware is responsible for house +keeping tasks, PHY control etc. but not the packet-by-packet data path +operation. + +fw.app +------ + +Data path microcode controlling high-speed packet processing. + +fw.undi +------- + +UNDI software, may include the UEFI driver, firmware or both. + +fw.ncsi +------- + +Version of the software responsible for supporting/handling the +Network Controller Sideband Interface. + +fw.psid +------- + +Unique identifier of the firmware parameter set. diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst new file mode 100644 index 000000000000..da2f85c0fa21 --- /dev/null +++ b/Documentation/networking/devlink/devlink-params.rst @@ -0,0 +1,108 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +Devlink Params +============== + +``devlink`` provides capability for a driver to expose device parameters for low +level device functionality. Since devlink can operate at the device-wide +level, it can be used to provide configuration that may affect multiple +ports on a single device. + +This document describes a number of generic parameters that are supported +across multiple drivers. Each driver is also free to add their own +parameters. Each driver must document the specific parameters they support, +whether generic or not. + +Configuration modes +=================== + +Parameters may be set in different configuration modes. + +.. list-table:: Possible configuration modes + :widths: 5 90 + + * - Name + - Description + * - ``runtime`` + - set while the driver is running, and takes effect immediately. No + reset is required. + * - ``driverinit`` + - applied while the driver initializes. Requires the user to restart + the driver using the ``devlink`` reload command. + * - ``permanent`` + - written to the device's non-volatile memory. A hard reset is required + for it to take effect. + +Reloading +--------- + +In order for ``driverinit`` parameters to take effect, the driver must +support reloading via the ``devlink-reload`` command. This command will +request a reload of the device driver. + +Generic configuration parameters +================================ +The following is a list of generic configuration parameters that drivers may +add. Use of generic parameters is preferred over each driver creating their +own name. + +.. list-table:: List of generic parameters + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``enable_sriov`` + - Boolean + - Enable Single Root I/O Virtualization (SRIOV) in the device. + * - ``ignore_ari`` + - Boolean + - Ignore Alternative Routing-ID Interpretation (ARI) capability. If + enabled, the adapter will ignore ARI capability even when the + platform has support enabled. The device will create the same number + of partitions as when the platform does not support ARI. + * - ``msix_vec_per_pf_max`` + - u32 + - Provides the maximum number of MSI-X interrupts that a device can + create. Value is the same across all physical functions (PFs) in the + device. + * - ``msix_vec_per_pf_min`` + - u32 + - Provides the minimum number of MSI-X interrupts required for the + device to initialize. Value is the same across all physical functions + (PFs) in the device. + * - ``fw_load_policy`` + - u8 + - Control the device's firmware loading policy. + - ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER`` (0) + Load firmware version preferred by the driver. + - ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH`` (1) + Load firmware currently stored in flash. + - ``DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK`` (2) + Load firmware currently available on host's disk. + * - ``reset_dev_on_drv_probe`` + - u8 + - Controls the device's reset policy on driver probe. + - ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN`` (0) + Unknown or invalid value. + - ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS`` (1) + Always reset device on driver probe. + - ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER`` (2) + Never reset device on driver probe. + - ``DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK`` (3) + Reset the device only if firmware can be found in the filesystem. + * - ``enable_roce`` + - Boolean + - Enable handling of RoCE traffic in the device. + * - ``internal_err_reset`` + - Boolean + - When enabled, the device driver will reset the device on internal + errors. + * - ``max_macs`` + - u32 + - Specifies the maximum number of MAC addresses per ethernet port of + this device. + * - ``region_snapshot_enable`` + - Boolean + - Enable capture of ``devlink-region`` snapshots. diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst new file mode 100644 index 000000000000..1a7683e7acb2 --- /dev/null +++ b/Documentation/networking/devlink/devlink-region.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +Devlink Region +============== + +``devlink`` regions enable access to driver defined address regions using +devlink. + +Each device can create and register its own supported address regions. The +region can then be accessed via the devlink region interface. + +Region snapshots are collected by the driver, and can be accessed via read +or dump commands. This allows future analysis on the created snapshots. +Regions may optionally support triggering snapshots on demand. + +The major benefit to creating a region is to provide access to internal +address regions that are otherwise inaccessible to the user. + +Regions may also be used to provide an additional way to debug complex error +states, but see also :doc:`devlink-health` + +example usage +------------- + +.. code:: shell + + $ devlink region help + $ devlink region show [ DEV/REGION ] + $ devlink region del DEV/REGION snapshot SNAPSHOT_ID + $ devlink region dump DEV/REGION [ snapshot SNAPSHOT_ID ] + $ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] + address ADDRESS length length + + # Show all of the exposed regions with region sizes: + $ devlink region show + pci/0000:00:05.0/cr-space: size 1048576 snapshot [1 2] + pci/0000:00:05.0/fw-health: size 64 snapshot [1 2] + + # Delete a snapshot using: + $ devlink region del pci/0000:00:05.0/cr-space snapshot 1 + + # Trigger (request) a snapshot be taken: + $ devlink region trigger pci/0000:00:05.0/cr-space + + # Dump a snapshot: + $ devlink region dump pci/0000:00:05.0/fw-health snapshot 1 + 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + 0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8 + 0000000000000020 0016 0bb8 0016 1720 0000 0000 c00f 3ffc + 0000000000000030 bada cce5 bada cce5 bada cce5 bada cce5 + + # Read a specific part of a snapshot: + $ devlink region read pci/0000:00:05.0/fw-health snapshot 1 address 0 + length 16 + 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + +As regions are likely very device or driver specific, no generic regions are +defined. See the driver-specific documentation files for information on the +specific regions a driver supports. diff --git a/Documentation/networking/devlink/devlink-resource.rst b/Documentation/networking/devlink/devlink-resource.rst new file mode 100644 index 000000000000..93e92d2f0752 --- /dev/null +++ b/Documentation/networking/devlink/devlink-resource.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +Devlink Resource +================ + +``devlink`` provides the ability for drivers to register resources, which +can allow administrators to see the device restrictions for a given +resource, as well as how much of the given resource is currently +in use. Additionally, these resources can optionally have configurable size. +This could enable the administrator to limit the number of resources that +are used. + +For example, the ``netdevsim`` driver enables ``/IPv4/fib`` and +``/IPv4/fib-rules`` as resources to limit the number of IPv4 FIB entries and +rules for a given device. + +Resource Ids +============ + +Each resource is represented by an id, and contains information about its +current size and related sub resources. To access a sub resource, you +specify the path of the resource. For example ``/IPv4/fib`` is the id for +the ``fib`` sub-resource under the ``IPv4`` resource. + +example usage +------------- + +The resources exposed by the driver can be observed, for example: + +.. code:: shell + + $devlink resource show pci/0000:03:00.0 + pci/0000:03:00.0: + name kvd size 245760 unit entry + resources: + name linear size 98304 occ 0 unit entry size_min 0 size_max 147456 size_gran 128 + name hash_double size 60416 unit entry size_min 32768 size_max 180224 size_gran 128 + name hash_single size 87040 unit entry size_min 65536 size_max 212992 size_gran 128 + +Some resource's size can be changed. Examples: + +.. code:: shell + + $devlink resource set pci/0000:03:00.0 path /kvd/hash_single size 73088 + $devlink resource set pci/0000:03:00.0 path /kvd/hash_double size 74368 + +The changes do not apply immediately, this can be validated by the 'size_new' +attribute, which represents the pending change in size. For example: + +.. code:: shell + + $devlink resource show pci/0000:03:00.0 + pci/0000:03:00.0: + name kvd size 245760 unit entry size_valid false + resources: + name linear size 98304 size_new 147456 occ 0 unit entry size_min 0 size_max 147456 size_gran 128 + name hash_double size 60416 unit entry size_min 32768 size_max 180224 size_gran 128 + name hash_single size 87040 unit entry size_min 65536 size_max 212992 size_gran 128 + +Note that changes in resource size may require a device reload to properly +take effect. diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 03311849bfb1..47a429bb8658 100644 --- a/Documentation/networking/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -223,6 +223,21 @@ be added to the following table: * - ``ipv6_lpm_miss`` - ``exception`` - Traps unicast IPv6 packets that did not match any route + * - ``non_routable_packet`` + - ``drop`` + - Traps packets that the device decided to drop because they are not + supposed to be routed. For example, IGMP queries can be flooded by the + device in layer 2 and reach the router. Such packets should not be + routed and instead dropped + * - ``decap_error`` + - ``exception`` + - Traps NVE and IPinIP packets that the device decided to drop because of + failure during decapsulation (e.g., packet being too short, reserved + bits set in VXLAN header) + * - ``overlay_smac_is_mc`` + - ``drop`` + - Traps NVE packets that the device decided to drop because their overlay + source MAC is multicast Driver-specific Packet Traps ============================ @@ -233,7 +248,8 @@ help debug packet drops caused by these exceptions. The following list includes links to the description of driver-specific traps registered by various device drivers: - * :doc:`devlink-trap-netdevsim` + * :doc:`netdevsim` + * :doc:`mlxsw` Generic Packet Trap Groups ========================== @@ -258,6 +274,9 @@ narrow. The description of these groups must be added to the following table: * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision + * - ``tunnel_drops`` + - Contains packet traps for packets that were dropped by the device during + tunnel encapsulation / decapsulation Testing ======= diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst new file mode 100644 index 000000000000..087ff54d53fc --- /dev/null +++ b/Documentation/networking/devlink/index.rst @@ -0,0 +1,42 @@ +Linux Devlink Documentation +=========================== + +devlink is an API to expose device information and resources not directly +related to any device class, such as chip-wide/switch-ASIC-wide configuration. + +Interface documentation +----------------------- + +The following pages describe various interfaces available through devlink in +general. + +.. toctree:: + :maxdepth: 1 + + devlink-dpipe + devlink-health + devlink-info + devlink-params + devlink-region + devlink-resource + devlink-trap + +Driver-specific documentation +----------------------------- + +Each driver that implements ``devlink`` is expected to document what +parameters, info versions, and other features it supports. + +.. toctree:: + :maxdepth: 1 + + bnxt + ionic + mlx4 + mlx5 + mlxsw + mv88e6xxx + netdevsim + nfp + qed + ti-cpsw-switch diff --git a/Documentation/networking/devlink/ionic.rst b/Documentation/networking/devlink/ionic.rst new file mode 100644 index 000000000000..48da9c92d584 --- /dev/null +++ b/Documentation/networking/devlink/ionic.rst @@ -0,0 +1,29 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +ionic devlink support +===================== + +This document describes the devlink features implemented by the ``ionic`` +device driver. + +Info versions +============= + +The ``ionic`` driver reports the following versions + +.. list-table:: devlink info versions implemented + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``fw`` + - running + - Version of firmware running on the device + * - ``asic.id`` + - fixed + - The ASIC type for this device + * - ``asic.rev`` + - fixed + - The revision of the ASIC for this device diff --git a/Documentation/networking/devlink/mlx4.rst b/Documentation/networking/devlink/mlx4.rst new file mode 100644 index 000000000000..7b2d17ea5471 --- /dev/null +++ b/Documentation/networking/devlink/mlx4.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +mlx4 devlink support +==================== + +This document describes the devlink features implemented by the ``mlx4`` +device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``internal_err_reset`` + - driverinit, runtime + * - ``max_macs`` + - driverinit + * - ``region_snapshot_enable`` + - driverinit, runtime + +The ``mlx4`` driver also implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``enable_64b_cqe_eqe`` + - Boolean + - driverinit + - Enable 64 byte CQEs/EQEs, if the FW supports it. + * - ``enable_4k_uar`` + - Boolean + - driverinit + - Enable using the 4k UAR. + +The ``mlx4`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` + +Regions +======= + +The ``mlx4`` driver supports dumping the firmware PCI crspace and health +buffer during a critical firmware issue. + +In case a firmware command times out, firmware getting stuck, or a non zero +value on the catastrophic buffer, a snapshot will be taken by the driver. + +The ``cr-space`` region will contain the firmware PCI crspace contents. The +``fw-health`` region will contain the device firmware's health buffer. +Snapshots for both of these regions are taken on the same event triggers. diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst new file mode 100644 index 000000000000..629a6e69c036 --- /dev/null +++ b/Documentation/networking/devlink/mlx5.rst @@ -0,0 +1,59 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +mlx5 devlink support +==================== + +This document describes the devlink features implemented by the ``mlx5`` +device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``enable_roce`` + - driverinit + +The ``mlx5`` driver also implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``flow_steering_mode`` + - string + - runtime + - Controls the flow steering mode of the driver + + * ``dmfs`` Device managed flow steering. In DMFS mode, the HW + steering entities are created and managed through firmware. + * ``smfs`` Software managed flow steering. In SMFS mode, the HW + steering entities are created and manage through the driver without + firmware intervention. + +The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` + +Info versions +============= + +The ``mlx5`` driver reports the following versions + +.. list-table:: devlink info versions implemented + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``fw.psid`` + - fixed + - Used to represent the board id of the device. + * - ``fw.version`` + - stored, running + - Three digit major.minor.subminor firmware version number. diff --git a/Documentation/networking/devlink/mlxsw.rst b/Documentation/networking/devlink/mlxsw.rst new file mode 100644 index 000000000000..cf857cb4ba8f --- /dev/null +++ b/Documentation/networking/devlink/mlxsw.rst @@ -0,0 +1,81 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +mlxsw devlink support +===================== + +This document describes the devlink features implemented by the ``mlxsw`` +device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``fw_load_policy`` + - driverinit + +The ``mlxsw`` driver also implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``acl_region_rehash_interval`` + - u32 + - runtime + - Sets an interval for periodic ACL region rehashes. The value is + specified in milliseconds, with a minimum of ``3000``. The value of + ``0`` disables periodic work entirely. The first rehash will be run + immediately after the value is set. + +The ``mlxsw`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` + +Info versions +============= + +The ``mlxsw`` driver reports the following versions + +.. list-table:: devlink info versions implemented + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``hw.revision`` + - fixed + - The hardware revision for this board + * - ``fw.psid`` + - fixed + - Firmware PSID + * - ``fw.version`` + - running + - Three digit firmware version + +Driver-specific Traps +===================== + +.. list-table:: List of Driver-specific Traps Registered by ``mlxsw`` + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``irif_disabled`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed from a disabled router interface (RIF). This can happen during + RIF dismantle, when the RIF is first disabled before being removed + completely + * - ``erif_disabled`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed through a disabled router interface (RIF). This can happen during + RIF dismantle, when the RIF is first disabled before being removed + completely diff --git a/Documentation/networking/devlink/mv88e6xxx.rst b/Documentation/networking/devlink/mv88e6xxx.rst new file mode 100644 index 000000000000..c621212a47a1 --- /dev/null +++ b/Documentation/networking/devlink/mv88e6xxx.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +mv88e6xxx devlink support +========================= + +This document describes the devlink features implemented by the ``mv88e6xxx`` +device driver. + +Parameters +========== + +The ``mv88e6xxx`` driver implements the following driver-specific parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``ATU_hash`` + - u8 + - runtime + - Select one of four possible hashing algorithms for MAC addresses in + the Address Translation Unit. A value of 3 may work better than the + default of 1 when many MAC addresses have the same OUI. Only the + values 0 to 3 are valid for this parameter. diff --git a/Documentation/networking/devlink/netdevsim.rst b/Documentation/networking/devlink/netdevsim.rst new file mode 100644 index 000000000000..2a266b7e7b38 --- /dev/null +++ b/Documentation/networking/devlink/netdevsim.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +netdevsim devlink support +========================= + +This document describes the ``devlink`` features supported by the +``netdevsim`` device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``max_macs`` + - driverinit + +The ``netdevsim`` driver also implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``test1`` + - Boolean + - driverinit + - Test parameter used to show how a driver-specific devlink parameter + can be implemented. + +The ``netdevsim`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` + +Regions +======= + +The ``netdevsim`` driver exposes a ``dummy`` region as an example of how the +devlink-region interfaces work. A snapshot is taken whenever the +``take_snapshot`` debugfs file is written to. + +Resources +========= + +The ``netdevsim`` driver exposes resources to control the number of FIB +entries and FIB rule entries that the driver will allow. + +.. code:: shell + + $ devlink resource set netdevsim/netdevsim0 path /IPv4/fib size 96 + $ devlink resource set netdevsim/netdevsim0 path /IPv4/fib-rules size 16 + $ devlink resource set netdevsim/netdevsim0 path /IPv6/fib size 64 + $ devlink resource set netdevsim/netdevsim0 path /IPv6/fib-rules size 16 + $ devlink dev reload netdevsim/netdevsim0 + +Driver-specific Traps +===================== + +.. list-table:: List of Driver-specific Traps Registered by ``netdevsim`` + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``fid_miss`` + - ``exception`` + - When a packet enters the device it is classified to a filtering + indentifier (FID) based on the ingress port and VLAN. This trap is used + to trap packets for which a FID could not be found diff --git a/Documentation/networking/devlink/nfp.rst b/Documentation/networking/devlink/nfp.rst new file mode 100644 index 000000000000..a1717db0dfcc --- /dev/null +++ b/Documentation/networking/devlink/nfp.rst @@ -0,0 +1,65 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +nfp devlink support +=================== + +This document describes the devlink features implemented by the ``nfp`` +device driver. + +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + * - ``fw_load_policy`` + - permanent + * - ``reset_dev_on_drv_probe`` + - permanent + +Info versions +============= + +The ``nfp`` driver reports the following versions + +.. list-table:: devlink info versions implemented + :widths: 5 5 90 + + * - Name + - Type + - Description + * - ``board.id`` + - fixed + - Part number identifying the board design + * - ``board.rev`` + - fixed + - Revision of the board design + * - ``board.manufacture`` + - fixed + - Vendor of the board design + * - ``board.model`` + - fixed + - Model name of the board design + * - ``fw.bundle_id`` + - stored, running + - Firmware bundle id + * - ``fw.mgmt`` + - stored, running + - Version of the management firmware + * - ``fw.cpld`` + - stored, running + - The CPLD firmware component version + * - ``fw.app`` + - stored, running + - The APP firmware component version + * - ``fw.undi`` + - stored, running + - The UNDI firmware component version + * - ``fw.ncsi`` + - stored, running + - The NSCI firmware component version + * - ``chip.init`` + - stored, running + - The CFGR firmware component version diff --git a/Documentation/networking/devlink/qed.rst b/Documentation/networking/devlink/qed.rst new file mode 100644 index 000000000000..805c6f63621a --- /dev/null +++ b/Documentation/networking/devlink/qed.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +qed devlink support +=================== + +This document describes the devlink features implemented by the ``qed`` core +device driver. + +Parameters +========== + +The ``qed`` driver implements the following driver-specific parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``iwarp_cmt`` + - Boolean + - runtime + - Enable iWARP functionality for 100g devices. Note that this impacts + L2 performance, and is therefore not enabled by default. diff --git a/Documentation/networking/devlink/ti-cpsw-switch.rst b/Documentation/networking/devlink/ti-cpsw-switch.rst new file mode 100644 index 000000000000..dc399e32abaa --- /dev/null +++ b/Documentation/networking/devlink/ti-cpsw-switch.rst @@ -0,0 +1,31 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +ti-cpsw-switch devlink support +============================== + +This document describes the devlink features implemented by the ``ti-cpsw-switch`` +device driver. + +Parameters +========== + +The ``ti-cpsw-switch`` driver implements the following driver-specific +parameters. + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``ale_bypass`` + - Boolean + - runtime + - Enables ALE_CONTROL(4).BYPASS mode for debugging purposes. In this + mode, all packets will be sent to the host port only. + * - ``switch_mode`` + - Boolean + - runtime + - Enable switch mode diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index bee73be7af93..d07d9855dcd3 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -13,9 +13,7 @@ Contents: can_ucan_protocol device_drivers/index dsa/index - devlink-info-versions - devlink-trap - devlink-trap-netdevsim + devlink/index ethtool-netlink ieee802154 j1939 diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 90b8f14bc41a..5f53faff4e25 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -607,7 +607,7 @@ tcp_synack_retries - INTEGER with the current initial RTO of 1second. With this the final timeout for a passive TCP connection will happen after 63seconds. -tcp_syncookies - BOOLEAN +tcp_syncookies - INTEGER Only valid when the kernel was compiled with CONFIG_SYN_COOKIES Send out syncookies when the syn backlog queue of a socket overflows. This is to prevent against the common 'SYN flood attack' diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 642fa963be3c..d5c9320901c3 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -34,8 +34,8 @@ the names, the ``net`` tree is for fixes to existing code already in the mainline tree from Linus, and ``net-next`` is where the new code goes for the future release. You can find the trees here: -- https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git -- https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git +- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git Q: How often do changes from these trees make it to the mainline Linus tree? ---------------------------------------------------------------------------- diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index e0a7c7af6525..1e4735cc0553 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -267,6 +267,24 @@ Some of the interface modes are described below: duplex, pause or other settings. This is dependent on the MAC and/or PHY behaviour. +``PHY_INTERFACE_MODE_10GBASER`` + This is the IEEE 802.3 Clause 49 defined 10GBASE-R protocol used with + various different mediums. Please refer to the IEEE standard for a + definition of this. + + Note: 10GBASE-R is just one protocol that can be used with XFI and SFI. + XFI and SFI permit multiple protocols over a single SERDES lane, and + also defines the electrical characteristics of the signals with a host + compliance board plugged into the host XFP/SFP connector. Therefore, + XFI and SFI are not PHY interface types in their own right. + +``PHY_INTERFACE_MODE_10GKR`` + This is the IEEE 802.3 Clause 49 defined 10GBASE-R with Clause 73 + autonegotiation. Please refer to the IEEE standard for further + information. + + Note: due to legacy usage, some 10GBASE-R usage incorrectly makes + use of this definition. Pause frames / flow control =========================== diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst index a5e00a159d21..d753a309f9d1 100644 --- a/Documentation/networking/sfp-phylink.rst +++ b/Documentation/networking/sfp-phylink.rst @@ -251,7 +251,8 @@ this documentation. phylink_mac_change(priv->phylink, link_is_up); where ``link_is_up`` is true if the link is currently up or false - otherwise. + otherwise. If a MAC is unable to provide these interrupts, then + it should set ``priv->phylink_config.pcs_poll = true;`` in step 9. 11. Verify that the driver does not call:: diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 799580acc8de..5d54946cfc75 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -255,7 +255,7 @@ an involved disclosed party. The current ambassadors list: Red Hat Josh Poimboeuf <jpoimboe@redhat.com> SUSE Jiri Kosina <jkosina@suse.cz> - Amazon + Amazon Peter Bowen <pzb@amzn.com> Google Kees Cook <keescook@chromium.org> ============= ======================================================== diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index 21aa7d5358e6..6399d92f0b21 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -60,6 +60,7 @@ lack of a better place. volatile-considered-harmful botching-up-ioctls clang-format + ../riscv/patch-acceptance .. only:: subproject and html diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst index 215fd3c1f2d5..fa33bffd8992 100644 --- a/Documentation/riscv/index.rst +++ b/Documentation/riscv/index.rst @@ -7,6 +7,7 @@ RISC-V architecture boot-image-header pmu + patch-acceptance .. only:: subproject and html diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst new file mode 100644 index 000000000000..dfe0ac5624fb --- /dev/null +++ b/Documentation/riscv/patch-acceptance.rst @@ -0,0 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +arch/riscv maintenance guidelines for developers +================================================ + +Overview +-------- +The RISC-V instruction set architecture is developed in the open: +in-progress drafts are available for all to review and to experiment +with implementations. New module or extension drafts can change +during the development process - sometimes in ways that are +incompatible with previous drafts. This flexibility can present a +challenge for RISC-V Linux maintenance. Linux maintainers disapprove +of churn, and the Linux development process prefers well-reviewed and +tested code over experimental code. We wish to extend these same +principles to the RISC-V-related code that will be accepted for +inclusion in the kernel. + +Submit Checklist Addendum +------------------------- +We'll only accept patches for new modules or extensions if the +specifications for those modules or extensions are listed as being +"Frozen" or "Ratified" by the RISC-V Foundation. (Developers may, of +course, maintain their own Linux kernel trees that contain code for +any draft extensions that they wish.) + +Additionally, the RISC-V specification allows implementors to create +their own custom extensions. These custom extensions aren't required +to go through any review or ratification process by the RISC-V +Foundation. To avoid the maintenance complexity and potential +performance impact of adding kernel code for implementor-specific +RISC-V extensions, we'll only to accept patches for extensions that +have been officially frozen or ratified by the RISC-V Foundation. +(Implementors, may, of course, maintain their own Linux kernel trees +containing code for any custom extensions that they wish.) diff --git a/MAINTAINERS b/MAINTAINERS index 34de47832883..1726b4eb49e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -720,7 +720,7 @@ F: Documentation/devicetree/bindings/i2c/i2c-altera.txt F: drivers/i2c/busses/i2c-altera.c ALTERA MAILBOX DRIVER -M: Ley Foon Tan <lftan@altera.com> +M: Ley Foon Tan <ley.foon.tan@intel.com> L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers) S: Maintained F: drivers/mailbox/mailbox-altera.c @@ -1407,7 +1407,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc.git ARM/ACTIONS SEMI ARCHITECTURE M: Andreas Färber <afaerber@suse.de> -R: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> +M: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained N: owl @@ -3150,7 +3150,7 @@ S: Maintained F: arch/mips/net/ BPF JIT for NFP NICs -M: Jakub Kicinski <jakub.kicinski@netronome.com> +M: Jakub Kicinski <kuba@kernel.org> L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Supported @@ -4848,6 +4848,7 @@ S: Supported F: net/core/devlink.c F: include/net/devlink.h F: include/uapi/linux/devlink.h +F: Documentation/networking/devlink DIALOG SEMICONDUCTOR DRIVERS M: Support Opensource <support.opensource@diasemi.com> @@ -6197,6 +6198,7 @@ ETHERNET PHY LIBRARY M: Andrew Lunn <andrew@lunn.ch> M: Florian Fainelli <f.fainelli@gmail.com> M: Heiner Kallweit <hkallweit1@gmail.com> +R: Russell King <linux@armlinux.org.uk> L: netdev@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-class-net-phydev @@ -8569,7 +8571,7 @@ S: Maintained F: drivers/platform/x86/intel-vbtn.c INTEL WIRELESS 3945ABG/BG, 4965AGN (iwlegacy) -M: Stanislaw Gruszka <sgruszka@redhat.com> +M: Stanislaw Gruszka <stf_xl@wp.pl> L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/intel/iwlegacy/ @@ -9889,7 +9891,7 @@ S: Maintained F: drivers/net/dsa/mv88e6xxx/ F: include/linux/platform_data/mv88e6xxx.h F: Documentation/devicetree/bindings/net/dsa/marvell.txt -F: Documentation/networking/devlink-params-mv88e6xxx.txt +F: Documentation/networking/devlink/mv88e6xxx.rst MARVELL ARMADA DRM SUPPORT M: Russell King <linux@armlinux.org.uk> @@ -11431,7 +11433,7 @@ F: include/uapi/linux/netrom.h F: net/netrom/ NETRONOME ETHERNET DRIVERS -M: Jakub Kicinski <jakub.kicinski@netronome.com> +M: Jakub Kicinski <kuba@kernel.org> L: oss-drivers@netronome.com S: Maintained F: drivers/net/ethernet/netronome/ @@ -11460,8 +11462,8 @@ M: "David S. Miller" <davem@davemloft.net> L: netdev@vger.kernel.org W: http://www.linuxfoundation.org/en/Net Q: http://patchwork.ozlabs.org/project/netdev/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git S: Odd Fixes F: Documentation/devicetree/bindings/net/ F: drivers/net/ @@ -11499,11 +11501,12 @@ F: drivers/net/dsa/ NETWORKING [GENERAL] M: "David S. Miller" <davem@davemloft.net> +M: Jakub Kicinski <kuba@kernel.org> L: netdev@vger.kernel.org W: http://www.linuxfoundation.org/en/Net Q: http://patchwork.ozlabs.org/project/netdev/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git B: mailto:netdev@vger.kernel.org S: Maintained F: net/ @@ -11548,7 +11551,7 @@ M: "David S. Miller" <davem@davemloft.net> M: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> M: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> L: netdev@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git S: Maintained F: net/ipv4/ F: net/ipv6/ @@ -11573,6 +11576,18 @@ F: net/ipv6/calipso.c F: net/netfilter/xt_CONNSECMARK.c F: net/netfilter/xt_SECMARK.c +NETWORKING [MPTCP] +M: Mat Martineau <mathew.j.martineau@linux.intel.com> +M: Matthieu Baerts <matthieu.baerts@tessares.net> +L: netdev@vger.kernel.org +L: mptcp@lists.01.org +W: https://github.com/multipath-tcp/mptcp_net-next/wiki +B: https://github.com/multipath-tcp/mptcp_net-next/issues +S: Maintained +F: include/net/mptcp.h +F: net/mptcp/ +F: tools/testing/selftests/net/mptcp/ + NETWORKING [TCP] M: Eric Dumazet <edumazet@google.com> L: netdev@vger.kernel.org @@ -11591,7 +11606,7 @@ M: Boris Pismenny <borisp@mellanox.com> M: Aviad Yehezkel <aviadye@mellanox.com> M: John Fastabend <john.fastabend@gmail.com> M: Daniel Borkmann <daniel@iogearbox.net> -M: Jakub Kicinski <jakub.kicinski@netronome.com> +M: Jakub Kicinski <kuba@kernel.org> L: netdev@vger.kernel.org S: Maintained F: net/tls/* @@ -11603,7 +11618,7 @@ L: linux-wireless@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-wireless/list/ NETDEVSIM -M: Jakub Kicinski <jakub.kicinski@netronome.com> +M: Jakub Kicinski <kuba@kernel.org> S: Maintained F: drivers/net/netdevsim/* @@ -11680,7 +11695,7 @@ F: Documentation/scsi/NinjaSCSI.txt F: drivers/scsi/nsp32* NIOS2 ARCHITECTURE -M: Ley Foon Tan <lftan@altera.com> +M: Ley Foon Tan <ley.foon.tan@intel.com> L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers) T: git git://git.kernel.org/pub/scm/linux/kernel/git/lftan/nios2.git S: Maintained @@ -12564,7 +12579,7 @@ F: Documentation/devicetree/bindings/pci/aardvark-pci.txt F: drivers/pci/controller/pci-aardvark.c PCI DRIVER FOR ALTERA PCIE IP -M: Ley Foon Tan <lftan@altera.com> +M: Ley Foon Tan <ley.foon.tan@intel.com> L: rfi@lists.rocketboards.org (moderated for non-subscribers) L: linux-pci@vger.kernel.org S: Supported @@ -12743,7 +12758,7 @@ S: Supported F: Documentation/PCI/pci-error-recovery.rst PCI MSI DRIVER FOR ALTERA MSI IP -M: Ley Foon Tan <lftan@altera.com> +M: Ley Foon Tan <ley.foon.tan@intel.com> L: rfi@lists.rocketboards.org (moderated for non-subscribers) L: linux-pci@vger.kernel.org S: Supported @@ -13686,7 +13701,6 @@ F: drivers/net/ethernet/qualcomm/emac/ QUALCOMM ETHQOS ETHERNET DRIVER M: Vinod Koul <vkoul@kernel.org> -M: Niklas Cassel <niklas.cassel@linaro.org> L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -13828,7 +13842,7 @@ S: Maintained F: arch/mips/ralink RALINK RT2X00 WIRELESS LAN DRIVER -M: Stanislaw Gruszka <sgruszka@redhat.com> +M: Stanislaw Gruszka <stf_xl@wp.pl> M: Helmut Schaa <helmut.schaa@googlemail.com> L: linux-wireless@vger.kernel.org S: Maintained @@ -14128,6 +14142,7 @@ M: Paul Walmsley <paul.walmsley@sifive.com> M: Palmer Dabbelt <palmer@dabbelt.com> M: Albert Ou <aou@eecs.berkeley.edu> L: linux-riscv@lists.infradead.org +P: Documentation/riscv/patch-acceptance.rst T: git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git S: Supported F: arch/riscv/ @@ -14555,8 +14570,6 @@ F: include/linux/platform_data/spi-s3c64xx.h SAMSUNG SXGBE DRIVERS M: Byungho An <bh74.an@samsung.com> -M: Girish K S <ks.giri@samsung.com> -M: Vipul Pandya <vipul.pandya@samsung.com> S: Supported L: netdev@vger.kernel.org F: drivers/net/ethernet/samsung/sxgbe/ @@ -15777,6 +15790,7 @@ M: Jose Abreu <joabreu@synopsys.com> L: netdev@vger.kernel.org W: http://www.stlinux.com S: Supported +F: Documentation/networking/device_drivers/stmicro/ F: drivers/net/ethernet/stmicro/stmmac/ SUN3/3X @@ -16610,7 +16624,7 @@ F: kernel/time/ntp.c F: tools/testing/selftests/timers/ TIPC NETWORK LAYER -M: Jon Maloy <jon.maloy@ericsson.com> +M: Jon Maloy <jmaloy@redhat.com> M: Ying Xue <ying.xue@windriver.com> L: netdev@vger.kernel.org (core kernel code) L: tipc-discussion@lists.sourceforge.net (user apps, general discussion) @@ -18060,7 +18074,7 @@ XDP (eXpress Data Path) M: Alexei Starovoitov <ast@kernel.org> M: Daniel Borkmann <daniel@iogearbox.net> M: David S. Miller <davem@davemloft.net> -M: Jakub Kicinski <jakub.kicinski@netronome.com> +M: Jakub Kicinski <kuba@kernel.org> M: Jesper Dangaard Brouer <hawk@kernel.org> M: John Fastabend <john.fastabend@gmail.com> L: netdev@vger.kernel.org @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc7 NAME = Kleptomaniac Octopus # *DOCUMENTATION* diff --git a/arch/arc/include/asm/entry-arcv2.h b/arch/arc/include/asm/entry-arcv2.h index 41b16f21beec..0b8b63d0bec1 100644 --- a/arch/arc/include/asm/entry-arcv2.h +++ b/arch/arc/include/asm/entry-arcv2.h @@ -162,7 +162,7 @@ #endif #ifdef CONFIG_ARC_HAS_ACCL_REGS - ST2 r58, r59, PT_sp + 12 + ST2 r58, r59, PT_r58 #endif .endm @@ -172,8 +172,8 @@ LD2 gp, fp, PT_r26 ; gp (r26), fp (r27) - ld r12, [sp, PT_sp + 4] - ld r30, [sp, PT_sp + 8] + ld r12, [sp, PT_r12] + ld r30, [sp, PT_r30] ; Restore SP (into AUX_USER_SP) only if returning to U mode ; - for K mode, it will be implicitly restored as stack is unwound @@ -190,7 +190,7 @@ #endif #ifdef CONFIG_ARC_HAS_ACCL_REGS - LD2 r58, r59, PT_sp + 12 + LD2 r58, r59, PT_r58 #endif .endm diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 9a74ce71a767..30ac40fed2c5 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -8,7 +8,6 @@ #define _ASM_ARC_HUGEPAGE_H #include <linux/types.h> -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> static inline pte_t pmd_pte(pmd_t pmd) diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c index 1f621e416521..c783bcd35eb8 100644 --- a/arch/arc/kernel/asm-offsets.c +++ b/arch/arc/kernel/asm-offsets.c @@ -66,7 +66,15 @@ int main(void) DEFINE(SZ_CALLEE_REGS, sizeof(struct callee_regs)); DEFINE(SZ_PT_REGS, sizeof(struct pt_regs)); - DEFINE(PT_user_r25, offsetof(struct pt_regs, user_r25)); + +#ifdef CONFIG_ISA_ARCV2 + OFFSET(PT_r12, pt_regs, r12); + OFFSET(PT_r30, pt_regs, r30); +#endif +#ifdef CONFIG_ARC_HAS_ACCL_REGS + OFFSET(PT_r58, pt_regs, r58); + OFFSET(PT_r59, pt_regs, r59); +#endif return 0; } diff --git a/arch/arc/plat-eznps/Kconfig b/arch/arc/plat-eznps/Kconfig index a376a50d3fea..a931d0a256d0 100644 --- a/arch/arc/plat-eznps/Kconfig +++ b/arch/arc/plat-eznps/Kconfig @@ -7,7 +7,7 @@ menuconfig ARC_PLAT_EZNPS bool "\"EZchip\" ARC dev platform" select CPU_BIG_ENDIAN - select CLKSRC_NPS + select CLKSRC_NPS if !PHYS_ADDR_T_64BIT select EZNPS_GIC select EZCHIP_NPS_MANAGEMENT_ENET if ETHERNET help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ba75e3661a41..96dab76da3b3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -72,6 +72,7 @@ config ARM select HAVE_ARM_SMCCC if CPU_V7 select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CONTEXT_TRACKING + select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU diff --git a/arch/arm/boot/dts/am335x-boneblack-common.dtsi b/arch/arm/boot/dts/am335x-boneblack-common.dtsi index 7ad079861efd..91f93bc89716 100644 --- a/arch/arm/boot/dts/am335x-boneblack-common.dtsi +++ b/arch/arm/boot/dts/am335x-boneblack-common.dtsi @@ -131,6 +131,11 @@ }; / { + memory@80000000 { + device_type = "memory"; + reg = <0x80000000 0x20000000>; /* 512 MB */ + }; + clk_mcasp0_fixed: clk_mcasp0_fixed { #clock-cells = <0>; compatible = "fixed-clock"; diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts index 078cb473fa7d..a6fbc088daa8 100644 --- a/arch/arm/boot/dts/am43x-epos-evm.dts +++ b/arch/arm/boot/dts/am43x-epos-evm.dts @@ -848,6 +848,7 @@ pinctrl-names = "default", "sleep"; pinctrl-0 = <&spi0_pins_default>; pinctrl-1 = <&spi0_pins_sleep>; + ti,pindir-d0-out-d1-in = <1>; }; &spi1 { @@ -855,6 +856,7 @@ pinctrl-names = "default", "sleep"; pinctrl-0 = <&spi1_pins_default>; pinctrl-1 = <&spi1_pins_sleep>; + ti,pindir-d0-out-d1-in = <1>; }; &usb2_phy1 { diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index 820ce3b60bb6..669559c9c95b 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -167,11 +167,7 @@ &pcie1_rc { status = "okay"; - gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>; -}; - -&pcie1_ep { - gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>; + gpios = <&gpio5 18 GPIO_ACTIVE_HIGH>; }; &mmc1 { diff --git a/arch/arm/boot/dts/am572x-idk-common.dtsi b/arch/arm/boot/dts/am572x-idk-common.dtsi index a064f13b3880..ddf123620e96 100644 --- a/arch/arm/boot/dts/am572x-idk-common.dtsi +++ b/arch/arm/boot/dts/am572x-idk-common.dtsi @@ -147,10 +147,6 @@ gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>; }; -&pcie1_ep { - gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>; -}; - &mailbox5 { status = "okay"; mbox_ipu1_ipc3x: mbox_ipu1_ipc3x { diff --git a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi index bc76f1705c0f..a813a0cf3ff3 100644 --- a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi +++ b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi @@ -29,6 +29,27 @@ reg = <0x0 0x80000000 0x0 0x80000000>; }; + main_12v0: fixedregulator-main_12v0 { + /* main supply */ + compatible = "regulator-fixed"; + regulator-name = "main_12v0"; + regulator-min-microvolt = <12000000>; + regulator-max-microvolt = <12000000>; + regulator-always-on; + regulator-boot-on; + }; + + evm_5v0: fixedregulator-evm_5v0 { + /* Output of TPS54531D */ + compatible = "regulator-fixed"; + regulator-name = "evm_5v0"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + vin-supply = <&main_12v0>; + regulator-always-on; + regulator-boot-on; + }; + vdd_3v3: fixedregulator-vdd_3v3 { compatible = "regulator-fixed"; regulator-name = "vdd_3v3"; @@ -547,10 +568,6 @@ gpios = <&gpio2 8 GPIO_ACTIVE_LOW>; }; -&pcie1_ep { - gpios = <&gpio2 8 GPIO_ACTIVE_LOW>; -}; - &mcasp3 { #sound-dai-cells = <0>; assigned-clocks = <&l4per2_clkctrl DRA7_L4PER2_MCASP3_CLKCTRL 24>; diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts index c1c9cd30f980..13f7aefe045e 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts @@ -258,9 +258,9 @@ }; }; - pca0: pca9552@60 { + pca0: pca9552@61 { compatible = "nxp,pca9552"; - reg = <0x60>; + reg = <0x61>; #address-cells = <1>; #size-cells = <0>; @@ -521,371 +521,6 @@ &i2c13 { status = "okay"; -}; - -&i2c14 { - status = "okay"; -}; - -&i2c15 { - status = "okay"; -}; - -&i2c0 { - status = "okay"; -}; - -&i2c1 { - status = "okay"; -}; - -&i2c2 { - status = "okay"; -}; - -&i2c3 { - status = "okay"; - - power-supply@68 { - compatible = "ibm,cffps2"; - reg = <0x68>; - }; - - power-supply@69 { - compatible = "ibm,cffps2"; - reg = <0x69>; - }; - - power-supply@6a { - compatible = "ibm,cffps2"; - reg = <0x6a>; - }; - - power-supply@6b { - compatible = "ibm,cffps2"; - reg = <0x6b>; - }; -}; - -&i2c4 { - status = "okay"; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - tmp275@49 { - compatible = "ti,tmp275"; - reg = <0x49>; - }; - - tmp275@4a { - compatible = "ti,tmp275"; - reg = <0x4a>; - }; -}; - -&i2c5 { - status = "okay"; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - tmp275@49 { - compatible = "ti,tmp275"; - reg = <0x49>; - }; -}; - -&i2c6 { - status = "okay"; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - tmp275@4a { - compatible = "ti,tmp275"; - reg = <0x4a>; - }; - - tmp275@4b { - compatible = "ti,tmp275"; - reg = <0x4b>; - }; -}; - -&i2c7 { - status = "okay"; - - si7021-a20@20 { - compatible = "silabs,si7020"; - reg = <0x20>; - }; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - max31785@52 { - compatible = "maxim,max31785a"; - reg = <0x52>; - #address-cells = <1>; - #size-cells = <0>; - - fan@0 { - compatible = "pmbus-fan"; - reg = <0>; - tach-pulses = <2>; - }; - - fan@1 { - compatible = "pmbus-fan"; - reg = <1>; - tach-pulses = <2>; - }; - - fan@2 { - compatible = "pmbus-fan"; - reg = <2>; - tach-pulses = <2>; - }; - - fan@3 { - compatible = "pmbus-fan"; - reg = <3>; - tach-pulses = <2>; - }; - }; - - pca0: pca9552@60 { - compatible = "nxp,pca9552"; - reg = <0x60>; - #address-cells = <1>; - #size-cells = <0>; - - gpio-controller; - #gpio-cells = <2>; - - gpio@0 { - reg = <0>; - }; - - gpio@1 { - reg = <1>; - }; - - gpio@2 { - reg = <2>; - }; - - gpio@3 { - reg = <3>; - }; - - gpio@4 { - reg = <4>; - }; - - gpio@5 { - reg = <5>; - }; - - gpio@6 { - reg = <6>; - }; - - gpio@7 { - reg = <7>; - }; - - gpio@8 { - reg = <8>; - }; - - gpio@9 { - reg = <9>; - }; - - gpio@10 { - reg = <10>; - }; - - gpio@11 { - reg = <11>; - }; - - gpio@12 { - reg = <12>; - }; - - gpio@13 { - reg = <13>; - }; - - gpio@14 { - reg = <14>; - }; - - gpio@15 { - reg = <15>; - }; - }; - - dps: dps310@76 { - compatible = "infineon,dps310"; - reg = <0x76>; - #io-channel-cells = <0>; - }; -}; - -&i2c8 { - status = "okay"; - - ucd90320@b { - compatible = "ti,ucd90160"; - reg = <0x0b>; - }; - - ucd90320@c { - compatible = "ti,ucd90160"; - reg = <0x0c>; - }; - - ucd90320@11 { - compatible = "ti,ucd90160"; - reg = <0x11>; - }; - - rtc@32 { - compatible = "epson,rx8900"; - reg = <0x32>; - }; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - tmp275@4a { - compatible = "ti,tmp275"; - reg = <0x4a>; - }; -}; - -&i2c9 { - status = "okay"; - - ir35221@42 { - compatible = "infineon,ir35221"; - reg = <0x42>; - }; - - ir35221@43 { - compatible = "infineon,ir35221"; - reg = <0x43>; - }; - - ir35221@44 { - compatible = "infineon,ir35221"; - reg = <0x44>; - }; - - tmp423a@4c { - compatible = "ti,tmp423"; - reg = <0x4c>; - }; - - tmp423b@4d { - compatible = "ti,tmp423"; - reg = <0x4d>; - }; - - ir35221@72 { - compatible = "infineon,ir35221"; - reg = <0x72>; - }; - - ir35221@73 { - compatible = "infineon,ir35221"; - reg = <0x73>; - }; - - ir35221@74 { - compatible = "infineon,ir35221"; - reg = <0x74>; - }; -}; - -&i2c10 { - status = "okay"; - - ir35221@42 { - compatible = "infineon,ir35221"; - reg = <0x42>; - }; - - ir35221@43 { - compatible = "infineon,ir35221"; - reg = <0x43>; - }; - - ir35221@44 { - compatible = "infineon,ir35221"; - reg = <0x44>; - }; - - tmp423a@4c { - compatible = "ti,tmp423"; - reg = <0x4c>; - }; - - tmp423b@4d { - compatible = "ti,tmp423"; - reg = <0x4d>; - }; - - ir35221@72 { - compatible = "infineon,ir35221"; - reg = <0x72>; - }; - - ir35221@73 { - compatible = "infineon,ir35221"; - reg = <0x73>; - }; - - ir35221@74 { - compatible = "infineon,ir35221"; - reg = <0x74>; - }; -}; - -&i2c11 { - status = "okay"; - - tmp275@48 { - compatible = "ti,tmp275"; - reg = <0x48>; - }; - - tmp275@49 { - compatible = "ti,tmp275"; - reg = <0x49>; - }; -}; - -&i2c12 { - status = "okay"; -}; - -&i2c13 { - status = "okay"; eeprom@50 { compatible = "atmel,24c64"; diff --git a/arch/arm/boot/dts/aspeed-bmc-opp-tacoma.dts b/arch/arm/boot/dts/aspeed-bmc-opp-tacoma.dts index f02de4ab058c..ff49ec76fa7c 100644 --- a/arch/arm/boot/dts/aspeed-bmc-opp-tacoma.dts +++ b/arch/arm/boot/dts/aspeed-bmc-opp-tacoma.dts @@ -122,37 +122,6 @@ }; }; -&fmc { - status = "okay"; - flash@0 { - status = "okay"; - m25p,fast-read; - label = "bmc"; - spi-max-frequency = <50000000>; -#include "openbmc-flash-layout-128.dtsi" - }; - - flash@1 { - status = "okay"; - m25p,fast-read; - label = "alt-bmc"; - spi-max-frequency = <50000000>; - }; -}; - -&spi1 { - status = "okay"; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_spi1_default>; - - flash@0 { - status = "okay"; - m25p,fast-read; - label = "pnor"; - spi-max-frequency = <100000000>; - }; -}; - &mac2 { status = "okay"; pinctrl-names = "default"; @@ -165,6 +134,11 @@ &emmc { status = "okay"; +}; + +&fsim0 { + status = "okay"; + #address-cells = <2>; #size-cells = <0>; @@ -820,373 +794,6 @@ status = "okay"; }; -&i2c0 { - status = "okay"; -}; - -&i2c1 { - status = "okay"; -}; - -&i2c2 { - status = "okay"; -}; - -&i2c3 { - status = "okay"; - - bmp: bmp280@77 { - compatible = "bosch,bmp280"; - reg = <0x77>; - #io-channel-cells = <1>; - }; - - max31785@52 { - compatible = "maxim,max31785a"; - reg = <0x52>; - #address-cells = <1>; - #size-cells = <0>; - - fan@0 { - compatible = "pmbus-fan"; - reg = <0>; - tach-pulses = <2>; - maxim,fan-rotor-input = "tach"; - maxim,fan-pwm-freq = <25000>; - maxim,fan-dual-tach; - maxim,fan-no-watchdog; - maxim,fan-no-fault-ramp; - maxim,fan-ramp = <2>; - maxim,fan-fault-pin-mon; - }; - - fan@1 { - compatible = "pmbus-fan"; - reg = <1>; - tach-pulses = <2>; - maxim,fan-rotor-input = "tach"; - maxim,fan-pwm-freq = <25000>; - maxim,fan-dual-tach; - maxim,fan-no-watchdog; - maxim,fan-no-fault-ramp; - maxim,fan-ramp = <2>; - maxim,fan-fault-pin-mon; - }; - - fan@2 { - compatible = "pmbus-fan"; - reg = <2>; - tach-pulses = <2>; - maxim,fan-rotor-input = "tach"; - maxim,fan-pwm-freq = <25000>; - maxim,fan-dual-tach; - maxim,fan-no-watchdog; - maxim,fan-no-fault-ramp; - maxim,fan-ramp = <2>; - maxim,fan-fault-pin-mon; - }; - - fan@3 { - compatible = "pmbus-fan"; - reg = <3>; - tach-pulses = <2>; - maxim,fan-rotor-input = "tach"; - maxim,fan-pwm-freq = <25000>; - maxim,fan-dual-tach; - maxim,fan-no-watchdog; - maxim,fan-no-fault-ramp; - maxim,fan-ramp = <2>; - maxim,fan-fault-pin-mon; - }; - }; - - dps: dps310@76 { - compatible = "infineon,dps310"; - reg = <0x76>; - #io-channel-cells = <0>; - }; - - pca0: pca9552@60 { - compatible = "nxp,pca9552"; - reg = <0x60>; - #address-cells = <1>; - #size-cells = <0>; - - gpio-controller; - #gpio-cells = <2>; - - gpio@0 { - reg = <0>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@1 { - reg = <1>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@2 { - reg = <2>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@3 { - reg = <3>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@4 { - reg = <4>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@5 { - reg = <5>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@6 { - reg = <6>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@7 { - reg = <7>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@8 { - reg = <8>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@9 { - reg = <9>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@10 { - reg = <10>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@11 { - reg = <11>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@12 { - reg = <12>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@13 { - reg = <13>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@14 { - reg = <14>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@15 { - reg = <15>; - type = <PCA955X_TYPE_GPIO>; - }; - }; - - power-supply@68 { - compatible = "ibm,cffps1"; - reg = <0x68>; - }; - - power-supply@69 { - compatible = "ibm,cffps1"; - reg = <0x69>; - }; -}; - -&i2c4 { - status = "okay"; - - tmp423a@4c { - compatible = "ti,tmp423"; - reg = <0x4c>; - }; - - ir35221@70 { - compatible = "infineon,ir35221"; - reg = <0x70>; - }; - - ir35221@71 { - compatible = "infineon,ir35221"; - reg = <0x71>; - }; -}; - -&i2c5 { - status = "okay"; - - tmp423a@4c { - compatible = "ti,tmp423"; - reg = <0x4c>; - }; - - ir35221@70 { - compatible = "infineon,ir35221"; - reg = <0x70>; - }; - - ir35221@71 { - compatible = "infineon,ir35221"; - reg = <0x71>; - }; -}; - -&i2c7 { - status = "okay"; -}; - -&i2c9 { - status = "okay"; - - tmp275@4a { - compatible = "ti,tmp275"; - reg = <0x4a>; - }; -}; - -&i2c10 { - status = "okay"; -}; - -&i2c11 { - status = "okay"; - - pca9552: pca9552@60 { - compatible = "nxp,pca9552"; - reg = <0x60>; - #address-cells = <1>; - #size-cells = <0>; - gpio-controller; - #gpio-cells = <2>; - - gpio-line-names = "PS_SMBUS_RESET_N", "APSS_RESET_N", - "GPU0_TH_OVERT_N_BUFF", "GPU1_TH_OVERT_N_BUFF", - "GPU2_TH_OVERT_N_BUFF", "GPU3_TH_OVERT_N_BUFF", - "GPU4_TH_OVERT_N_BUFF", "GPU5_TH_OVERT_N_BUFF", - "GPU0_PWR_GOOD_BUFF", "GPU1_PWR_GOOD_BUFF", - "GPU2_PWR_GOOD_BUFF", "GPU3_PWR_GOOD_BUFF", - "GPU4_PWR_GOOD_BUFF", "GPU5_PWR_GOOD_BUFF", - "12V_BREAKER_FLT_N", "THROTTLE_UNLATCHED_N"; - - gpio@0 { - reg = <0>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@1 { - reg = <1>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@2 { - reg = <2>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@3 { - reg = <3>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@4 { - reg = <4>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@5 { - reg = <5>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@6 { - reg = <6>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@7 { - reg = <7>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@8 { - reg = <8>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@9 { - reg = <9>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@10 { - reg = <10>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@11 { - reg = <11>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@12 { - reg = <12>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@13 { - reg = <13>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@14 { - reg = <14>; - type = <PCA955X_TYPE_GPIO>; - }; - - gpio@15 { - reg = <15>; - type = <PCA955X_TYPE_GPIO>; - }; - }; - - rtc@32 { - compatible = "epson,rx8900"; - reg = <0x32>; - }; - - eeprom@51 { - compatible = "atmel,24c64"; - reg = <0x51>; - }; - - ucd90160@64 { - compatible = "ti,ucd90160"; - reg = <0x64>; - }; -}; - -&i2c12 { - status = "okay"; -}; - -&i2c13 { - status = "okay"; -}; - &pinctrl { /* Hog these as no driver is probed for the entire LPC block */ pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/aspeed-g6.dtsi b/arch/arm/boot/dts/aspeed-g6.dtsi index 5f6142d99eeb..b72afbaadaf8 100644 --- a/arch/arm/boot/dts/aspeed-g6.dtsi +++ b/arch/arm/boot/dts/aspeed-g6.dtsi @@ -163,26 +163,6 @@ spi-max-frequency = <50000000>; status = "disabled"; }; - - fsim0: fsi@1e79b000 { - compatible = "aspeed,ast2600-fsi-master", "fsi-master"; - reg = <0x1e79b000 0x94>; - interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_fsi1_default>; - clocks = <&syscon ASPEED_CLK_GATE_FSICLK>; - status = "disabled"; - }; - - fsim1: fsi@1e79b100 { - compatible = "aspeed,ast2600-fsi-master", "fsi-master"; - reg = <0x1e79b100 0x94>; - interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_fsi2_default>; - clocks = <&syscon ASPEED_CLK_GATE_FSICLK>; - status = "disabled"; - }; }; mdio0: mdio@1e650000 { @@ -595,6 +575,25 @@ ranges = <0 0x1e78a000 0x1000>; }; + fsim0: fsi@1e79b000 { + compatible = "aspeed,ast2600-fsi-master", "fsi-master"; + reg = <0x1e79b000 0x94>; + interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_fsi1_default>; + clocks = <&syscon ASPEED_CLK_GATE_FSICLK>; + status = "disabled"; + }; + + fsim1: fsi@1e79b100 { + compatible = "aspeed,ast2600-fsi-master", "fsi-master"; + reg = <0x1e79b100 0x94>; + interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_fsi2_default>; + clocks = <&syscon ASPEED_CLK_GATE_FSICLK>; + status = "disabled"; + }; }; }; }; diff --git a/arch/arm/boot/dts/imx6dl-icore-mipi.dts b/arch/arm/boot/dts/imx6dl-icore-mipi.dts index e43bccb78ab2..d8f3821a0ffd 100644 --- a/arch/arm/boot/dts/imx6dl-icore-mipi.dts +++ b/arch/arm/boot/dts/imx6dl-icore-mipi.dts @@ -8,7 +8,7 @@ /dts-v1/; #include "imx6dl.dtsi" -#include "imx6qdl-icore.dtsi" +#include "imx6qdl-icore-1.5.dtsi" / { model = "Engicam i.CoreM6 DualLite/Solo MIPI Starter Kit"; diff --git a/arch/arm/boot/dts/imx6q-dhcom-pdk2.dts b/arch/arm/boot/dts/imx6q-dhcom-pdk2.dts index 5219553df1e7..bb74fc62d913 100644 --- a/arch/arm/boot/dts/imx6q-dhcom-pdk2.dts +++ b/arch/arm/boot/dts/imx6q-dhcom-pdk2.dts @@ -63,7 +63,7 @@ #sound-dai-cells = <0>; clocks = <&clk_ext_audio_codec>; VDDA-supply = <®_3p3v>; - VDDIO-supply = <®_3p3v>; + VDDIO-supply = <&sw2_reg>; }; }; diff --git a/arch/arm/boot/dts/imx6q-dhcom-som.dtsi b/arch/arm/boot/dts/imx6q-dhcom-som.dtsi index 845cfad99bf9..87f0aa897086 100644 --- a/arch/arm/boot/dts/imx6q-dhcom-som.dtsi +++ b/arch/arm/boot/dts/imx6q-dhcom-som.dtsi @@ -204,7 +204,7 @@ }; rtc@56 { - compatible = "rv3029c2"; + compatible = "microcrystal,rv3029"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_rtc_hw300>; reg = <0x56>; diff --git a/arch/arm/boot/dts/imx6qdl-sabresd.dtsi b/arch/arm/boot/dts/imx6qdl-sabresd.dtsi index 71ca76a5e4a5..fe59dde41b64 100644 --- a/arch/arm/boot/dts/imx6qdl-sabresd.dtsi +++ b/arch/arm/boot/dts/imx6qdl-sabresd.dtsi @@ -749,10 +749,6 @@ vin-supply = <&vgen5_reg>; }; -®_vdd3p0 { - vin-supply = <&sw2_reg>; -}; - ®_vdd2p5 { vin-supply = <&vgen5_reg>; }; diff --git a/arch/arm/boot/dts/imx6sl-evk.dts b/arch/arm/boot/dts/imx6sl-evk.dts index 4829aa682aeb..bc86cfaaa9c2 100644 --- a/arch/arm/boot/dts/imx6sl-evk.dts +++ b/arch/arm/boot/dts/imx6sl-evk.dts @@ -584,10 +584,6 @@ vin-supply = <&sw2_reg>; }; -®_vdd3p0 { - vin-supply = <&sw2_reg>; -}; - ®_vdd2p5 { vin-supply = <&sw2_reg>; }; diff --git a/arch/arm/boot/dts/imx6sll-evk.dts b/arch/arm/boot/dts/imx6sll-evk.dts index 3e1d32fdf4b8..5ace9e6acf85 100644 --- a/arch/arm/boot/dts/imx6sll-evk.dts +++ b/arch/arm/boot/dts/imx6sll-evk.dts @@ -265,10 +265,6 @@ status = "okay"; }; -®_3p0 { - vin-supply = <&sw2_reg>; -}; - &snvs_poweroff { status = "okay"; }; diff --git a/arch/arm/boot/dts/imx6sx-sdb-reva.dts b/arch/arm/boot/dts/imx6sx-sdb-reva.dts index f1830ed387a5..91a7548fdb8d 100644 --- a/arch/arm/boot/dts/imx6sx-sdb-reva.dts +++ b/arch/arm/boot/dts/imx6sx-sdb-reva.dts @@ -159,10 +159,6 @@ vin-supply = <&vgen6_reg>; }; -®_vdd3p0 { - vin-supply = <&sw2_reg>; -}; - ®_vdd2p5 { vin-supply = <&vgen6_reg>; }; diff --git a/arch/arm/boot/dts/imx6sx-sdb.dts b/arch/arm/boot/dts/imx6sx-sdb.dts index a8ee7087af5a..5a63ca615722 100644 --- a/arch/arm/boot/dts/imx6sx-sdb.dts +++ b/arch/arm/boot/dts/imx6sx-sdb.dts @@ -141,10 +141,6 @@ vin-supply = <&vgen6_reg>; }; -®_vdd3p0 { - vin-supply = <&sw2_reg>; -}; - ®_vdd2p5 { vin-supply = <&vgen6_reg>; }; diff --git a/arch/arm/boot/dts/imx7s-colibri.dtsi b/arch/arm/boot/dts/imx7s-colibri.dtsi index 1fb1ec5d3d70..6d16e32aed89 100644 --- a/arch/arm/boot/dts/imx7s-colibri.dtsi +++ b/arch/arm/boot/dts/imx7s-colibri.dtsi @@ -49,3 +49,7 @@ reg = <0x80000000 0x10000000>; }; }; + +&gpmi { + status = "okay"; +}; diff --git a/arch/arm/boot/dts/imx7ulp.dtsi b/arch/arm/boot/dts/imx7ulp.dtsi index d37a1927c88e..ab91c98f2124 100644 --- a/arch/arm/boot/dts/imx7ulp.dtsi +++ b/arch/arm/boot/dts/imx7ulp.dtsi @@ -37,10 +37,10 @@ #address-cells = <1>; #size-cells = <0>; - cpu0: cpu@0 { + cpu0: cpu@f00 { compatible = "arm,cortex-a7"; device_type = "cpu"; - reg = <0>; + reg = <0xf00>; }; }; diff --git a/arch/arm/boot/dts/meson8.dtsi b/arch/arm/boot/dts/meson8.dtsi index 5a7e3e5caebe..3c534cd50ee3 100644 --- a/arch/arm/boot/dts/meson8.dtsi +++ b/arch/arm/boot/dts/meson8.dtsi @@ -253,7 +253,7 @@ &aobus { pmu: pmu@e0 { compatible = "amlogic,meson8-pmu", "syscon"; - reg = <0xe0 0x8>; + reg = <0xe0 0x18>; }; pinctrl_aobus: pinctrl@84 { diff --git a/arch/arm/boot/dts/mmp3.dtsi b/arch/arm/boot/dts/mmp3.dtsi index d9762de0ed34..6f480827b94d 100644 --- a/arch/arm/boot/dts/mmp3.dtsi +++ b/arch/arm/boot/dts/mmp3.dtsi @@ -356,7 +356,7 @@ twsi1: i2c@d4011000 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4011000 0x1000>; + reg = <0xd4011000 0x70>; interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>; clocks = <&soc_clocks MMP2_CLK_TWSI0>; resets = <&soc_clocks MMP2_CLK_TWSI0>; @@ -368,7 +368,7 @@ twsi2: i2c@d4031000 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4031000 0x1000>; + reg = <0xd4031000 0x70>; interrupt-parent = <&twsi_mux>; interrupts = <0>; clocks = <&soc_clocks MMP2_CLK_TWSI1>; @@ -380,7 +380,7 @@ twsi3: i2c@d4032000 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4032000 0x1000>; + reg = <0xd4032000 0x70>; interrupt-parent = <&twsi_mux>; interrupts = <1>; clocks = <&soc_clocks MMP2_CLK_TWSI2>; @@ -392,7 +392,7 @@ twsi4: i2c@d4033000 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4033000 0x1000>; + reg = <0xd4033000 0x70>; interrupt-parent = <&twsi_mux>; interrupts = <2>; clocks = <&soc_clocks MMP2_CLK_TWSI3>; @@ -405,7 +405,7 @@ twsi5: i2c@d4033800 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4033800 0x1000>; + reg = <0xd4033800 0x70>; interrupt-parent = <&twsi_mux>; interrupts = <3>; clocks = <&soc_clocks MMP2_CLK_TWSI4>; @@ -417,7 +417,7 @@ twsi6: i2c@d4034000 { compatible = "mrvl,mmp-twsi"; - reg = <0xd4034000 0x1000>; + reg = <0xd4034000 0x70>; interrupt-parent = <&twsi_mux>; interrupts = <4>; clocks = <&soc_clocks MMP2_CLK_TWSI5>; diff --git a/arch/arm/boot/dts/sun8i-a83t-cubietruck-plus.dts b/arch/arm/boot/dts/sun8i-a83t-cubietruck-plus.dts index fb928503ad45..d9be511f054f 100644 --- a/arch/arm/boot/dts/sun8i-a83t-cubietruck-plus.dts +++ b/arch/arm/boot/dts/sun8i-a83t-cubietruck-plus.dts @@ -101,7 +101,7 @@ initial-mode = <1>; /* initialize in HUB mode */ disabled-ports = <1>; intn-gpios = <&pio 7 5 GPIO_ACTIVE_HIGH>; /* PH5 */ - reset-gpios = <&pio 4 16 GPIO_ACTIVE_HIGH>; /* PE16 */ + reset-gpios = <&pio 4 16 GPIO_ACTIVE_LOW>; /* PE16 */ connect-gpios = <&pio 4 17 GPIO_ACTIVE_HIGH>; /* PE17 */ refclk-frequency = <19200000>; }; diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index ae5020302de4..6607fa817bba 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -146,10 +146,9 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE #if !defined(ZIMAGE) && defined(CONFIG_ARM_ARCH_TIMER) @ make CNTP_* and CNTPCT accessible from PL1 mrc p15, 0, r7, c0, c1, 1 @ ID_PFR1 - lsr r7, #16 - and r7, #0xf - cmp r7, #1 - bne 1f + ubfx r7, r7, #16, #4 + teq r7, #0 + beq 1f mrc p15, 4, r7, c14, c1, 0 @ CNTHCTL orr r7, r7, #3 @ PL1PCEN | PL1PCTEN mcr p15, 4, r7, c14, c1, 0 @ CNTHCTL diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index cea1c27c29cb..46e478fb5ea2 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -226,8 +226,8 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) +copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, + unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); @@ -261,7 +261,7 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start, clear_ptrace_hw_breakpoint(p); if (clone_flags & CLONE_SETTLS) - thread->tp_value[0] = childregs->ARM_r3; + thread->tp_value[0] = tls; thread->tp_value[1] = get_tpuser(); thread_notify(THREAD_NOTIFY_COPY, thread); diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index dd427bd2768c..02b180ad7245 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -9,6 +9,7 @@ menuconfig ARCH_DAVINCI select PM_GENERIC_DOMAINS if PM select PM_GENERIC_DOMAINS_OF if PM && OF select REGMAP_MMIO + select RESET_CONTROLLER select HAVE_IDE select PINCTRL_SINGLE diff --git a/arch/arm/mach-ixp4xx/fsg-setup.c b/arch/arm/mach-ixp4xx/fsg-setup.c index 648932d8d7a8..507ee3878769 100644 --- a/arch/arm/mach-ixp4xx/fsg-setup.c +++ b/arch/arm/mach-ixp4xx/fsg-setup.c @@ -132,6 +132,22 @@ static struct platform_device fsg_leds = { }; /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource fsg_eth_npeb_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + +static struct resource fsg_eth_npec_resources[] = { + { + .start = IXP4XX_EthC_BASE_PHYS, + .end = IXP4XX_EthC_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info fsg_plat_eth[] = { { .phy = 5, @@ -151,12 +167,16 @@ static struct platform_device fsg_eth[] = { .dev = { .platform_data = fsg_plat_eth, }, + .num_resources = ARRAY_SIZE(fsg_eth_npeb_resources), + .resource = fsg_eth_npeb_resources, }, { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEC, .dev = { .platform_data = fsg_plat_eth + 1, }, + .num_resources = ARRAY_SIZE(fsg_eth_npec_resources), + .resource = fsg_eth_npec_resources, } }; diff --git a/arch/arm/mach-ixp4xx/goramo_mlr.c b/arch/arm/mach-ixp4xx/goramo_mlr.c index a0e0b6b7dc5c..07b50dfcc489 100644 --- a/arch/arm/mach-ixp4xx/goramo_mlr.c +++ b/arch/arm/mach-ixp4xx/goramo_mlr.c @@ -11,6 +11,7 @@ #include <linux/irq.h> #include <linux/kernel.h> #include <linux/pci.h> +#include <linux/platform_data/wan_ixp4xx_hss.h> #include <linux/serial_8250.h> #include <asm/mach-types.h> #include <asm/mach/arch.h> @@ -272,6 +273,22 @@ static struct platform_device device_uarts = { /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource eth_npeb_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + +static struct resource eth_npec_resources[] = { + { + .start = IXP4XX_EthC_BASE_PHYS, + .end = IXP4XX_EthC_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info eth_plat[] = { { .phy = 0, @@ -289,10 +306,14 @@ static struct platform_device device_eth_tab[] = { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEB, .dev.platform_data = eth_plat, + .num_resources = ARRAY_SIZE(eth_npeb_resources), + .resource = eth_npeb_resources, }, { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEC, .dev.platform_data = eth_plat + 1, + .num_resources = ARRAY_SIZE(eth_npec_resources), + .resource = eth_npec_resources, } }; @@ -405,6 +426,9 @@ static void __init gmlr_init(void) if (hw_bits & CFG_HW_HAS_HSS1) device_tab[devices++] = &device_hss_tab[1]; /* max index 5 */ + hss_plat[0].timer_freq = ixp4xx_timer_freq; + hss_plat[1].timer_freq = ixp4xx_timer_freq; + gpio_request(GPIO_SCL, "SCL/clock"); gpio_request(GPIO_SDA, "SDA/data"); gpio_request(GPIO_STR, "strobe"); diff --git a/arch/arm/mach-ixp4xx/include/mach/platform.h b/arch/arm/mach-ixp4xx/include/mach/platform.h index 342acbe20f7c..6d403fe0bf52 100644 --- a/arch/arm/mach-ixp4xx/include/mach/platform.h +++ b/arch/arm/mach-ixp4xx/include/mach/platform.h @@ -15,6 +15,7 @@ #ifndef __ASSEMBLY__ #include <linux/reboot.h> +#include <linux/platform_data/eth_ixp4xx.h> #include <asm/types.h> @@ -92,27 +93,6 @@ struct ixp4xx_pata_data { void __iomem *cs1; }; -#define IXP4XX_ETH_NPEA 0x00 -#define IXP4XX_ETH_NPEB 0x10 -#define IXP4XX_ETH_NPEC 0x20 - -/* Information about built-in Ethernet MAC interfaces */ -struct eth_plat_info { - u8 phy; /* MII PHY ID, 0 - 31 */ - u8 rxq; /* configurable, currently 0 - 31 only */ - u8 txreadyq; - u8 hwaddr[6]; -}; - -/* Information about built-in HSS (synchronous serial) interfaces */ -struct hss_plat_info { - int (*set_clock)(int port, unsigned int clock_type); - int (*open)(int port, void *pdev, - void (*set_carrier_cb)(void *pdev, int carrier)); - void (*close)(int port, void *pdev); - u8 txreadyq; -}; - /* * Frequency of clock used for primary clocksource */ diff --git a/arch/arm/mach-ixp4xx/ixdp425-setup.c b/arch/arm/mach-ixp4xx/ixdp425-setup.c index 6f0f7ed18ea8..45d5b720ded6 100644 --- a/arch/arm/mach-ixp4xx/ixdp425-setup.c +++ b/arch/arm/mach-ixp4xx/ixdp425-setup.c @@ -187,6 +187,22 @@ static struct platform_device ixdp425_uart = { }; /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource ixp425_npeb_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + +static struct resource ixp425_npec_resources[] = { + { + .start = IXP4XX_EthC_BASE_PHYS, + .end = IXP4XX_EthC_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info ixdp425_plat_eth[] = { { .phy = 0, @@ -204,10 +220,14 @@ static struct platform_device ixdp425_eth[] = { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEB, .dev.platform_data = ixdp425_plat_eth, + .num_resources = ARRAY_SIZE(ixp425_npeb_resources), + .resource = ixp425_npeb_resources, }, { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEC, .dev.platform_data = ixdp425_plat_eth + 1, + .num_resources = ARRAY_SIZE(ixp425_npec_resources), + .resource = ixp425_npec_resources, } }; diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c index c142cfa8c5d6..6959ad2e3aec 100644 --- a/arch/arm/mach-ixp4xx/nas100d-setup.c +++ b/arch/arm/mach-ixp4xx/nas100d-setup.c @@ -165,6 +165,14 @@ static struct platform_device nas100d_uart = { }; /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource nas100d_eth_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info nas100d_plat_eth[] = { { .phy = 0, @@ -178,6 +186,8 @@ static struct platform_device nas100d_eth[] = { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEB, .dev.platform_data = nas100d_plat_eth, + .num_resources = ARRAY_SIZE(nas100d_eth_resources), + .resource = nas100d_eth_resources, } }; diff --git a/arch/arm/mach-ixp4xx/nslu2-setup.c b/arch/arm/mach-ixp4xx/nslu2-setup.c index ee1877fcfafe..a428bb918703 100644 --- a/arch/arm/mach-ixp4xx/nslu2-setup.c +++ b/arch/arm/mach-ixp4xx/nslu2-setup.c @@ -185,6 +185,14 @@ static struct platform_device nslu2_uart = { }; /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource nslu2_eth_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info nslu2_plat_eth[] = { { .phy = 1, @@ -198,6 +206,8 @@ static struct platform_device nslu2_eth[] = { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEB, .dev.platform_data = nslu2_plat_eth, + .num_resources = ARRAY_SIZE(nslu2_eth_resources), + .resource = nslu2_eth_resources, } }; diff --git a/arch/arm/mach-ixp4xx/omixp-setup.c b/arch/arm/mach-ixp4xx/omixp-setup.c index 6ed5a9aed600..8f2b8c473d7a 100644 --- a/arch/arm/mach-ixp4xx/omixp-setup.c +++ b/arch/arm/mach-ixp4xx/omixp-setup.c @@ -170,6 +170,22 @@ static struct platform_device mic256_leds = { }; /* Built-in 10/100 Ethernet MAC interfaces */ +static struct resource ixp425_npeb_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + +static struct resource ixp425_npec_resources[] = { + { + .start = IXP4XX_EthC_BASE_PHYS, + .end = IXP4XX_EthC_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info ixdp425_plat_eth[] = { { .phy = 0, @@ -187,10 +203,14 @@ static struct platform_device ixdp425_eth[] = { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEB, .dev.platform_data = ixdp425_plat_eth, + .num_resources = ARRAY_SIZE(ixp425_npeb_resources), + .resource = ixp425_npeb_resources, }, { .name = "ixp4xx_eth", .id = IXP4XX_ETH_NPEC, .dev.platform_data = ixdp425_plat_eth + 1, + .num_resources = ARRAY_SIZE(ixp425_npec_resources), + .resource = ixp425_npec_resources, }, }; diff --git a/arch/arm/mach-ixp4xx/vulcan-setup.c b/arch/arm/mach-ixp4xx/vulcan-setup.c index d2ebb7c675a8..e506d2af98ad 100644 --- a/arch/arm/mach-ixp4xx/vulcan-setup.c +++ b/arch/arm/mach-ixp4xx/vulcan-setup.c @@ -124,6 +124,22 @@ static struct platform_device vulcan_uart = { .num_resources = ARRAY_SIZE(vulcan_uart_resources), }; +static struct resource vulcan_npeb_resources[] = { + { + .start = IXP4XX_EthB_BASE_PHYS, + .end = IXP4XX_EthB_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + +static struct resource vulcan_npec_resources[] = { + { + .start = IXP4XX_EthC_BASE_PHYS, + .end = IXP4XX_EthC_BASE_PHYS + 0x0fff, + .flags = IORESOURCE_MEM, + }, +}; + static struct eth_plat_info vulcan_plat_eth[] = { [0] = { .phy = 0, @@ -144,6 +160,8 @@ static struct platform_device vulcan_eth[] = { .dev = { .platform_data = &vulcan_plat_eth[0], }, + .num_resources = ARRAY_SIZE(vulcan_npeb_resources), + .resource = vulcan_npeb_resources, }, [1] = { .name = "ixp4xx_eth", @@ -151,6 +169,8 @@ static struct platform_device vulcan_eth[] = { .dev = { .platform_data = &vulcan_plat_eth[1], }, + .num_resources = ARRAY_SIZE(vulcan_npec_resources), + .resource = vulcan_npec_resources, }, }; diff --git a/arch/arm/mach-mmp/time.c b/arch/arm/mach-mmp/time.c index 110dcb3314d1..c65cfc1ad99b 100644 --- a/arch/arm/mach-mmp/time.c +++ b/arch/arm/mach-mmp/time.c @@ -207,7 +207,7 @@ static int __init mmp_dt_init_timer(struct device_node *np) ret = clk_prepare_enable(clk); if (ret) return ret; - rate = clk_get_rate(clk) / 2; + rate = clk_get_rate(clk); } else if (cpu_is_pj4()) { rate = 6500000; } else { diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index ad08d470a2ca..dca7d06c0b93 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -95,6 +95,7 @@ config ARCH_OMAP2PLUS bool select ARCH_HAS_BANDGAP select ARCH_HAS_HOLES_MEMORYMODEL + select ARCH_HAS_RESET_CONTROLLER select ARCH_OMAP select CLKSRC_MMIO select GENERIC_IRQ_CHIP @@ -105,11 +106,11 @@ config ARCH_OMAP2PLUS select OMAP_DM_TIMER select OMAP_GPMC select PINCTRL + select RESET_CONTROLLER select SOC_BUS select TI_SYSC select OMAP_IRQCHIP select CLKSRC_TI_32K - select ARCH_HAS_RESET_CONTROLLER help Systems based on OMAP2, OMAP3, OMAP4 or OMAP5 diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index ca52271de5a8..e95c224ffc4d 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -306,10 +306,14 @@ static void __init dra7x_evm_mmc_quirk(void) static struct clockdomain *ti_sysc_find_one_clockdomain(struct clk *clk) { + struct clk_hw *hw = __clk_get_hw(clk); struct clockdomain *clkdm = NULL; struct clk_hw_omap *hwclk; - hwclk = to_clk_hw_omap(__clk_get_hw(clk)); + hwclk = to_clk_hw_omap(hw); + if (!omap2_clk_is_hw_omap(hw)) + return NULL; + if (hwclk && hwclk->clkdm_name) clkdm = clkdm_lookup(hwclk->clkdm_name); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 29d03459de20..e2c758df08b4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -139,6 +139,7 @@ config ARM64 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING + select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino-emmc.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino-emmc.dts index 96ab0227e82d..121e6cc4849b 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino-emmc.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino-emmc.dts @@ -15,7 +15,7 @@ pinctrl-names = "default"; pinctrl-0 = <&mmc2_pins>; vmmc-supply = <®_dcdc1>; - vqmmc-supply = <®_dcdc1>; + vqmmc-supply = <®_eldo1>; bus-width = <8>; non-removable; cap-mmc-hw-reset; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino.dts index 01a9a52edae4..393c1948a495 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-olinuxino.dts @@ -140,7 +140,7 @@ &mmc1 { pinctrl-names = "default"; pinctrl-0 = <&mmc1_pins>; - vmmc-supply = <®_aldo2>; + vmmc-supply = <®_dcdc1>; vqmmc-supply = <®_dldo4>; mmc-pwrseq = <&wifi_pwrseq>; bus-width = <4>; diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 144a2c19ac02..d1fc9c2055f4 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -61,10 +61,10 @@ pmu { compatible = "arm,armv8-pmuv3"; - interrupts = <0 120 8>, - <0 121 8>, - <0 122 8>, - <0 123 8>; + interrupts = <0 170 4>, + <0 171 4>, + <0 172 4>, + <0 173 4>; interrupt-affinity = <&cpu0>, <&cpu1>, <&cpu2>, diff --git a/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts index 5bd07469766b..a8bb3fa9fec9 100644 --- a/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts +++ b/arch/arm64/boot/dts/amlogic/meson-sm1-sei610.dts @@ -46,25 +46,47 @@ }; gpio-keys { - compatible = "gpio-keys-polled"; - poll-interval = <100>; + compatible = "gpio-keys"; key1 { label = "A"; linux,code = <BTN_0>; gpios = <&gpio GPIOH_6 GPIO_ACTIVE_LOW>; + interrupt-parent = <&gpio_intc>; + interrupts = <34 IRQ_TYPE_EDGE_BOTH>; }; key2 { label = "B"; linux,code = <BTN_1>; gpios = <&gpio GPIOH_7 GPIO_ACTIVE_LOW>; + interrupt-parent = <&gpio_intc>; + interrupts = <35 IRQ_TYPE_EDGE_BOTH>; }; key3 { label = "C"; linux,code = <BTN_2>; gpios = <&gpio_ao GPIOAO_2 GPIO_ACTIVE_LOW>; + interrupt-parent = <&gpio_intc>; + interrupts = <2 IRQ_TYPE_EDGE_BOTH>; + }; + + mic_mute { + label = "MicMute"; + linux,code = <SW_MUTE_DEVICE>; + linux,input-type = <EV_SW>; + gpios = <&gpio_ao GPIOE_2 GPIO_ACTIVE_LOW>; + interrupt-parent = <&gpio_intc>; + interrupts = <99 IRQ_TYPE_EDGE_BOTH>; + }; + + power_key { + label = "PowerKey"; + linux,code = <KEY_POWER>; + gpios = <&gpio_ao GPIOAO_3 GPIO_ACTIVE_LOW>; + interrupt-parent = <&gpio_intc>; + interrupts = <3 IRQ_TYPE_EDGE_BOTH>; }; }; @@ -569,6 +591,8 @@ bluetooth { compatible = "brcm,bcm43438-bt"; + interrupt-parent = <&gpio_intc>; + interrupts = <95 IRQ_TYPE_LEVEL_HIGH>; shutdown-gpios = <&gpio GPIOX_17 GPIO_ACTIVE_HIGH>; max-speed = <2000000>; clocks = <&wifi32k>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 13a3cbe89b5a..a6f9b7784e8f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -175,7 +175,7 @@ dcfg: syscon@1e00000 { compatible = "fsl,ls1028a-dcfg", "syscon"; reg = <0x0 0x1e00000 0x0 0x10000>; - big-endian; + little-endian; }; rst: syscon@1e60000 { diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index 6edbdfe2d0d7..3d95b66a2d71 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -740,7 +740,7 @@ reg = <0x30bd0000 0x10000>; interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk IMX8MM_CLK_SDMA1_ROOT>, - <&clk IMX8MM_CLK_SDMA1_ROOT>; + <&clk IMX8MM_CLK_AHB>; clock-names = "ipg", "ahb"; #dma-cells = <3>; fsl,sdma-ram-script-name = "imx/sdma/sdma-imx7d.bin"; diff --git a/arch/arm64/boot/dts/freescale/imx8mq-librem5-devkit.dts b/arch/arm64/boot/dts/freescale/imx8mq-librem5-devkit.dts index 2a759dff9f87..596bc65f475c 100644 --- a/arch/arm64/boot/dts/freescale/imx8mq-librem5-devkit.dts +++ b/arch/arm64/boot/dts/freescale/imx8mq-librem5-devkit.dts @@ -421,7 +421,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_imu>; interrupt-parent = <&gpio3>; - interrupts = <19 IRQ_TYPE_LEVEL_LOW>; + interrupts = <19 IRQ_TYPE_LEVEL_HIGH>; vdd-supply = <®_3v3_p>; vddio-supply = <®_3v3_p>; }; diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi index 94090c6fb946..d43e1299c8ef 100644 --- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi +++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi @@ -60,10 +60,10 @@ pmu { compatible = "arm,armv8-pmuv3"; - interrupts = <0 120 8>, - <0 121 8>, - <0 122 8>, - <0 123 8>; + interrupts = <0 170 4>, + <0 171 4>, + <0 172 4>, + <0 173 4>; interrupt-affinity = <&cpu0>, <&cpu1>, <&cpu2>, diff --git a/arch/arm64/boot/dts/rockchip/rk3328-a1.dts b/arch/arm64/boot/dts/rockchip/rk3328-a1.dts index 76b49f573101..16f1656d5203 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-a1.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-a1.dts @@ -49,7 +49,8 @@ ir-receiver { compatible = "gpio-ir-receiver"; - gpios = <&gpio2 RK_PA2 GPIO_ACTIVE_HIGH>; + gpios = <&gpio2 RK_PA2 GPIO_ACTIVE_LOW>; + linux,rc-map-name = "rc-beelink-gs1"; }; }; diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 8dc6c5cdabe6..baf52baaa2a5 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -85,13 +85,12 @@ #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) -#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY #define __P010 PAGE_READONLY #define __P011 PAGE_READONLY -#define __P100 PAGE_EXECONLY +#define __P100 PAGE_READONLY_EXEC #define __P101 PAGE_READONLY_EXEC #define __P110 PAGE_READONLY_EXEC #define __P111 PAGE_READONLY_EXEC @@ -100,7 +99,7 @@ #define __S001 PAGE_READONLY #define __S010 PAGE_SHARED #define __S011 PAGE_SHARED -#define __S100 PAGE_EXECONLY +#define __S100 PAGE_READONLY_EXEC #define __S101 PAGE_READONLY_EXEC #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5d15b4735a0e..cd5de0e40bfa 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -96,12 +96,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) -/* - * Execute-only user mappings do not have the PTE_USER bit set. All valid - * kernel mappings have the PTE_UXN bit set. - */ #define pte_valid_not_user(pte) \ - ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) + ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) #define pte_valid_young(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) #define pte_valid_user(pte) \ @@ -117,8 +113,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; /* * p??_access_permitted() is true for valid user mappings (subject to the - * write permission check) other than user execute-only which do not have the - * PTE_USER bit set. PROT_NONE mappings do not have the PTE_VALID bit set. + * write permission check). PROT_NONE mappings do not have the PTE_VALID bit + * set. */ #define pte_access_permitted(pte, write) \ (pte_valid_user(pte) && (!(write) || pte_write(pte))) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 2629a68b8724..5af82587909e 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -42,7 +42,6 @@ #endif #define __ARCH_WANT_SYS_CLONE -#define __ARCH_WANT_SYS_CLONE3 #ifndef __COMPAT_SYSCALL_NR #include <uapi/asm/unistd.h> diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index 4703d218663a..f83a70e07df8 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -19,5 +19,6 @@ #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS +#define __ARCH_WANT_SYS_CLONE3 #include <asm-generic/unistd.h> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 71f788cd2b18..d54586d5b031 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,8 +360,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, + unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); @@ -394,11 +394,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } /* - * If a TLS pointer was passed to clone (4th argument), use it - * for the new thread. + * If a TLS pointer was passed to clone, use it for the new + * thread. */ if (clone_flags & CLONE_SETTLS) - p->thread.uw.tp_value = childregs->regs[3]; + p->thread.uw.tp_value = tls; } else { memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 077b02a2d4d3..85566d32958f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -445,7 +445,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, const struct fault_info *inf; struct mm_struct *mm = current->mm; vm_fault_t fault, major = 0; - unsigned long vm_flags = VM_READ | VM_WRITE; + unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (kprobe_page_fault(regs, esr)) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 5a3b15a14a7f..40797cbfba2d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1070,7 +1070,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; /* * FIXME: Cleanup page tables (also in arch_add_memory() in case @@ -1079,7 +1078,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be * unlocked yet. */ - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 12cd9231c4b8..0231d69c8bf2 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -91,7 +91,7 @@ static inline void atomic_##op(int i, atomic_t *v) \ "1: %0 = memw_locked(%1);\n" \ " %0 = "#op "(%0,%2);\n" \ " memw_locked(%1,P3)=%0;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -107,7 +107,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ "1: %0 = memw_locked(%1);\n" \ " %0 = "#op "(%0,%2);\n" \ " memw_locked(%1,P3)=%0;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -124,7 +124,7 @@ static inline int atomic_fetch_##op(int i, atomic_t *v) \ "1: %0 = memw_locked(%2);\n" \ " %1 = "#op "(%0,%3);\n" \ " memw_locked(%2,P3)=%1;\n" \ - " if !P3 jump 1b;\n" \ + " if (!P3) jump 1b;\n" \ : "=&r" (output), "=&r" (val) \ : "r" (&v->counter), "r" (i) \ : "memory", "p3" \ @@ -173,7 +173,7 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) " }" " memw_locked(%2, p3) = %1;" " {" - " if !p3 jump 1b;" + " if (!p3) jump 1b;" " }" "2:" : "=&r" (__oldval), "=&r" (tmp) diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 47384b094b94..71429f756af0 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -38,7 +38,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = clrbit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -62,7 +62,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = setbit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -88,7 +88,7 @@ static inline int test_and_change_bit(int nr, volatile void *addr) "1: R12 = memw_locked(R10);\n" " { P0 = tstbit(R12,R11); R12 = togglebit(R12,R11); }\n" " memw_locked(R10,P1) = R12;\n" - " {if !P1 jump 1b; %0 = mux(P0,#1,#0);}\n" + " {if (!P1) jump 1b; %0 = mux(P0,#1,#0);}\n" : "=&r" (oldval) : "r" (addr), "r" (nr) : "r10", "r11", "r12", "p0", "p1", "memory" @@ -223,7 +223,7 @@ static inline int ffs(int x) int r; asm("{ P0 = cmp.eq(%1,#0); %0 = ct0(%1);}\n" - "{ if P0 %0 = #0; if !P0 %0 = add(%0,#1);}\n" + "{ if (P0) %0 = #0; if (!P0) %0 = add(%0,#1);}\n" : "=&r" (r) : "r" (x) : "p0"); diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h index 6091322c3af9..92b8a02e588a 100644 --- a/arch/hexagon/include/asm/cmpxchg.h +++ b/arch/hexagon/include/asm/cmpxchg.h @@ -30,7 +30,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, __asm__ __volatile__ ( "1: %0 = memw_locked(%1);\n" /* load into retval */ " memw_locked(%1,P0) = %2;\n" /* store into memory */ - " if !P0 jump 1b;\n" + " if (!P0) jump 1b;\n" : "=&r" (retval) : "r" (ptr), "r" (x) : "memory", "p0" diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index cb635216a732..0191f7c7193e 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -16,7 +16,7 @@ /* For example: %1 = %4 */ \ insn \ "2: memw_locked(%3,p2) = %1;\n" \ - " if !p2 jump 1b;\n" \ + " if (!p2) jump 1b;\n" \ " %1 = #0;\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ @@ -84,10 +84,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, "1: %1 = memw_locked(%3)\n" " {\n" " p2 = cmp.eq(%1,%4)\n" - " if !p2.new jump:NT 3f\n" + " if (!p2.new) jump:NT 3f\n" " }\n" "2: memw_locked(%3,p2) = %5\n" - " if !p2 jump 1b\n" + " if (!p2) jump 1b\n" "3:\n" ".section .fixup,\"ax\"\n" "4: %0 = #%6\n" diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index 539e3efcf39c..b0dbc3473172 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -173,6 +173,7 @@ static inline void writel(u32 data, volatile void __iomem *addr) void __iomem *ioremap(unsigned long phys_addr, unsigned long size); #define ioremap_nocache ioremap +#define ioremap_uc(X, Y) ioremap((X), (Y)) #define __raw_writel writel diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index bfe07d842ff3..ef103b73bec8 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -30,9 +30,9 @@ static inline void arch_read_lock(arch_rwlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0);\n" " { P3 = cmp.ge(R6,#0); R6 = add(R6,#1);}\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -46,7 +46,7 @@ static inline void arch_read_unlock(arch_rwlock_t *lock) "1: R6 = memw_locked(%0);\n" " R6 = add(R6,#-1);\n" " memw_locked(%0,P3) = R6\n" - " if !P3 jump 1b;\n" + " if (!P3) jump 1b;\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -61,7 +61,7 @@ static inline int arch_read_trylock(arch_rwlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1);\n" " { %0 = #0; P3 = cmp.ge(R6,#0); R6 = add(R6,#1);}\n" - " { if !P3 jump 1f; }\n" + " { if (!P3) jump 1f; }\n" " memw_locked(%1,P3) = R6;\n" " { %0 = P3 }\n" "1:\n" @@ -78,9 +78,9 @@ static inline void arch_write_lock(arch_rwlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0)\n" " { P3 = cmp.eq(R6,#0); R6 = #-1;}\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -94,7 +94,7 @@ static inline int arch_write_trylock(arch_rwlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1)\n" " { %0 = #0; P3 = cmp.eq(R6,#0); R6 = #-1;}\n" - " { if !P3 jump 1f; }\n" + " { if (!P3) jump 1f; }\n" " memw_locked(%1,P3) = R6;\n" " %0 = P3;\n" "1:\n" @@ -117,9 +117,9 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) __asm__ __volatile__( "1: R6 = memw_locked(%0);\n" " P3 = cmp.eq(R6,#0);\n" - " { if !P3 jump 1b; R6 = #1; }\n" + " { if (!P3) jump 1b; R6 = #1; }\n" " memw_locked(%0,P3) = R6;\n" - " { if !P3 jump 1b; }\n" + " { if (!P3) jump 1b; }\n" : : "r" (&lock->lock) : "memory", "r6", "p3" @@ -139,7 +139,7 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) __asm__ __volatile__( " R6 = memw_locked(%1);\n" " P3 = cmp.eq(R6,#0);\n" - " { if !P3 jump 1f; R6 = #1; %0 = #0; }\n" + " { if (!P3) jump 1f; R6 = #1; %0 = #0; }\n" " memw_locked(%1,P3) = R6;\n" " %0 = P3;\n" "1:\n" diff --git a/arch/hexagon/kernel/stacktrace.c b/arch/hexagon/kernel/stacktrace.c index 35f29423fda8..5ed02f699479 100644 --- a/arch/hexagon/kernel/stacktrace.c +++ b/arch/hexagon/kernel/stacktrace.c @@ -11,8 +11,6 @@ #include <linux/thread_info.h> #include <linux/module.h> -register unsigned long current_frame_pointer asm("r30"); - struct stackframe { unsigned long fp; unsigned long rets; @@ -30,7 +28,7 @@ void save_stack_trace(struct stack_trace *trace) low = (unsigned long)task_stack_page(current); high = low + THREAD_SIZE; - fp = current_frame_pointer; + fp = (unsigned long)__builtin_frame_address(0); while (fp >= low && fp <= (high - sizeof(*frame))) { frame = (struct stackframe *)fp; diff --git a/arch/hexagon/kernel/vm_entry.S b/arch/hexagon/kernel/vm_entry.S index 12242c27e2df..4023fdbea490 100644 --- a/arch/hexagon/kernel/vm_entry.S +++ b/arch/hexagon/kernel/vm_entry.S @@ -369,7 +369,7 @@ ret_from_fork: R26.L = #LO(do_work_pending); R0 = #VM_INT_DISABLE; } - if P0 jump check_work_pending + if (P0) jump check_work_pending { R0 = R25; callr R24 diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 58fd67068bac..b01d68a2d5d9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -689,9 +689,7 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index add388236f4e..ed8e28b0fb3e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -47,7 +47,7 @@ config MIPS select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES select HAVE_ASM_MODVERSIONS - select HAVE_EBPF_JIT if (!CPU_MICROMIPS) + select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2 select HAVE_CONTEXT_TRACKING select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 172801ed35b8..d859f079b771 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -29,6 +29,9 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS) -D__ASSEMBLY__ \ -DBOOT_HEAP_SIZE=$(BOOT_HEAP_SIZE) \ -DKERNEL_ENTRY=$(VMLINUX_ENTRY_ADDRESS) +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + # decompressor objects (linked with vmlinuz) vmlinuzobjs-y := $(obj)/head.o $(obj)/decompress.o $(obj)/string.o diff --git a/arch/mips/include/asm/cpu-type.h b/arch/mips/include/asm/cpu-type.h index c46c59b0f1b4..49f0061a6051 100644 --- a/arch/mips/include/asm/cpu-type.h +++ b/arch/mips/include/asm/cpu-type.h @@ -15,7 +15,8 @@ static inline int __pure __get_cpu_type(const int cpu_type) { switch (cpu_type) { -#if defined(CONFIG_SYS_HAS_CPU_LOONGSON2EF) +#if defined(CONFIG_SYS_HAS_CPU_LOONGSON2E) || \ + defined(CONFIG_SYS_HAS_CPU_LOONGSON2F) case CPU_LOONGSON2EF: #endif diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 4993db40482c..ee26f9a4575d 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -49,8 +49,26 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -/* How to get the thread information struct from C. */ +/* + * A pointer to the struct thread_info for the currently executing thread is + * held in register $28/$gp. + * + * We declare __current_thread_info as a global register variable rather than a + * local register variable within current_thread_info() because clang doesn't + * support explicit local register variables. + * + * When building the VDSO we take care not to declare the global register + * variable because this causes GCC to not preserve the value of $28/$gp in + * functions that change its value (which is common in the PIC VDSO when + * accessing the GOT). Since the VDSO shouldn't be accessing + * __current_thread_info anyway we declare it extern in order to cause a link + * failure if it's referenced. + */ +#ifdef __VDSO__ +extern struct thread_info *__current_thread_info; +#else register struct thread_info *__current_thread_info __asm__("$28"); +#endif static inline struct thread_info *current_thread_info(void) { diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index b08825531e9f..0ae9b4cbc153 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -26,8 +26,6 @@ #define __VDSO_USE_SYSCALL ULLONG_MAX -#ifdef CONFIG_MIPS_CLOCK_VSYSCALL - static __always_inline long gettimeofday_fallback( struct __kernel_old_timeval *_tv, struct timezone *_tz) @@ -48,17 +46,6 @@ static __always_inline long gettimeofday_fallback( return error ? -ret : ret; } -#else - -static __always_inline long gettimeofday_fallback( - struct __kernel_old_timeval *_tv, - struct timezone *_tz) -{ - return -1; -} - -#endif - static __always_inline long clock_gettime_fallback( clockid_t _clkid, struct __kernel_timespec *_ts) diff --git a/arch/mips/kernel/cacheinfo.c b/arch/mips/kernel/cacheinfo.c index f777e44653d5..47312c529410 100644 --- a/arch/mips/kernel/cacheinfo.c +++ b/arch/mips/kernel/cacheinfo.c @@ -50,6 +50,25 @@ static int __init_cache_level(unsigned int cpu) return 0; } +static void fill_cpumask_siblings(int cpu, cpumask_t *cpu_map) +{ + int cpu1; + + for_each_possible_cpu(cpu1) + if (cpus_are_siblings(cpu, cpu1)) + cpumask_set_cpu(cpu1, cpu_map); +} + +static void fill_cpumask_cluster(int cpu, cpumask_t *cpu_map) +{ + int cpu1; + int cluster = cpu_cluster(&cpu_data[cpu]); + + for_each_possible_cpu(cpu1) + if (cpu_cluster(&cpu_data[cpu1]) == cluster) + cpumask_set_cpu(cpu1, cpu_map); +} + static int __populate_cache_leaves(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; @@ -57,14 +76,20 @@ static int __populate_cache_leaves(unsigned int cpu) struct cacheinfo *this_leaf = this_cpu_ci->info_list; if (c->icache.waysize) { + /* L1 caches are per core */ + fill_cpumask_siblings(cpu, &this_leaf->shared_cpu_map); populate_cache(dcache, this_leaf, 1, CACHE_TYPE_DATA); + fill_cpumask_siblings(cpu, &this_leaf->shared_cpu_map); populate_cache(icache, this_leaf, 1, CACHE_TYPE_INST); } else { populate_cache(dcache, this_leaf, 1, CACHE_TYPE_UNIFIED); } - if (c->scache.waysize) + if (c->scache.waysize) { + /* L2 cache is per cluster */ + fill_cpumask_cluster(cpu, &this_leaf->shared_cpu_map); populate_cache(scache, this_leaf, 2, CACHE_TYPE_UNIFIED); + } if (c->tcache.waysize) populate_cache(tcache, this_leaf, 3, CACHE_TYPE_UNIFIED); diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 3ec69d9cbe88..561154cbcc40 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1804,7 +1804,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int image_size; u8 *image_ptr; - if (!prog->jit_requested || MIPS_ISA_REV < 2) + if (!prog->jit_requested) return prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/mips/vdso/vgettimeofday.c b/arch/mips/vdso/vgettimeofday.c index 6ebdc37c89fc..6b83b6376a4b 100644 --- a/arch/mips/vdso/vgettimeofday.c +++ b/arch/mips/vdso/vgettimeofday.c @@ -17,12 +17,22 @@ int __vdso_clock_gettime(clockid_t clock, return __cvdso_clock_gettime32(clock, ts); } +#ifdef CONFIG_MIPS_CLOCK_VSYSCALL + +/* + * This is behind the ifdef so that we don't provide the symbol when there's no + * possibility of there being a usable clocksource, because there's nothing we + * can do without it. When libc fails the symbol lookup it should fall back on + * the standard syscall path. + */ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); } +#endif /* CONFIG_MIPS_CLOCK_VSYSCALL */ + int __vdso_clock_getres(clockid_t clock_id, struct old_timespec32 *res) { @@ -43,12 +53,22 @@ int __vdso_clock_gettime(clockid_t clock, return __cvdso_clock_gettime(clock, ts); } +#ifdef CONFIG_MIPS_CLOCK_VSYSCALL + +/* + * This is behind the ifdef so that we don't provide the symbol when there's no + * possibility of there being a usable clocksource, because there's nothing we + * can do without it. When libc fails the symbol lookup it should fall back on + * the standard syscall path. + */ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); } +#endif /* CONFIG_MIPS_CLOCK_VSYSCALL */ + int __vdso_clock_getres(clockid_t clock_id, struct __kernel_timespec *res) { diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index d9ac7e6408ef..caddded56e77 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -9,7 +9,11 @@ #define PG_dcache_dirty PG_arch_1 void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range + void flush_icache_page(struct vm_area_struct *vma, struct page *page); +#define flush_icache_page flush_icache_page + #ifdef CONFIG_CPU_CACHE_ALIASING void flush_cache_mm(struct mm_struct *mm); void flush_cache_dup_mm(struct mm_struct *mm); @@ -40,12 +44,11 @@ void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else -#include <asm-generic/cacheflush.h> -#undef flush_icache_range -#undef flush_icache_page -#undef flush_icache_user_range void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); +#define flush_icache_user_range flush_icache_user_range + +#include <asm-generic/cacheflush.h> #endif #endif /* __NDS32_CACHEFLUSH_H__ */ diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index 0214e4150539..6abc58ac406d 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -195,7 +195,7 @@ extern void paging_init(void); #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) -#define pmd_off_k(address) pmd_offset(pgd_offset_k(address), address) +#define pmd_off_k(address) pmd_offset(pud_offset(p4d_offset(pgd_offset_k(address), (address)), (address)), (address)) #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) /* diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b16237c95ea3..0c29d6cb2c8d 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -62,6 +62,7 @@ config PARISC select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE select HAVE_KPROBES_ON_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_COPY_THREAD_TLS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index a6c9f49c6612..a5f3e50fe976 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -889,8 +889,8 @@ static void print_parisc_device(struct parisc_device *dev) static int count; print_pa_hwpath(dev, hw_path); - pr_info("%d. %s at 0x%px [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", - ++count, dev->name, (void*) dev->hpa.start, hw_path, dev->id.hw_type, + pr_info("%d. %s at %pap [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", + ++count, dev->name, &(dev->hpa.start), hw_path, dev->id.hw_type, dev->id.hversion_rev, dev->id.hversion, dev->id.sversion); if (dev->num_addrs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ecc5c2771208..230a6422b99f 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -208,8 +208,8 @@ arch_initcall(parisc_idle_init); * Copy architecture-specific thread state */ int -copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p) +copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, unsigned long tls) { struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); @@ -254,9 +254,9 @@ copy_thread(unsigned long clone_flags, unsigned long usp, cregs->ksp = (unsigned long)stack + THREAD_SZ_ALGN + FRAME_SIZE; cregs->kpc = (unsigned long) &child_return; - /* Setup thread TLS area from the 4th parameter in clone */ + /* Setup thread TLS area */ if (clone_flags & CLONE_SETTLS) - cregs->cr27 = cregs->gr[23]; + cregs->cr27 = tls; } return 0; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ddca8287d43b..354cf060b67f 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -401,7 +401,7 @@ static void __init map_pages(unsigned long start_vaddr, pmd = (pmd_t *) __pa(pmd); } - pgd_populate(NULL, pg_dir, __va(pmd)); + pud_populate(NULL, (pud_t *)pg_dir, __va(pmd)); #endif pg_dir++; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..e2a412113359 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -455,11 +455,7 @@ config PPC_TRANSACTIONAL_MEM config PPC_UV bool "Ultravisor support" depends on KVM_BOOK3S_HV_POSSIBLE - select ZONE_DEVICE - select DEV_PAGEMAP_OPS - select DEVICE_PRIVATE - select MEMORY_HOTPLUG - select MEMORY_HOTREMOVE + depends on DEVICE_PRIVATE default n help This option paravirtualizes the kernel to run in POWER platforms that diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0-best-effort.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0-best-effort.dtsi index e1a961f05dcd..baa0c503e741 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0-best-effort.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0-best-effort.dtsi @@ -63,6 +63,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe1000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy0: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0.dtsi index c288f3c6c637..93095600e808 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-0.dtsi @@ -60,6 +60,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xf1000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy6: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1-best-effort.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1-best-effort.dtsi index 94f3e7175012..ff4bd38f0645 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1-best-effort.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1-best-effort.dtsi @@ -63,6 +63,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe3000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy1: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1.dtsi index 94a76982d214..1fa38ed6f59e 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-10g-1.dtsi @@ -60,6 +60,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xf3000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy7: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-0.dtsi index b5ff5f71c6b8..a8cc9780c0c4 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-0.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe1000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy0: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-1.dtsi index ee44182c6348..8b8bd70c9382 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-1.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe3000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy1: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-2.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-2.dtsi index f05f0d775039..619c880b54d8 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-2.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-2.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe5000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy2: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-3.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-3.dtsi index a9114ec51075..d7ebb73a400d 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-3.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-3.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe7000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy3: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-4.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-4.dtsi index 44dd00ac7367..b151d696a069 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-4.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-4.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe9000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy4: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-5.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-5.dtsi index 5b1b84b58602..adc0ae0013a3 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-5.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0-1g-5.dtsi @@ -59,6 +59,7 @@ fman@400000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xeb000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy5: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-0.dtsi index 0e1daaef9e74..435047e0e250 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-0.dtsi @@ -60,6 +60,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xf1000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy14: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-1.dtsi index 68c5ef779266..c098657cca0a 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-10g-1.dtsi @@ -60,6 +60,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xf3000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy15: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-0.dtsi index 605363cc1117..9d06824815f3 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-0.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe1000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy8: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-1.dtsi index 1955dfa13634..70e947730c4b 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-1.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe3000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy9: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-2.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-2.dtsi index 2c1476454ee0..ad96e6529595 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-2.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-2.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe5000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy10: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-3.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-3.dtsi index b8b541ff5fb0..034bc4b71f7a 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-3.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-3.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe7000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy11: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-4.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-4.dtsi index 4b2cfddd1b15..93ca23d82b39 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-4.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-4.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xe9000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy12: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-5.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-5.dtsi index 0a52ddf7cc17..23b3117a2fd2 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-5.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1-1g-5.dtsi @@ -59,6 +59,7 @@ fman@500000 { #size-cells = <0>; compatible = "fsl,fman-memac-mdio", "fsl,fman-xmdio"; reg = <0xeb000 0x1000>; + fsl,erratum-a011043; /* must ignore read errors */ pcsphy13: ethernet-phy@0 { reg = <0x0>; diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 15b75005bc34..3fa1b962dc27 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -600,8 +600,11 @@ extern void slb_set_size(u16 size); * */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) + +// The + 2 accounts for INVALID_REGION and 1 more to avoid overlap with kernel #define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \ - MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT) + MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT + 2) + /* * For platforms that support on 65bit VA we limit the context bits */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 1b55fc08f853..860228e917dc 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -15,6 +15,7 @@ * * (the type definitions are in asm/spinlock_types.h) */ +#include <linux/jump_label.h> #include <linux/irqflags.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index f2dfcd50a2d3..33aee7490cbb 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -39,6 +39,7 @@ #define XIVE_ESB_VAL_P 0x2 #define XIVE_ESB_VAL_Q 0x1 +#define XIVE_ESB_INVALID 0xFF /* * Thread Management (aka "TM") registers diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 617c2777926f..f5535eae637f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,10 +151,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); int ret; - __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42bbcd47cc85..dffe1a45b6ed 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -50,7 +50,7 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) { #endif -static inline bool slice_addr_is_low(unsigned long addr) +static inline notrace bool slice_addr_is_low(unsigned long addr) { u64 tmp = (u64)addr; @@ -659,7 +659,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, mm_ctx_user_psize(¤t->mm->context), 1); } -unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) { unsigned char *psizes; int index, mask_index; diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f5fadbd2533a..9651ca061828 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -972,12 +972,21 @@ static int xive_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + u8 pq; switch (which) { case IRQCHIP_STATE_ACTIVE: - *state = !xd->stale_p && - (xd->saved_p || - !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + pq = xive_esb_read(xd, XIVE_ESB_GET); + + /* + * The esb value being all 1's means we couldn't get + * the PQ state of the interrupt through mmio. It may + * happen, for example when querying a PHB interrupt + * while the PHB is in an error state. We consider the + * interrupt to be inactive in that case. + */ + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && + (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); return 0; default: return -EINVAL; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d8efbaa78d67..fa7dc03459e7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -64,6 +64,8 @@ config RISCV select SPARSEMEM_STATIC if 32BIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select HAVE_ARCH_MMAP_RND_BITS if MMU + select ARCH_HAS_GCOV_PROFILE_ALL + select HAVE_COPY_THREAD_TLS config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 70a1891e7cd0..a2e3d54e830c 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -54,6 +54,7 @@ reg = <1>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu1_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -77,6 +78,7 @@ reg = <2>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu2_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -100,6 +102,7 @@ reg = <3>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu3_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -123,6 +126,7 @@ reg = <4>; riscv,isa = "rv64imafdc"; tlb-split; + next-level-cache = <&l2cache>; cpu4_intc: interrupt-controller { #interrupt-cells = <1>; compatible = "riscv,cpu-intc"; @@ -253,6 +257,17 @@ #pwm-cells = <3>; status = "disabled"; }; + l2cache: cache-controller@2010000 { + compatible = "sifive,fu540-c000-ccache", "cache"; + cache-block-size = <64>; + cache-level = <2>; + cache-sets = <1024>; + cache-size = <2097152>; + cache-unified; + interrupt-parent = <&plic0>; + interrupts = <1 2 3>; + reg = <0x0 0x2010000 0x0 0x1000>; + }; }; }; diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index dd62b691c443..27e005fca584 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -5,4 +5,8 @@ #include <linux/ftrace.h> #include <asm-generic/asm-prototypes.h> +long long __lshrti3(long long a, int b); +long long __ashrti3(long long a, int b); +long long __ashlti3(long long a, int b); + #endif /* _ASM_RISCV_PROTOTYPES_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0a62d2d68455..435b65532e29 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -116,9 +116,9 @@ # define SR_PIE SR_MPIE # define SR_PP SR_MPP -# define IRQ_SOFT IRQ_M_SOFT -# define IRQ_TIMER IRQ_M_TIMER -# define IRQ_EXT IRQ_M_EXT +# define RV_IRQ_SOFT IRQ_M_SOFT +# define RV_IRQ_TIMER IRQ_M_TIMER +# define RV_IRQ_EXT IRQ_M_EXT #else /* CONFIG_RISCV_M_MODE */ # define CSR_STATUS CSR_SSTATUS # define CSR_IE CSR_SIE @@ -133,15 +133,15 @@ # define SR_PIE SR_SPIE # define SR_PP SR_SPP -# define IRQ_SOFT IRQ_S_SOFT -# define IRQ_TIMER IRQ_S_TIMER -# define IRQ_EXT IRQ_S_EXT +# define RV_IRQ_SOFT IRQ_S_SOFT +# define RV_IRQ_TIMER IRQ_S_TIMER +# define RV_IRQ_EXT IRQ_S_EXT #endif /* CONFIG_RISCV_M_MODE */ /* IE/IP (Supervisor/Machine Interrupt Enable/Pending) flags */ -#define IE_SIE (_AC(0x1, UL) << IRQ_SOFT) -#define IE_TIE (_AC(0x1, UL) << IRQ_TIMER) -#define IE_EIE (_AC(0x1, UL) << IRQ_EXT) +#define IE_SIE (_AC(0x1, UL) << RV_IRQ_SOFT) +#define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) +#define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) #ifndef __ASSEMBLY__ diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index b94d8db5ddcc..c40fdcdeb950 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -142,7 +142,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, */ old = *parent; - if (function_graph_enter(old, self_addr, frame_pointer, parent)) + if (!function_graph_enter(old, self_addr, frame_pointer, parent)) *parent = return_hooker; } diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 797802c73dee..a4242be66966 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -80,7 +80,9 @@ _start_kernel: #ifdef CONFIG_SMP li t0, CONFIG_NR_CPUS - bgeu a0, t0, .Lsecondary_park + blt a0, t0, .Lgood_cores + tail .Lsecondary_park +.Lgood_cores: #endif /* Pick one hart to run the main boot sequence */ @@ -209,11 +211,6 @@ relocate: tail smp_callin #endif -.align 2 -.Lsecondary_park: - /* We lack SMP support or have too many harts, so park this hart */ - wfi - j .Lsecondary_park END(_start) #ifdef CONFIG_RISCV_M_MODE @@ -251,7 +248,7 @@ ENTRY(reset_regs) #ifdef CONFIG_FPU csrr t0, CSR_MISA andi t0, t0, (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D) - bnez t0, .Lreset_regs_done + beqz t0, .Lreset_regs_done li t1, SR_FS csrs CSR_STATUS, t1 @@ -295,6 +292,13 @@ ENTRY(reset_regs) END(reset_regs) #endif /* CONFIG_RISCV_M_MODE */ +.section ".text", "ax",@progbits +.align 2 +.Lsecondary_park: + /* We lack SMP support or have too many harts, so park this hart */ + wfi + j .Lsecondary_park + __PAGE_ALIGNED_BSS /* Empty zero page */ .balign PAGE_SIZE diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index 3f07a91d5afb..345c4f2eba13 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -23,11 +23,11 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) irq_enter(); switch (regs->cause & ~CAUSE_IRQ_FLAG) { - case IRQ_TIMER: + case RV_IRQ_TIMER: riscv_timer_interrupt(); break; #ifdef CONFIG_SMP - case IRQ_SOFT: + case RV_IRQ_SOFT: /* * We only use software interrupts to pass IPIs, so if a non-SMP * system gets one, then we don't know what to do. @@ -35,7 +35,7 @@ asmlinkage __visible void __irq_entry do_IRQ(struct pt_regs *regs) riscv_software_interrupt(); break; #endif - case IRQ_EXT: + case RV_IRQ_EXT: handle_arch_irq(regs); break; default: diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 95a3031e5c7c..817cf7b0974c 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -99,8 +99,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); @@ -121,7 +121,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, if (usp) /* User fork */ childregs->sp = usp; if (clone_flags & CLONE_SETTLS) - childregs->tp = childregs->a5; + childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ p->thread.ra = (unsigned long)ret_from_fork; } diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 49a5852fd07d..33b16f4212f7 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -58,7 +58,8 @@ quiet_cmd_vdsold = VDSOLD $@ cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \ -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \ $(CROSS_COMPILE)objcopy \ - $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ + $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ + rm $@.tmp # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ diff --git a/arch/riscv/lib/tishift.S b/arch/riscv/lib/tishift.S index 15f9d54c7db6..ef90075c4b0a 100644 --- a/arch/riscv/lib/tishift.S +++ b/arch/riscv/lib/tishift.S @@ -4,34 +4,73 @@ */ #include <linux/linkage.h> +#include <asm-generic/export.h> -ENTRY(__lshrti3) +SYM_FUNC_START(__lshrti3) beqz a2, .L1 li a5,64 sub a5,a5,a2 - addi sp,sp,-16 sext.w a4,a5 blez a5, .L2 sext.w a2,a2 - sll a4,a1,a4 srl a0,a0,a2 - srl a1,a1,a2 + sll a4,a1,a4 + srl a2,a1,a2 or a0,a0,a4 - sd a1,8(sp) - sd a0,0(sp) - ld a0,0(sp) - ld a1,8(sp) - addi sp,sp,16 - ret + mv a1,a2 .L1: ret .L2: - negw a4,a4 - srl a1,a1,a4 - sd a1,0(sp) - sd zero,8(sp) - ld a0,0(sp) - ld a1,8(sp) - addi sp,sp,16 + negw a0,a4 + li a2,0 + srl a0,a1,a0 + mv a1,a2 + ret +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) + +SYM_FUNC_START(__ashrti3) + beqz a2, .L3 + li a5,64 + sub a5,a5,a2 + sext.w a4,a5 + blez a5, .L4 + sext.w a2,a2 + srl a0,a0,a2 + sll a4,a1,a4 + sra a2,a1,a2 + or a0,a0,a4 + mv a1,a2 +.L3: + ret +.L4: + negw a0,a4 + srai a2,a1,0x3f + sra a0,a1,a0 + mv a1,a2 + ret +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__ashlti3) + beqz a2, .L5 + li a5,64 + sub a5,a5,a2 + sext.w a4,a5 + blez a5, .L6 + sext.w a2,a2 + sll a1,a1,a2 + srl a4,a0,a4 + sll a2,a0,a2 + or a1,a1,a4 + mv a0,a2 +.L5: + ret +.L6: + negw a1,a4 + li a2,0 + sll a1,a0,a1 + mv a0,a2 ret -ENDPROC(__lshrti3) +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 69f6678db7f3..965a8cf4829c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -99,13 +99,13 @@ static void __init setup_initrd(void) pr_info("initrd not found or empty"); goto disable; } - if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + if (__pa_symbol(initrd_end) > PFN_PHYS(max_low_pfn)) { pr_err("initrd extends beyond end of memory"); goto disable; } size = initrd_end - initrd_start; - memblock_reserve(__pa(initrd_start), size); + memblock_reserve(__pa_symbol(initrd_start), size); initrd_below_start_ok = 1; pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", @@ -124,8 +124,8 @@ void __init setup_bootmem(void) { struct memblock_region *reg; phys_addr_t mem_size = 0; - phys_addr_t vmlinux_end = __pa(&_end); - phys_addr_t vmlinux_start = __pa(&_start); + phys_addr_t vmlinux_end = __pa_symbol(&_end); + phys_addr_t vmlinux_start = __pa_symbol(&_start); /* Find the memory region containing the kernel */ for_each_memblock(memory, reg) { @@ -445,7 +445,7 @@ static void __init setup_vm_final(void) /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, - __pa(fixmap_pgd_next), + __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); /* Map all memory banks */ @@ -474,7 +474,7 @@ static void __init setup_vm_final(void) clear_fixmap(FIX_PMD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); } #else diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 9cbf490fd162..d5fbd754f41a 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1052,7 +1052,7 @@ static void __init log_component_list(void) if (!early_ipl_comp_list_addr) return; - if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0ce22220565..ac44bd76db4b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,10 +292,8 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index dfdbaa50946e..d1b1ff2be17a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -434,9 +434,7 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 2a6d04fcb3e9..6f0edd0c0220 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -14,6 +14,7 @@ config UML select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE + select HAVE_COPY_THREAD_TLS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h index 81c647ef9c6c..adf91ef553ae 100644 --- a/arch/um/include/asm/ptrace-generic.h +++ b/arch/um/include/asm/ptrace-generic.h @@ -36,7 +36,7 @@ extern long subarch_ptrace(struct task_struct *child, long request, extern unsigned long getreg(struct task_struct *child, int regno); extern int putreg(struct task_struct *child, int regno, unsigned long value); -extern int arch_copy_tls(struct task_struct *new); +extern int arch_set_tls(struct task_struct *new, unsigned long tls); extern void clear_flushed_tls(struct task_struct *task); extern int syscall_trace_enter(struct pt_regs *regs); extern void syscall_trace_leave(struct pt_regs *regs); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 263a8f069133..17045e7211bf 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -153,8 +153,8 @@ void fork_handler(void) userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct * p, unsigned long tls) { void (*handler)(void); int kthread = current->flags & PF_KTHREAD; @@ -188,7 +188,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, * Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) - ret = arch_copy_tls(p); + ret = arch_set_tls(p, tls); } return ret; diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 58a512e33d8d..ee60b81944a7 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -244,6 +244,11 @@ SYM_FUNC_START(efi32_stub_entry) leal efi32_config(%ebp), %eax movl %eax, efi_config(%ebp) + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + jmp startup_32 SYM_FUNC_END(efi32_stub_entry) #endif diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index dbaa1b088a30..c37cb12d0ef6 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -15,6 +15,7 @@ #define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 #define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f #define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_SKL_E3_IMC 0x1918 #define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c #define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 #define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 @@ -658,6 +659,10 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, @@ -826,6 +831,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */ IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */ IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */ + IMC_DEV(SKL_E3_IMC, &skl_uncore_pci_driver), /* Xeon E3 V5 Gen Core processor */ IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */ IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */ IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b10a5ec79e48..ad20220af303 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -369,11 +369,6 @@ #define SNR_M2M_PCI_PMON_BOX_CTL 0x438 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff -/* SNR PCIE3 */ -#define SNR_PCIE3_PCI_PMON_CTL0 0x508 -#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 -#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e4 - /* SNR IMC */ #define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 #define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 @@ -4328,27 +4323,12 @@ static struct intel_uncore_type snr_uncore_m2m = { .format_group = &snr_m2m_uncore_format_group, }; -static struct intel_uncore_type snr_uncore_pcie3 = { - .name = "pcie3", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, - .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, - .ops = &ivbep_uncore_pci_ops, - .format_group = &ivbep_uncore_format_group, -}; - enum { SNR_PCI_UNCORE_M2M, - SNR_PCI_UNCORE_PCIE3, }; static struct intel_uncore_type *snr_pci_uncores[] = { [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, - [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, NULL, }; @@ -4357,10 +4337,6 @@ static const struct pci_device_id snr_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, SNR_PCI_UNCORE_M2M, 0), }, - { /* PCIe3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), - }, { /* end: all zeroes */ } }; @@ -4536,6 +4512,7 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), + { /* end: all zeroes */ }, }; static struct intel_uncore_ops snr_uncore_imc_freerunning_ops = { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 90f75e515876..62c30279be77 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -615,9 +615,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) return; clear_all: - clear_cpu_cap(c, X86_FEATURE_SME); + setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: - clear_cpu_cap(c, X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV); } } diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index b38010b541d6..6c3e1c92f183 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -467,6 +467,7 @@ static int thermal_throttle_online(unsigned int cpu) { struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + u32 l; state->package_throttle.level = PACKAGE_LEVEL; state->core_throttle.level = CORE_LEVEL; @@ -474,6 +475,10 @@ static int thermal_throttle_online(unsigned int cpu) INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + /* Unmask the thermal vector after the above workqueues are initialized. */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + return thermal_throttle_add_dev(dev, cpu); } @@ -722,10 +727,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", tm2 ? "TM2" : "TM1"); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 03eb90d00af0..89049b343c7a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -618,7 +618,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); list_del(&d->list); - if (is_mbm_enabled()) + if (r->mon_capable && is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { /* diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2e3b06d6bbc6..dac7209a0708 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1741,9 +1741,6 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -1751,6 +1748,9 @@ static int set_cache_qos_cfg(int level, bool enable) else return -EINVAL; + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + r_l = &rdt_resources_all[level]; list_for_each_entry(d, &r_l->domains, list) { /* Pick one CPU from each domain instance to update MSR */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 930edeb41ec3..0a74407ef92e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -865,10 +865,8 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dcb9bc961b39..bcfede46fe02 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1212,10 +1212,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); - struct zone *zone = page_zone(page); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4c8a2d1f8470..9ba08e9abc09 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1328,7 +1328,7 @@ emit_jmp: return proglen; } -static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, +static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { int i; @@ -1344,7 +1344,7 @@ static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, -(stack_size - i * 8)); } -static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, +static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, int stack_size) { int i; @@ -1361,7 +1361,7 @@ static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, -(stack_size - i * 8)); } -static int invoke_bpf(struct btf_func_model *m, u8 **pprog, +static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_prog **progs, int prog_cnt, int stack_size) { u8 *prog = *pprog; @@ -1456,7 +1456,8 @@ static int invoke_bpf(struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +int arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) @@ -1523,13 +1524,10 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ - /* One half of the page has active running trampoline. - * Another half is an area for next trampoline. - * Make sure the trampoline generation logic doesn't overflow. - */ - if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + /* Make sure the trampoline generation logic doesn't overflow */ + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) return -EFAULT; - return 0; + return prog - (u8 *)image; } static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 5bd949da7a4a..ac8eee093f9c 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -215,14 +215,12 @@ static int set_tls_entry(struct task_struct* task, struct user_desc *info, return 0; } -int arch_copy_tls(struct task_struct *new) +int arch_set_tls(struct task_struct *new, unsigned long tls) { struct user_desc info; int idx, ret = -EFAULT; - if (copy_from_user(&info, - (void __user *) UPT_SI(&new->thread.regs.regs), - sizeof(info))) + if (copy_from_user(&info, (void __user *) tls, sizeof(info))) goto out; ret = -EINVAL; diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c index 3a621e0d3925..ebd3855d9b13 100644 --- a/arch/x86/um/tls_64.c +++ b/arch/x86/um/tls_64.c @@ -6,14 +6,13 @@ void clear_flushed_tls(struct task_struct *task) { } -int arch_copy_tls(struct task_struct *t) +int arch_set_tls(struct task_struct *t, unsigned long tls) { /* * If CLONE_SETTLS is set, we need to save the thread id - * (which is argument 5, child_tid, of clone) so it can be set - * during context switches. + * so it can be set during context switches. */ - t->thread.arch.fs = t->thread.regs.regs.gp[R8 / sizeof(long)]; + t->thread.arch.fs = tls; return 0; } diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4a3fa295d8fe..296c5324dace 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -24,6 +24,7 @@ config XTENSA select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_TRACEHOOK + select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9e1c49134c07..3edecc41ef8c 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -202,8 +202,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) * involved. Much simpler to just not copy those live frames across. */ -int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp_thread_fn, + unsigned long thread_fn_arg, struct task_struct *p, + unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); @@ -266,9 +267,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, childregs->syscall = regs->syscall; - /* The thread pointer is passed in the '4th argument' (= a5) */ if (clone_flags & CLONE_SETTLS) - childregs->threadptr = childregs->areg[5]; + childregs->threadptr = tls; } else { p->thread.ra = MAKE_RA_FOR_CALL( (unsigned long)ret_from_kernel_thread, 1); diff --git a/block/bio.c b/block/bio.c index a5d75f6bf4c7..94d697217887 100644 --- a/block/bio.c +++ b/block/bio.c @@ -539,6 +539,55 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) EXPORT_SYMBOL(zero_fill_bio_iter); /** + * bio_truncate - truncate the bio to small size of @new_size + * @bio: the bio to be truncated + * @new_size: new size for truncating the bio + * + * Description: + * Truncate the bio to new size of @new_size. If bio_op(bio) is + * REQ_OP_READ, zero the truncated part. This function should only + * be used for handling corner cases, such as bio eod. + */ +void bio_truncate(struct bio *bio, unsigned new_size) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned int done = 0; + bool truncated = false; + + if (new_size >= bio->bi_iter.bi_size) + return; + + if (bio_op(bio) != REQ_OP_READ) + goto exit; + + bio_for_each_segment(bv, bio, iter) { + if (done + bv.bv_len > new_size) { + unsigned offset; + + if (!truncated) + offset = new_size - done; + else + offset = 0; + zero_user(bv.bv_page, offset, bv.bv_len - offset); + truncated = true; + } + done += bv.bv_len; + } + + exit: + /* + * Don't touch bvec table here and make it really immutable, since + * fs bio user has to retrieve all pages via bio_for_each_segment_all + * in its .end_bio() callback. + * + * It is enough to truncate bio by updating .bi_size since we can make + * correct bvec with the updated .bi_size for drivers. + */ + bio->bi_iter.bi_size = new_size; +} + +/** * bio_put - release a reference to a bio * @bio: bio to release reference to * diff --git a/block/blk-merge.c b/block/blk-merge.c index d783bdc4559b..1534ed736363 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -157,17 +157,20 @@ static inline unsigned get_max_io_size(struct request_queue *q, return sectors & (lbs - 1); } -static unsigned get_max_segment_size(const struct request_queue *q, - unsigned offset) +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) { unsigned long mask = queue_segment_boundary(q); - /* default segment boundary mask means no boundary limit */ - if (mask == BLK_SEG_BOUNDARY_MASK) - return queue_max_segment_size(q); + offset = mask & (page_to_phys(start_page) + offset); - return min_t(unsigned long, mask - (mask & offset) + 1, - queue_max_segment_size(q)); + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); } /** @@ -201,7 +204,8 @@ static bool bvec_split_segs(const struct request_queue *q, unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(q, bv->bv_offset + total_len); + seg_size = get_max_segment_size(q, bv->bv_page, + bv->bv_offset + total_len); seg_size = min(seg_size, len); (*nsegs)++; @@ -419,7 +423,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(q, offset), nbytes); + unsigned len = min(get_max_segment_size(q, bvec->bv_page, + offset), nbytes); struct page *page = bvec->bv_page; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 5f6dcc7a47bd..c8eda2e7b91e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -328,7 +328,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); * storage device can address. The default of 512 covers most * hardware. **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { q->limits.logical_block_size = size; diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index b23d1e4bad33..9d0d65efcd94 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -374,7 +374,7 @@ static int do_rx_dma(struct atm_vcc *vcc,struct sk_buff *skb, here = (eni_vcc->descr+skip) & (eni_vcc->words-1); dma[j++] = (here << MID_DMA_COUNT_SHIFT) | (vcc->vci << MID_DMA_VCI_SHIFT) | MID_DT_JK; - j++; + dma[j++] = 0; } here = (eni_vcc->descr+size+skip) & (eni_vcc->words-1); if (!eff) size += skip; @@ -447,7 +447,7 @@ static int do_rx_dma(struct atm_vcc *vcc,struct sk_buff *skb, if (size != eff) { dma[j++] = (here << MID_DMA_COUNT_SHIFT) | (vcc->vci << MID_DMA_VCI_SHIFT) | MID_DT_JK; - j++; + dma[j++] = 0; } if (!j || j > 2*RX_DMA_BUF) { printk(KERN_CRIT DEV_LABEL "!j or j too big!!!\n"); diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index aad00d2b28f5..cc87004d5e2d 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -912,6 +912,7 @@ static int fs_open(struct atm_vcc *atm_vcc) } if (!to) { printk ("No more free channels for FS50..\n"); + kfree(vcc); return -EBUSY; } vcc->channo = dev->channo; @@ -922,6 +923,7 @@ static int fs_open(struct atm_vcc *atm_vcc) if (((DO_DIRECTION(rxtp) && dev->atm_vccs[vcc->channo])) || ( DO_DIRECTION(txtp) && test_bit (vcc->channo, dev->tx_inuse))) { printk ("Channel is in use for FS155.\n"); + kfree(vcc); return -EBUSY; } } @@ -935,6 +937,7 @@ static int fs_open(struct atm_vcc *atm_vcc) tc, sizeof (struct fs_transmit_config)); if (!tc) { fs_dprintk (FS_DEBUG_OPEN, "fs: can't alloc transmit_config.\n"); + kfree(vcc); return -ENOMEM; } diff --git a/drivers/base/firmware_loader/builtin/Makefile b/drivers/base/firmware_loader/builtin/Makefile index 4a66888e7253..5fa7ce3745a0 100644 --- a/drivers/base/firmware_loader/builtin/Makefile +++ b/drivers/base/firmware_loader/builtin/Makefile @@ -17,7 +17,7 @@ PROGBITS = $(if $(CONFIG_ARM),%,@)progbits filechk_fwbin = \ echo "/* Generated by $(src)/Makefile */" ;\ echo " .section .rodata" ;\ - echo " .p2align $(ASM_ALIGN)" ;\ + echo " .p2align 4" ;\ echo "_fw_$(FWSTR)_bin:" ;\ echo " .incbin \"$(fwdir)/$(FWNAME)\"" ;\ echo "_fw_end:" ;\ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index d4d88b581822..ed34785dd64b 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -129,11 +129,13 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: /* Writes must be at the write pointer position */ if (sector != zone->wp) return BLK_STS_IOERR; - if (zone->cond == BLK_ZONE_COND_EMPTY) + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; zone->wp += nr_sectors; @@ -186,7 +188,10 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, if (zone->cond == BLK_ZONE_COND_FULL) return BLK_STS_IOERR; - zone->cond = BLK_ZONE_COND_CLOSED; + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; break; case REQ_OP_ZONE_FINISH: if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index f4d1597df0a2..ccb44fe790a7 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -343,6 +343,12 @@ static int sysc_get_clocks(struct sysc *ddata) return -EINVAL; } + /* Always add a slot for main clocks fck and ick even if unused */ + if (!nr_fck) + ddata->nr_clocks++; + if (!nr_ick) + ddata->nr_clocks++; + ddata->clocks = devm_kcalloc(ddata->dev, ddata->nr_clocks, sizeof(*ddata->clocks), GFP_KERNEL); @@ -421,7 +427,7 @@ static int sysc_enable_opt_clocks(struct sysc *ddata) struct clk *clock; int i, error; - if (!ddata->clocks) + if (!ddata->clocks || ddata->nr_clocks < SYSC_OPTFCK0 + 1) return 0; for (i = SYSC_OPTFCK0; i < SYSC_MAX_CLOCKS; i++) { @@ -455,7 +461,7 @@ static void sysc_disable_opt_clocks(struct sysc *ddata) struct clk *clock; int i; - if (!ddata->clocks) + if (!ddata->clocks || ddata->nr_clocks < SYSC_OPTFCK0 + 1) return; for (i = SYSC_OPTFCK0; i < SYSC_MAX_CLOCKS; i++) { diff --git a/drivers/char/agp/isoch.c b/drivers/char/agp/isoch.c index 31c374b1b91b..7ecf20a6d19c 100644 --- a/drivers/char/agp/isoch.c +++ b/drivers/char/agp/isoch.c @@ -84,7 +84,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, unsigned int cdev = 0; u32 mnistat, tnistat, tstatus, mcmd; u16 tnicmd, mnicmd; - u8 mcapndx; u32 tot_bw = 0, tot_n = 0, tot_rq = 0, y_max, rq_isoch, rq_async; u32 step, rem, rem_isoch, rem_async; int ret = 0; @@ -138,8 +137,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, cur = list_entry(pos, struct agp_3_5_dev, list); dev = cur->dev; - mcapndx = cur->capndx; - pci_read_config_dword(dev, cur->capndx+AGPNISTAT, &mnistat); master[cdev].maxbw = (mnistat >> 16) & 0xff; @@ -251,8 +248,6 @@ static int agp_3_5_isochronous_node_enable(struct agp_bridge_data *bridge, cur = master[cdev].dev; dev = cur->dev; - mcapndx = cur->capndx; - master[cdev].rq += (cdev == ndevs - 1) ? (rem_async + rem_isoch) : step; @@ -319,7 +314,7 @@ int agp_3_5_enable(struct agp_bridge_data *bridge) { struct pci_dev *td = bridge->dev, *dev = NULL; u8 mcapndx; - u32 isoch, arqsz; + u32 isoch; u32 tstatus, mstatus, ncapid; u32 mmajor; u16 mpstat; @@ -334,8 +329,6 @@ int agp_3_5_enable(struct agp_bridge_data *bridge) if (isoch == 0) /* isoch xfers not available, bail out. */ return -ENODEV; - arqsz = (tstatus >> 13) & 0x7; - /* * Allocate a head for our AGP 3.5 device list * (multiple AGP v3 devices are allowed behind a single bridge). diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c index b23b0b999232..87f449340202 100644 --- a/drivers/char/tpm/tpm-dev-common.c +++ b/drivers/char/tpm/tpm-dev-common.c @@ -130,7 +130,7 @@ ssize_t tpm_common_read(struct file *file, char __user *buf, priv->response_read = true; ret_size = min_t(ssize_t, size, priv->response_length); - if (!ret_size) { + if (ret_size <= 0) { priv->response_length = 0; goto out; } diff --git a/drivers/char/tpm/tpm-dev.h b/drivers/char/tpm/tpm-dev.h index 1089fc0bb290..f3742bcc73e3 100644 --- a/drivers/char/tpm/tpm-dev.h +++ b/drivers/char/tpm/tpm-dev.h @@ -14,7 +14,7 @@ struct file_priv { struct work_struct timeout_work; struct work_struct async_work; wait_queue_head_t async_wait; - size_t response_length; + ssize_t response_length; bool response_read; bool command_enqueued; diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index bb0343ffd235..27c6ca031e23 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -978,13 +978,13 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, if (wait_startup(chip, 0) != 0) { rc = -ENODEV; - goto err_start; + goto out_err; } /* Take control of the TPM's interrupt hardware and shut it off */ rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask); if (rc < 0) - goto err_start; + goto out_err; intmask |= TPM_INTF_CMD_READY_INT | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT | TPM_INTF_STS_VALID_INT; @@ -993,21 +993,21 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, rc = tpm_chip_start(chip); if (rc) - goto err_start; - + goto out_err; rc = tpm2_probe(chip); + tpm_chip_stop(chip); if (rc) - goto err_probe; + goto out_err; rc = tpm_tis_read32(priv, TPM_DID_VID(0), &vendor); if (rc < 0) - goto err_probe; + goto out_err; priv->manufacturer_id = vendor; rc = tpm_tis_read8(priv, TPM_RID(0), &rid); if (rc < 0) - goto err_probe; + goto out_err; dev_info(dev, "%s TPM (device-id 0x%X, rev-id %d)\n", (chip->flags & TPM_CHIP_FLAG_TPM2) ? "2.0" : "1.2", @@ -1016,13 +1016,13 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, probe = probe_itpm(chip); if (probe < 0) { rc = -ENODEV; - goto err_probe; + goto out_err; } /* Figure out the capabilities */ rc = tpm_tis_read32(priv, TPM_INTF_CAPS(priv->locality), &intfcaps); if (rc < 0) - goto err_probe; + goto out_err; dev_dbg(dev, "TPM interface capabilities (0x%x):\n", intfcaps); @@ -1056,10 +1056,9 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, if (tpm_get_timeouts(chip)) { dev_err(dev, "Could not get TPM timeouts and durations\n"); rc = -ENODEV; - goto err_probe; + goto out_err; } - chip->flags |= TPM_CHIP_FLAG_IRQ; if (irq) { tpm_tis_probe_irq_single(chip, intmask, IRQF_SHARED, irq); @@ -1071,18 +1070,15 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, } } - tpm_chip_stop(chip); - rc = tpm_chip_register(chip); if (rc) - goto err_start; - - return 0; + goto out_err; -err_probe: - tpm_chip_stop(chip); + if (chip->ops->clk_enable != NULL) + chip->ops->clk_enable(chip, false); -err_start: + return 0; +out_err: if ((chip->ops != NULL) && (chip->ops->clk_enable != NULL)) chip->ops->clk_enable(chip, false); diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 6a11239ccde3..772258de2d1f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3426,11 +3426,17 @@ static int __clk_core_init(struct clk_core *core) if (core->flags & CLK_IS_CRITICAL) { unsigned long flags; - clk_core_prepare(core); + ret = clk_core_prepare(core); + if (ret) + goto out; flags = clk_enable_lock(); - clk_core_enable(core); + ret = clk_core_enable(core); clk_enable_unlock(flags); + if (ret) { + clk_core_unprepare(core); + goto out; + } } clk_core_reparent_orphans_nolock(); diff --git a/drivers/clk/mmp/clk-of-mmp2.c b/drivers/clk/mmp/clk-of-mmp2.c index a60a1be937ad..b4a95cbbda98 100644 --- a/drivers/clk/mmp/clk-of-mmp2.c +++ b/drivers/clk/mmp/clk-of-mmp2.c @@ -134,7 +134,7 @@ static DEFINE_SPINLOCK(ssp3_lock); static const char *ssp_parent_names[] = {"vctcxo_4", "vctcxo_2", "vctcxo", "pll1_16"}; static DEFINE_SPINLOCK(timer_lock); -static const char *timer_parent_names[] = {"clk32", "vctcxo_2", "vctcxo_4", "vctcxo"}; +static const char *timer_parent_names[] = {"clk32", "vctcxo_4", "vctcxo_2", "vctcxo"}; static DEFINE_SPINLOCK(reset_lock); diff --git a/drivers/clk/qcom/gcc-sdm845.c b/drivers/clk/qcom/gcc-sdm845.c index f7b370f3acef..f6ce888098be 100644 --- a/drivers/clk/qcom/gcc-sdm845.c +++ b/drivers/clk/qcom/gcc-sdm845.c @@ -3255,6 +3255,7 @@ static struct gdsc hlos1_vote_aggre_noc_mmu_audio_tbu_gdsc = { .name = "hlos1_vote_aggre_noc_mmu_audio_tbu_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_aggre_noc_mmu_pcie_tbu_gdsc = { @@ -3263,6 +3264,7 @@ static struct gdsc hlos1_vote_aggre_noc_mmu_pcie_tbu_gdsc = { .name = "hlos1_vote_aggre_noc_mmu_pcie_tbu_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_aggre_noc_mmu_tbu1_gdsc = { @@ -3271,6 +3273,7 @@ static struct gdsc hlos1_vote_aggre_noc_mmu_tbu1_gdsc = { .name = "hlos1_vote_aggre_noc_mmu_tbu1_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_aggre_noc_mmu_tbu2_gdsc = { @@ -3279,6 +3282,7 @@ static struct gdsc hlos1_vote_aggre_noc_mmu_tbu2_gdsc = { .name = "hlos1_vote_aggre_noc_mmu_tbu2_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_mmnoc_mmu_tbu_hf0_gdsc = { @@ -3287,6 +3291,7 @@ static struct gdsc hlos1_vote_mmnoc_mmu_tbu_hf0_gdsc = { .name = "hlos1_vote_mmnoc_mmu_tbu_hf0_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_mmnoc_mmu_tbu_hf1_gdsc = { @@ -3295,6 +3300,7 @@ static struct gdsc hlos1_vote_mmnoc_mmu_tbu_hf1_gdsc = { .name = "hlos1_vote_mmnoc_mmu_tbu_hf1_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct gdsc hlos1_vote_mmnoc_mmu_tbu_sf_gdsc = { @@ -3303,6 +3309,7 @@ static struct gdsc hlos1_vote_mmnoc_mmu_tbu_sf_gdsc = { .name = "hlos1_vote_mmnoc_mmu_tbu_sf_gdsc", }, .pwrsts = PWRSTS_OFF_ON, + .flags = VOTABLE, }; static struct clk_regmap *gcc_sdm845_clocks[] = { diff --git a/drivers/clk/samsung/clk-exynos5420.c b/drivers/clk/samsung/clk-exynos5420.c index 3a991ca1ee36..c9e5a1fb6653 100644 --- a/drivers/clk/samsung/clk-exynos5420.c +++ b/drivers/clk/samsung/clk-exynos5420.c @@ -12,6 +12,7 @@ #include <linux/clk-provider.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/clk.h> #include "clk.h" #include "clk-cpu.h" @@ -1646,6 +1647,13 @@ static void __init exynos5x_clk_init(struct device_node *np, exynos5x_subcmus); } + /* + * Keep top part of G3D clock path enabled permanently to ensure + * that the internal busses get their clock regardless of the + * main G3D clock enablement status. + */ + clk_prepare_enable(__clk_lookup("mout_sw_aclk_g3d")); + samsung_clk_of_add_provider(np, ctx); } diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c index 45a1ed3fe674..50f8d1bc7046 100644 --- a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c @@ -23,9 +23,9 @@ */ static const char * const ar100_r_apb2_parents[] = { "osc24M", "osc32k", - "pll-periph0", "iosc" }; + "iosc", "pll-periph0" }; static const struct ccu_mux_var_prediv ar100_r_apb2_predivs[] = { - { .index = 2, .shift = 0, .width = 5 }, + { .index = 3, .shift = 0, .width = 5 }, }; static struct ccu_div ar100_clk = { @@ -51,17 +51,7 @@ static struct ccu_div ar100_clk = { static CLK_FIXED_FACTOR_HW(r_ahb_clk, "r-ahb", &ar100_clk.common.hw, 1, 1, 0); -static struct ccu_div r_apb1_clk = { - .div = _SUNXI_CCU_DIV(0, 2), - - .common = { - .reg = 0x00c, - .hw.init = CLK_HW_INIT("r-apb1", - "r-ahb", - &ccu_div_ops, - 0), - }, -}; +static SUNXI_CCU_M(r_apb1_clk, "r-apb1", "r-ahb", 0x00c, 0, 2, 0); static struct ccu_div r_apb2_clk = { .div = _SUNXI_CCU_DIV_FLAGS(8, 2, CLK_DIVIDER_POWER_OF_TWO), diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-r.c b/drivers/clk/sunxi-ng/ccu-sun8i-r.c index 4646fdc61053..4c8c491b87c2 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun8i-r.c @@ -51,19 +51,7 @@ static struct ccu_div ar100_clk = { static CLK_FIXED_FACTOR_HW(ahb0_clk, "ahb0", &ar100_clk.common.hw, 1, 1, 0); -static struct ccu_div apb0_clk = { - .div = _SUNXI_CCU_DIV_FLAGS(0, 2, CLK_DIVIDER_POWER_OF_TWO), - - .common = { - .reg = 0x0c, - .hw.init = CLK_HW_INIT_HW("apb0", - &ahb0_clk.hw, - &ccu_div_ops, - 0), - }, -}; - -static SUNXI_CCU_M(a83t_apb0_clk, "apb0", "ahb0", 0x0c, 0, 2, 0); +static SUNXI_CCU_M(apb0_clk, "apb0", "ahb0", 0x0c, 0, 2, 0); /* * Define the parent as an array that can be reused to save space @@ -127,7 +115,7 @@ static struct ccu_mp a83t_ir_clk = { static struct ccu_common *sun8i_a83t_r_ccu_clks[] = { &ar100_clk.common, - &a83t_apb0_clk.common, + &apb0_clk.common, &apb0_pio_clk.common, &apb0_ir_clk.common, &apb0_timer_clk.common, @@ -167,7 +155,7 @@ static struct clk_hw_onecell_data sun8i_a83t_r_hw_clks = { .hws = { [CLK_AR100] = &ar100_clk.common.hw, [CLK_AHB0] = &ahb0_clk.hw, - [CLK_APB0] = &a83t_apb0_clk.common.hw, + [CLK_APB0] = &apb0_clk.common.hw, [CLK_APB0_PIO] = &apb0_pio_clk.common.hw, [CLK_APB0_IR] = &apb0_ir_clk.common.hw, [CLK_APB0_TIMER] = &apb0_timer_clk.common.hw, @@ -282,9 +270,6 @@ static void __init sunxi_r_ccu_init(struct device_node *node, static void __init sun8i_a83t_r_ccu_setup(struct device_node *node) { - /* Fix apb0 bus gate parents here */ - apb0_gate_parent[0] = &a83t_apb0_clk.common.hw; - sunxi_r_ccu_init(node, &sun8i_a83t_r_ccu_desc); } CLK_OF_DECLARE(sun8i_a83t_r_ccu, "allwinner,sun8i-a83t-r-ccu", diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-r40.c b/drivers/clk/sunxi-ng/ccu-sun8i-r40.c index 897490800102..23bfe1d12f21 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-r40.c +++ b/drivers/clk/sunxi-ng/ccu-sun8i-r40.c @@ -761,7 +761,8 @@ static struct ccu_mp outa_clk = { .reg = 0x1f0, .features = CCU_FEATURE_FIXED_PREDIV, .hw.init = CLK_HW_INIT_PARENTS("outa", out_parents, - &ccu_mp_ops, 0), + &ccu_mp_ops, + CLK_SET_RATE_PARENT), } }; @@ -779,7 +780,8 @@ static struct ccu_mp outb_clk = { .reg = 0x1f4, .features = CCU_FEATURE_FIXED_PREDIV, .hw.init = CLK_HW_INIT_PARENTS("outb", out_parents, - &ccu_mp_ops, 0), + &ccu_mp_ops, + CLK_SET_RATE_PARENT), } }; diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c index 5c779eec454b..0e36ca3bf3d5 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c +++ b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c @@ -618,7 +618,7 @@ static struct clk_hw_onecell_data sun8i_v3s_hw_clks = { [CLK_MBUS] = &mbus_clk.common.hw, [CLK_MIPI_CSI] = &mipi_csi_clk.common.hw, }, - .num = CLK_NUMBER, + .num = CLK_PLL_DDR1 + 1, }; static struct clk_hw_onecell_data sun8i_v3_hw_clks = { @@ -700,7 +700,7 @@ static struct clk_hw_onecell_data sun8i_v3_hw_clks = { [CLK_MBUS] = &mbus_clk.common.hw, [CLK_MIPI_CSI] = &mipi_csi_clk.common.hw, }, - .num = CLK_NUMBER, + .num = CLK_I2S0 + 1, }; static struct ccu_reset_map sun8i_v3s_ccu_resets[] = { diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.h b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.h index b0160d305a67..108eeeedcbf7 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.h +++ b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.h @@ -51,6 +51,4 @@ #define CLK_PLL_DDR1 74 -#define CLK_NUMBER (CLK_I2S0 + 1) - #endif /* _CCU_SUN8I_H3_H_ */ diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c index e6bd6d1ea012..f6cdce441cf7 100644 --- a/drivers/clk/tegra/clk.c +++ b/drivers/clk/tegra/clk.c @@ -231,8 +231,10 @@ struct clk ** __init tegra_clk_init(void __iomem *regs, int num, int banks) periph_banks = banks; clks = kcalloc(num, sizeof(struct clk *), GFP_KERNEL); - if (!clks) + if (!clks) { kfree(periph_clk_enb_refcnt); + return NULL; + } clk_num = num; diff --git a/drivers/clk/ti/clk-dra7-atl.c b/drivers/clk/ti/clk-dra7-atl.c index f65e16c4f3c4..8d4c08b034bd 100644 --- a/drivers/clk/ti/clk-dra7-atl.c +++ b/drivers/clk/ti/clk-dra7-atl.c @@ -233,7 +233,6 @@ static int of_dra7_atl_clk_probe(struct platform_device *pdev) cinfo->iobase = of_iomap(node, 0); cinfo->dev = &pdev->dev; pm_runtime_enable(cinfo->dev); - pm_runtime_irq_safe(cinfo->dev); pm_runtime_get_sync(cinfo->dev); atl_write(cinfo, DRA7_ATL_PCLKMUX_REG(0), DRA7_ATL_PCLKMUX); diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index 4e54856ce2a5..c4f15c4068c0 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -56,7 +56,7 @@ static unsigned long long riscv_clocksource_rdtime(struct clocksource *cs) return get_cycles64(); } -static u64 riscv_sched_clock(void) +static u64 notrace riscv_sched_clock(void) { return get_cycles64(); } diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index f1d170dcf4d3..aba591d57c67 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -121,6 +121,8 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "mediatek,mt8176", }, { .compatible = "mediatek,mt8183", }, + { .compatible = "nvidia,tegra20", }, + { .compatible = "nvidia,tegra30", }, { .compatible = "nvidia,tegra124", }, { .compatible = "nvidia,tegra210", }, diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index de7e706efd46..6deaaf5f05b5 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -198,7 +198,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * pattern detection. */ cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; - if (cpu_data->interval_idx > INTERVALS) + if (cpu_data->interval_idx >= INTERVALS) cpu_data->interval_idx = 0; } diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h index 26754d0570ba..b846d73d9a85 100644 --- a/drivers/crypto/hisilicon/sec2/sec.h +++ b/drivers/crypto/hisilicon/sec2/sec.h @@ -40,7 +40,7 @@ struct sec_req { int req_id; /* Status of the SEC request */ - int fake_busy; + atomic_t fake_busy; }; /** @@ -132,8 +132,8 @@ struct sec_debug_file { }; struct sec_dfx { - u64 send_cnt; - u64 recv_cnt; + atomic64_t send_cnt; + atomic64_t recv_cnt; }; struct sec_debug { diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 62b04e19067c..0a5391fff485 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -120,7 +120,7 @@ static void sec_req_cb(struct hisi_qp *qp, void *resp) return; } - __sync_add_and_fetch(&req->ctx->sec->debug.dfx.recv_cnt, 1); + atomic64_inc(&req->ctx->sec->debug.dfx.recv_cnt); req->ctx->req_op->buf_unmap(req->ctx, req); @@ -135,13 +135,13 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req) mutex_lock(&qp_ctx->req_lock); ret = hisi_qp_send(qp_ctx->qp, &req->sec_sqe); mutex_unlock(&qp_ctx->req_lock); - __sync_add_and_fetch(&ctx->sec->debug.dfx.send_cnt, 1); + atomic64_inc(&ctx->sec->debug.dfx.send_cnt); if (ret == -EBUSY) return -ENOBUFS; if (!ret) { - if (req->fake_busy) + if (atomic_read(&req->fake_busy)) ret = -EBUSY; else ret = -EINPROGRESS; @@ -641,7 +641,7 @@ static void sec_skcipher_callback(struct sec_ctx *ctx, struct sec_req *req) if (ctx->c_ctx.c_mode == SEC_CMODE_CBC && req->c_req.encrypt) sec_update_iv(req); - if (__sync_bool_compare_and_swap(&req->fake_busy, 1, 0)) + if (atomic_cmpxchg(&req->fake_busy, 1, 0) != 1) sk_req->base.complete(&sk_req->base, -EINPROGRESS); sk_req->base.complete(&sk_req->base, req->err_type); @@ -672,9 +672,9 @@ static int sec_request_init(struct sec_ctx *ctx, struct sec_req *req) } if (ctx->fake_req_limit <= atomic_inc_return(&qp_ctx->pending_reqs)) - req->fake_busy = 1; + atomic_set(&req->fake_busy, 1); else - req->fake_busy = 0; + atomic_set(&req->fake_busy, 0); ret = ctx->req_op->get_res(ctx, req); if (ret) { diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 74f0654028c9..ab742dfbab99 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -608,6 +608,14 @@ static const struct file_operations sec_dbg_fops = { .write = sec_debug_write, }; +static int debugfs_atomic64_t_get(void *data, u64 *val) +{ + *val = atomic64_read((atomic64_t *)data); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic64_t_ro, debugfs_atomic64_t_get, NULL, + "%lld\n"); + static int sec_core_debug_init(struct sec_dev *sec) { struct hisi_qm *qm = &sec->qm; @@ -628,9 +636,11 @@ static int sec_core_debug_init(struct sec_dev *sec) debugfs_create_regset32("regs", 0444, tmp_d, regset); - debugfs_create_u64("send_cnt", 0444, tmp_d, &dfx->send_cnt); + debugfs_create_file("send_cnt", 0444, tmp_d, &dfx->send_cnt, + &fops_atomic64_t_ro); - debugfs_create_u64("recv_cnt", 0444, tmp_d, &dfx->recv_cnt); + debugfs_create_file("recv_cnt", 0444, tmp_d, &dfx->recv_cnt, + &fops_atomic64_t_ro); return 0; } diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c index fa626acdc9b9..44af435628f8 100644 --- a/drivers/dma/dma-jz4780.c +++ b/drivers/dma/dma-jz4780.c @@ -999,7 +999,8 @@ static const struct jz4780_dma_soc_data jz4740_dma_soc_data = { static const struct jz4780_dma_soc_data jz4725b_dma_soc_data = { .nb_channels = 6, .transfer_ord_max = 5, - .flags = JZ_SOC_DATA_PER_CHAN_PM | JZ_SOC_DATA_NO_DCKES_DCKEC, + .flags = JZ_SOC_DATA_PER_CHAN_PM | JZ_SOC_DATA_NO_DCKES_DCKEC | + JZ_SOC_DATA_BREAK_LINKS, }; static const struct jz4780_dma_soc_data jz4770_dma_soc_data = { diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 1a422a8b43cf..18c011e57592 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -377,10 +377,11 @@ ioat_alloc_ring(struct dma_chan *c, int order, gfp_t flags) descs->virt = dma_alloc_coherent(to_dev(ioat_chan), SZ_2M, &descs->hw, flags); - if (!descs->virt && (i > 0)) { + if (!descs->virt) { int idx; for (idx = 0; idx < i; idx++) { + descs = &ioat_chan->descs[idx]; dma_free_coherent(to_dev(ioat_chan), SZ_2M, descs->virt, descs->hw); descs->virt = NULL; diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c index adecea51814f..c5c1aa0dcaed 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -229,9 +229,11 @@ static irqreturn_t k3_dma_int_handler(int irq, void *dev_id) c = p->vchan; if (c && (tc1 & BIT(i))) { spin_lock_irqsave(&c->vc.lock, flags); - vchan_cookie_complete(&p->ds_run->vd); - p->ds_done = p->ds_run; - p->ds_run = NULL; + if (p->ds_run != NULL) { + vchan_cookie_complete(&p->ds_run->vd); + p->ds_done = p->ds_run; + p->ds_run = NULL; + } spin_unlock_irqrestore(&c->vc.lock, flags); } if (c && (tc2 & BIT(i))) { @@ -271,6 +273,10 @@ static int k3_dma_start_txd(struct k3_dma_chan *c) if (BIT(c->phy->idx) & k3_dma_get_chan_stat(d)) return -EAGAIN; + /* Avoid losing track of ds_run if a transaction is in flight */ + if (c->phy->ds_run) + return -EAGAIN; + if (vd) { struct k3_dma_desc_sw *ds = container_of(vd, struct k3_dma_desc_sw, vd); diff --git a/drivers/dma/virt-dma.c b/drivers/dma/virt-dma.c index ec4adf4260a0..256fc662c500 100644 --- a/drivers/dma/virt-dma.c +++ b/drivers/dma/virt-dma.c @@ -104,9 +104,8 @@ static void vchan_complete(unsigned long arg) dmaengine_desc_get_callback(&vd->tx, &cb); list_del(&vd->node); - vchan_vdesc_fini(vd); - dmaengine_desc_callback_invoke(&cb, &vd->tx_result); + vchan_vdesc_fini(vd); } } diff --git a/drivers/edac/sifive_edac.c b/drivers/edac/sifive_edac.c index 413cdb4a591d..c0cc72a3b2be 100644 --- a/drivers/edac/sifive_edac.c +++ b/drivers/edac/sifive_edac.c @@ -10,7 +10,7 @@ #include <linux/edac.h> #include <linux/platform_device.h> #include "edac_module.h" -#include <asm/sifive_l2_cache.h> +#include <soc/sifive/sifive_l2_cache.h> #define DRVNAME "sifive_edac" diff --git a/drivers/firmware/broadcom/tee_bnxt_fw.c b/drivers/firmware/broadcom/tee_bnxt_fw.c index 5b7ef89eb701..ed10da5313e8 100644 --- a/drivers/firmware/broadcom/tee_bnxt_fw.c +++ b/drivers/firmware/broadcom/tee_bnxt_fw.c @@ -215,7 +215,6 @@ static int tee_bnxt_fw_probe(struct device *dev) fw_shm_pool = tee_shm_alloc(pvt_data.ctx, MAX_SHM_MEM_SZ, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF); if (IS_ERR(fw_shm_pool)) { - tee_client_close_context(pvt_data.ctx); dev_err(pvt_data.dev, "tee_shm_alloc failed\n"); err = PTR_ERR(fw_shm_pool); goto out_sess; diff --git a/drivers/firmware/efi/earlycon.c b/drivers/firmware/efi/earlycon.c index d4077db6dc97..5d4f84781aa0 100644 --- a/drivers/firmware/efi/earlycon.c +++ b/drivers/firmware/efi/earlycon.c @@ -17,7 +17,7 @@ static const struct console *earlycon_console __initdata; static const struct font_desc *font; static u32 efi_x, efi_y; static u64 fb_base; -static pgprot_t fb_prot; +static bool fb_wb; static void *efi_fb; /* @@ -33,10 +33,8 @@ static int __init efi_earlycon_remap_fb(void) if (!earlycon_console || !(earlycon_console->flags & CON_ENABLED)) return 0; - if (pgprot_val(fb_prot) == pgprot_val(PAGE_KERNEL)) - efi_fb = memremap(fb_base, screen_info.lfb_size, MEMREMAP_WB); - else - efi_fb = memremap(fb_base, screen_info.lfb_size, MEMREMAP_WC); + efi_fb = memremap(fb_base, screen_info.lfb_size, + fb_wb ? MEMREMAP_WB : MEMREMAP_WC); return efi_fb ? 0 : -ENOMEM; } @@ -53,9 +51,12 @@ late_initcall(efi_earlycon_unmap_fb); static __ref void *efi_earlycon_map(unsigned long start, unsigned long len) { + pgprot_t fb_prot; + if (efi_fb) return efi_fb + start; + fb_prot = fb_wb ? PAGE_KERNEL : pgprot_writecombine(PAGE_KERNEL); return early_memremap_prot(fb_base + start, len, pgprot_val(fb_prot)); } @@ -215,10 +216,7 @@ static int __init efi_earlycon_setup(struct earlycon_device *device, if (screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) fb_base |= (u64)screen_info.ext_lfb_base << 32; - if (opt && !strcmp(opt, "ram")) - fb_prot = PAGE_KERNEL; - else - fb_prot = pgprot_writecombine(PAGE_KERNEL); + fb_wb = opt && !strcmp(opt, "ram"); si = &screen_info; xres = si->lfb_width; diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 35edd7cfb6a1..97378cf96a2e 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -33,7 +33,7 @@ efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, { efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; efi_status_t status; - struct efi_rng_protocol *rng; + struct efi_rng_protocol *rng = NULL; status = efi_call_early(locate_protocol, &rng_proto, NULL, (void **)&rng); @@ -162,8 +162,8 @@ efi_status_t efi_random_get_seed(efi_system_table_t *sys_table_arg) efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; efi_guid_t rng_algo_raw = EFI_RNG_ALGORITHM_RAW; efi_guid_t rng_table_guid = LINUX_EFI_RANDOM_SEED_TABLE_GUID; - struct efi_rng_protocol *rng; - struct linux_efi_random_seed *seed; + struct efi_rng_protocol *rng = NULL; + struct linux_efi_random_seed *seed = NULL; efi_status_t status; status = efi_call_early(locate_protocol, &rng_proto, NULL, diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 6ab25fe1c423..4b6d2ef15c39 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -573,7 +573,6 @@ config GPIO_THUNDERX tristate "Cavium ThunderX/OCTEON-TX GPIO" depends on ARCH_THUNDER || (64BIT && COMPILE_TEST) depends on PCI_MSI - select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY select IRQ_FASTEOI_HIERARCHY_HANDLERS help @@ -1148,6 +1147,7 @@ config GPIO_MADERA config GPIO_MAX77620 tristate "GPIO support for PMIC MAX77620 and MAX20024" depends on MFD_MAX77620 + select GPIOLIB_IRQCHIP help GPIO driver for MAX77620 and MAX20024 PMIC from Maxim Semiconductor. MAX77620 PMIC has 8 pins that can be configured as GPIOs. The diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index c4fdc192ea4e..94b8d3ae27bc 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -156,7 +156,7 @@ static int gpio_mockup_apply_pull(struct gpio_mockup_chip *chip, mutex_lock(&chip->lock); if (test_bit(FLAG_REQUESTED, &desc->flags) && - !test_bit(FLAG_IS_OUT, &desc->flags)) { + !test_bit(FLAG_IS_OUT, &desc->flags)) { curr = __gpio_mockup_get(chip, offset); if (curr == value) goto out; @@ -165,7 +165,7 @@ static int gpio_mockup_apply_pull(struct gpio_mockup_chip *chip, irq_type = irq_get_trigger_type(irq); if ((value == 1 && (irq_type & IRQ_TYPE_EDGE_RISING)) || - (value == 0 && (irq_type & IRQ_TYPE_EDGE_FALLING))) + (value == 0 && (irq_type & IRQ_TYPE_EDGE_FALLING))) irq_sim_fire(sim, offset); } diff --git a/drivers/gpio/gpio-thunderx.c b/drivers/gpio/gpio-thunderx.c index d08d86a22b1f..462770479045 100644 --- a/drivers/gpio/gpio-thunderx.c +++ b/drivers/gpio/gpio-thunderx.c @@ -53,6 +53,7 @@ struct thunderx_line { struct thunderx_gpio { struct gpio_chip chip; u8 __iomem *register_base; + struct irq_domain *irqd; struct msix_entry *msix_entries; /* per line MSI-X */ struct thunderx_line *line_entries; /* per line irq info */ raw_spinlock_t lock; @@ -285,60 +286,54 @@ static void thunderx_gpio_set_multiple(struct gpio_chip *chip, } } -static void thunderx_gpio_irq_ack(struct irq_data *d) +static void thunderx_gpio_irq_ack(struct irq_data *data) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); writeq(GPIO_INTR_INTR, - txgpio->register_base + intr_reg(irqd_to_hwirq(d))); + txline->txgpio->register_base + intr_reg(txline->line)); } -static void thunderx_gpio_irq_mask(struct irq_data *d) +static void thunderx_gpio_irq_mask(struct irq_data *data) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); writeq(GPIO_INTR_ENA_W1C, - txgpio->register_base + intr_reg(irqd_to_hwirq(d))); + txline->txgpio->register_base + intr_reg(txline->line)); } -static void thunderx_gpio_irq_mask_ack(struct irq_data *d) +static void thunderx_gpio_irq_mask_ack(struct irq_data *data) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); writeq(GPIO_INTR_ENA_W1C | GPIO_INTR_INTR, - txgpio->register_base + intr_reg(irqd_to_hwirq(d))); + txline->txgpio->register_base + intr_reg(txline->line)); } -static void thunderx_gpio_irq_unmask(struct irq_data *d) +static void thunderx_gpio_irq_unmask(struct irq_data *data) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); writeq(GPIO_INTR_ENA_W1S, - txgpio->register_base + intr_reg(irqd_to_hwirq(d))); + txline->txgpio->register_base + intr_reg(txline->line)); } -static int thunderx_gpio_irq_set_type(struct irq_data *d, +static int thunderx_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) { - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); - struct thunderx_line *txline = - &txgpio->line_entries[irqd_to_hwirq(d)]; + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); + struct thunderx_gpio *txgpio = txline->txgpio; u64 bit_cfg; - irqd_set_trigger_type(d, flow_type); + irqd_set_trigger_type(data, flow_type); bit_cfg = txline->fil_bits | GPIO_BIT_CFG_INT_EN; if (flow_type & IRQ_TYPE_EDGE_BOTH) { - irq_set_handler_locked(d, handle_fasteoi_ack_irq); + irq_set_handler_locked(data, handle_fasteoi_ack_irq); bit_cfg |= GPIO_BIT_CFG_INT_TYPE; } else { - irq_set_handler_locked(d, handle_fasteoi_mask_irq); + irq_set_handler_locked(data, handle_fasteoi_mask_irq); } raw_spin_lock(&txgpio->lock); @@ -367,6 +362,33 @@ static void thunderx_gpio_irq_disable(struct irq_data *data) irq_chip_disable_parent(data); } +static int thunderx_gpio_irq_request_resources(struct irq_data *data) +{ + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); + struct thunderx_gpio *txgpio = txline->txgpio; + int r; + + r = gpiochip_lock_as_irq(&txgpio->chip, txline->line); + if (r) + return r; + + r = irq_chip_request_resources_parent(data); + if (r) + gpiochip_unlock_as_irq(&txgpio->chip, txline->line); + + return r; +} + +static void thunderx_gpio_irq_release_resources(struct irq_data *data) +{ + struct thunderx_line *txline = irq_data_get_irq_chip_data(data); + struct thunderx_gpio *txgpio = txline->txgpio; + + irq_chip_release_resources_parent(data); + + gpiochip_unlock_as_irq(&txgpio->chip, txline->line); +} + /* * Interrupts are chained from underlying MSI-X vectors. We have * these irq_chip functions to be able to handle level triggering @@ -383,24 +405,50 @@ static struct irq_chip thunderx_gpio_irq_chip = { .irq_unmask = thunderx_gpio_irq_unmask, .irq_eoi = irq_chip_eoi_parent, .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_request_resources = thunderx_gpio_irq_request_resources, + .irq_release_resources = thunderx_gpio_irq_release_resources, .irq_set_type = thunderx_gpio_irq_set_type, .flags = IRQCHIP_SET_TYPE_MASKED }; -static int thunderx_gpio_child_to_parent_hwirq(struct gpio_chip *gc, - unsigned int child, - unsigned int child_type, - unsigned int *parent, - unsigned int *parent_type) +static int thunderx_gpio_irq_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, + unsigned int *type) { - struct thunderx_gpio *txgpio = gpiochip_get_data(gc); - - *parent = txgpio->base_msi + (2 * child); - *parent_type = IRQ_TYPE_LEVEL_HIGH; + struct thunderx_gpio *txgpio = d->host_data; + + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + if (fwspec->param[0] >= txgpio->chip.ngpio) + return -EINVAL; + *hwirq = fwspec->param[0]; + *type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } +static int thunderx_gpio_irq_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct thunderx_line *txline = arg; + + return irq_domain_set_hwirq_and_chip(d, virq, txline->line, + &thunderx_gpio_irq_chip, txline); +} + +static const struct irq_domain_ops thunderx_gpio_irqd_ops = { + .alloc = thunderx_gpio_irq_alloc, + .translate = thunderx_gpio_irq_translate +}; + +static int thunderx_gpio_to_irq(struct gpio_chip *chip, unsigned int offset) +{ + struct thunderx_gpio *txgpio = gpiochip_get_data(chip); + + return irq_find_mapping(txgpio->irqd, offset); +} + static int thunderx_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -408,7 +456,6 @@ static int thunderx_gpio_probe(struct pci_dev *pdev, struct device *dev = &pdev->dev; struct thunderx_gpio *txgpio; struct gpio_chip *chip; - struct gpio_irq_chip *girq; int ngpio, i; int err = 0; @@ -453,8 +500,8 @@ static int thunderx_gpio_probe(struct pci_dev *pdev, } txgpio->msix_entries = devm_kcalloc(dev, - ngpio, sizeof(struct msix_entry), - GFP_KERNEL); + ngpio, sizeof(struct msix_entry), + GFP_KERNEL); if (!txgpio->msix_entries) { err = -ENOMEM; goto out; @@ -495,6 +542,27 @@ static int thunderx_gpio_probe(struct pci_dev *pdev, if (err < 0) goto out; + /* + * Push GPIO specific irqdomain on hierarchy created as a side + * effect of the pci_enable_msix() + */ + txgpio->irqd = irq_domain_create_hierarchy(irq_get_irq_data(txgpio->msix_entries[0].vector)->domain, + 0, 0, of_node_to_fwnode(dev->of_node), + &thunderx_gpio_irqd_ops, txgpio); + if (!txgpio->irqd) { + err = -ENOMEM; + goto out; + } + + /* Push on irq_data and the domain for each line. */ + for (i = 0; i < ngpio; i++) { + err = irq_domain_push_irq(txgpio->irqd, + txgpio->msix_entries[i].vector, + &txgpio->line_entries[i]); + if (err < 0) + dev_err(dev, "irq_domain_push_irq: %d\n", err); + } + chip->label = KBUILD_MODNAME; chip->parent = dev; chip->owner = THIS_MODULE; @@ -509,28 +577,11 @@ static int thunderx_gpio_probe(struct pci_dev *pdev, chip->set = thunderx_gpio_set; chip->set_multiple = thunderx_gpio_set_multiple; chip->set_config = thunderx_gpio_set_config; - girq = &chip->irq; - girq->chip = &thunderx_gpio_irq_chip; - girq->fwnode = of_node_to_fwnode(dev->of_node); - girq->parent_domain = - irq_get_irq_data(txgpio->msix_entries[0].vector)->domain; - girq->child_to_parent_hwirq = thunderx_gpio_child_to_parent_hwirq; - girq->handler = handle_bad_irq; - girq->default_type = IRQ_TYPE_NONE; - + chip->to_irq = thunderx_gpio_to_irq; err = devm_gpiochip_add_data(dev, chip, txgpio); if (err) goto out; - /* Push on irq_data and the domain for each line. */ - for (i = 0; i < ngpio; i++) { - err = irq_domain_push_irq(chip->irq.domain, - txgpio->msix_entries[i].vector, - chip); - if (err < 0) - dev_err(dev, "irq_domain_push_irq: %d\n", err); - } - dev_info(dev, "ThunderX GPIO: %d lines with base %d.\n", ngpio, chip->base); return 0; @@ -545,10 +596,10 @@ static void thunderx_gpio_remove(struct pci_dev *pdev) struct thunderx_gpio *txgpio = pci_get_drvdata(pdev); for (i = 0; i < txgpio->chip.ngpio; i++) - irq_domain_pop_irq(txgpio->chip.irq.domain, + irq_domain_pop_irq(txgpio->irqd, txgpio->msix_entries[i].vector); - irq_domain_remove(txgpio->chip.irq.domain); + irq_domain_remove(txgpio->irqd); pci_set_drvdata(pdev, NULL); } diff --git a/drivers/gpio/gpio-zynq.c b/drivers/gpio/gpio-zynq.c index 4c3f6370eab4..05ba16fffdad 100644 --- a/drivers/gpio/gpio-zynq.c +++ b/drivers/gpio/gpio-zynq.c @@ -684,6 +684,8 @@ static void zynq_gpio_restore_context(struct zynq_gpio *gpio) unsigned int bank_num; for (bank_num = 0; bank_num < gpio->p_data->max_bank; bank_num++) { + writel_relaxed(ZYNQ_GPIO_IXR_DISABLE_ALL, gpio->base_addr + + ZYNQ_GPIO_INTDIS_OFFSET(bank_num)); writel_relaxed(gpio->context.datalsw[bank_num], gpio->base_addr + ZYNQ_GPIO_DATA_LSW_OFFSET(bank_num)); @@ -693,9 +695,6 @@ static void zynq_gpio_restore_context(struct zynq_gpio *gpio) writel_relaxed(gpio->context.dirm[bank_num], gpio->base_addr + ZYNQ_GPIO_DIRM_OFFSET(bank_num)); - writel_relaxed(gpio->context.int_en[bank_num], - gpio->base_addr + - ZYNQ_GPIO_INTEN_OFFSET(bank_num)); writel_relaxed(gpio->context.int_type[bank_num], gpio->base_addr + ZYNQ_GPIO_INTTYPE_OFFSET(bank_num)); @@ -705,6 +704,9 @@ static void zynq_gpio_restore_context(struct zynq_gpio *gpio) writel_relaxed(gpio->context.int_any[bank_num], gpio->base_addr + ZYNQ_GPIO_INTANY_OFFSET(bank_num)); + writel_relaxed(~(gpio->context.int_en[bank_num]), + gpio->base_addr + + ZYNQ_GPIO_INTEN_OFFSET(bank_num)); } } diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index d30e57dc755c..31fee5e918b7 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -21,11 +21,19 @@ #include "gpiolib.h" #include "gpiolib-acpi.h" +#define QUIRK_NO_EDGE_EVENTS_ON_BOOT 0x01l +#define QUIRK_NO_WAKEUP 0x02l + static int run_edge_events_on_boot = -1; module_param(run_edge_events_on_boot, int, 0444); MODULE_PARM_DESC(run_edge_events_on_boot, "Run edge _AEI event-handlers at boot: 0=no, 1=yes, -1=auto"); +static int honor_wakeup = -1; +module_param(honor_wakeup, int, 0444); +MODULE_PARM_DESC(honor_wakeup, + "Honor the ACPI wake-capable flag: 0=no, 1=yes, -1=auto"); + /** * struct acpi_gpio_event - ACPI GPIO event handler data * @@ -281,7 +289,7 @@ static acpi_status acpi_gpiochip_alloc_event(struct acpi_resource *ares, event->handle = evt_handle; event->handler = handler; event->irq = irq; - event->irq_is_wake = agpio->wake_capable == ACPI_WAKE_CAPABLE; + event->irq_is_wake = honor_wakeup && agpio->wake_capable == ACPI_WAKE_CAPABLE; event->pin = pin; event->desc = desc; @@ -1309,7 +1317,7 @@ static int acpi_gpio_handle_deferred_request_irqs(void) /* We must use _sync so that this runs after the first deferred_probe run */ late_initcall_sync(acpi_gpio_handle_deferred_request_irqs); -static const struct dmi_system_id run_edge_events_on_boot_blacklist[] = { +static const struct dmi_system_id gpiolib_acpi_quirks[] = { { /* * The Minix Neo Z83-4 has a micro-USB-B id-pin handler for @@ -1319,7 +1327,8 @@ static const struct dmi_system_id run_edge_events_on_boot_blacklist[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "MINIX"), DMI_MATCH(DMI_PRODUCT_NAME, "Z83-4"), - } + }, + .driver_data = (void *)QUIRK_NO_EDGE_EVENTS_ON_BOOT, }, { /* @@ -1331,20 +1340,52 @@ static const struct dmi_system_id run_edge_events_on_boot_blacklist[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Wortmann_AG"), DMI_MATCH(DMI_PRODUCT_NAME, "TERRA_PAD_1061"), - } + }, + .driver_data = (void *)QUIRK_NO_EDGE_EVENTS_ON_BOOT, + }, + { + /* + * Various HP X2 10 Cherry Trail models use an external + * embedded-controller connected via I2C + an ACPI GPIO + * event handler. The embedded controller generates various + * spurious wakeup events when suspended. So disable wakeup + * for its handler (it uses the only ACPI GPIO event handler). + * This breaks wakeup when opening the lid, the user needs + * to press the power-button to wakeup the system. The + * alternative is suspend simply not working, which is worse. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP x2 Detachable 10-p0XX"), + }, + .driver_data = (void *)QUIRK_NO_WAKEUP, }, {} /* Terminating entry */ }; static int acpi_gpio_setup_params(void) { + const struct dmi_system_id *id; + long quirks = 0; + + id = dmi_first_match(gpiolib_acpi_quirks); + if (id) + quirks = (long)id->driver_data; + if (run_edge_events_on_boot < 0) { - if (dmi_check_system(run_edge_events_on_boot_blacklist)) + if (quirks & QUIRK_NO_EDGE_EVENTS_ON_BOOT) run_edge_events_on_boot = 0; else run_edge_events_on_boot = 1; } + if (honor_wakeup < 0) { + if (quirks & QUIRK_NO_WAKEUP) + honor_wakeup = 0; + else + honor_wakeup = 1; + } + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index a97fb759e2f4..3e35a8f2c5e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -613,7 +613,17 @@ static bool amdgpu_atpx_detect(void) bool d3_supported = false; struct pci_dev *parent_pdev; - while ((pdev = pci_get_class(PCI_BASE_CLASS_DISPLAY << 16, pdev)) != NULL) { + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { + vga_count++; + + has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); + + parent_pdev = pci_upstream_bridge(pdev); + d3_supported |= parent_pdev && parent_pdev->bridge_d3; + amdgpu_atpx_get_quirks(pdev); + } + + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != NULL) { vga_count++; has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0ffc9447b573..30a1e3ac21d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -142,7 +142,7 @@ int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = 0; int amdgpu_discovery = -1; int amdgpu_mes = 0; -int amdgpu_noretry = 1; +int amdgpu_noretry; int amdgpu_force_asic_type = -1; struct amdgpu_mgpu_info mgpu_info = { @@ -588,7 +588,7 @@ MODULE_PARM_DESC(mes, module_param_named(mes, amdgpu_mes, int, 0444); MODULE_PARM_DESC(noretry, - "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))"); + "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)"); module_param_named(noretry, amdgpu_noretry, int, 0644); /** @@ -1004,7 +1004,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x734F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14}, /* Renoir */ - {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT}, + {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, /* Navi12 */ {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12|AMD_EXP_HW_SUPPORT}, @@ -1359,7 +1359,8 @@ static struct drm_driver kms_driver = { .driver_features = DRIVER_USE_AGP | DRIVER_ATOMIC | DRIVER_GEM | - DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ, + DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ | + DRIVER_SYNCOBJ_TIMELINE, .load = amdgpu_driver_load_kms, .open = amdgpu_driver_open_kms, .postclose = amdgpu_driver_postclose_kms, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 44be3a45b25e..e1b8d8daeafc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1488,7 +1488,7 @@ out: /* Start rlc autoload after psp recieved all the gfx firmware */ if (psp->autoload_supported && ucode->ucode_id == (amdgpu_sriov_vf(adev) ? - AMDGPU_UCODE_ID_CP_MEC2 : AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM)) { + AMDGPU_UCODE_ID_CP_MEC2 : AMDGPU_UCODE_ID_RLC_G)) { ret = psp_rlc_autoload(psp); if (ret) { DRM_ERROR("Failed to start rlc autoload\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 410587b950f3..914acecda5cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -292,10 +292,10 @@ enum AMDGPU_UCODE_ID { AMDGPU_UCODE_ID_CP_MEC2_JT, AMDGPU_UCODE_ID_CP_MES, AMDGPU_UCODE_ID_CP_MES_DATA, - AMDGPU_UCODE_ID_RLC_G, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM, + AMDGPU_UCODE_ID_RLC_G, AMDGPU_UCODE_ID_STORAGE, AMDGPU_UCODE_ID_SMC, AMDGPU_UCODE_ID_UVD, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 66328ffa395a..97105a5bb246 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1052,17 +1052,10 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) case CHIP_VEGA20: break; case CHIP_RAVEN: - /* Disable GFXOFF on original raven. There are combinations - * of sbios and platforms that are not stable. - */ - if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8)) - adev->pm.pp_feature &= ~PP_GFXOFF_MASK; - else if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8) - &&((adev->gfx.rlc_fw_version != 106 && - adev->gfx.rlc_fw_version < 531) || - (adev->gfx.rlc_fw_version == 53815) || - (adev->gfx.rlc_feature_version < 1) || - !adev->gfx.rlc.is_rlc_v2_1)) + if (!(adev->rev_id >= 0x8 || + adev->pdev->device == 0x15d8) && + (adev->pm.fw_version < 0x41e2b || /* not raven1 fresh */ + !adev->gfx.rlc.is_rlc_v2_1)) /* without rlc save restore ucodes */ adev->pm.pp_feature &= ~PP_GFXOFF_MASK; if (adev->pm.pp_feature & PP_GFXOFF_MASK) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 4ef4d31f5231..2f52b7f4d25c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -254,7 +254,7 @@ static const struct soc15_reg_golden golden_settings_sdma_4_3[] = { SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_RLC0_RB_WPTR_POLL_CNTL, 0xfffffff7, 0x00403000), SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_RLC1_RB_WPTR_POLL_CNTL, 0xfffffff7, 0x00403000), SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_PAGE, 0x000003ff, 0x000003c0), - SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_WATERMK, 0xfc000000, 0x00000000) + SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_UTCL1_WATERMK, 0xfc000000, 0x03fbe1fe) }; static u32 sdma_v4_0_get_reg_offset(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7aac9568d3be..803e59d97411 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3356,27 +3356,21 @@ get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing) return color_space; } -static void reduce_mode_colour_depth(struct dc_crtc_timing *timing_out) -{ - if (timing_out->display_color_depth <= COLOR_DEPTH_888) - return; - - timing_out->display_color_depth--; -} - -static void adjust_colour_depth_from_display_info(struct dc_crtc_timing *timing_out, - const struct drm_display_info *info) +static bool adjust_colour_depth_from_display_info( + struct dc_crtc_timing *timing_out, + const struct drm_display_info *info) { + enum dc_color_depth depth = timing_out->display_color_depth; int normalized_clk; - if (timing_out->display_color_depth <= COLOR_DEPTH_888) - return; do { normalized_clk = timing_out->pix_clk_100hz / 10; /* YCbCr 4:2:0 requires additional adjustment of 1/2 */ if (timing_out->pixel_encoding == PIXEL_ENCODING_YCBCR420) normalized_clk /= 2; /* Adjusting pix clock following on HDMI spec based on colour depth */ - switch (timing_out->display_color_depth) { + switch (depth) { + case COLOR_DEPTH_888: + break; case COLOR_DEPTH_101010: normalized_clk = (normalized_clk * 30) / 24; break; @@ -3387,14 +3381,15 @@ static void adjust_colour_depth_from_display_info(struct dc_crtc_timing *timing_ normalized_clk = (normalized_clk * 48) / 24; break; default: - return; + /* The above depths are the only ones valid for HDMI. */ + return false; } - if (normalized_clk <= info->max_tmds_clock) - return; - reduce_mode_colour_depth(timing_out); - - } while (timing_out->display_color_depth > COLOR_DEPTH_888); - + if (normalized_clk <= info->max_tmds_clock) { + timing_out->display_color_depth = depth; + return true; + } + } while (--depth > COLOR_DEPTH_666); + return false; } static void fill_stream_properties_from_drm_display_mode( @@ -3474,8 +3469,14 @@ static void fill_stream_properties_from_drm_display_mode( stream->out_transfer_func->type = TF_TYPE_PREDEFINED; stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB; - if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) - adjust_colour_depth_from_display_info(timing_out, info); + if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) { + if (!adjust_colour_depth_from_display_info(timing_out, info) && + drm_mode_is_420_also(info, mode_in) && + timing_out->pixel_encoding != PIXEL_ENCODING_YCBCR420) { + timing_out->pixel_encoding = PIXEL_ENCODING_YCBCR420; + adjust_colour_depth_from_display_info(timing_out, info); + } + } } static void fill_audio_info(struct audio_info *audio_info, diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 62d8289abb4e..4619f94f0ac7 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -817,8 +817,8 @@ static bool dc_link_detect_helper(struct dc_link *link, } case SIGNAL_TYPE_EDP: { - read_current_link_settings_on_detect(link); detect_edp_sink_caps(link); + read_current_link_settings_on_detect(link); sink_caps.transaction_type = DDC_TRANSACTION_TYPE_I2C_OVER_AUX; sink_caps.signal = SIGNAL_TYPE_EDP; break; diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 5ff7ccedfbed..a23729d3174b 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -866,6 +866,7 @@ static int smu_sw_init(void *handle) smu->smu_baco.platform_support = false; mutex_init(&smu->sensor_lock); + mutex_init(&smu->metrics_lock); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index cc71a1078a7a..472e9fed411a 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -862,18 +862,21 @@ static int arcturus_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index ac9758305ab3..41fce75b263f 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -349,6 +349,7 @@ struct smu_context const struct pptable_funcs *ppt_funcs; struct mutex mutex; struct mutex sensor_lock; + struct mutex metrics_lock; uint64_t pool_size; struct smu_table_context smu_table; diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 4a14fd1f9fd5..ca62e92e5a4f 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -562,17 +562,20 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 60b9ff097142..0d3a3b0a934e 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -1678,17 +1678,20 @@ static int vega20_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; + mutex_lock(&smu->metrics_lock); if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { pr_info("Failed to export SMU metrics table!\n"); + mutex_unlock(&smu->metrics_lock); return ret; } smu_table->metrics_time = jiffies; } memcpy(metrics_table, smu_table->metrics_table, sizeof(SmuMetrics_t)); + mutex_unlock(&smu->metrics_lock); return ret; } diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c index 875a3a9eabfa..7d0e7b031e44 100644 --- a/drivers/gpu/drm/arm/malidp_mw.c +++ b/drivers/gpu/drm/arm/malidp_mw.c @@ -56,7 +56,7 @@ malidp_mw_connector_mode_valid(struct drm_connector *connector, return MODE_OK; } -const struct drm_connector_helper_funcs malidp_mw_connector_helper_funcs = { +static const struct drm_connector_helper_funcs malidp_mw_connector_helper_funcs = { .get_modes = malidp_mw_connector_get_modes, .mode_valid = malidp_mw_connector_mode_valid, }; diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index 273dd80fabf3..e6afe4faeca6 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -393,7 +393,7 @@ drm_dp_encode_sideband_req(const struct drm_dp_sideband_msg_req_body *req, memcpy(&buf[idx], req->u.i2c_read.transactions[i].bytes, req->u.i2c_read.transactions[i].num_bytes); idx += req->u.i2c_read.transactions[i].num_bytes; - buf[idx] = (req->u.i2c_read.transactions[i].no_stop_bit & 0x1) << 5; + buf[idx] = (req->u.i2c_read.transactions[i].no_stop_bit & 0x1) << 4; buf[idx] |= (req->u.i2c_read.transactions[i].i2c_transaction_delay & 0xf); idx++; } @@ -1190,6 +1190,8 @@ static int drm_dp_mst_wait_tx_reply(struct drm_dp_mst_branch *mstb, txmsg->state == DRM_DP_SIDEBAND_TX_SENT) { mstb->tx_slots[txmsg->seqno] = NULL; } + mgr->is_waiting_for_dwn_reply = false; + } out: if (unlikely(ret == -EIO) && drm_debug_enabled(DRM_UT_DP)) { @@ -1199,6 +1201,7 @@ out: } mutex_unlock(&mgr->qlock); + drm_dp_mst_kick_tx(mgr); return ret; } @@ -1913,73 +1916,90 @@ static u8 drm_dp_calculate_rad(struct drm_dp_mst_port *port, return parent_lct + 1; } -static int drm_dp_port_set_pdt(struct drm_dp_mst_port *port, u8 new_pdt) +static bool drm_dp_mst_is_dp_mst_end_device(u8 pdt, bool mcs) +{ + switch (pdt) { + case DP_PEER_DEVICE_DP_LEGACY_CONV: + case DP_PEER_DEVICE_SST_SINK: + return true; + case DP_PEER_DEVICE_MST_BRANCHING: + /* For sst branch device */ + if (!mcs) + return true; + + return false; + } + return true; +} + +static int +drm_dp_port_set_pdt(struct drm_dp_mst_port *port, u8 new_pdt, + bool new_mcs) { struct drm_dp_mst_topology_mgr *mgr = port->mgr; struct drm_dp_mst_branch *mstb; u8 rad[8], lct; int ret = 0; - if (port->pdt == new_pdt) + if (port->pdt == new_pdt && port->mcs == new_mcs) return 0; /* Teardown the old pdt, if there is one */ - switch (port->pdt) { - case DP_PEER_DEVICE_DP_LEGACY_CONV: - case DP_PEER_DEVICE_SST_SINK: - /* - * If the new PDT would also have an i2c bus, don't bother - * with reregistering it - */ - if (new_pdt == DP_PEER_DEVICE_DP_LEGACY_CONV || - new_pdt == DP_PEER_DEVICE_SST_SINK) { - port->pdt = new_pdt; - return 0; - } + if (port->pdt != DP_PEER_DEVICE_NONE) { + if (drm_dp_mst_is_dp_mst_end_device(port->pdt, port->mcs)) { + /* + * If the new PDT would also have an i2c bus, + * don't bother with reregistering it + */ + if (new_pdt != DP_PEER_DEVICE_NONE && + drm_dp_mst_is_dp_mst_end_device(new_pdt, new_mcs)) { + port->pdt = new_pdt; + port->mcs = new_mcs; + return 0; + } - /* remove i2c over sideband */ - drm_dp_mst_unregister_i2c_bus(&port->aux); - break; - case DP_PEER_DEVICE_MST_BRANCHING: - mutex_lock(&mgr->lock); - drm_dp_mst_topology_put_mstb(port->mstb); - port->mstb = NULL; - mutex_unlock(&mgr->lock); - break; + /* remove i2c over sideband */ + drm_dp_mst_unregister_i2c_bus(&port->aux); + } else { + mutex_lock(&mgr->lock); + drm_dp_mst_topology_put_mstb(port->mstb); + port->mstb = NULL; + mutex_unlock(&mgr->lock); + } } port->pdt = new_pdt; - switch (port->pdt) { - case DP_PEER_DEVICE_DP_LEGACY_CONV: - case DP_PEER_DEVICE_SST_SINK: - /* add i2c over sideband */ - ret = drm_dp_mst_register_i2c_bus(&port->aux); - break; + port->mcs = new_mcs; - case DP_PEER_DEVICE_MST_BRANCHING: - lct = drm_dp_calculate_rad(port, rad); - mstb = drm_dp_add_mst_branch_device(lct, rad); - if (!mstb) { - ret = -ENOMEM; - DRM_ERROR("Failed to create MSTB for port %p", port); - goto out; - } + if (port->pdt != DP_PEER_DEVICE_NONE) { + if (drm_dp_mst_is_dp_mst_end_device(port->pdt, port->mcs)) { + /* add i2c over sideband */ + ret = drm_dp_mst_register_i2c_bus(&port->aux); + } else { + lct = drm_dp_calculate_rad(port, rad); + mstb = drm_dp_add_mst_branch_device(lct, rad); + if (!mstb) { + ret = -ENOMEM; + DRM_ERROR("Failed to create MSTB for port %p", + port); + goto out; + } - mutex_lock(&mgr->lock); - port->mstb = mstb; - mstb->mgr = port->mgr; - mstb->port_parent = port; + mutex_lock(&mgr->lock); + port->mstb = mstb; + mstb->mgr = port->mgr; + mstb->port_parent = port; - /* - * Make sure this port's memory allocation stays - * around until its child MSTB releases it - */ - drm_dp_mst_get_port_malloc(port); - mutex_unlock(&mgr->lock); + /* + * Make sure this port's memory allocation stays + * around until its child MSTB releases it + */ + drm_dp_mst_get_port_malloc(port); + mutex_unlock(&mgr->lock); - /* And make sure we send a link address for this */ - ret = 1; - break; + /* And make sure we send a link address for this */ + ret = 1; + } } out: @@ -2132,9 +2152,8 @@ drm_dp_mst_port_add_connector(struct drm_dp_mst_branch *mstb, goto error; } - if ((port->pdt == DP_PEER_DEVICE_DP_LEGACY_CONV || - port->pdt == DP_PEER_DEVICE_SST_SINK) && - port->port_num >= DP_MST_LOGICAL_PORT_0) { + if (port->pdt != DP_PEER_DEVICE_NONE && + drm_dp_mst_is_dp_mst_end_device(port->pdt, port->mcs)) { port->cached_edid = drm_get_edid(port->connector, &port->aux.ddc); drm_connector_set_tile_property(port->connector); @@ -2198,6 +2217,7 @@ drm_dp_mst_handle_link_address_port(struct drm_dp_mst_branch *mstb, struct drm_dp_mst_port *port; int old_ddps = 0, ret; u8 new_pdt = DP_PEER_DEVICE_NONE; + bool new_mcs = 0; bool created = false, send_link_addr = false, changed = false; port = drm_dp_get_port(mstb, port_msg->port_number); @@ -2242,7 +2262,7 @@ drm_dp_mst_handle_link_address_port(struct drm_dp_mst_branch *mstb, port->input = port_msg->input_port; if (!port->input) new_pdt = port_msg->peer_device_type; - port->mcs = port_msg->mcs; + new_mcs = port_msg->mcs; port->ddps = port_msg->ddps; port->ldps = port_msg->legacy_device_plug_status; port->dpcd_rev = port_msg->dpcd_revision; @@ -2269,7 +2289,7 @@ drm_dp_mst_handle_link_address_port(struct drm_dp_mst_branch *mstb, } } - ret = drm_dp_port_set_pdt(port, new_pdt); + ret = drm_dp_port_set_pdt(port, new_pdt, new_mcs); if (ret == 1) { send_link_addr = true; } else if (ret < 0) { @@ -2283,7 +2303,8 @@ drm_dp_mst_handle_link_address_port(struct drm_dp_mst_branch *mstb, * we're coming out of suspend. In this case, always resend the link * address if there's an MSTB on this port */ - if (!created && port->pdt == DP_PEER_DEVICE_MST_BRANCHING) + if (!created && port->pdt == DP_PEER_DEVICE_MST_BRANCHING && + port->mcs) send_link_addr = true; if (port->connector) @@ -2318,8 +2339,9 @@ drm_dp_mst_handle_conn_stat(struct drm_dp_mst_branch *mstb, { struct drm_dp_mst_topology_mgr *mgr = mstb->mgr; struct drm_dp_mst_port *port; - int old_ddps, ret; + int old_ddps, old_input, ret, i; u8 new_pdt; + bool new_mcs; bool dowork = false, create_connector = false; port = drm_dp_get_port(mstb, conn_stat->port_number); @@ -2349,8 +2371,8 @@ drm_dp_mst_handle_conn_stat(struct drm_dp_mst_branch *mstb, } old_ddps = port->ddps; + old_input = port->input; port->input = conn_stat->input_port; - port->mcs = conn_stat->message_capability_status; port->ldps = conn_stat->legacy_device_plug_status; port->ddps = conn_stat->displayport_device_plug_status; @@ -2363,8 +2385,8 @@ drm_dp_mst_handle_conn_stat(struct drm_dp_mst_branch *mstb, } new_pdt = port->input ? DP_PEER_DEVICE_NONE : conn_stat->peer_device_type; - - ret = drm_dp_port_set_pdt(port, new_pdt); + new_mcs = conn_stat->message_capability_status; + ret = drm_dp_port_set_pdt(port, new_pdt, new_mcs); if (ret == 1) { dowork = true; } else if (ret < 0) { @@ -2373,6 +2395,28 @@ drm_dp_mst_handle_conn_stat(struct drm_dp_mst_branch *mstb, dowork = false; } + if (!old_input && old_ddps != port->ddps && !port->ddps) { + for (i = 0; i < mgr->max_payloads; i++) { + struct drm_dp_vcpi *vcpi = mgr->proposed_vcpis[i]; + struct drm_dp_mst_port *port_validated; + + if (!vcpi) + continue; + + port_validated = + container_of(vcpi, struct drm_dp_mst_port, vcpi); + port_validated = + drm_dp_mst_topology_get_port_validated(mgr, port_validated); + if (!port_validated) { + mutex_lock(&mgr->payload_lock); + vcpi->num_slots = 0; + mutex_unlock(&mgr->payload_lock); + } else { + drm_dp_mst_topology_put_port(port_validated); + } + } + } + if (port->connector) drm_modeset_unlock(&mgr->base.lock); else if (create_connector) @@ -2718,9 +2762,11 @@ static void process_single_down_tx_qlock(struct drm_dp_mst_topology_mgr *mgr) ret = process_single_tx_qlock(mgr, txmsg, false); if (ret == 1) { /* txmsg is sent it should be in the slots now */ + mgr->is_waiting_for_dwn_reply = true; list_del(&txmsg->next); } else if (ret) { DRM_DEBUG_KMS("failed to send msg in q %d\n", ret); + mgr->is_waiting_for_dwn_reply = false; list_del(&txmsg->next); if (txmsg->seqno != -1) txmsg->dst->tx_slots[txmsg->seqno] = NULL; @@ -2760,7 +2806,8 @@ static void drm_dp_queue_down_tx(struct drm_dp_mst_topology_mgr *mgr, drm_dp_mst_dump_sideband_msg_tx(&p, txmsg); } - if (list_is_singular(&mgr->tx_msg_downq)) + if (list_is_singular(&mgr->tx_msg_downq) && + !mgr->is_waiting_for_dwn_reply) process_single_down_tx_qlock(mgr); mutex_unlock(&mgr->qlock); } @@ -3678,6 +3725,7 @@ static int drm_dp_mst_handle_down_rep(struct drm_dp_mst_topology_mgr *mgr) mutex_lock(&mgr->qlock); txmsg->state = DRM_DP_SIDEBAND_TX_RX; mstb->tx_slots[slot] = NULL; + mgr->is_waiting_for_dwn_reply = false; mutex_unlock(&mgr->qlock); wake_up_all(&mgr->tx_waitq); @@ -3687,6 +3735,9 @@ static int drm_dp_mst_handle_down_rep(struct drm_dp_mst_topology_mgr *mgr) no_msg: drm_dp_mst_topology_put_mstb(mstb); clear_down_rep_recv: + mutex_lock(&mgr->qlock); + mgr->is_waiting_for_dwn_reply = false; + mutex_unlock(&mgr->qlock); memset(&mgr->down_rep_recv, 0, sizeof(struct drm_dp_sideband_msg_rx)); return 0; @@ -3896,6 +3947,8 @@ drm_dp_mst_detect_port(struct drm_connector *connector, switch (port->pdt) { case DP_PEER_DEVICE_NONE: case DP_PEER_DEVICE_MST_BRANCHING: + if (!port->mcs) + ret = connector_status_connected; break; case DP_PEER_DEVICE_SST_SINK: @@ -4497,7 +4550,7 @@ static void drm_dp_tx_work(struct work_struct *work) struct drm_dp_mst_topology_mgr *mgr = container_of(work, struct drm_dp_mst_topology_mgr, tx_work); mutex_lock(&mgr->qlock); - if (!list_empty(&mgr->tx_msg_downq)) + if (!list_empty(&mgr->tx_msg_downq) && !mgr->is_waiting_for_dwn_reply) process_single_down_tx_qlock(mgr); mutex_unlock(&mgr->qlock); } @@ -4508,7 +4561,7 @@ drm_dp_delayed_destroy_port(struct drm_dp_mst_port *port) if (port->connector) port->mgr->cbs->destroy_connector(port->mgr, port->connector); - drm_dp_port_set_pdt(port, DP_PEER_DEVICE_NONE); + drm_dp_port_set_pdt(port, DP_PEER_DEVICE_NONE, port->mcs); drm_dp_mst_put_port_malloc(port); } diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 8ebeccdeed23..d8e8f3960f4d 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -1283,7 +1283,7 @@ int drm_fb_helper_check_var(struct fb_var_screeninfo *var, * Changes struct fb_var_screeninfo are currently not pushed back * to KMS, hence fail if different settings are requested. */ - if (var->bits_per_pixel != fb->format->cpp[0] * 8 || + if (var->bits_per_pixel > fb->format->cpp[0] * 8 || var->xres > fb->width || var->yres > fb->height || var->xres_virtual > fb->width || var->yres_virtual > fb->height) { DRM_DEBUG("fb requested width/height/bpp can't fit in current fb " @@ -1309,6 +1309,11 @@ int drm_fb_helper_check_var(struct fb_var_screeninfo *var, } /* + * Likewise, bits_per_pixel should be rounded up to a supported value. + */ + var->bits_per_pixel = fb->format->cpp[0] * 8; + + /* * drm fbdev emulation doesn't support changing the pixel format at all, * so reject all pixel format changing requests. */ diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c index 85e6b2bbb34f..3a5ac13d5801 100644 --- a/drivers/gpu/drm/i915/display/intel_audio.c +++ b/drivers/gpu/drm/i915/display/intel_audio.c @@ -856,7 +856,7 @@ static unsigned long i915_audio_component_get_power(struct device *kdev) } /* Force CDCLK to 2*BCLK as long as we need audio powered. */ - if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) + if (IS_GEMINILAKE(dev_priv)) glk_force_audio_cdclk(dev_priv, true); if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) @@ -875,7 +875,7 @@ static void i915_audio_component_put_power(struct device *kdev, /* Stop forcing CDCLK to 2*BCLK if no need for audio to be powered. */ if (--dev_priv->audio_power_refcount == 0) - if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) + if (IS_GEMINILAKE(dev_priv)) glk_force_audio_cdclk(dev_priv, false); intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO, cookie); diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index effc4250b230..301897791627 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -4515,8 +4515,6 @@ static void icl_disable_transcoder_port_sync(const struct intel_crtc_state *old_ { struct intel_crtc *crtc = to_intel_crtc(old_crtc_state->base.crtc); struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); - i915_reg_t reg; - u32 trans_ddi_func_ctl2_val; if (old_crtc_state->master_transcoder == INVALID_TRANSCODER) return; @@ -4524,10 +4522,7 @@ static void icl_disable_transcoder_port_sync(const struct intel_crtc_state *old_ DRM_DEBUG_KMS("Disabling Transcoder Port Sync on Slave Transcoder %s\n", transcoder_name(old_crtc_state->cpu_transcoder)); - reg = TRANS_DDI_FUNC_CTL2(old_crtc_state->cpu_transcoder); - trans_ddi_func_ctl2_val = ~(PORT_SYNC_MODE_ENABLE | - PORT_SYNC_MODE_MASTER_SELECT_MASK); - I915_WRITE(reg, trans_ddi_func_ctl2_val); + I915_WRITE(TRANS_DDI_FUNC_CTL2(old_crtc_state->cpu_transcoder), 0); } static void intel_fdi_normal_train(struct intel_crtc *crtc) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c index 3d4f5775a4ba..25235ef630c1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c @@ -9,16 +9,16 @@ #include "i915_gem_ioctls.h" #include "i915_gem_object.h" -static __always_inline u32 __busy_read_flag(u8 id) +static __always_inline u32 __busy_read_flag(u16 id) { - if (id == (u8)I915_ENGINE_CLASS_INVALID) + if (id == (u16)I915_ENGINE_CLASS_INVALID) return 0xffff0000u; GEM_BUG_ON(id >= 16); return 0x10000u << id; } -static __always_inline u32 __busy_write_id(u8 id) +static __always_inline u32 __busy_write_id(u16 id) { /* * The uABI guarantees an active writer is also amongst the read @@ -29,14 +29,14 @@ static __always_inline u32 __busy_write_id(u8 id) * last_read - hence we always set both read and write busy for * last_write. */ - if (id == (u8)I915_ENGINE_CLASS_INVALID) + if (id == (u16)I915_ENGINE_CLASS_INVALID) return 0xffffffffu; return (id + 1) | __busy_read_flag(id); } static __always_inline unsigned int -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u8 id)) +__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) { const struct i915_request *rq; @@ -57,7 +57,7 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u8 id)) return 0; /* Beware type-expansion follies! */ - BUILD_BUG_ON(!typecheck(u8, rq->engine->uabi_class)); + BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); return flag(rq->engine->uabi_class); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 4c72d74d6576..0dbb44d30885 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -402,7 +402,7 @@ struct get_pages_work { static struct sg_table * __i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj, - struct page **pvec, int num_pages) + struct page **pvec, unsigned long num_pages) { unsigned int max_segment = i915_sg_segment_size(); struct sg_table *st; @@ -448,9 +448,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) { struct get_pages_work *work = container_of(_work, typeof(*work), work); struct drm_i915_gem_object *obj = work->obj; - const int npages = obj->base.size >> PAGE_SHIFT; + const unsigned long npages = obj->base.size >> PAGE_SHIFT; + unsigned long pinned; struct page **pvec; - int pinned, ret; + int ret; ret = -ENOMEM; pinned = 0; @@ -553,7 +554,7 @@ __i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj) static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) { - const int num_pages = obj->base.size >> PAGE_SHIFT; + const unsigned long num_pages = obj->base.size >> PAGE_SHIFT; struct mm_struct *mm = obj->userptr.mm->mm; struct page **pvec; struct sg_table *pages; diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index ef7bc41ffffa..5b7ff3ccfa8e 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -123,6 +123,10 @@ static int __context_pin_state(struct i915_vma *vma) if (err) return err; + err = i915_active_acquire(&vma->active); + if (err) + goto err_unpin; + /* * And mark it as a globally pinned object to let the shrinker know * it cannot reclaim the object until we release it. @@ -131,14 +135,44 @@ static int __context_pin_state(struct i915_vma *vma) vma->obj->mm.dirty = true; return 0; + +err_unpin: + i915_vma_unpin(vma); + return err; } static void __context_unpin_state(struct i915_vma *vma) { i915_vma_make_shrinkable(vma); + i915_active_release(&vma->active); __i915_vma_unpin(vma); } +static int __ring_active(struct intel_ring *ring) +{ + int err; + + err = i915_active_acquire(&ring->vma->active); + if (err) + return err; + + err = intel_ring_pin(ring); + if (err) + goto err_active; + + return 0; + +err_active: + i915_active_release(&ring->vma->active); + return err; +} + +static void __ring_retire(struct intel_ring *ring) +{ + intel_ring_unpin(ring); + i915_active_release(&ring->vma->active); +} + __i915_active_call static void __intel_context_retire(struct i915_active *active) { @@ -151,7 +185,7 @@ static void __intel_context_retire(struct i915_active *active) __context_unpin_state(ce->state); intel_timeline_unpin(ce->timeline); - intel_ring_unpin(ce->ring); + __ring_retire(ce->ring); intel_context_put(ce); } @@ -163,7 +197,7 @@ static int __intel_context_active(struct i915_active *active) intel_context_get(ce); - err = intel_ring_pin(ce->ring); + err = __ring_active(ce->ring); if (err) goto err_put; @@ -183,7 +217,7 @@ static int __intel_context_active(struct i915_active *active) err_timeline: intel_timeline_unpin(ce->timeline); err_ring: - intel_ring_unpin(ce->ring); + __ring_retire(ce->ring); err_put: intel_context_put(ce); return err; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 17f1f1441efc..2b446474e010 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -274,8 +274,8 @@ struct intel_engine_cs { u8 class; u8 instance; - u8 uabi_class; - u8 uabi_instance; + u16 uabi_class; + u16 uabi_instance; u32 uabi_capabilities; u32 context_size; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 75dd0e0367b7..d925a1035c9d 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2664,6 +2664,14 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ batch = gen8_emit_flush_coherentl3_wa(engine, batch); + /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_STORE_DATA_INDEX | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + LRC_PPHWSP_SCRATCH_ADDR); + batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); /* WaMediaPoolStateCmdInWABB:bxt,glk */ @@ -4416,9 +4424,11 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx, ve->base.gt = siblings[0]->gt; ve->base.uncore = siblings[0]->uncore; ve->base.id = -1; + ve->base.class = OTHER_CLASS; ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; /* * The decision on whether to submit a request using semaphores diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index a47d5a7c32c9..93026217c121 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -1413,14 +1413,6 @@ static inline int mi_set_context(struct i915_request *rq, u32 flags) int len; u32 *cs; - flags |= MI_MM_SPACE_GTT; - if (IS_HASWELL(i915)) - /* These flags are for resource streamer on HSW+ */ - flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN; - else - /* We need to save the extended state for powersaving modes */ - flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN; - len = 4; if (IS_GEN(i915, 7)) len += 2 + (num_engines ? 4 * num_engines + 6 : 0); @@ -1589,22 +1581,21 @@ static int switch_context(struct i915_request *rq) } if (ce->state) { - u32 hw_flags; + u32 flags; GEM_BUG_ON(rq->engine->id != RCS0); - /* - * The kernel context(s) is treated as pure scratch and is not - * expected to retain any state (as we sacrifice it during - * suspend and on resume it may be corrupted). This is ok, - * as nothing actually executes using the kernel context; it - * is purely used for flushing user contexts. - */ - hw_flags = 0; - if (i915_gem_context_is_kernel(rq->gem_context)) - hw_flags = MI_RESTORE_INHIBIT; + /* For resource streamer on HSW+ and power context elsewhere */ + BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN); + BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN); + + flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT; + if (!i915_gem_context_is_kernel(rq->gem_context)) + flags |= MI_RESTORE_EXT_STATE_EN; + else + flags |= MI_RESTORE_INHIBIT; - ret = mi_set_context(rq, hw_flags); + ret = mi_set_context(rq, flags); if (ret) return ret; } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e29bc137e7ba..21aa08f55811 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1660,8 +1660,10 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, (IS_BROADWELL(dev_priv) || IS_GEN(dev_priv, 9)) /* WaRsDisableCoarsePowerGating:skl,cnl */ -#define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ - (IS_CANNONLAKE(dev_priv) || IS_GEN(dev_priv, 9)) +#define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ + (IS_CANNONLAKE(dev_priv) || \ + IS_SKL_GT3(dev_priv) || \ + IS_SKL_GT4(dev_priv)) #define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4) #define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \ diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 6239a9adbf14..44727806dfd7 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1177,6 +1177,7 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, pd = i915_pd_entry(pdp, gen8_pd_index(idx, 2)); vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); do { + GEM_BUG_ON(iter->sg->length < I915_GTT_PAGE_SIZE); vaddr[gen8_pd_index(idx, 0)] = pte_encode | iter->dma; iter->dma += I915_GTT_PAGE_SIZE; @@ -1660,6 +1661,7 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm, vaddr = kmap_atomic_px(i915_pt_entry(pd, act_pt)); do { + GEM_BUG_ON(iter.sg->length < I915_GTT_PAGE_SIZE); vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(iter.dma); iter.dma += I915_GTT_PAGE_SIZE; @@ -3304,7 +3306,7 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt) static void ggtt_restore_mappings(struct i915_ggtt *ggtt) { - struct i915_vma *vma, *vn; + struct i915_vma *vma; bool flush = false; int open; @@ -3319,15 +3321,12 @@ static void ggtt_restore_mappings(struct i915_ggtt *ggtt) open = atomic_xchg(&ggtt->vm.open, 0); /* clflush objects bound into the GGTT and rebind them. */ - list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) { + list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) { struct drm_i915_gem_object *obj = vma->obj; if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) continue; - if (!__i915_vma_unbind(vma)) - continue; - clear_bit(I915_VMA_GLOBAL_BIND_BIT, __i915_vma_flags(vma)); WARN_ON(i915_vma_bind(vma, obj ? obj->cache_level : 0, diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 6f09aa0be80a..d6d2e6fb8674 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -1074,12 +1074,17 @@ void i915_pmu_register(struct drm_i915_private *i915) hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); pmu->timer.function = i915_sample; - if (!is_igp(i915)) + if (!is_igp(i915)) { pmu->name = kasprintf(GFP_KERNEL, - "i915-%s", + "i915_%s", dev_name(i915->drm.dev)); - else + if (pmu->name) { + /* tools/perf reserves colons as special. */ + strreplace((char *)pmu->name, ':', '_'); + } + } else { pmu->name = "i915"; + } if (!pmu->name) goto err; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 4fd3d76db346..094011b8f64d 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -4177,7 +4177,13 @@ enum { #define CPSSUNIT_CLKGATE_DIS REG_BIT(9) #define UNSLICE_UNIT_LEVEL_CLKGATE _MMIO(0x9434) -#define VFUNIT_CLKGATE_DIS (1 << 20) +#define VFUNIT_CLKGATE_DIS REG_BIT(20) +#define HSUNIT_CLKGATE_DIS REG_BIT(8) +#define VSUNIT_CLKGATE_DIS REG_BIT(3) + +#define UNSLICE_UNIT_LEVEL_CLKGATE2 _MMIO(0x94e4) +#define VSUNIT_CLKGATE_DIS_TGL REG_BIT(19) +#define PSDUNIT_CLKGATE_DIS REG_BIT(5) #define INF_UNIT_LEVEL_CLKGATE _MMIO(0x9560) #define CGPSF_CLKGATE_DIS (1 << 3) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 75ae6f495161..86379eddc908 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6565,6 +6565,17 @@ static void icl_init_clock_gating(struct drm_i915_private *dev_priv) /* WaEnable32PlaneMode:icl */ I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN11_ENABLE_32_PLANE_MODE)); + + /* + * Wa_1408615072:icl,ehl (vsunit) + * Wa_1407596294:icl,ehl (hsunit) + */ + intel_uncore_rmw(&dev_priv->uncore, UNSLICE_UNIT_LEVEL_CLKGATE, + 0, VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); + + /* Wa_1407352427:icl,ehl */ + intel_uncore_rmw(&dev_priv->uncore, UNSLICE_UNIT_LEVEL_CLKGATE2, + 0, PSDUNIT_CLKGATE_DIS); } static void tgl_init_clock_gating(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/selftests/i915_random.h b/drivers/gpu/drm/i915/selftests/i915_random.h index 35cc69a3a1b9..05364eca20f7 100644 --- a/drivers/gpu/drm/i915/selftests/i915_random.h +++ b/drivers/gpu/drm/i915/selftests/i915_random.h @@ -25,6 +25,7 @@ #ifndef __I915_SELFTESTS_RANDOM_H__ #define __I915_SELFTESTS_RANDOM_H__ +#include <linux/math64.h> #include <linux/random.h> #include "../i915_selftest.h" diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index f80a8ba75977..3305a94fc930 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -215,11 +215,12 @@ struct mtk_ddp_comp *mtk_drm_ddp_comp_for_plane(struct drm_crtc *crtc, struct mtk_drm_crtc *mtk_crtc = to_mtk_crtc(crtc); struct mtk_ddp_comp *comp; int i, count = 0; + unsigned int local_index = plane - mtk_crtc->planes; for (i = 0; i < mtk_crtc->ddp_comp_nr; i++) { comp = mtk_crtc->ddp_comp[i]; - if (plane->index < (count + mtk_ddp_comp_layer_nr(comp))) { - *local_layer = plane->index - count; + if (local_index < (count + mtk_ddp_comp_layer_nr(comp))) { + *local_layer = local_index - count; return comp; } count += mtk_ddp_comp_layer_nr(comp); @@ -310,7 +311,9 @@ static int mtk_crtc_ddp_hw_init(struct mtk_drm_crtc *mtk_crtc) plane_state = to_mtk_plane_state(plane->state); comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - mtk_ddp_comp_layer_config(comp, local_layer, plane_state); + if (comp) + mtk_ddp_comp_layer_config(comp, local_layer, + plane_state); } return 0; @@ -386,8 +389,9 @@ static void mtk_crtc_ddp_config(struct drm_crtc *crtc) comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - mtk_ddp_comp_layer_config(comp, local_layer, - plane_state); + if (comp) + mtk_ddp_comp_layer_config(comp, local_layer, + plane_state); plane_state->pending.config = false; } mtk_crtc->pending_planes = false; @@ -401,7 +405,9 @@ int mtk_drm_crtc_plane_check(struct drm_crtc *crtc, struct drm_plane *plane, struct mtk_ddp_comp *comp; comp = mtk_drm_ddp_comp_for_plane(crtc, plane, &local_layer); - return mtk_ddp_comp_layer_check(comp, local_layer, state); + if (comp) + return mtk_ddp_comp_layer_check(comp, local_layer, state); + return 0; } static void mtk_drm_crtc_atomic_enable(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c index e9931bbbe846..d77c9f484ce3 100644 --- a/drivers/gpu/drm/mediatek/mtk_dsi.c +++ b/drivers/gpu/drm/mediatek/mtk_dsi.c @@ -230,28 +230,25 @@ static void mtk_dsi_mask(struct mtk_dsi *dsi, u32 offset, u32 mask, u32 data) static void mtk_dsi_phy_timconfig(struct mtk_dsi *dsi) { u32 timcon0, timcon1, timcon2, timcon3; - u32 ui, cycle_time; + u32 data_rate_mhz = DIV_ROUND_UP(dsi->data_rate, 1000000); struct mtk_phy_timing *timing = &dsi->phy_timing; - ui = DIV_ROUND_UP(1000000000, dsi->data_rate); - cycle_time = div_u64(8000000000ULL, dsi->data_rate); + timing->lpx = (60 * data_rate_mhz / (8 * 1000)) + 1; + timing->da_hs_prepare = (80 * data_rate_mhz + 4 * 1000) / 8000; + timing->da_hs_zero = (170 * data_rate_mhz + 10 * 1000) / 8000 + 1 - + timing->da_hs_prepare; + timing->da_hs_trail = timing->da_hs_prepare + 1; - timing->lpx = NS_TO_CYCLE(60, cycle_time); - timing->da_hs_prepare = NS_TO_CYCLE(50 + 5 * ui, cycle_time); - timing->da_hs_zero = NS_TO_CYCLE(110 + 6 * ui, cycle_time); - timing->da_hs_trail = NS_TO_CYCLE(77 + 4 * ui, cycle_time); + timing->ta_go = 4 * timing->lpx - 2; + timing->ta_sure = timing->lpx + 2; + timing->ta_get = 4 * timing->lpx; + timing->da_hs_exit = 2 * timing->lpx + 1; - timing->ta_go = 4 * timing->lpx; - timing->ta_sure = 3 * timing->lpx / 2; - timing->ta_get = 5 * timing->lpx; - timing->da_hs_exit = 2 * timing->lpx; - - timing->clk_hs_zero = NS_TO_CYCLE(336, cycle_time); - timing->clk_hs_trail = NS_TO_CYCLE(100, cycle_time) + 10; - - timing->clk_hs_prepare = NS_TO_CYCLE(64, cycle_time); - timing->clk_hs_post = NS_TO_CYCLE(80 + 52 * ui, cycle_time); - timing->clk_hs_exit = 2 * timing->lpx; + timing->clk_hs_prepare = 70 * data_rate_mhz / (8 * 1000); + timing->clk_hs_post = timing->clk_hs_prepare + 8; + timing->clk_hs_trail = timing->clk_hs_prepare; + timing->clk_hs_zero = timing->clk_hs_trail * 4; + timing->clk_hs_exit = 2 * timing->clk_hs_trail; timcon0 = timing->lpx | timing->da_hs_prepare << 8 | timing->da_hs_zero << 16 | timing->da_hs_trail << 24; @@ -482,27 +479,39 @@ static void mtk_dsi_config_vdo_timing(struct mtk_dsi *dsi) dsi_tmp_buf_bpp - 10); data_phy_cycles = timing->lpx + timing->da_hs_prepare + - timing->da_hs_zero + timing->da_hs_exit + 2; + timing->da_hs_zero + timing->da_hs_exit + 3; if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO_BURST) { - if (vm->hfront_porch * dsi_tmp_buf_bpp > + if ((vm->hfront_porch + vm->hback_porch) * dsi_tmp_buf_bpp > data_phy_cycles * dsi->lanes + 18) { - horizontal_frontporch_byte = vm->hfront_porch * - dsi_tmp_buf_bpp - - data_phy_cycles * - dsi->lanes - 18; + horizontal_frontporch_byte = + vm->hfront_porch * dsi_tmp_buf_bpp - + (data_phy_cycles * dsi->lanes + 18) * + vm->hfront_porch / + (vm->hfront_porch + vm->hback_porch); + + horizontal_backporch_byte = + horizontal_backporch_byte - + (data_phy_cycles * dsi->lanes + 18) * + vm->hback_porch / + (vm->hfront_porch + vm->hback_porch); } else { DRM_WARN("HFP less than d-phy, FPS will under 60Hz\n"); horizontal_frontporch_byte = vm->hfront_porch * dsi_tmp_buf_bpp; } } else { - if (vm->hfront_porch * dsi_tmp_buf_bpp > + if ((vm->hfront_porch + vm->hback_porch) * dsi_tmp_buf_bpp > data_phy_cycles * dsi->lanes + 12) { - horizontal_frontporch_byte = vm->hfront_porch * - dsi_tmp_buf_bpp - - data_phy_cycles * - dsi->lanes - 12; + horizontal_frontporch_byte = + vm->hfront_porch * dsi_tmp_buf_bpp - + (data_phy_cycles * dsi->lanes + 12) * + vm->hfront_porch / + (vm->hfront_porch + vm->hback_porch); + horizontal_backporch_byte = horizontal_backporch_byte - + (data_phy_cycles * dsi->lanes + 12) * + vm->hback_porch / + (vm->hfront_porch + vm->hback_porch); } else { DRM_WARN("HFP less than d-phy, FPS will under 60Hz\n"); horizontal_frontporch_byte = vm->hfront_porch * diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index f61364f7c471..88b431a267af 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -78,8 +78,10 @@ static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data, struct drm_file *file) { + struct panfrost_file_priv *priv = file->driver_priv; struct panfrost_gem_object *bo; struct drm_panfrost_create_bo *args = data; + struct panfrost_gem_mapping *mapping; if (!args->size || args->pad || (args->flags & ~(PANFROST_BO_NOEXEC | PANFROST_BO_HEAP))) @@ -95,7 +97,14 @@ static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data, if (IS_ERR(bo)) return PTR_ERR(bo); - args->offset = bo->node.start << PAGE_SHIFT; + mapping = panfrost_gem_mapping_get(bo, priv); + if (!mapping) { + drm_gem_object_put_unlocked(&bo->base.base); + return -EINVAL; + } + + args->offset = mapping->mmnode.start << PAGE_SHIFT; + panfrost_gem_mapping_put(mapping); return 0; } @@ -119,6 +128,11 @@ panfrost_lookup_bos(struct drm_device *dev, struct drm_panfrost_submit *args, struct panfrost_job *job) { + struct panfrost_file_priv *priv = file_priv->driver_priv; + struct panfrost_gem_object *bo; + unsigned int i; + int ret; + job->bo_count = args->bo_handle_count; if (!job->bo_count) @@ -130,9 +144,32 @@ panfrost_lookup_bos(struct drm_device *dev, if (!job->implicit_fences) return -ENOMEM; - return drm_gem_objects_lookup(file_priv, - (void __user *)(uintptr_t)args->bo_handles, - job->bo_count, &job->bos); + ret = drm_gem_objects_lookup(file_priv, + (void __user *)(uintptr_t)args->bo_handles, + job->bo_count, &job->bos); + if (ret) + return ret; + + job->mappings = kvmalloc_array(job->bo_count, + sizeof(struct panfrost_gem_mapping *), + GFP_KERNEL | __GFP_ZERO); + if (!job->mappings) + return -ENOMEM; + + for (i = 0; i < job->bo_count; i++) { + struct panfrost_gem_mapping *mapping; + + bo = to_panfrost_bo(job->bos[i]); + mapping = panfrost_gem_mapping_get(bo, priv); + if (!mapping) { + ret = -EINVAL; + break; + } + + job->mappings[i] = mapping; + } + + return ret; } /** @@ -320,7 +357,9 @@ out: static int panfrost_ioctl_get_bo_offset(struct drm_device *dev, void *data, struct drm_file *file_priv) { + struct panfrost_file_priv *priv = file_priv->driver_priv; struct drm_panfrost_get_bo_offset *args = data; + struct panfrost_gem_mapping *mapping; struct drm_gem_object *gem_obj; struct panfrost_gem_object *bo; @@ -331,18 +370,26 @@ static int panfrost_ioctl_get_bo_offset(struct drm_device *dev, void *data, } bo = to_panfrost_bo(gem_obj); - args->offset = bo->node.start << PAGE_SHIFT; - + mapping = panfrost_gem_mapping_get(bo, priv); drm_gem_object_put_unlocked(gem_obj); + + if (!mapping) + return -EINVAL; + + args->offset = mapping->mmnode.start << PAGE_SHIFT; + panfrost_gem_mapping_put(mapping); return 0; } static int panfrost_ioctl_madvise(struct drm_device *dev, void *data, struct drm_file *file_priv) { + struct panfrost_file_priv *priv = file_priv->driver_priv; struct drm_panfrost_madvise *args = data; struct panfrost_device *pfdev = dev->dev_private; struct drm_gem_object *gem_obj; + struct panfrost_gem_object *bo; + int ret = 0; gem_obj = drm_gem_object_lookup(file_priv, args->handle); if (!gem_obj) { @@ -350,22 +397,48 @@ static int panfrost_ioctl_madvise(struct drm_device *dev, void *data, return -ENOENT; } + bo = to_panfrost_bo(gem_obj); + mutex_lock(&pfdev->shrinker_lock); + mutex_lock(&bo->mappings.lock); + if (args->madv == PANFROST_MADV_DONTNEED) { + struct panfrost_gem_mapping *first; + + first = list_first_entry(&bo->mappings.list, + struct panfrost_gem_mapping, + node); + + /* + * If we want to mark the BO purgeable, there must be only one + * user: the caller FD. + * We could do something smarter and mark the BO purgeable only + * when all its users have marked it purgeable, but globally + * visible/shared BOs are likely to never be marked purgeable + * anyway, so let's not bother. + */ + if (!list_is_singular(&bo->mappings.list) || + WARN_ON_ONCE(first->mmu != &priv->mmu)) { + ret = -EINVAL; + goto out_unlock_mappings; + } + } + args->retained = drm_gem_shmem_madvise(gem_obj, args->madv); if (args->retained) { - struct panfrost_gem_object *bo = to_panfrost_bo(gem_obj); - if (args->madv == PANFROST_MADV_DONTNEED) list_add_tail(&bo->base.madv_list, &pfdev->shrinker_list); else if (args->madv == PANFROST_MADV_WILLNEED) list_del_init(&bo->base.madv_list); } + +out_unlock_mappings: + mutex_unlock(&bo->mappings.lock); mutex_unlock(&pfdev->shrinker_lock); drm_gem_object_put_unlocked(gem_obj); - return 0; + return ret; } int panfrost_unstable_ioctl_check(void) diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index fd766b1395fb..17b654e1eb94 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -29,6 +29,12 @@ static void panfrost_gem_free_object(struct drm_gem_object *obj) list_del_init(&bo->base.madv_list); mutex_unlock(&pfdev->shrinker_lock); + /* + * If we still have mappings attached to the BO, there's a problem in + * our refcounting. + */ + WARN_ON_ONCE(!list_empty(&bo->mappings.list)); + if (bo->sgts) { int i; int n_sgt = bo->base.base.size / SZ_2M; @@ -46,6 +52,69 @@ static void panfrost_gem_free_object(struct drm_gem_object *obj) drm_gem_shmem_free_object(obj); } +struct panfrost_gem_mapping * +panfrost_gem_mapping_get(struct panfrost_gem_object *bo, + struct panfrost_file_priv *priv) +{ + struct panfrost_gem_mapping *iter, *mapping = NULL; + + mutex_lock(&bo->mappings.lock); + list_for_each_entry(iter, &bo->mappings.list, node) { + if (iter->mmu == &priv->mmu) { + kref_get(&iter->refcount); + mapping = iter; + break; + } + } + mutex_unlock(&bo->mappings.lock); + + return mapping; +} + +static void +panfrost_gem_teardown_mapping(struct panfrost_gem_mapping *mapping) +{ + struct panfrost_file_priv *priv; + + if (mapping->active) + panfrost_mmu_unmap(mapping); + + priv = container_of(mapping->mmu, struct panfrost_file_priv, mmu); + spin_lock(&priv->mm_lock); + if (drm_mm_node_allocated(&mapping->mmnode)) + drm_mm_remove_node(&mapping->mmnode); + spin_unlock(&priv->mm_lock); +} + +static void panfrost_gem_mapping_release(struct kref *kref) +{ + struct panfrost_gem_mapping *mapping; + + mapping = container_of(kref, struct panfrost_gem_mapping, refcount); + + panfrost_gem_teardown_mapping(mapping); + drm_gem_object_put_unlocked(&mapping->obj->base.base); + kfree(mapping); +} + +void panfrost_gem_mapping_put(struct panfrost_gem_mapping *mapping) +{ + if (!mapping) + return; + + kref_put(&mapping->refcount, panfrost_gem_mapping_release); +} + +void panfrost_gem_teardown_mappings(struct panfrost_gem_object *bo) +{ + struct panfrost_gem_mapping *mapping; + + mutex_lock(&bo->mappings.lock); + list_for_each_entry(mapping, &bo->mappings.list, node) + panfrost_gem_teardown_mapping(mapping); + mutex_unlock(&bo->mappings.lock); +} + int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv) { int ret; @@ -54,6 +123,16 @@ int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv) struct panfrost_gem_object *bo = to_panfrost_bo(obj); unsigned long color = bo->noexec ? PANFROST_BO_NOEXEC : 0; struct panfrost_file_priv *priv = file_priv->driver_priv; + struct panfrost_gem_mapping *mapping; + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return -ENOMEM; + + INIT_LIST_HEAD(&mapping->node); + kref_init(&mapping->refcount); + drm_gem_object_get(obj); + mapping->obj = bo; /* * Executable buffers cannot cross a 16MB boundary as the program @@ -66,37 +145,48 @@ int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv) else align = size >= SZ_2M ? SZ_2M >> PAGE_SHIFT : 0; - bo->mmu = &priv->mmu; + mapping->mmu = &priv->mmu; spin_lock(&priv->mm_lock); - ret = drm_mm_insert_node_generic(&priv->mm, &bo->node, + ret = drm_mm_insert_node_generic(&priv->mm, &mapping->mmnode, size >> PAGE_SHIFT, align, color, 0); spin_unlock(&priv->mm_lock); if (ret) - return ret; + goto err; if (!bo->is_heap) { - ret = panfrost_mmu_map(bo); - if (ret) { - spin_lock(&priv->mm_lock); - drm_mm_remove_node(&bo->node); - spin_unlock(&priv->mm_lock); - } + ret = panfrost_mmu_map(mapping); + if (ret) + goto err; } + + mutex_lock(&bo->mappings.lock); + WARN_ON(bo->base.madv != PANFROST_MADV_WILLNEED); + list_add_tail(&mapping->node, &bo->mappings.list); + mutex_unlock(&bo->mappings.lock); + +err: + if (ret) + panfrost_gem_mapping_put(mapping); return ret; } void panfrost_gem_close(struct drm_gem_object *obj, struct drm_file *file_priv) { - struct panfrost_gem_object *bo = to_panfrost_bo(obj); struct panfrost_file_priv *priv = file_priv->driver_priv; + struct panfrost_gem_object *bo = to_panfrost_bo(obj); + struct panfrost_gem_mapping *mapping = NULL, *iter; - if (bo->is_mapped) - panfrost_mmu_unmap(bo); + mutex_lock(&bo->mappings.lock); + list_for_each_entry(iter, &bo->mappings.list, node) { + if (iter->mmu == &priv->mmu) { + mapping = iter; + list_del(&iter->node); + break; + } + } + mutex_unlock(&bo->mappings.lock); - spin_lock(&priv->mm_lock); - if (drm_mm_node_allocated(&bo->node)) - drm_mm_remove_node(&bo->node); - spin_unlock(&priv->mm_lock); + panfrost_gem_mapping_put(mapping); } static int panfrost_gem_pin(struct drm_gem_object *obj) @@ -136,6 +226,8 @@ struct drm_gem_object *panfrost_gem_create_object(struct drm_device *dev, size_t if (!obj) return NULL; + INIT_LIST_HEAD(&obj->mappings.list); + mutex_init(&obj->mappings.lock); obj->base.base.funcs = &panfrost_gem_funcs; return &obj->base.base; diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h index 4b17e7308764..ca1bc9019600 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.h +++ b/drivers/gpu/drm/panfrost/panfrost_gem.h @@ -13,23 +13,46 @@ struct panfrost_gem_object { struct drm_gem_shmem_object base; struct sg_table *sgts; - struct panfrost_mmu *mmu; - struct drm_mm_node node; - bool is_mapped :1; + /* + * Use a list for now. If searching a mapping ever becomes the + * bottleneck, we should consider using an RB-tree, or even better, + * let the core store drm_gem_object_mapping entries (where we + * could place driver specific data) instead of drm_gem_object ones + * in its drm_file->object_idr table. + * + * struct drm_gem_object_mapping { + * struct drm_gem_object *obj; + * void *driver_priv; + * }; + */ + struct { + struct list_head list; + struct mutex lock; + } mappings; + bool noexec :1; bool is_heap :1; }; +struct panfrost_gem_mapping { + struct list_head node; + struct kref refcount; + struct panfrost_gem_object *obj; + struct drm_mm_node mmnode; + struct panfrost_mmu *mmu; + bool active :1; +}; + static inline struct panfrost_gem_object *to_panfrost_bo(struct drm_gem_object *obj) { return container_of(to_drm_gem_shmem_obj(obj), struct panfrost_gem_object, base); } -static inline -struct panfrost_gem_object *drm_mm_node_to_panfrost_bo(struct drm_mm_node *node) +static inline struct panfrost_gem_mapping * +drm_mm_node_to_panfrost_mapping(struct drm_mm_node *node) { - return container_of(node, struct panfrost_gem_object, node); + return container_of(node, struct panfrost_gem_mapping, mmnode); } struct drm_gem_object *panfrost_gem_create_object(struct drm_device *dev, size_t size); @@ -49,6 +72,12 @@ int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv); void panfrost_gem_close(struct drm_gem_object *obj, struct drm_file *file_priv); +struct panfrost_gem_mapping * +panfrost_gem_mapping_get(struct panfrost_gem_object *bo, + struct panfrost_file_priv *priv); +void panfrost_gem_mapping_put(struct panfrost_gem_mapping *mapping); +void panfrost_gem_teardown_mappings(struct panfrost_gem_object *bo); + void panfrost_gem_shrinker_init(struct drm_device *dev); void panfrost_gem_shrinker_cleanup(struct drm_device *dev); diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index 458f0fa68111..f5dd7b29bc95 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -39,11 +39,12 @@ panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc static bool panfrost_gem_purge(struct drm_gem_object *obj) { struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj); + struct panfrost_gem_object *bo = to_panfrost_bo(obj); if (!mutex_trylock(&shmem->pages_lock)) return false; - panfrost_mmu_unmap(to_panfrost_bo(obj)); + panfrost_gem_teardown_mappings(bo); drm_gem_shmem_purge_locked(obj); mutex_unlock(&shmem->pages_lock); diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index d411eb6c8eb9..e364ee00f3d0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -268,9 +268,20 @@ static void panfrost_job_cleanup(struct kref *ref) dma_fence_put(job->done_fence); dma_fence_put(job->render_done_fence); - if (job->bos) { + if (job->mappings) { for (i = 0; i < job->bo_count; i++) + panfrost_gem_mapping_put(job->mappings[i]); + kvfree(job->mappings); + } + + if (job->bos) { + struct panfrost_gem_object *bo; + + for (i = 0; i < job->bo_count; i++) { + bo = to_panfrost_bo(job->bos[i]); drm_gem_object_put_unlocked(job->bos[i]); + } + kvfree(job->bos); } diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h index 62454128a792..bbd3ba97ff67 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.h +++ b/drivers/gpu/drm/panfrost/panfrost_job.h @@ -32,6 +32,7 @@ struct panfrost_job { /* Exclusive fences we have taken from the BOs to wait for */ struct dma_fence **implicit_fences; + struct panfrost_gem_mapping **mappings; struct drm_gem_object **bos; u32 bo_count; diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index a3ed64a1f15e..763cfca886a7 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -269,14 +269,15 @@ static int mmu_map_sg(struct panfrost_device *pfdev, struct panfrost_mmu *mmu, return 0; } -int panfrost_mmu_map(struct panfrost_gem_object *bo) +int panfrost_mmu_map(struct panfrost_gem_mapping *mapping) { + struct panfrost_gem_object *bo = mapping->obj; struct drm_gem_object *obj = &bo->base.base; struct panfrost_device *pfdev = to_panfrost_device(obj->dev); struct sg_table *sgt; int prot = IOMMU_READ | IOMMU_WRITE; - if (WARN_ON(bo->is_mapped)) + if (WARN_ON(mapping->active)) return 0; if (bo->noexec) @@ -286,25 +287,28 @@ int panfrost_mmu_map(struct panfrost_gem_object *bo) if (WARN_ON(IS_ERR(sgt))) return PTR_ERR(sgt); - mmu_map_sg(pfdev, bo->mmu, bo->node.start << PAGE_SHIFT, prot, sgt); - bo->is_mapped = true; + mmu_map_sg(pfdev, mapping->mmu, mapping->mmnode.start << PAGE_SHIFT, + prot, sgt); + mapping->active = true; return 0; } -void panfrost_mmu_unmap(struct panfrost_gem_object *bo) +void panfrost_mmu_unmap(struct panfrost_gem_mapping *mapping) { + struct panfrost_gem_object *bo = mapping->obj; struct drm_gem_object *obj = &bo->base.base; struct panfrost_device *pfdev = to_panfrost_device(obj->dev); - struct io_pgtable_ops *ops = bo->mmu->pgtbl_ops; - u64 iova = bo->node.start << PAGE_SHIFT; - size_t len = bo->node.size << PAGE_SHIFT; + struct io_pgtable_ops *ops = mapping->mmu->pgtbl_ops; + u64 iova = mapping->mmnode.start << PAGE_SHIFT; + size_t len = mapping->mmnode.size << PAGE_SHIFT; size_t unmapped_len = 0; - if (WARN_ON(!bo->is_mapped)) + if (WARN_ON(!mapping->active)) return; - dev_dbg(pfdev->dev, "unmap: as=%d, iova=%llx, len=%zx", bo->mmu->as, iova, len); + dev_dbg(pfdev->dev, "unmap: as=%d, iova=%llx, len=%zx", + mapping->mmu->as, iova, len); while (unmapped_len < len) { size_t unmapped_page; @@ -318,8 +322,9 @@ void panfrost_mmu_unmap(struct panfrost_gem_object *bo) unmapped_len += pgsize; } - panfrost_mmu_flush_range(pfdev, bo->mmu, bo->node.start << PAGE_SHIFT, len); - bo->is_mapped = false; + panfrost_mmu_flush_range(pfdev, mapping->mmu, + mapping->mmnode.start << PAGE_SHIFT, len); + mapping->active = false; } static void mmu_tlb_inv_context_s1(void *cookie) @@ -394,10 +399,10 @@ void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv) free_io_pgtable_ops(mmu->pgtbl_ops); } -static struct panfrost_gem_object * -addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr) +static struct panfrost_gem_mapping * +addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) { - struct panfrost_gem_object *bo = NULL; + struct panfrost_gem_mapping *mapping = NULL; struct panfrost_file_priv *priv; struct drm_mm_node *node; u64 offset = addr >> PAGE_SHIFT; @@ -418,8 +423,9 @@ found_mmu: drm_mm_for_each_node(node, &priv->mm) { if (offset >= node->start && offset < (node->start + node->size)) { - bo = drm_mm_node_to_panfrost_bo(node); - drm_gem_object_get(&bo->base.base); + mapping = drm_mm_node_to_panfrost_mapping(node); + + kref_get(&mapping->refcount); break; } } @@ -427,7 +433,7 @@ found_mmu: spin_unlock(&priv->mm_lock); out: spin_unlock(&pfdev->as_lock); - return bo; + return mapping; } #define NUM_FAULT_PAGES (SZ_2M / PAGE_SIZE) @@ -436,28 +442,30 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr) { int ret, i; + struct panfrost_gem_mapping *bomapping; struct panfrost_gem_object *bo; struct address_space *mapping; pgoff_t page_offset; struct sg_table *sgt; struct page **pages; - bo = addr_to_drm_mm_node(pfdev, as, addr); - if (!bo) + bomapping = addr_to_mapping(pfdev, as, addr); + if (!bomapping) return -ENOENT; + bo = bomapping->obj; if (!bo->is_heap) { dev_WARN(pfdev->dev, "matching BO is not heap type (GPU VA = %llx)", - bo->node.start << PAGE_SHIFT); + bomapping->mmnode.start << PAGE_SHIFT); ret = -EINVAL; goto err_bo; } - WARN_ON(bo->mmu->as != as); + WARN_ON(bomapping->mmu->as != as); /* Assume 2MB alignment and size multiple */ addr &= ~((u64)SZ_2M - 1); page_offset = addr >> PAGE_SHIFT; - page_offset -= bo->node.start; + page_offset -= bomapping->mmnode.start; mutex_lock(&bo->base.pages_lock); @@ -509,13 +517,14 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, goto err_map; } - mmu_map_sg(pfdev, bo->mmu, addr, IOMMU_WRITE | IOMMU_READ | IOMMU_NOEXEC, sgt); + mmu_map_sg(pfdev, bomapping->mmu, addr, + IOMMU_WRITE | IOMMU_READ | IOMMU_NOEXEC, sgt); - bo->is_mapped = true; + bomapping->active = true; dev_dbg(pfdev->dev, "mapped page fault @ AS%d %llx", as, addr); - drm_gem_object_put_unlocked(&bo->base.base); + panfrost_gem_mapping_put(bomapping); return 0; diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.h b/drivers/gpu/drm/panfrost/panfrost_mmu.h index 7c5b6775ae23..44fc2edf63ce 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.h +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.h @@ -4,12 +4,12 @@ #ifndef __PANFROST_MMU_H__ #define __PANFROST_MMU_H__ -struct panfrost_gem_object; +struct panfrost_gem_mapping; struct panfrost_file_priv; struct panfrost_mmu; -int panfrost_mmu_map(struct panfrost_gem_object *bo); -void panfrost_mmu_unmap(struct panfrost_gem_object *bo); +int panfrost_mmu_map(struct panfrost_gem_mapping *mapping); +void panfrost_mmu_unmap(struct panfrost_gem_mapping *mapping); int panfrost_mmu_init(struct panfrost_device *pfdev); void panfrost_mmu_fini(struct panfrost_device *pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c index 2c04e858c50a..684820448be3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c @@ -25,7 +25,7 @@ #define V4_SHADERS_PER_COREGROUP 4 struct panfrost_perfcnt { - struct panfrost_gem_object *bo; + struct panfrost_gem_mapping *mapping; size_t bosize; void *buf; struct panfrost_file_priv *user; @@ -49,7 +49,7 @@ static int panfrost_perfcnt_dump_locked(struct panfrost_device *pfdev) int ret; reinit_completion(&pfdev->perfcnt->dump_comp); - gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT; + gpuva = pfdev->perfcnt->mapping->mmnode.start << PAGE_SHIFT; gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva); gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32); gpu_write(pfdev, GPU_INT_CLEAR, @@ -89,17 +89,22 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev, if (IS_ERR(bo)) return PTR_ERR(bo); - perfcnt->bo = to_panfrost_bo(&bo->base); - /* Map the perfcnt buf in the address space attached to file_priv. */ - ret = panfrost_gem_open(&perfcnt->bo->base.base, file_priv); + ret = panfrost_gem_open(&bo->base, file_priv); if (ret) goto err_put_bo; + perfcnt->mapping = panfrost_gem_mapping_get(to_panfrost_bo(&bo->base), + user); + if (!perfcnt->mapping) { + ret = -EINVAL; + goto err_close_bo; + } + perfcnt->buf = drm_gem_shmem_vmap(&bo->base); if (IS_ERR(perfcnt->buf)) { ret = PTR_ERR(perfcnt->buf); - goto err_close_bo; + goto err_put_mapping; } /* @@ -154,12 +159,17 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev, if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186)) gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff); + /* The BO ref is retained by the mapping. */ + drm_gem_object_put_unlocked(&bo->base); + return 0; err_vunmap: - drm_gem_shmem_vunmap(&perfcnt->bo->base.base, perfcnt->buf); + drm_gem_shmem_vunmap(&bo->base, perfcnt->buf); +err_put_mapping: + panfrost_gem_mapping_put(perfcnt->mapping); err_close_bo: - panfrost_gem_close(&perfcnt->bo->base.base, file_priv); + panfrost_gem_close(&bo->base, file_priv); err_put_bo: drm_gem_object_put_unlocked(&bo->base); return ret; @@ -182,11 +192,11 @@ static int panfrost_perfcnt_disable_locked(struct panfrost_device *pfdev, GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF)); perfcnt->user = NULL; - drm_gem_shmem_vunmap(&perfcnt->bo->base.base, perfcnt->buf); + drm_gem_shmem_vunmap(&perfcnt->mapping->obj->base.base, perfcnt->buf); perfcnt->buf = NULL; - panfrost_gem_close(&perfcnt->bo->base.base, file_priv); - drm_gem_object_put_unlocked(&perfcnt->bo->base.base); - perfcnt->bo = NULL; + panfrost_gem_close(&perfcnt->mapping->obj->base.base, file_priv); + panfrost_gem_mapping_put(perfcnt->mapping); + perfcnt->mapping = NULL; pm_runtime_mark_last_busy(pfdev->dev); pm_runtime_put_autosuspend(pfdev->dev); diff --git a/drivers/gpu/drm/rockchip/cdn-dp-core.h b/drivers/gpu/drm/rockchip/cdn-dp-core.h index 83c4586665b4..81ac9b658a70 100644 --- a/drivers/gpu/drm/rockchip/cdn-dp-core.h +++ b/drivers/gpu/drm/rockchip/cdn-dp-core.h @@ -95,7 +95,7 @@ struct cdn_dp_device { struct cdn_dp_port *port[MAX_PHY]; u8 ports; u8 max_lanes; - u8 max_rate; + unsigned int max_rate; u8 lanes; int active_port; diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c index a7c4654445c7..68d4644ac2dc 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c @@ -685,8 +685,6 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master, struct sun4i_hdmi *hdmi = dev_get_drvdata(dev); cec_unregister_adapter(hdmi->cec_adap); - drm_connector_cleanup(&hdmi->connector); - drm_encoder_cleanup(&hdmi->encoder); i2c_del_adapter(hdmi->i2c); i2c_put_adapter(hdmi->ddc_i2c); clk_disable_unprepare(hdmi->mod_clk); diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c index 42651d737c55..c81cdce6ed55 100644 --- a/drivers/gpu/drm/sun4i/sun4i_tcon.c +++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c @@ -489,7 +489,7 @@ static void sun4i_tcon0_mode_set_rgb(struct sun4i_tcon *tcon, WARN_ON(!tcon->quirks->has_channel_0); - tcon->dclk_min_div = 1; + tcon->dclk_min_div = tcon->quirks->dclk_min_div; tcon->dclk_max_div = 127; sun4i_tcon0_mode_set_common(tcon, mode); @@ -1426,12 +1426,14 @@ static int sun8i_r40_tcon_tv_set_mux(struct sun4i_tcon *tcon, static const struct sun4i_tcon_quirks sun4i_a10_quirks = { .has_channel_0 = true, .has_channel_1 = true, + .dclk_min_div = 4, .set_mux = sun4i_a10_tcon_set_mux, }; static const struct sun4i_tcon_quirks sun5i_a13_quirks = { .has_channel_0 = true, .has_channel_1 = true, + .dclk_min_div = 4, .set_mux = sun5i_a13_tcon_set_mux, }; @@ -1440,6 +1442,7 @@ static const struct sun4i_tcon_quirks sun6i_a31_quirks = { .has_channel_1 = true, .has_lvds_alt = true, .needs_de_be_mux = true, + .dclk_min_div = 1, .set_mux = sun6i_tcon_set_mux, }; @@ -1447,11 +1450,13 @@ static const struct sun4i_tcon_quirks sun6i_a31s_quirks = { .has_channel_0 = true, .has_channel_1 = true, .needs_de_be_mux = true, + .dclk_min_div = 1, }; static const struct sun4i_tcon_quirks sun7i_a20_quirks = { .has_channel_0 = true, .has_channel_1 = true, + .dclk_min_div = 4, /* Same display pipeline structure as A10 */ .set_mux = sun4i_a10_tcon_set_mux, }; @@ -1459,11 +1464,13 @@ static const struct sun4i_tcon_quirks sun7i_a20_quirks = { static const struct sun4i_tcon_quirks sun8i_a33_quirks = { .has_channel_0 = true, .has_lvds_alt = true, + .dclk_min_div = 1, }; static const struct sun4i_tcon_quirks sun8i_a83t_lcd_quirks = { .supports_lvds = true, .has_channel_0 = true, + .dclk_min_div = 1, }; static const struct sun4i_tcon_quirks sun8i_a83t_tv_quirks = { @@ -1477,11 +1484,13 @@ static const struct sun4i_tcon_quirks sun8i_r40_tv_quirks = { static const struct sun4i_tcon_quirks sun8i_v3s_quirks = { .has_channel_0 = true, + .dclk_min_div = 1, }; static const struct sun4i_tcon_quirks sun9i_a80_tcon_lcd_quirks = { - .has_channel_0 = true, - .needs_edp_reset = true, + .has_channel_0 = true, + .needs_edp_reset = true, + .dclk_min_div = 1, }; static const struct sun4i_tcon_quirks sun9i_a80_tcon_tv_quirks = { diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.h b/drivers/gpu/drm/sun4i/sun4i_tcon.h index f9f1fe80b206..a62ec826ae71 100644 --- a/drivers/gpu/drm/sun4i/sun4i_tcon.h +++ b/drivers/gpu/drm/sun4i/sun4i_tcon.h @@ -224,6 +224,7 @@ struct sun4i_tcon_quirks { bool needs_de_be_mux; /* sun6i needs mux to select backend */ bool needs_edp_reset; /* a80 edp reset needed for tcon0 access */ bool supports_lvds; /* Does the TCON support an LVDS output? */ + u8 dclk_min_div; /* minimum divider for TCON0 DCLK */ /* callback to handle tcon muxing options */ int (*set_mux)(struct sun4i_tcon *, const struct drm_encoder *); diff --git a/drivers/gpu/drm/virtio/virtgpu_plane.c b/drivers/gpu/drm/virtio/virtgpu_plane.c index 390524143139..1635a9ff4794 100644 --- a/drivers/gpu/drm/virtio/virtgpu_plane.c +++ b/drivers/gpu/drm/virtio/virtgpu_plane.c @@ -232,6 +232,7 @@ static void virtio_gpu_cursor_plane_update(struct drm_plane *plane, if (!objs) return; virtio_gpu_array_add_obj(objs, vgfb->base.obj[0]); + virtio_gpu_array_lock_resv(objs); virtio_gpu_cmd_transfer_to_host_2d (vgdev, 0, plane->state->crtc_w, diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 8063b1d567b1..e6e4c841fb06 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -261,7 +261,8 @@ static int asus_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { if ((usage->hid & HID_USAGE_PAGE) == 0xff310000 && - (usage->hid & HID_USAGE) != 0x00 && !usage->type) { + (usage->hid & HID_USAGE) != 0x00 && + (usage->hid & HID_USAGE) != 0xff && !usage->type) { hid_warn(hdev, "Unmapped Asus vendor usagepage code 0x%02x\n", usage->hid & HID_USAGE); } diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index e0b241bd3070..851fe54ea59e 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -288,6 +288,12 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign offset = report->size; report->size += parser->global.report_size * parser->global.report_count; + /* Total size check: Allow for possible report index byte */ + if (report->size > (HID_MAX_BUFFER_SIZE - 1) << 3) { + hid_err(parser->device, "report is too long\n"); + return -1; + } + if (!parser->local.usage_index) /* Ignore padding fields */ return 0; diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 7e1689ef35f5..3a400ce603c4 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -631,6 +631,7 @@ #define USB_VENDOR_ID_ITE 0x048d #define USB_DEVICE_ID_ITE_LENOVO_YOGA 0x8386 #define USB_DEVICE_ID_ITE_LENOVO_YOGA2 0x8350 +#define I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720 0x837a #define USB_DEVICE_ID_ITE_LENOVO_YOGA900 0x8396 #define USB_DEVICE_ID_ITE8595 0x8595 @@ -730,6 +731,7 @@ #define USB_DEVICE_ID_LG_MULTITOUCH 0x0064 #define USB_DEVICE_ID_LG_MELFAS_MT 0x6007 #define I2C_DEVICE_ID_LG_8001 0x8001 +#define I2C_DEVICE_ID_LG_7010 0x7010 #define USB_VENDOR_ID_LOGITECH 0x046d #define USB_DEVICE_ID_LOGITECH_AUDIOHUB 0x0a0e @@ -1102,6 +1104,7 @@ #define USB_DEVICE_ID_SYNAPTICS_LTS2 0x1d10 #define USB_DEVICE_ID_SYNAPTICS_HD 0x0ac3 #define USB_DEVICE_ID_SYNAPTICS_QUAD_HD 0x1ac3 +#define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012 0x2968 #define USB_DEVICE_ID_SYNAPTICS_TP_V103 0x5710 #define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5 0x81a7 diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 63855f275a38..dea9cc65bf80 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1132,9 +1132,15 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel } mapped: - if (device->driver->input_mapped && device->driver->input_mapped(device, - hidinput, field, usage, &bit, &max) < 0) - goto ignore; + if (device->driver->input_mapped && + device->driver->input_mapped(device, hidinput, field, usage, + &bit, &max) < 0) { + /* + * The driver indicated that no further generic handling + * of the usage is desired. + */ + return; + } set_bit(usage->type, input->evbit); @@ -1215,9 +1221,11 @@ mapped: set_bit(MSC_SCAN, input->mscbit); } -ignore: return; +ignore: + usage->type = 0; + usage->code = 0; } static void hidinput_handle_scroll(struct hid_usage *usage, diff --git a/drivers/hid/hid-ite.c b/drivers/hid/hid-ite.c index a45f2352618d..c436e12feb23 100644 --- a/drivers/hid/hid-ite.c +++ b/drivers/hid/hid-ite.c @@ -40,6 +40,9 @@ static int ite_event(struct hid_device *hdev, struct hid_field *field, static const struct hid_device_id ite_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ITE, USB_DEVICE_ID_ITE8595) }, { HID_USB_DEVICE(USB_VENDOR_ID_258A, USB_DEVICE_ID_258A_6A88) }, + /* ITE8595 USB kbd ctlr, with Synaptics touchpad connected to it. */ + { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, + USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012) }, { } }; MODULE_DEVICE_TABLE(hid, ite_devices); diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 3cfeb1629f79..362805ddf377 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1019,7 +1019,7 @@ static int mt_process_slot(struct mt_device *td, struct input_dev *input, tool = MT_TOOL_DIAL; else if (unlikely(!confidence_state)) { tool = MT_TOOL_PALM; - if (!active && + if (!active && mt && input_mt_is_active(&mt->slots[slotnum])) { /* * The non-confidence was reported for @@ -1985,6 +1985,9 @@ static const struct hid_device_id mt_devices[] = { { .driver_data = MT_CLS_LG, HID_USB_DEVICE(USB_VENDOR_ID_LG, USB_DEVICE_ID_LG_MELFAS_MT) }, + { .driver_data = MT_CLS_LG, + HID_DEVICE(BUS_I2C, HID_GROUP_GENERIC, + USB_VENDOR_ID_LG, I2C_DEVICE_ID_LG_7010) }, /* MosArt panels */ { .driver_data = MT_CLS_CONFIDENCE_MINUS_ONE, diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index d1b39c29e353..0e7b2d998395 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -174,6 +174,7 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SIRIUS_BATTERY_FREE_TABLET), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD2, USB_DEVICE_ID_SMARTJOY_DUAL_PLUS), HID_QUIRK_NOGET | HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_QUAD_USB_JOYPAD), HID_QUIRK_NOGET | HID_QUIRK_MULTI_INPUT }, + { HID_USB_DEVICE(USB_VENDOR_ID_XIN_MO, USB_DEVICE_ID_XIN_MO_DUAL_ARCADE), HID_QUIRK_MULTI_INPUT }, { 0 } }; diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index 8dae0f9b819e..6286204d4c56 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -768,8 +768,12 @@ static int steam_probe(struct hid_device *hdev, if (steam->quirks & STEAM_QUIRK_WIRELESS) { hid_info(hdev, "Steam wireless receiver connected"); + /* If using a wireless adaptor ask for connection status */ + steam->connected = false; steam_request_conn_status(steam); } else { + /* A wired connection is always present */ + steam->connected = true; ret = steam_register(steam); if (ret) { hid_err(hdev, diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index c3fc0ceb8096..7a75aff78388 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -249,13 +249,14 @@ out: static __poll_t hidraw_poll(struct file *file, poll_table *wait) { struct hidraw_list *list = file->private_data; + __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* hidraw is always writable */ poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) - return EPOLLIN | EPOLLRDNORM | EPOLLOUT; + mask |= EPOLLIN | EPOLLRDNORM; if (!list->hidraw->exist) - return EPOLLERR | EPOLLHUP; - return 0; + mask |= EPOLLERR | EPOLLHUP; + return mask; } static int hidraw_open(struct inode *inode, struct file *file) diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index a358e61fbc82..009000c5d55c 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -49,6 +49,8 @@ #define I2C_HID_QUIRK_NO_IRQ_AFTER_RESET BIT(1) #define I2C_HID_QUIRK_BOGUS_IRQ BIT(4) #define I2C_HID_QUIRK_RESET_ON_RESUME BIT(5) +#define I2C_HID_QUIRK_BAD_INPUT_SIZE BIT(6) + /* flags */ #define I2C_HID_STARTED 0 @@ -175,6 +177,8 @@ static const struct i2c_hid_quirks { I2C_HID_QUIRK_BOGUS_IRQ }, { USB_VENDOR_ID_ALPS_JP, HID_ANY_ID, I2C_HID_QUIRK_RESET_ON_RESUME }, + { USB_VENDOR_ID_ITE, I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720, + I2C_HID_QUIRK_BAD_INPUT_SIZE }, { 0, 0 } }; @@ -496,9 +500,15 @@ static void i2c_hid_get_input(struct i2c_hid *ihid) } if ((ret_size > size) || (ret_size < 2)) { - dev_err(&ihid->client->dev, "%s: incomplete report (%d/%d)\n", - __func__, size, ret_size); - return; + if (ihid->quirks & I2C_HID_QUIRK_BAD_INPUT_SIZE) { + ihid->inbuf[0] = size & 0xff; + ihid->inbuf[1] = size >> 8; + ret_size = size; + } else { + dev_err(&ihid->client->dev, "%s: incomplete report (%d/%d)\n", + __func__, size, ret_size); + return; + } } i2c_hid_dbg(ihid, "input: %*ph\n", ret_size, ihid->inbuf); diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index 6c1e6110867f..1fb294ca463e 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -24,7 +24,9 @@ #define ICL_MOBILE_DEVICE_ID 0x34FC #define SPT_H_DEVICE_ID 0xA135 #define CML_LP_DEVICE_ID 0x02FC +#define CMP_H_DEVICE_ID 0x06FC #define EHL_Ax_DEVICE_ID 0x4BB3 +#define TGL_LP_DEVICE_ID 0xA0FC #define REVISION_ID_CHT_A0 0x6 #define REVISION_ID_CHT_Ax_SI 0x0 diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 784dcc8c7022..f491d8b4e24c 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -34,7 +34,9 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, ICL_MOBILE_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, SPT_H_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, CML_LP_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, CMP_H_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, EHL_Ax_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, TGL_LP_DEVICE_ID)}, {0, } }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index fa0cc0899827..8fe3efcb8327 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -766,13 +766,14 @@ unlock: static __poll_t uhid_char_poll(struct file *file, poll_table *wait) { struct uhid_device *uhid = file->private_data; + __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* uhid is always writable */ poll_wait(file, &uhid->waitq, wait); if (uhid->head != uhid->tail) - return EPOLLIN | EPOLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; - return 0; + return mask; } static const struct file_operations uhid_fops = { diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index e421cdf2d1a4..a970b809d778 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -241,12 +241,51 @@ static int hiddev_release(struct inode * inode, struct file * file) return 0; } +static int __hiddev_open(struct hiddev *hiddev, struct file *file) +{ + struct hiddev_list *list; + int error; + + lockdep_assert_held(&hiddev->existancelock); + + list = vzalloc(sizeof(*list)); + if (!list) + return -ENOMEM; + + mutex_init(&list->thread_lock); + list->hiddev = hiddev; + + if (!hiddev->open++) { + error = hid_hw_power(hiddev->hid, PM_HINT_FULLON); + if (error < 0) + goto err_drop_count; + + error = hid_hw_open(hiddev->hid); + if (error < 0) + goto err_normal_power; + } + + spin_lock_irq(&hiddev->list_lock); + list_add_tail(&list->node, &hiddev->list); + spin_unlock_irq(&hiddev->list_lock); + + file->private_data = list; + + return 0; + +err_normal_power: + hid_hw_power(hiddev->hid, PM_HINT_NORMAL); +err_drop_count: + hiddev->open--; + vfree(list); + return error; +} + /* * open file op */ static int hiddev_open(struct inode *inode, struct file *file) { - struct hiddev_list *list; struct usb_interface *intf; struct hid_device *hid; struct hiddev *hiddev; @@ -255,66 +294,14 @@ static int hiddev_open(struct inode *inode, struct file *file) intf = usbhid_find_interface(iminor(inode)); if (!intf) return -ENODEV; + hid = usb_get_intfdata(intf); hiddev = hid->hiddev; - if (!(list = vzalloc(sizeof(struct hiddev_list)))) - return -ENOMEM; - mutex_init(&list->thread_lock); - list->hiddev = hiddev; - file->private_data = list; - - /* - * no need for locking because the USB major number - * is shared which usbcore guards against disconnect - */ - if (list->hiddev->exist) { - if (!list->hiddev->open++) { - res = hid_hw_open(hiddev->hid); - if (res < 0) - goto bail; - } - } else { - res = -ENODEV; - goto bail; - } - - spin_lock_irq(&list->hiddev->list_lock); - list_add_tail(&list->node, &hiddev->list); - spin_unlock_irq(&list->hiddev->list_lock); - mutex_lock(&hiddev->existancelock); - /* - * recheck exist with existance lock held to - * avoid opening a disconnected device - */ - if (!list->hiddev->exist) { - res = -ENODEV; - goto bail_unlock; - } - if (!list->hiddev->open++) - if (list->hiddev->exist) { - struct hid_device *hid = hiddev->hid; - res = hid_hw_power(hid, PM_HINT_FULLON); - if (res < 0) - goto bail_unlock; - res = hid_hw_open(hid); - if (res < 0) - goto bail_normal_power; - } - mutex_unlock(&hiddev->existancelock); - return 0; -bail_normal_power: - hid_hw_power(hid, PM_HINT_NORMAL); -bail_unlock: + res = hiddev->exist ? __hiddev_open(hiddev, file) : -ENODEV; mutex_unlock(&hiddev->existancelock); - spin_lock_irq(&list->hiddev->list_lock); - list_del(&list->node); - spin_unlock_irq(&list->hiddev->list_lock); -bail: - file->private_data = NULL; - vfree(list); return res; } diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index ccb74529bc78..d99a9d407671 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2096,14 +2096,16 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ hdev->product == 0x357 || hdev->product == 0x358 || /* Intuos Pro 2 */ hdev->product == 0x392 || /* Intuos Pro 2 */ - hdev->product == 0x398 || hdev->product == 0x399)) { /* MobileStudio Pro */ + hdev->product == 0x398 || hdev->product == 0x399 || /* MobileStudio Pro */ + hdev->product == 0x3AA)) { /* MobileStudio Pro */ value = (field->logical_maximum - value); if (hdev->product == 0x357 || hdev->product == 0x358 || hdev->product == 0x392) value = wacom_offset_rotation(input, usage, value, 3, 16); else if (hdev->product == 0x34d || hdev->product == 0x34e || - hdev->product == 0x398 || hdev->product == 0x399) + hdev->product == 0x398 || hdev->product == 0x399 || + hdev->product == 0x3AA) value = wacom_offset_rotation(input, usage, value, 1, 2); } else { diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c index 6c64d50c9aae..01c2eeb02aa9 100644 --- a/drivers/hwmon/adt7475.c +++ b/drivers/hwmon/adt7475.c @@ -294,9 +294,10 @@ static inline u16 volt2reg(int channel, long volt, u8 bypass_attn) long reg; if (bypass_attn & (1 << channel)) - reg = (volt * 1024) / 2250; + reg = DIV_ROUND_CLOSEST(volt * 1024, 2250); else - reg = (volt * r[1] * 1024) / ((r[0] + r[1]) * 2250); + reg = DIV_ROUND_CLOSEST(volt * r[1] * 1024, + (r[0] + r[1]) * 2250); return clamp_val(reg, 0, 1023) & (0xff << 2); } diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 1f3b30b085b9..d018b20089ec 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -51,6 +51,7 @@ struct hwmon_device_attribute { #define to_hwmon_attr(d) \ container_of(d, struct hwmon_device_attribute, dev_attr) +#define to_dev_attr(a) container_of(a, struct device_attribute, attr) /* * Thermal zone information @@ -58,7 +59,7 @@ struct hwmon_device_attribute { * also provides the sensor index. */ struct hwmon_thermal_data { - struct hwmon_device *hwdev; /* Reference to hwmon device */ + struct device *dev; /* Reference to hwmon device */ int index; /* sensor index */ }; @@ -95,9 +96,27 @@ static const struct attribute_group *hwmon_dev_attr_groups[] = { NULL }; +static void hwmon_free_attrs(struct attribute **attrs) +{ + int i; + + for (i = 0; attrs[i]; i++) { + struct device_attribute *dattr = to_dev_attr(attrs[i]); + struct hwmon_device_attribute *hattr = to_hwmon_attr(dattr); + + kfree(hattr); + } + kfree(attrs); +} + static void hwmon_dev_release(struct device *dev) { - kfree(to_hwmon_device(dev)); + struct hwmon_device *hwdev = to_hwmon_device(dev); + + if (hwdev->group.attrs) + hwmon_free_attrs(hwdev->group.attrs); + kfree(hwdev->groups); + kfree(hwdev); } static struct class hwmon_class = { @@ -119,11 +138,11 @@ static DEFINE_IDA(hwmon_ida); static int hwmon_thermal_get_temp(void *data, int *temp) { struct hwmon_thermal_data *tdata = data; - struct hwmon_device *hwdev = tdata->hwdev; + struct hwmon_device *hwdev = to_hwmon_device(tdata->dev); int ret; long t; - ret = hwdev->chip->ops->read(&hwdev->dev, hwmon_temp, hwmon_temp_input, + ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) return ret; @@ -137,8 +156,7 @@ static const struct thermal_zone_of_device_ops hwmon_thermal_ops = { .get_temp = hwmon_thermal_get_temp, }; -static int hwmon_thermal_add_sensor(struct device *dev, - struct hwmon_device *hwdev, int index) +static int hwmon_thermal_add_sensor(struct device *dev, int index) { struct hwmon_thermal_data *tdata; struct thermal_zone_device *tzd; @@ -147,10 +165,10 @@ static int hwmon_thermal_add_sensor(struct device *dev, if (!tdata) return -ENOMEM; - tdata->hwdev = hwdev; + tdata->dev = dev; tdata->index = index; - tzd = devm_thermal_zone_of_sensor_register(&hwdev->dev, index, tdata, + tzd = devm_thermal_zone_of_sensor_register(dev, index, tdata, &hwmon_thermal_ops); /* * If CONFIG_THERMAL_OF is disabled, this returns -ENODEV, @@ -162,8 +180,7 @@ static int hwmon_thermal_add_sensor(struct device *dev, return 0; } #else -static int hwmon_thermal_add_sensor(struct device *dev, - struct hwmon_device *hwdev, int index) +static int hwmon_thermal_add_sensor(struct device *dev, int index) { return 0; } @@ -250,8 +267,7 @@ static bool is_string_attr(enum hwmon_sensor_types type, u32 attr) (type == hwmon_fan && attr == hwmon_fan_label); } -static struct attribute *hwmon_genattr(struct device *dev, - const void *drvdata, +static struct attribute *hwmon_genattr(const void *drvdata, enum hwmon_sensor_types type, u32 attr, int index, @@ -279,7 +295,7 @@ static struct attribute *hwmon_genattr(struct device *dev, if ((mode & 0222) && !ops->write) return ERR_PTR(-EINVAL); - hattr = devm_kzalloc(dev, sizeof(*hattr), GFP_KERNEL); + hattr = kzalloc(sizeof(*hattr), GFP_KERNEL); if (!hattr) return ERR_PTR(-ENOMEM); @@ -492,8 +508,7 @@ static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) return n; } -static int hwmon_genattrs(struct device *dev, - const void *drvdata, +static int hwmon_genattrs(const void *drvdata, struct attribute **attrs, const struct hwmon_ops *ops, const struct hwmon_channel_info *info) @@ -519,7 +534,7 @@ static int hwmon_genattrs(struct device *dev, attr_mask &= ~BIT(attr); if (attr >= template_size) return -EINVAL; - a = hwmon_genattr(dev, drvdata, info->type, attr, i, + a = hwmon_genattr(drvdata, info->type, attr, i, templates[attr], ops); if (IS_ERR(a)) { if (PTR_ERR(a) != -ENOENT) @@ -533,8 +548,7 @@ static int hwmon_genattrs(struct device *dev, } static struct attribute ** -__hwmon_create_attrs(struct device *dev, const void *drvdata, - const struct hwmon_chip_info *chip) +__hwmon_create_attrs(const void *drvdata, const struct hwmon_chip_info *chip) { int ret, i, aindex = 0, nattrs = 0; struct attribute **attrs; @@ -545,15 +559,17 @@ __hwmon_create_attrs(struct device *dev, const void *drvdata, if (nattrs == 0) return ERR_PTR(-EINVAL); - attrs = devm_kcalloc(dev, nattrs + 1, sizeof(*attrs), GFP_KERNEL); + attrs = kcalloc(nattrs + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return ERR_PTR(-ENOMEM); for (i = 0; chip->info[i]; i++) { - ret = hwmon_genattrs(dev, drvdata, &attrs[aindex], chip->ops, + ret = hwmon_genattrs(drvdata, &attrs[aindex], chip->ops, chip->info[i]); - if (ret < 0) + if (ret < 0) { + hwmon_free_attrs(attrs); return ERR_PTR(ret); + } aindex += ret; } @@ -595,14 +611,13 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, for (i = 0; groups[i]; i++) ngroups++; - hwdev->groups = devm_kcalloc(dev, ngroups, sizeof(*groups), - GFP_KERNEL); + hwdev->groups = kcalloc(ngroups, sizeof(*groups), GFP_KERNEL); if (!hwdev->groups) { err = -ENOMEM; goto free_hwmon; } - attrs = __hwmon_create_attrs(dev, drvdata, chip); + attrs = __hwmon_create_attrs(drvdata, chip); if (IS_ERR(attrs)) { err = PTR_ERR(attrs); goto free_hwmon; @@ -647,8 +662,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, hwmon_temp_input, j)) continue; if (info[i]->config[j] & HWMON_T_INPUT) { - err = hwmon_thermal_add_sensor(dev, - hwdev, j); + err = hwmon_thermal_add_sensor(hdev, j); if (err) { device_unregister(hdev); /* @@ -667,7 +681,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, return hdev; free_hwmon: - kfree(hwdev); + hwmon_dev_release(hdev); ida_remove: ida_simple_remove(&hwmon_ida, id); return ERR_PTR(err); diff --git a/drivers/hwmon/nct7802.c b/drivers/hwmon/nct7802.c index f3dd2a17bd42..2e97e56c72c7 100644 --- a/drivers/hwmon/nct7802.c +++ b/drivers/hwmon/nct7802.c @@ -23,8 +23,8 @@ static const u8 REG_VOLTAGE[5] = { 0x09, 0x0a, 0x0c, 0x0d, 0x0e }; static const u8 REG_VOLTAGE_LIMIT_LSB[2][5] = { - { 0x40, 0x00, 0x42, 0x44, 0x46 }, - { 0x3f, 0x00, 0x41, 0x43, 0x45 }, + { 0x46, 0x00, 0x40, 0x42, 0x44 }, + { 0x45, 0x00, 0x3f, 0x41, 0x43 }, }; static const u8 REG_VOLTAGE_LIMIT_MSB[5] = { 0x48, 0x00, 0x47, 0x47, 0x48 }; @@ -58,6 +58,8 @@ static const u8 REG_VOLTAGE_LIMIT_MSB_SHIFT[2][5] = { struct nct7802_data { struct regmap *regmap; struct mutex access_lock; /* for multi-byte read and write operations */ + u8 in_status; + struct mutex in_alarm_lock; }; static ssize_t temp_type_show(struct device *dev, @@ -368,6 +370,66 @@ static ssize_t in_store(struct device *dev, struct device_attribute *attr, return err ? : count; } +static ssize_t in_alarm_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct sensor_device_attribute_2 *sattr = to_sensor_dev_attr_2(attr); + struct nct7802_data *data = dev_get_drvdata(dev); + int volt, min, max, ret; + unsigned int val; + + mutex_lock(&data->in_alarm_lock); + + /* + * The SMI Voltage status register is the only register giving a status + * for voltages. A bit is set for each input crossing a threshold, in + * both direction, but the "inside" or "outside" limits info is not + * available. Also this register is cleared on read. + * Note: this is not explicitly spelled out in the datasheet, but + * from experiment. + * To deal with this we use a status cache with one validity bit and + * one status bit for each input. Validity is cleared at startup and + * each time the register reports a change, and the status is processed + * by software based on current input value and limits. + */ + ret = regmap_read(data->regmap, 0x1e, &val); /* SMI Voltage status */ + if (ret < 0) + goto abort; + + /* invalidate cached status for all inputs crossing a threshold */ + data->in_status &= ~((val & 0x0f) << 4); + + /* if cached status for requested input is invalid, update it */ + if (!(data->in_status & (0x10 << sattr->index))) { + ret = nct7802_read_voltage(data, sattr->nr, 0); + if (ret < 0) + goto abort; + volt = ret; + + ret = nct7802_read_voltage(data, sattr->nr, 1); + if (ret < 0) + goto abort; + min = ret; + + ret = nct7802_read_voltage(data, sattr->nr, 2); + if (ret < 0) + goto abort; + max = ret; + + if (volt < min || volt > max) + data->in_status |= (1 << sattr->index); + else + data->in_status &= ~(1 << sattr->index); + + data->in_status |= 0x10 << sattr->index; + } + + ret = sprintf(buf, "%u\n", !!(data->in_status & (1 << sattr->index))); +abort: + mutex_unlock(&data->in_alarm_lock); + return ret; +} + static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -660,7 +722,7 @@ static const struct attribute_group nct7802_temp_group = { static SENSOR_DEVICE_ATTR_2_RO(in0_input, in, 0, 0); static SENSOR_DEVICE_ATTR_2_RW(in0_min, in, 0, 1); static SENSOR_DEVICE_ATTR_2_RW(in0_max, in, 0, 2); -static SENSOR_DEVICE_ATTR_2_RO(in0_alarm, alarm, 0x1e, 3); +static SENSOR_DEVICE_ATTR_2_RO(in0_alarm, in_alarm, 0, 3); static SENSOR_DEVICE_ATTR_2_RW(in0_beep, beep, 0x5a, 3); static SENSOR_DEVICE_ATTR_2_RO(in1_input, in, 1, 0); @@ -668,19 +730,19 @@ static SENSOR_DEVICE_ATTR_2_RO(in1_input, in, 1, 0); static SENSOR_DEVICE_ATTR_2_RO(in2_input, in, 2, 0); static SENSOR_DEVICE_ATTR_2_RW(in2_min, in, 2, 1); static SENSOR_DEVICE_ATTR_2_RW(in2_max, in, 2, 2); -static SENSOR_DEVICE_ATTR_2_RO(in2_alarm, alarm, 0x1e, 0); +static SENSOR_DEVICE_ATTR_2_RO(in2_alarm, in_alarm, 2, 0); static SENSOR_DEVICE_ATTR_2_RW(in2_beep, beep, 0x5a, 0); static SENSOR_DEVICE_ATTR_2_RO(in3_input, in, 3, 0); static SENSOR_DEVICE_ATTR_2_RW(in3_min, in, 3, 1); static SENSOR_DEVICE_ATTR_2_RW(in3_max, in, 3, 2); -static SENSOR_DEVICE_ATTR_2_RO(in3_alarm, alarm, 0x1e, 1); +static SENSOR_DEVICE_ATTR_2_RO(in3_alarm, in_alarm, 3, 1); static SENSOR_DEVICE_ATTR_2_RW(in3_beep, beep, 0x5a, 1); static SENSOR_DEVICE_ATTR_2_RO(in4_input, in, 4, 0); static SENSOR_DEVICE_ATTR_2_RW(in4_min, in, 4, 1); static SENSOR_DEVICE_ATTR_2_RW(in4_max, in, 4, 2); -static SENSOR_DEVICE_ATTR_2_RO(in4_alarm, alarm, 0x1e, 2); +static SENSOR_DEVICE_ATTR_2_RO(in4_alarm, in_alarm, 4, 2); static SENSOR_DEVICE_ATTR_2_RW(in4_beep, beep, 0x5a, 2); static struct attribute *nct7802_in_attrs[] = { @@ -1011,6 +1073,7 @@ static int nct7802_probe(struct i2c_client *client, return PTR_ERR(data->regmap); mutex_init(&data->access_lock); + mutex_init(&data->in_alarm_lock); ret = nct7802_init_chip(data); if (ret < 0) diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c index dc3f507e7562..a90d757f7043 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.c +++ b/drivers/hwtracing/coresight/coresight-etm4x.c @@ -1132,7 +1132,6 @@ static void etm4_init_trace_id(struct etmv4_drvdata *drvdata) drvdata->trcid = coresight_get_trace_id(drvdata->cpu); } -#ifdef CONFIG_CPU_PM static int etm4_cpu_save(struct etmv4_drvdata *drvdata) { int i, ret = 0; @@ -1402,17 +1401,17 @@ static struct notifier_block etm4_cpu_pm_nb = { static int etm4_cpu_pm_register(void) { - return cpu_pm_register_notifier(&etm4_cpu_pm_nb); + if (IS_ENABLED(CONFIG_CPU_PM)) + return cpu_pm_register_notifier(&etm4_cpu_pm_nb); + + return 0; } static void etm4_cpu_pm_unregister(void) { - cpu_pm_unregister_notifier(&etm4_cpu_pm_nb); + if (IS_ENABLED(CONFIG_CPU_PM)) + cpu_pm_unregister_notifier(&etm4_cpu_pm_nb); } -#else -static int etm4_cpu_pm_register(void) { return 0; } -static void etm4_cpu_pm_unregister(void) { } -#endif static int etm4_probe(struct amba_device *adev, const struct amba_id *id) { diff --git a/drivers/i2c/busses/i2c-at91-core.c b/drivers/i2c/busses/i2c-at91-core.c index e13af4874976..5137e6297022 100644 --- a/drivers/i2c/busses/i2c-at91-core.c +++ b/drivers/i2c/busses/i2c-at91-core.c @@ -174,7 +174,7 @@ static struct at91_twi_pdata sama5d2_config = { static struct at91_twi_pdata sam9x60_config = { .clk_max_div = 7, - .clk_offset = 4, + .clk_offset = 3, .has_unre_flag = true, .has_alt_cmd = true, .has_hold_field = true, diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index e01b2b57e724..5ab901ad615d 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -58,6 +58,7 @@ struct bcm2835_i2c_dev { struct i2c_adapter adapter; struct completion completion; struct i2c_msg *curr_msg; + struct clk *bus_clk; int num_msgs; u32 msg_err; u8 *msg_buf; @@ -404,7 +405,6 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) struct resource *mem, *irq; int ret; struct i2c_adapter *adap; - struct clk *bus_clk; struct clk *mclk; u32 bus_clk_rate; @@ -427,11 +427,11 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) return PTR_ERR(mclk); } - bus_clk = bcm2835_i2c_register_div(&pdev->dev, mclk, i2c_dev); + i2c_dev->bus_clk = bcm2835_i2c_register_div(&pdev->dev, mclk, i2c_dev); - if (IS_ERR(bus_clk)) { + if (IS_ERR(i2c_dev->bus_clk)) { dev_err(&pdev->dev, "Could not register clock\n"); - return PTR_ERR(bus_clk); + return PTR_ERR(i2c_dev->bus_clk); } ret = of_property_read_u32(pdev->dev.of_node, "clock-frequency", @@ -442,13 +442,13 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) bus_clk_rate = 100000; } - ret = clk_set_rate_exclusive(bus_clk, bus_clk_rate); + ret = clk_set_rate_exclusive(i2c_dev->bus_clk, bus_clk_rate); if (ret < 0) { dev_err(&pdev->dev, "Could not set clock frequency\n"); return ret; } - ret = clk_prepare_enable(bus_clk); + ret = clk_prepare_enable(i2c_dev->bus_clk); if (ret) { dev_err(&pdev->dev, "Couldn't prepare clock"); return ret; @@ -491,10 +491,9 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) static int bcm2835_i2c_remove(struct platform_device *pdev) { struct bcm2835_i2c_dev *i2c_dev = platform_get_drvdata(pdev); - struct clk *bus_clk = devm_clk_get(i2c_dev->dev, "div"); - clk_rate_exclusive_put(bus_clk); - clk_disable_unprepare(bus_clk); + clk_rate_exclusive_put(i2c_dev->bus_clk); + clk_disable_unprepare(i2c_dev->bus_clk); free_irq(i2c_dev->irq, i2c_dev); i2c_del_adapter(&i2c_dev->adapter); diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index 38556381f4ca..2f8b8050a223 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -433,13 +433,17 @@ iop3xx_i2c_probe(struct platform_device *pdev) adapter_data->gpio_scl = devm_gpiod_get_optional(&pdev->dev, "scl", GPIOD_ASIS); - if (IS_ERR(adapter_data->gpio_scl)) - return PTR_ERR(adapter_data->gpio_scl); + if (IS_ERR(adapter_data->gpio_scl)) { + ret = PTR_ERR(adapter_data->gpio_scl); + goto free_both; + } adapter_data->gpio_sda = devm_gpiod_get_optional(&pdev->dev, "sda", GPIOD_ASIS); - if (IS_ERR(adapter_data->gpio_sda)) - return PTR_ERR(adapter_data->gpio_sda); + if (IS_ERR(adapter_data->gpio_sda)) { + ret = PTR_ERR(adapter_data->gpio_sda); + goto free_both; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index a98bf31d0e5c..61339c665ebd 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1608,14 +1608,18 @@ static int tegra_i2c_probe(struct platform_device *pdev) } pm_runtime_enable(&pdev->dev); - if (!pm_runtime_enabled(&pdev->dev)) + if (!pm_runtime_enabled(&pdev->dev)) { ret = tegra_i2c_runtime_resume(&pdev->dev); - else + if (ret < 0) { + dev_err(&pdev->dev, "runtime resume failed\n"); + goto unprepare_div_clk; + } + } else { ret = pm_runtime_get_sync(i2c_dev->dev); - - if (ret < 0) { - dev_err(&pdev->dev, "runtime resume failed\n"); - goto unprepare_div_clk; + if (ret < 0) { + dev_err(&pdev->dev, "runtime resume failed\n"); + goto disable_rpm; + } } if (i2c_dev->is_multimaster_mode) { @@ -1623,7 +1627,7 @@ static int tegra_i2c_probe(struct platform_device *pdev) if (ret < 0) { dev_err(i2c_dev->dev, "div_clk enable failed %d\n", ret); - goto disable_rpm; + goto put_rpm; } } @@ -1671,11 +1675,16 @@ disable_div_clk: if (i2c_dev->is_multimaster_mode) clk_disable(i2c_dev->div_clk); -disable_rpm: - pm_runtime_disable(&pdev->dev); - if (!pm_runtime_status_suspended(&pdev->dev)) +put_rpm: + if (pm_runtime_enabled(&pdev->dev)) + pm_runtime_put_sync(&pdev->dev); + else tegra_i2c_runtime_suspend(&pdev->dev); +disable_rpm: + if (pm_runtime_enabled(&pdev->dev)) + pm_runtime_disable(&pdev->dev); + unprepare_div_clk: clk_unprepare(i2c_dev->div_clk); @@ -1710,9 +1719,14 @@ static int tegra_i2c_remove(struct platform_device *pdev) static int __maybe_unused tegra_i2c_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); + int err; i2c_mark_adapter_suspended(&i2c_dev->adapter); + err = pm_runtime_force_suspend(dev); + if (err < 0) + return err; + return 0; } @@ -1733,6 +1747,10 @@ static int __maybe_unused tegra_i2c_resume(struct device *dev) if (err) return err; + err = pm_runtime_force_resume(dev); + if (err < 0) + return err; + i2c_mark_adapter_resumed(&i2c_dev->adapter); return 0; diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 9f8dcd3f8385..35b209797d7b 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -186,10 +186,11 @@ int i2c_generic_scl_recovery(struct i2c_adapter *adap) * If we can set SDA, we will always create a STOP to ensure additional * pulses will do no harm. This is achieved by letting SDA follow SCL * half a cycle later. Check the 'incomplete_write_byte' fault injector - * for details. + * for details. Note that we must honour tsu:sto, 4us, but lets use 5us + * here for simplicity. */ bri->set_scl(adap, scl); - ndelay(RECOVERY_NDELAY / 2); + ndelay(RECOVERY_NDELAY); if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); @@ -211,7 +212,13 @@ int i2c_generic_scl_recovery(struct i2c_adapter *adap) scl = !scl; bri->set_scl(adap, scl); /* Creating STOP again, see above */ - ndelay(RECOVERY_NDELAY / 2); + if (scl) { + /* Honour minimum tsu:sto */ + ndelay(RECOVERY_NDELAY); + } else { + /* Honour minimum tf and thd:dat */ + ndelay(RECOVERY_NDELAY / 2); + } if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index 3f03abf100b5..306bf15023a7 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -494,13 +494,11 @@ static int ad7124_of_parse_channel_config(struct iio_dev *indio_dev, st->channel_config[channel].buf_negative = of_property_read_bool(child, "adi,buffered-negative"); - *chan = ad7124_channel_template; - chan->address = channel; - chan->scan_index = channel; - chan->channel = ain[0]; - chan->channel2 = ain[1]; - - chan++; + chan[channel] = ad7124_channel_template; + chan[channel].address = channel; + chan[channel].scan_index = channel; + chan[channel].channel = ain[0]; + chan[channel].channel2 = ain[1]; } return 0; diff --git a/drivers/iio/chemical/Kconfig b/drivers/iio/chemical/Kconfig index fa4586037bb8..0b91de4df8f4 100644 --- a/drivers/iio/chemical/Kconfig +++ b/drivers/iio/chemical/Kconfig @@ -65,6 +65,7 @@ config IAQCORE config PMS7003 tristate "Plantower PMS7003 particulate matter sensor" depends on SERIAL_DEV_BUS + select IIO_BUFFER select IIO_TRIGGERED_BUFFER help Say Y here to build support for the Plantower PMS7003 particulate diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c index a7d40c02ce6b..b921dd9e108f 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c @@ -1301,7 +1301,8 @@ static int st_lsm6dsx_check_whoami(struct st_lsm6dsx_hw *hw, int id, for (i = 0; i < ARRAY_SIZE(st_lsm6dsx_sensor_settings); i++) { for (j = 0; j < ST_LSM6DSX_MAX_ID; j++) { - if (id == st_lsm6dsx_sensor_settings[i].id[j].hw_id) + if (st_lsm6dsx_sensor_settings[i].id[j].name && + id == st_lsm6dsx_sensor_settings[i].id[j].hw_id) break; } if (j < ST_LSM6DSX_MAX_ID) diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index c193d64e5217..112225c0e486 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -566,7 +566,7 @@ static int iio_compute_scan_bytes(struct iio_dev *indio_dev, const unsigned long *mask, bool timestamp) { unsigned bytes = 0; - int length, i; + int length, i, largest = 0; /* How much space will the demuxed element take? */ for_each_set_bit(i, mask, @@ -574,13 +574,17 @@ static int iio_compute_scan_bytes(struct iio_dev *indio_dev, length = iio_storage_bytes_for_si(indio_dev, i); bytes = ALIGN(bytes, length); bytes += length; + largest = max(largest, length); } if (timestamp) { length = iio_storage_bytes_for_timestamp(indio_dev); bytes = ALIGN(bytes, length); bytes += length; + largest = max(largest, length); } + + bytes = ALIGN(bytes, largest); return bytes; } diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c index 16dacea9eadf..b0e241aaefb4 100644 --- a/drivers/iio/light/vcnl4000.c +++ b/drivers/iio/light/vcnl4000.c @@ -163,7 +163,6 @@ static int vcnl4200_init(struct vcnl4000_data *data) if (ret < 0) return ret; - data->al_scale = 24000; data->vcnl4200_al.reg = VCNL4200_AL_DATA; data->vcnl4200_ps.reg = VCNL4200_PS_DATA; switch (id) { @@ -172,11 +171,13 @@ static int vcnl4200_init(struct vcnl4000_data *data) /* show 54ms in total. */ data->vcnl4200_al.sampling_rate = ktime_set(0, 54000 * 1000); data->vcnl4200_ps.sampling_rate = ktime_set(0, 4200 * 1000); + data->al_scale = 24000; break; case VCNL4040_PROD_ID: /* Integration time is 80ms, add 10ms. */ data->vcnl4200_al.sampling_rate = ktime_set(0, 100000 * 1000); data->vcnl4200_ps.sampling_rate = ktime_set(0, 100000 * 1000); + data->al_scale = 120000; break; } data->vcnl4200_al.last_measurement = ktime_set(0, 0); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 7a3b99597ead..146f98fbf22b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * @udata: userspace context to pin memory for + * @device: IB device to connect UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned */ -struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access) { - struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; unsigned long lock_limit; @@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, struct scatterlist *sg; unsigned int gup_flags = FOLL_WRITE; - if (!udata) - return ERR_PTR(-EIO); - - context = container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; - if (!context) - return ERR_PTR(-EIO); - /* * If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. @@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->ibdev = context->device; + umem->ibdev = device; umem->length = size; umem->address = addr; umem->writable = ib_access_writable(access); @@ -281,7 +272,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, npages -= ret; sg = ib_umem_add_sg_table(sg, page_list, ret, - dma_get_max_seg_size(context->device->dma_device), + dma_get_max_seg_size(device->dma_device), &umem->sg_nents); up_read(&mm->mmap_sem); @@ -289,10 +280,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, sg_mark_end(sg); - umem->nmap = ib_dma_map_sg(context->device, - umem->sg_head.sgl, - umem->sg_nents, - DMA_BIDIRECTIONAL); + umem->nmap = ib_dma_map_sg(device, + umem->sg_head.sgl, + umem->sg_nents, + DMA_BIDIRECTIONAL); if (!umem->nmap) { ret = -ENOMEM; @@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, goto out; umem_release: - __ib_umem_release(context->device, umem, 0); + __ib_umem_release(device, umem, 0); vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e42d44e501fd..dac3fd2ebc26 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -110,15 +110,12 @@ out_page_list: * They exist only to hold the per_mm reference to help the driver create * children umems. * - * @udata: udata from the syscall being used to create the umem + * @device: IB device to create UMEM * @access: ib_reg_mr access flags */ -struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, int access) { - struct ib_ucontext *context = - container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; struct ib_umem *umem; struct ib_umem_odp *umem_odp; int ret; @@ -126,14 +123,11 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, if (access & IB_ACCESS_HUGETLB) return ERR_PTR(-EINVAL); - if (!context) - return ERR_PTR(-EIO); - umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); if (!umem_odp) return ERR_PTR(-ENOMEM); umem = &umem_odp->umem; - umem->ibdev = context->device; + umem->ibdev = device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; umem_odp->is_implicit_odp = 1; @@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); /** * ib_umem_odp_get - Create a umem_odp for a userspace va * - * @udata: userspace context to pin memory for + * @device: IB device struct to get UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned @@ -210,23 +204,14 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); * pinning, instead, stores the mm for future page fault handling in * conjunction with MMU notifiers. */ -struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, - size_t size, int access, +struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, + unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops) { struct ib_umem_odp *umem_odp; - struct ib_ucontext *context; struct mm_struct *mm; int ret; - if (!udata) - return ERR_PTR(-EIO); - - context = container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; - if (!context) - return ERR_PTR(-EIO); - if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) return ERR_PTR(-EINVAL); @@ -234,7 +219,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, if (!umem_odp) return ERR_PTR(-ENOMEM); - umem_odp->umem.ibdev = context->device; + umem_odp->umem.ibdev = device; umem_odp->umem.length = size; umem_odp->umem.address = addr; umem_odp->umem.writable = ib_access_writable(access); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index dd765e176cdd..d33bdc9b01cd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1990,6 +1990,47 @@ EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags) +{ + struct ib_mr *mr; + + if (access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + return ERR_PTR(-EINVAL); + } + } + + mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, + access_flags, NULL); + + if (IS_ERR(mr)) + return mr; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + atomic_inc(&pd->usecnt); + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_kadd(&mr->res); + + return mr; +} +EXPORT_SYMBOL(ib_reg_user_mr); + +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + if (!pd->device->ops.advise_mr) + return -EOPNOTSUPP; + + return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, + NULL); +} +EXPORT_SYMBOL(ib_advise_mr); + int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 9b6ca15a183c..52b6a4d85460 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, bytes += (qplib_qp->sq.max_wqe * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE); + umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE); + umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes, + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto fail; } - cq->umem = ib_umem_get(udata, req.cq_va, + cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va, entries * sizeof(struct cq_base), IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->umem)) { @@ -3305,8 +3307,10 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) int rc; rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) + if (rc) { dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc); + return rc; + } if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, @@ -3512,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, /* The fixed portion of the rkey is the same as the lkey */ mr->ib_mr.rkey = mr->qplib_mr.rkey; - umem = ib_umem_get(udata, start, length, mr_access_flags); + umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 958c1ff9c515..4d07d22bfa7b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2283,13 +2283,13 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, /* Add qp to flush list of the CQ */ bnxt_qplib_add_flush_qp(qp); } else { + /* Before we complete, do WA 9060 */ + if (do_wa9060(qp, cq, cq_cons, sw_sq_cons, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { - /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sw_sq_cons, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; - } cqe->status = CQ_REQ_STATUS_OK; cqe++; (*budget)--; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index fe3a7e8561df..962dc97a8ff2 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(udata, start, length, acc); + mhp->umem = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(mhp->umem)) goto err_free_skb; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 50c22575aed6..4822f5fa12be 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, int inline_size; int err; - if (udata->inlen && + if (udata && udata->inlen && !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); @@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, goto err_out; } - mr->umem = ib_umem_get(udata, start, length, access_flags); + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(&dev->ibdev, diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index adb4a1ba921b..5836fe7b2817 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -81,7 +81,9 @@ void iowait_init(struct iowait *wait, u32 tx_limit, void iowait_cancel_work(struct iowait *w) { cancel_work_sync(&iowait_get_ib_work(w)->iowork); - cancel_work_sync(&iowait_get_tid_work(w)->iowork); + /* Make sure that the iowork for TID RDMA is used */ + if (iowait_get_tid_work(w)->iowork.func) + cancel_work_sync(&iowait_get_tid_work(w)->iowork); } /** diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index e53f542b60af..8a2e0d9351e9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4633,6 +4633,15 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) */ fpsn = full_flow_psn(flow, flow->flow_state.spsn); req->r_ack_psn = psn; + /* + * If resync_psn points to the last flow PSN for a + * segment and the new segment (likely from a new + * request) starts with a new generation number, we + * need to adjust resync_psn accordingly. + */ + if (flow->flow_state.generation != + (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) + resync_psn = mask_psn(fpsn - 1); flow->resync_npkts += delta_psn(mask_psn(resync_psn + 1), fpsn); /* diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index af1d8823b3f0..a2d1e5331bf1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, u32 npages; int ret; - *umem = ib_umem_get(udata, ucmd.buf_addr, buf->size, + *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(*umem)) return PTR_ERR(*umem); diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index 10af6958ab69..bff6abdccfb0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, refcount_set(&page->refcount, 1); page->user_virt = page_addr; - page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, page_addr, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { ret = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 9ad19170c3f9..3ff610549c74 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(udata, start, length, access_flags); + mr->umem = ib_umem_get(pd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); goto err_free; @@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags, } ib_umem_release(mr->umem); - mr->umem = ib_umem_get(udata, start, length, mr_access_flags); + mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); mr->umem = NULL; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index a6565b674801..eb2ee6a581aa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_alloc_rq_inline_buf; } - hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr, + hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr, hr_qp->buff_size, 0); if (IS_ERR(hr_qp->umem)) { dev_err(dev, "ib_umem_get error for create qp\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 7113ebfdb4f0..c6d5f06f9cde 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) return -EFAULT; - srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0); + srq->umem = + ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0); if (IS_ERR(srq->umem)) return PTR_ERR(srq->umem); @@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, goto err_user_srq_mtt; /* config index queue BA */ - srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr, + srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr, srq->idx_que.buf_size, 0); if (IS_ERR(srq->idx_que.umem)) { dev_err(hr_dev->dev, "ib_umem_get error for index queue\n"); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 86375947bc67..c335de91508f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -169,8 +169,7 @@ static void i40iw_dealloc_ucontext(struct ib_ucontext *context) static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct i40iw_ucontext *ucontext; - u64 db_addr_offset; - u64 push_offset; + u64 db_addr_offset, push_offset, pfn; ucontext = to_ucontext(context); if (ucontext->iwdev->sc_dev.is_pf) { @@ -189,7 +188,6 @@ static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (vma->vm_pgoff == (db_addr_offset >> PAGE_SHIFT)) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_private_data = ucontext; } else { if ((vma->vm_pgoff - (push_offset >> PAGE_SHIFT)) % 2) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -197,12 +195,12 @@ static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - if (io_remap_pfn_range(vma, vma->vm_start, - vma->vm_pgoff + (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> PAGE_SHIFT), - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; + pfn = vma->vm_pgoff + + (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> + PAGE_SHIFT); - return 0; + return rdma_user_mmap_io(context, vma, pfn, PAGE_SIZE, + vma->vm_page_prot, NULL); } /** @@ -1758,12 +1756,15 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, int ret; int pg_shift; + if (!udata) + return ERR_PTR(-EOPNOTSUPP); + if (iwdev->closing) return ERR_PTR(-ENODEV); if (length > I40IW_MAX_MR_SIZE) return ERR_PTR(-EINVAL); - region = ib_umem_get(udata, start, length, acc); + region = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(region)) return (struct ib_mr *)region; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 306b21281fa2..a57033d4b0e5 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata, int shift; int n; - *umem = ib_umem_get(udata, buf_addr, cqe * cqe_size, + *umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(*umem)) return PTR_ERR(*umem); diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 714f9df5bf39..d41f03ccb0e1 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index dfa17bcdcdbc..b0121c90c561 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,7 +367,7 @@ end: return block_shift; } -static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, +static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, u64 length, int access_flags) { /* @@ -398,7 +398,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, up_read(¤t->mm->mmap_sem); } - return ib_umem_get(udata, start, length, access_flags); + return ib_umem_get(device, start, length, access_flags); } struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, @@ -415,7 +415,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags); + mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -504,7 +504,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = mlx4_get_umem_mr(udata, start, length, + mmr->umem = mlx4_get_umem_mr(mr->device, start, length, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 85f57b76e446..cb23cdb9389a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -916,7 +916,7 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); - qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0); + qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -1110,7 +1110,8 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, if (err) goto err; - qp->umem = ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0); + qp->umem = + ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 8dcf6e3d9ae2..8f9d5035142d 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -110,7 +110,8 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq, if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) return -EFAULT; - srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); + srq->umem = + ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0); if (IS_ERR(srq->umem)) return PTR_ERR(srq->umem); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index dd8d24ee8e1d..367a71bc5f4b 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -708,8 +708,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *cqe_size = ucmd.cqe_size; cq->buf.umem = - ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size, - IB_ACCESS_LOCAL_WRITE); + ib_umem_get(&dev->ib_dev, ucmd.buf_addr, + entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->buf.umem)) { err = PTR_ERR(cq->buf.umem); return err; @@ -1108,7 +1108,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) return -EINVAL; - umem = ib_umem_get(udata, ucmd.buf_addr, + umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr, (size_t)ucmd.cqe_size * entries, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) { diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9d0a18cf9e5e..685b8ed96b4e 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2134,7 +2134,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, if (err) return err; - obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access); + obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access); if (IS_ERR(obj->umem)) return PTR_ERR(obj->umem); diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 12737c509aa2..61475b571531 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -64,7 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 997cbfe4b90c..904d508cc823 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -815,6 +815,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { + size_t uhw_outlen = (uhw) ? uhw->outlen : 0; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; @@ -828,12 +829,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); - if (uhw->outlen && uhw->outlen < resp_len) + if (uhw_outlen && uhw_outlen < resp_len) return -EINVAL; resp.response_length = resp_len; - if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); @@ -897,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->raw_packet_caps |= IB_RAW_PACKET_CAP_CVLAN_STRIPPING; - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (field_avail(typeof(resp), tso_caps, uhw_outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; @@ -907,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + if (field_avail(typeof(resp), rss_caps, uhw_outlen)) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = @@ -927,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.rss_caps); } } else { - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + if (field_avail(typeof(resp), tso_caps, uhw_outlen)) resp.response_length += sizeof(resp.tso_caps); - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + if (field_avail(typeof(resp), rss_caps, uhw_outlen)) resp.response_length += sizeof(resp.rss_caps); } @@ -1014,6 +1015,23 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; + if (!uhw) { + /* ODP for kernel QPs is not implemented for receive + * WQEs and SRQ WQEs + */ + props->odp_caps.per_transport_caps.rc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.uc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.ud_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.xrc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + } } if (MLX5_CAP_GEN(mdev, cd)) @@ -1054,7 +1072,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_MAX_CQ_PERIOD; } - if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { + if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) { resp.response_length += sizeof(resp.cqe_comp_caps); if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { @@ -1072,7 +1090,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) && raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { @@ -1091,7 +1109,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw->outlen)) { + uhw_outlen)) { if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; @@ -1104,7 +1122,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } - if (field_avail(typeof(resp), flags, uhw->outlen)) { + if (field_avail(typeof(resp), flags, uhw_outlen)) { resp.response_length += sizeof(resp.flags); if (MLX5_CAP_GEN(mdev, cqe_compression_128)) @@ -1120,8 +1138,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; } - if (field_avail(typeof(resp), sw_parsing_caps, - uhw->outlen)) { + if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) { resp.response_length += sizeof(resp.sw_parsing_caps); if (MLX5_CAP_ETH(mdev, swp)) { resp.sw_parsing_caps.sw_parsing_offloads |= @@ -1141,7 +1158,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) && raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { @@ -1164,8 +1181,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), tunnel_offloads_caps, - uhw->outlen)) { + if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) { resp.response_length += sizeof(resp.tunnel_offloads_caps); if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) resp.tunnel_offloads_caps |= @@ -1186,7 +1202,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } - if (uhw->outlen) { + if (uhw_outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) @@ -3276,12 +3292,14 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, int num_entries, int num_groups, u32 flags) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - ft = mlx5_create_auto_grouped_flow_table(ns, priority, - num_entries, - num_groups, - 0, flags); + ft_attr.prio = priority; + ft_attr.max_fte = num_entries; + ft_attr.flags = flags; + ft_attr.autogroup.max_num_groups = num_groups; + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -4771,7 +4789,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; - struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kzalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) @@ -4781,7 +4798,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) if (!dprops) goto out; - err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); + err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b06f32ff5748..77d495b2032d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1153,12 +1153,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); -int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, - int buflen, size_t *bc); -int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, - int buflen, size_t *bc); -int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, - void *buffer, int buflen, size_t *bc); +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ea8bfc3e2d8d..44a0ee6bd9f1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -737,10 +737,9 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev) return MLX5_MAX_UMR_SHIFT; } -static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, - u64 start, u64 length, int access_flags, - struct ib_umem **umem, int *npages, int *page_shift, - int *ncont, int *order) +static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length, + int access_flags, struct ib_umem **umem, int *npages, + int *page_shift, int *ncont, int *order) { struct ib_umem *u; @@ -749,7 +748,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, if (access_flags & IB_ACCESS_ON_DEMAND) { struct ib_umem_odp *odp; - odp = ib_umem_odp_get(udata, start, length, access_flags, + odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, &mlx5_mn_ops); if (IS_ERR(odp)) { mlx5_ib_dbg(dev, "umem get failed (%ld)\n", @@ -765,7 +764,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, if (order) *order = ilog2(roundup_pow_of_two(*ncont)); } else { - u = ib_umem_get(udata, start, length, access_flags); + u = ib_umem_get(&dev->ib_dev, start, length, access_flags); if (IS_ERR(u)) { mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); return PTR_ERR(u); @@ -1247,6 +1246,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start && length == U64_MAX) { + if (virt_addr != start) + return ERR_PTR(-EINVAL); if (!(access_flags & IB_ACCESS_ON_DEMAND) || !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return ERR_PTR(-EINVAL); @@ -1257,7 +1258,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mr->ibmr; } - err = mr_umem_get(dev, udata, start, length, access_flags, &umem, + err = mr_umem_get(dev, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); if (err < 0) @@ -1424,9 +1425,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, flags |= IB_MR_REREG_TRANS; ib_umem_release(mr->umem); mr->umem = NULL; - err = mr_umem_get(dev, udata, addr, len, access_flags, - &mr->umem, &npages, &page_shift, &ncont, - &order); + err = mr_umem_get(dev, addr, len, access_flags, &mr->umem, + &npages, &page_shift, &ncont, &order); if (err) goto err; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f924250f80c2..0afb0042bd53 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -497,7 +497,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); + umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); @@ -624,11 +624,10 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; unsigned long current_seq; u64 access_mask; - u64 start_idx, page_mask; + u64 start_idx; page_shift = odp->page_shift; - page_mask = ~(BIT(page_shift) - 1); - start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift; + start_idx = (user_va - ib_umem_start(odp)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) @@ -767,11 +766,19 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + if (unlikely(io_virt < mr->mmkey.iova)) + return -EFAULT; + if (!odp->is_implicit_odp) { - if (unlikely(io_virt < ib_umem_start(odp) || - ib_umem_end(odp) - io_virt < bcnt)) + u64 user_va; + + if (check_add_overflow(io_virt - mr->mmkey.iova, + (u64)odp->umem.address, &user_va)) + return -EFAULT; + if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; - return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, + return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); } return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, @@ -1237,15 +1244,15 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, wqe = wqe_start; qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; if (qp && sq) { - ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_initiator_pfault_handler( dev, pfault, qp, &wqe, &wqe_end, bytes_copied); } else if (qp && !sq) { - ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_responder_pfault_handler_rq( @@ -1253,8 +1260,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, } else if (!qp) { struct mlx5_ib_srq *srq = res_to_srq(res); - ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_responder_pfault_handler_srq( diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7e51870e9e01..ae7cbd9c9bca 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type) * * Return: zero on success, or an error code. */ -static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, - void *buffer, - u32 buflen, - int wqe_index, - int wq_offset, - int wq_wqe_cnt, - int wq_wqe_shift, - int bcnt, +static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer, + size_t buflen, int wqe_index, + int wq_offset, int wq_wqe_cnt, + int wq_wqe_shift, int bcnt, size_t *bytes_copied) { size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift); @@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, return 0; } -int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct mlx5_wqe_ctrl_seg *ctrl; + size_t bytes_copied = 0; + size_t wqe_length; + void *p; + int ds; + + wqe_index = wqe_index & qp->sq.fbc.sz_m1; + + /* read the control segment first */ + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + ctrl = p; + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + wqe_length = ds * MLX5_WQE_DS_UNITS; + + /* read rest of WQE if it spreads over more than one stride */ + while (bytes_copied < wqe_length) { + size_t copy_length = + min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB); + + if (!copy_length) + break; + + memcpy(buffer + bytes_copied, p, copy_length); + bytes_copied += copy_length; + + wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1; + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + } + *bc = bytes_copied; + return 0; +} + +static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct ib_umem *umem = base->ubuffer.umem; @@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int ret; int ds; - if (buflen < sizeof(*ctrl)) - return -EINVAL; - /* at first read as much as possible */ - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, - buflen, + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, &bytes_copied); if (ret) return ret; @@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, * so read the remaining bytes starting * from wqe_index 0 */ - ret = mlx5_ib_read_user_wqe_common(umem, - buffer + bytes_copied, - buflen - bytes_copied, - 0, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, + ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied, + buflen - bytes_copied, 0, wq->offset, + wq->wqe_cnt, wq->wqe_shift, wqe_length - bytes_copied, &bytes_copied2); @@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, return 0; } -int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + + if (buflen < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (!umem) + return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer, + buflen, bc); + + return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct ib_umem *umem = base->ubuffer.umem; @@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, size_t bytes_copied; int ret; - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, - buflen, + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, &bytes_copied); if (ret) @@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, return 0; } -int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + struct mlx5_ib_wq *wq = &qp->rq; + size_t wqe_size = 1 << wq->wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct ib_umem *umem = srq->umem; size_t bytes_copied; int ret; - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - 0, - srq->msrq.max, - srq->msrq.wqe_shift, - buflen, - &bytes_copied); + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0, + srq->msrq.max, srq->msrq.wqe_shift, + buflen, &bytes_copied); if (ret) return ret; @@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, return 0; } +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct ib_umem *umem = srq->umem; + size_t wqe_size = 1 << srq->msrq.wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc); +} + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; @@ -749,7 +796,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, { int err; - *umem = ib_umem_get(udata, addr, size, 0); + *umem = ib_umem_get(&dev->ib_dev, addr, size, 0); if (IS_ERR(*umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); return PTR_ERR(*umem); @@ -806,7 +853,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (!ucmd->buf_addr) return -EINVAL; - rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0); + rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0); if (IS_ERR(rwq->umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); err = PTR_ERR(rwq->umem); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 62939df3c692..b1a8a9175040 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); - srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); + srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0); if (IS_ERR(srq->umem)) { mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); err = PTR_ERR(srq->umem); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 33002530fee7..ac19d57803b5 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -880,7 +880,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(udata, start, length, acc); + mr->umem = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 9bc1ca6f6f9e..d47ea675734b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -869,7 +869,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(status); - mr->umem = ib_umem_get(udata, start, len, acc); + mr->umem = ib_umem_get(ibpd->device, start, len, acc); if (IS_ERR(mr->umem)) { status = -EFAULT; goto umem_err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 4cd292966aa9..920f35e28cfc 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -772,7 +772,7 @@ static inline int qedr_init_user_queue(struct ib_udata *udata, q->buf_addr = buf_addr; q->buf_len = buf_len; - q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access); + q->umem = ib_umem_get(&dev->ibdev, q->buf_addr, q->buf_len, access); if (IS_ERR(q->umem)) { DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", PTR_ERR(q->umem)); @@ -1415,9 +1415,8 @@ static int qedr_init_srq_user_params(struct ib_udata *udata, if (rc) return rc; - srq->prod_umem = - ib_umem_get(udata, ureq->prod_pair_addr, - sizeof(struct rdma_srq_producers), access); + srq->prod_umem = ib_umem_get(srq->ibsrq.device, ureq->prod_pair_addr, + sizeof(struct rdma_srq_producers), access); if (IS_ERR(srq->prod_umem)) { qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); ib_umem_release(srq->usrq.umem); @@ -2839,7 +2838,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr->type = QEDR_MR_USER; - mr->umem = ib_umem_get(udata, start, len, acc); + mr->umem = ib_umem_get(ibpd->device, start, len, acc); if (IS_ERR(mr->umem)) { rc = -EFAULT; goto err0; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index a26a4fd86bf4..4f6cc0de7ef9 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -135,7 +135,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_cq; } - cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, ucmd.buf_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->umem)) { ret = PTR_ERR(cq->umem); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index c61e665ff261..b039f1f00e05 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -126,7 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(-EINVAL); } - umem = ib_umem_get(udata, start, length, access_flags); + umem = ib_umem_get(pd->device, start, length, access_flags); if (IS_ERR(umem)) { dev_warn(&dev->pdev->dev, "could not get umem for mem region\n"); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index f15809c28f67..9de1281f9a3b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -276,8 +276,9 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, if (!is_srq) { /* set qp->sq.wqe_cnt, shift, buf_size.. */ - qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr, - ucmd.rbuf_size, 0); + qp->rumem = + ib_umem_get(pd->device, ucmd.rbuf_addr, + ucmd.rbuf_size, 0); if (IS_ERR(qp->rumem)) { ret = PTR_ERR(qp->rumem); goto err_qp; @@ -288,7 +289,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, qp->srq = to_vsrq(init_attr->srq); } - qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr, + qp->sumem = ib_umem_get(pd->device, ucmd.sbuf_addr, ucmd.sbuf_size, 0); if (IS_ERR(qp->sumem)) { if (!is_srq) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c index 98c8be71d91d..d330decfb80a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c @@ -146,7 +146,7 @@ int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr, goto err_srq; } - srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0); + srq->umem = ib_umem_get(ibsrq->device, ucmd.buf_addr, ucmd.buf_size, 0); if (IS_ERR(srq->umem)) { ret = PTR_ERR(srq->umem); goto err_srq; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index b9a76bf74857..72f6534fbb52 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -390,7 +390,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (length == 0) return ERR_PTR(-EINVAL); - umem = ib_umem_get(udata, start, length, mr_access_flags); + umem = ib_umem_get(pd->device, start, length, mr_access_flags); if (IS_ERR(umem)) return (void *)umem; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 35a2baf2f364..e83c7b518bfa 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -169,7 +169,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, void *vaddr; int err; - umem = ib_umem_get(udata, start, length, access); + umem = ib_umem_get(pd->ibpd.device, start, length, access); if (IS_ERR(umem)) { pr_warn("err %d from rxe_umem_get\n", (int)PTR_ERR(umem)); diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index d7dd6fcf2db0..cb6e3a5f509c 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -224,13 +224,13 @@ static void __pass_event(struct evdev_client *client, */ client->tail = (client->head - 2) & (client->bufsize - 1); - client->buffer[client->tail].input_event_sec = - event->input_event_sec; - client->buffer[client->tail].input_event_usec = - event->input_event_usec; - client->buffer[client->tail].type = EV_SYN; - client->buffer[client->tail].code = SYN_DROPPED; - client->buffer[client->tail].value = 0; + client->buffer[client->tail] = (struct input_event) { + .input_event_sec = event->input_event_sec, + .input_event_usec = event->input_event_usec, + .type = EV_SYN, + .code = SYN_DROPPED, + .value = 0, + }; client->packet_head = client->tail; } @@ -484,10 +484,7 @@ static int evdev_open(struct inode *inode, struct file *file) struct evdev_client *client; int error; - client = kzalloc(struct_size(client, buffer, bufsize), - GFP_KERNEL | __GFP_NOWARN); - if (!client) - client = vzalloc(struct_size(client, buffer, bufsize)); + client = kvzalloc(struct_size(client, buffer, bufsize), GFP_KERNEL); if (!client) return -ENOMEM; diff --git a/drivers/input/input.c b/drivers/input/input.c index 55086279d044..ee6c3234df36 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -878,16 +878,18 @@ static int input_default_setkeycode(struct input_dev *dev, } } - __clear_bit(*old_keycode, dev->keybit); - __set_bit(ke->keycode, dev->keybit); - - for (i = 0; i < dev->keycodemax; i++) { - if (input_fetch_keycode(dev, i) == *old_keycode) { - __set_bit(*old_keycode, dev->keybit); - break; /* Setting the bit twice is useless, so break */ + if (*old_keycode <= KEY_MAX) { + __clear_bit(*old_keycode, dev->keybit); + for (i = 0; i < dev->keycodemax; i++) { + if (input_fetch_keycode(dev, i) == *old_keycode) { + __set_bit(*old_keycode, dev->keybit); + /* Setting the bit twice is useless, so break */ + break; + } } } + __set_bit(ke->keycode, dev->keybit); return 0; } @@ -943,9 +945,13 @@ int input_set_keycode(struct input_dev *dev, * Simulate keyup event if keycode is not present * in the keymap anymore */ - if (test_bit(EV_KEY, dev->evbit) && - !is_event_supported(old_keycode, dev->keybit, KEY_MAX) && - __test_and_clear_bit(old_keycode, dev->key)) { + if (old_keycode > KEY_MAX) { + dev_warn(dev->dev.parent ?: &dev->dev, + "%s: got too big old keycode %#x\n", + __func__, old_keycode); + } else if (test_bit(EV_KEY, dev->evbit) && + !is_event_supported(old_keycode, dev->keybit, KEY_MAX) && + __test_and_clear_bit(old_keycode, dev->key)) { struct input_value vals[] = { { EV_KEY, old_keycode, 0 }, input_value_sync diff --git a/drivers/input/keyboard/imx_sc_key.c b/drivers/input/keyboard/imx_sc_key.c index 53799527dc75..9f809aeb785c 100644 --- a/drivers/input/keyboard/imx_sc_key.c +++ b/drivers/input/keyboard/imx_sc_key.c @@ -78,7 +78,13 @@ static void imx_sc_check_for_events(struct work_struct *work) return; } - state = (bool)msg.state; + /* + * The response data from SCU firmware is 4 bytes, + * but ONLY the first byte is the key state, other + * 3 bytes could be some dirty data, so we should + * ONLY take the first byte as key state. + */ + state = (bool)(msg.state & 0xff); if (state ^ priv->keystate) { priv->keystate = state; diff --git a/drivers/input/misc/keyspan_remote.c b/drivers/input/misc/keyspan_remote.c index 83368f1e7c4e..4650f4a94989 100644 --- a/drivers/input/misc/keyspan_remote.c +++ b/drivers/input/misc/keyspan_remote.c @@ -336,7 +336,8 @@ static int keyspan_setup(struct usb_device* dev) int retval = 0; retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), - 0x11, 0x40, 0x5601, 0x0, NULL, 0, 0); + 0x11, 0x40, 0x5601, 0x0, NULL, 0, + USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set bit rate due to error: %d\n", __func__, retval); @@ -344,7 +345,8 @@ static int keyspan_setup(struct usb_device* dev) } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), - 0x44, 0x40, 0x0, 0x0, NULL, 0, 0); + 0x44, 0x40, 0x0, 0x0, NULL, 0, + USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set resume sensitivity due to error: %d\n", __func__, retval); @@ -352,7 +354,8 @@ static int keyspan_setup(struct usb_device* dev) } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), - 0x22, 0x40, 0x0, 0x0, NULL, 0, 0); + 0x22, 0x40, 0x0, 0x0, NULL, 0, + USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to turn receive on due to error: %d\n", __func__, retval); diff --git a/drivers/input/misc/max77650-onkey.c b/drivers/input/misc/max77650-onkey.c index 4d875f2ac13d..ee55f22dbca5 100644 --- a/drivers/input/misc/max77650-onkey.c +++ b/drivers/input/misc/max77650-onkey.c @@ -108,9 +108,16 @@ static int max77650_onkey_probe(struct platform_device *pdev) return input_register_device(onkey->input); } +static const struct of_device_id max77650_onkey_of_match[] = { + { .compatible = "maxim,max77650-onkey" }, + { } +}; +MODULE_DEVICE_TABLE(of, max77650_onkey_of_match); + static struct platform_driver max77650_onkey_driver = { .driver = { .name = "max77650-onkey", + .of_match_table = max77650_onkey_of_match, }, .probe = max77650_onkey_probe, }; diff --git a/drivers/input/misc/pm8xxx-vibrator.c b/drivers/input/misc/pm8xxx-vibrator.c index ecd762f93732..53ad25eaf1a2 100644 --- a/drivers/input/misc/pm8xxx-vibrator.c +++ b/drivers/input/misc/pm8xxx-vibrator.c @@ -90,7 +90,7 @@ static int pm8xxx_vib_set(struct pm8xxx_vib *vib, bool on) if (regs->enable_mask) rc = regmap_update_bits(vib->regmap, regs->enable_addr, - on ? regs->enable_mask : 0, val); + regs->enable_mask, on ? ~0 : 0); return rc; } diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index fd253781be71..f2593133e524 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -74,12 +74,16 @@ static int uinput_dev_event(struct input_dev *dev, struct uinput_device *udev = input_get_drvdata(dev); struct timespec64 ts; - udev->buff[udev->head].type = type; - udev->buff[udev->head].code = code; - udev->buff[udev->head].value = value; ktime_get_ts64(&ts); - udev->buff[udev->head].input_event_sec = ts.tv_sec; - udev->buff[udev->head].input_event_usec = ts.tv_nsec / NSEC_PER_USEC; + + udev->buff[udev->head] = (struct input_event) { + .input_event_sec = ts.tv_sec, + .input_event_usec = ts.tv_nsec / NSEC_PER_USEC, + .type = type, + .code = code, + .value = value, + }; + udev->head = (udev->head + 1) % UINPUT_BUFFER_SIZE; wake_up_interruptible(&udev->waitq); @@ -689,13 +693,14 @@ static ssize_t uinput_read(struct file *file, char __user *buffer, static __poll_t uinput_poll(struct file *file, poll_table *wait) { struct uinput_device *udev = file->private_data; + __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* uinput is always writable */ poll_wait(file, &udev->waitq, wait); if (udev->head != udev->tail) - return EPOLLIN | EPOLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; - return EPOLLOUT | EPOLLWRNORM; + return mask; } static int uinput_release(struct inode *inode, struct file *file) diff --git a/drivers/input/rmi4/rmi_f54.c b/drivers/input/rmi4/rmi_f54.c index 0bc01cfc2b51..6b23e679606e 100644 --- a/drivers/input/rmi4/rmi_f54.c +++ b/drivers/input/rmi4/rmi_f54.c @@ -24,6 +24,12 @@ #define F54_NUM_TX_OFFSET 1 #define F54_NUM_RX_OFFSET 0 +/* + * The smbus protocol can read only 32 bytes max at a time. + * But this should be fine for i2c/spi as well. + */ +#define F54_REPORT_DATA_SIZE 32 + /* F54 commands */ #define F54_GET_REPORT 1 #define F54_FORCE_CAL 2 @@ -526,6 +532,7 @@ static void rmi_f54_work(struct work_struct *work) int report_size; u8 command; int error; + int i; report_size = rmi_f54_get_report_size(f54); if (report_size == 0) { @@ -558,23 +565,27 @@ static void rmi_f54_work(struct work_struct *work) rmi_dbg(RMI_DEBUG_FN, &fn->dev, "Get report command completed, reading data\n"); - fifo[0] = 0; - fifo[1] = 0; - error = rmi_write_block(fn->rmi_dev, - fn->fd.data_base_addr + F54_FIFO_OFFSET, - fifo, sizeof(fifo)); - if (error) { - dev_err(&fn->dev, "Failed to set fifo start offset\n"); - goto abort; - } + for (i = 0; i < report_size; i += F54_REPORT_DATA_SIZE) { + int size = min(F54_REPORT_DATA_SIZE, report_size - i); + + fifo[0] = i & 0xff; + fifo[1] = i >> 8; + error = rmi_write_block(fn->rmi_dev, + fn->fd.data_base_addr + F54_FIFO_OFFSET, + fifo, sizeof(fifo)); + if (error) { + dev_err(&fn->dev, "Failed to set fifo start offset\n"); + goto abort; + } - error = rmi_read_block(fn->rmi_dev, fn->fd.data_base_addr + - F54_REPORT_DATA_OFFSET, f54->report_data, - report_size); - if (error) { - dev_err(&fn->dev, "%s: read [%d bytes] returned %d\n", - __func__, report_size, error); - goto abort; + error = rmi_read_block(fn->rmi_dev, fn->fd.data_base_addr + + F54_REPORT_DATA_OFFSET, + f54->report_data + i, size); + if (error) { + dev_err(&fn->dev, "%s: read [%d bytes] returned %d\n", + __func__, size, error); + goto abort; + } } abort: diff --git a/drivers/input/rmi4/rmi_smbus.c b/drivers/input/rmi4/rmi_smbus.c index b313c579914f..2407ea43de59 100644 --- a/drivers/input/rmi4/rmi_smbus.c +++ b/drivers/input/rmi4/rmi_smbus.c @@ -163,6 +163,7 @@ static int rmi_smb_write_block(struct rmi_transport_dev *xport, u16 rmiaddr, /* prepare to write next block of bytes */ cur_len -= SMB_MAX_COUNT; databuff += SMB_MAX_COUNT; + rmiaddr += SMB_MAX_COUNT; } exit: mutex_unlock(&rmi_smb->page_mutex); @@ -214,6 +215,7 @@ static int rmi_smb_read_block(struct rmi_transport_dev *xport, u16 rmiaddr, /* prepare to read next block of bytes */ cur_len -= SMB_MAX_COUNT; databuff += SMB_MAX_COUNT; + rmiaddr += SMB_MAX_COUNT; } retval = 0; diff --git a/drivers/input/tablet/aiptek.c b/drivers/input/tablet/aiptek.c index 2ca586fb914f..e08b0ef078e8 100644 --- a/drivers/input/tablet/aiptek.c +++ b/drivers/input/tablet/aiptek.c @@ -1713,7 +1713,7 @@ aiptek_probe(struct usb_interface *intf, const struct usb_device_id *id) aiptek->inputdev = inputdev; aiptek->intf = intf; - aiptek->ifnum = intf->altsetting[0].desc.bInterfaceNumber; + aiptek->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; aiptek->inDelay = 0; aiptek->endDelay = 0; aiptek->previousJitterable = 0; @@ -1802,14 +1802,14 @@ aiptek_probe(struct usb_interface *intf, const struct usb_device_id *id) input_set_abs_params(inputdev, ABS_WHEEL, AIPTEK_WHEEL_MIN, AIPTEK_WHEEL_MAX - 1, 0, 0); /* Verify that a device really has an endpoint */ - if (intf->altsetting[0].desc.bNumEndpoints < 1) { + if (intf->cur_altsetting->desc.bNumEndpoints < 1) { dev_err(&intf->dev, "interface has %d endpoints, but must have minimum 1\n", - intf->altsetting[0].desc.bNumEndpoints); + intf->cur_altsetting->desc.bNumEndpoints); err = -EINVAL; goto fail3; } - endpoint = &intf->altsetting[0].endpoint[0].desc; + endpoint = &intf->cur_altsetting->endpoint[0].desc; /* Go set up our URB, which is called when the tablet receives * input. diff --git a/drivers/input/tablet/gtco.c b/drivers/input/tablet/gtco.c index 35031228a6d0..96d65575f75a 100644 --- a/drivers/input/tablet/gtco.c +++ b/drivers/input/tablet/gtco.c @@ -875,18 +875,14 @@ static int gtco_probe(struct usb_interface *usbinterface, } /* Sanity check that a device has an endpoint */ - if (usbinterface->altsetting[0].desc.bNumEndpoints < 1) { + if (usbinterface->cur_altsetting->desc.bNumEndpoints < 1) { dev_err(&usbinterface->dev, "Invalid number of endpoints\n"); error = -EINVAL; goto err_free_urb; } - /* - * The endpoint is always altsetting 0, we know this since we know - * this device only has one interrupt endpoint - */ - endpoint = &usbinterface->altsetting[0].endpoint[0].desc; + endpoint = &usbinterface->cur_altsetting->endpoint[0].desc; /* Some debug */ dev_dbg(&usbinterface->dev, "gtco # interfaces: %d\n", usbinterface->num_altsetting); @@ -896,7 +892,8 @@ static int gtco_probe(struct usb_interface *usbinterface, if (usb_endpoint_xfer_int(endpoint)) dev_dbg(&usbinterface->dev, "endpoint: we have interrupt endpoint\n"); - dev_dbg(&usbinterface->dev, "endpoint extra len:%d\n", usbinterface->altsetting[0].extralen); + dev_dbg(&usbinterface->dev, "interface extra len:%d\n", + usbinterface->cur_altsetting->extralen); /* * Find the HID descriptor so we can find out the size of the @@ -973,8 +970,6 @@ static int gtco_probe(struct usb_interface *usbinterface, input_dev->dev.parent = &usbinterface->dev; /* Setup the URB, it will be posted later on open of input device */ - endpoint = &usbinterface->altsetting[0].endpoint[0].desc; - usb_fill_int_urb(gtco->urbinfo, udev, usb_rcvintpipe(udev, diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c index a1f3a0cb197e..38f087404f7a 100644 --- a/drivers/input/tablet/pegasus_notetaker.c +++ b/drivers/input/tablet/pegasus_notetaker.c @@ -275,7 +275,7 @@ static int pegasus_probe(struct usb_interface *intf, return -ENODEV; /* Sanity check that the device has an endpoint */ - if (intf->altsetting[0].desc.bNumEndpoints < 1) { + if (intf->cur_altsetting->desc.bNumEndpoints < 1) { dev_err(&intf->dev, "Invalid number of endpoints\n"); return -EINVAL; } diff --git a/drivers/input/touchscreen/sun4i-ts.c b/drivers/input/touchscreen/sun4i-ts.c index 0af0fe8c40d7..742a7e96c1b5 100644 --- a/drivers/input/touchscreen/sun4i-ts.c +++ b/drivers/input/touchscreen/sun4i-ts.c @@ -237,6 +237,7 @@ static int sun4i_ts_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; struct device *hwmon; + struct thermal_zone_device *thermal; int error; u32 reg; bool ts_attached; @@ -355,7 +356,10 @@ static int sun4i_ts_probe(struct platform_device *pdev) if (IS_ERR(hwmon)) return PTR_ERR(hwmon); - devm_thermal_zone_of_sensor_register(ts->dev, 0, ts, &sun4i_ts_tz_ops); + thermal = devm_thermal_zone_of_sensor_register(ts->dev, 0, ts, + &sun4i_ts_tz_ops); + if (IS_ERR(thermal)) + return PTR_ERR(thermal); writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC); diff --git a/drivers/input/touchscreen/sur40.c b/drivers/input/touchscreen/sur40.c index 1dd47dda71cd..34d31c7ec8ba 100644 --- a/drivers/input/touchscreen/sur40.c +++ b/drivers/input/touchscreen/sur40.c @@ -661,7 +661,7 @@ static int sur40_probe(struct usb_interface *interface, int error; /* Check if we really have the right interface. */ - iface_desc = &interface->altsetting[0]; + iface_desc = interface->cur_altsetting; if (iface_desc->desc.bInterfaceClass != 0xFF) return -ENODEV; diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 568c52317757..483f7bc379fa 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1655,27 +1655,39 @@ static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, static void init_iommu_perf_ctr(struct amd_iommu *iommu) { struct pci_dev *pdev = iommu->dev; - u64 val = 0xabcd, val2 = 0; + u64 val = 0xabcd, val2 = 0, save_reg = 0; if (!iommu_feature(iommu, FEATURE_PC)) return; amd_iommu_pc_present = true; + /* save the value to restore, if writable */ + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, false)) + goto pc_false; + /* Check if the performance counters can be written to */ if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) || (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) || - (val != val2)) { - pci_err(pdev, "Unable to write to IOMMU perf counter.\n"); - amd_iommu_pc_present = false; - return; - } + (val != val2)) + goto pc_false; + + /* restore */ + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, true)) + goto pc_false; pci_info(pdev, "IOMMU performance counters supported\n"); val = readl(iommu->mmio_base + MMIO_CNTR_CONF_OFFSET); iommu->max_banks = (u8) ((val >> 12) & 0x3f); iommu->max_counters = (u8) ((val >> 7) & 0xf); + + return; + +pc_false: + pci_err(pdev, "Unable to read/write to IOMMU perf counter.\n"); + amd_iommu_pc_present = false; + return; } static ssize_t amd_iommu_show_cap(struct device *dev, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index c363294b3bb9..a2e96a5fd9a7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1203,7 +1203,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_cookie *cookie; struct iommu_dma_msi_page *msi_page; static DEFINE_MUTEX(msi_prepare_lock); /* see below */ @@ -1212,8 +1211,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } - cookie = domain->iova_cookie; - /* * In fact the whole prepare operation should already be serialised by * irq_domain_mutex further up the callchain, but that's pretty subtle diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 42966611a192..932267f49f9a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5163,7 +5163,8 @@ static void dmar_remove_one_dev_info(struct device *dev) spin_lock_irqsave(&device_domain_lock, flags); info = dev->archdata.iommu; - if (info) + if (info && info != DEFER_DEVICE_DOMAIN_INFO + && info != DUMMY_DEVICE_DOMAIN_INFO) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5624,8 +5625,10 @@ static int intel_iommu_add_device(struct device *dev) group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto unlink; + } iommu_group_put(group); @@ -5651,7 +5654,8 @@ static int intel_iommu_add_device(struct device *dev) if (!get_private_domain_for_dev(dev)) { dev_warn(dev, "Failed to get a private domain.\n"); - return -ENOMEM; + ret = -ENOMEM; + goto unlink; } dev_info(dev, @@ -5666,6 +5670,10 @@ static int intel_iommu_add_device(struct device *dev) } return 0; + +unlink: + iommu_device_unlink(&iommu->iommu, dev); + return ret; } static void intel_iommu_remove_device(struct device *dev) @@ -5817,6 +5825,13 @@ static void intel_iommu_apply_resv_region(struct device *dev, WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); } +static struct iommu_group *intel_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + #ifdef CONFIG_INTEL_IOMMU_SVM struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) { @@ -5989,7 +6004,7 @@ const struct iommu_ops intel_iommu_ops = { .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = intel_iommu_put_resv_regions, .apply_resv_region = intel_iommu_apply_resv_region, - .device_group = pci_device_group, + .device_group = intel_iommu_device_group, .dev_has_feat = intel_iommu_dev_has_feat, .dev_feat_enabled = intel_iommu_dev_feat_enabled, .dev_enable_feat = intel_iommu_dev_enable_feat, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fdd40756dbc1..3ead597e1c57 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -751,6 +751,7 @@ err_put_group: mutex_unlock(&group->mutex); dev->iommu_group = NULL; kobject_put(group->devices_kobj); + sysfs_remove_link(group->devices_kobj, device->name); err_free_name: kfree(device->name); err_remove_link: diff --git a/drivers/irqchip/irq-ingenic.c b/drivers/irqchip/irq-ingenic.c index 01d18b39069e..c5589ee0dfb3 100644 --- a/drivers/irqchip/irq-ingenic.c +++ b/drivers/irqchip/irq-ingenic.c @@ -17,7 +17,6 @@ #include <linux/delay.h> #include <asm/io.h> -#include <asm/mach-jz4740/irq.h> struct ingenic_intc_data { void __iomem *base; @@ -50,7 +49,7 @@ static irqreturn_t intc_cascade(int irq, void *data) while (pending) { int bit = __fls(pending); - irq = irq_find_mapping(domain, bit + (i * 32)); + irq = irq_linear_revmap(domain, bit + (i * 32)); generic_handle_irq(irq); pending &= ~BIT(bit); } @@ -97,8 +96,7 @@ static int __init ingenic_intc_of_init(struct device_node *node, goto out_unmap_irq; } - domain = irq_domain_add_legacy(node, num_chips * 32, - JZ4740_IRQ_BASE, 0, + domain = irq_domain_add_linear(node, num_chips * 32, &irq_generic_chip_ops, NULL); if (!domain) { err = -ENOMEM; diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 8df547d2d935..0aca5807a119 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -256,7 +256,7 @@ static int __init plic_init(struct device_node *node, * Skip contexts other than external interrupts for our * privilege level. */ - if (parent.args[0] != IRQ_EXT) + if (parent.args[0] != RV_IRQ_EXT) continue; hartid = plic_find_hart_id(parent.np); diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index b7e0ae1af8fa..e8922fa03379 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -493,16 +493,17 @@ static int as3645a_parse_node(struct as3645a *flash, switch (id) { case AS_LED_FLASH: flash->flash_node = child; + fwnode_handle_get(child); break; case AS_LED_INDICATOR: flash->indicator_node = child; + fwnode_handle_get(child); break; default: dev_warn(&flash->client->dev, "unknown LED %u encountered, ignoring\n", id); break; } - fwnode_handle_get(child); } if (!flash->flash_node) { diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index a5c73f3d5f79..2bf74595610f 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -151,9 +151,14 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev) struct gpio_led led = {}; const char *state = NULL; + /* + * Acquire gpiod from DT with uninitialized label, which + * will be updated after LED class device is registered, + * Only then the final LED name is known. + */ led.gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL, child, GPIOD_ASIS, - led.name); + NULL); if (IS_ERR(led.gpiod)) { fwnode_handle_put(child); return ERR_CAST(led.gpiod); @@ -186,6 +191,9 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev) fwnode_handle_put(child); return ERR_PTR(ret); } + /* Set gpiod label to match the corresponding LED name. */ + gpiod_set_consumer_name(led_dat->gpiod, + led_dat->cdev.dev->kobj.name); priv->num_leds++; } diff --git a/drivers/leds/leds-lm3532.c b/drivers/leds/leds-lm3532.c index 0507c6575c08..491268bb34a7 100644 --- a/drivers/leds/leds-lm3532.c +++ b/drivers/leds/leds-lm3532.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // TI LM3532 LED driver // Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/ +// http://www.ti.com/lit/ds/symlink/lm3532.pdf #include <linux/i2c.h> #include <linux/leds.h> @@ -623,7 +624,7 @@ static int lm3532_parse_node(struct lm3532_data *priv) led->num_leds = fwnode_property_count_u32(child, "led-sources"); if (led->num_leds > LM3532_MAX_LED_STRINGS) { - dev_err(&priv->client->dev, "To many LED string defined\n"); + dev_err(&priv->client->dev, "Too many LED string defined\n"); continue; } diff --git a/drivers/leds/leds-max77650.c b/drivers/leds/leds-max77650.c index 4c2d0b3c6dad..a0d4b725c917 100644 --- a/drivers/leds/leds-max77650.c +++ b/drivers/leds/leds-max77650.c @@ -135,9 +135,16 @@ err_node_put: return rv; } +static const struct of_device_id max77650_led_of_match[] = { + { .compatible = "maxim,max77650-led" }, + { } +}; +MODULE_DEVICE_TABLE(of, max77650_led_of_match); + static struct platform_driver max77650_led_driver = { .driver = { .name = "max77650-led", + .of_match_table = max77650_led_of_match, }, .probe = max77650_led_probe, }; diff --git a/drivers/leds/leds-rb532.c b/drivers/leds/leds-rb532.c index db5af83f0cec..b6447c1721b4 100644 --- a/drivers/leds/leds-rb532.c +++ b/drivers/leds/leds-rb532.c @@ -21,7 +21,6 @@ static void rb532_led_set(struct led_classdev *cdev, { if (brightness) set_latch_u5(LO_ULED, 0); - else set_latch_u5(0, LO_ULED); } diff --git a/drivers/leds/trigger/ledtrig-pattern.c b/drivers/leds/trigger/ledtrig-pattern.c index 718729c89440..3abcafe46278 100644 --- a/drivers/leds/trigger/ledtrig-pattern.c +++ b/drivers/leds/trigger/ledtrig-pattern.c @@ -455,7 +455,7 @@ static void __exit pattern_trig_exit(void) module_init(pattern_trig_init); module_exit(pattern_trig_exit); -MODULE_AUTHOR("Raphael Teysseyre <rteysseyre@gmail.com"); -MODULE_AUTHOR("Baolin Wang <baolin.wang@linaro.org"); +MODULE_AUTHOR("Raphael Teysseyre <rteysseyre@gmail.com>"); +MODULE_AUTHOR("Baolin Wang <baolin.wang@linaro.org>"); MODULE_DESCRIPTION("LED Pattern trigger"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3c50c4e4da8f..963d3774c93e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -17,7 +17,7 @@ #include <linux/dm-bufio.h> #define DM_MSG_PREFIX "persistent snapshot" -#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */ #define DM_PREFETCH_CHUNKS 12 diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index b7c20979bd19..322386ff5d22 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -87,7 +87,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - unsigned short blksize = 512; + unsigned blksize = 512; *private_conf = ERR_PTR(-ENOMEM); if (!conf) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 9340435a94a0..6c95dc471d4c 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -380,7 +380,8 @@ static void cec_data_cancel(struct cec_data *data, u8 tx_status) } else { list_del_init(&data->list); if (!(data->msg.tx_status & CEC_TX_STATUS_OK)) - data->adap->transmit_queue_sz--; + if (!WARN_ON(!data->adap->transmit_queue_sz)) + data->adap->transmit_queue_sz--; } if (data->msg.tx_status & CEC_TX_STATUS_OK) { @@ -432,6 +433,14 @@ static void cec_flush(struct cec_adapter *adap) * need to do anything special in that case. */ } + /* + * If something went wrong and this counter isn't what it should + * be, then this will reset it back to 0. Warn if it is not 0, + * since it indicates a bug, either in this framework or in a + * CEC driver. + */ + if (WARN_ON(adap->transmit_queue_sz)) + adap->transmit_queue_sz = 0; } /* @@ -456,7 +465,7 @@ int cec_thread_func(void *_adap) bool timeout = false; u8 attempts; - if (adap->transmitting) { + if (adap->transmit_in_progress) { int err; /* @@ -491,7 +500,7 @@ int cec_thread_func(void *_adap) goto unlock; } - if (adap->transmitting && timeout) { + if (adap->transmit_in_progress && timeout) { /* * If we timeout, then log that. Normally this does * not happen and it is an indication of a faulty CEC @@ -500,14 +509,18 @@ int cec_thread_func(void *_adap) * so much traffic on the bus that the adapter was * unable to transmit for CEC_XFER_TIMEOUT_MS (2.1s). */ - pr_warn("cec-%s: message %*ph timed out\n", adap->name, - adap->transmitting->msg.len, - adap->transmitting->msg.msg); + if (adap->transmitting) { + pr_warn("cec-%s: message %*ph timed out\n", adap->name, + adap->transmitting->msg.len, + adap->transmitting->msg.msg); + /* Just give up on this. */ + cec_data_cancel(adap->transmitting, + CEC_TX_STATUS_TIMEOUT); + } else { + pr_warn("cec-%s: transmit timed out\n", adap->name); + } adap->transmit_in_progress = false; adap->tx_timeouts++; - /* Just give up on this. */ - cec_data_cancel(adap->transmitting, - CEC_TX_STATUS_TIMEOUT); goto unlock; } @@ -522,7 +535,8 @@ int cec_thread_func(void *_adap) data = list_first_entry(&adap->transmit_queue, struct cec_data, list); list_del_init(&data->list); - adap->transmit_queue_sz--; + if (!WARN_ON(!data->adap->transmit_queue_sz)) + adap->transmit_queue_sz--; /* Make this the current transmitting message */ adap->transmitting = data; @@ -1085,11 +1099,11 @@ void cec_received_msg_ts(struct cec_adapter *adap, valid_la = false; else if (!cec_msg_is_broadcast(msg) && !(dir_fl & DIRECTED)) valid_la = false; - else if (cec_msg_is_broadcast(msg) && !(dir_fl & BCAST1_4)) + else if (cec_msg_is_broadcast(msg) && !(dir_fl & BCAST)) valid_la = false; else if (cec_msg_is_broadcast(msg) && - adap->log_addrs.cec_version >= CEC_OP_CEC_VERSION_2_0 && - !(dir_fl & BCAST2_0)) + adap->log_addrs.cec_version < CEC_OP_CEC_VERSION_2_0 && + !(dir_fl & BCAST1_4)) valid_la = false; } if (valid_la && min_len) { diff --git a/drivers/media/usb/pulse8-cec/pulse8-cec.c b/drivers/media/usb/pulse8-cec/pulse8-cec.c index ac88ade94cda..59609556d969 100644 --- a/drivers/media/usb/pulse8-cec/pulse8-cec.c +++ b/drivers/media/usb/pulse8-cec/pulse8-cec.c @@ -116,6 +116,7 @@ struct pulse8 { unsigned int vers; struct completion cmd_done; struct work_struct work; + u8 work_result; struct delayed_work ping_eeprom_work; struct cec_msg rx_msg; u8 data[DATA_SIZE]; @@ -137,8 +138,10 @@ static void pulse8_irq_work_handler(struct work_struct *work) { struct pulse8 *pulse8 = container_of(work, struct pulse8, work); + u8 result = pulse8->work_result; - switch (pulse8->data[0] & 0x3f) { + pulse8->work_result = 0; + switch (result & 0x3f) { case MSGCODE_FRAME_DATA: cec_received_msg(pulse8->adap, &pulse8->rx_msg); break; @@ -172,12 +175,12 @@ static irqreturn_t pulse8_interrupt(struct serio *serio, unsigned char data, pulse8->escape = false; } else if (data == MSGEND) { struct cec_msg *msg = &pulse8->rx_msg; + u8 msgcode = pulse8->buf[0]; if (debug) dev_info(pulse8->dev, "received: %*ph\n", pulse8->idx, pulse8->buf); - pulse8->data[0] = pulse8->buf[0]; - switch (pulse8->buf[0] & 0x3f) { + switch (msgcode & 0x3f) { case MSGCODE_FRAME_START: msg->len = 1; msg->msg[0] = pulse8->buf[1]; @@ -186,14 +189,20 @@ static irqreturn_t pulse8_interrupt(struct serio *serio, unsigned char data, if (msg->len == CEC_MAX_MSG_SIZE) break; msg->msg[msg->len++] = pulse8->buf[1]; - if (pulse8->buf[0] & MSGCODE_FRAME_EOM) + if (msgcode & MSGCODE_FRAME_EOM) { + WARN_ON(pulse8->work_result); + pulse8->work_result = msgcode; schedule_work(&pulse8->work); + break; + } break; case MSGCODE_TRANSMIT_SUCCEEDED: case MSGCODE_TRANSMIT_FAILED_LINE: case MSGCODE_TRANSMIT_FAILED_ACK: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_DATA: case MSGCODE_TRANSMIT_FAILED_TIMEOUT_LINE: + WARN_ON(pulse8->work_result); + pulse8->work_result = msgcode; schedule_work(&pulse8->work); break; case MSGCODE_HIGH_ERROR: diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index f9ac22413000..1074b882c57c 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -100,19 +100,19 @@ struct buflist { * Function prototypes. Called from OS entry point mptctl_ioctl. * arg contents specific to function. */ -static int mptctl_fw_download(unsigned long arg); -static int mptctl_getiocinfo(unsigned long arg, unsigned int cmd); -static int mptctl_gettargetinfo(unsigned long arg); -static int mptctl_readtest(unsigned long arg); -static int mptctl_mpt_command(unsigned long arg); -static int mptctl_eventquery(unsigned long arg); -static int mptctl_eventenable(unsigned long arg); -static int mptctl_eventreport(unsigned long arg); -static int mptctl_replace_fw(unsigned long arg); - -static int mptctl_do_reset(unsigned long arg); -static int mptctl_hp_hostinfo(unsigned long arg, unsigned int cmd); -static int mptctl_hp_targetinfo(unsigned long arg); +static int mptctl_fw_download(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_getiocinfo(MPT_ADAPTER *iocp, unsigned long arg, unsigned int cmd); +static int mptctl_gettargetinfo(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_readtest(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_mpt_command(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_eventquery(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_eventenable(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_eventreport(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_replace_fw(MPT_ADAPTER *iocp, unsigned long arg); + +static int mptctl_do_reset(MPT_ADAPTER *iocp, unsigned long arg); +static int mptctl_hp_hostinfo(MPT_ADAPTER *iocp, unsigned long arg, unsigned int cmd); +static int mptctl_hp_targetinfo(MPT_ADAPTER *iocp, unsigned long arg); static int mptctl_probe(struct pci_dev *, const struct pci_device_id *); static void mptctl_remove(struct pci_dev *); @@ -123,8 +123,8 @@ static long compat_mpctl_ioctl(struct file *f, unsigned cmd, unsigned long arg); /* * Private function calls. */ -static int mptctl_do_mpt_command(struct mpt_ioctl_command karg, void __user *mfPtr); -static int mptctl_do_fw_download(int ioc, char __user *ufwbuf, size_t fwlen); +static int mptctl_do_mpt_command(MPT_ADAPTER *iocp, struct mpt_ioctl_command karg, void __user *mfPtr); +static int mptctl_do_fw_download(MPT_ADAPTER *iocp, char __user *ufwbuf, size_t fwlen); static MptSge_t *kbuf_alloc_2_sgl(int bytes, u32 dir, int sge_offset, int *frags, struct buflist **blp, dma_addr_t *sglbuf_dma, MPT_ADAPTER *ioc); static void kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, @@ -656,19 +656,19 @@ __mptctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * by TM and FW reloads. */ if ((cmd & ~IOCSIZE_MASK) == (MPTIOCINFO & ~IOCSIZE_MASK)) { - return mptctl_getiocinfo(arg, _IOC_SIZE(cmd)); + return mptctl_getiocinfo(iocp, arg, _IOC_SIZE(cmd)); } else if (cmd == MPTTARGETINFO) { - return mptctl_gettargetinfo(arg); + return mptctl_gettargetinfo(iocp, arg); } else if (cmd == MPTTEST) { - return mptctl_readtest(arg); + return mptctl_readtest(iocp, arg); } else if (cmd == MPTEVENTQUERY) { - return mptctl_eventquery(arg); + return mptctl_eventquery(iocp, arg); } else if (cmd == MPTEVENTENABLE) { - return mptctl_eventenable(arg); + return mptctl_eventenable(iocp, arg); } else if (cmd == MPTEVENTREPORT) { - return mptctl_eventreport(arg); + return mptctl_eventreport(iocp, arg); } else if (cmd == MPTFWREPLACE) { - return mptctl_replace_fw(arg); + return mptctl_replace_fw(iocp, arg); } /* All of these commands require an interrupt or @@ -678,15 +678,15 @@ __mptctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; if (cmd == MPTFWDOWNLOAD) - ret = mptctl_fw_download(arg); + ret = mptctl_fw_download(iocp, arg); else if (cmd == MPTCOMMAND) - ret = mptctl_mpt_command(arg); + ret = mptctl_mpt_command(iocp, arg); else if (cmd == MPTHARDRESET) - ret = mptctl_do_reset(arg); + ret = mptctl_do_reset(iocp, arg); else if ((cmd & ~IOCSIZE_MASK) == (HP_GETHOSTINFO & ~IOCSIZE_MASK)) - ret = mptctl_hp_hostinfo(arg, _IOC_SIZE(cmd)); + ret = mptctl_hp_hostinfo(iocp, arg, _IOC_SIZE(cmd)); else if (cmd == HP_GETTARGETINFO) - ret = mptctl_hp_targetinfo(arg); + ret = mptctl_hp_targetinfo(iocp, arg); else ret = -EINVAL; @@ -705,11 +705,10 @@ mptctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } -static int mptctl_do_reset(unsigned long arg) +static int mptctl_do_reset(MPT_ADAPTER *iocp, unsigned long arg) { struct mpt_ioctl_diag_reset __user *urinfo = (void __user *) arg; struct mpt_ioctl_diag_reset krinfo; - MPT_ADAPTER *iocp; if (copy_from_user(&krinfo, urinfo, sizeof(struct mpt_ioctl_diag_reset))) { printk(KERN_ERR MYNAM "%s@%d::mptctl_do_reset - " @@ -718,12 +717,6 @@ static int mptctl_do_reset(unsigned long arg) return -EFAULT; } - if (mpt_verify_adapter(krinfo.hdr.iocnum, &iocp) < 0) { - printk(KERN_DEBUG MYNAM "%s@%d::mptctl_do_reset - ioc%d not found!\n", - __FILE__, __LINE__, krinfo.hdr.iocnum); - return -ENODEV; /* (-6) No such device or address */ - } - dctlprintk(iocp, printk(MYIOC_s_DEBUG_FMT "mptctl_do_reset called.\n", iocp->name)); @@ -754,7 +747,7 @@ static int mptctl_do_reset(unsigned long arg) * -ENOMSG if FW upload returned bad status */ static int -mptctl_fw_download(unsigned long arg) +mptctl_fw_download(MPT_ADAPTER *iocp, unsigned long arg) { struct mpt_fw_xfer __user *ufwdl = (void __user *) arg; struct mpt_fw_xfer kfwdl; @@ -766,7 +759,7 @@ mptctl_fw_download(unsigned long arg) return -EFAULT; } - return mptctl_do_fw_download(kfwdl.iocnum, kfwdl.bufp, kfwdl.fwlen); + return mptctl_do_fw_download(iocp, kfwdl.bufp, kfwdl.fwlen); } /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ @@ -784,11 +777,10 @@ mptctl_fw_download(unsigned long arg) * -ENOMSG if FW upload returned bad status */ static int -mptctl_do_fw_download(int ioc, char __user *ufwbuf, size_t fwlen) +mptctl_do_fw_download(MPT_ADAPTER *iocp, char __user *ufwbuf, size_t fwlen) { FWDownload_t *dlmsg; MPT_FRAME_HDR *mf; - MPT_ADAPTER *iocp; FWDownloadTCSGE_t *ptsge; MptSge_t *sgl, *sgIn; char *sgOut; @@ -808,17 +800,10 @@ mptctl_do_fw_download(int ioc, char __user *ufwbuf, size_t fwlen) pFWDownloadReply_t ReplyMsg = NULL; unsigned long timeleft; - if (mpt_verify_adapter(ioc, &iocp) < 0) { - printk(KERN_DEBUG MYNAM "ioctl_fwdl - ioc%d not found!\n", - ioc); - return -ENODEV; /* (-6) No such device or address */ - } else { - - /* Valid device. Get a message frame and construct the FW download message. - */ - if ((mf = mpt_get_msg_frame(mptctl_id, iocp)) == NULL) - return -EAGAIN; - } + /* Valid device. Get a message frame and construct the FW download message. + */ + if ((mf = mpt_get_msg_frame(mptctl_id, iocp)) == NULL) + return -EAGAIN; dctlprintk(iocp, printk(MYIOC_s_DEBUG_FMT "mptctl_do_fwdl called. mptctl_id = %xh.\n", iocp->name, mptctl_id)); @@ -826,8 +811,6 @@ mptctl_do_fw_download(int ioc, char __user *ufwbuf, size_t fwlen) iocp->name, ufwbuf)); dctlprintk(iocp, printk(MYIOC_s_DEBUG_FMT "DbG: kfwdl.fwlen = %d\n", iocp->name, (int)fwlen)); - dctlprintk(iocp, printk(MYIOC_s_DEBUG_FMT "DbG: kfwdl.ioc = %04xh\n", - iocp->name, ioc)); dlmsg = (FWDownload_t*) mf; ptsge = (FWDownloadTCSGE_t *) &dlmsg->SGL; @@ -1238,13 +1221,11 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE * -ENODEV if no such device/adapter */ static int -mptctl_getiocinfo (unsigned long arg, unsigned int data_size) +mptctl_getiocinfo (MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) { struct mpt_ioctl_iocinfo __user *uarg = (void __user *) arg; struct mpt_ioctl_iocinfo *karg; - MPT_ADAPTER *ioc; struct pci_dev *pdev; - int iocnum; unsigned int port; int cim_rev; struct scsi_device *sdev; @@ -1272,14 +1253,6 @@ mptctl_getiocinfo (unsigned long arg, unsigned int data_size) return PTR_ERR(karg); } - if (((iocnum = mpt_verify_adapter(karg->hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_getiocinfo() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - kfree(karg); - return -ENODEV; - } - /* Verify the data transfer size is correct. */ if (karg->hdr.maxDataSize != data_size) { printk(MYIOC_s_ERR_FMT "%s@%d::mptctl_getiocinfo - " @@ -1385,15 +1358,13 @@ mptctl_getiocinfo (unsigned long arg, unsigned int data_size) * -ENODEV if no such device/adapter */ static int -mptctl_gettargetinfo (unsigned long arg) +mptctl_gettargetinfo (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_targetinfo __user *uarg = (void __user *) arg; struct mpt_ioctl_targetinfo karg; - MPT_ADAPTER *ioc; VirtDevice *vdevice; char *pmem; int *pdata; - int iocnum; int numDevices = 0; int lun; int maxWordsLeft; @@ -1408,13 +1379,6 @@ mptctl_gettargetinfo (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_gettargetinfo() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_gettargetinfo called.\n", ioc->name)); /* Get the port number and set the maximum number of bytes @@ -1510,12 +1474,10 @@ mptctl_gettargetinfo (unsigned long arg) * -ENODEV if no such device/adapter */ static int -mptctl_readtest (unsigned long arg) +mptctl_readtest (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_test __user *uarg = (void __user *) arg; struct mpt_ioctl_test karg; - MPT_ADAPTER *ioc; - int iocnum; if (copy_from_user(&karg, uarg, sizeof(struct mpt_ioctl_test))) { printk(KERN_ERR MYNAM "%s@%d::mptctl_readtest - " @@ -1524,13 +1486,6 @@ mptctl_readtest (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_readtest() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_readtest called.\n", ioc->name)); /* Fill in the data and return the structure to the calling @@ -1571,12 +1526,10 @@ mptctl_readtest (unsigned long arg) * -ENODEV if no such device/adapter */ static int -mptctl_eventquery (unsigned long arg) +mptctl_eventquery (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_eventquery __user *uarg = (void __user *) arg; struct mpt_ioctl_eventquery karg; - MPT_ADAPTER *ioc; - int iocnum; if (copy_from_user(&karg, uarg, sizeof(struct mpt_ioctl_eventquery))) { printk(KERN_ERR MYNAM "%s@%d::mptctl_eventquery - " @@ -1585,13 +1538,6 @@ mptctl_eventquery (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_eventquery() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_eventquery called.\n", ioc->name)); karg.eventEntries = MPTCTL_EVENT_LOG_SIZE; @@ -1610,12 +1556,10 @@ mptctl_eventquery (unsigned long arg) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ static int -mptctl_eventenable (unsigned long arg) +mptctl_eventenable (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_eventenable __user *uarg = (void __user *) arg; struct mpt_ioctl_eventenable karg; - MPT_ADAPTER *ioc; - int iocnum; if (copy_from_user(&karg, uarg, sizeof(struct mpt_ioctl_eventenable))) { printk(KERN_ERR MYNAM "%s@%d::mptctl_eventenable - " @@ -1624,13 +1568,6 @@ mptctl_eventenable (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_eventenable() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_eventenable called.\n", ioc->name)); if (ioc->events == NULL) { @@ -1658,12 +1595,10 @@ mptctl_eventenable (unsigned long arg) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ static int -mptctl_eventreport (unsigned long arg) +mptctl_eventreport (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_eventreport __user *uarg = (void __user *) arg; struct mpt_ioctl_eventreport karg; - MPT_ADAPTER *ioc; - int iocnum; int numBytes, maxEvents, max; if (copy_from_user(&karg, uarg, sizeof(struct mpt_ioctl_eventreport))) { @@ -1673,12 +1608,6 @@ mptctl_eventreport (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_eventreport() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_eventreport called.\n", ioc->name)); @@ -1712,12 +1641,10 @@ mptctl_eventreport (unsigned long arg) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ static int -mptctl_replace_fw (unsigned long arg) +mptctl_replace_fw (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_replace_fw __user *uarg = (void __user *) arg; struct mpt_ioctl_replace_fw karg; - MPT_ADAPTER *ioc; - int iocnum; int newFwSize; if (copy_from_user(&karg, uarg, sizeof(struct mpt_ioctl_replace_fw))) { @@ -1727,13 +1654,6 @@ mptctl_replace_fw (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_replace_fw() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_replace_fw called.\n", ioc->name)); /* If caching FW, Free the old FW image @@ -1780,12 +1700,10 @@ mptctl_replace_fw (unsigned long arg) * -ENOMEM if memory allocation error */ static int -mptctl_mpt_command (unsigned long arg) +mptctl_mpt_command (MPT_ADAPTER *ioc, unsigned long arg) { struct mpt_ioctl_command __user *uarg = (void __user *) arg; struct mpt_ioctl_command karg; - MPT_ADAPTER *ioc; - int iocnum; int rc; @@ -1796,14 +1714,7 @@ mptctl_mpt_command (unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_mpt_command() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - - rc = mptctl_do_mpt_command (karg, &uarg->MF); + rc = mptctl_do_mpt_command (ioc, karg, &uarg->MF); return rc; } @@ -1821,9 +1732,8 @@ mptctl_mpt_command (unsigned long arg) * -EPERM if SCSI I/O and target is untagged */ static int -mptctl_do_mpt_command (struct mpt_ioctl_command karg, void __user *mfPtr) +mptctl_do_mpt_command (MPT_ADAPTER *ioc, struct mpt_ioctl_command karg, void __user *mfPtr) { - MPT_ADAPTER *ioc; MPT_FRAME_HDR *mf = NULL; MPIHeader_t *hdr; char *psge; @@ -1832,7 +1742,7 @@ mptctl_do_mpt_command (struct mpt_ioctl_command karg, void __user *mfPtr) dma_addr_t dma_addr_in; dma_addr_t dma_addr_out; int sgSize = 0; /* Num SG elements */ - int iocnum, flagsLength; + int flagsLength; int sz, rc = 0; int msgContext; u16 req_idx; @@ -1847,13 +1757,6 @@ mptctl_do_mpt_command (struct mpt_ioctl_command karg, void __user *mfPtr) bufIn.kptr = bufOut.kptr = NULL; bufIn.len = bufOut.len = 0; - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_do_mpt_command() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } - spin_lock_irqsave(&ioc->taskmgmt_lock, flags); if (ioc->ioc_reset_in_progress) { spin_unlock_irqrestore(&ioc->taskmgmt_lock, flags); @@ -2418,17 +2321,15 @@ done_free_mem: * -ENOMEM if memory allocation error */ static int -mptctl_hp_hostinfo(unsigned long arg, unsigned int data_size) +mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) { hp_host_info_t __user *uarg = (void __user *) arg; - MPT_ADAPTER *ioc; struct pci_dev *pdev; char *pbuf=NULL; dma_addr_t buf_dma; hp_host_info_t karg; CONFIGPARMS cfg; ConfigPageHeader_t hdr; - int iocnum; int rc, cim_rev; ToolboxIstwiReadWriteRequest_t *IstwiRWRequest; MPT_FRAME_HDR *mf = NULL; @@ -2452,12 +2353,6 @@ mptctl_hp_hostinfo(unsigned long arg, unsigned int data_size) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_hp_hostinfo() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT ": mptctl_hp_hostinfo called.\n", ioc->name)); @@ -2659,15 +2554,13 @@ retry_wait: * -ENOMEM if memory allocation error */ static int -mptctl_hp_targetinfo(unsigned long arg) +mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) { hp_target_info_t __user *uarg = (void __user *) arg; SCSIDevicePage0_t *pg0_alloc; SCSIDevicePage3_t *pg3_alloc; - MPT_ADAPTER *ioc; MPT_SCSI_HOST *hd = NULL; hp_target_info_t karg; - int iocnum; int data_sz; dma_addr_t page_dma; CONFIGPARMS cfg; @@ -2681,12 +2574,6 @@ mptctl_hp_targetinfo(unsigned long arg) return -EFAULT; } - if (((iocnum = mpt_verify_adapter(karg.hdr.iocnum, &ioc)) < 0) || - (ioc == NULL)) { - printk(KERN_DEBUG MYNAM "%s::mptctl_hp_targetinfo() @%d - ioc%d not found!\n", - __FILE__, __LINE__, iocnum); - return -ENODEV; - } if (karg.hdr.id >= MPT_MAX_FC_DEVICES) return -EINVAL; dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "mptctl_hp_targetinfo called.\n", @@ -2854,7 +2741,7 @@ compat_mptfwxfer_ioctl(struct file *filp, unsigned int cmd, kfw.fwlen = kfw32.fwlen; kfw.bufp = compat_ptr(kfw32.bufp); - ret = mptctl_do_fw_download(kfw.iocnum, kfw.bufp, kfw.fwlen); + ret = mptctl_do_fw_download(iocp, kfw.bufp, kfw.fwlen); mutex_unlock(&iocp->ioctl_cmds.mutex); @@ -2908,7 +2795,7 @@ compat_mpt_command(struct file *filp, unsigned int cmd, /* Pass new structure to do_mpt_command */ - ret = mptctl_do_mpt_command (karg, &uarg->MF); + ret = mptctl_do_mpt_command (iocp, karg, &uarg->MF); mutex_unlock(&iocp->ioctl_cmds.mutex); diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 6d27ccfe0680..3c2d405bc79b 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -406,10 +406,9 @@ int enclosure_remove_device(struct enclosure_device *edev, struct device *dev) cdev = &edev->component[i]; if (cdev->dev == dev) { enclosure_remove_links(cdev); - device_del(&cdev->cdev); put_device(dev); cdev->dev = NULL; - return device_add(&cdev->cdev); + return 0; } } return -ENODEV; diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index a4fdad04809a..de87693cf557 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -278,7 +278,7 @@ void lkdtm_STACK_GUARD_PAGE_TRAILING(void) void lkdtm_UNSET_SMEP(void) { -#ifdef CONFIG_X86_64 +#if IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_UML) #define MOV_CR4_DEPTH 64 void (*direct_write_cr4)(unsigned long val); unsigned char *insn; @@ -338,13 +338,13 @@ void lkdtm_UNSET_SMEP(void) native_write_cr4(cr4); } #else - pr_err("FAIL: this test is x86_64-only\n"); + pr_err("XFAIL: this test is x86_64-only\n"); #endif } -#ifdef CONFIG_X86_32 void lkdtm_DOUBLE_FAULT(void) { +#ifdef CONFIG_X86_32 /* * Trigger #DF by setting the stack limit to zero. This clobbers * a GDT TLS slot, which is okay because the current task will die @@ -373,6 +373,8 @@ void lkdtm_DOUBLE_FAULT(void) asm volatile ("movw %0, %%ss; addl $0, (%%esp)" :: "r" ((unsigned short)(GDT_ENTRY_TLS_MIN << 3))); - panic("tried to double fault but didn't die\n"); -} + pr_err("FAIL: tried to double fault but didn't die\n"); +#else + pr_err("XFAIL: this test is ia32-only\n"); #endif +} diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 7bc950520fd9..403ac44a7378 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -386,7 +386,7 @@ static void tegra_sdhci_reset(struct sdhci_host *host, u8 mask) misc_ctrl |= SDHCI_MISC_CTRL_ENABLE_DDR50; if (soc_data->nvquirks & NVQUIRK_ENABLE_SDR104) misc_ctrl |= SDHCI_MISC_CTRL_ENABLE_SDR104; - if (soc_data->nvquirks & SDHCI_MISC_CTRL_ENABLE_SDR50) + if (soc_data->nvquirks & NVQUIRK_ENABLE_SDR50) clk_ctrl |= SDHCI_CLOCK_CTRL_SDR50_TUNING_OVERRIDE; } diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 1b1c26da3fe0..659a9459ace3 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -3913,11 +3913,13 @@ int sdhci_setup_host(struct sdhci_host *host) if (host->ops->get_min_clock) mmc->f_min = host->ops->get_min_clock(host); else if (host->version >= SDHCI_SPEC_300) { - if (host->clk_mul) { - mmc->f_min = (host->max_clk * host->clk_mul) / 1024; + if (host->clk_mul) max_clk = host->max_clk * host->clk_mul; - } else - mmc->f_min = host->max_clk / SDHCI_MAX_DIV_SPEC_300; + /* + * Divided Clock Mode minimum clock rate is always less than + * Programmable Clock Mode minimum clock rate. + */ + mmc->f_min = host->max_clk / SDHCI_MAX_DIV_SPEC_300; } else mmc->f_min = host->max_clk / SDHCI_MAX_DIV_SPEC_200; diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index b8e897e31e2e..b8fe94fd9525 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -240,6 +240,35 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) writeb(val, host->ioaddr + reg); } +static int sdhci_am654_execute_tuning(struct mmc_host *mmc, u32 opcode) +{ + struct sdhci_host *host = mmc_priv(mmc); + int err = sdhci_execute_tuning(mmc, opcode); + + if (err) + return err; + /* + * Tuning data remains in the buffer after tuning. + * Do a command and data reset to get rid of it + */ + sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA); + + return 0; +} + +static u32 sdhci_am654_cqhci_irq(struct sdhci_host *host, u32 intmask) +{ + int cmd_error = 0; + int data_error = 0; + + if (!sdhci_cqe_irq(host, intmask, &cmd_error, &data_error)) + return intmask; + + cqhci_irq(host->mmc, intmask, cmd_error, data_error); + + return 0; +} + static struct sdhci_ops sdhci_am654_ops = { .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, @@ -248,13 +277,13 @@ static struct sdhci_ops sdhci_am654_ops = { .set_power = sdhci_am654_set_power, .set_clock = sdhci_am654_set_clock, .write_b = sdhci_am654_write_b, + .irq = sdhci_am654_cqhci_irq, .reset = sdhci_reset, }; static const struct sdhci_pltfm_data sdhci_am654_pdata = { .ops = &sdhci_am654_ops, - .quirks = SDHCI_QUIRK_INVERTED_WRITE_PROTECT | - SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, + .quirks = SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, }; @@ -263,19 +292,6 @@ static const struct sdhci_am654_driver_data sdhci_am654_drvdata = { .flags = IOMUX_PRESENT | FREQSEL_2_BIT | STRBSEL_4_BIT | DLL_PRESENT, }; -static u32 sdhci_am654_cqhci_irq(struct sdhci_host *host, u32 intmask) -{ - int cmd_error = 0; - int data_error = 0; - - if (!sdhci_cqe_irq(host, intmask, &cmd_error, &data_error)) - return intmask; - - cqhci_irq(host->mmc, intmask, cmd_error, data_error); - - return 0; -} - static struct sdhci_ops sdhci_j721e_8bit_ops = { .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, @@ -290,8 +306,7 @@ static struct sdhci_ops sdhci_j721e_8bit_ops = { static const struct sdhci_pltfm_data sdhci_j721e_8bit_pdata = { .ops = &sdhci_j721e_8bit_ops, - .quirks = SDHCI_QUIRK_INVERTED_WRITE_PROTECT | - SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, + .quirks = SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, }; @@ -314,8 +329,7 @@ static struct sdhci_ops sdhci_j721e_4bit_ops = { static const struct sdhci_pltfm_data sdhci_j721e_4bit_pdata = { .ops = &sdhci_j721e_4bit_ops, - .quirks = SDHCI_QUIRK_INVERTED_WRITE_PROTECT | - SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, + .quirks = SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, }; @@ -549,6 +563,8 @@ static int sdhci_am654_probe(struct platform_device *pdev) goto pm_runtime_put; } + host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning; + ret = sdhci_am654_init(host); if (ret) goto pm_runtime_put; diff --git a/drivers/mtd/nand/onenand/omap2.c b/drivers/mtd/nand/onenand/omap2.c index edf94ee54ec7..aa9368bf7a0c 100644 --- a/drivers/mtd/nand/onenand/omap2.c +++ b/drivers/mtd/nand/onenand/omap2.c @@ -148,13 +148,13 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state) unsigned long timeout; u32 syscfg; - if (state == FL_RESETING || state == FL_PREPARING_ERASE || + if (state == FL_RESETTING || state == FL_PREPARING_ERASE || state == FL_VERIFYING_ERASE) { int i = 21; unsigned int intr_flags = ONENAND_INT_MASTER; switch (state) { - case FL_RESETING: + case FL_RESETTING: intr_flags |= ONENAND_INT_RESET; break; case FL_PREPARING_ERASE: @@ -328,7 +328,8 @@ static inline int omap2_onenand_dma_transfer(struct omap2_onenand *c, struct dma_async_tx_descriptor *tx; dma_cookie_t cookie; - tx = dmaengine_prep_dma_memcpy(c->dma_chan, dst, src, count, 0); + tx = dmaengine_prep_dma_memcpy(c->dma_chan, dst, src, count, + DMA_CTRL_ACK | DMA_PREP_INTERRUPT); if (!tx) { dev_err(&c->pdev->dev, "Failed to prepare DMA memcpy\n"); return -EIO; @@ -375,7 +376,7 @@ static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area, * context fallback to PIO mode. */ if (!virt_addr_valid(buf) || bram_offset & 3 || (size_t)buf & 3 || - count < 384 || in_interrupt() || oops_in_progress ) + count < 384 || in_interrupt() || oops_in_progress) goto out_copy; xtra = count & 3; @@ -422,7 +423,7 @@ static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area, * context fallback to PIO mode. */ if (!virt_addr_valid(buf) || bram_offset & 3 || (size_t)buf & 3 || - count < 384 || in_interrupt() || oops_in_progress ) + count < 384 || in_interrupt() || oops_in_progress) goto out_copy; dma_src = dma_map_single(dev, buf, count, DMA_TO_DEVICE); @@ -528,7 +529,8 @@ static int omap2_onenand_probe(struct platform_device *pdev) c->gpmc_cs, c->phys_base, c->onenand.base, c->dma_chan ? "DMA" : "PIO"); - if ((r = onenand_scan(&c->mtd, 1)) < 0) + r = onenand_scan(&c->mtd, 1); + if (r < 0) goto err_release_dma; freq = omap2_onenand_get_freq(c->onenand.version_id); diff --git a/drivers/mtd/nand/onenand/onenand_base.c b/drivers/mtd/nand/onenand/onenand_base.c index 77bd32a683e1..85640ee11c86 100644 --- a/drivers/mtd/nand/onenand/onenand_base.c +++ b/drivers/mtd/nand/onenand/onenand_base.c @@ -2853,7 +2853,7 @@ static int onenand_otp_write_oob_nolock(struct mtd_info *mtd, loff_t to, /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); status = this->read_word(this->base + ONENAND_REG_CTRL_STATUS); status &= 0x60; @@ -2924,7 +2924,7 @@ static int do_otp_read(struct mtd_info *mtd, loff_t from, size_t len, /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); return ret; } @@ -2968,7 +2968,7 @@ static int do_otp_write(struct mtd_info *mtd, loff_t to, size_t len, /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); return ret; } @@ -3008,7 +3008,7 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, /* Exit OTP access mode */ this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); } else { ops.mode = MTD_OPS_PLACE_OOB; ops.ooblen = len; @@ -3413,7 +3413,7 @@ static int flexonenand_get_boundary(struct mtd_info *mtd) this->boundary[die] = bdry & FLEXONENAND_PI_MASK; this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); printk(KERN_INFO "Die %d boundary: %d%s\n", die, this->boundary[die], locked ? "(Locked)" : "(Unlocked)"); @@ -3635,7 +3635,7 @@ static int flexonenand_set_boundary(struct mtd_info *mtd, int die, ret = this->wait(mtd, FL_WRITING); out: this->write_word(ONENAND_CMD_RESET, this->base + ONENAND_REG_COMMAND); - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); if (!ret) /* Recalculate device size on boundary change*/ flexonenand_get_size(mtd); @@ -3671,7 +3671,7 @@ static int onenand_chip_probe(struct mtd_info *mtd) /* Reset OneNAND to read default register values */ this->write_word(ONENAND_CMD_RESET, this->base + ONENAND_BOOTRAM); /* Wait reset */ - this->wait(mtd, FL_RESETING); + this->wait(mtd, FL_RESETTING); /* Restore system configuration 1 */ this->write_word(syscfg, this->base + ONENAND_REG_SYS_CFG1); diff --git a/drivers/mtd/nand/onenand/samsung_mtd.c b/drivers/mtd/nand/onenand/samsung_mtd.c index 55e5536a5850..beb7987e4c2b 100644 --- a/drivers/mtd/nand/onenand/samsung_mtd.c +++ b/drivers/mtd/nand/onenand/samsung_mtd.c @@ -675,12 +675,12 @@ static int s5pc110_read_bufferram(struct mtd_info *mtd, int area, normal: if (count != mtd->writesize) { /* Copy the bufferram to memory to prevent unaligned access */ - memcpy(this->page_buf, p, mtd->writesize); - p = this->page_buf + offset; + memcpy_fromio(this->page_buf, p, mtd->writesize); + memcpy(buffer, this->page_buf + offset, count); + } else { + memcpy_fromio(buffer, p, count); } - memcpy(buffer, p, count); - return 0; } diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 3a36285a8d8a..f6c7102a1e32 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -914,8 +914,8 @@ static void cadence_nand_get_caps(struct cdns_nand_ctrl *cdns_ctrl) /* Prepare CDMA descriptor. */ static void cadence_nand_cdma_desc_prepare(struct cdns_nand_ctrl *cdns_ctrl, - char nf_mem, u32 flash_ptr, char *mem_ptr, - char *ctrl_data_ptr, u16 ctype) + char nf_mem, u32 flash_ptr, dma_addr_t mem_ptr, + dma_addr_t ctrl_data_ptr, u16 ctype) { struct cadence_nand_cdma_desc *cdma_desc = cdns_ctrl->cdma_desc; @@ -931,13 +931,13 @@ cadence_nand_cdma_desc_prepare(struct cdns_nand_ctrl *cdns_ctrl, cdma_desc->command_flags |= CDMA_CF_DMA_MASTER; cdma_desc->command_flags |= CDMA_CF_INT; - cdma_desc->memory_pointer = (uintptr_t)mem_ptr; + cdma_desc->memory_pointer = mem_ptr; cdma_desc->status = 0; cdma_desc->sync_flag_pointer = 0; cdma_desc->sync_arguments = 0; cdma_desc->command_type = ctype; - cdma_desc->ctrl_data_ptr = (uintptr_t)ctrl_data_ptr; + cdma_desc->ctrl_data_ptr = ctrl_data_ptr; } static u8 cadence_nand_check_desc_error(struct cdns_nand_ctrl *cdns_ctrl, @@ -1280,8 +1280,7 @@ cadence_nand_cdma_transfer(struct cdns_nand_ctrl *cdns_ctrl, u8 chip_nr, } cadence_nand_cdma_desc_prepare(cdns_ctrl, chip_nr, page, - (void *)dma_buf, (void *)dma_ctrl_dat, - ctype); + dma_buf, dma_ctrl_dat, ctype); status = cadence_nand_cdma_send_and_wait(cdns_ctrl, thread_nr); @@ -1360,7 +1359,7 @@ static int cadence_nand_erase(struct nand_chip *chip, u32 page) cadence_nand_cdma_desc_prepare(cdns_ctrl, cdns_chip->cs[chip->cur_cs], - page, NULL, NULL, + page, 0, 0, CDMA_CT_ERASE); status = cadence_nand_cdma_send_and_wait(cdns_ctrl, thread_nr); if (status) { diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c index 334fe3130285..b9d5d55a5edb 100644 --- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c @@ -148,6 +148,10 @@ static int gpmi_init(struct gpmi_nand_data *this) struct resources *r = &this->resources; int ret; + ret = pm_runtime_get_sync(this->dev); + if (ret < 0) + return ret; + ret = gpmi_reset_block(r->gpmi_regs, false); if (ret) goto err_out; @@ -179,8 +183,9 @@ static int gpmi_init(struct gpmi_nand_data *this) */ writel(BM_GPMI_CTRL1_DECOUPLE_CS, r->gpmi_regs + HW_GPMI_CTRL1_SET); - return 0; err_out: + pm_runtime_mark_last_busy(this->dev); + pm_runtime_put_autosuspend(this->dev); return ret; } @@ -2722,6 +2727,10 @@ static int gpmi_pm_resume(struct device *dev) return ret; } + /* Set flag to get timing setup restored for next exec_op */ + if (this->hw.clk_rate) + this->hw.must_apply_timings = true; + /* re-init the BCH registers */ ret = bch_set_geometry(this); if (ret) { diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index 9e63800f768a..3ba73f18841f 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -37,6 +37,7 @@ /* Max ECC buffer length */ #define FMC2_MAX_ECC_BUF_LEN (FMC2_BCHDSRS_LEN * FMC2_MAX_SG) +#define FMC2_TIMEOUT_US 1000 #define FMC2_TIMEOUT_MS 1000 /* Timings */ @@ -53,6 +54,8 @@ #define FMC2_PMEM 0x88 #define FMC2_PATT 0x8c #define FMC2_HECCR 0x94 +#define FMC2_ISR 0x184 +#define FMC2_ICR 0x188 #define FMC2_CSQCR 0x200 #define FMC2_CSQCFGR1 0x204 #define FMC2_CSQCFGR2 0x208 @@ -118,6 +121,12 @@ #define FMC2_PATT_ATTHIZ(x) (((x) & 0xff) << 24) #define FMC2_PATT_DEFAULT 0x0a0a0a0a +/* Register: FMC2_ISR */ +#define FMC2_ISR_IHLF BIT(1) + +/* Register: FMC2_ICR */ +#define FMC2_ICR_CIHLF BIT(1) + /* Register: FMC2_CSQCR */ #define FMC2_CSQCR_CSQSTART BIT(0) @@ -1322,6 +1331,31 @@ static void stm32_fmc2_write_data(struct nand_chip *chip, const void *buf, stm32_fmc2_set_buswidth_16(fmc2, true); } +static int stm32_fmc2_waitrdy(struct nand_chip *chip, unsigned long timeout_ms) +{ + struct stm32_fmc2_nfc *fmc2 = to_stm32_nfc(chip->controller); + const struct nand_sdr_timings *timings; + u32 isr, sr; + + /* Check if there is no pending requests to the NAND flash */ + if (readl_relaxed_poll_timeout_atomic(fmc2->io_base + FMC2_SR, sr, + sr & FMC2_SR_NWRF, 1, + FMC2_TIMEOUT_US)) + dev_warn(fmc2->dev, "Waitrdy timeout\n"); + + /* Wait tWB before R/B# signal is low */ + timings = nand_get_sdr_timings(&chip->data_interface); + ndelay(PSEC_TO_NSEC(timings->tWB_max)); + + /* R/B# signal is low, clear high level flag */ + writel_relaxed(FMC2_ICR_CIHLF, fmc2->io_base + FMC2_ICR); + + /* Wait R/B# signal is high */ + return readl_relaxed_poll_timeout_atomic(fmc2->io_base + FMC2_ISR, + isr, isr & FMC2_ISR_IHLF, + 5, 1000 * timeout_ms); +} + static int stm32_fmc2_exec_op(struct nand_chip *chip, const struct nand_operation *op, bool check_only) @@ -1366,8 +1400,8 @@ static int stm32_fmc2_exec_op(struct nand_chip *chip, break; case NAND_OP_WAITRDY_INSTR: - ret = nand_soft_waitrdy(chip, - instr->ctx.waitrdy.timeout_ms); + ret = stm32_fmc2_waitrdy(chip, + instr->ctx.waitrdy.timeout_ms); break; } } diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 4744bf94ad9a..b9f272408c4d 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -247,7 +247,8 @@ static int sm_read_sector(struct sm_ftl *ftl, /* FTL can contain -1 entries that are by default filled with bits */ if (block == -1) { - memset(buffer, 0xFF, SM_SECTOR_SIZE); + if (buffer) + memset(buffer, 0xFF, SM_SECTOR_SIZE); return 0; } diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index f4afe123e9dc..b0cd443dd758 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -2124,6 +2124,8 @@ static int spi_nor_sr2_bit1_quad_enable(struct spi_nor *nor) if (nor->bouncebuf[0] & SR2_QUAD_EN_BIT1) return 0; + nor->bouncebuf[0] |= SR2_QUAD_EN_BIT1; + return spi_nor_write_16bit_cr_and_check(nor, nor->bouncebuf[0]); } @@ -4596,6 +4598,7 @@ static void sst_set_default_init(struct spi_nor *nor) static void st_micron_set_default_init(struct spi_nor *nor) { nor->flags |= SNOR_F_HAS_LOCK; + nor->flags &= ~SNOR_F_HAS_16BIT_SR; nor->params.quad_enable = NULL; nor->params.set_4byte = st_micron_set_4byte; } @@ -4768,9 +4771,7 @@ static void spi_nor_info_init_params(struct spi_nor *nor) static void spansion_post_sfdp_fixups(struct spi_nor *nor) { - struct mtd_info *mtd = &nor->mtd; - - if (mtd->size <= SZ_16M) + if (nor->params.size <= SZ_16M) return; nor->flags |= SNOR_F_4B_OPCODES; diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 01e2657e4c26..dee79588d2b1 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -549,6 +549,8 @@ source "drivers/net/hyperv/Kconfig" config NETDEVSIM tristate "Simulated networking device" depends on DEBUG_FS + depends on INET + depends on IPV6 || IPV6=n select NET_DEVLINK help This driver is a developer testing tool and software model that can diff --git a/drivers/net/can/m_can/tcan4x5x.c b/drivers/net/can/m_can/tcan4x5x.c index 4e1789ea2bc3..eacd428e07e9 100644 --- a/drivers/net/can/m_can/tcan4x5x.c +++ b/drivers/net/can/m_can/tcan4x5x.c @@ -102,6 +102,7 @@ #define TCAN4X5X_MODE_NORMAL BIT(7) #define TCAN4X5X_DISABLE_WAKE_MSK (BIT(31) | BIT(30)) +#define TCAN4X5X_DISABLE_INH_MSK BIT(9) #define TCAN4X5X_SW_RESET BIT(2) @@ -166,6 +167,28 @@ static void tcan4x5x_check_wake(struct tcan4x5x_priv *priv) } } +static int tcan4x5x_reset(struct tcan4x5x_priv *priv) +{ + int ret = 0; + + if (priv->reset_gpio) { + gpiod_set_value(priv->reset_gpio, 1); + + /* tpulse_width minimum 30us */ + usleep_range(30, 100); + gpiod_set_value(priv->reset_gpio, 0); + } else { + ret = regmap_write(priv->regmap, TCAN4X5X_CONFIG, + TCAN4X5X_SW_RESET); + if (ret) + return ret; + } + + usleep_range(700, 1000); + + return ret; +} + static int regmap_spi_gather_write(void *context, const void *reg, size_t reg_len, const void *val, size_t val_len) @@ -348,14 +371,23 @@ static int tcan4x5x_disable_wake(struct m_can_classdev *cdev) TCAN4X5X_DISABLE_WAKE_MSK, 0x00); } +static int tcan4x5x_disable_state(struct m_can_classdev *cdev) +{ + struct tcan4x5x_priv *tcan4x5x = cdev->device_data; + + return regmap_update_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG, + TCAN4X5X_DISABLE_INH_MSK, 0x01); +} + static int tcan4x5x_parse_config(struct m_can_classdev *cdev) { struct tcan4x5x_priv *tcan4x5x = cdev->device_data; + int ret; tcan4x5x->device_wake_gpio = devm_gpiod_get(cdev->dev, "device-wake", GPIOD_OUT_HIGH); if (IS_ERR(tcan4x5x->device_wake_gpio)) { - if (PTR_ERR(tcan4x5x->power) == -EPROBE_DEFER) + if (PTR_ERR(tcan4x5x->device_wake_gpio) == -EPROBE_DEFER) return -EPROBE_DEFER; tcan4x5x_disable_wake(cdev); @@ -366,18 +398,17 @@ static int tcan4x5x_parse_config(struct m_can_classdev *cdev) if (IS_ERR(tcan4x5x->reset_gpio)) tcan4x5x->reset_gpio = NULL; - usleep_range(700, 1000); + ret = tcan4x5x_reset(tcan4x5x); + if (ret) + return ret; tcan4x5x->device_state_gpio = devm_gpiod_get_optional(cdev->dev, "device-state", GPIOD_IN); - if (IS_ERR(tcan4x5x->device_state_gpio)) + if (IS_ERR(tcan4x5x->device_state_gpio)) { tcan4x5x->device_state_gpio = NULL; - - tcan4x5x->power = devm_regulator_get_optional(cdev->dev, - "vsup"); - if (PTR_ERR(tcan4x5x->power) == -EPROBE_DEFER) - return -EPROBE_DEFER; + tcan4x5x_disable_state(cdev); + } return 0; } @@ -412,6 +443,12 @@ static int tcan4x5x_can_probe(struct spi_device *spi) if (!priv) return -ENOMEM; + priv->power = devm_regulator_get_optional(&spi->dev, "vsup"); + if (PTR_ERR(priv->power) == -EPROBE_DEFER) + return -EPROBE_DEFER; + else + priv->power = NULL; + mcan_class->device_data = priv; m_can_class_get_clocks(mcan_class); @@ -451,11 +488,17 @@ static int tcan4x5x_can_probe(struct spi_device *spi) priv->regmap = devm_regmap_init(&spi->dev, &tcan4x5x_bus, &spi->dev, &tcan4x5x_regmap); - ret = tcan4x5x_parse_config(mcan_class); + ret = tcan4x5x_power_enable(priv->power, 1); if (ret) goto out_clk; - tcan4x5x_power_enable(priv->power, 1); + ret = tcan4x5x_parse_config(mcan_class); + if (ret) + goto out_power; + + ret = tcan4x5x_init(mcan_class); + if (ret) + goto out_power; ret = m_can_class_register(mcan_class); if (ret) diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 8caf7af0dee2..99101d7027a8 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -381,13 +381,12 @@ static int mscan_rx_poll(struct napi_struct *napi, int quota) struct net_device *dev = napi->dev; struct mscan_regs __iomem *regs = priv->reg_base; struct net_device_stats *stats = &dev->stats; - int npackets = 0; - int ret = 1; + int work_done = 0; struct sk_buff *skb; struct can_frame *frame; u8 canrflg; - while (npackets < quota) { + while (work_done < quota) { canrflg = in_8(®s->canrflg); if (!(canrflg & (MSCAN_RXF | MSCAN_ERR_IF))) break; @@ -408,18 +407,18 @@ static int mscan_rx_poll(struct napi_struct *napi, int quota) stats->rx_packets++; stats->rx_bytes += frame->can_dlc; - npackets++; + work_done++; netif_receive_skb(skb); } - if (!(in_8(®s->canrflg) & (MSCAN_RXF | MSCAN_ERR_IF))) { - napi_complete(&priv->napi); - clear_bit(F_RX_PROGRESS, &priv->flags); - if (priv->can.state < CAN_STATE_BUS_OFF) - out_8(®s->canrier, priv->shadow_canrier); - ret = 0; + if (work_done < quota) { + if (likely(napi_complete_done(&priv->napi, work_done))) { + clear_bit(F_RX_PROGRESS, &priv->flags); + if (priv->can.state < CAN_STATE_BUS_OFF) + out_8(®s->canrier, priv->shadow_canrier); + } } - return ret; + return work_done; } static irqreturn_t mscan_isr(int irq, void *dev_id) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index 2e57122f02fb..2f5c287eac95 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -344,9 +344,16 @@ static void slcan_transmit(struct work_struct *work) */ static void slcan_write_wakeup(struct tty_struct *tty) { - struct slcan *sl = tty->disc_data; + struct slcan *sl; + + rcu_read_lock(); + sl = rcu_dereference(tty->disc_data); + if (!sl) + goto out; schedule_work(&sl->tx_work); +out: + rcu_read_unlock(); } /* Send a can_frame to a TTY queue. */ @@ -644,10 +651,11 @@ static void slcan_close(struct tty_struct *tty) return; spin_lock_bh(&sl->lock); - tty->disc_data = NULL; + rcu_assign_pointer(tty->disc_data, NULL); sl->tty = NULL; spin_unlock_bh(&sl->lock); + synchronize_rcu(); flush_work(&sl->tx_work); /* Flush network side */ diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 2f74f6704c12..a4b4b742c80c 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -918,7 +918,7 @@ static int gs_usb_probe(struct usb_interface *intf, GS_USB_BREQ_HOST_FORMAT, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, - intf->altsetting[0].desc.bInterfaceNumber, + intf->cur_altsetting->desc.bInterfaceNumber, hconf, sizeof(*hconf), 1000); @@ -941,7 +941,7 @@ static int gs_usb_probe(struct usb_interface *intf, GS_USB_BREQ_DEVICE_CONFIG, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 1, - intf->altsetting[0].desc.bInterfaceNumber, + intf->cur_altsetting->desc.bInterfaceNumber, dconf, sizeof(*dconf), 1000); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 5fc0be564274..7ab87a758754 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1590,7 +1590,7 @@ static int kvaser_usb_hydra_setup_endpoints(struct kvaser_usb *dev) struct usb_endpoint_descriptor *ep; int i; - iface_desc = &dev->intf->altsetting[0]; + iface_desc = dev->intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { ep = &iface_desc->endpoint[i].desc; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index ae4c37e1bb75..1b9957f12459 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -1310,7 +1310,7 @@ static int kvaser_usb_leaf_setup_endpoints(struct kvaser_usb *dev) struct usb_endpoint_descriptor *endpoint; int i; - iface_desc = &dev->intf->altsetting[0]; + iface_desc = dev->intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index cbd74a72d0a1..2d38dbc9dd8c 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -105,7 +105,6 @@ config NET_DSA_SMSC_LAN9303_MDIO config NET_DSA_VITESSE_VSC73XX tristate - depends on OF depends on NET_DSA select FIXED_PHY select VITESSE_PHY @@ -116,7 +115,6 @@ config NET_DSA_VITESSE_VSC73XX config NET_DSA_VITESSE_VSC73XX_SPI tristate "Vitesse VSC7385/7388/7395/7398 SPI mode support" - depends on OF depends on NET_DSA depends on SPI select NET_DSA_VITESSE_VSC73XX @@ -126,7 +124,6 @@ config NET_DSA_VITESSE_VSC73XX_SPI config NET_DSA_VITESSE_VSC73XX_PLATFORM tristate "Vitesse VSC7385/7388/7395/7398 Platform mode support" - depends on OF depends on NET_DSA depends on HAS_IOMEM select NET_DSA_VITESSE_VSC73XX diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index edacacfc9365..060497512159 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -371,8 +371,6 @@ static void b53_enable_vlan(struct b53_device *dev, bool enable, b53_read8(dev, B53_VLAN_PAGE, B53_VLAN_CTRL5, &vc5); } - mgmt &= ~SM_SW_FWD_MODE; - if (enable) { vc0 |= VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID; vc1 |= VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN; @@ -573,9 +571,8 @@ EXPORT_SYMBOL(b53_disable_port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) { - bool tag_en = !(ds->ops->get_tag_protocol(ds, port) == - DSA_TAG_PROTO_NONE); struct b53_device *dev = ds->priv; + bool tag_en = !(dev->tag_protocol == DSA_TAG_PROTO_NONE); u8 hdr_ctl, val; u16 reg; @@ -595,6 +592,22 @@ void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) break; } + /* Enable management mode if tagging is requested */ + b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &hdr_ctl); + if (tag_en) + hdr_ctl |= SM_SW_FWD_MODE; + else + hdr_ctl &= ~SM_SW_FWD_MODE; + b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, hdr_ctl); + + /* Configure the appropriate IMP port */ + b53_read8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, &hdr_ctl); + if (port == 8) + hdr_ctl |= GC_FRM_MGMT_PORT_MII; + else if (port == 5) + hdr_ctl |= GC_FRM_MGMT_PORT_M; + b53_write8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, hdr_ctl); + /* Enable Broadcom tags for IMP port */ b53_read8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, &hdr_ctl); if (tag_en) @@ -1866,36 +1879,57 @@ static bool b53_possible_cpu_port(struct dsa_switch *ds, int port) return false; } -static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port) +static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port, + enum dsa_tag_protocol tag_protocol) { bool ret = b53_possible_cpu_port(ds, port); - if (!ret) + if (!ret) { dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n", port); + return ret; + } + + switch (tag_protocol) { + case DSA_TAG_PROTO_BRCM: + case DSA_TAG_PROTO_BRCM_PREPEND: + dev_warn(ds->dev, + "Port %d is stacked to Broadcom tag switch\n", port); + ret = false; + break; + default: + ret = true; + break; + } + return ret; } -enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port) +enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mprot) { struct b53_device *dev = ds->priv; /* Older models (5325, 5365) support a different tag format that we do - * not support in net/dsa/tag_brcm.c yet. 539x and 531x5 require managed - * mode to be turned on which means we need to specifically manage ARL - * misses on multicast addresses (TBD). + * not support in net/dsa/tag_brcm.c yet. */ - if (is5325(dev) || is5365(dev) || is539x(dev) || is531x5(dev) || - !b53_can_enable_brcm_tags(ds, port)) - return DSA_TAG_PROTO_NONE; + if (is5325(dev) || is5365(dev) || + !b53_can_enable_brcm_tags(ds, port, mprot)) { + dev->tag_protocol = DSA_TAG_PROTO_NONE; + goto out; + } /* Broadcom BCM58xx chips have a flow accelerator on Port 8 * which requires us to use the prepended Broadcom tag type */ - if (dev->chip_id == BCM58XX_DEVICE_ID && port == B53_CPU_PORT) - return DSA_TAG_PROTO_BRCM_PREPEND; + if (dev->chip_id == BCM58XX_DEVICE_ID && port == B53_CPU_PORT) { + dev->tag_protocol = DSA_TAG_PROTO_BRCM_PREPEND; + goto out; + } - return DSA_TAG_PROTO_BRCM; + dev->tag_protocol = DSA_TAG_PROTO_BRCM; +out: + return dev->tag_protocol; } EXPORT_SYMBOL(b53_get_tag_protocol); diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 1877acf05081..3c30f3a7eb29 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -118,6 +118,7 @@ struct b53_device { u8 jumbo_size_reg; int reset_gpio; u8 num_arl_entries; + enum dsa_tag_protocol tag_protocol; /* used ports mask */ u16 enabled_ports; @@ -359,7 +360,8 @@ int b53_mdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); -enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port); +enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mprot); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index e43040c9f9ee..3e8635311d0d 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -68,7 +68,7 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) /* Force link status for IMP port */ reg = core_readl(priv, offset); - reg |= (MII_SW_OR | LINK_STS); + reg |= (MII_SW_OR | LINK_STS | GMII_SPEED_UP_2G); core_writel(priv, reg, offset); /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */ diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index c8d7ef27fd72..fdcb70b9f0e4 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -61,7 +61,8 @@ struct dsa_loop_priv { static struct phy_device *phydevs[PHY_MAX_ADDR]; static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { dev_dbg(ds->dev, "%s: port: %d\n", __func__, port); diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index e3c333a8f45d..cc17a44dd3a8 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -883,7 +883,8 @@ static int lan9303_check_device(struct lan9303 *chip) /* ---------------------------- DSA -----------------------------------*/ static enum dsa_tag_protocol lan9303_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_LAN9303; } diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 955324968b74..0369c22fe3e1 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -841,7 +841,8 @@ static int gswip_setup(struct dsa_switch *ds) } static enum dsa_tag_protocol gswip_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_GSWIP; } diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 24a5e99f7fd5..47d65b77caf7 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -645,7 +645,8 @@ static void ksz8795_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val) } static enum dsa_tag_protocol ksz8795_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_KSZ8795; } diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 50ffc63d6231..9a51b8a4de5d 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -295,7 +295,8 @@ static void ksz9477_port_init_cnt(struct ksz_device *dev, int port) } static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { enum dsa_tag_protocol proto = DSA_TAG_PROTO_KSZ9477; struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ed1ec10ec62b..022466ca1c19 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1223,7 +1223,8 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, } static enum dsa_tag_protocol -mtk_get_tag_protocol(struct dsa_switch *ds, int port) +mtk_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { struct mt7530_priv *priv = ds->priv; diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index a5a37f47b320..24b8219fd607 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -43,7 +43,8 @@ static const char *mv88e6060_get_name(struct mii_bus *bus, int sw_addr) } static enum dsa_tag_protocol mv88e6060_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { return DSA_TAG_PROTO_TRAILER; } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 5eeeb6566196..8c9289549688 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -340,11 +340,14 @@ static int mv88e6xxx_g1_irq_setup(struct mv88e6xxx_chip *chip) */ irq_set_lockdep_class(chip->irq, &lock_key, &request_key); + snprintf(chip->irq_name, sizeof(chip->irq_name), + "mv88e6xxx-%s", dev_name(chip->dev)); + mv88e6xxx_reg_unlock(chip); err = request_threaded_irq(chip->irq, NULL, mv88e6xxx_g1_irq_thread_fn, IRQF_ONESHOT | IRQF_SHARED, - dev_name(chip->dev), chip); + chip->irq_name, chip); mv88e6xxx_reg_lock(chip); if (err) mv88e6xxx_g1_irq_free_common(chip); @@ -2303,10 +2306,14 @@ static int mv88e6xxx_serdes_irq_request(struct mv88e6xxx_chip *chip, int port, if (!irq) return 0; + snprintf(dev_id->serdes_irq_name, sizeof(dev_id->serdes_irq_name), + "mv88e6xxx-%s-serdes-%d", dev_name(chip->dev), port); + /* Requesting the IRQ will trigger IRQ callbacks, so release the lock */ mv88e6xxx_reg_unlock(chip); err = request_threaded_irq(irq, NULL, mv88e6xxx_serdes_irq_thread_fn, - IRQF_ONESHOT, "mv88e6xxx-serdes", dev_id); + IRQF_ONESHOT, dev_id->serdes_irq_name, + dev_id); mv88e6xxx_reg_lock(chip); if (err) return err; @@ -3838,6 +3845,9 @@ static const struct mv88e6xxx_ops mv88e6190_ops = { .serdes_irq_mapping = mv88e6390_serdes_irq_mapping, .serdes_irq_enable = mv88e6390_serdes_irq_enable, .serdes_irq_status = mv88e6390_serdes_irq_status, + .serdes_get_strings = mv88e6390_serdes_get_strings, + .serdes_get_stats = mv88e6390_serdes_get_stats, + .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .phylink_validate = mv88e6390_phylink_validate, }; @@ -3889,6 +3899,9 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = { .serdes_irq_mapping = mv88e6390_serdes_irq_mapping, .serdes_irq_enable = mv88e6390_serdes_irq_enable, .serdes_irq_status = mv88e6390_serdes_irq_status, + .serdes_get_strings = mv88e6390_serdes_get_strings, + .serdes_get_stats = mv88e6390_serdes_get_stats, + .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .phylink_validate = mv88e6390x_phylink_validate, }; @@ -3939,6 +3952,9 @@ static const struct mv88e6xxx_ops mv88e6191_ops = { .serdes_irq_mapping = mv88e6390_serdes_irq_mapping, .serdes_irq_enable = mv88e6390_serdes_irq_enable, .serdes_irq_status = mv88e6390_serdes_irq_status, + .serdes_get_strings = mv88e6390_serdes_get_strings, + .serdes_get_stats = mv88e6390_serdes_get_stats, + .phylink_validate = mv88e6390_phylink_validate, .avb_ops = &mv88e6390_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, .phylink_validate = mv88e6390_phylink_validate, @@ -4085,6 +4101,9 @@ static const struct mv88e6xxx_ops mv88e6290_ops = { .serdes_irq_mapping = mv88e6390_serdes_irq_mapping, .serdes_irq_enable = mv88e6390_serdes_irq_enable, .serdes_irq_status = mv88e6390_serdes_irq_status, + .serdes_get_strings = mv88e6390_serdes_get_strings, + .serdes_get_stats = mv88e6390_serdes_get_stats, + .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6390_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, @@ -4479,6 +4498,9 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = { .serdes_irq_mapping = mv88e6390_serdes_irq_mapping, .serdes_irq_enable = mv88e6390_serdes_irq_enable, .serdes_irq_status = mv88e6390_serdes_irq_status, + .serdes_get_sset_count = mv88e6390_serdes_get_sset_count, + .serdes_get_strings = mv88e6390_serdes_get_strings, + .serdes_get_stats = mv88e6390_serdes_get_stats, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6390_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, @@ -5210,7 +5232,8 @@ static struct mv88e6xxx_chip *mv88e6xxx_alloc_chip(struct device *dev) } static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { struct mv88e6xxx_chip *chip = ds->priv; diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 8a8e38bfb161..f332cb4b2fbf 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -236,6 +236,7 @@ struct mv88e6xxx_port { bool mirror_ingress; bool mirror_egress; unsigned int serdes_irq; + char serdes_irq_name[32]; }; struct mv88e6xxx_chip { @@ -292,11 +293,16 @@ struct mv88e6xxx_chip { struct mv88e6xxx_irq g1_irq; struct mv88e6xxx_irq g2_irq; int irq; + char irq_name[32]; int device_irq; + char device_irq_name[32]; int watchdog_irq; + char watchdog_irq_name[32]; int atu_prob_irq; + char atu_prob_irq_name[32]; int vtu_prob_irq; + char vtu_prob_irq_name[32]; struct kthread_worker *kworker; struct kthread_delayed_work irq_poll_work; diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c index 120a65d3e3ef..b016cc205f81 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.c +++ b/drivers/net/dsa/mv88e6xxx/global1.c @@ -360,6 +360,11 @@ int mv88e6390_g1_set_cpu_port(struct mv88e6xxx_chip *chip, int port) { u16 ptr = MV88E6390_G1_MONITOR_MGMT_CTL_PTR_CPU_DEST; + /* Use the default high priority for management frames sent to + * the CPU. + */ + port |= MV88E6390_G1_MONITOR_MGMT_CTL_PTR_CPU_DEST_MGMTPRI; + return mv88e6390_g1_monitor_write(chip, ptr, port); } diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h index bc5a6b2bb1e4..5324c6f4ae90 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.h +++ b/drivers/net/dsa/mv88e6xxx/global1.h @@ -211,6 +211,7 @@ #define MV88E6390_G1_MONITOR_MGMT_CTL_PTR_INGRESS_DEST 0x2000 #define MV88E6390_G1_MONITOR_MGMT_CTL_PTR_EGRESS_DEST 0x2100 #define MV88E6390_G1_MONITOR_MGMT_CTL_PTR_CPU_DEST 0x3000 +#define MV88E6390_G1_MONITOR_MGMT_CTL_PTR_CPU_DEST_MGMTPRI 0x00e0 #define MV88E6390_G1_MONITOR_MGMT_CTL_DATA_MASK 0x00ff /* Offset 0x1C: Global Control 2 */ diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index bdcd25560dd2..bac9a8a68e50 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -425,9 +425,12 @@ int mv88e6xxx_g1_atu_prob_irq_setup(struct mv88e6xxx_chip *chip) if (chip->atu_prob_irq < 0) return chip->atu_prob_irq; + snprintf(chip->atu_prob_irq_name, sizeof(chip->atu_prob_irq_name), + "mv88e6xxx-%s-g1-atu-prob", dev_name(chip->dev)); + err = request_threaded_irq(chip->atu_prob_irq, NULL, mv88e6xxx_g1_atu_prob_irq_thread_fn, - IRQF_ONESHOT, "mv88e6xxx-g1-atu-prob", + IRQF_ONESHOT, chip->atu_prob_irq_name, chip); if (err) irq_dispose_mapping(chip->atu_prob_irq); diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c index 33056a609e96..48390b7b18ad 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c @@ -631,9 +631,12 @@ int mv88e6xxx_g1_vtu_prob_irq_setup(struct mv88e6xxx_chip *chip) if (chip->vtu_prob_irq < 0) return chip->vtu_prob_irq; + snprintf(chip->vtu_prob_irq_name, sizeof(chip->vtu_prob_irq_name), + "mv88e6xxx-%s-g1-vtu-prob", dev_name(chip->dev)); + err = request_threaded_irq(chip->vtu_prob_irq, NULL, mv88e6xxx_g1_vtu_prob_irq_thread_fn, - IRQF_ONESHOT, "mv88e6xxx-g1-vtu-prob", + IRQF_ONESHOT, chip->vtu_prob_irq_name, chip); if (err) irq_dispose_mapping(chip->vtu_prob_irq); diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c index 87bfe7c8c9cd..01503014b1ee 100644 --- a/drivers/net/dsa/mv88e6xxx/global2.c +++ b/drivers/net/dsa/mv88e6xxx/global2.c @@ -948,10 +948,13 @@ static int mv88e6xxx_g2_watchdog_setup(struct mv88e6xxx_chip *chip) if (chip->watchdog_irq < 0) return chip->watchdog_irq; + snprintf(chip->watchdog_irq_name, sizeof(chip->watchdog_irq_name), + "mv88e6xxx-%s-watchdog", dev_name(chip->dev)); + err = request_threaded_irq(chip->watchdog_irq, NULL, mv88e6xxx_g2_watchdog_thread_fn, IRQF_ONESHOT | IRQF_TRIGGER_FALLING, - "mv88e6xxx-watchdog", chip); + chip->watchdog_irq_name, chip); if (err) return err; @@ -1114,9 +1117,12 @@ int mv88e6xxx_g2_irq_setup(struct mv88e6xxx_chip *chip) goto out; } + snprintf(chip->device_irq_name, sizeof(chip->device_irq_name), + "mv88e6xxx-%s-g2", dev_name(chip->dev)); + err = request_threaded_irq(chip->device_irq, NULL, mv88e6xxx_g2_irq_thread_fn, - IRQF_ONESHOT, "mv88e6xxx-g2", chip); + IRQF_ONESHOT, chip->device_irq_name, chip); if (err) goto out; diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 7fe256c5739d..0b43c650e100 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -393,7 +393,7 @@ phy_interface_t mv88e6390x_port_max_speed_mode(int port) } static int mv88e6xxx_port_set_cmode(struct mv88e6xxx_chip *chip, int port, - phy_interface_t mode) + phy_interface_t mode, bool force) { u8 lane; u16 cmode; @@ -427,8 +427,8 @@ static int mv88e6xxx_port_set_cmode(struct mv88e6xxx_chip *chip, int port, cmode = 0; } - /* cmode doesn't change, nothing to do for us */ - if (cmode == chip->ports[port].cmode) + /* cmode doesn't change, nothing to do for us unless forced */ + if (cmode == chip->ports[port].cmode && !force) return 0; lane = mv88e6xxx_serdes_get_lane(chip, port); @@ -484,7 +484,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, if (port != 9 && port != 10) return -EOPNOTSUPP; - return mv88e6xxx_port_set_cmode(chip, port, mode); + return mv88e6xxx_port_set_cmode(chip, port, mode, false); } int mv88e6390_port_set_cmode(struct mv88e6xxx_chip *chip, int port, @@ -504,7 +504,7 @@ int mv88e6390_port_set_cmode(struct mv88e6xxx_chip *chip, int port, break; } - return mv88e6xxx_port_set_cmode(chip, port, mode); + return mv88e6xxx_port_set_cmode(chip, port, mode, false); } static int mv88e6341_port_set_cmode_writable(struct mv88e6xxx_chip *chip, @@ -555,7 +555,7 @@ int mv88e6341_port_set_cmode(struct mv88e6xxx_chip *chip, int port, if (err) return err; - return mv88e6xxx_port_set_cmode(chip, port, mode); + return mv88e6xxx_port_set_cmode(chip, port, mode, true); } int mv88e6185_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode) diff --git a/drivers/net/dsa/ocelot/Kconfig b/drivers/net/dsa/ocelot/Kconfig index 6f9804093150..a5b7cca03d09 100644 --- a/drivers/net/dsa/ocelot/Kconfig +++ b/drivers/net/dsa/ocelot/Kconfig @@ -3,8 +3,10 @@ config NET_DSA_MSCC_FELIX tristate "Ocelot / Felix Ethernet switch support" depends on NET_DSA && PCI depends on NET_VENDOR_MICROSEMI + depends on NET_VENDOR_FREESCALE select MSCC_OCELOT_SWITCH select NET_DSA_TAG_OCELOT + select FSL_ENETC_MDIO help This driver supports the VSC9959 network switch, which is a member of the Vitesse / Microsemi / Microchip Ocelot family of switching cores. diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index b7f92464815d..3257962c147e 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -2,16 +2,22 @@ /* Copyright 2019 NXP Semiconductors */ #include <uapi/linux/if_bridge.h> +#include <soc/mscc/ocelot_qsys.h> +#include <soc/mscc/ocelot_sys.h> +#include <soc/mscc/ocelot_dev.h> +#include <soc/mscc/ocelot_ana.h> #include <soc/mscc/ocelot.h> #include <linux/packing.h> #include <linux/module.h> +#include <linux/of_net.h> #include <linux/pci.h> #include <linux/of.h> #include <net/dsa.h> #include "felix.h" static enum dsa_tag_protocol felix_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_OCELOT; } @@ -26,14 +32,6 @@ static int felix_set_ageing_time(struct dsa_switch *ds, return 0; } -static void felix_adjust_link(struct dsa_switch *ds, int port, - struct phy_device *phydev) -{ - struct ocelot *ocelot = ds->priv; - - ocelot_adjust_link(ocelot, port, phydev); -} - static int felix_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data) { @@ -155,6 +153,141 @@ static void felix_port_disable(struct dsa_switch *ds, int port) return ocelot_port_disable(ocelot, port); } +static void felix_phylink_validate(struct dsa_switch *ds, int port, + unsigned long *supported, + struct phylink_link_state *state) +{ + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; + + if (state->interface != PHY_INTERFACE_MODE_NA && + state->interface != ocelot_port->phy_mode) { + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + return; + } + + /* No half-duplex. */ + phylink_set_port_modes(mask); + phylink_set(mask, Autoneg); + phylink_set(mask, Pause); + phylink_set(mask, Asym_Pause); + phylink_set(mask, 10baseT_Full); + phylink_set(mask, 100baseT_Full); + phylink_set(mask, 1000baseT_Full); + + /* The internal ports that run at 2.5G are overclocked GMII */ + if (state->interface == PHY_INTERFACE_MODE_GMII || + state->interface == PHY_INTERFACE_MODE_2500BASEX || + state->interface == PHY_INTERFACE_MODE_USXGMII) { + phylink_set(mask, 2500baseT_Full); + phylink_set(mask, 2500baseX_Full); + } + + bitmap_and(supported, supported, mask, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_and(state->advertising, state->advertising, mask, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static int felix_phylink_mac_pcs_get_state(struct dsa_switch *ds, int port, + struct phylink_link_state *state) +{ + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + + if (felix->info->pcs_link_state) + felix->info->pcs_link_state(ocelot, port, state); + + return 0; +} + +static void felix_phylink_mac_config(struct dsa_switch *ds, int port, + unsigned int link_an_mode, + const struct phylink_link_state *state) +{ + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct felix *felix = ocelot_to_felix(ocelot); + u32 mac_fc_cfg; + + /* Take port out of reset by clearing the MAC_TX_RST, MAC_RX_RST and + * PORT_RST bits in CLOCK_CFG + */ + ocelot_port_writel(ocelot_port, DEV_CLOCK_CFG_LINK_SPEED(state->speed), + DEV_CLOCK_CFG); + + /* Flow control. Link speed is only used here to evaluate the time + * specification in incoming pause frames. + */ + mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(state->speed); + + /* handle Rx pause in all cases, with 2500base-X this is used for rate + * adaptation. + */ + mac_fc_cfg |= SYS_MAC_FC_CFG_RX_FC_ENA; + + if (state->pause & MLO_PAUSE_TX) + mac_fc_cfg |= SYS_MAC_FC_CFG_TX_FC_ENA | + SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) | + SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) | + SYS_MAC_FC_CFG_ZERO_PAUSE_ENA; + ocelot_write_rix(ocelot, mac_fc_cfg, SYS_MAC_FC_CFG, port); + + ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port); + + if (felix->info->pcs_init) + felix->info->pcs_init(ocelot, port, link_an_mode, state); +} + +static void felix_phylink_mac_an_restart(struct dsa_switch *ds, int port) +{ + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + + if (felix->info->pcs_an_restart) + felix->info->pcs_an_restart(ocelot, port); +} + +static void felix_phylink_mac_link_down(struct dsa_switch *ds, int port, + unsigned int link_an_mode, + phy_interface_t interface) +{ + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); + ocelot_rmw_rix(ocelot, 0, QSYS_SWITCH_PORT_MODE_PORT_ENA, + QSYS_SWITCH_PORT_MODE, port); +} + +static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port, + unsigned int link_an_mode, + phy_interface_t interface, + struct phy_device *phydev) +{ + struct ocelot *ocelot = ds->priv; + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + /* Enable MAC module */ + ocelot_port_writel(ocelot_port, DEV_MAC_ENA_CFG_RX_ENA | + DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG); + + /* Enable receiving frames on the port, and activate auto-learning of + * MAC addresses. + */ + ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO | + ANA_PORT_PORT_CFG_RECV_ENA | + ANA_PORT_PORT_CFG_PORTID_VAL(port), + ANA_PORT_PORT_CFG, port); + + /* Core: Enable port for frame transfer */ + ocelot_write_rix(ocelot, QSYS_SWITCH_PORT_MODE_INGRESS_DROP_MODE | + QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG(1) | + QSYS_SWITCH_PORT_MODE_PORT_ENA, + QSYS_SWITCH_PORT_MODE, port); +} + static void felix_get_strings(struct dsa_switch *ds, int port, u32 stringset, u8 *data) { @@ -185,10 +318,76 @@ static int felix_get_ts_info(struct dsa_switch *ds, int port, return ocelot_get_ts_info(ocelot, port, info); } +static int felix_parse_ports_node(struct felix *felix, + struct device_node *ports_node, + phy_interface_t *port_phy_modes) +{ + struct ocelot *ocelot = &felix->ocelot; + struct device *dev = felix->ocelot.dev; + struct device_node *child; + + for_each_available_child_of_node(ports_node, child) { + phy_interface_t phy_mode; + u32 port; + int err; + + /* Get switch port number from DT */ + if (of_property_read_u32(child, "reg", &port) < 0) { + dev_err(dev, "Port number not defined in device tree " + "(property \"reg\")\n"); + of_node_put(child); + return -ENODEV; + } + + /* Get PHY mode from DT */ + err = of_get_phy_mode(child, &phy_mode); + if (err) { + dev_err(dev, "Failed to read phy-mode or " + "phy-interface-type property for port %d\n", + port); + of_node_put(child); + return -ENODEV; + } + + err = felix->info->prevalidate_phy_mode(ocelot, port, phy_mode); + if (err < 0) { + dev_err(dev, "Unsupported PHY mode %s on port %d\n", + phy_modes(phy_mode), port); + return err; + } + + port_phy_modes[port] = phy_mode; + } + + return 0; +} + +static int felix_parse_dt(struct felix *felix, phy_interface_t *port_phy_modes) +{ + struct device *dev = felix->ocelot.dev; + struct device_node *switch_node; + struct device_node *ports_node; + int err; + + switch_node = dev->of_node; + + ports_node = of_get_child_by_name(switch_node, "ports"); + if (!ports_node) { + dev_err(dev, "Incorrect bindings: absent \"ports\" node\n"); + return -ENODEV; + } + + err = felix_parse_ports_node(felix, ports_node, port_phy_modes); + of_node_put(ports_node); + + return err; +} + static int felix_init_structs(struct felix *felix, int num_phys_ports) { struct ocelot *ocelot = &felix->ocelot; - resource_size_t base; + phy_interface_t *port_phy_modes; + resource_size_t switch_base; int port, i, err; ocelot->num_phys_ports = num_phys_ports; @@ -203,7 +402,19 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) ocelot->shared_queue_sz = felix->info->shared_queue_sz; ocelot->ops = felix->info->ops; - base = pci_resource_start(felix->pdev, felix->info->pci_bar); + port_phy_modes = kcalloc(num_phys_ports, sizeof(phy_interface_t), + GFP_KERNEL); + if (!port_phy_modes) + return -ENOMEM; + + err = felix_parse_dt(felix, port_phy_modes); + if (err) { + kfree(port_phy_modes); + return err; + } + + switch_base = pci_resource_start(felix->pdev, + felix->info->switch_pci_bar); for (i = 0; i < TARGET_MAX; i++) { struct regmap *target; @@ -214,13 +425,14 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) res = &felix->info->target_io_res[i]; res->flags = IORESOURCE_MEM; - res->start += base; - res->end += base; + res->start += switch_base; + res->end += switch_base; target = ocelot_regmap_init(ocelot, res); if (IS_ERR(target)) { dev_err(ocelot->dev, "Failed to map device memory space\n"); + kfree(port_phy_modes); return PTR_ERR(target); } @@ -230,6 +442,7 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) err = ocelot_regfields_init(ocelot, felix->info->regfields); if (err) { dev_err(ocelot->dev, "failed to init reg fields map\n"); + kfree(port_phy_modes); return err; } @@ -244,26 +457,37 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) if (!ocelot_port) { dev_err(ocelot->dev, "failed to allocate port memory\n"); + kfree(port_phy_modes); return -ENOMEM; } res = &felix->info->port_io_res[port]; res->flags = IORESOURCE_MEM; - res->start += base; - res->end += base; + res->start += switch_base; + res->end += switch_base; port_regs = devm_ioremap_resource(ocelot->dev, res); if (IS_ERR(port_regs)) { dev_err(ocelot->dev, "failed to map registers for port %d\n", port); + kfree(port_phy_modes); return PTR_ERR(port_regs); } + ocelot_port->phy_mode = port_phy_modes[port]; ocelot_port->ocelot = ocelot; ocelot_port->regs = port_regs; ocelot->ports[port] = ocelot_port; } + kfree(port_phy_modes); + + if (felix->info->mdio_bus_alloc) { + err = felix->info->mdio_bus_alloc(ocelot); + if (err < 0) + return err; + } + return 0; } @@ -293,12 +517,22 @@ static int felix_setup(struct dsa_switch *ds) OCELOT_TAG_PREFIX_LONG); } + /* It looks like the MAC/PCS interrupt register - PM0_IEVENT (0x8040) + * isn't instantiated for the Felix PF. + * In-band AN may take a few ms to complete, so we need to poll. + */ + ds->pcs_poll = true; + return 0; } static void felix_teardown(struct dsa_switch *ds) { struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + + if (felix->info->mdio_bus_free) + felix->info->mdio_bus_free(ocelot); /* stop workqueue thread */ ocelot_deinit(ocelot); @@ -369,7 +603,12 @@ static const struct dsa_switch_ops felix_switch_ops = { .get_ethtool_stats = felix_get_ethtool_stats, .get_sset_count = felix_get_sset_count, .get_ts_info = felix_get_ts_info, - .adjust_link = felix_adjust_link, + .phylink_validate = felix_phylink_validate, + .phylink_mac_link_state = felix_phylink_mac_pcs_get_state, + .phylink_mac_config = felix_phylink_mac_config, + .phylink_mac_an_restart = felix_phylink_mac_an_restart, + .phylink_mac_link_down = felix_phylink_mac_link_down, + .phylink_mac_link_up = felix_phylink_mac_link_up, .port_enable = felix_port_enable, .port_disable = felix_port_disable, .port_fdb_dump = felix_fdb_dump, diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 204296e51d0c..3a7580015b62 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -10,6 +10,7 @@ struct felix_info { struct resource *target_io_res; struct resource *port_io_res; + struct resource *imdio_res; const struct reg_field *regfields; const u32 *const *map; const struct ocelot_ops *ops; @@ -17,7 +18,18 @@ struct felix_info { const struct ocelot_stat_layout *stats_layout; unsigned int num_stats; int num_ports; - int pci_bar; + int switch_pci_bar; + int imdio_pci_bar; + int (*mdio_bus_alloc)(struct ocelot *ocelot); + void (*mdio_bus_free)(struct ocelot *ocelot); + void (*pcs_init)(struct ocelot *ocelot, int port, + unsigned int link_an_mode, + const struct phylink_link_state *state); + void (*pcs_an_restart)(struct ocelot *ocelot, int port); + void (*pcs_link_state)(struct ocelot *ocelot, int port, + struct phylink_link_state *state); + int (*prevalidate_phy_mode)(struct ocelot *ocelot, int port, + phy_interface_t phy_mode); }; extern struct felix_info felix_info_vsc9959; @@ -32,6 +44,8 @@ struct felix { struct pci_dev *pdev; struct felix_info *info; struct ocelot ocelot; + struct mii_bus *imdio; + struct phy_device **pcs; }; #endif diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index b9758b0d18c7..2c812b481778 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -2,12 +2,33 @@ /* Copyright 2017 Microsemi Corporation * Copyright 2018-2019 NXP Semiconductors */ +#include <linux/fsl/enetc_mdio.h> #include <soc/mscc/ocelot_sys.h> #include <soc/mscc/ocelot.h> #include <linux/iopoll.h> #include <linux/pci.h> #include "felix.h" +/* TODO: should find a better place for these */ +#define USXGMII_BMCR_RESET BIT(15) +#define USXGMII_BMCR_AN_EN BIT(12) +#define USXGMII_BMCR_RST_AN BIT(9) +#define USXGMII_BMSR_LNKS(status) (((status) & GENMASK(2, 2)) >> 2) +#define USXGMII_BMSR_AN_CMPL(status) (((status) & GENMASK(5, 5)) >> 5) +#define USXGMII_ADVERTISE_LNKS(x) (((x) << 15) & BIT(15)) +#define USXGMII_ADVERTISE_FDX BIT(12) +#define USXGMII_ADVERTISE_SPEED(x) (((x) << 9) & GENMASK(11, 9)) +#define USXGMII_LPA_LNKS(lpa) ((lpa) >> 15) +#define USXGMII_LPA_DUPLEX(lpa) (((lpa) & GENMASK(12, 12)) >> 12) +#define USXGMII_LPA_SPEED(lpa) (((lpa) & GENMASK(11, 9)) >> 9) + +enum usxgmii_speed { + USXGMII_SPEED_10 = 0, + USXGMII_SPEED_100 = 1, + USXGMII_SPEED_1000 = 2, + USXGMII_SPEED_2500 = 4, +}; + static const u32 vsc9959_ana_regmap[] = { REG(ANA_ADVLEARN, 0x0089a0), REG(ANA_VLANMASK, 0x0089a4), @@ -386,6 +407,15 @@ static struct resource vsc9959_port_io_res[] = { }, }; +/* Port MAC 0 Internal MDIO bus through which the SerDes acting as an + * SGMII/QSGMII MAC PCS can be found. + */ +static struct resource vsc9959_imdio_res = { + .start = 0x8030, + .end = 0x8040, + .name = "imdio", +}; + static const struct reg_field vsc9959_regfields[] = { [ANA_ADVLEARN_VLAN_CHK] = REG_FIELD(ANA_ADVLEARN, 6, 6), [ANA_ADVLEARN_LEARN_MIRROR] = REG_FIELD(ANA_ADVLEARN, 0, 5), @@ -565,13 +595,495 @@ static int vsc9959_reset(struct ocelot *ocelot) return 0; } +static void vsc9959_pcs_an_restart_sgmii(struct phy_device *pcs) +{ + phy_set_bits(pcs, MII_BMCR, BMCR_ANRESTART); +} + +static void vsc9959_pcs_an_restart_usxgmii(struct phy_device *pcs) +{ + phy_write_mmd(pcs, MDIO_MMD_VEND2, MII_BMCR, + USXGMII_BMCR_RESET | + USXGMII_BMCR_AN_EN | + USXGMII_BMCR_RST_AN); +} + +static void vsc9959_pcs_an_restart(struct ocelot *ocelot, int port) +{ + struct felix *felix = ocelot_to_felix(ocelot); + struct phy_device *pcs = felix->pcs[port]; + + if (!pcs) + return; + + switch (pcs->interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + vsc9959_pcs_an_restart_sgmii(pcs); + break; + case PHY_INTERFACE_MODE_USXGMII: + vsc9959_pcs_an_restart_usxgmii(pcs); + break; + default: + dev_err(ocelot->dev, "Invalid PCS interface type %s\n", + phy_modes(pcs->interface)); + break; + } +} + +/* We enable SGMII AN only when the PHY has managed = "in-band-status" in the + * device tree. If we are in MLO_AN_PHY mode, we program directly state->speed + * into the PCS, which is retrieved out-of-band over MDIO. This also has the + * benefit of working with SGMII fixed-links, like downstream switches, where + * both link partners attempt to operate as AN slaves and therefore AN never + * completes. But it also has the disadvantage that some PHY chips don't pass + * traffic if SGMII AN is enabled but not completed (acknowledged by us), so + * setting MLO_AN_INBAND is actually required for those. + */ +static void vsc9959_pcs_init_sgmii(struct phy_device *pcs, + unsigned int link_an_mode, + const struct phylink_link_state *state) +{ + if (link_an_mode == MLO_AN_INBAND) { + int bmsr, bmcr; + + /* Some PHYs like VSC8234 don't like it when AN restarts on + * their system side and they restart line side AN too, going + * into an endless link up/down loop. Don't restart PCS AN if + * link is up already. + * We do check that AN is enabled just in case this is the 1st + * call, PCS detects a carrier but AN is disabled from power on + * or by boot loader. + */ + bmcr = phy_read(pcs, MII_BMCR); + if (bmcr < 0) + return; + + bmsr = phy_read(pcs, MII_BMSR); + if (bmsr < 0) + return; + + if ((bmcr & BMCR_ANENABLE) && (bmsr & BMSR_LSTATUS)) + return; + + /* SGMII spec requires tx_config_Reg[15:0] to be exactly 0x4001 + * for the MAC PCS in order to acknowledge the AN. + */ + phy_write(pcs, MII_ADVERTISE, ADVERTISE_SGMII | + ADVERTISE_LPACK); + + phy_write(pcs, ENETC_PCS_IF_MODE, + ENETC_PCS_IF_MODE_SGMII_EN | + ENETC_PCS_IF_MODE_USE_SGMII_AN); + + /* Adjust link timer for SGMII */ + phy_write(pcs, ENETC_PCS_LINK_TIMER1, + ENETC_PCS_LINK_TIMER1_VAL); + phy_write(pcs, ENETC_PCS_LINK_TIMER2, + ENETC_PCS_LINK_TIMER2_VAL); + + phy_write(pcs, MII_BMCR, BMCR_ANRESTART | BMCR_ANENABLE); + } else { + int speed; + + if (state->duplex == DUPLEX_HALF) { + phydev_err(pcs, "Half duplex not supported\n"); + return; + } + switch (state->speed) { + case SPEED_1000: + speed = ENETC_PCS_SPEED_1000; + break; + case SPEED_100: + speed = ENETC_PCS_SPEED_100; + break; + case SPEED_10: + speed = ENETC_PCS_SPEED_10; + break; + case SPEED_UNKNOWN: + /* Silently don't do anything */ + return; + default: + phydev_err(pcs, "Invalid PCS speed %d\n", state->speed); + return; + } + + phy_write(pcs, ENETC_PCS_IF_MODE, + ENETC_PCS_IF_MODE_SGMII_EN | + ENETC_PCS_IF_MODE_SGMII_SPEED(speed)); + + /* Yes, not a mistake: speed is given by IF_MODE. */ + phy_write(pcs, MII_BMCR, BMCR_RESET | + BMCR_SPEED1000 | + BMCR_FULLDPLX); + } +} + +/* 2500Base-X is SerDes protocol 7 on Felix and 6 on ENETC. It is a SerDes lane + * clocked at 3.125 GHz which encodes symbols with 8b/10b and does not have + * auto-negotiation of any link parameters. Electrically it is compatible with + * a single lane of XAUI. + * The hardware reference manual wants to call this mode SGMII, but it isn't + * really, since the fundamental features of SGMII: + * - Downgrading the link speed by duplicating symbols + * - Auto-negotiation + * are not there. + * The speed is configured at 1000 in the IF_MODE and BMCR MDIO registers + * because the clock frequency is actually given by a PLL configured in the + * Reset Configuration Word (RCW). + * Since there is no difference between fixed speed SGMII w/o AN and 802.3z w/o + * AN, we call this PHY interface type 2500Base-X. In case a PHY negotiates a + * lower link speed on line side, the system-side interface remains fixed at + * 2500 Mbps and we do rate adaptation through pause frames. + */ +static void vsc9959_pcs_init_2500basex(struct phy_device *pcs, + unsigned int link_an_mode, + const struct phylink_link_state *state) +{ + if (link_an_mode == MLO_AN_INBAND) { + phydev_err(pcs, "AN not supported on 3.125GHz SerDes lane\n"); + return; + } + + phy_write(pcs, ENETC_PCS_IF_MODE, + ENETC_PCS_IF_MODE_SGMII_EN | + ENETC_PCS_IF_MODE_SGMII_SPEED(ENETC_PCS_SPEED_2500)); + + phy_write(pcs, MII_BMCR, BMCR_SPEED1000 | + BMCR_FULLDPLX | + BMCR_RESET); +} + +static void vsc9959_pcs_init_usxgmii(struct phy_device *pcs, + unsigned int link_an_mode, + const struct phylink_link_state *state) +{ + if (link_an_mode != MLO_AN_INBAND) { + phydev_err(pcs, "USXGMII only supports in-band AN for now\n"); + return; + } + + /* Configure device ability for the USXGMII Replicator */ + phy_write_mmd(pcs, MDIO_MMD_VEND2, MII_ADVERTISE, + USXGMII_ADVERTISE_SPEED(USXGMII_SPEED_2500) | + USXGMII_ADVERTISE_LNKS(1) | + ADVERTISE_SGMII | + ADVERTISE_LPACK | + USXGMII_ADVERTISE_FDX); +} + +static void vsc9959_pcs_init(struct ocelot *ocelot, int port, + unsigned int link_an_mode, + const struct phylink_link_state *state) +{ + struct felix *felix = ocelot_to_felix(ocelot); + struct phy_device *pcs = felix->pcs[port]; + + if (!pcs) + return; + + /* The PCS does not implement the BMSR register fully, so capability + * detection via genphy_read_abilities does not work. Since we can get + * the PHY config word from the LPA register though, there is still + * value in using the generic phy_resolve_aneg_linkmode function. So + * populate the supported and advertising link modes manually here. + */ + linkmode_set_bit_array(phy_basic_ports_array, + ARRAY_SIZE(phy_basic_ports_array), + pcs->supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, pcs->supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, pcs->supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, pcs->supported); + if (pcs->interface == PHY_INTERFACE_MODE_2500BASEX || + pcs->interface == PHY_INTERFACE_MODE_USXGMII) + linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, + pcs->supported); + if (pcs->interface != PHY_INTERFACE_MODE_2500BASEX) + linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, + pcs->supported); + phy_advertise_supported(pcs); + + switch (pcs->interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + vsc9959_pcs_init_sgmii(pcs, link_an_mode, state); + break; + case PHY_INTERFACE_MODE_2500BASEX: + vsc9959_pcs_init_2500basex(pcs, link_an_mode, state); + break; + case PHY_INTERFACE_MODE_USXGMII: + vsc9959_pcs_init_usxgmii(pcs, link_an_mode, state); + break; + default: + dev_err(ocelot->dev, "Unsupported link mode %s\n", + phy_modes(pcs->interface)); + } +} + +static void vsc9959_pcs_link_state_resolve(struct phy_device *pcs, + struct phylink_link_state *state) +{ + state->an_complete = pcs->autoneg_complete; + state->an_enabled = pcs->autoneg; + state->link = pcs->link; + state->duplex = pcs->duplex; + state->speed = pcs->speed; + /* SGMII AN does not negotiate flow control, but that's ok, + * since phylink already knows that, and does: + * link_state.pause |= pl->phy_state.pause; + */ + state->pause = MLO_PAUSE_NONE; + + phydev_dbg(pcs, + "mode=%s/%s/%s adv=%*pb lpa=%*pb link=%u an_enabled=%u an_complete=%u\n", + phy_modes(pcs->interface), + phy_speed_to_str(pcs->speed), + phy_duplex_to_str(pcs->duplex), + __ETHTOOL_LINK_MODE_MASK_NBITS, pcs->advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, pcs->lp_advertising, + pcs->link, pcs->autoneg, pcs->autoneg_complete); +} + +static void vsc9959_pcs_link_state_sgmii(struct phy_device *pcs, + struct phylink_link_state *state) +{ + int err; + + err = genphy_update_link(pcs); + if (err < 0) + return; + + if (pcs->autoneg_complete) { + u16 lpa = phy_read(pcs, MII_LPA); + + mii_lpa_to_linkmode_lpa_sgmii(pcs->lp_advertising, lpa); + + phy_resolve_aneg_linkmode(pcs); + } +} + +static void vsc9959_pcs_link_state_2500basex(struct phy_device *pcs, + struct phylink_link_state *state) +{ + int err; + + err = genphy_update_link(pcs); + if (err < 0) + return; + + pcs->speed = SPEED_2500; + pcs->asym_pause = true; + pcs->pause = true; +} + +static void vsc9959_pcs_link_state_usxgmii(struct phy_device *pcs, + struct phylink_link_state *state) +{ + int status, lpa; + + status = phy_read_mmd(pcs, MDIO_MMD_VEND2, MII_BMSR); + if (status < 0) + return; + + pcs->autoneg = true; + pcs->autoneg_complete = USXGMII_BMSR_AN_CMPL(status); + pcs->link = USXGMII_BMSR_LNKS(status); + + if (!pcs->link || !pcs->autoneg_complete) + return; + + lpa = phy_read_mmd(pcs, MDIO_MMD_VEND2, MII_LPA); + if (lpa < 0) + return; + + switch (USXGMII_LPA_SPEED(lpa)) { + case USXGMII_SPEED_10: + pcs->speed = SPEED_10; + break; + case USXGMII_SPEED_100: + pcs->speed = SPEED_100; + break; + case USXGMII_SPEED_1000: + pcs->speed = SPEED_1000; + break; + case USXGMII_SPEED_2500: + pcs->speed = SPEED_2500; + break; + default: + break; + } + + if (USXGMII_LPA_DUPLEX(lpa)) + pcs->duplex = DUPLEX_FULL; + else + pcs->duplex = DUPLEX_HALF; +} + +static void vsc9959_pcs_link_state(struct ocelot *ocelot, int port, + struct phylink_link_state *state) +{ + struct felix *felix = ocelot_to_felix(ocelot); + struct phy_device *pcs = felix->pcs[port]; + + if (!pcs) + return; + + pcs->speed = SPEED_UNKNOWN; + pcs->duplex = DUPLEX_UNKNOWN; + pcs->pause = 0; + pcs->asym_pause = 0; + + switch (pcs->interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + vsc9959_pcs_link_state_sgmii(pcs, state); + break; + case PHY_INTERFACE_MODE_2500BASEX: + vsc9959_pcs_link_state_2500basex(pcs, state); + break; + case PHY_INTERFACE_MODE_USXGMII: + vsc9959_pcs_link_state_usxgmii(pcs, state); + break; + default: + return; + } + + vsc9959_pcs_link_state_resolve(pcs, state); +} + +static int vsc9959_prevalidate_phy_mode(struct ocelot *ocelot, int port, + phy_interface_t phy_mode) +{ + switch (phy_mode) { + case PHY_INTERFACE_MODE_GMII: + /* Only supported on internal to-CPU ports */ + if (port != 4 && port != 5) + return -ENOTSUPP; + return 0; + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_USXGMII: + case PHY_INTERFACE_MODE_2500BASEX: + /* Not supported on internal to-CPU ports */ + if (port == 4 || port == 5) + return -ENOTSUPP; + return 0; + default: + return -ENOTSUPP; + } +} + static const struct ocelot_ops vsc9959_ops = { .reset = vsc9959_reset, }; +static int vsc9959_mdio_bus_alloc(struct ocelot *ocelot) +{ + struct felix *felix = ocelot_to_felix(ocelot); + struct enetc_mdio_priv *mdio_priv; + struct device *dev = ocelot->dev; + resource_size_t imdio_base; + void __iomem *imdio_regs; + struct resource *res; + struct enetc_hw *hw; + struct mii_bus *bus; + int port; + int rc; + + felix->pcs = devm_kcalloc(dev, felix->info->num_ports, + sizeof(struct phy_device *), + GFP_KERNEL); + if (!felix->pcs) { + dev_err(dev, "failed to allocate array for PCS PHYs\n"); + return -ENOMEM; + } + + imdio_base = pci_resource_start(felix->pdev, + felix->info->imdio_pci_bar); + + res = felix->info->imdio_res; + res->flags = IORESOURCE_MEM; + res->start += imdio_base; + res->end += imdio_base; + + imdio_regs = devm_ioremap_resource(dev, res); + if (IS_ERR(imdio_regs)) { + dev_err(dev, "failed to map internal MDIO registers\n"); + return PTR_ERR(imdio_regs); + } + + hw = enetc_hw_alloc(dev, imdio_regs); + if (IS_ERR(hw)) { + dev_err(dev, "failed to allocate ENETC HW structure\n"); + return PTR_ERR(hw); + } + + bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv)); + if (!bus) + return -ENOMEM; + + bus->name = "VSC9959 internal MDIO bus"; + bus->read = enetc_mdio_read; + bus->write = enetc_mdio_write; + bus->parent = dev; + mdio_priv = bus->priv; + mdio_priv->hw = hw; + /* This gets added to imdio_regs, which already maps addresses + * starting with the proper offset. + */ + mdio_priv->mdio_base = 0; + snprintf(bus->id, MII_BUS_ID_SIZE, "%s-imdio", dev_name(dev)); + + /* Needed in order to initialize the bus mutex lock */ + rc = mdiobus_register(bus); + if (rc < 0) { + dev_err(dev, "failed to register MDIO bus\n"); + return rc; + } + + felix->imdio = bus; + + for (port = 0; port < felix->info->num_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct phy_device *pcs; + bool is_c45 = false; + + if (ocelot_port->phy_mode == PHY_INTERFACE_MODE_USXGMII) + is_c45 = true; + + pcs = get_phy_device(felix->imdio, port, is_c45); + if (IS_ERR(pcs)) + continue; + + pcs->interface = ocelot_port->phy_mode; + felix->pcs[port] = pcs; + + dev_info(dev, "Found PCS at internal MDIO address %d\n", port); + } + + return 0; +} + +static void vsc9959_mdio_bus_free(struct ocelot *ocelot) +{ + struct felix *felix = ocelot_to_felix(ocelot); + int port; + + for (port = 0; port < ocelot->num_phys_ports; port++) { + struct phy_device *pcs = felix->pcs[port]; + + if (!pcs) + continue; + + put_device(&pcs->mdio.dev); + } + mdiobus_unregister(felix->imdio); +} + struct felix_info felix_info_vsc9959 = { .target_io_res = vsc9959_target_io_res, .port_io_res = vsc9959_port_io_res, + .imdio_res = &vsc9959_imdio_res, .regfields = vsc9959_regfields, .map = vsc9959_regmap, .ops = &vsc9959_ops, @@ -579,5 +1091,12 @@ struct felix_info felix_info_vsc9959 = { .num_stats = ARRAY_SIZE(vsc9959_stats_layout), .shared_queue_sz = 128 * 1024, .num_ports = 6, - .pci_bar = 4, + .switch_pci_bar = 4, + .imdio_pci_bar = 0, + .mdio_bus_alloc = vsc9959_mdio_bus_alloc, + .mdio_bus_free = vsc9959_mdio_bus_free, + .pcs_init = vsc9959_pcs_init, + .pcs_an_restart = vsc9959_pcs_an_restart, + .pcs_link_state = vsc9959_pcs_link_state, + .prevalidate_phy_mode = vsc9959_prevalidate_phy_mode, }; diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c index da3bece75e21..de25f99e995a 100644 --- a/drivers/net/dsa/qca/ar9331.c +++ b/drivers/net/dsa/qca/ar9331.c @@ -347,7 +347,8 @@ static void ar9331_sw_port_disable(struct dsa_switch *ds, int port) } static enum dsa_tag_protocol ar9331_sw_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { return DSA_TAG_PROTO_AR9331; } diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index e548289df31e..9f4205b4439b 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1017,7 +1017,8 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static enum dsa_tag_protocol -qca8k_get_tag_protocol(struct dsa_switch *ds, int port) +qca8k_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_QCA; } diff --git a/drivers/net/dsa/rtl8366rb.c b/drivers/net/dsa/rtl8366rb.c index f5cc8b0a7c74..fd1977590cb4 100644 --- a/drivers/net/dsa/rtl8366rb.c +++ b/drivers/net/dsa/rtl8366rb.c @@ -964,7 +964,8 @@ static int rtl8366rb_setup(struct dsa_switch *ds) } static enum dsa_tag_protocol rtl8366_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { /* For now, the RTL switches are handled without any custom tags. * diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 1da5ac111499..03ba6d25f7fe 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -426,14 +426,6 @@ static int sja1105_init_general_params(struct sja1105_private *priv) .tpid2 = ETH_P_SJA1105, }; struct sja1105_table *table; - int i, k = 0; - - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - if (dsa_is_dsa_port(priv->ds, i)) - default_general_params.casc_port = i; - else if (dsa_is_user_port(priv->ds, i)) - priv->ports[i].mgmt_slot = k++; - } table = &priv->static_config.tables[BLK_IDX_GENERAL_PARAMS]; @@ -582,7 +574,7 @@ static int sja1105_parse_ports_node(struct sja1105_private *priv, struct device *dev = &priv->spidev->dev; struct device_node *child; - for_each_child_of_node(ports_node, child) { + for_each_available_child_of_node(ports_node, child) { struct device_node *phy_node; phy_interface_t phy_mode; u32 index; @@ -1542,7 +1534,8 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) } static enum dsa_tag_protocol -sja1105_get_tag_protocol(struct dsa_switch *ds, int port) +sja1105_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_SJA1105; } @@ -1740,6 +1733,16 @@ static int sja1105_setup(struct dsa_switch *ds) static void sja1105_teardown(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + int port; + + for (port = 0; port < SJA1105_NUM_PORTS; port++) { + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + continue; + + kthread_destroy_worker(sp->xmit_worker); + } sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); @@ -1761,6 +1764,18 @@ static int sja1105_port_enable(struct dsa_switch *ds, int port, return 0; } +static void sja1105_port_disable(struct dsa_switch *ds, int port) +{ + struct sja1105_private *priv = ds->priv; + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + return; + + kthread_cancel_work_sync(&sp->xmit_work); + skb_queue_purge(&sp->xmit_queue); +} + static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot, struct sk_buff *skb, bool takets) { @@ -1819,47 +1834,36 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot, return NETDEV_TX_OK; } +#define work_to_port(work) \ + container_of((work), struct sja1105_port, xmit_work) +#define tagger_to_sja1105(t) \ + container_of((t), struct sja1105_private, tagger_data) + /* Deferred work is unfortunately necessary because setting up the management * route cannot be done from atomit context (SPI transfer takes a sleepable * lock on the bus) */ -static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port, - struct sk_buff *skb) +static void sja1105_port_deferred_xmit(struct kthread_work *work) { - struct sja1105_private *priv = ds->priv; - struct sja1105_port *sp = &priv->ports[port]; - int slot = sp->mgmt_slot; - struct sk_buff *clone; - - /* The tragic fact about the switch having 4x2 slots for installing - * management routes is that all of them except one are actually - * useless. - * If 2 slots are simultaneously configured for two BPDUs sent to the - * same (multicast) DMAC but on different egress ports, the switch - * would confuse them and redirect first frame it receives on the CPU - * port towards the port configured on the numerically first slot - * (therefore wrong port), then second received frame on second slot - * (also wrong port). - * So for all practical purposes, there needs to be a lock that - * prevents that from happening. The slot used here is utterly useless - * (could have simply been 0 just as fine), but we are doing it - * nonetheless, in case a smarter idea ever comes up in the future. - */ - mutex_lock(&priv->mgmt_lock); + struct sja1105_port *sp = work_to_port(work); + struct sja1105_tagger_data *tagger_data = sp->data; + struct sja1105_private *priv = tagger_to_sja1105(tagger_data); + int port = sp - priv->ports; + struct sk_buff *skb; - /* The clone, if there, was made by dsa_skb_tx_timestamp */ - clone = DSA_SKB_CB(skb)->clone; + while ((skb = skb_dequeue(&sp->xmit_queue)) != NULL) { + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; - sja1105_mgmt_xmit(ds, port, slot, skb, !!clone); + mutex_lock(&priv->mgmt_lock); - if (!clone) - goto out; + sja1105_mgmt_xmit(priv->ds, port, 0, skb, !!clone); - sja1105_ptp_txtstamp_skb(ds, port, clone); + /* The clone, if there, was made by dsa_skb_tx_timestamp */ + if (clone) + sja1105_ptp_txtstamp_skb(priv->ds, port, clone); -out: - mutex_unlock(&priv->mgmt_lock); - return NETDEV_TX_OK; + mutex_unlock(&priv->mgmt_lock); + } } /* The MAXAGE setting belongs to the L2 Forwarding Parameters table, @@ -1990,6 +1994,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .get_sset_count = sja1105_get_sset_count, .get_ts_info = sja1105_get_ts_info, .port_enable = sja1105_port_enable, + .port_disable = sja1105_port_disable, .port_fdb_dump = sja1105_fdb_dump, .port_fdb_add = sja1105_fdb_add, .port_fdb_del = sja1105_fdb_del, @@ -2003,7 +2008,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .port_mdb_prepare = sja1105_mdb_prepare, .port_mdb_add = sja1105_mdb_add, .port_mdb_del = sja1105_mdb_del, - .port_deferred_xmit = sja1105_port_deferred_xmit, .port_hwtstamp_get = sja1105_hwtstamp_get, .port_hwtstamp_set = sja1105_hwtstamp_set, .port_rxtstamp = sja1105_port_rxtstamp, @@ -2055,7 +2059,7 @@ static int sja1105_probe(struct spi_device *spi) struct device *dev = &spi->dev; struct sja1105_private *priv; struct dsa_switch *ds; - int rc, i; + int rc, port; if (!dev->of_node) { dev_err(dev, "No DTS bindings for SJA1105 driver\n"); @@ -2120,15 +2124,42 @@ static int sja1105_probe(struct spi_device *spi) return rc; /* Connections between dsa_port and sja1105_port */ - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - struct sja1105_port *sp = &priv->ports[i]; + for (port = 0; port < SJA1105_NUM_PORTS; port++) { + struct sja1105_port *sp = &priv->ports[port]; + struct dsa_port *dp = dsa_to_port(ds, port); + struct net_device *slave; - dsa_to_port(ds, i)->priv = sp; - sp->dp = dsa_to_port(ds, i); + if (!dsa_is_user_port(ds, port)) + continue; + + dp->priv = sp; + sp->dp = dp; sp->data = tagger_data; + slave = dp->slave; + kthread_init_work(&sp->xmit_work, sja1105_port_deferred_xmit); + sp->xmit_worker = kthread_create_worker(0, "%s_xmit", + slave->name); + if (IS_ERR(sp->xmit_worker)) { + rc = PTR_ERR(sp->xmit_worker); + dev_err(ds->dev, + "failed to create deferred xmit thread: %d\n", + rc); + goto out; + } + skb_queue_head_init(&sp->xmit_queue); } return 0; +out: + while (port-- > 0) { + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + continue; + + kthread_destroy_worker(sp->xmit_worker); + } + return rc; } static int sja1105_remove(struct spi_device *spi) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index 42c1574d45f2..6e21a2a5cf01 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -542,7 +542,8 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, } static enum dsa_tag_protocol vsc73xx_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { /* The switch internally uses a 8 byte header with length, * source port, tag, LPA and priority. This is supposedly @@ -1111,7 +1112,9 @@ static int vsc73xx_gpio_probe(struct vsc73xx *vsc) vsc->gc.ngpio = 4; vsc->gc.owner = THIS_MODULE; vsc->gc.parent = vsc->dev; +#if IS_ENABLED(CONFIG_OF_GPIO) vsc->gc.of_node = vsc->dev->of_node; +#endif vsc->gc.base = -1; vsc->gc.get = vsc73xx_gpio_get; vsc->gc.set = vsc73xx_gpio_set; diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index fc046797c0ea..a2b7f7ab8170 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -1548,7 +1548,7 @@ vortex_up(struct net_device *dev) struct vortex_private *vp = netdev_priv(dev); void __iomem *ioaddr = vp->ioaddr; unsigned int config; - int i, mii_reg1, mii_reg5, err = 0; + int i, mii_reg5, err = 0; if (VORTEX_PCI(vp)) { pci_set_power_state(VORTEX_PCI(vp), PCI_D0); /* Go active */ @@ -1605,7 +1605,7 @@ vortex_up(struct net_device *dev) window_write32(vp, config, 3, Wn3_Config); if (dev->if_port == XCVR_MII || dev->if_port == XCVR_NWAY) { - mii_reg1 = mdio_read(dev, vp->phys[0], MII_BMSR); + mdio_read(dev, vp->phys[0], MII_BMSR); mii_reg5 = mdio_read(dev, vp->phys[0], MII_LPA); vp->partner_flow_ctrl = ((mii_reg5 & 0x0400) != 0); vp->mii.full_duplex = vp->full_duplex; diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c index 3c51d8c502ed..cb6a761d5c11 100644 --- a/drivers/net/ethernet/agere/et131x.c +++ b/drivers/net/ethernet/agere/et131x.c @@ -3651,15 +3651,6 @@ static int et131x_close(struct net_device *netdev) return del_timer_sync(&adapter->error_timer); } -static int et131x_ioctl(struct net_device *netdev, struct ifreq *reqbuf, - int cmd) -{ - if (!netdev->phydev) - return -EINVAL; - - return phy_mii_ioctl(netdev->phydev, reqbuf, cmd); -} - /* et131x_set_packet_filter - Configures the Rx Packet filtering */ static int et131x_set_packet_filter(struct et131x_adapter *adapter) { @@ -3899,7 +3890,7 @@ static const struct net_device_ops et131x_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_get_stats = et131x_stats, - .ndo_do_ioctl = et131x_ioctl, + .ndo_do_ioctl = phy_do_ioctl, }; static int et131x_pci_setup(struct pci_dev *pdev, diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 5ea806423e4c..22cadfbeedfb 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -207,19 +207,6 @@ static void emac_inblk_32bit(void __iomem *reg, void *data, int count) readsl(reg, data, round_up(count, 4) / 4); } -static int emac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct phy_device *phydev = dev->phydev; - - if (!netif_running(dev)) - return -EINVAL; - - if (!phydev) - return -ENODEV; - - return phy_mii_ioctl(phydev, rq, cmd); -} - /* ethtool ops */ static void emac_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) @@ -791,7 +778,7 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_start_xmit = emac_start_xmit, .ndo_tx_timeout = emac_timeout, .ndo_set_rx_mode = emac_set_rx_mode, - .ndo_do_ioctl = emac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = emac_set_mac_address, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/amd/a2065.c b/drivers/net/ethernet/amd/a2065.c index a3faf4feb204..2f808dbc8b0e 100644 --- a/drivers/net/ethernet/amd/a2065.c +++ b/drivers/net/ethernet/amd/a2065.c @@ -118,10 +118,6 @@ struct lance_private { int auto_select; /* cable-selection by carrier */ unsigned short busmaster_regval; -#ifdef CONFIG_SUNLANCE - struct Linux_SBus_DMA *ledma; /* if set this points to ledma and arch=4m */ - int burst_sizes; /* ledma SBus burst sizes */ -#endif struct timer_list multicast_timer; struct net_device *dev; }; @@ -551,11 +547,10 @@ static netdev_tx_t lance_start_xmit(struct sk_buff *skb, if (!lance_tx_buffs_avail(lp)) goto out_free; -#ifdef DEBUG /* dump the packet */ - print_hex_dump(KERN_DEBUG, "skb->data: ", DUMP_PREFIX_NONE, - 16, 1, skb->data, 64, true); -#endif + print_hex_dump_debug("skb->data: ", DUMP_PREFIX_NONE, 16, 1, skb->data, + 64, true); + entry = lp->tx_new & lp->tx_ring_mod_mask; ib->btx_ring[entry].length = (-skblen) | 0xf000; ib->btx_ring[entry].misc = 0; diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index d832c9f4d306..255847e7fa5b 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -1053,23 +1053,12 @@ static void au1000_multicast_list(struct net_device *dev) writel(reg, &aup->mac->control); } -static int au1000_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - - if (!dev->phydev) - return -EINVAL; /* PHY not controllable */ - - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - static const struct net_device_ops au1000_netdev_ops = { .ndo_open = au1000_open, .ndo_stop = au1000_close, .ndo_start_xmit = au1000_tx, .ndo_set_rx_mode = au1000_multicast_list, - .ndo_do_ioctl = au1000_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_tx_timeout = au1000_tx_timeout, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 6592a2db9efb..7282ce55ffb8 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -608,7 +608,7 @@ static int lance_rx(struct net_device *dev) len = (*rds_ptr(rd, mblength, lp->type) & 0xfff) - 4; skb = netdev_alloc_skb(dev, len + 2); - if (skb == 0) { + if (!skb) { dev->stats.rx_dropped++; *rds_ptr(rd, mblength, lp->type) = 0; *rds_ptr(rd, rmd1, lp->type) = diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 128cd648ba99..46c3c1ca38d6 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -1227,7 +1227,7 @@ static bool xgbe_phy_sfp_verify_eeprom(u8 cc_in, u8 *buf, unsigned int len) for (cc = 0; len; buf++, len--) cc += *buf; - return (cc == cc_in) ? true : false; + return cc == cc_in; } static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data *pdata) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index e284b6753725..6aee2f0fc0db 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -2020,7 +2020,7 @@ static int xgene_enet_probe(struct platform_device *pdev) int ret; ndev = alloc_etherdev_mqs(sizeof(struct xgene_enet_pdata), - XGENE_NUM_RX_RING, XGENE_NUM_TX_RING); + XGENE_NUM_TX_RING, XGENE_NUM_RX_RING); if (!ndev) return -ENOMEM; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index a17a4da7bc15..c85e3e29012c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -403,6 +403,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; + aq_nic_set_loopback(self); + err = self->aq_hw_ops->hw_start(self->aq_hw); if (err < 0) goto err_exit; @@ -413,8 +415,6 @@ int aq_nic_start(struct aq_nic_s *self) INIT_WORK(&self->service_task, aq_nic_service_task); - aq_nic_set_loopback(self); - timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0); aq_nic_service_timer_cb(&self->service_timer); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 58e891af6e09..ec041f78d063 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -1525,9 +1525,6 @@ const struct aq_hw_ops hw_atl_ops_b0 = { .rx_extract_ts = hw_atl_b0_rx_extract_ts, .extract_hwts = hw_atl_b0_extract_hwts, .hw_set_offload = hw_atl_b0_hw_offload_set, - .hw_get_hw_stats = hw_atl_utils_get_hw_stats, - .hw_get_fw_version = hw_atl_utils_get_fw_version, - .hw_set_offload = hw_atl_b0_hw_offload_set, .hw_set_loopback = hw_atl_b0_set_loopback, .hw_set_fc = hw_atl_b0_set_fc, }; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 8910b62e67ed..f547baa6c954 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -667,9 +667,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self) u32 speed; mpi_state = hw_atl_utils_mpi_get_state(self); - speed = mpi_state & (FW2X_RATE_100M | FW2X_RATE_1G | - FW2X_RATE_2G5 | FW2X_RATE_5G | - FW2X_RATE_10G); + speed = mpi_state >> HW_ATL_MPI_SPEED_SHIFT; if (!speed) { link_status->mbps = 0U; diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c index 6f2c867785fe..17bda4e8cc45 100644 --- a/drivers/net/ethernet/arc/emac_main.c +++ b/drivers/net/ethernet/arc/emac_main.c @@ -781,18 +781,6 @@ static int arc_emac_set_address(struct net_device *ndev, void *p) return 0; } -static int arc_emac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - - if (!dev->phydev) - return -ENODEV; - - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - - /** * arc_emac_restart - Restart EMAC * @ndev: Pointer to net_device structure. @@ -857,7 +845,7 @@ static const struct net_device_ops arc_emac_netdev_ops = { .ndo_set_mac_address = arc_emac_set_address, .ndo_get_stats = arc_emac_stats, .ndo_set_rx_mode = arc_emac_set_rx_mode, - .ndo_do_ioctl = arc_emac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = arc_emac_poll_controller, #endif diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 88e4e1500a8e..5ce2df482d8c 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -1394,14 +1394,6 @@ err_drop: return NETDEV_TX_OK; } -static int ag71xx_do_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) -{ - if (!ndev->phydev) - return -EINVAL; - - return phy_mii_ioctl(ndev->phydev, ifr, cmd); -} - static void ag71xx_oom_timer_handler(struct timer_list *t) { struct ag71xx *ag = from_timer(ag, t, oom_timer); @@ -1618,7 +1610,7 @@ static const struct net_device_ops ag71xx_netdev_ops = { .ndo_open = ag71xx_open, .ndo_stop = ag71xx_stop, .ndo_start_xmit = ag71xx_hard_start_xmit, - .ndo_do_ioctl = ag71xx_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_tx_timeout = ag71xx_tx_timeout, .ndo_change_mtu = ag71xx_change_mtu, .ndo_set_mac_address = eth_mac_addr, diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index 30b455013bf3..bc273e0db7ff 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -1005,18 +1005,13 @@ static int nb8800_stop(struct net_device *dev) return 0; } -static int nb8800_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - static const struct net_device_ops nb8800_netdev_ops = { .ndo_open = nb8800_open, .ndo_stop = nb8800_stop, .ndo_start_xmit = nb8800_xmit, .ndo_set_mac_address = nb8800_set_mac_address, .ndo_set_rx_mode = nb8800_set_rx_mode, - .ndo_do_ioctl = nb8800_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 5b3464c3e8d1..a780b7215021 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -1516,8 +1516,10 @@ static int b44_magic_pattern(u8 *macaddr, u8 *ppattern, u8 *pmask, int offset) int ethaddr_bytes = ETH_ALEN; memset(ppattern + offset, 0xff, magicsync); - for (j = 0; j < magicsync; j++) - set_bit(len++, (unsigned long *) pmask); + for (j = 0; j < magicsync; j++) { + pmask[len >> 3] |= BIT(len & 7); + len++; + } for (j = 0; j < B44_MAX_PATTERNS; j++) { if ((B44_PATTERN_SIZE - len) >= ETH_ALEN) @@ -1529,7 +1531,8 @@ static int b44_magic_pattern(u8 *macaddr, u8 *ppattern, u8 *pmask, int offset) for (k = 0; k< ethaddr_bytes; k++) { ppattern[offset + magicsync + (j * ETH_ALEN) + k] = macaddr[k]; - set_bit(len++, (unsigned long *) pmask); + pmask[len >> 3] |= BIT(len & 7); + len++; } } return len - 1; diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 1907e47fd0af..f07ac0e0af59 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2323,7 +2323,7 @@ static int bcm_sysport_map_queues(struct notifier_block *nb, ring->switch_queue = qp; ring->switch_port = port; ring->inspect = true; - priv->ring_map[q + port * num_tx_queues] = ring; + priv->ring_map[qp + port * num_tx_queues] = ring; qp++; } @@ -2338,7 +2338,7 @@ static int bcm_sysport_unmap_queues(struct notifier_block *nb, struct net_device *slave_dev; unsigned int num_tx_queues; struct net_device *dev; - unsigned int q, port; + unsigned int q, qp, port; priv = container_of(nb, struct bcm_sysport_priv, dsa_notifier); if (priv->netdev != info->master) @@ -2364,7 +2364,8 @@ static int bcm_sysport_unmap_queues(struct notifier_block *nb, continue; ring->inspect = false; - priv->ring_map[q + port * num_tx_queues] = NULL; + qp = ring->switch_queue; + priv->ring_map[qp + port * num_tx_queues] = NULL; } return 0; diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 148734b166f0..1bb07a5d82c9 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1248,14 +1248,6 @@ static int bgmac_set_mac_address(struct net_device *net_dev, void *addr) return 0; } -static int bgmac_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd) -{ - if (!netif_running(net_dev)) - return -EINVAL; - - return phy_mii_ioctl(net_dev->phydev, ifr, cmd); -} - static const struct net_device_ops bgmac_netdev_ops = { .ndo_open = bgmac_open, .ndo_stop = bgmac_stop, @@ -1263,7 +1255,7 @@ static const struct net_device_ops bgmac_netdev_ops = { .ndo_set_rx_mode = bgmac_set_rx_mode, .ndo_set_mac_address = bgmac_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = bgmac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, }; /************************************************** diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7b0fe19d9ecb..198c69dceeef 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -944,6 +944,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp, dma_addr -= bp->rx_dma_offset; dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir, DMA_ATTR_WEAK_ORDERING); + page_pool_release_page(rxr->page_pool, page); if (unlikely(!payload)) payload = eth_get_headlen(bp->dev, data_ptr, len); @@ -10822,6 +10823,7 @@ static void bnxt_fw_reset_task(struct work_struct *work) smp_mb__before_atomic(); clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); bnxt_ulp_start(bp, rc); + bnxt_dl_health_recovery_done(bp); bnxt_dl_health_status_update(bp, true); rtnl_unlock(); break; @@ -11065,11 +11067,23 @@ static bool bnxt_fltr_match(struct bnxt_ntuple_filter *f1, struct flow_keys *keys1 = &f1->fkeys; struct flow_keys *keys2 = &f2->fkeys; - if (keys1->addrs.v4addrs.src == keys2->addrs.v4addrs.src && - keys1->addrs.v4addrs.dst == keys2->addrs.v4addrs.dst && - keys1->ports.ports == keys2->ports.ports && - keys1->basic.ip_proto == keys2->basic.ip_proto && - keys1->basic.n_proto == keys2->basic.n_proto && + if (keys1->basic.n_proto != keys2->basic.n_proto || + keys1->basic.ip_proto != keys2->basic.ip_proto) + return false; + + if (keys1->basic.n_proto == htons(ETH_P_IP)) { + if (keys1->addrs.v4addrs.src != keys2->addrs.v4addrs.src || + keys1->addrs.v4addrs.dst != keys2->addrs.v4addrs.dst) + return false; + } else { + if (memcmp(&keys1->addrs.v6addrs.src, &keys2->addrs.v6addrs.src, + sizeof(keys1->addrs.v6addrs.src)) || + memcmp(&keys1->addrs.v6addrs.dst, &keys2->addrs.v6addrs.dst, + sizeof(keys1->addrs.v6addrs.dst))) + return false; + } + + if (keys1->ports.ports == keys2->ports.ports && keys1->control.flags == keys2->control.flags && ether_addr_equal(f1->src_mac_addr, f2->src_mac_addr) && ether_addr_equal(f1->dst_mac_addr, f2->dst_mac_addr)) @@ -11361,7 +11375,7 @@ int bnxt_get_port_parent_id(struct net_device *dev, return -EOPNOTSUPP; /* The PF and it's VF-reps only support the switchdev framework */ - if (!BNXT_PF(bp)) + if (!BNXT_PF(bp) || !(bp->flags & BNXT_FLAG_DSN_VALID)) return -EOPNOTSUPP; ppid->id_len = sizeof(bp->switch_id); @@ -11734,6 +11748,7 @@ static int bnxt_pcie_dsn_get(struct bnxt *bp, u8 dsn[]) put_unaligned_le32(dw, &dsn[0]); pci_read_config_dword(pdev, pos + 4, &dw); put_unaligned_le32(dw, &dsn[4]); + bp->flags |= BNXT_FLAG_DSN_VALID; return 0; } @@ -11845,9 +11860,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (BNXT_PF(bp)) { /* Read the adapter's DSN to use as the eswitch switch_id */ - rc = bnxt_pcie_dsn_get(bp, bp->switch_id); - if (rc) - goto init_err_pci_clean; + bnxt_pcie_dsn_get(bp, bp->switch_id); } /* MTU range: 60 - FW defined max */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 505af5cfb1bd..f14335433a64 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1532,6 +1532,7 @@ struct bnxt { #define BNXT_FLAG_NO_AGG_RINGS 0x20000 #define BNXT_FLAG_RX_PAGE_MODE 0x40000 #define BNXT_FLAG_MULTI_HOST 0x100000 + #define BNXT_FLAG_DSN_VALID 0x200000 #define BNXT_FLAG_DOUBLE_DB 0x400000 #define BNXT_FLAG_CHIP_NITRO_A0 0x1000000 #define BNXT_FLAG_DIM 0x2000000 @@ -1936,9 +1937,6 @@ static inline bool bnxt_cfa_hwrm_message(u16 req_type) case HWRM_CFA_ENCAP_RECORD_FREE: case HWRM_CFA_DECAP_FILTER_ALLOC: case HWRM_CFA_DECAP_FILTER_FREE: - case HWRM_CFA_NTUPLE_FILTER_ALLOC: - case HWRM_CFA_NTUPLE_FILTER_FREE: - case HWRM_CFA_NTUPLE_FILTER_CFG: case HWRM_CFA_EM_FLOW_ALLOC: case HWRM_CFA_EM_FLOW_FREE: case HWRM_CFA_EM_FLOW_CFG: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 3eedd4477218..0c3d224637b9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -89,7 +89,7 @@ static int bnxt_fw_reset_recover(struct devlink_health_reporter *reporter, return -EOPNOTSUPP; bnxt_fw_reset(bp); - return 0; + return -EINPROGRESS; } static const @@ -116,7 +116,7 @@ static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter, else if (event == BNXT_FW_EXCEPTION_SP_EVENT) bnxt_fw_exception(bp); - return 0; + return -EINPROGRESS; } static const @@ -262,6 +262,16 @@ void bnxt_dl_health_status_update(struct bnxt *bp, bool healthy) health->fatal = false; } +void bnxt_dl_health_recovery_done(struct bnxt *bp) +{ + struct bnxt_fw_health *hlth = bp->fw_health; + + if (hlth->fatal) + devlink_health_reporter_recovery_done(hlth->fw_fatal_reporter); + else + devlink_health_reporter_recovery_done(hlth->fw_reset_reporter); +} + static const struct devlink_ops bnxt_dl_ops = { #ifdef CONFIG_BNXT_SRIOV .eswitch_mode_set = bnxt_dl_eswitch_mode_set, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index 6db6c3dac472..08aaa4441c78 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -58,6 +58,7 @@ struct bnxt_dl_nvm_param { void bnxt_devlink_health_report(struct bnxt *bp, unsigned long event); void bnxt_dl_health_status_update(struct bnxt *bp, bool healthy); +void bnxt_dl_health_recovery_done(struct bnxt *bp); void bnxt_dl_fw_reporters_create(struct bnxt *bp); void bnxt_dl_fw_reporters_destroy(struct bnxt *bp, bool all); int bnxt_dl_register(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index f9bf7d7250ab..b010b34cdaf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -398,6 +398,9 @@ static int bnxt_vf_reps_create(struct bnxt *bp) struct net_device *dev; int rc, i; + if (!(bp->flags & BNXT_FLAG_DSN_VALID)) + return -ENODEV; + bp->vf_reps = kcalloc(num_vfs, sizeof(vf_rep), GFP_KERNEL); if (!bp->vf_reps) return -ENOMEM; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 3ee7917e3fc0..e50a15397e11 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -1225,18 +1225,6 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv, } } -/* ioctl handle special commands that are not present in ethtool. */ -static int bcmgenet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - - if (!dev->phydev) - return -ENODEV; - - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - static struct enet_cb *bcmgenet_get_txcb(struct bcmgenet_priv *priv, struct bcmgenet_tx_ring *ring) { @@ -2169,8 +2157,8 @@ static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv, DMA_END_ADDR); /* Initialize Tx NAPI */ - netif_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, - NAPI_POLL_WEIGHT); + netif_tx_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, + NAPI_POLL_WEIGHT); } /* Initialize a RDMA ring */ @@ -3222,7 +3210,7 @@ static const struct net_device_ops bcmgenet_netdev_ops = { .ndo_tx_timeout = bcmgenet_timeout, .ndo_set_rx_mode = bcmgenet_set_rx_mode, .ndo_set_mac_address = bcmgenet_set_mac_addr, - .ndo_do_ioctl = bcmgenet_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_features = bcmgenet_set_features, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = bcmgenet_poll_controller, diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 460b4992914a..88466255bf66 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7874,8 +7874,8 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *, struct net_device *); static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, struct netdev_queue *txq, struct sk_buff *skb) { - struct sk_buff *segs, *nskb; u32 frag_cnt_est = skb_shinfo(skb)->gso_segs * 3; + struct sk_buff *segs, *seg, *next; /* Estimate the number of fragments in the worst case */ if (unlikely(tg3_tx_avail(tnapi) <= frag_cnt_est)) { @@ -7898,12 +7898,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, if (IS_ERR(segs) || !segs) goto tg3_tso_bug_end; - do { - nskb = segs; - segs = segs->next; - nskb->next = NULL; - tg3_start_xmit(nskb, tp->dev); - } while (segs); + skb_list_walk_safe(segs, seg, next) { + skb_mark_not_on_list(seg); + tg3_start_xmit(seg, tp->dev); + } tg3_tso_bug_end: dev_consume_skb_any(skb); diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 19fe4f4867c7..dbf7070fcdba 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -630,10 +630,17 @@ #define GEM_CLK_DIV96 5 /* Constants for MAN register */ -#define MACB_MAN_SOF 1 -#define MACB_MAN_WRITE 1 -#define MACB_MAN_READ 2 -#define MACB_MAN_CODE 2 +#define MACB_MAN_C22_SOF 1 +#define MACB_MAN_C22_WRITE 1 +#define MACB_MAN_C22_READ 2 +#define MACB_MAN_C22_CODE 2 + +#define MACB_MAN_C45_SOF 0 +#define MACB_MAN_C45_ADDR 0 +#define MACB_MAN_C45_WRITE 1 +#define MACB_MAN_C45_POST_READ_INCR 2 +#define MACB_MAN_C45_READ 3 +#define MACB_MAN_C45_CODE 2 /* Capability mask bits */ #define MACB_CAPS_ISR_CLEAR_ON_WRITE 0x00000001 diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index c5ee363ca5dc..7a2fe63d1136 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -337,11 +337,30 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum) if (status < 0) goto mdio_read_exit; - macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_SOF) - | MACB_BF(RW, MACB_MAN_READ) - | MACB_BF(PHYA, mii_id) - | MACB_BF(REGA, regnum) - | MACB_BF(CODE, MACB_MAN_CODE))); + if (regnum & MII_ADDR_C45) { + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C45_SOF) + | MACB_BF(RW, MACB_MAN_C45_ADDR) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, (regnum >> 16) & 0x1F) + | MACB_BF(DATA, regnum & 0xFFFF) + | MACB_BF(CODE, MACB_MAN_C45_CODE))); + + status = macb_mdio_wait_for_idle(bp); + if (status < 0) + goto mdio_read_exit; + + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C45_SOF) + | MACB_BF(RW, MACB_MAN_C45_READ) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, (regnum >> 16) & 0x1F) + | MACB_BF(CODE, MACB_MAN_C45_CODE))); + } else { + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C22_SOF) + | MACB_BF(RW, MACB_MAN_C22_READ) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, regnum) + | MACB_BF(CODE, MACB_MAN_C22_CODE))); + } status = macb_mdio_wait_for_idle(bp); if (status < 0) @@ -370,12 +389,32 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum, if (status < 0) goto mdio_write_exit; - macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_SOF) - | MACB_BF(RW, MACB_MAN_WRITE) - | MACB_BF(PHYA, mii_id) - | MACB_BF(REGA, regnum) - | MACB_BF(CODE, MACB_MAN_CODE) - | MACB_BF(DATA, value))); + if (regnum & MII_ADDR_C45) { + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C45_SOF) + | MACB_BF(RW, MACB_MAN_C45_ADDR) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, (regnum >> 16) & 0x1F) + | MACB_BF(DATA, regnum & 0xFFFF) + | MACB_BF(CODE, MACB_MAN_C45_CODE))); + + status = macb_mdio_wait_for_idle(bp); + if (status < 0) + goto mdio_write_exit; + + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C45_SOF) + | MACB_BF(RW, MACB_MAN_C45_WRITE) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, (regnum >> 16) & 0x1F) + | MACB_BF(CODE, MACB_MAN_C45_CODE) + | MACB_BF(DATA, value))); + } else { + macb_writel(bp, MAN, (MACB_BF(SOF, MACB_MAN_C22_SOF) + | MACB_BF(RW, MACB_MAN_C22_WRITE) + | MACB_BF(PHYA, mii_id) + | MACB_BF(REGA, regnum) + | MACB_BF(CODE, MACB_MAN_C22_CODE) + | MACB_BF(DATA, value))); + } status = macb_mdio_wait_for_idle(bp); if (status < 0) @@ -611,21 +650,24 @@ static const struct phylink_mac_ops macb_phylink_ops = { .mac_link_up = macb_mac_link_up, }; +static bool macb_phy_handle_exists(struct device_node *dn) +{ + dn = of_parse_phandle(dn, "phy-handle", 0); + of_node_put(dn); + return dn != NULL; +} + static int macb_phylink_connect(struct macb *bp) { + struct device_node *dn = bp->pdev->dev.of_node; struct net_device *dev = bp->dev; struct phy_device *phydev; int ret; - if (bp->pdev->dev.of_node && - of_parse_phandle(bp->pdev->dev.of_node, "phy-handle", 0)) { - ret = phylink_of_phy_connect(bp->phylink, bp->pdev->dev.of_node, - 0); - if (ret) { - netdev_err(dev, "Could not attach PHY (%d)\n", ret); - return ret; - } - } else { + if (dn) + ret = phylink_of_phy_connect(bp->phylink, dn, 0); + + if (!dn || (ret && !macb_phy_handle_exists(dn))) { phydev = phy_find_first(bp->mii_bus); if (!phydev) { netdev_err(dev, "no PHY found\n"); @@ -634,10 +676,11 @@ static int macb_phylink_connect(struct macb *bp) /* attach the mac to the phy */ ret = phylink_connect_phy(bp->phylink, phydev); - if (ret) { - netdev_err(dev, "Could not attach to PHY (%d)\n", ret); - return ret; - } + } + + if (ret) { + netdev_err(dev, "Could not attach PHY (%d)\n", ret); + return ret; } phylink_start(bp->phylink); @@ -4088,7 +4131,7 @@ static int fu540_c000_clk_init(struct platform_device *pdev, struct clk **pclk, mgmt->rate = 0; mgmt->hw.init = &init; - *tx_clk = clk_register(NULL, &mgmt->hw); + *tx_clk = devm_clk_register(&pdev->dev, &mgmt->hw); if (IS_ERR(*tx_clk)) return PTR_ERR(*tx_clk); @@ -4416,7 +4459,6 @@ err_out_free_netdev: err_disable_clocks: clk_disable_unprepare(tx_clk); - clk_unregister(tx_clk); clk_disable_unprepare(hclk); clk_disable_unprepare(pclk); clk_disable_unprepare(rx_clk); @@ -4446,7 +4488,6 @@ static int macb_remove(struct platform_device *pdev) pm_runtime_dont_use_autosuspend(&pdev->dev); if (!pm_runtime_suspended(&pdev->dev)) { clk_disable_unprepare(bp->tx_clk); - clk_unregister(bp->tx_clk); clk_disable_unprepare(bp->hclk); clk_disable_unprepare(bp->pclk); clk_disable_unprepare(bp->rx_clk); diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c index cdd7e5da4a74..e9575887a4f8 100644 --- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c @@ -790,9 +790,7 @@ static int octeon_mgmt_ioctl(struct net_device *netdev, case SIOCSHWTSTAMP: return octeon_mgmt_ioctl_hwtstamp(netdev, rq, cmd); default: - if (netdev->phydev) - return phy_mii_ioctl(netdev->phydev, rq, cmd); - return -EINVAL; + return phy_do_ioctl(netdev, rq, cmd); } } diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 58f89f6a040f..97ff8608f0ab 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2448,6 +2448,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) if (!is_offload(adapter)) return -EOPNOTSUPP; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; if (!(adapter->flags & FULL_INIT_DONE)) return -EIO; /* need the memory controllers */ if (copy_from_user(&t, useraddr, sizeof(t))) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index ee3aab563b1d..9d1f2f88b945 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -70,8 +70,7 @@ static void *seq_tab_start(struct seq_file *seq, loff_t *pos) static void *seq_tab_next(struct seq_file *seq, void *v, loff_t *pos) { v = seq_tab_get_idx(seq->private, *pos + 1); - if (v) - ++*pos; + ++(*pos); return v; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 1930e39f195e..649842a8aa28 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3163,9 +3163,9 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) { struct port_info *pi = netdev_priv(dev); struct adapter *adap = pi->adapter; + struct ch_sched_queue qe = { 0 }; + struct ch_sched_params p = { 0 }; struct sched_class *e; - struct ch_sched_params p; - struct ch_sched_queue qe; u32 req_rate; int err = 0; @@ -3182,6 +3182,15 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) return -EINVAL; } + qe.queue = index; + e = cxgb4_sched_queue_lookup(dev, &qe); + if (e && e->info.u.params.level != SCHED_CLASS_LEVEL_CL_RL) { + dev_err(adap->pdev_dev, + "Queue %u already bound to class %u of type: %u\n", + index, e->idx, e->info.u.params.level); + return -EBUSY; + } + /* Convert from Mbps to Kbps */ req_rate = rate * 1000; @@ -3211,7 +3220,6 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) return 0; /* Fetch any available unused or matching scheduling class */ - memset(&p, 0, sizeof(p)); p.type = SCHED_CLASS_TYPE_PACKET; p.u.params.level = SCHED_CLASS_LEVEL_CL_RL; p.u.params.mode = SCHED_CLASS_MODE_CLASS; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c index 24c3c2dc7171..1b7681a4eb32 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_matchall.c @@ -15,6 +15,8 @@ static int cxgb4_matchall_egress_validate(struct net_device *dev, struct flow_action *actions = &cls->rule->action; struct port_info *pi = netdev2pinfo(dev); struct flow_action_entry *entry; + struct ch_sched_queue qe; + struct sched_class *e; u64 max_link_rate; u32 i, speed; int ret; @@ -60,9 +62,61 @@ static int cxgb4_matchall_egress_validate(struct net_device *dev, } } + for (i = 0; i < pi->nqsets; i++) { + memset(&qe, 0, sizeof(qe)); + qe.queue = i; + + e = cxgb4_sched_queue_lookup(dev, &qe); + if (e && e->info.u.params.level != SCHED_CLASS_LEVEL_CH_RL) { + NL_SET_ERR_MSG_MOD(extack, + "Some queues are already bound to different class"); + return -EBUSY; + } + } + return 0; } +static int cxgb4_matchall_tc_bind_queues(struct net_device *dev, u32 tc) +{ + struct port_info *pi = netdev2pinfo(dev); + struct ch_sched_queue qe; + int ret; + u32 i; + + for (i = 0; i < pi->nqsets; i++) { + qe.queue = i; + qe.class = tc; + ret = cxgb4_sched_class_bind(dev, &qe, SCHED_QUEUE); + if (ret) + goto out_free; + } + + return 0; + +out_free: + while (i--) { + qe.queue = i; + qe.class = SCHED_CLS_NONE; + cxgb4_sched_class_unbind(dev, &qe, SCHED_QUEUE); + } + + return ret; +} + +static void cxgb4_matchall_tc_unbind_queues(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct ch_sched_queue qe; + u32 i; + + for (i = 0; i < pi->nqsets; i++) { + qe.queue = i; + qe.class = SCHED_CLS_NONE; + cxgb4_sched_class_unbind(dev, &qe, SCHED_QUEUE); + } +} + static int cxgb4_matchall_alloc_tc(struct net_device *dev, struct tc_cls_matchall_offload *cls) { @@ -83,6 +137,7 @@ static int cxgb4_matchall_alloc_tc(struct net_device *dev, struct adapter *adap = netdev2adap(dev); struct flow_action_entry *entry; struct sched_class *e; + int ret; u32 i; tc_port_matchall = &adap->tc_matchall->port_matchall[pi->port_id]; @@ -101,10 +156,21 @@ static int cxgb4_matchall_alloc_tc(struct net_device *dev, return -ENOMEM; } + ret = cxgb4_matchall_tc_bind_queues(dev, e->idx); + if (ret) { + NL_SET_ERR_MSG_MOD(extack, + "Could not bind queues to traffic class"); + goto out_free; + } + tc_port_matchall->egress.hwtc = e->idx; tc_port_matchall->egress.cookie = cls->cookie; tc_port_matchall->egress.state = CXGB4_MATCHALL_STATE_ENABLED; return 0; + +out_free: + cxgb4_sched_class_free(dev, e->idx); + return ret; } static void cxgb4_matchall_free_tc(struct net_device *dev) @@ -114,6 +180,7 @@ static void cxgb4_matchall_free_tc(struct net_device *dev) struct adapter *adap = netdev2adap(dev); tc_port_matchall = &adap->tc_matchall->port_matchall[pi->port_id]; + cxgb4_matchall_tc_unbind_queues(dev); cxgb4_sched_class_free(dev, tc_port_matchall->egress.hwtc); tc_port_matchall->egress.hwtc = SCHED_CLS_NONE; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c index 8971dddcdb7a..ec3eb45ee3b4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c @@ -12,8 +12,9 @@ static int cxgb4_mqprio_validate(struct net_device *dev, struct port_info *pi = netdev2pinfo(dev); struct adapter *adap = netdev2adap(dev); u32 speed, qcount = 0, qoffset = 0; + u32 start_a, start_b, end_a, end_b; int ret; - u8 i; + u8 i, j; if (!mqprio->qopt.num_tc) return 0; @@ -47,6 +48,31 @@ static int cxgb4_mqprio_validate(struct net_device *dev, qoffset = max_t(u16, mqprio->qopt.offset[i], qoffset); qcount += mqprio->qopt.count[i]; + start_a = mqprio->qopt.offset[i]; + end_a = start_a + mqprio->qopt.count[i] - 1; + for (j = i + 1; j < mqprio->qopt.num_tc; j++) { + start_b = mqprio->qopt.offset[j]; + end_b = start_b + mqprio->qopt.count[j] - 1; + + /* If queue count is 0, then the traffic + * belonging to this class will not use + * ETHOFLD queues. So, no need to validate + * further. + */ + if (!mqprio->qopt.count[i]) + break; + + if (!mqprio->qopt.count[j]) + continue; + + if (max_t(u32, start_a, start_b) <= + min_t(u32, end_a, end_b)) { + netdev_err(dev, + "Queues can't overlap across tc\n"); + return -EINVAL; + } + } + /* Convert byte per second to bits per second */ min_rate += (mqprio->min_rate[i] * 8); max_rate += (mqprio->max_rate[i] * 8); diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index e9e45006632d..1a16449e9deb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -678,8 +678,7 @@ static void *l2t_seq_start(struct seq_file *seq, loff_t *pos) static void *l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos) { v = l2t_get_idx(seq, *pos); - if (v) - ++*pos; + ++(*pos); return v; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c b/drivers/net/ethernet/chelsio/cxgb4/sched.c index 3e61bd5d0c29..cebe1412d960 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sched.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c @@ -165,6 +165,22 @@ static void *t4_sched_entry_lookup(struct port_info *pi, return found; } +struct sched_class *cxgb4_sched_queue_lookup(struct net_device *dev, + struct ch_sched_queue *p) +{ + struct port_info *pi = netdev2pinfo(dev); + struct sched_queue_entry *qe = NULL; + struct adapter *adap = pi->adapter; + struct sge_eth_txq *txq; + + if (p->queue < 0 || p->queue >= pi->nqsets) + return NULL; + + txq = &adap->sge.ethtxq[pi->first_qset + p->queue]; + qe = t4_sched_entry_lookup(pi, SCHED_QUEUE, txq->q.cntxt_id); + return qe ? &pi->sched_tbl->tab[qe->param.class] : NULL; +} + static int t4_sched_queue_unbind(struct port_info *pi, struct ch_sched_queue *p) { struct sched_queue_entry *qe = NULL; diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.h b/drivers/net/ethernet/chelsio/cxgb4/sched.h index e92ff68bdd0a..5cc74a5a1774 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sched.h +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.h @@ -103,6 +103,8 @@ static inline bool valid_class_id(struct net_device *dev, u8 class_id) return true; } +struct sched_class *cxgb4_sched_queue_lookup(struct net_device *dev, + struct ch_sched_queue *p); int cxgb4_sched_class_bind(struct net_device *dev, void *arg, enum sched_bind_type type); int cxgb4_sched_class_unbind(struct net_device *dev, void *arg, diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c index e24979010969..5f8fa1145db6 100644 --- a/drivers/net/ethernet/dnet.c +++ b/drivers/net/ethernet/dnet.c @@ -725,19 +725,6 @@ static struct net_device_stats *dnet_get_stats(struct net_device *dev) return nstat; } -static int dnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct phy_device *phydev = dev->phydev; - - if (!netif_running(dev)) - return -EINVAL; - - if (!phydev) - return -ENODEV; - - return phy_mii_ioctl(phydev, rq, cmd); -} - static void dnet_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { @@ -759,7 +746,7 @@ static const struct net_device_ops dnet_netdev_ops = { .ndo_stop = dnet_close, .ndo_get_stats = dnet_get_stats, .ndo_start_xmit = dnet_start_xmit, - .ndo_do_ioctl = dnet_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 48b3b72fe02e..4572797f00d7 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1536,15 +1536,6 @@ static int ftgmac100_stop(struct net_device *netdev) return 0; } -/* optional */ -static int ftgmac100_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) -{ - if (!netdev->phydev) - return -ENXIO; - - return phy_mii_ioctl(netdev->phydev, ifr, cmd); -} - static void ftgmac100_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ftgmac100 *priv = netdev_priv(netdev); @@ -1597,7 +1588,7 @@ static const struct net_device_ops ftgmac100_netdev_ops = { .ndo_start_xmit = ftgmac100_hard_start_xmit, .ndo_set_mac_address = ftgmac100_set_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = ftgmac100_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_tx_timeout = ftgmac100_tx_timeout, .ndo_set_rx_mode = ftgmac100_set_rx_mode, .ndo_set_features = ftgmac100_set_features, diff --git a/drivers/net/ethernet/freescale/Makefile b/drivers/net/ethernet/freescale/Makefile index 6a93293d31e0..67c436400352 100644 --- a/drivers/net/ethernet/freescale/Makefile +++ b/drivers/net/ethernet/freescale/Makefile @@ -25,4 +25,5 @@ obj-$(CONFIG_FSL_DPAA_ETH) += dpaa/ obj-$(CONFIG_FSL_DPAA2_ETH) += dpaa2/ obj-$(CONFIG_FSL_ENETC) += enetc/ +obj-$(CONFIG_FSL_ENETC_MDIO) += enetc/ obj-$(CONFIG_FSL_ENETC_VF) += enetc/ diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index edad4ca46327..fe942de19597 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -2,6 +2,7 @@ config FSL_ENETC tristate "ENETC PF driver" depends on PCI && PCI_MSI && (ARCH_LAYERSCAPE || COMPILE_TEST) + select FSL_ENETC_MDIO select PHYLIB help This driver supports NXP ENETC gigabit ethernet controller PCIe diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile index d0db33e5b6b7..74f7ac253b8b 100644 --- a/drivers/net/ethernet/freescale/enetc/Makefile +++ b/drivers/net/ethernet/freescale/enetc/Makefile @@ -3,7 +3,7 @@ common-objs := enetc.o enetc_cbdr.o enetc_ethtool.o obj-$(CONFIG_FSL_ENETC) += fsl-enetc.o -fsl-enetc-y := enetc_pf.o enetc_mdio.o $(common-objs) +fsl-enetc-y := enetc_pf.o $(common-objs) fsl-enetc-$(CONFIG_PCI_IOV) += enetc_msg.o fsl-enetc-$(CONFIG_FSL_ENETC_QOS) += enetc_qos.o diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 8375cd886dba..62554f28ce07 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -200,6 +200,7 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PFPMR 0x1900 #define ENETC_PFPMR_PMACE BIT(1) #define ENETC_PFPMR_MWLM BIT(0) +#define ENETC_EMDIO_BASE 0x1c00 #define ENETC_PSIUMHFR0(n, err) (((err) ? 0x1d08 : 0x1d00) + (n) * 0x10) #define ENETC_PSIUMHFR1(n) (0x1d04 + (n) * 0x10) #define ENETC_PSIMMHFR0(n, err) (((err) ? 0x1d00 : 0x1d08) + (n) * 0x10) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c index 149883c8f0b8..48c32a171afa 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c @@ -1,41 +1,56 @@ // SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* Copyright 2019 NXP */ +#include <linux/fsl/enetc_mdio.h> #include <linux/mdio.h> #include <linux/of_mdio.h> #include <linux/iopoll.h> #include <linux/of.h> -#include "enetc_mdio.h" +#include "enetc_pf.h" -#define ENETC_MDIO_REG_OFFSET 0x1c00 #define ENETC_MDIO_CFG 0x0 /* MDIO configuration and status */ #define ENETC_MDIO_CTL 0x4 /* MDIO control */ #define ENETC_MDIO_DATA 0x8 /* MDIO data */ #define ENETC_MDIO_ADDR 0xc /* MDIO address */ -#define enetc_mdio_rd(hw, off) \ - enetc_port_rd(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET) -#define enetc_mdio_wr(hw, off, val) \ - enetc_port_wr(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET, val) -#define enetc_mdio_rd_reg(off) enetc_mdio_rd(hw, off) +static inline u32 _enetc_mdio_rd(struct enetc_mdio_priv *mdio_priv, int off) +{ + return enetc_port_rd(mdio_priv->hw, mdio_priv->mdio_base + off); +} + +static inline void _enetc_mdio_wr(struct enetc_mdio_priv *mdio_priv, int off, + u32 val) +{ + enetc_port_wr(mdio_priv->hw, mdio_priv->mdio_base + off, val); +} -#define ENETC_MDC_DIV 258 +#define enetc_mdio_rd(mdio_priv, off) \ + _enetc_mdio_rd(mdio_priv, ENETC_##off) +#define enetc_mdio_wr(mdio_priv, off, val) \ + _enetc_mdio_wr(mdio_priv, ENETC_##off, val) +#define enetc_mdio_rd_reg(off) enetc_mdio_rd(mdio_priv, off) #define MDIO_CFG_CLKDIV(x) ((((x) >> 1) & 0xff) << 8) #define MDIO_CFG_BSY BIT(0) #define MDIO_CFG_RD_ER BIT(1) +#define MDIO_CFG_HOLD(x) (((x) << 2) & GENMASK(4, 2)) #define MDIO_CFG_ENC45 BIT(6) /* external MDIO only - driven on neg MDC edge */ #define MDIO_CFG_NEG BIT(23) +#define ENETC_EMDIO_CFG \ + (MDIO_CFG_HOLD(2) | \ + MDIO_CFG_CLKDIV(258) | \ + MDIO_CFG_NEG) + #define MDIO_CTL_DEV_ADDR(x) ((x) & 0x1f) #define MDIO_CTL_PORT_ADDR(x) (((x) & 0x1f) << 5) #define MDIO_CTL_READ BIT(15) #define MDIO_DATA(x) ((x) & 0xffff) #define TIMEOUT 1000 -static int enetc_mdio_wait_complete(struct enetc_hw *hw) +static int enetc_mdio_wait_complete(struct enetc_mdio_priv *mdio_priv) { u32 val; @@ -46,12 +61,11 @@ static int enetc_mdio_wait_complete(struct enetc_hw *hw) int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 value) { struct enetc_mdio_priv *mdio_priv = bus->priv; - struct enetc_hw *hw = mdio_priv->hw; u32 mdio_ctl, mdio_cfg; u16 dev_addr; int ret; - mdio_cfg = MDIO_CFG_CLKDIV(ENETC_MDC_DIV) | MDIO_CFG_NEG; + mdio_cfg = ENETC_EMDIO_CFG; if (regnum & MII_ADDR_C45) { dev_addr = (regnum >> 16) & 0x1f; mdio_cfg |= MDIO_CFG_ENC45; @@ -61,44 +75,44 @@ int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 value) mdio_cfg &= ~MDIO_CFG_ENC45; } - enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg); + enetc_mdio_wr(mdio_priv, MDIO_CFG, mdio_cfg); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; /* set port and dev addr */ mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr); - enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl); + enetc_mdio_wr(mdio_priv, MDIO_CTL, mdio_ctl); /* set the register address */ if (regnum & MII_ADDR_C45) { - enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff); + enetc_mdio_wr(mdio_priv, MDIO_ADDR, regnum & 0xffff); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; } /* write the value */ - enetc_mdio_wr(hw, MDIO_DATA, MDIO_DATA(value)); + enetc_mdio_wr(mdio_priv, MDIO_DATA, MDIO_DATA(value)); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; return 0; } +EXPORT_SYMBOL_GPL(enetc_mdio_write); int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum) { struct enetc_mdio_priv *mdio_priv = bus->priv; - struct enetc_hw *hw = mdio_priv->hw; u32 mdio_ctl, mdio_cfg; u16 dev_addr, value; int ret; - mdio_cfg = MDIO_CFG_CLKDIV(ENETC_MDC_DIV) | MDIO_CFG_NEG; + mdio_cfg = ENETC_EMDIO_CFG; if (regnum & MII_ADDR_C45) { dev_addr = (regnum >> 16) & 0x1f; mdio_cfg |= MDIO_CFG_ENC45; @@ -107,86 +121,56 @@ int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum) mdio_cfg &= ~MDIO_CFG_ENC45; } - enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg); + enetc_mdio_wr(mdio_priv, MDIO_CFG, mdio_cfg); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; /* set port and device addr */ mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr); - enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl); + enetc_mdio_wr(mdio_priv, MDIO_CTL, mdio_ctl); /* set the register address */ if (regnum & MII_ADDR_C45) { - enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff); + enetc_mdio_wr(mdio_priv, MDIO_ADDR, regnum & 0xffff); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; } /* initiate the read */ - enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl | MDIO_CTL_READ); + enetc_mdio_wr(mdio_priv, MDIO_CTL, mdio_ctl | MDIO_CTL_READ); - ret = enetc_mdio_wait_complete(hw); + ret = enetc_mdio_wait_complete(mdio_priv); if (ret) return ret; /* return all Fs if nothing was there */ - if (enetc_mdio_rd(hw, MDIO_CFG) & MDIO_CFG_RD_ER) { + if (enetc_mdio_rd(mdio_priv, MDIO_CFG) & MDIO_CFG_RD_ER) { dev_dbg(&bus->dev, "Error while reading PHY%d reg at %d.%hhu\n", phy_id, dev_addr, regnum); return 0xffff; } - value = enetc_mdio_rd(hw, MDIO_DATA) & 0xffff; + value = enetc_mdio_rd(mdio_priv, MDIO_DATA) & 0xffff; return value; } +EXPORT_SYMBOL_GPL(enetc_mdio_read); -int enetc_mdio_probe(struct enetc_pf *pf) +struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) { - struct device *dev = &pf->si->pdev->dev; - struct enetc_mdio_priv *mdio_priv; - struct device_node *np; - struct mii_bus *bus; - int err; - - bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv)); - if (!bus) - return -ENOMEM; - - bus->name = "Freescale ENETC MDIO Bus"; - bus->read = enetc_mdio_read; - bus->write = enetc_mdio_write; - bus->parent = dev; - mdio_priv = bus->priv; - mdio_priv->hw = &pf->si->hw; - snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev)); - - np = of_get_child_by_name(dev->of_node, "mdio"); - if (!np) { - dev_err(dev, "MDIO node missing\n"); - return -EINVAL; - } - - err = of_mdiobus_register(bus, np); - if (err) { - of_node_put(np); - dev_err(dev, "cannot register MDIO bus\n"); - return err; - } + struct enetc_hw *hw; - of_node_put(np); - pf->mdio = bus; + hw = devm_kzalloc(dev, sizeof(*hw), GFP_KERNEL); + if (!hw) + return ERR_PTR(-ENOMEM); - return 0; -} + hw->port = port_regs; -void enetc_mdio_remove(struct enetc_pf *pf) -{ - if (pf->mdio) - mdiobus_unregister(pf->mdio); + return hw; } +EXPORT_SYMBOL_GPL(enetc_hw_alloc); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.h b/drivers/net/ethernet/freescale/enetc/enetc_mdio.h deleted file mode 100644 index 60c9a3889824..000000000000 --- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ -/* Copyright 2019 NXP */ - -#include <linux/phy.h> -#include "enetc_pf.h" - -struct enetc_mdio_priv { - struct enetc_hw *hw; -}; - -int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 value); -int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c index fbd41ce01f06..ebc635f8a4cc 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* Copyright 2019 NXP */ +#include <linux/fsl/enetc_mdio.h> #include <linux/of_mdio.h> -#include "enetc_mdio.h" +#include "enetc_pf.h" #define ENETC_MDIO_DEV_ID 0xee01 #define ENETC_MDIO_DEV_NAME "FSL PCIe IE Central MDIO" @@ -13,17 +14,29 @@ static int enetc_pci_mdio_probe(struct pci_dev *pdev, { struct enetc_mdio_priv *mdio_priv; struct device *dev = &pdev->dev; + void __iomem *port_regs; struct enetc_hw *hw; struct mii_bus *bus; int err; - hw = devm_kzalloc(dev, sizeof(*hw), GFP_KERNEL); - if (!hw) - return -ENOMEM; + port_regs = pci_iomap(pdev, 0, 0); + if (!port_regs) { + dev_err(dev, "iomap failed\n"); + err = -ENXIO; + goto err_ioremap; + } + + hw = enetc_hw_alloc(dev, port_regs); + if (IS_ERR(hw)) { + err = PTR_ERR(hw); + goto err_hw_alloc; + } bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv)); - if (!bus) - return -ENOMEM; + if (!bus) { + err = -ENOMEM; + goto err_mdiobus_alloc; + } bus->name = ENETC_MDIO_BUS_NAME; bus->read = enetc_mdio_read; @@ -31,13 +44,14 @@ static int enetc_pci_mdio_probe(struct pci_dev *pdev, bus->parent = dev; mdio_priv = bus->priv; mdio_priv->hw = hw; + mdio_priv->mdio_base = ENETC_EMDIO_BASE; snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev)); pcie_flr(pdev); err = pci_enable_device_mem(pdev); if (err) { dev_err(dev, "device enable failed\n"); - return err; + goto err_pci_enable; } err = pci_request_region(pdev, 0, KBUILD_MODNAME); @@ -46,13 +60,6 @@ static int enetc_pci_mdio_probe(struct pci_dev *pdev, goto err_pci_mem_reg; } - hw->port = pci_iomap(pdev, 0, 0); - if (!hw->port) { - err = -ENXIO; - dev_err(dev, "iomap failed\n"); - goto err_ioremap; - } - err = of_mdiobus_register(bus, dev->of_node); if (err) goto err_mdiobus_reg; @@ -62,12 +69,14 @@ static int enetc_pci_mdio_probe(struct pci_dev *pdev, return 0; err_mdiobus_reg: - iounmap(mdio_priv->hw->port); -err_ioremap: pci_release_mem_regions(pdev); err_pci_mem_reg: pci_disable_device(pdev); - +err_pci_enable: +err_mdiobus_alloc: + iounmap(port_regs); +err_hw_alloc: +err_ioremap: return err; } diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index e7482d483b28..fc0d7d99e9a1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -2,6 +2,7 @@ /* Copyright 2017-2019 NXP */ #include <linux/module.h> +#include <linux/fsl/enetc_mdio.h> #include <linux/of_mdio.h> #include <linux/of_net.h> #include "enetc_pf.h" @@ -749,6 +750,52 @@ static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, enetc_get_primary_mac_addr(&si->hw, ndev->dev_addr); } +static int enetc_mdio_probe(struct enetc_pf *pf) +{ + struct device *dev = &pf->si->pdev->dev; + struct enetc_mdio_priv *mdio_priv; + struct device_node *np; + struct mii_bus *bus; + int err; + + bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv)); + if (!bus) + return -ENOMEM; + + bus->name = "Freescale ENETC MDIO Bus"; + bus->read = enetc_mdio_read; + bus->write = enetc_mdio_write; + bus->parent = dev; + mdio_priv = bus->priv; + mdio_priv->hw = &pf->si->hw; + mdio_priv->mdio_base = ENETC_EMDIO_BASE; + snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev)); + + np = of_get_child_by_name(dev->of_node, "mdio"); + if (!np) { + dev_err(dev, "MDIO node missing\n"); + return -EINVAL; + } + + err = of_mdiobus_register(bus, np); + if (err) { + of_node_put(np); + dev_err(dev, "cannot register MDIO bus\n"); + return err; + } + + of_node_put(np); + pf->mdio = bus; + + return 0; +} + +static void enetc_mdio_remove(struct enetc_pf *pf) +{ + if (pf->mdio) + mdiobus_unregister(pf->mdio); +} + static int enetc_of_get_phy(struct enetc_ndev_priv *priv) { struct enetc_pf *pf = enetc_si_priv(priv->si); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.h b/drivers/net/ethernet/freescale/enetc/enetc_pf.h index 10dd1b53bb08..59e65a6f6c3e 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.h @@ -49,7 +49,3 @@ struct enetc_pf { int enetc_msg_psi_init(struct enetc_pf *pf); void enetc_msg_psi_free(struct enetc_pf *pf); void enetc_msg_handle_rxmsg(struct enetc_pf *pf, int mbox_id, u16 *status); - -/* MDIO */ -int enetc_mdio_probe(struct enetc_pf *pf); -void enetc_mdio_remove(struct enetc_pf *pf); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c index e910aaf0f5ec..0c6bf3a55a9a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c @@ -36,7 +36,6 @@ void enetc_sched_speed_set(struct net_device *ndev) case SPEED_10: default: pspeed = ENETC_PMR_PSPEED_10M; - netdev_err(ndev, "Qbv PSPEED set speed link down.\n"); } priv->speed = speed; @@ -315,7 +314,7 @@ int enetc_setup_tc_txtime(struct net_device *ndev, void *type_data) tc = qopt->queue; - if (tc < 0 || tc > priv->num_tx_rings) + if (tc < 0 || tc >= priv->num_tx_rings) return -EINVAL; /* Do not support TXSTART and TX CSUM offload simutaniously */ diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 798fed37be46..4432a59904c7 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2199,8 +2199,14 @@ static void fec_enet_get_regs(struct net_device *ndev, { struct fec_enet_private *fep = netdev_priv(ndev); u32 __iomem *theregs = (u32 __iomem *)fep->hwp; + struct device *dev = &fep->pdev->dev; u32 *buf = (u32 *)regbuf; u32 i, off; + int ret; + + ret = pm_runtime_get_sync(dev); + if (ret < 0) + return; regs->version = fec_enet_register_version; @@ -2216,6 +2222,9 @@ static void fec_enet_get_regs(struct net_device *ndev, off >>= 2; buf[off] = readl(&theregs[off]); } + + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); } static int fec_enet_get_ts_info(struct net_device *ndev, diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index de5278485062..7a3f066e611d 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -785,16 +785,6 @@ static const struct ethtool_ops mpc52xx_fec_ethtool_ops = { }; -static int mpc52xx_fec_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct phy_device *phydev = dev->phydev; - - if (!phydev) - return -ENOTSUPP; - - return phy_mii_ioctl(phydev, rq, cmd); -} - static const struct net_device_ops mpc52xx_fec_netdev_ops = { .ndo_open = mpc52xx_fec_open, .ndo_stop = mpc52xx_fec_close, @@ -802,7 +792,7 @@ static const struct net_device_ops mpc52xx_fec_netdev_ops = { .ndo_set_rx_mode = mpc52xx_fec_set_multicast_list, .ndo_set_mac_address = mpc52xx_fec_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = mpc52xx_fec_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_tx_timeout = mpc52xx_fec_tx_timeout, .ndo_get_stats = mpc52xx_fec_get_stats, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index 41c6fa200e74..e1901874c19f 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -110,7 +110,7 @@ do { \ /* Interface Mode Register (IF_MODE) */ #define IF_MODE_MASK 0x00000003 /* 30-31 Mask on i/f mode bits */ -#define IF_MODE_XGMII 0x00000000 /* 30-31 XGMII (10G) interface */ +#define IF_MODE_10G 0x00000000 /* 30-31 10G interface */ #define IF_MODE_GMII 0x00000002 /* 30-31 GMII (1G) interface */ #define IF_MODE_RGMII 0x00000004 #define IF_MODE_RGMII_AUTO 0x00008000 @@ -440,7 +440,7 @@ static int init(struct memac_regs __iomem *regs, struct memac_cfg *cfg, tmp = 0; switch (phy_if) { case PHY_INTERFACE_MODE_XGMII: - tmp |= IF_MODE_XGMII; + tmp |= IF_MODE_10G; break; default: tmp |= IF_MODE_GMII; diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 80903cd58468..add61fed33ee 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -882,14 +882,6 @@ static const struct ethtool_ops fs_ethtool_ops = { .set_tunable = fs_set_tunable, }; -static int fs_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - extern int fs_mii_connect(struct net_device *dev); extern void fs_mii_disconnect(struct net_device *dev); @@ -907,7 +899,7 @@ static const struct net_device_ops fs_enet_netdev_ops = { .ndo_start_xmit = fs_enet_start_xmit, .ndo_tx_timeout = fs_timeout, .ndo_set_rx_mode = fs_set_multicast_list, - .ndo_do_ioctl = fs_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c index e03b30c60dcf..c82c85ef5fb3 100644 --- a/drivers/net/ethernet/freescale/xgmac_mdio.c +++ b/drivers/net/ethernet/freescale/xgmac_mdio.c @@ -49,6 +49,7 @@ struct tgec_mdio_controller { struct mdio_fsl_priv { struct tgec_mdio_controller __iomem *mdio_base; bool is_little_endian; + bool has_a011043; }; static u32 xgmac_read32(void __iomem *regs, @@ -226,7 +227,8 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) return ret; /* Return all Fs if nothing was there */ - if (xgmac_read32(®s->mdio_stat, endian) & MDIO_STAT_RD_ER) { + if ((xgmac_read32(®s->mdio_stat, endian) & MDIO_STAT_RD_ER) && + !priv->has_a011043) { dev_err(&bus->dev, "Error while reading PHY%d reg at %d.%hhu\n", phy_id, dev_addr, regnum); @@ -274,6 +276,9 @@ static int xgmac_mdio_probe(struct platform_device *pdev) priv->is_little_endian = of_property_read_bool(pdev->dev.of_node, "little-endian"); + priv->has_a011043 = of_property_read_bool(pdev->dev.of_node, + "fsl,erratum-a011043"); + ret = of_mdiobus_register(bus, np); if (ret) { dev_err(&pdev->dev, "cannot register MDIO bus\n"); diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index edec61dfc868..9f52e72ff641 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -418,8 +418,6 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, rx->cnt = cnt; rx->fill_cnt += work_done; - /* restock desc ring slots */ - dma_wmb(); /* Ensure descs are visible before ringing doorbell */ gve_rx_write_doorbell(priv, rx); return gve_rx_work_pending(rx); } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index f4889431f9b7..d0244feb0301 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -487,10 +487,6 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) * may have added descriptors without ringing the doorbell. */ - /* Ensure tx descs from a prior gve_tx are visible before - * ringing doorbell. - */ - dma_wmb(); gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_BUSY; } @@ -505,8 +501,6 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) return NETDEV_TX_OK; - /* Ensure tx descs are visible before ringing doorbell */ - dma_wmb(); gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/hisilicon/hisi_femac.c b/drivers/net/ethernet/hisilicon/hisi_femac.c index 90ab7ade44c4..57c3bc4f7089 100644 --- a/drivers/net/ethernet/hisilicon/hisi_femac.c +++ b/drivers/net/ethernet/hisilicon/hisi_femac.c @@ -675,18 +675,6 @@ static void hisi_femac_net_set_rx_mode(struct net_device *dev) } } -static int hisi_femac_net_ioctl(struct net_device *dev, - struct ifreq *ifreq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - - if (!dev->phydev) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, ifreq, cmd); -} - static const struct ethtool_ops hisi_femac_ethtools_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = phy_ethtool_get_link_ksettings, @@ -697,7 +685,7 @@ static const struct net_device_ops hisi_femac_netdev_ops = { .ndo_open = hisi_femac_net_open, .ndo_stop = hisi_femac_net_close, .ndo_start_xmit = hisi_femac_net_xmit, - .ndo_do_ioctl = hisi_femac_net_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_mac_address = hisi_femac_set_mac_address, .ndo_set_rx_mode = hisi_femac_net_set_rx_mode, }; diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index e45553ec114a..c117074c16e3 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -565,7 +565,6 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data, skb = *out_skb = napi_alloc_skb(&ring_data->napi, HNS_RX_HEAD_SIZE); if (unlikely(!skb)) { - netdev_err(ndev, "alloc rx skb fail\n"); ring->stats.sw_err_cnt++; return -ENOMEM; } @@ -1056,7 +1055,6 @@ static int hns_nic_common_poll(struct napi_struct *napi, int budget) container_of(napi, struct hns_nic_ring_data, napi); struct hnae_ring *ring = ring_data->ring; -try_again: clean_complete += ring_data->poll_one( ring_data, budget - clean_complete, ring_data->ex_process); @@ -1066,7 +1064,7 @@ try_again: napi_complete(napi); ring->q->handle->dev->ops->toggle_ring_irq(ring, 0); } else { - goto try_again; + return budget; } } @@ -1499,20 +1497,6 @@ static void hns_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) } } -static int hns_nic_do_ioctl(struct net_device *netdev, struct ifreq *ifr, - int cmd) -{ - struct phy_device *phy_dev = netdev->phydev; - - if (!netif_running(netdev)) - return -EINVAL; - - if (!phy_dev) - return -ENOTSUPP; - - return phy_mii_ioctl(phy_dev, ifr, cmd); -} - static netdev_tx_t hns_nic_net_xmit(struct sk_buff *skb, struct net_device *ndev) { @@ -1960,7 +1944,7 @@ static const struct net_device_ops hns_nic_netdev_ops = { .ndo_tx_timeout = hns_nic_net_timeout, .ndo_set_mac_address = hns_nic_net_set_mac_address, .ndo_change_mtu = hns_nic_change_mtu, - .ndo_do_ioctl = hns_nic_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_features = hns_nic_set_features, .ndo_fix_features = hns_nic_fix_features, .ndo_get_stats64 = hns_nic_get_stats64, diff --git a/drivers/net/ethernet/hisilicon/hns3/Makefile b/drivers/net/ethernet/hisilicon/hns3/Makefile index d01bf536eb86..7aa2fac76c5e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/Makefile @@ -3,6 +3,8 @@ # Makefile for the HISILICON network device drivers. # +ccflags-y += -I$(srctree)/$(src) + obj-$(CONFIG_HNS3) += hns3pf/ obj-$(CONFIG_HNS3) += hns3vf/ diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 3b5e2d7251e7..a3e4081b84ba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -164,11 +164,7 @@ enum hnae3_reset_type { HNAE3_IMP_RESET, HNAE3_UNKNOWN_RESET, HNAE3_NONE_RESET, -}; - -enum hnae3_flr_state { - HNAE3_FLR_DOWN, - HNAE3_FLR_DONE, + HNAE3_MAX_RESET, }; enum hnae3_port_base_vlan_state { @@ -575,8 +571,7 @@ struct hnae3_ae_algo { const struct pci_device_id *pdev_id_table; }; -#define HNAE3_INT_NAME_EXT_LEN 32 /* Max extra information length */ -#define HNAE3_INT_NAME_LEN (IFNAMSIZ + HNAE3_INT_NAME_EXT_LEN) +#define HNAE3_INT_NAME_LEN 32 #define HNAE3_ITR_COUNTDOWN_START 100 struct hnae3_tc_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 6b328a259efc..1d4ffc5f408a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -176,7 +176,7 @@ static int hns3_dbg_bd_info(struct hnae3_handle *h, const char *cmd_buf) return -EINVAL; } - ring = &priv->ring[q_num]; + ring = &priv->ring[q_num]; value = readl_relaxed(ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG); tx_index = (cnt == 1) ? value : tx_index; @@ -209,10 +209,10 @@ static int hns3_dbg_bd_info(struct hnae3_handle *h, const char *cmd_buf) le16_to_cpu(tx_desc->tx.bdtp_fe_sc_vld_ra_ri)); dev_info(dev, "(TX)mss: %u\n", le16_to_cpu(tx_desc->tx.mss)); - ring = &priv->ring[q_num + h->kinfo.num_tqps]; + ring = &priv->ring[q_num + h->kinfo.num_tqps]; value = readl_relaxed(ring->tqp->io_base + HNS3_RING_RX_RING_TAIL_REG); rx_index = (cnt == 1) ? value : tx_index; - rx_desc = &ring->desc[rx_index]; + rx_desc = &ring->desc[rx_index]; addr = le64_to_cpu(rx_desc->addr); dev_info(dev, "RX Queue Num: %u, BD Index: %u\n", q_num, rx_index); @@ -297,8 +297,8 @@ static ssize_t hns3_dbg_cmd_read(struct file *filp, char __user *buffer, if (!buf) return -ENOMEM; - len = snprintf(buf, HNS3_DBG_READ_LEN, "%s\n", - "Please echo help to cmd to get help information"); + len = scnprintf(buf, HNS3_DBG_READ_LEN, "%s\n", + "Please echo help to cmd to get help information"); uncopy_bytes = copy_to_user(buffer, buf, len); kfree(buf); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index aee5facc89b5..acb796cc10d0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -24,6 +24,12 @@ #include "hnae3.h" #include "hns3_enet.h" +/* All hns3 tracepoints are defined by the include below, which + * must be included exactly once across the whole kernel with + * CREATE_TRACE_POINTS defined + */ +#define CREATE_TRACE_POINTS +#include "hns3_trace.h" #define hns3_set_field(origin, shift, val) ((origin) |= ((val) << (shift))) #define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) @@ -54,6 +60,8 @@ MODULE_PARM_DESC(debug, " Network interface message level setting"); #define HNS3_INNER_VLAN_TAG 1 #define HNS3_OUTER_VLAN_TAG 2 +#define HNS3_MIN_TX_LEN 33U + /* hns3_pci_tbl - PCI Device ID Table * * Last entry must be all 0s @@ -127,18 +135,21 @@ static int hns3_nic_init_irq(struct hns3_nic_priv *priv) continue; if (tqp_vectors->tx_group.ring && tqp_vectors->rx_group.ring) { - snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN - 1, - "%s-%s-%d", priv->netdev->name, "TxRx", - txrx_int_idx++); + snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, + "%s-%s-%s-%d", hns3_driver_name, + pci_name(priv->ae_handle->pdev), + "TxRx", txrx_int_idx++); txrx_int_idx++; } else if (tqp_vectors->rx_group.ring) { - snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN - 1, - "%s-%s-%d", priv->netdev->name, "Rx", - rx_int_idx++); + snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, + "%s-%s-%s-%d", hns3_driver_name, + pci_name(priv->ae_handle->pdev), + "Rx", rx_int_idx++); } else if (tqp_vectors->tx_group.ring) { - snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN - 1, - "%s-%s-%d", priv->netdev->name, "Tx", - tx_int_idx++); + snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, + "%s-%s-%s-%d", hns3_driver_name, + pci_name(priv->ae_handle->pdev), + "Tx", tx_int_idx++); } else { /* Skip this unused q_vector */ continue; @@ -155,6 +166,8 @@ static int hns3_nic_init_irq(struct hns3_nic_priv *priv) return ret; } + disable_irq(tqp_vectors->vector_irq); + irq_set_affinity_hint(tqp_vectors->vector_irq, &tqp_vectors->affinity_mask); @@ -173,6 +186,7 @@ static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) { napi_enable(&tqp_vector->napi); + enable_irq(tqp_vector->vector_irq); /* enable vector */ hns3_mask_vector_irq(tqp_vector, 1); @@ -372,18 +386,6 @@ static int hns3_nic_net_up(struct net_device *netdev) if (ret) return ret; - /* the device can work without cpu rmap, only aRFS needs it */ - ret = hns3_set_rx_cpu_rmap(netdev); - if (ret) - netdev_warn(netdev, "set rx cpu rmap fail, ret=%d!\n", ret); - - /* get irq resource for all vectors */ - ret = hns3_nic_init_irq(priv); - if (ret) { - netdev_err(netdev, "init irq failed! ret=%d\n", ret); - goto free_rmap; - } - clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); /* enable the vectors */ @@ -396,22 +398,15 @@ static int hns3_nic_net_up(struct net_device *netdev) /* start the ae_dev */ ret = h->ae_algo->ops->start ? h->ae_algo->ops->start(h) : 0; - if (ret) - goto out_start_err; - - return 0; - -out_start_err: - set_bit(HNS3_NIC_STATE_DOWN, &priv->state); - while (j--) - hns3_tqp_disable(h->kinfo.tqp[j]); + if (ret) { + set_bit(HNS3_NIC_STATE_DOWN, &priv->state); + while (j--) + hns3_tqp_disable(h->kinfo.tqp[j]); - for (j = i - 1; j >= 0; j--) - hns3_vector_disable(&priv->tqp_vector[j]); + for (j = i - 1; j >= 0; j--) + hns3_vector_disable(&priv->tqp_vector[j]); + } - hns3_nic_uninit_irq(priv); -free_rmap: - hns3_free_rx_cpu_rmap(netdev); return ret; } @@ -508,11 +503,6 @@ static void hns3_nic_net_down(struct net_device *netdev) if (ops->stop) ops->stop(priv->ae_handle); - hns3_free_rx_cpu_rmap(netdev); - - /* free irq resources */ - hns3_nic_uninit_irq(priv); - /* delay ring buffer clearing to hns3_reset_notify_uninit_enet * during reset process, because driver may not be able * to disable the ring through firmware when downing the netdev. @@ -734,6 +724,8 @@ static int hns3_set_tso(struct sk_buff *skb, u32 *paylen, /* get MSS for TSO */ *mss = skb_shinfo(skb)->gso_size; + trace_hns3_tso(skb); + return 0; } @@ -1138,6 +1130,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, desc->tx.bdtp_fe_sc_vld_ra_ri = cpu_to_le16(BIT(HNS3_TXD_VLD_B)); + trace_hns3_tx_desc(ring, ring->next_to_use); ring_ptr_move_fw(ring, next_to_use); return HNS3_LIKELY_BD_NUM; } @@ -1161,6 +1154,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, desc->tx.bdtp_fe_sc_vld_ra_ri = cpu_to_le16(BIT(HNS3_TXD_VLD_B)); + trace_hns3_tx_desc(ring, ring->next_to_use); /* move ring pointer to next */ ring_ptr_move_fw(ring, next_to_use); @@ -1286,6 +1280,14 @@ static bool hns3_skb_need_linearized(struct sk_buff *skb, unsigned int *bd_size, return false; } +void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) +{ + int i = 0; + + for (i = 0; i < MAX_SKB_FRAGS; i++) + size[i] = skb_frag_size(&shinfo->frags[i]); +} + static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, struct net_device *netdev, struct sk_buff *skb) @@ -1297,8 +1299,10 @@ static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, bd_num = hns3_tx_bd_num(skb, bd_size); if (unlikely(bd_num > HNS3_MAX_NON_TSO_BD_NUM)) { if (bd_num <= HNS3_MAX_TSO_BD_NUM && skb_is_gso(skb) && - !hns3_skb_need_linearized(skb, bd_size, bd_num)) + !hns3_skb_need_linearized(skb, bd_size, bd_num)) { + trace_hns3_over_8bd(skb); goto out; + } if (__skb_linearize(skb)) return -ENOMEM; @@ -1306,8 +1310,10 @@ static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, bd_num = hns3_tx_bd_count(skb->len); if ((skb_is_gso(skb) && bd_num > HNS3_MAX_TSO_BD_NUM) || (!skb_is_gso(skb) && - bd_num > HNS3_MAX_NON_TSO_BD_NUM)) + bd_num > HNS3_MAX_NON_TSO_BD_NUM)) { + trace_hns3_over_8bd(skb); return -ENOMEM; + } u64_stats_update_begin(&ring->syncp); ring->stats.tx_copy++; @@ -1405,6 +1411,10 @@ netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) int bd_num = 0; int ret; + /* Hardware can only handle short frames above 32 bytes */ + if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) + return NETDEV_TX_OK; + /* Prefetch the data used later */ prefetch(skb->data); @@ -1448,6 +1458,7 @@ out: (ring->desc_num - 1); ring->desc[pre_ntu].tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_FE_B)); + trace_hns3_tx_desc(ring, pre_ntu); /* Complete translate all packets */ dev_queue = netdev_get_tx_queue(netdev, ring->queue_index); @@ -2083,10 +2094,8 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) int ret; ae_dev = devm_kzalloc(&pdev->dev, sizeof(*ae_dev), GFP_KERNEL); - if (!ae_dev) { - ret = -ENOMEM; - return ret; - } + if (!ae_dev) + return -ENOMEM; ae_dev->pdev = pdev; ae_dev->flag = ent->driver_data; @@ -2529,8 +2538,8 @@ void hns3_clean_tx_ring(struct hns3_enet_ring *ring) rmb(); /* Make sure head is ready before touch any data */ if (unlikely(!is_valid_clean_head(ring, head))) { - netdev_err(netdev, "wrong head (%d, %d-%d)\n", head, - ring->next_to_use, ring->next_to_clean); + hns3_rl_err(netdev, "wrong head (%d, %d-%d)\n", head, + ring->next_to_use, ring->next_to_clean); u64_stats_update_begin(&ring->syncp); ring->stats.io_err_cnt++; @@ -2616,6 +2625,12 @@ static void hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, writel_relaxed(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); } +static bool hns3_page_is_reusable(struct page *page) +{ + return page_to_nid(page) == numa_mem_id() && + !page_is_pfmemalloc(page); +} + static void hns3_nic_reuse_page(struct sk_buff *skb, int i, struct hns3_enet_ring *ring, int pull_len, struct hns3_desc_cb *desc_cb) @@ -2630,7 +2645,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, /* Avoid re-using remote pages, or the stack is still using the page * when page_offset rollback to zero, flag default unreuse */ - if (unlikely(page_to_nid(desc_cb->priv) != numa_mem_id()) || + if (unlikely(!hns3_page_is_reusable(desc_cb->priv)) || (!desc_cb->page_offset && page_count(desc_cb->priv) > 1)) return; @@ -2700,6 +2715,9 @@ static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; + + trace_hns3_gro(skb); + return 0; } @@ -2836,6 +2854,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, return -ENOMEM; } + trace_hns3_rx_desc(ring); prefetchw(skb->data); ring->pending_buf = 1; @@ -2845,7 +2864,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long))); /* We can reuse buffer as-is, just make sure it is local */ - if (likely(page_to_nid(desc_cb->priv) == numa_mem_id())) + if (likely(hns3_page_is_reusable(desc_cb->priv))) desc_cb->reuse_flag = 1; else /* This page cannot be reused so discard it */ put_page(desc_cb->priv); @@ -2910,6 +2929,7 @@ static int hns3_add_frag(struct hns3_enet_ring *ring) } hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb); + trace_hns3_rx_desc(ring); ring_ptr_move_fw(ring, next_to_clean); ring->pending_buf++; } while (!(bd_base_info & BIT(HNS3_RXD_FE_B))); @@ -3614,19 +3634,13 @@ static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) hns3_free_vector_ring_chain(tqp_vector, &vector_ring_chain); - if (tqp_vector->irq_init_flag == HNS3_VECTOR_INITED) { - irq_set_affinity_hint(tqp_vector->vector_irq, NULL); - free_irq(tqp_vector->vector_irq, tqp_vector); - tqp_vector->irq_init_flag = HNS3_VECTOR_NOT_INITED; - } - hns3_clear_ring_group(&tqp_vector->rx_group); hns3_clear_ring_group(&tqp_vector->tx_group); netif_napi_del(&priv->tqp_vector[i].napi); } } -static int hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) +static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) { struct hnae3_handle *h = priv->ae_handle; struct pci_dev *pdev = h->pdev; @@ -3638,11 +3652,10 @@ static int hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) tqp_vector = &priv->tqp_vector[i]; ret = h->ae_algo->ops->put_vector(h, tqp_vector->vector_irq); if (ret) - return ret; + return; } devm_kfree(&pdev->dev, priv->tqp_vector); - return 0; } static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, @@ -4041,6 +4054,18 @@ static int hns3_client_init(struct hnae3_handle *handle) goto out_reg_netdev_fail; } + /* the device can work without cpu rmap, only aRFS needs it */ + ret = hns3_set_rx_cpu_rmap(netdev); + if (ret) + dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); + + ret = hns3_nic_init_irq(priv); + if (ret) { + dev_err(priv->dev, "init irq failed! ret=%d\n", ret); + hns3_free_rx_cpu_rmap(netdev); + goto out_init_irq_fail; + } + ret = hns3_client_start(handle); if (ret) { dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); @@ -4062,6 +4087,9 @@ static int hns3_client_init(struct hnae3_handle *handle) return ret; out_client_start: + hns3_free_rx_cpu_rmap(netdev); + hns3_nic_uninit_irq(priv); +out_init_irq_fail: unregister_netdev(netdev); out_reg_netdev_fail: hns3_uninit_phy(netdev); @@ -4099,15 +4127,17 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) goto out_netdev_free; } + hns3_free_rx_cpu_rmap(netdev); + + hns3_nic_uninit_irq(priv); + hns3_del_all_fd_rules(netdev, true); hns3_clear_all_ring(handle, true); hns3_nic_uninit_vector_data(priv); - ret = hns3_nic_dealloc_vector_data(priv); - if (ret) - netdev_err(netdev, "dealloc vector error\n"); + hns3_nic_dealloc_vector_data(priv); ret = hns3_uninit_all_ring(priv); if (ret) @@ -4434,17 +4464,32 @@ static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) if (ret) goto err_uninit_vector; + /* the device can work without cpu rmap, only aRFS needs it */ + ret = hns3_set_rx_cpu_rmap(netdev); + if (ret) + dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); + + ret = hns3_nic_init_irq(priv); + if (ret) { + dev_err(priv->dev, "init irq failed! ret=%d\n", ret); + hns3_free_rx_cpu_rmap(netdev); + goto err_init_irq_fail; + } + ret = hns3_client_start(handle); if (ret) { dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); - goto err_uninit_ring; + goto err_client_start_fail; } set_bit(HNS3_NIC_STATE_INITED, &priv->state); return ret; -err_uninit_ring: +err_client_start_fail: + hns3_free_rx_cpu_rmap(netdev); + hns3_nic_uninit_irq(priv); +err_init_irq_fail: hns3_uninit_all_ring(priv); err_uninit_vector: hns3_nic_uninit_vector_data(priv); @@ -4494,6 +4539,8 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) return 0; } + hns3_free_rx_cpu_rmap(netdev); + hns3_nic_uninit_irq(priv); hns3_clear_all_ring(handle, true); hns3_reset_tx_queue(priv->ae_handle); @@ -4501,9 +4548,7 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) hns3_store_coal(priv); - ret = hns3_nic_dealloc_vector_data(priv); - if (ret) - netdev_err(netdev, "dealloc vector error\n"); + hns3_nic_dealloc_vector_data(priv); ret = hns3_uninit_all_ring(priv); if (ret) @@ -4669,7 +4714,7 @@ static int __init hns3_init_module(void) pr_info("%s: %s\n", hns3_driver_name, hns3_copyright); client.type = HNAE3_CLIENT_KNIC; - snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH - 1, "%s", + snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH, "%s", hns3_driver_name); client.ops = &client_ops; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 9d47abd5c37c..abefd7a179f7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -673,4 +673,5 @@ void hns3_dbg_init(struct hnae3_handle *handle); void hns3_dbg_uninit(struct hnae3_handle *handle); void hns3_dbg_register_debugfs(const char *debugfs_dir_name); void hns3_dbg_unregister_debugfs(void); +void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 6e0212b79438..c03856e63320 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -423,9 +423,8 @@ static void *hns3_update_strings(u8 *data, const struct hns3_stats *stats, data[ETH_GSTRING_LEN - 1] = '\0'; /* first, prepend the prefix string */ - n1 = snprintf(data, MAX_PREFIX_SIZE, "%s%d_", - prefix, i); - n1 = min_t(uint, n1, MAX_PREFIX_SIZE - 1); + n1 = scnprintf(data, MAX_PREFIX_SIZE, "%s%d_", + prefix, i); size_left = (ETH_GSTRING_LEN - 1) - n1; /* now, concatenate the stats string to it */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h new file mode 100644 index 000000000000..7bddcca148a5 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2018-2019 Hisilicon Limited. */ + +/* This must be outside ifdef _HNS3_TRACE_H */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns3 + +#if !defined(_HNS3_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _HNS3_TRACE_H_ + +#include <linux/tracepoint.h> + +#define DESC_NR (sizeof(struct hns3_desc) / sizeof(u32)) + +DECLARE_EVENT_CLASS(hns3_skb_template, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb), + + TP_STRUCT__entry( + __field(unsigned int, headlen) + __field(unsigned int, len) + __field(__u8, nr_frags) + __field(__u8, ip_summed) + __field(unsigned int, hdr_len) + __field(unsigned short, gso_size) + __field(unsigned short, gso_segs) + __field(unsigned int, gso_type) + __field(bool, fraglist) + __array(__u32, size, MAX_SKB_FRAGS) + ), + + TP_fast_assign( + __entry->headlen = skb_headlen(skb); + __entry->len = skb->len; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_segs = skb_shinfo(skb)->gso_segs; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->hdr_len = skb->encapsulation ? + skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb) : + skb_transport_offset(skb) + tcp_hdrlen(skb); + __entry->ip_summed = skb->ip_summed; + __entry->fraglist = skb_has_frag_list(skb); + hns3_shinfo_pack(skb_shinfo(skb), __entry->size); + ), + + TP_printk( + "len: %u, %u, %u, cs: %u, gso: %u, %u, %x, frag(%d %u): %s", + __entry->headlen, __entry->len, __entry->hdr_len, + __entry->ip_summed, __entry->gso_size, __entry->gso_segs, + __entry->gso_type, __entry->fraglist, __entry->nr_frags, + __print_array(__entry->size, MAX_SKB_FRAGS, sizeof(__u32)) + ) +); + +DEFINE_EVENT(hns3_skb_template, hns3_over_8bd, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); + +DEFINE_EVENT(hns3_skb_template, hns3_gro, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); + +DEFINE_EVENT(hns3_skb_template, hns3_tso, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); + +TRACE_EVENT(hns3_tx_desc, + TP_PROTO(struct hns3_enet_ring *ring, int cur_ntu), + TP_ARGS(ring, cur_ntu), + + TP_STRUCT__entry( + __field(int, index) + __field(int, ntu) + __field(int, ntc) + __field(dma_addr_t, desc_dma) + __array(u32, desc, DESC_NR) + __string(devname, ring->tqp->handle->kinfo.netdev->name) + ), + + TP_fast_assign( + __entry->index = ring->tqp->tqp_index; + __entry->ntu = ring->next_to_use; + __entry->ntc = ring->next_to_clean; + __entry->desc_dma = ring->desc_dma_addr, + memcpy(__entry->desc, &ring->desc[cur_ntu], + sizeof(struct hns3_desc)); + __assign_str(devname, ring->tqp->handle->kinfo.netdev->name); + ), + + TP_printk( + "%s-%d-%d/%d desc(%pad): %s", + __get_str(devname), __entry->index, __entry->ntu, + __entry->ntc, &__entry->desc_dma, + __print_array(__entry->desc, DESC_NR, sizeof(u32)) + ) +); + +TRACE_EVENT(hns3_rx_desc, + TP_PROTO(struct hns3_enet_ring *ring), + TP_ARGS(ring), + + TP_STRUCT__entry( + __field(int, index) + __field(int, ntu) + __field(int, ntc) + __field(dma_addr_t, desc_dma) + __field(dma_addr_t, buf_dma) + __array(u32, desc, DESC_NR) + __string(devname, ring->tqp->handle->kinfo.netdev->name) + ), + + TP_fast_assign( + __entry->index = ring->tqp->tqp_index; + __entry->ntu = ring->next_to_use; + __entry->ntc = ring->next_to_clean; + __entry->desc_dma = ring->desc_dma_addr; + __entry->buf_dma = ring->desc_cb[ring->next_to_clean].dma; + memcpy(__entry->desc, &ring->desc[ring->next_to_clean], + sizeof(struct hns3_desc)); + __assign_str(devname, ring->tqp->handle->kinfo.netdev->name); + ), + + TP_printk( + "%s-%d-%d/%d desc(%pad) buf(%pad): %s", + __get_str(devname), __entry->index, __entry->ntu, + __entry->ntc, &__entry->desc_dma, &__entry->buf_dma, + __print_array(__entry->desc, DESC_NR, sizeof(u32)) + ) +); + +#endif /* _HNS3_TRACE_H_ */ + +/* This must be outside ifdef _HNS3_TRACE_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hns3_trace +#include <trace/define_trace.h> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index d97da67f07a1..96498d9b4754 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/io.h> #include <linux/etherdevice.h> +#include "hnae3.h" #define HCLGE_CMDQ_TX_TIMEOUT 30000 #define HCLGE_DESC_DATA_LEN 6 @@ -63,6 +64,7 @@ enum hclge_cmd_status { struct hclge_misc_vector { u8 __iomem *addr; int vector_irq; + char name[HNAE3_INT_NAME_LEN]; }; struct hclge_cmq { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index f3d4cbd28913..67fad80035d3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -73,8 +73,6 @@ static struct hclge_dbg_reg_type_info hclge_dbg_reg_info[] = { static int hclge_dbg_get_dfx_bd_num(struct hclge_dev *hdev, int offset) { -#define HCLGE_GET_DFX_REG_TYPE_CNT 4 - struct hclge_desc desc[HCLGE_GET_DFX_REG_TYPE_CNT]; int entries_per_desc; int index; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index dc66b4e13377..c85b72dc44d2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -505,7 +505,7 @@ static const struct hclge_hw_error hclge_ssu_mem_ecc_err_int[] = { static const struct hclge_hw_error hclge_ssu_port_based_err_int[] = { { .int_msk = BIT(0), .msg = "roc_pkt_without_key_port", - .reset_level = HNAE3_GLOBAL_RESET }, + .reset_level = HNAE3_FUNC_RESET }, { .int_msk = BIT(1), .msg = "tpu_pkt_without_key_port", .reset_level = HNAE3_GLOBAL_RESET }, { .int_msk = BIT(2), .msg = "igu_pkt_without_key_port", @@ -599,7 +599,7 @@ static const struct hclge_hw_error hclge_ssu_ets_tcg_int[] = { static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = { { .int_msk = BIT(0), .msg = "roc_pkt_without_key_port", - .reset_level = HNAE3_GLOBAL_RESET }, + .reset_level = HNAE3_FUNC_RESET }, { .int_msk = BIT(9), .msg = "low_water_line_err_port", .reset_level = HNAE3_NONE_RESET }, { .int_msk = BIT(10), .msg = "hi_water_line_err_port", @@ -1898,10 +1898,8 @@ static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev, bd_num = max_t(u32, mpf_bd_num, pf_bd_num); desc = kcalloc(bd_num, sizeof(struct hclge_desc), GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out; - } + if (!desc) + return -ENOMEM; ret = hclge_handle_mpf_msix_error(hdev, desc, mpf_bd_num, reset_requests); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 5e3743929e1c..ec5f6eeb639b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -862,9 +862,7 @@ static int hclge_query_function_status(struct hclge_dev *hdev) usleep_range(1000, 2000); } while (timeout++ < HCLGE_QUERY_MAX_CNT); - ret = hclge_parse_func_status(hdev, req); - - return ret; + return hclge_parse_func_status(hdev, req); } static int hclge_query_pf_resource(struct hclge_dev *hdev) @@ -882,12 +880,12 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) } req = (struct hclge_pf_res_cmd *)desc.data; - hdev->num_tqps = __le16_to_cpu(req->tqp_num); - hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; + hdev->num_tqps = le16_to_cpu(req->tqp_num); + hdev->pkt_buf_size = le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; if (req->tx_buf_size) hdev->tx_buf_size = - __le16_to_cpu(req->tx_buf_size) << HCLGE_BUF_UNIT_S; + le16_to_cpu(req->tx_buf_size) << HCLGE_BUF_UNIT_S; else hdev->tx_buf_size = HCLGE_DEFAULT_TX_BUF; @@ -895,7 +893,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) if (req->dv_buf_size) hdev->dv_buf_size = - __le16_to_cpu(req->dv_buf_size) << HCLGE_BUF_UNIT_S; + le16_to_cpu(req->dv_buf_size) << HCLGE_BUF_UNIT_S; else hdev->dv_buf_size = HCLGE_DEFAULT_DV; @@ -903,10 +901,10 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) if (hnae3_dev_roce_supported(hdev)) { hdev->roce_base_msix_offset = - hnae3_get_field(__le16_to_cpu(req->msixcap_localid_ba_rocee), + hnae3_get_field(le16_to_cpu(req->msixcap_localid_ba_rocee), HCLGE_MSIX_OFT_ROCEE_M, HCLGE_MSIX_OFT_ROCEE_S); hdev->num_roce_msi = - hnae3_get_field(__le16_to_cpu(req->pf_intr_vector_number), + hnae3_get_field(le16_to_cpu(req->pf_intr_vector_number), HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S); /* nic's msix numbers is always equals to the roce's. */ @@ -919,7 +917,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) hdev->roce_base_msix_offset; } else { hdev->num_msi = - hnae3_get_field(__le16_to_cpu(req->pf_intr_vector_number), + hnae3_get_field(le16_to_cpu(req->pf_intr_vector_number), HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S); hdev->num_nic_msi = hdev->num_msi; @@ -1333,11 +1331,7 @@ static int hclge_get_cap(struct hclge_dev *hdev) } /* get pf resource */ - ret = hclge_query_pf_resource(hdev); - if (ret) - dev_err(&hdev->pdev->dev, "query pf resource error %d.\n", ret); - - return ret; + return hclge_query_pf_resource(hdev); } static void hclge_init_kdump_kernel_config(struct hclge_dev *hdev) @@ -2621,30 +2615,21 @@ static int hclge_mac_init(struct hclge_dev *hdev) hdev->hw.mac.duplex = HCLGE_MAC_FULL; ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.speed, hdev->hw.mac.duplex); - if (ret) { - dev_err(&hdev->pdev->dev, - "Config mac speed dup fail ret=%d\n", ret); + if (ret) return ret; - } if (hdev->hw.mac.support_autoneg) { ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg); - if (ret) { - dev_err(&hdev->pdev->dev, - "Config mac autoneg fail ret=%d\n", ret); + if (ret) return ret; - } } mac->link = 0; if (mac->user_fec_mode & BIT(HNAE3_FEC_USER_DEF)) { ret = hclge_set_fec_hw(hdev, mac->user_fec_mode); - if (ret) { - dev_err(&hdev->pdev->dev, - "Fec mode init fail, ret = %d\n", ret); + if (ret) return ret; - } } ret = hclge_set_mac_mtu(hdev, hdev->mps); @@ -2835,6 +2820,12 @@ static int hclge_get_sfp_info(struct hclge_dev *hdev, struct hclge_mac *mac) return ret; } + /* In some case, mac speed get from IMP may be 0, it shouldn't be + * set to mac->speed. + */ + if (!le32_to_cpu(resp->speed)) + return 0; + mac->speed = le32_to_cpu(resp->speed); /* if resp->speed_ability is 0, it means it's an old version * firmware, do not update these params @@ -2910,7 +2901,7 @@ static int hclge_get_status(struct hnae3_handle *handle) static struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf) { - if (pci_num_vf(hdev->pdev) == 0) { + if (!pci_num_vf(hdev->pdev)) { dev_err(&hdev->pdev->dev, "SRIOV is disabled, can not get vport(%d) info.\n", vf); return NULL; @@ -3177,8 +3168,10 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev) hclge_get_misc_vector(hdev); /* this would be explicitly freed in the end */ + snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", + HCLGE_NAME, pci_name(hdev->pdev)); ret = request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle, - 0, "hclge_misc", hdev); + 0, hdev->misc_vector.name, hdev); if (ret) { hclge_free_vector(hdev, 0); dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n", @@ -3252,7 +3245,8 @@ static int hclge_notify_roce_client(struct hclge_dev *hdev, static int hclge_reset_wait(struct hclge_dev *hdev) { #define HCLGE_RESET_WATI_MS 100 -#define HCLGE_RESET_WAIT_CNT 200 +#define HCLGE_RESET_WAIT_CNT 350 + u32 val, reg, reg_bit; u32 cnt = 0; @@ -3269,8 +3263,6 @@ static int hclge_reset_wait(struct hclge_dev *hdev) reg = HCLGE_FUN_RST_ING; reg_bit = HCLGE_FUN_RST_ING_B; break; - case HNAE3_FLR_RESET: - break; default: dev_err(&hdev->pdev->dev, "Wait for unsupported reset type: %d\n", @@ -3278,20 +3270,6 @@ static int hclge_reset_wait(struct hclge_dev *hdev) return -EINVAL; } - if (hdev->reset_type == HNAE3_FLR_RESET) { - while (!test_bit(HNAE3_FLR_DONE, &hdev->flr_state) && - cnt++ < HCLGE_RESET_WAIT_CNT) - msleep(HCLGE_RESET_WATI_MS); - - if (!test_bit(HNAE3_FLR_DONE, &hdev->flr_state)) { - dev_err(&hdev->pdev->dev, - "flr wait timeout: %u\n", cnt); - return -EBUSY; - } - - return 0; - } - val = hclge_read_dev(&hdev->hw, reg); while (hnae3_get_bit(val, reg_bit) && cnt < HCLGE_RESET_WAIT_CNT) { msleep(HCLGE_RESET_WATI_MS); @@ -3369,7 +3347,7 @@ static void hclge_mailbox_service_task(struct hclge_dev *hdev) clear_bit(HCLGE_STATE_MBX_HANDLING, &hdev->state); } -static int hclge_func_reset_sync_vf(struct hclge_dev *hdev) +static void hclge_func_reset_sync_vf(struct hclge_dev *hdev) { struct hclge_pf_rst_sync_cmd *req; struct hclge_desc desc; @@ -3389,20 +3367,19 @@ static int hclge_func_reset_sync_vf(struct hclge_dev *hdev) */ if (ret == -EOPNOTSUPP) { msleep(HCLGE_RESET_SYNC_TIME); - return 0; + return; } else if (ret) { - dev_err(&hdev->pdev->dev, "sync with VF fail %d!\n", - ret); - return ret; + dev_warn(&hdev->pdev->dev, "sync with VF fail %d!\n", + ret); + return; } else if (req->all_vf_ready) { - return 0; + return; } msleep(HCLGE_PF_RESET_SYNC_TIME); hclge_cmd_reuse_desc(&desc, true); } while (cnt++ < HCLGE_PF_RESET_SYNC_CNT); - dev_err(&hdev->pdev->dev, "sync with VF timeout!\n"); - return -ETIME; + dev_warn(&hdev->pdev->dev, "sync with VF timeout!\n"); } void hclge_report_hw_error(struct hclge_dev *hdev, @@ -3482,12 +3459,6 @@ static void hclge_do_reset(struct hclge_dev *hdev) set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending); hclge_reset_task_schedule(hdev); break; - case HNAE3_FLR_RESET: - dev_info(&pdev->dev, "FLR requested\n"); - /* schedule again to check later */ - set_bit(HNAE3_FLR_RESET, &hdev->reset_pending); - hclge_reset_task_schedule(hdev); - break; default: dev_warn(&pdev->dev, "Unsupported reset type: %d\n", hdev->reset_type); @@ -3576,23 +3547,6 @@ static void hclge_clear_reset_cause(struct hclge_dev *hdev) hclge_enable_vector(&hdev->misc_vector, true); } -static int hclge_reset_prepare_down(struct hclge_dev *hdev) -{ - int ret = 0; - - switch (hdev->reset_type) { - case HNAE3_FUNC_RESET: - /* fall through */ - case HNAE3_FLR_RESET: - ret = hclge_set_all_vf_rst(hdev, true); - break; - default: - break; - } - - return ret; -} - static void hclge_reset_handshake(struct hclge_dev *hdev, bool enable) { u32 reg_val; @@ -3606,6 +3560,19 @@ static void hclge_reset_handshake(struct hclge_dev *hdev, bool enable) hclge_write_dev(&hdev->hw, HCLGE_NIC_CSQ_DEPTH_REG, reg_val); } +static int hclge_func_reset_notify_vf(struct hclge_dev *hdev) +{ + int ret; + + ret = hclge_set_all_vf_rst(hdev, true); + if (ret) + return ret; + + hclge_func_reset_sync_vf(hdev); + + return 0; +} + static int hclge_reset_prepare_wait(struct hclge_dev *hdev) { u32 reg_val; @@ -3613,10 +3580,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) switch (hdev->reset_type) { case HNAE3_FUNC_RESET: - /* to confirm whether all running VF is ready - * before request PF reset - */ - ret = hclge_func_reset_sync_vf(hdev); + ret = hclge_func_reset_notify_vf(hdev); if (ret) return ret; @@ -3636,16 +3600,9 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) hdev->rst_stats.pf_rst_cnt++; break; case HNAE3_FLR_RESET: - /* to confirm whether all running VF is ready - * before request PF reset - */ - ret = hclge_func_reset_sync_vf(hdev); + ret = hclge_func_reset_notify_vf(hdev); if (ret) return ret; - - set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); - set_bit(HNAE3_FLR_DOWN, &hdev->flr_state); - hdev->rst_stats.flr_rst_cnt++; break; case HNAE3_IMP_RESET: hclge_handle_imp_error(hdev); @@ -3774,10 +3731,9 @@ static int hclge_reset_stack(struct hclge_dev *hdev) return hclge_notify_client(hdev, HNAE3_RESTORE_CLIENT); } -static void hclge_reset(struct hclge_dev *hdev) +static int hclge_reset_prepare(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); - enum hnae3_reset_type reset_level; int ret; /* Initialize ae_dev reset status as well, in case enet layer wants to @@ -3788,45 +3744,41 @@ static void hclge_reset(struct hclge_dev *hdev) /* perform reset of the stack & ae device for a client */ ret = hclge_notify_roce_client(hdev, HNAE3_DOWN_CLIENT); if (ret) - goto err_reset; - - ret = hclge_reset_prepare_down(hdev); - if (ret) - goto err_reset; + return ret; rtnl_lock(); ret = hclge_notify_client(hdev, HNAE3_DOWN_CLIENT); - if (ret) - goto err_reset_lock; - rtnl_unlock(); - - ret = hclge_reset_prepare_wait(hdev); if (ret) - goto err_reset; + return ret; - if (hclge_reset_wait(hdev)) - goto err_reset; + return hclge_reset_prepare_wait(hdev); +} + +static int hclge_reset_rebuild(struct hclge_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + enum hnae3_reset_type reset_level; + int ret; hdev->rst_stats.hw_reset_done_cnt++; ret = hclge_notify_roce_client(hdev, HNAE3_UNINIT_CLIENT); if (ret) - goto err_reset; + return ret; rtnl_lock(); - ret = hclge_reset_stack(hdev); + rtnl_unlock(); if (ret) - goto err_reset_lock; + return ret; hclge_clear_reset_cause(hdev); ret = hclge_reset_prepare_up(hdev); if (ret) - goto err_reset_lock; + return ret; - rtnl_unlock(); ret = hclge_notify_roce_client(hdev, HNAE3_INIT_CLIENT); /* ignore RoCE notify error if it fails HCLGE_RESET_MAX_FAIL_CNT - 1 @@ -3834,19 +3786,17 @@ static void hclge_reset(struct hclge_dev *hdev) */ if (ret && hdev->rst_stats.reset_fail_cnt < HCLGE_RESET_MAX_FAIL_CNT - 1) - goto err_reset; + return ret; rtnl_lock(); - ret = hclge_notify_client(hdev, HNAE3_UP_CLIENT); - if (ret) - goto err_reset_lock; - rtnl_unlock(); + if (ret) + return ret; ret = hclge_notify_roce_client(hdev, HNAE3_UP_CLIENT); if (ret) - goto err_reset; + return ret; hdev->last_reset_time = jiffies; hdev->rst_stats.reset_fail_cnt = 0; @@ -3863,10 +3813,22 @@ static void hclge_reset(struct hclge_dev *hdev) if (reset_level != HNAE3_NONE_RESET) set_bit(reset_level, &hdev->reset_request); + return 0; +} + +static void hclge_reset(struct hclge_dev *hdev) +{ + if (hclge_reset_prepare(hdev)) + goto err_reset; + + if (hclge_reset_wait(hdev)) + goto err_reset; + + if (hclge_reset_rebuild(hdev)) + goto err_reset; + return; -err_reset_lock: - rtnl_unlock(); err_reset: if (hclge_reset_err_handle(hdev)) hclge_reset_task_schedule(hdev); @@ -3972,12 +3934,13 @@ static void hclge_reset_service_task(struct hclge_dev *hdev) if (!test_and_clear_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) return; - if (test_and_set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) - return; + down(&hdev->reset_sem); + set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); hclge_reset_subtask(hdev); clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); } static void hclge_update_vport_alive(struct hclge_dev *hdev) @@ -4123,7 +4086,7 @@ static int hclge_put_vector(struct hnae3_handle *handle, int vector) vector_id = hclge_get_vector_index(hdev, vector); if (vector_id < 0) { dev_err(&hdev->pdev->dev, - "Get vector index fail. vector_id =%d\n", vector_id); + "Get vector index fail. vector = %d\n", vector); return vector_id; } @@ -4698,7 +4661,7 @@ static int hclge_map_ring_to_vector(struct hnae3_handle *handle, int vector, vector_id = hclge_get_vector_index(hdev, vector); if (vector_id < 0) { dev_err(&hdev->pdev->dev, - "Get vector index fail. vector_id =%d\n", vector_id); + "failed to get vector index. vector=%d\n", vector); return vector_id; } @@ -6606,7 +6569,7 @@ static int hclge_set_serdes_loopback(struct hclge_dev *hdev, bool en, hclge_cfg_mac_mode(hdev, en); - ret = hclge_mac_phy_link_status_wait(hdev, en, FALSE); + ret = hclge_mac_phy_link_status_wait(hdev, en, false); if (ret) dev_err(&hdev->pdev->dev, "serdes loopback config mac mode timeout\n"); @@ -6664,7 +6627,7 @@ static int hclge_set_phy_loopback(struct hclge_dev *hdev, bool en) hclge_cfg_mac_mode(hdev, en); - ret = hclge_mac_phy_link_status_wait(hdev, en, TRUE); + ret = hclge_mac_phy_link_status_wait(hdev, en, true); if (ret) dev_err(&hdev->pdev->dev, "phy loopback config mac mode timeout\n"); @@ -9324,30 +9287,53 @@ static void hclge_state_uninit(struct hclge_dev *hdev) static void hclge_flr_prepare(struct hnae3_ae_dev *ae_dev) { -#define HCLGE_FLR_WAIT_MS 100 -#define HCLGE_FLR_WAIT_CNT 50 - struct hclge_dev *hdev = ae_dev->priv; - int cnt = 0; +#define HCLGE_FLR_RETRY_WAIT_MS 500 +#define HCLGE_FLR_RETRY_CNT 5 - clear_bit(HNAE3_FLR_DOWN, &hdev->flr_state); - clear_bit(HNAE3_FLR_DONE, &hdev->flr_state); - set_bit(HNAE3_FLR_RESET, &hdev->default_reset_request); - hclge_reset_event(hdev->pdev, NULL); + struct hclge_dev *hdev = ae_dev->priv; + int retry_cnt = 0; + int ret; - while (!test_bit(HNAE3_FLR_DOWN, &hdev->flr_state) && - cnt++ < HCLGE_FLR_WAIT_CNT) - msleep(HCLGE_FLR_WAIT_MS); +retry: + down(&hdev->reset_sem); + set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + hdev->reset_type = HNAE3_FLR_RESET; + ret = hclge_reset_prepare(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, "fail to prepare FLR, ret=%d\n", + ret); + if (hdev->reset_pending || + retry_cnt++ < HCLGE_FLR_RETRY_CNT) { + dev_err(&hdev->pdev->dev, + "reset_pending:0x%lx, retry_cnt:%d\n", + hdev->reset_pending, retry_cnt); + clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); + msleep(HCLGE_FLR_RETRY_WAIT_MS); + goto retry; + } + } - if (!test_bit(HNAE3_FLR_DOWN, &hdev->flr_state)) - dev_err(&hdev->pdev->dev, - "flr wait down timeout: %d\n", cnt); + /* disable misc vector before FLR done */ + hclge_enable_vector(&hdev->misc_vector, false); + set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); + hdev->rst_stats.flr_rst_cnt++; } static void hclge_flr_done(struct hnae3_ae_dev *ae_dev) { struct hclge_dev *hdev = ae_dev->priv; + int ret; - set_bit(HNAE3_FLR_DONE, &hdev->flr_state); + hclge_enable_vector(&hdev->misc_vector, true); + + ret = hclge_reset_rebuild(hdev); + if (ret) + dev_err(&hdev->pdev->dev, "fail to rebuild, ret=%d\n", ret); + + hdev->reset_type = HNAE3_NONE_RESET; + clear_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); } static void hclge_clear_resetting_state(struct hclge_dev *hdev) @@ -9390,19 +9376,16 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) mutex_init(&hdev->vport_lock); spin_lock_init(&hdev->fd_rule_lock); + sema_init(&hdev->reset_sem, 1); ret = hclge_pci_init(hdev); - if (ret) { - dev_err(&pdev->dev, "PCI init failed\n"); + if (ret) goto out; - } /* Firmware command queue initialize */ ret = hclge_cmd_queue_init(hdev); - if (ret) { - dev_err(&pdev->dev, "Cmd queue init failed, ret = %d.\n", ret); + if (ret) goto err_pci_uninit; - } /* Firmware command initialize */ ret = hclge_cmd_init(hdev); @@ -9410,11 +9393,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) goto err_cmd_uninit; ret = hclge_get_cap(hdev); - if (ret) { - dev_err(&pdev->dev, "get hw capability error, ret = %d.\n", - ret); + if (ret) goto err_cmd_uninit; - } ret = hclge_configure(hdev); if (ret) { @@ -9429,12 +9409,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) } ret = hclge_misc_irq_init(hdev); - if (ret) { - dev_err(&pdev->dev, - "Misc IRQ(vector0) init error, ret = %d.\n", - ret); + if (ret) goto err_msi_uninit; - } ret = hclge_alloc_tqps(hdev); if (ret) { @@ -9443,31 +9419,22 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) } ret = hclge_alloc_vport(hdev); - if (ret) { - dev_err(&pdev->dev, "Allocate vport error, ret = %d.\n", ret); + if (ret) goto err_msi_irq_uninit; - } ret = hclge_map_tqp(hdev); - if (ret) { - dev_err(&pdev->dev, "Map tqp error, ret = %d.\n", ret); + if (ret) goto err_msi_irq_uninit; - } if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) { ret = hclge_mac_mdio_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "mdio config fail ret=%d\n", ret); + if (ret) goto err_msi_irq_uninit; - } } ret = hclge_init_umv_space(hdev); - if (ret) { - dev_err(&pdev->dev, "umv space init error, ret=%d.\n", ret); + if (ret) goto err_mdiobus_unreg; - } ret = hclge_mac_init(hdev); if (ret) { @@ -10202,10 +10169,8 @@ static int hclge_get_dfx_reg_bd_num(struct hclge_dev *hdev, int *bd_num_list, u32 type_num) { -#define HCLGE_DFX_REG_BD_NUM 4 - u32 entries_per_desc, desc_index, index, offset, i; - struct hclge_desc desc[HCLGE_DFX_REG_BD_NUM]; + struct hclge_desc desc[HCLGE_GET_DFX_REG_TYPE_CNT]; int ret; ret = hclge_query_bd_num_cmd_send(hdev, desc); @@ -10318,10 +10283,8 @@ static int hclge_get_dfx_reg(struct hclge_dev *hdev, void *data) buf_len = sizeof(*desc_src) * bd_num_max; desc_src = kzalloc(buf_len, GFP_KERNEL); - if (!desc_src) { - dev_err(&hdev->pdev->dev, "%s kzalloc failed\n", __func__); + if (!desc_src) return -ENOMEM; - } for (i = 0; i < dfx_reg_type_num; i++) { bd_num = bd_num_list[i]; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 4e5cfda16eb7..f78cbb4cc85e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -139,6 +139,8 @@ #define HCLGE_PHY_MDIX_STATUS_B 6 #define HCLGE_PHY_SPEED_DUP_RESOLVE_B 11 +#define HCLGE_GET_DFX_REG_TYPE_CNT 4 + /* Factor used to calculate offset and bitmap of VF num */ #define HCLGE_VF_NUM_PER_CMD 64 @@ -720,6 +722,7 @@ struct hclge_dev { unsigned long reset_request; /* reset has been requested */ unsigned long reset_pending; /* client rst is pending to be served */ struct hclge_rst_stats rst_stats; + struct semaphore reset_sem; /* protect reset process */ u32 fw_version; u16 num_vmdq_vport; /* Num vmdq vport this PF has set up */ u16 num_tqps; /* Num task queue pairs of this PF */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index f905dd3386b3..a3c0822191a9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -86,10 +86,12 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len, int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport) { struct hclge_dev *hdev = vport->back; - enum hnae3_reset_type reset_type; + u16 reset_type; u8 msg_data[2]; u8 dest_vfid; + BUILD_BUG_ON(HNAE3_MAX_RESET > U16_MAX); + dest_vfid = (u8)vport->vport_id; if (hdev->reset_type == HNAE3_FUNC_RESET) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index c33b8027f801..d6597206e692 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1316,14 +1316,13 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle, msg_data[0] = is_kill; memcpy(&msg_data[1], &vlan_id, sizeof(vlan_id)); memcpy(&msg_data[3], &proto, sizeof(proto)); - ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_VLAN, - HCLGE_MBX_VLAN_FILTER, msg_data, - HCLGEVF_VLAN_MBX_MSG_LEN, true, NULL, 0); - /* when remove hw vlan filter failed, record the vlan id, * and try to remove it from hw later, to be consistence * with stack. */ + ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_VLAN, + HCLGE_MBX_VLAN_FILTER, msg_data, + HCLGEVF_VLAN_MBX_MSG_LEN, true, NULL, 0); if (is_kill && ret) set_bit(vlan_id, hdev->vlan_del_fail_bmap); @@ -1411,32 +1410,6 @@ static int hclgevf_notify_client(struct hclgevf_dev *hdev, return ret; } -static void hclgevf_flr_done(struct hnae3_ae_dev *ae_dev) -{ - struct hclgevf_dev *hdev = ae_dev->priv; - - set_bit(HNAE3_FLR_DONE, &hdev->flr_state); -} - -static int hclgevf_flr_poll_timeout(struct hclgevf_dev *hdev, - unsigned long delay_us, - unsigned long wait_cnt) -{ - unsigned long cnt = 0; - - while (!test_bit(HNAE3_FLR_DONE, &hdev->flr_state) && - cnt++ < wait_cnt) - usleep_range(delay_us, delay_us * 2); - - if (!test_bit(HNAE3_FLR_DONE, &hdev->flr_state)) { - dev_err(&hdev->pdev->dev, - "flr wait timeout\n"); - return -ETIMEDOUT; - } - - return 0; -} - static int hclgevf_reset_wait(struct hclgevf_dev *hdev) { #define HCLGEVF_RESET_WAIT_US 20000 @@ -1447,11 +1420,7 @@ static int hclgevf_reset_wait(struct hclgevf_dev *hdev) u32 val; int ret; - if (hdev->reset_type == HNAE3_FLR_RESET) - return hclgevf_flr_poll_timeout(hdev, - HCLGEVF_RESET_WAIT_US, - HCLGEVF_RESET_WAIT_CNT); - else if (hdev->reset_type == HNAE3_VF_RESET) + if (hdev->reset_type == HNAE3_VF_RESET) ret = readl_poll_timeout(hdev->hw.io_base + HCLGEVF_VF_RST_ING, val, !(val & HCLGEVF_VF_RST_ING_BIT), @@ -1523,7 +1492,8 @@ static int hclgevf_reset_stack(struct hclgevf_dev *hdev) /* clear handshake status with IMP */ hclgevf_reset_handshake(hdev, false); - return 0; + /* bring up the nic to enable TX/RX again */ + return hclgevf_notify_client(hdev, HNAE3_UP_CLIENT); } static int hclgevf_reset_prepare_wait(struct hclgevf_dev *hdev) @@ -1532,18 +1502,10 @@ static int hclgevf_reset_prepare_wait(struct hclgevf_dev *hdev) int ret = 0; - switch (hdev->reset_type) { - case HNAE3_VF_FUNC_RESET: + if (hdev->reset_type == HNAE3_VF_FUNC_RESET) { ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_RESET, 0, NULL, 0, true, NULL, sizeof(u8)); hdev->rst_stats.vf_func_rst_cnt++; - break; - case HNAE3_FLR_RESET: - set_bit(HNAE3_FLR_DOWN, &hdev->flr_state); - hdev->rst_stats.flr_rst_cnt++; - break; - default: - break; } set_bit(HCLGEVF_STATE_CMD_DISABLE, &hdev->state); @@ -1603,7 +1565,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) } } -static int hclgevf_reset(struct hclgevf_dev *hdev) +static int hclgevf_reset_prepare(struct hclgevf_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); int ret; @@ -1613,62 +1575,64 @@ static int hclgevf_reset(struct hclgevf_dev *hdev) */ ae_dev->reset_type = hdev->reset_type; hdev->rst_stats.rst_cnt++; - rtnl_lock(); + rtnl_lock(); /* bring down the nic to stop any ongoing TX/RX */ ret = hclgevf_notify_client(hdev, HNAE3_DOWN_CLIENT); - if (ret) - goto err_reset_lock; - rtnl_unlock(); - - ret = hclgevf_reset_prepare_wait(hdev); if (ret) - goto err_reset; + return ret; - /* check if VF could successfully fetch the hardware reset completion - * status from the hardware - */ - ret = hclgevf_reset_wait(hdev); - if (ret) { - /* can't do much in this situation, will disable VF */ - dev_err(&hdev->pdev->dev, - "VF failed(=%d) to fetch H/W reset completion status\n", - ret); - goto err_reset; - } + return hclgevf_reset_prepare_wait(hdev); +} + +static int hclgevf_reset_rebuild(struct hclgevf_dev *hdev) +{ + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + int ret; hdev->rst_stats.hw_rst_done_cnt++; rtnl_lock(); - /* now, re-initialize the nic client and ae device */ ret = hclgevf_reset_stack(hdev); + rtnl_unlock(); if (ret) { dev_err(&hdev->pdev->dev, "failed to reset VF stack\n"); - goto err_reset_lock; + return ret; } - /* bring up the nic to enable TX/RX again */ - ret = hclgevf_notify_client(hdev, HNAE3_UP_CLIENT); - if (ret) - goto err_reset_lock; - - rtnl_unlock(); - hdev->last_reset_time = jiffies; ae_dev->reset_type = HNAE3_NONE_RESET; hdev->rst_stats.rst_done_cnt++; hdev->rst_stats.rst_fail_cnt = 0; clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); - return ret; -err_reset_lock: - rtnl_unlock(); + return 0; +} + +static void hclgevf_reset(struct hclgevf_dev *hdev) +{ + if (hclgevf_reset_prepare(hdev)) + goto err_reset; + + /* check if VF could successfully fetch the hardware reset completion + * status from the hardware + */ + if (hclgevf_reset_wait(hdev)) { + /* can't do much in this situation, will disable VF */ + dev_err(&hdev->pdev->dev, + "failed to fetch H/W reset completion status\n"); + goto err_reset; + } + + if (hclgevf_reset_rebuild(hdev)) + goto err_reset; + + return; + err_reset: hclgevf_reset_err_handle(hdev); - - return ret; } static enum hnae3_reset_type hclgevf_get_reset_level(struct hclgevf_dev *hdev, @@ -1731,25 +1695,60 @@ static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev, set_bit(rst_type, &hdev->default_reset_request); } +static void hclgevf_enable_vector(struct hclgevf_misc_vector *vector, bool en) +{ + writel(en ? 1 : 0, vector->addr); +} + static void hclgevf_flr_prepare(struct hnae3_ae_dev *ae_dev) { -#define HCLGEVF_FLR_WAIT_MS 100 -#define HCLGEVF_FLR_WAIT_CNT 50 +#define HCLGEVF_FLR_RETRY_WAIT_MS 500 +#define HCLGEVF_FLR_RETRY_CNT 5 + struct hclgevf_dev *hdev = ae_dev->priv; - int cnt = 0; + int retry_cnt = 0; + int ret; + +retry: + down(&hdev->reset_sem); + set_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + hdev->reset_type = HNAE3_FLR_RESET; + ret = hclgevf_reset_prepare(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, "fail to prepare FLR, ret=%d\n", + ret); + if (hdev->reset_pending || + retry_cnt++ < HCLGEVF_FLR_RETRY_CNT) { + dev_err(&hdev->pdev->dev, + "reset_pending:0x%lx, retry_cnt:%d\n", + hdev->reset_pending, retry_cnt); + clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); + msleep(HCLGEVF_FLR_RETRY_WAIT_MS); + goto retry; + } + } - clear_bit(HNAE3_FLR_DOWN, &hdev->flr_state); - clear_bit(HNAE3_FLR_DONE, &hdev->flr_state); - set_bit(HNAE3_FLR_RESET, &hdev->default_reset_request); - hclgevf_reset_event(hdev->pdev, NULL); + /* disable misc vector before FLR done */ + hclgevf_enable_vector(&hdev->misc_vector, false); + hdev->rst_stats.flr_rst_cnt++; +} - while (!test_bit(HNAE3_FLR_DOWN, &hdev->flr_state) && - cnt++ < HCLGEVF_FLR_WAIT_CNT) - msleep(HCLGEVF_FLR_WAIT_MS); +static void hclgevf_flr_done(struct hnae3_ae_dev *ae_dev) +{ + struct hclgevf_dev *hdev = ae_dev->priv; + int ret; - if (!test_bit(HNAE3_FLR_DOWN, &hdev->flr_state)) - dev_err(&hdev->pdev->dev, - "flr wait down timeout: %d\n", cnt); + hclgevf_enable_vector(&hdev->misc_vector, true); + + ret = hclgevf_reset_rebuild(hdev); + if (ret) + dev_warn(&hdev->pdev->dev, "fail to rebuild, ret=%d\n", + ret); + + hdev->reset_type = HNAE3_NONE_RESET; + clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); } static u32 hclgevf_get_fw_version(struct hnae3_handle *handle) @@ -1802,13 +1801,11 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) { #define HCLGEVF_MAX_RESET_ATTEMPTS_CNT 3 - int ret; - if (!test_and_clear_bit(HCLGEVF_STATE_RST_SERVICE_SCHED, &hdev->state)) return; - if (test_and_set_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state)) - return; + down(&hdev->reset_sem); + set_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); if (test_and_clear_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state)) { @@ -1822,12 +1819,8 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) hdev->last_reset_time = jiffies; while ((hdev->reset_type = hclgevf_get_reset_level(hdev, &hdev->reset_pending)) - != HNAE3_NONE_RESET) { - ret = hclgevf_reset(hdev); - if (ret) - dev_err(&hdev->pdev->dev, - "VF stack reset failed %d.\n", ret); - } + != HNAE3_NONE_RESET) + hclgevf_reset(hdev); } else if (test_and_clear_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state)) { /* we could be here when either of below happens: @@ -1868,7 +1861,9 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) hclgevf_reset_task_schedule(hdev); } + hdev->reset_type = HNAE3_NONE_RESET; clear_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state); + up(&hdev->reset_sem); } static void hclgevf_mailbox_service_task(struct hclgevf_dev *hdev) @@ -2012,11 +2007,6 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, return HCLGEVF_VECTOR0_EVENT_OTHER; } -static void hclgevf_enable_vector(struct hclgevf_misc_vector *vector, bool en) -{ - writel(en ? 1 : 0, vector->addr); -} - static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) { enum hclgevf_evt_cause event_cause; @@ -2291,6 +2281,7 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task); mutex_init(&hdev->mbx_resp.mbx_mutex); + sema_init(&hdev->reset_sem, 1); /* bring the device down */ set_bit(HCLGEVF_STATE_DOWN, &hdev->state); @@ -2376,8 +2367,10 @@ static int hclgevf_misc_irq_init(struct hclgevf_dev *hdev) hclgevf_get_misc_vector(hdev); + snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", + HCLGEVF_NAME, pci_name(hdev->pdev)); ret = request_irq(hdev->misc_vector.vector_irq, hclgevf_misc_irq_handle, - 0, "hclgevf_cmd", hdev); + 0, hdev->misc_vector.name, hdev); if (ret) { dev_err(&hdev->pdev->dev, "VF failed to request misc irq(%d)\n", hdev->misc_vector.vector_irq); @@ -2604,11 +2597,11 @@ static int hclgevf_query_vf_resource(struct hclgevf_dev *hdev) if (hnae3_dev_roce_supported(hdev)) { hdev->roce_base_msix_offset = - hnae3_get_field(__le16_to_cpu(req->msixcap_localid_ba_rocee), + hnae3_get_field(le16_to_cpu(req->msixcap_localid_ba_rocee), HCLGEVF_MSIX_OFT_ROCEE_M, HCLGEVF_MSIX_OFT_ROCEE_S); hdev->num_roce_msix = - hnae3_get_field(__le16_to_cpu(req->vf_intr_vector_number), + hnae3_get_field(le16_to_cpu(req->vf_intr_vector_number), HCLGEVF_VEC_NUM_M, HCLGEVF_VEC_NUM_S); /* nic's msix numbers is always equals to the roce's. */ @@ -2621,7 +2614,7 @@ static int hclgevf_query_vf_resource(struct hclgevf_dev *hdev) hdev->roce_base_msix_offset; } else { hdev->num_msi = - hnae3_get_field(__le16_to_cpu(req->vf_intr_vector_number), + hnae3_get_field(le16_to_cpu(req->vf_intr_vector_number), HCLGEVF_VEC_NUM_M, HCLGEVF_VEC_NUM_S); hdev->num_nic_msix = hdev->num_msi; @@ -2718,16 +2711,12 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) int ret; ret = hclgevf_pci_init(hdev); - if (ret) { - dev_err(&pdev->dev, "PCI initialization failed\n"); + if (ret) return ret; - } ret = hclgevf_cmd_queue_init(hdev); - if (ret) { - dev_err(&pdev->dev, "Cmd queue init failed: %d\n", ret); + if (ret) goto err_cmd_queue_init; - } ret = hclgevf_cmd_init(hdev); if (ret) @@ -2735,11 +2724,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) /* Get vf resource */ ret = hclgevf_query_vf_resource(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "Query vf status error, ret = %d.\n", ret); + if (ret) goto err_cmd_init; - } ret = hclgevf_init_msi(hdev); if (ret) { @@ -2749,13 +2735,11 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) hclgevf_state_init(hdev); hdev->reset_level = HNAE3_VF_FUNC_RESET; + hdev->reset_type = HNAE3_NONE_RESET; ret = hclgevf_misc_irq_init(hdev); - if (ret) { - dev_err(&pdev->dev, "failed(%d) to init Misc IRQ(vector0)\n", - ret); + if (ret) goto err_misc_irq_init; - } set_bit(HCLGEVF_STATE_IRQ_INITED, &hdev->state); @@ -2772,10 +2756,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) } ret = hclgevf_set_handle_info(hdev); - if (ret) { - dev_err(&pdev->dev, "failed(%d) to set handle info\n", ret); + if (ret) goto err_config; - } ret = hclgevf_config_gro(hdev, true); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 003114f6db6c..fee8d97f323c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -221,6 +221,7 @@ struct hclgevf_rss_cfg { struct hclgevf_misc_vector { u8 __iomem *addr; int vector_irq; + char name[HNAE3_INT_NAME_LEN]; }; struct hclgevf_rst_stats { @@ -252,6 +253,7 @@ struct hclgevf_dev { unsigned long reset_state; /* requested, pending */ struct hclgevf_rst_stats rst_stats; u32 reset_attempts; + struct semaphore reset_sem; /* protect reset process */ u32 fw_version; u16 num_tqps; /* num task queue pairs of this PF */ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 6c51b1bad8c4..37a2314d3e6b 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -185,13 +185,12 @@ struct e1000_phy_regs { /* board specific private data structure */ struct e1000_adapter { + struct timer_list watchdog_timer; struct timer_list phy_info_timer; struct timer_list blink_timer; struct work_struct reset_task; - struct delayed_work watchdog_task; - - struct workqueue_struct *e1000_workqueue; + struct work_struct watchdog_task; const struct e1000_info *ei; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 8797913b2702..db4ea58bac82 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1780,8 +1780,7 @@ static irqreturn_t e1000_intr_msi(int __always_unused irq, void *data) } /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) - mod_delayed_work(adapter->e1000_workqueue, - &adapter->watchdog_task, HZ); + mod_timer(&adapter->watchdog_timer, jiffies + 1); } /* Reset on uncorrectable ECC error */ @@ -1861,8 +1860,7 @@ static irqreturn_t e1000_intr(int __always_unused irq, void *data) } /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) - mod_delayed_work(adapter->e1000_workqueue, - &adapter->watchdog_task, HZ); + mod_timer(&adapter->watchdog_timer, jiffies + 1); } /* Reset on uncorrectable ECC error */ @@ -1907,8 +1905,7 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) hw->mac.get_link_status = true; /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) - mod_delayed_work(adapter->e1000_workqueue, - &adapter->watchdog_task, HZ); + mod_timer(&adapter->watchdog_timer, jiffies + 1); } if (!test_bit(__E1000_DOWN, &adapter->state)) @@ -4284,6 +4281,7 @@ void e1000e_down(struct e1000_adapter *adapter, bool reset) napi_synchronize(&adapter->napi); + del_timer_sync(&adapter->watchdog_timer); del_timer_sync(&adapter->phy_info_timer); spin_lock(&adapter->stats64_lock); @@ -5156,11 +5154,25 @@ static void e1000e_check_82574_phy_workaround(struct e1000_adapter *adapter) } } +/** + * e1000_watchdog - Timer Call-back + * @data: pointer to adapter cast into an unsigned long + **/ +static void e1000_watchdog(struct timer_list *t) +{ + struct e1000_adapter *adapter = from_timer(adapter, t, watchdog_timer); + + /* Do the rest outside of interrupt context */ + schedule_work(&adapter->watchdog_task); + + /* TODO: make this use queue_delayed_work() */ +} + static void e1000_watchdog_task(struct work_struct *work) { struct e1000_adapter *adapter = container_of(work, struct e1000_adapter, - watchdog_task.work); + watchdog_task); struct net_device *netdev = adapter->netdev; struct e1000_mac_info *mac = &adapter->hw.mac; struct e1000_phy_info *phy = &adapter->hw.phy; @@ -5408,9 +5420,8 @@ link_up: /* Reset the timer */ if (!test_bit(__E1000_DOWN, &adapter->state)) - queue_delayed_work(adapter->e1000_workqueue, - &adapter->watchdog_task, - round_jiffies(2 * HZ)); + mod_timer(&adapter->watchdog_timer, + round_jiffies(jiffies + 2 * HZ)); } #define E1000_TX_FLAGS_CSUM 0x00000001 @@ -7450,21 +7461,11 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_eeprom; } - adapter->e1000_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, - e1000e_driver_name); - - if (!adapter->e1000_workqueue) { - err = -ENOMEM; - goto err_workqueue; - } - - INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog_task); - queue_delayed_work(adapter->e1000_workqueue, &adapter->watchdog_task, - 0); - + timer_setup(&adapter->watchdog_timer, e1000_watchdog, 0); timer_setup(&adapter->phy_info_timer, e1000_update_phy_info, 0); INIT_WORK(&adapter->reset_task, e1000_reset_task); + INIT_WORK(&adapter->watchdog_task, e1000_watchdog_task); INIT_WORK(&adapter->downshift_task, e1000e_downshift_workaround); INIT_WORK(&adapter->update_phy_task, e1000e_update_phy_task); INIT_WORK(&adapter->print_hang_task, e1000_print_hw_hang); @@ -7558,9 +7559,6 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_register: - flush_workqueue(adapter->e1000_workqueue); - destroy_workqueue(adapter->e1000_workqueue); -err_workqueue: if (!(adapter->flags & FLAG_HAS_AMT)) e1000e_release_hw_control(adapter); err_eeprom: @@ -7605,17 +7603,15 @@ static void e1000_remove(struct pci_dev *pdev) * from being rescheduled. */ set_bit(__E1000_DOWN, &adapter->state); + del_timer_sync(&adapter->watchdog_timer); del_timer_sync(&adapter->phy_info_timer); cancel_work_sync(&adapter->reset_task); + cancel_work_sync(&adapter->watchdog_task); cancel_work_sync(&adapter->downshift_task); cancel_work_sync(&adapter->update_phy_task); cancel_work_sync(&adapter->print_hang_task); - cancel_delayed_work(&adapter->watchdog_task); - flush_workqueue(adapter->e1000_workqueue); - destroy_workqueue(adapter->e1000_workqueue); - if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) { cancel_work_sync(&adapter->tx_hwtstamp_work); if (adapter->tx_hwtstamp_skb) { diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index ba2566e2123d..0637ccadee79 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -696,21 +696,24 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) /** * fm10k_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure + * @txqueue: the index of the Tx queue that timed out **/ static void fm10k_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct fm10k_intfc *interface = netdev_priv(netdev); + struct fm10k_ring *tx_ring; bool real_tx_hang = false; - int i; - -#define TX_TIMEO_LIMIT 16000 - for (i = 0; i < interface->num_tx_queues; i++) { - struct fm10k_ring *tx_ring = interface->tx_ring[i]; - if (check_for_tx_hang(tx_ring) && fm10k_check_tx_hang(tx_ring)) - real_tx_hang = true; + if (txqueue >= interface->num_tx_queues) { + WARN(1, "invalid Tx queue index %d", txqueue); + return; } + tx_ring = interface->tx_ring[txqueue]; + if (check_for_tx_hang(tx_ring) && fm10k_check_tx_hang(tx_ring)) + real_tx_hang = true; + +#define TX_TIMEO_LIMIT 16000 if (real_tx_hang) { fm10k_tx_timeout_reset(interface); } else { diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index 9f0a4e92a231..37514a75f928 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -536,6 +536,11 @@ static void i40e_set_hw_flags(struct i40e_hw *hw) (aq->api_maj_ver == 1 && aq->api_min_ver >= I40E_MINOR_VER_FW_LLDP_STOPPABLE_X722)) hw->flags |= I40E_HW_FLAG_FW_LLDP_STOPPABLE; + + if (aq->api_maj_ver > 1 || + (aq->api_maj_ver == 1 && + aq->api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_X722)) + hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE; /* fall through */ default: break; diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index d4055037af89..45b90eb11adb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1113,7 +1113,7 @@ i40e_status i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num, */ pba_size--; if (pba_num_size < (((u32)pba_size * 2) + 1)) { - hw_dbg(hw, "Buffer to small for PBA data.\n"); + hw_dbg(hw, "Buffer too small for PBA data.\n"); return I40E_ERR_PARAM; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 33912cf964eb..8c3e753bfb9d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -307,37 +307,18 @@ static void i40e_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct i40e_vsi *vsi = np->vsi; struct i40e_pf *pf = vsi->back; struct i40e_ring *tx_ring = NULL; - unsigned int i, hung_queue = 0; + unsigned int i; u32 head, val; pf->tx_timeout_count++; - /* find the stopped queue the same way the stack does */ - for (i = 0; i < netdev->num_tx_queues; i++) { - struct netdev_queue *q; - unsigned long trans_start; - - q = netdev_get_tx_queue(netdev, i); - trans_start = q->trans_start; - if (netif_xmit_stopped(q) && - time_after(jiffies, - (trans_start + netdev->watchdog_timeo))) { - hung_queue = i; - break; - } - } - - if (i == netdev->num_tx_queues) { - netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); - } else { - /* now that we have an index, find the tx_ring struct */ - for (i = 0; i < vsi->num_queue_pairs; i++) { - if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) { - if (hung_queue == - vsi->tx_rings[i]->queue_index) { - tx_ring = vsi->tx_rings[i]; - break; - } + /* with txqueue index, find the tx_ring struct */ + for (i = 0; i < vsi->num_queue_pairs; i++) { + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) { + if (txqueue == + vsi->tx_rings[i]->queue_index) { + tx_ring = vsi->tx_rings[i]; + break; } } } @@ -363,14 +344,14 @@ static void i40e_tx_timeout(struct net_device *netdev, unsigned int txqueue) val = rd32(&pf->hw, I40E_PFINT_DYN_CTL0); netdev_info(netdev, "tx_timeout: VSI_seid: %d, Q %d, NTC: 0x%x, HWB: 0x%x, NTU: 0x%x, TAIL: 0x%x, INT: 0x%x\n", - vsi->seid, hung_queue, tx_ring->next_to_clean, + vsi->seid, txqueue, tx_ring->next_to_clean, head, tx_ring->next_to_use, readl(tx_ring->tail), val); } pf->tx_timeout_last_recovery = jiffies; - netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n", - pf->tx_timeout_recovery_level, hung_queue); + netdev_info(netdev, "tx_timeout recovery level %d, txqueue %d\n", + pf->tx_timeout_recovery_level, txqueue); switch (pf->tx_timeout_recovery_level) { case 1: diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 6a3f0fc56c3b..69523ac85639 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2322,6 +2322,22 @@ static int i40e_ctrl_vf_rx_rings(struct i40e_vsi *vsi, unsigned long q_map, } /** + * i40e_vc_validate_vqs_bitmaps - validate Rx/Tx queue bitmaps from VIRTHCHNL + * @vqs: virtchnl_queue_select structure containing bitmaps to validate + * + * Returns true if validation was successful, else false. + */ +static bool i40e_vc_validate_vqs_bitmaps(struct virtchnl_queue_select *vqs) +{ + if ((!vqs->rx_queues && !vqs->tx_queues) || + vqs->rx_queues >= BIT(I40E_MAX_VF_QUEUES) || + vqs->tx_queues >= BIT(I40E_MAX_VF_QUEUES)) + return false; + + return true; +} + +/** * i40e_vc_enable_queues_msg * @vf: pointer to the VF info * @msg: pointer to the msg buffer @@ -2346,7 +2362,7 @@ static int i40e_vc_enable_queues_msg(struct i40e_vf *vf, u8 *msg) goto error_param; } - if ((0 == vqs->rx_queues) && (0 == vqs->tx_queues)) { + if (i40e_vc_validate_vqs_bitmaps(vqs)) { aq_ret = I40E_ERR_PARAM; goto error_param; } @@ -2408,9 +2424,7 @@ static int i40e_vc_disable_queues_msg(struct i40e_vf *vf, u8 *msg) goto error_param; } - if ((vqs->rx_queues == 0 && vqs->tx_queues == 0) || - vqs->rx_queues > I40E_MAX_VF_QUEUES || - vqs->tx_queues > I40E_MAX_VF_QUEUES) { + if (i40e_vc_validate_vqs_bitmaps(vqs)) { aq_ret = I40E_ERR_PARAM; goto error_param; } diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 29de3ae96ef2..bd1b1ed323f4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -415,4 +415,6 @@ void iavf_enable_channels(struct iavf_adapter *adapter); void iavf_disable_channels(struct iavf_adapter *adapter); void iavf_add_cloud_filter(struct iavf_adapter *adapter); void iavf_del_cloud_filter(struct iavf_adapter *adapter); +struct iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, + const u8 *macaddr); #endif /* _IAVF_H_ */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 0a8824871618..62fe56ddcb6e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -743,9 +743,8 @@ iavf_mac_filter *iavf_find_filter(struct iavf_adapter *adapter, * * Returns ptr to the filter object or NULL when no memory available. **/ -static struct -iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, - const u8 *macaddr) +struct iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, + const u8 *macaddr) { struct iavf_mac_filter *f; @@ -2065,9 +2064,9 @@ static void iavf_reset_task(struct work_struct *work) struct virtchnl_vf_resource *vfres = adapter->vf_res; struct net_device *netdev = adapter->netdev; struct iavf_hw *hw = &adapter->hw; + struct iavf_mac_filter *f, *ftmp; struct iavf_vlan_filter *vlf; struct iavf_cloud_filter *cf; - struct iavf_mac_filter *f; u32 reg_val; int i = 0, err; bool running; @@ -2181,6 +2180,16 @@ continue_reset: spin_lock_bh(&adapter->mac_vlan_list_lock); + /* Delete filter for the current MAC address, it could have + * been changed by the PF via administratively set MAC. + * Will be re-added via VIRTCHNL_OP_GET_VF_RESOURCES. + */ + list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) { + if (ether_addr_equal(f->macaddr, adapter->hw.mac.addr)) { + list_del(&f->list); + kfree(f); + } + } /* re-add all MAC filters */ list_for_each_entry(f, &adapter->mac_filter_list, list) { f->add = true; diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index c46770eba320..1ab9cb339acb 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1359,6 +1359,9 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, ether_addr_copy(netdev->perm_addr, adapter->hw.mac.addr); } + spin_lock_bh(&adapter->mac_vlan_list_lock); + iavf_add_filter(adapter, adapter->hw.mac.addr); + spin_unlock_bh(&adapter->mac_vlan_list_lock); iavf_process_config(adapter); } break; diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 7cb829132d28..59544b0fc086 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -17,7 +17,8 @@ ice-y := ice_main.o \ ice_lib.o \ ice_txrx_lib.o \ ice_txrx.o \ - ice_flex_pipe.o \ + ice_flex_pipe.o \ + ice_flow.o \ ice_ethtool.o ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index f972dce8aebb..cb10abb14e11 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -174,6 +174,8 @@ struct ice_sw { struct ice_pf *pf; u16 sw_id; /* switch ID for this switch */ u16 bridge_mode; /* VEB/VEPA/Port Virtualizer */ + struct ice_vsi *dflt_vsi; /* default VSI for this switch */ + u8 dflt_vsi_ena:1; /* true if above dflt_vsi is enabled */ }; enum ice_state { @@ -275,6 +277,7 @@ struct ice_vsi { u8 current_isup:1; /* Sync 'link up' logging */ u8 stat_offsets_loaded:1; u8 vlan_ena:1; + u16 num_vlan; /* queue information */ u8 tx_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */ @@ -462,12 +465,13 @@ static inline void ice_set_ring_xdp(struct ice_ring *ring) static inline struct xdp_umem *ice_xsk_umem(struct ice_ring *ring) { struct xdp_umem **umems = ring->vsi->xsk_umems; - int qid = ring->q_index; + u16 qid = ring->q_index; if (ice_ring_is_xdp(ring)) qid -= ring->vsi->num_xdp_txq; - if (!umems || !umems[qid] || !ice_is_xdp_ena_vsi(ring->vsi)) + if (qid >= ring->vsi->num_xsk_umems || !umems || !umems[qid] || + !ice_is_xdp_ena_vsi(ring->vsi)) return NULL; return umems[qid]; diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 5421fc413f94..4459bc564b11 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -232,6 +232,13 @@ struct ice_aqc_get_sw_cfg_resp { */ #define ICE_AQC_RES_TYPE_VSI_LIST_REP 0x03 #define ICE_AQC_RES_TYPE_VSI_LIST_PRUNE 0x04 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID 0x60 +#define ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM 0x61 + +#define ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM BIT(12) +#define ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX BIT(13) + +#define ICE_AQC_RES_TYPE_FLAG_DEDICATED 0x00 /* Allocate Resources command (indirect 0x0208) * Free Resources command (indirect 0x0209) @@ -1849,6 +1856,7 @@ enum ice_adminq_opc { /* package commands */ ice_aqc_opc_download_pkg = 0x0C40, + ice_aqc_opc_update_pkg = 0x0C42, ice_aqc_opc_get_pkg_info_list = 0x0C43, /* debug commands */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 77d6a0291e97..d8e975cceb21 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -93,7 +93,8 @@ static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena) * @vsi: the VSI being configured * @v_idx: index of the vector in the VSI struct * - * We allocate one q_vector. If allocation fails we return -ENOMEM. + * We allocate one q_vector and set default value for ITR setting associated + * with this q_vector. If allocation fails we return -ENOMEM. */ static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx) { @@ -108,6 +109,8 @@ static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx) q_vector->vsi = vsi; q_vector->v_idx = v_idx; + q_vector->tx.itr_setting = ICE_DFLT_TX_ITR; + q_vector->rx.itr_setting = ICE_DFLT_RX_ITR; if (vsi->type == ICE_VSI_VF) goto out; /* only set affinity_mask if the CPU is online */ @@ -299,6 +302,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (ring->vsi->type == ICE_VSI_PF) { if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index); @@ -323,7 +327,9 @@ int ice_setup_rx_ctx(struct ice_ring *ring) dev_info(&vsi->back->pdev->dev, "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", ring->q_index); } else { + ring->zca.free = NULL; if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index); @@ -674,10 +680,6 @@ void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector) if (q_vector->num_ring_rx) { struct ice_ring_container *rc = &q_vector->rx; - /* if this value is set then don't overwrite with default */ - if (!rc->itr_setting) - rc->itr_setting = ICE_DFLT_RX_ITR; - rc->target_itr = ITR_TO_REG(rc->itr_setting); rc->next_update = jiffies + 1; rc->current_itr = rc->target_itr; @@ -688,10 +690,6 @@ void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector) if (q_vector->num_ring_tx) { struct ice_ring_container *rc = &q_vector->tx; - /* if this value is set then don't overwrite with default */ - if (!rc->itr_setting) - rc->itr_setting = ICE_DFLT_TX_ITR; - rc->target_itr = ITR_TO_REG(rc->itr_setting); rc->next_update = jiffies + 1; rc->current_itr = rc->target_itr; diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index fb1d930470c7..0207e28c2682 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -4,28 +4,10 @@ #include "ice_common.h" #include "ice_sched.h" #include "ice_adminq_cmd.h" +#include "ice_flow.h" #define ICE_PF_RESET_WAIT_COUNT 200 -#define ICE_PROG_FLEX_ENTRY(hw, rxdid, mdid, idx) \ - wr32((hw), GLFLXP_RXDID_FLX_WRD_##idx(rxdid), \ - ((ICE_RX_OPC_MDID << \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_M) | \ - (((mdid) << GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_S) & \ - GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_M)) - -#define ICE_PROG_FLG_ENTRY(hw, rxdid, flg_0, flg_1, flg_2, flg_3, idx) \ - wr32((hw), GLFLXP_RXDID_FLAGS(rxdid, idx), \ - (((flg_0) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) | \ - (((flg_1) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M) | \ - (((flg_2) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M) | \ - (((flg_3) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S) & \ - GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M)) - /** * ice_set_mac_type - Sets MAC type * @hw: pointer to the HW structure @@ -348,88 +330,6 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse, } /** - * ice_init_flex_flags - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID - * - * Function to initialize Rx flex flags - */ -static void ice_init_flex_flags(struct ice_hw *hw, enum ice_rxdid prof_id) -{ - u8 idx = 0; - - /* Flex-flag fields (0-2) are programmed with FLG64 bits with layout: - * flexiflags0[5:0] - TCP flags, is_packet_fragmented, is_packet_UDP_GRE - * flexiflags1[3:0] - Not used for flag programming - * flexiflags2[7:0] - Tunnel and VLAN types - * 2 invalid fields in last index - */ - switch (prof_id) { - /* Rx flex flags are currently programmed for the NIC profiles only. - * Different flag bit programming configurations can be added per - * profile as needed. - */ - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_FRG, - ICE_FLG_UDP_GRE, ICE_FLG_PKT_DSI, - ICE_FLG_FIN, idx++); - /* flex flag 1 is not used for flexi-flag programming, skipping - * these four FLG64 bits. - */ - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_SYN, ICE_FLG_RST, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_DSI, - ICE_FLG_PKT_DSI, ICE_FLG_EVLAN_x8100, - ICE_FLG_EVLAN_x9100, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_VLAN_x8100, - ICE_FLG_TNL_VLAN, ICE_FLG_TNL_MAC, - ICE_FLG_TNL0, idx++); - ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_TNL1, ICE_FLG_TNL2, - ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx); - break; - - default: - ice_debug(hw, ICE_DBG_INIT, - "Flag programming for profile ID %d not supported\n", - prof_id); - } -} - -/** - * ice_init_flex_flds - * @hw: pointer to the hardware structure - * @prof_id: Rx Descriptor Builder profile ID - * - * Function to initialize flex descriptors - */ -static void ice_init_flex_flds(struct ice_hw *hw, enum ice_rxdid prof_id) -{ - enum ice_flex_rx_mdid mdid; - - switch (prof_id) { - case ICE_RXDID_FLEX_NIC: - case ICE_RXDID_FLEX_NIC_2: - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_LOW, 0); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_HIGH, 1); - ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_FLOW_ID_LOWER, 2); - - mdid = (prof_id == ICE_RXDID_FLEX_NIC_2) ? - ICE_RX_MDID_SRC_VSI : ICE_RX_MDID_FLOW_ID_HIGH; - - ICE_PROG_FLEX_ENTRY(hw, prof_id, mdid, 3); - - ice_init_flex_flags(hw, prof_id); - break; - - default: - ice_debug(hw, ICE_DBG_INIT, - "Field init for profile ID %d not supported\n", - prof_id); - } -} - -/** * ice_init_fltr_mgmt_struct - initializes filter management list and locks * @hw: pointer to the HW struct */ @@ -882,9 +782,6 @@ enum ice_status ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_fltr_mgmt_struct; - - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC); - ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC_2); status = ice_init_hw_tbls(hw); if (status) goto err_unroll_fltr_mgmt_struct; @@ -1601,6 +1498,114 @@ void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res) } /** + * ice_aq_alloc_free_res - command to allocate/free resources + * @hw: pointer to the HW struct + * @num_entries: number of resource entries in buffer + * @buf: Indirect buffer to hold data parameters and response + * @buf_size: size of buffer for indirect commands + * @opc: pass in the command opcode + * @cd: pointer to command details structure or NULL + * + * Helper function to allocate/free resources using the admin queue commands + */ +enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd) +{ + struct ice_aqc_alloc_free_res_cmd *cmd; + struct ice_aq_desc desc; + + cmd = &desc.params.sw_res_ctrl; + + if (!buf) + return ICE_ERR_PARAM; + + if (buf_size < (num_entries * sizeof(buf->elem[0]))) + return ICE_ERR_PARAM; + + ice_fill_dflt_direct_cmd_desc(&desc, opc); + + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + cmd->num_entries = cpu_to_le16(num_entries); + + return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); +} + +/** + * ice_alloc_hw_res - allocate resource + * @hw: pointer to the HW struct + * @type: type of resource + * @num: number of resources to allocate + * @btm: allocate from bottom + * @res: pointer to array that will receive the resources + */ +enum ice_status +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res) +{ + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(buf, elem, num - 1); + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer to allocate resource. */ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type | ICE_AQC_RES_TYPE_FLAG_DEDICATED | + ICE_AQC_RES_TYPE_FLAG_IGNORE_INDEX); + if (btm) + buf->res_type |= cpu_to_le16(ICE_AQC_RES_TYPE_FLAG_SCAN_BOTTOM); + + status = ice_aq_alloc_free_res(hw, 1, buf, buf_len, + ice_aqc_opc_alloc_res, NULL); + if (status) + goto ice_alloc_res_exit; + + memcpy(res, buf->elem, sizeof(buf->elem) * num); + +ice_alloc_res_exit: + kfree(buf); + return status; +} + +/** + * ice_free_hw_res - free allocated HW resource + * @hw: pointer to the HW struct + * @type: type of resource to free + * @num: number of resources + * @res: pointer to array that contains the resources to free + */ +enum ice_status +ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res) +{ + struct ice_aqc_alloc_free_res_elem *buf; + enum ice_status status; + u16 buf_len; + + buf_len = struct_size(buf, elem, num - 1); + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return ICE_ERR_NO_MEMORY; + + /* Prepare buffer to free resource. */ + buf->num_elems = cpu_to_le16(num); + buf->res_type = cpu_to_le16(type); + memcpy(buf->elem, res, sizeof(buf->elem) * num); + + status = ice_aq_alloc_free_res(hw, num, buf, buf_len, + ice_aqc_opc_free_res, NULL); + if (status) + ice_debug(hw, ICE_DBG_SW, "CQ CMD Buffer:\n"); + + kfree(buf); + return status; +} + +/** * ice_get_num_per_func - determine number of resources per PF * @hw: pointer to the HW structure * @max: value to be evenly split between each PF @@ -3510,7 +3515,10 @@ enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle) if (status) return status; } - + /* Replay per VSI all RSS configurations */ + status = ice_replay_rss_cfg(hw, vsi_handle); + if (status) + return status; /* Replay per VSI all filters */ status = ice_replay_vsi_all_fltr(hw, vsi_handle); return status; diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index b22aa561e253..b5c013fdaaf9 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -34,10 +34,18 @@ enum ice_status ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res, enum ice_aq_res_access_type access, u32 timeout); void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res); +enum ice_status +ice_alloc_hw_res(struct ice_hw *hw, u16 type, u16 num, bool btm, u16 *res); +enum ice_status +ice_free_hw_res(struct ice_hw *hw, u16 type, u16 num, u16 *res); enum ice_status ice_init_nvm(struct ice_hw *hw); enum ice_status ice_read_sr_buf(struct ice_hw *hw, u16 offset, u16 *words, u16 *data); enum ice_status +ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, + struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, + enum ice_adminq_opc opc, struct ice_sq_cd *cd); +enum ice_status ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, struct ice_aq_desc *desc, void *buf, u16 buf_size, struct ice_sq_cd *cd); diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index d3d3ec29def9..0664e5b8d130 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -396,6 +396,12 @@ dcb_error: prev_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; prev_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; memcpy(&prev_cfg->etsrec, &prev_cfg->etscfg, sizeof(prev_cfg->etsrec)); + /* Coverity warns the return code of ice_pf_dcb_cfg() is not checked + * here as is done for other calls to that function. That check is + * not necessary since this is in this function's error cleanup path. + * Suppress the Coverity warning with the following comment... + */ + /* coverity[check_return] */ ice_pf_dcb_cfg(pf, prev_cfg, false); kfree(prev_cfg); } diff --git a/drivers/net/ethernet/intel/ice/ice_devids.h b/drivers/net/ethernet/intel/ice/ice_devids.h index f8d5c661d0ba..ce63017c56c7 100644 --- a/drivers/net/ethernet/intel/ice/ice_devids.h +++ b/drivers/net/ethernet/intel/ice/ice_devids.h @@ -11,5 +11,23 @@ #define ICE_DEV_ID_E810C_QSFP 0x1592 /* Intel(R) Ethernet Controller E810-C for SFP */ #define ICE_DEV_ID_E810C_SFP 0x1593 +/* Intel(R) Ethernet Connection E822-C for backplane */ +#define ICE_DEV_ID_E822C_BACKPLANE 0x1890 +/* Intel(R) Ethernet Connection E822-C for QSFP */ +#define ICE_DEV_ID_E822C_QSFP 0x1891 +/* Intel(R) Ethernet Connection E822-C for SFP */ +#define ICE_DEV_ID_E822C_SFP 0x1892 +/* Intel(R) Ethernet Connection E822-C/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822C_10G_BASE_T 0x1893 +/* Intel(R) Ethernet Connection E822-C 1GbE */ +#define ICE_DEV_ID_E822C_SGMII 0x1894 +/* Intel(R) Ethernet Connection E822-X for backplane */ +#define ICE_DEV_ID_E822X_BACKPLANE 0x1897 +/* Intel(R) Ethernet Connection E822-L for SFP */ +#define ICE_DEV_ID_E822L_SFP 0x1898 +/* Intel(R) Ethernet Connection E822-L/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_E822L_10G_BASE_T 0x1899 +/* Intel(R) Ethernet Connection E822-L 1GbE */ +#define ICE_DEV_ID_E822L_SGMII 0x189A #endif /* _ICE_DEVIDS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 9ebd93e79aeb..90c6a3ca20c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4,6 +4,7 @@ /* ethtool support for ice */ #include "ice.h" +#include "ice_flow.h" #include "ice_lib.h" #include "ice_dcb_lib.h" @@ -283,12 +284,15 @@ out: */ static bool ice_active_vfs(struct ice_pf *pf) { - struct ice_vf *vf = pf->vf; int i; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) return true; + } + return false; } @@ -2531,6 +2535,243 @@ done: } /** + * ice_parse_hdrs - parses headers from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * header types for RSS configuration + */ +static u32 ice_parse_hdrs(struct ethtool_rxnfc *nfc) +{ + u32 hdrs = ICE_FLOW_SEG_HDR_NONE; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4; + break; + case UDP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4; + break; + case SCTP_V4_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4; + break; + case TCP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6; + break; + case UDP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6; + break; + case SCTP_V6_FLOW: + hdrs |= ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6; + break; + default: + break; + } + return hdrs; +} + +#define ICE_FLOW_HASH_FLD_IPV4_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) +#define ICE_FLOW_HASH_FLD_IPV6_SA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) +#define ICE_FLOW_HASH_FLD_IPV4_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) +#define ICE_FLOW_HASH_FLD_IPV6_DA BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) +#define ICE_FLOW_HASH_FLD_TCP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_TCP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT) +#define ICE_FLOW_HASH_FLD_UDP_SRC_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_UDP_DST_PORT BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_SRC_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) +#define ICE_FLOW_HASH_FLD_SCTP_DST_PORT \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT) + +/** + * ice_parse_hash_flds - parses hash fields from RSS hash input + * @nfc: ethtool rxnfc command + * + * This function parses the rxnfc command and returns intended + * hash fields for RSS configuration + */ +static u64 ice_parse_hash_flds(struct ethtool_rxnfc *nfc) +{ + u64 hfld = ICE_HASH_INVALID; + + if (nfc->data & RXH_IP_SRC || nfc->data & RXH_IP_DST) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV4_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV4_DA; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_IP_SRC) + hfld |= ICE_FLOW_HASH_FLD_IPV6_SA; + if (nfc->data & RXH_IP_DST) + hfld |= ICE_FLOW_HASH_FLD_IPV6_DA; + break; + default: + break; + } + } + + if (nfc->data & RXH_L4_B_0_1 || nfc->data & RXH_L4_B_2_3) { + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_TCP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_TCP_DST_PORT; + break; + case UDP_V4_FLOW: + case UDP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_UDP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_UDP_DST_PORT; + break; + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + if (nfc->data & RXH_L4_B_0_1) + hfld |= ICE_FLOW_HASH_FLD_SCTP_SRC_PORT; + if (nfc->data & RXH_L4_B_2_3) + hfld |= ICE_FLOW_HASH_FLD_SCTP_DST_PORT; + break; + default: + break; + } + } + + return hfld; +} + +/** + * ice_set_rss_hash_opt - Enable/Disable flow types for RSS hash + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + * + * Returns Success if the flow input set is supported. + */ +static int +ice_set_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + u64 hashed_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hashed_flds = ice_parse_hash_flds(nfc); + if (hashed_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "Invalid hash fields, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return -EINVAL; + } + + status = ice_add_rss_cfg(&pf->hw, vsi->idx, hashed_flds, hdrs); + if (status) { + dev_dbg(dev, "ice_add_rss_cfg failed, vsi num = %d, error = %d\n", + vsi->vsi_num, status); + return -EINVAL; + } + + return 0; +} + +/** + * ice_get_rss_hash_opt - Retrieve hash fields for a given flow-type + * @vsi: the VSI being configured + * @nfc: ethtool rxnfc command + */ +static void +ice_get_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc) +{ + struct ice_pf *pf = vsi->back; + struct device *dev; + u64 hash_flds; + u32 hdrs; + + dev = ice_pf_to_dev(pf); + + nfc->data = 0; + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hdrs = ice_parse_hdrs(nfc); + if (hdrs == ICE_FLOW_SEG_HDR_NONE) { + dev_dbg(dev, "Header type is not valid, vsi num = %d\n", + vsi->vsi_num); + return; + } + + hash_flds = ice_get_rss_cfg(&pf->hw, vsi->idx, hdrs); + if (hash_flds == ICE_HASH_INVALID) { + dev_dbg(dev, "No hash fields found for the given header type, vsi num = %d\n", + vsi->vsi_num); + return; + } + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_SA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_SA) + nfc->data |= (u64)RXH_IP_SRC; + + if (hash_flds & ICE_FLOW_HASH_FLD_IPV4_DA || + hash_flds & ICE_FLOW_HASH_FLD_IPV6_DA) + nfc->data |= (u64)RXH_IP_DST; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_SRC_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_SRC_PORT) + nfc->data |= (u64)RXH_L4_B_0_1; + + if (hash_flds & ICE_FLOW_HASH_FLD_TCP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_UDP_DST_PORT || + hash_flds & ICE_FLOW_HASH_FLD_SCTP_DST_PORT) + nfc->data |= (u64)RXH_L4_B_2_3; +} + +/** + * ice_set_rxnfc - command to set Rx flow rules. + * @netdev: network interface device structure + * @cmd: ethtool rxnfc command + * + * Returns 0 for success and negative values for errors + */ +static int ice_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_vsi *vsi = np->vsi; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + return ice_set_rss_hash_opt(vsi, cmd); + default: + break; + } + return -EOPNOTSUPP; +} + +/** * ice_get_rxnfc - command to get Rx flow classification rules * @netdev: network interface device structure * @cmd: ethtool rxnfc command @@ -2551,6 +2792,10 @@ ice_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, cmd->data = vsi->rss_size; ret = 0; break; + case ETHTOOL_GRXFH: + ice_get_rss_hash_opt(vsi, cmd); + ret = 0; + break; default: break; } @@ -3585,6 +3830,53 @@ ice_set_q_coalesce(struct ice_vsi *vsi, struct ethtool_coalesce *ec, int q_num) } /** + * ice_is_coalesce_param_invalid - check for unsupported coalesce parameters + * @netdev: pointer to the netdev associated with this query + * @ec: ethtool structure to fill with driver's coalesce settings + * + * Print netdev info if driver doesn't support one of the parameters + * and return error. When any parameters will be implemented, remove only + * this parameter from param array. + */ +static int +ice_is_coalesce_param_invalid(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + struct ice_ethtool_not_used { + u32 value; + const char *name; + } param[] = { + {ec->stats_block_coalesce_usecs, "stats-block-usecs"}, + {ec->rate_sample_interval, "sample-interval"}, + {ec->pkt_rate_low, "pkt-rate-low"}, + {ec->pkt_rate_high, "pkt-rate-high"}, + {ec->rx_max_coalesced_frames, "rx-frames"}, + {ec->rx_coalesce_usecs_irq, "rx-usecs-irq"}, + {ec->rx_max_coalesced_frames_irq, "rx-frames-irq"}, + {ec->tx_max_coalesced_frames, "tx-frames"}, + {ec->tx_coalesce_usecs_irq, "tx-usecs-irq"}, + {ec->tx_max_coalesced_frames_irq, "tx-frames-irq"}, + {ec->rx_coalesce_usecs_low, "rx-usecs-low"}, + {ec->rx_max_coalesced_frames_low, "rx-frames-low"}, + {ec->tx_coalesce_usecs_low, "tx-usecs-low"}, + {ec->tx_max_coalesced_frames_low, "tx-frames-low"}, + {ec->rx_max_coalesced_frames_high, "rx-frames-high"}, + {ec->tx_max_coalesced_frames_high, "tx-frames-high"} + }; + int i; + + for (i = 0; i < ARRAY_SIZE(param); i++) { + if (param[i].value) { + netdev_info(netdev, "Setting %s not supported\n", + param[i].name); + return -EINVAL; + } + } + + return 0; +} + +/** * __ice_set_coalesce - set ITR/INTRL values for the device * @netdev: pointer to the netdev associated with this query * @ec: ethtool structure to fill with driver's coalesce settings @@ -3600,6 +3892,9 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + if (ice_is_coalesce_param_invalid(netdev, ec)) + return -EINVAL; + if (q_num < 0) { int v_idx; @@ -3804,6 +4099,7 @@ static const struct ethtool_ops ice_ethtool_ops = { .set_priv_flags = ice_set_priv_flags, .get_sset_count = ice_get_sset_count, .get_rxnfc = ice_get_rxnfc, + .set_rxnfc = ice_set_rxnfc, .get_ringparam = ice_get_ringparam, .set_ringparam = ice_set_ringparam, .nway_reset = ice_nway_reset, diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c index cbd53b586c36..99208946224c 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c @@ -3,6 +3,87 @@ #include "ice_common.h" #include "ice_flex_pipe.h" +#include "ice_flow.h" + +static const u32 ice_sect_lkup[ICE_BLK_COUNT][ICE_SECT_COUNT] = { + /* SWITCH */ + { + ICE_SID_XLT0_SW, + ICE_SID_XLT_KEY_BUILDER_SW, + ICE_SID_XLT1_SW, + ICE_SID_XLT2_SW, + ICE_SID_PROFID_TCAM_SW, + ICE_SID_PROFID_REDIR_SW, + ICE_SID_FLD_VEC_SW, + ICE_SID_CDID_KEY_BUILDER_SW, + ICE_SID_CDID_REDIR_SW + }, + + /* ACL */ + { + ICE_SID_XLT0_ACL, + ICE_SID_XLT_KEY_BUILDER_ACL, + ICE_SID_XLT1_ACL, + ICE_SID_XLT2_ACL, + ICE_SID_PROFID_TCAM_ACL, + ICE_SID_PROFID_REDIR_ACL, + ICE_SID_FLD_VEC_ACL, + ICE_SID_CDID_KEY_BUILDER_ACL, + ICE_SID_CDID_REDIR_ACL + }, + + /* FD */ + { + ICE_SID_XLT0_FD, + ICE_SID_XLT_KEY_BUILDER_FD, + ICE_SID_XLT1_FD, + ICE_SID_XLT2_FD, + ICE_SID_PROFID_TCAM_FD, + ICE_SID_PROFID_REDIR_FD, + ICE_SID_FLD_VEC_FD, + ICE_SID_CDID_KEY_BUILDER_FD, + ICE_SID_CDID_REDIR_FD + }, + + /* RSS */ + { + ICE_SID_XLT0_RSS, + ICE_SID_XLT_KEY_BUILDER_RSS, + ICE_SID_XLT1_RSS, + ICE_SID_XLT2_RSS, + ICE_SID_PROFID_TCAM_RSS, + ICE_SID_PROFID_REDIR_RSS, + ICE_SID_FLD_VEC_RSS, + ICE_SID_CDID_KEY_BUILDER_RSS, + ICE_SID_CDID_REDIR_RSS + }, + + /* PE */ + { + ICE_SID_XLT0_PE, + ICE_SID_XLT_KEY_BUILDER_PE, + ICE_SID_XLT1_PE, + ICE_SID_XLT2_PE, + ICE_SID_PROFID_TCAM_PE, + ICE_SID_PROFID_REDIR_PE, + ICE_SID_FLD_VEC_PE, + ICE_SID_CDID_KEY_BUILDER_PE, + ICE_SID_CDID_REDIR_PE + } +}; + +/** + * ice_sect_id - returns section ID + * @blk: block type + * @sect: section type + * + * This helper function returns the proper section ID given a block type and a + * section type. + */ +static u32 ice_sect_id(enum ice_block blk, enum ice_sect sect) +{ + return ice_sect_lkup[blk][sect]; +} /** * ice_pkg_val_buf @@ -158,6 +239,176 @@ ice_pkg_enum_section(struct ice_seg *ice_seg, struct ice_pkg_enum *state, return state->sect; } +/* Key creation */ + +#define ICE_DC_KEY 0x1 /* don't care */ +#define ICE_DC_KEYINV 0x1 +#define ICE_NM_KEY 0x0 /* never match */ +#define ICE_NM_KEYINV 0x0 +#define ICE_0_KEY 0x1 /* match 0 */ +#define ICE_0_KEYINV 0x0 +#define ICE_1_KEY 0x0 /* match 1 */ +#define ICE_1_KEYINV 0x1 + +/** + * ice_gen_key_word - generate 16-bits of a key/mask word + * @val: the value + * @valid: valid bits mask (change only the valid bits) + * @dont_care: don't care mask + * @nvr_mtch: never match mask + * @key: pointer to an array of where the resulting key portion + * @key_inv: pointer to an array of where the resulting key invert portion + * + * This function generates 16-bits from a 8-bit value, an 8-bit don't care mask + * and an 8-bit never match mask. The 16-bits of output are divided into 8 bits + * of key and 8 bits of key invert. + * + * '0' = b01, always match a 0 bit + * '1' = b10, always match a 1 bit + * '?' = b11, don't care bit (always matches) + * '~' = b00, never match bit + * + * Input: + * val: b0 1 0 1 0 1 + * dont_care: b0 0 1 1 0 0 + * never_mtch: b0 0 0 0 1 1 + * ------------------------------ + * Result: key: b01 10 11 11 00 00 + */ +static enum ice_status +ice_gen_key_word(u8 val, u8 valid, u8 dont_care, u8 nvr_mtch, u8 *key, + u8 *key_inv) +{ + u8 in_key = *key, in_key_inv = *key_inv; + u8 i; + + /* 'dont_care' and 'nvr_mtch' masks cannot overlap */ + if ((dont_care ^ nvr_mtch) != (dont_care | nvr_mtch)) + return ICE_ERR_CFG; + + *key = 0; + *key_inv = 0; + + /* encode the 8 bits into 8-bit key and 8-bit key invert */ + for (i = 0; i < 8; i++) { + *key >>= 1; + *key_inv >>= 1; + + if (!(valid & 0x1)) { /* change only valid bits */ + *key |= (in_key & 0x1) << 7; + *key_inv |= (in_key_inv & 0x1) << 7; + } else if (dont_care & 0x1) { /* don't care bit */ + *key |= ICE_DC_KEY << 7; + *key_inv |= ICE_DC_KEYINV << 7; + } else if (nvr_mtch & 0x1) { /* never match bit */ + *key |= ICE_NM_KEY << 7; + *key_inv |= ICE_NM_KEYINV << 7; + } else if (val & 0x01) { /* exact 1 match */ + *key |= ICE_1_KEY << 7; + *key_inv |= ICE_1_KEYINV << 7; + } else { /* exact 0 match */ + *key |= ICE_0_KEY << 7; + *key_inv |= ICE_0_KEYINV << 7; + } + + dont_care >>= 1; + nvr_mtch >>= 1; + valid >>= 1; + val >>= 1; + in_key >>= 1; + in_key_inv >>= 1; + } + + return 0; +} + +/** + * ice_bits_max_set - determine if the number of bits set is within a maximum + * @mask: pointer to the byte array which is the mask + * @size: the number of bytes in the mask + * @max: the max number of set bits + * + * This function determines if there are at most 'max' number of bits set in an + * array. Returns true if the number for bits set is <= max or will return false + * otherwise. + */ +static bool ice_bits_max_set(const u8 *mask, u16 size, u16 max) +{ + u16 count = 0; + u16 i; + + /* check each byte */ + for (i = 0; i < size; i++) { + /* if 0, go to next byte */ + if (!mask[i]) + continue; + + /* We know there is at least one set bit in this byte because of + * the above check; if we already have found 'max' number of + * bits set, then we can return failure now. + */ + if (count == max) + return false; + + /* count the bits in this byte, checking threshold */ + count += hweight8(mask[i]); + if (count > max) + return false; + } + + return true; +} + +/** + * ice_set_key - generate a variable sized key with multiples of 16-bits + * @key: pointer to where the key will be stored + * @size: the size of the complete key in bytes (must be even) + * @val: array of 8-bit values that makes up the value portion of the key + * @upd: array of 8-bit masks that determine what key portion to update + * @dc: array of 8-bit masks that make up the don't care mask + * @nm: array of 8-bit masks that make up the never match mask + * @off: the offset of the first byte in the key to update + * @len: the number of bytes in the key update + * + * This function generates a key from a value, a don't care mask and a never + * match mask. + * upd, dc, and nm are optional parameters, and can be NULL: + * upd == NULL --> udp mask is all 1's (update all bits) + * dc == NULL --> dc mask is all 0's (no don't care bits) + * nm == NULL --> nm mask is all 0's (no never match bits) + */ +static enum ice_status +ice_set_key(u8 *key, u16 size, u8 *val, u8 *upd, u8 *dc, u8 *nm, u16 off, + u16 len) +{ + u16 half_size; + u16 i; + + /* size must be a multiple of 2 bytes. */ + if (size % 2) + return ICE_ERR_CFG; + + half_size = size / 2; + if (off + len > half_size) + return ICE_ERR_CFG; + + /* Make sure at most one bit is set in the never match mask. Having more + * than one never match mask bit set will cause HW to consume excessive + * power otherwise; this is a power management efficiency check. + */ +#define ICE_NVR_MTCH_BITS_MAX 1 + if (nm && !ice_bits_max_set(nm, len, ICE_NVR_MTCH_BITS_MAX)) + return ICE_ERR_CFG; + + for (i = 0; i < len; i++) + if (ice_gen_key_word(val[i], upd ? upd[i] : 0xff, + dc ? dc[i] : 0, nm ? nm[i] : 0, + key + off + i, key + half_size + off + i)) + return ICE_ERR_CFG; + + return 0; +} + /** * ice_acquire_global_cfg_lock * @hw: pointer to the HW structure @@ -205,6 +456,31 @@ static void ice_release_global_cfg_lock(struct ice_hw *hw) } /** + * ice_acquire_change_lock + * @hw: pointer to the HW structure + * @access: access type (read or write) + * + * This function will request ownership of the change lock. + */ +static enum ice_status +ice_acquire_change_lock(struct ice_hw *hw, enum ice_aq_res_access_type access) +{ + return ice_acquire_res(hw, ICE_CHANGE_LOCK_RES_ID, access, + ICE_CHANGE_LOCK_TIMEOUT); +} + +/** + * ice_release_change_lock + * @hw: pointer to the HW structure + * + * This function will release the change lock using the proper Admin Command. + */ +static void ice_release_change_lock(struct ice_hw *hw) +{ + ice_release_res(hw, ICE_CHANGE_LOCK_RES_ID); +} + +/** * ice_aq_download_pkg * @hw: pointer to the hardware structure * @pkg_buf: the package buffer to transfer @@ -253,6 +529,54 @@ ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, } /** + * ice_aq_update_pkg + * @hw: pointer to the hardware structure + * @pkg_buf: the package cmd buffer + * @buf_size: the size of the package cmd buffer + * @last_buf: last buffer indicator + * @error_offset: returns error offset + * @error_info: returns error information + * @cd: pointer to command details structure or NULL + * + * Update Package (0x0C42) + */ +static enum ice_status +ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf, u16 buf_size, + bool last_buf, u32 *error_offset, u32 *error_info, + struct ice_sq_cd *cd) +{ + struct ice_aqc_download_pkg *cmd; + struct ice_aq_desc desc; + enum ice_status status; + + if (error_offset) + *error_offset = 0; + if (error_info) + *error_info = 0; + + cmd = &desc.params.download_pkg; + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_pkg); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + + if (last_buf) + cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF; + + status = ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd); + if (status == ICE_ERR_AQ_ERROR) { + /* Read error from buffer only when the FW returned an error */ + struct ice_aqc_download_pkg_resp *resp; + + resp = (struct ice_aqc_download_pkg_resp *)pkg_buf; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); + } + + return status; +} + +/** * ice_find_seg_in_pkg * @hw: pointer to the hardware structure * @seg_type: the segment type to search for (i.e., SEGMENT_TYPE_CPK) @@ -287,6 +611,44 @@ ice_find_seg_in_pkg(struct ice_hw *hw, u32 seg_type, } /** + * ice_update_pkg + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains change lock and updates package. + */ +static enum ice_status +ice_update_pkg(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_status status; + u32 offset, info, i; + + status = ice_acquire_change_lock(hw, ICE_RES_WRITE); + if (status) + return status; + + for (i = 0; i < count; i++) { + struct ice_buf_hdr *bh = (struct ice_buf_hdr *)(bufs + i); + bool last = ((i + 1) == count); + + status = ice_aq_update_pkg(hw, bh, le16_to_cpu(bh->data_end), + last, &offset, &info, NULL); + + if (status) { + ice_debug(hw, ICE_DBG_PKG, + "Update pkg failed: err %d off %d inf %d\n", + status, offset, info); + break; + } + } + + ice_release_change_lock(hw); + + return status; +} + +/** * ice_dwnld_cfg_bufs * @hw: pointer to the hardware structure * @bufs: pointer to an array of buffers @@ -767,6 +1129,169 @@ enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len) return status; } +/** + * ice_pkg_buf_alloc + * @hw: pointer to the HW structure + * + * Allocates a package buffer and returns a pointer to the buffer header. + * Note: all package contents must be in Little Endian form. + */ +static struct ice_buf_build *ice_pkg_buf_alloc(struct ice_hw *hw) +{ + struct ice_buf_build *bld; + struct ice_buf_hdr *buf; + + bld = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*bld), GFP_KERNEL); + if (!bld) + return NULL; + + buf = (struct ice_buf_hdr *)bld; + buf->data_end = cpu_to_le16(offsetof(struct ice_buf_hdr, + section_entry)); + return bld; +} + +/** + * ice_pkg_buf_free + * @hw: pointer to the HW structure + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Frees a package buffer + */ +static void ice_pkg_buf_free(struct ice_hw *hw, struct ice_buf_build *bld) +{ + devm_kfree(ice_hw_to_dev(hw), bld); +} + +/** + * ice_pkg_buf_reserve_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @count: the number of sections to reserve + * + * Reserves one or more section table entries in a package buffer. This routine + * can be called multiple times as long as they are made before calling + * ice_pkg_buf_alloc_section(). Once ice_pkg_buf_alloc_section() + * is called once, the number of sections that can be allocated will not be able + * to be increased; not using all reserved sections is fine, but this will + * result in some wasted space in the buffer. + * Note: all package contents must be in Little Endian form. + */ +static enum ice_status +ice_pkg_buf_reserve_section(struct ice_buf_build *bld, u16 count) +{ + struct ice_buf_hdr *buf; + u16 section_count; + u16 data_end; + + if (!bld) + return ICE_ERR_PARAM; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* already an active section, can't increase table size */ + section_count = le16_to_cpu(buf->section_count); + if (section_count > 0) + return ICE_ERR_CFG; + + if (bld->reserved_section_table_entries + count > ICE_MAX_S_COUNT) + return ICE_ERR_CFG; + bld->reserved_section_table_entries += count; + + data_end = le16_to_cpu(buf->data_end) + + (count * sizeof(buf->section_entry[0])); + buf->data_end = cpu_to_le16(data_end); + + return 0; +} + +/** + * ice_pkg_buf_alloc_section + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * @type: the section type value + * @size: the size of the section to reserve (in bytes) + * + * Reserves memory in the buffer for a section's content and updates the + * buffers' status accordingly. This routine returns a pointer to the first + * byte of the section start within the buffer, which is used to fill in the + * section contents. + * Note: all package contents must be in Little Endian form. + */ +static void * +ice_pkg_buf_alloc_section(struct ice_buf_build *bld, u32 type, u16 size) +{ + struct ice_buf_hdr *buf; + u16 sect_count; + u16 data_end; + + if (!bld || !type || !size) + return NULL; + + buf = (struct ice_buf_hdr *)&bld->buf; + + /* check for enough space left in buffer */ + data_end = le16_to_cpu(buf->data_end); + + /* section start must align on 4 byte boundary */ + data_end = ALIGN(data_end, 4); + + if ((data_end + size) > ICE_MAX_S_DATA_END) + return NULL; + + /* check for more available section table entries */ + sect_count = le16_to_cpu(buf->section_count); + if (sect_count < bld->reserved_section_table_entries) { + void *section_ptr = ((u8 *)buf) + data_end; + + buf->section_entry[sect_count].offset = cpu_to_le16(data_end); + buf->section_entry[sect_count].size = cpu_to_le16(size); + buf->section_entry[sect_count].type = cpu_to_le32(type); + + data_end += size; + buf->data_end = cpu_to_le16(data_end); + + buf->section_count = cpu_to_le16(sect_count + 1); + return section_ptr; + } + + /* no free section table entries */ + return NULL; +} + +/** + * ice_pkg_buf_get_active_sections + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Returns the number of active sections. Before using the package buffer + * in an update package command, the caller should make sure that there is at + * least one active section - otherwise, the buffer is not legal and should + * not be used. + * Note: all package contents must be in Little Endian form. + */ +static u16 ice_pkg_buf_get_active_sections(struct ice_buf_build *bld) +{ + struct ice_buf_hdr *buf; + + if (!bld) + return 0; + + buf = (struct ice_buf_hdr *)&bld->buf; + return le16_to_cpu(buf->section_count); +} + +/** + * ice_pkg_buf + * @bld: pointer to pkg build (allocated by ice_pkg_buf_alloc()) + * + * Return a pointer to the buffer's header + */ +static struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld) +{ + if (!bld) + return NULL; + + return &bld->buf; +} + /* PTG Management */ /** @@ -951,6 +1476,48 @@ enum ice_sid_all { ICE_SID_OFF_COUNT, }; +/* Characteristic handling */ + +/** + * ice_match_prop_lst - determine if properties of two lists match + * @list1: first properties list + * @list2: second properties list + * + * Count, cookies and the order must match in order to be considered equivalent. + */ +static bool +ice_match_prop_lst(struct list_head *list1, struct list_head *list2) +{ + struct ice_vsig_prof *tmp1; + struct ice_vsig_prof *tmp2; + u16 chk_count = 0; + u16 count = 0; + + /* compare counts */ + list_for_each_entry(tmp1, list1, list) + count++; + list_for_each_entry(tmp2, list2, list) + chk_count++; + if (!count || count != chk_count) + return false; + + tmp1 = list_first_entry(list1, struct ice_vsig_prof, list); + tmp2 = list_first_entry(list2, struct ice_vsig_prof, list); + + /* profile cookies must compare, and in the exact same order to take + * into account priority + */ + while (count--) { + if (tmp2->profile_cookie != tmp1->profile_cookie) + return false; + + tmp1 = list_next_entry(tmp1, list); + tmp2 = list_next_entry(tmp2, list); + } + + return true; +} + /* VSIG Management */ /** @@ -999,6 +1566,117 @@ static u16 ice_vsig_alloc_val(struct ice_hw *hw, enum ice_block blk, u16 vsig) } /** + * ice_vsig_alloc - Finds a free entry and allocates a new VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * + * This function will iterate through the VSIG list and mark the first + * unused entry for the new VSIG entry as used and return that value. + */ +static u16 ice_vsig_alloc(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (!hw->blk[blk].xlt2.vsig_tbl[i].in_use) + return ice_vsig_alloc_val(hw, blk, i); + + return ICE_DEFAULT_VSIG; +} + +/** + * ice_find_dup_props_vsig - find VSI group with a specified set of properties + * @hw: pointer to the hardware structure + * @blk: HW block + * @chs: characteristic list + * @vsig: returns the VSIG with the matching profiles, if found + * + * Each VSIG is associated with a characteristic set; i.e. all VSIs under + * a group have the same characteristic set. To check if there exists a VSIG + * which has the same characteristics as the input characteristics; this + * function will iterate through the XLT2 list and return the VSIG that has a + * matching configuration. In order to make sure that priorities are accounted + * for, the list must match exactly, including the order in which the + * characteristics are listed. + */ +static enum ice_status +ice_find_dup_props_vsig(struct ice_hw *hw, enum ice_block blk, + struct list_head *chs, u16 *vsig) +{ + struct ice_xlt2 *xlt2 = &hw->blk[blk].xlt2; + u16 i; + + for (i = 0; i < xlt2->count; i++) + if (xlt2->vsig_tbl[i].in_use && + ice_match_prop_lst(chs, &xlt2->vsig_tbl[i].prop_lst)) { + *vsig = ICE_VSIG_VALUE(i, hw->pf_id); + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_vsig_free - free VSI group + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to remove + * + * The function will remove all VSIs associated with the input VSIG and move + * them to the DEFAULT_VSIG and mark the VSIG available. + */ +static enum ice_status +ice_vsig_free(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + struct ice_vsig_prof *dtmp, *del; + struct ice_vsig_vsi *vsi_cur; + u16 idx; + + idx = vsig & ICE_VSIG_IDX_M; + if (idx >= ICE_MAX_VSIGS) + return ICE_ERR_PARAM; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + hw->blk[blk].xlt2.vsig_tbl[idx].in_use = false; + + vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + /* If the VSIG has at least 1 VSI then iterate through the + * list and remove the VSIs before deleting the group. + */ + if (vsi_cur) { + /* remove all vsis associated with this VSIG XLT2 entry */ + do { + struct ice_vsig_vsi *tmp = vsi_cur->next_vsi; + + vsi_cur->vsig = ICE_DEFAULT_VSIG; + vsi_cur->changed = 1; + vsi_cur->next_vsi = NULL; + vsi_cur = tmp; + } while (vsi_cur); + + /* NULL terminate head of VSI list */ + hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi = NULL; + } + + /* free characteristic list */ + list_for_each_entry_safe(del, dtmp, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + list_del(&del->list); + devm_kfree(ice_hw_to_dev(hw), del); + } + + /* if VSIG characteristic list was cleared for reset + * re-initialize the list head + */ + INIT_LIST_HEAD(&hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst); + + return 0; +} + +/** * ice_vsig_remove_vsi - remove VSI from VSIG * @hw: pointer to the hardware structure * @blk: HW block @@ -1117,6 +1795,215 @@ ice_vsig_add_mv_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig) return 0; } +/** + * ice_find_prof_id - find profile ID for a given field vector + * @hw: pointer to the hardware structure + * @blk: HW block + * @fv: field vector to search for + * @prof_id: receives the profile ID + */ +static enum ice_status +ice_find_prof_id(struct ice_hw *hw, enum ice_block blk, + struct ice_fv_word *fv, u8 *prof_id) +{ + struct ice_es *es = &hw->blk[blk].es; + u16 off, i; + + for (i = 0; i < es->count; i++) { + off = i * es->fvw; + + if (memcmp(&es->t[off], fv, es->fvw * sizeof(*fv))) + continue; + + *prof_id = i; + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_prof_id_rsrc_type - get profile ID resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_prof_id_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_PROFID; + break; + default: + return false; + } + return true; +} + +/** + * ice_tcam_ent_rsrc_type - get TCAM entry resource type for a block type + * @blk: the block type + * @rsrc_type: pointer to variable to receive the resource type + */ +static bool ice_tcam_ent_rsrc_type(enum ice_block blk, u16 *rsrc_type) +{ + switch (blk) { + case ICE_BLK_RSS: + *rsrc_type = ICE_AQC_RES_TYPE_HASH_PROF_BLDR_TCAM; + break; + default: + return false; + } + return true; +} + +/** + * ice_alloc_tcam_ent - allocate hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block to allocate the TCAM for + * @tcam_idx: pointer to variable to receive the TCAM entry + * + * This function allocates a new entry in a Profile ID TCAM for a specific + * block. + */ +static enum ice_status +ice_alloc_tcam_ent(struct ice_hw *hw, enum ice_block blk, u16 *tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_alloc_hw_res(hw, res_type, 1, true, tcam_idx); +} + +/** + * ice_free_tcam_ent - free hardware TCAM entry + * @hw: pointer to the HW struct + * @blk: the block from which to free the TCAM entry + * @tcam_idx: the TCAM entry to free + * + * This function frees an entry in a Profile ID TCAM for a specific block. + */ +static enum ice_status +ice_free_tcam_ent(struct ice_hw *hw, enum ice_block blk, u16 tcam_idx) +{ + u16 res_type; + + if (!ice_tcam_ent_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tcam_idx); +} + +/** + * ice_alloc_prof_id - allocate profile ID + * @hw: pointer to the HW struct + * @blk: the block to allocate the profile ID for + * @prof_id: pointer to variable to receive the profile ID + * + * This function allocates a new profile ID, which also corresponds to a Field + * Vector (Extraction Sequence) entry. + */ +static enum ice_status +ice_alloc_prof_id(struct ice_hw *hw, enum ice_block blk, u8 *prof_id) +{ + enum ice_status status; + u16 res_type; + u16 get_prof; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + status = ice_alloc_hw_res(hw, res_type, 1, false, &get_prof); + if (!status) + *prof_id = (u8)get_prof; + + return status; +} + +/** + * ice_free_prof_id - free profile ID + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID to free + * + * This function frees a profile ID, which also corresponds to a Field Vector. + */ +static enum ice_status +ice_free_prof_id(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + u16 tmp_prof_id = (u16)prof_id; + u16 res_type; + + if (!ice_prof_id_rsrc_type(blk, &res_type)) + return ICE_ERR_PARAM; + + return ice_free_hw_res(hw, res_type, 1, &tmp_prof_id); +} + +/** + * ice_prof_inc_ref - increment reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to increment the reference count + */ +static enum ice_status +ice_prof_inc_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + hw->blk[blk].es.ref_count[prof_id]++; + + return 0; +} + +/** + * ice_write_es - write an extraction sequence to hardware + * @hw: pointer to the HW struct + * @blk: the block in which to write the extraction sequence + * @prof_id: the profile ID to write + * @fv: pointer to the extraction sequence to write - NULL to clear extraction + */ +static void +ice_write_es(struct ice_hw *hw, enum ice_block blk, u8 prof_id, + struct ice_fv_word *fv) +{ + u16 off; + + off = prof_id * hw->blk[blk].es.fvw; + if (!fv) { + memset(&hw->blk[blk].es.t[off], 0, + hw->blk[blk].es.fvw * sizeof(*fv)); + hw->blk[blk].es.written[prof_id] = false; + } else { + memcpy(&hw->blk[blk].es.t[off], fv, + hw->blk[blk].es.fvw * sizeof(*fv)); + } +} + +/** + * ice_prof_dec_ref - decrement reference count for profile + * @hw: pointer to the HW struct + * @blk: the block from which to free the profile ID + * @prof_id: the profile ID for which to decrement the reference count + */ +static enum ice_status +ice_prof_dec_ref(struct ice_hw *hw, enum ice_block blk, u8 prof_id) +{ + if (prof_id > hw->blk[blk].es.count) + return ICE_ERR_PARAM; + + if (hw->blk[blk].es.ref_count[prof_id] > 0) { + if (!--hw->blk[blk].es.ref_count[prof_id]) { + ice_write_es(hw, blk, prof_id, NULL); + return ice_free_prof_id(hw, blk, prof_id); + } + } + + return 0; +} + /* Block / table section IDs */ static const u32 ice_blk_sids[ICE_BLK_COUNT][ICE_SID_OFF_COUNT] = { /* SWITCH */ @@ -1374,16 +2261,85 @@ void ice_fill_blk_tbls(struct ice_hw *hw) } /** + * ice_free_prof_map - free profile map + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_prof_map(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_es *es = &hw->blk[blk_idx].es; + struct ice_prof_map *del, *tmp; + + mutex_lock(&es->prof_map_lock); + list_for_each_entry_safe(del, tmp, &es->prof_map, list) { + list_del(&del->list); + devm_kfree(ice_hw_to_dev(hw), del); + } + INIT_LIST_HEAD(&es->prof_map); + mutex_unlock(&es->prof_map_lock); +} + +/** + * ice_free_flow_profs - free flow profile entries + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_free_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + struct ice_flow_prof *p, *tmp; + + mutex_lock(&hw->fl_profs_locks[blk_idx]); + list_for_each_entry_safe(p, tmp, &hw->fl_profs[blk_idx], l_entry) { + list_del(&p->l_entry); + devm_kfree(ice_hw_to_dev(hw), p); + } + mutex_unlock(&hw->fl_profs_locks[blk_idx]); + + /* if driver is in reset and tables are being cleared + * re-initialize the flow profile list heads + */ + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** + * ice_free_vsig_tbl - free complete VSIG table entries + * @hw: pointer to the hardware structure + * @blk: the HW block on which to free the VSIG table entries + */ +static void ice_free_vsig_tbl(struct ice_hw *hw, enum ice_block blk) +{ + u16 i; + + if (!hw->blk[blk].xlt2.vsig_tbl) + return; + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) + ice_vsig_free(hw, blk, i); +} + +/** * ice_free_hw_tbls - free hardware table memory * @hw: pointer to the hardware structure */ void ice_free_hw_tbls(struct ice_hw *hw) { + struct ice_rss_cfg *r, *rt; u8 i; for (i = 0; i < ICE_BLK_COUNT; i++) { - hw->blk[i].is_list_init = false; + if (hw->blk[i].is_list_init) { + struct ice_es *es = &hw->blk[i].es; + + ice_free_prof_map(hw, i); + mutex_destroy(&es->prof_map_lock); + ice_free_flow_profs(hw, i); + mutex_destroy(&hw->fl_profs_locks[i]); + + hw->blk[i].is_list_init = false; + } + ice_free_vsig_tbl(hw, (enum ice_block)i); devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptypes); devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.ptg_tbl); devm_kfree(ice_hw_to_dev(hw), hw->blk[i].xlt1.t); @@ -1397,10 +2353,26 @@ void ice_free_hw_tbls(struct ice_hw *hw) devm_kfree(ice_hw_to_dev(hw), hw->blk[i].es.written); } + list_for_each_entry_safe(r, rt, &hw->rss_list_head, l_entry) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_destroy(&hw->rss_locks); memset(hw->blk, 0, sizeof(hw->blk)); } /** + * ice_init_flow_profs - init flow profile locks and list heads + * @hw: pointer to the hardware structure + * @blk_idx: HW block index + */ +static void ice_init_flow_profs(struct ice_hw *hw, u8 blk_idx) +{ + mutex_init(&hw->fl_profs_locks[blk_idx]); + INIT_LIST_HEAD(&hw->fl_profs[blk_idx]); +} + +/** * ice_clear_hw_tbls - clear HW tables and flow profiles * @hw: pointer to the hardware structure */ @@ -1415,6 +2387,13 @@ void ice_clear_hw_tbls(struct ice_hw *hw) struct ice_xlt2 *xlt2 = &hw->blk[i].xlt2; struct ice_es *es = &hw->blk[i].es; + if (hw->blk[i].is_list_init) { + ice_free_prof_map(hw, i); + ice_free_flow_profs(hw, i); + } + + ice_free_vsig_tbl(hw, (enum ice_block)i); + memset(xlt1->ptypes, 0, xlt1->count * sizeof(*xlt1->ptypes)); memset(xlt1->ptg_tbl, 0, ICE_MAX_PTGS * sizeof(*xlt1->ptg_tbl)); @@ -1443,6 +2422,8 @@ enum ice_status ice_init_hw_tbls(struct ice_hw *hw) { u8 i; + mutex_init(&hw->rss_locks); + INIT_LIST_HEAD(&hw->rss_list_head); for (i = 0; i < ICE_BLK_COUNT; i++) { struct ice_prof_redir *prof_redir = &hw->blk[i].prof_redir; struct ice_prof_tcam *prof = &hw->blk[i].prof; @@ -1454,6 +2435,9 @@ enum ice_status ice_init_hw_tbls(struct ice_hw *hw) if (hw->blk[i].is_list_init) continue; + ice_init_flow_profs(hw, i); + mutex_init(&es->prof_map_lock); + INIT_LIST_HEAD(&es->prof_map); hw->blk[i].is_list_init = true; hw->blk[i].overwrite = blk_sizes[i].overwrite; @@ -1547,3 +2531,1580 @@ err: ice_free_hw_tbls(hw); return ICE_ERR_NO_MEMORY; } + +/** + * ice_prof_gen_key - generate profile ID key + * @hw: pointer to the HW struct + * @blk: the block in which to write profile ID to + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + * @key: output of profile ID key + */ +static enum ice_status +ice_prof_gen_key(struct ice_hw *hw, enum ice_block blk, u8 ptg, u16 vsig, + u8 cdid, u16 flags, u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], u8 nm_msk[ICE_TCAM_KEY_VAL_SZ], + u8 key[ICE_TCAM_KEY_SZ]) +{ + struct ice_prof_id_key inkey; + + inkey.xlt1 = ptg; + inkey.xlt2_cdid = cpu_to_le16(vsig); + inkey.flags = cpu_to_le16(flags); + + switch (hw->blk[blk].prof.cdid_bits) { + case 0: + break; + case 2: +#define ICE_CD_2_M 0xC000U +#define ICE_CD_2_S 14 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_2_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_2_S); + break; + case 4: +#define ICE_CD_4_M 0xF000U +#define ICE_CD_4_S 12 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_4_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_4_S); + break; + case 8: +#define ICE_CD_8_M 0xFF00U +#define ICE_CD_8_S 16 + inkey.xlt2_cdid &= ~cpu_to_le16(ICE_CD_8_M); + inkey.xlt2_cdid |= cpu_to_le16(BIT(cdid) << ICE_CD_8_S); + break; + default: + ice_debug(hw, ICE_DBG_PKG, "Error in profile config\n"); + break; + } + + return ice_set_key(key, ICE_TCAM_KEY_SZ, (u8 *)&inkey, vl_msk, dc_msk, + nm_msk, 0, ICE_TCAM_KEY_SZ / 2); +} + +/** + * ice_tcam_write_entry - write TCAM entry + * @hw: pointer to the HW struct + * @blk: the block in which to write profile ID to + * @idx: the entry index to write to + * @prof_id: profile ID + * @ptg: packet type group (PTG) portion of key + * @vsig: VSIG portion of key + * @cdid: CDID portion of key + * @flags: flag portion of key + * @vl_msk: valid mask + * @dc_msk: don't care mask + * @nm_msk: never match mask + */ +static enum ice_status +ice_tcam_write_entry(struct ice_hw *hw, enum ice_block blk, u16 idx, + u8 prof_id, u8 ptg, u16 vsig, u8 cdid, u16 flags, + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ], + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ], + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ]) +{ + struct ice_prof_tcam_entry; + enum ice_status status; + + status = ice_prof_gen_key(hw, blk, ptg, vsig, cdid, flags, vl_msk, + dc_msk, nm_msk, hw->blk[blk].prof.t[idx].key); + if (!status) { + hw->blk[blk].prof.t[idx].addr = cpu_to_le16(idx); + hw->blk[blk].prof.t[idx].prof_id = prof_id; + } + + return status; +} + +/** + * ice_vsig_get_ref - returns number of VSIs belong to a VSIG + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to query + * @refs: pointer to variable to receive the reference count + */ +static enum ice_status +ice_vsig_get_ref(struct ice_hw *hw, enum ice_block blk, u16 vsig, u16 *refs) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *ptr; + + *refs = 0; + + if (!hw->blk[blk].xlt2.vsig_tbl[idx].in_use) + return ICE_ERR_DOES_NOT_EXIST; + + ptr = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + while (ptr) { + (*refs)++; + ptr = ptr->next_vsi; + } + + return 0; +} + +/** + * ice_has_prof_vsig - check to see if VSIG has a specific profile + * @hw: pointer to the hardware structure + * @blk: HW block + * @vsig: VSIG to check against + * @hdl: profile handle + */ +static bool +ice_has_prof_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *ent; + + list_for_each_entry(ent, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (ent->profile_cookie == hdl) + return true; + + ice_debug(hw, ICE_DBG_INIT, + "Characteristic list for VSI group %d not found.\n", + vsig); + return false; +} + +/** + * ice_prof_bld_es - build profile ID extraction sequence changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_es(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + u16 vec_size = hw->blk[blk].es.fvw * sizeof(struct ice_fv_word); + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_prof) { + u16 off = tmp->prof_id * hw->blk[blk].es.fvw; + struct ice_pkg_es *p; + u32 id; + + id = ice_sect_id(blk, ICE_VEC_TBL); + p = (struct ice_pkg_es *) + ice_pkg_buf_alloc_section(bld, id, sizeof(*p) + + vec_size - + sizeof(p->es[0])); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->prof_id); + + memcpy(p->es, &hw->blk[blk].es.t[off], vec_size); + } + + return 0; +} + +/** + * ice_prof_bld_tcam - build profile ID TCAM changes + * @hw: pointer to the HW struct + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_tcam(struct ice_hw *hw, enum ice_block blk, + struct ice_buf_build *bld, struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_TCAM_ADD && tmp->add_tcam_idx) { + struct ice_prof_id_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_PROF_TCAM); + p = (struct ice_prof_id_section *) + ice_pkg_buf_alloc_section(bld, id, sizeof(*p)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->entry[0].addr = cpu_to_le16(tmp->tcam_idx); + p->entry[0].prof_id = tmp->prof_id; + + memcpy(p->entry[0].key, + &hw->blk[blk].prof.t[tmp->tcam_idx].key, + sizeof(hw->blk[blk].prof.t->key)); + } + + return 0; +} + +/** + * ice_prof_bld_xlt1 - build XLT1 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt1(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) + if (tmp->type == ICE_PTG_ES_ADD && tmp->add_ptg) { + struct ice_xlt1_section *p; + u32 id; + + id = ice_sect_id(blk, ICE_XLT1); + p = (struct ice_xlt1_section *) + ice_pkg_buf_alloc_section(bld, id, sizeof(*p)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->ptype); + p->value[0] = tmp->ptg; + } + + return 0; +} + +/** + * ice_prof_bld_xlt2 - build XLT2 changes + * @blk: hardware block + * @bld: the update package buffer build to add to + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_prof_bld_xlt2(enum ice_block blk, struct ice_buf_build *bld, + struct list_head *chgs) +{ + struct ice_chs_chg *tmp; + + list_for_each_entry(tmp, chgs, list_entry) { + struct ice_xlt2_section *p; + u32 id; + + switch (tmp->type) { + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + id = ice_sect_id(blk, ICE_XLT2); + p = (struct ice_xlt2_section *) + ice_pkg_buf_alloc_section(bld, id, sizeof(*p)); + + if (!p) + return ICE_ERR_MAX_LIMIT; + + p->count = cpu_to_le16(1); + p->offset = cpu_to_le16(tmp->vsi); + p->value[0] = cpu_to_le16(tmp->vsig); + break; + default: + break; + } + } + + return 0; +} + +/** + * ice_upd_prof_hw - update hardware using the change list + * @hw: pointer to the HW struct + * @blk: hardware block + * @chgs: the list of changes to make in hardware + */ +static enum ice_status +ice_upd_prof_hw(struct ice_hw *hw, enum ice_block blk, + struct list_head *chgs) +{ + struct ice_buf_build *b; + struct ice_chs_chg *tmp; + enum ice_status status; + u16 pkg_sects; + u16 xlt1 = 0; + u16 xlt2 = 0; + u16 tcam = 0; + u16 es = 0; + u16 sects; + + /* count number of sections we need */ + list_for_each_entry(tmp, chgs, list_entry) { + switch (tmp->type) { + case ICE_PTG_ES_ADD: + if (tmp->add_ptg) + xlt1++; + if (tmp->add_prof) + es++; + break; + case ICE_TCAM_ADD: + tcam++; + break; + case ICE_VSIG_ADD: + case ICE_VSI_MOVE: + case ICE_VSIG_REM: + xlt2++; + break; + default: + break; + } + } + sects = xlt1 + xlt2 + tcam + es; + + if (!sects) + return 0; + + /* Build update package buffer */ + b = ice_pkg_buf_alloc(hw); + if (!b) + return ICE_ERR_NO_MEMORY; + + status = ice_pkg_buf_reserve_section(b, sects); + if (status) + goto error_tmp; + + /* Preserve order of table update: ES, TCAM, PTG, VSIG */ + if (es) { + status = ice_prof_bld_es(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (tcam) { + status = ice_prof_bld_tcam(hw, blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt1) { + status = ice_prof_bld_xlt1(blk, b, chgs); + if (status) + goto error_tmp; + } + + if (xlt2) { + status = ice_prof_bld_xlt2(blk, b, chgs); + if (status) + goto error_tmp; + } + + /* After package buffer build check if the section count in buffer is + * non-zero and matches the number of sections detected for package + * update. + */ + pkg_sects = ice_pkg_buf_get_active_sections(b); + if (!pkg_sects || pkg_sects != sects) { + status = ICE_ERR_INVAL_SIZE; + goto error_tmp; + } + + /* update package */ + status = ice_update_pkg(hw, ice_pkg_buf(b), 1); + if (status == ICE_ERR_AQ_ERROR) + ice_debug(hw, ICE_DBG_INIT, "Unable to update HW profile\n"); + +error_tmp: + ice_pkg_buf_free(hw, b); + return status; +} + +/** + * ice_add_prof - add profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * @ptypes: array of bitmaps indicating ptypes (ICE_FLOW_PTYPE_MAX bits) + * @es: extraction sequence (length of array is determined by the block) + * + * This function registers a profile, which matches a set of PTGs with a + * particular extraction sequence. While the hardware profile is allocated + * it will not be written until the first call to ice_add_flow that specifies + * the ID value used here. + */ +enum ice_status +ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], + struct ice_fv_word *es) +{ + u32 bytes = DIV_ROUND_UP(ICE_FLOW_PTYPE_MAX, BITS_PER_BYTE); + DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT); + struct ice_prof_map *prof; + enum ice_status status; + u32 byte = 0; + u8 prof_id; + + bitmap_zero(ptgs_used, ICE_XLT1_CNT); + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + + /* search for existing profile */ + status = ice_find_prof_id(hw, blk, es, &prof_id); + if (status) { + /* allocate profile ID */ + status = ice_alloc_prof_id(hw, blk, &prof_id); + if (status) + goto err_ice_add_prof; + + /* and write new es */ + ice_write_es(hw, blk, prof_id, es); + } + + ice_prof_inc_ref(hw, blk, prof_id); + + /* add profile info */ + prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*prof), GFP_KERNEL); + if (!prof) + goto err_ice_add_prof; + + prof->profile_cookie = id; + prof->prof_id = prof_id; + prof->ptg_cnt = 0; + prof->context = 0; + + /* build list of ptgs */ + while (bytes && prof->ptg_cnt < ICE_MAX_PTG_PER_PROFILE) { + u32 bit; + + if (!ptypes[byte]) { + bytes--; + byte++; + continue; + } + + /* Examine 8 bits per byte */ + for_each_set_bit(bit, (unsigned long *)&ptypes[byte], + BITS_PER_BYTE) { + u16 ptype; + u8 ptg; + u8 m; + + ptype = byte * BITS_PER_BYTE + bit; + + /* The package should place all ptypes in a non-zero + * PTG, so the following call should never fail. + */ + if (ice_ptg_find_ptype(hw, blk, ptype, &ptg)) + continue; + + /* If PTG is already added, skip and continue */ + if (test_bit(ptg, ptgs_used)) + continue; + + set_bit(ptg, ptgs_used); + prof->ptg[prof->ptg_cnt] = ptg; + + if (++prof->ptg_cnt >= ICE_MAX_PTG_PER_PROFILE) + break; + + /* nothing left in byte, then exit */ + m = ~((1 << (bit + 1)) - 1); + if (!(ptypes[byte] & m)) + break; + } + + bytes--; + byte++; + } + + list_add(&prof->list, &hw->blk[blk].es.prof_map); + status = 0; + +err_ice_add_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_search_prof_id_low - Search for a profile tracking ID low level + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will search for a profile tracking ID which was previously added. This + * version assumes that the caller has already acquired the prof map lock. + */ +static struct ice_prof_map * +ice_search_prof_id_low(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *entry = NULL; + struct ice_prof_map *map; + + list_for_each_entry(map, &hw->blk[blk].es.prof_map, list) + if (map->profile_cookie == id) { + entry = map; + break; + } + + return entry; +} + +/** + * ice_search_prof_id - Search for a profile tracking ID + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will search for a profile tracking ID which was previously added. + */ +static struct ice_prof_map * +ice_search_prof_id(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *entry; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + entry = ice_search_prof_id_low(hw, blk, id); + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + + return entry; +} + +/** + * ice_vsig_prof_id_count - count profiles in a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + */ +static u16 +ice_vsig_prof_id_count(struct ice_hw *hw, enum ice_block blk, u16 vsig) +{ + u16 idx = vsig & ICE_VSIG_IDX_M, count = 0; + struct ice_vsig_prof *p; + + list_for_each_entry(p, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + count++; + + return count; +} + +/** + * ice_rel_tcam_idx - release a TCAM index + * @hw: pointer to the HW struct + * @blk: hardware block + * @idx: the index to release + */ +static enum ice_status +ice_rel_tcam_idx(struct ice_hw *hw, enum ice_block blk, u16 idx) +{ + /* Masks to invoke a never match entry */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFE, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x01, 0x00, 0x00, 0x00, 0x00 }; + enum ice_status status; + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, idx, 0, 0, 0, 0, 0, vl_msk, + dc_msk, nm_msk); + if (status) + return status; + + /* release the TCAM entry */ + status = ice_free_tcam_ent(hw, blk, idx); + + return status; +} + +/** + * ice_rem_prof_id - remove one profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @prof: pointer to profile structure to remove + */ +static enum ice_status +ice_rem_prof_id(struct ice_hw *hw, enum ice_block blk, + struct ice_vsig_prof *prof) +{ + enum ice_status status; + u16 i; + + for (i = 0; i < prof->tcam_count; i++) + if (prof->tcam[i].in_use) { + prof->tcam[i].in_use = false; + status = ice_rel_tcam_idx(hw, blk, + prof->tcam[i].tcam_idx); + if (status) + return ICE_ERR_HW_TABLE; + } + + return 0; +} + +/** + * ice_rem_vsig - remove VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to remove + * @chg: the change list + */ +static enum ice_status +ice_rem_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_vsi *vsi_cur; + struct ice_vsig_prof *d, *t; + enum ice_status status; + + /* remove TCAM entries */ + list_for_each_entry_safe(d, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + status = ice_rem_prof_id(hw, blk, d); + if (status) + return status; + + list_del(&d->list); + devm_kfree(ice_hw_to_dev(hw), d); + } + + /* Move all VSIS associated with this VSIG to the default VSIG */ + vsi_cur = hw->blk[blk].xlt2.vsig_tbl[idx].first_vsi; + /* If the VSIG has at least 1 VSI then iterate through the list + * and remove the VSIs before deleting the group. + */ + if (vsi_cur) + do { + struct ice_vsig_vsi *tmp = vsi_cur->next_vsi; + struct ice_chs_chg *p; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + p->type = ICE_VSIG_REM; + p->orig_vsig = vsig; + p->vsig = ICE_DEFAULT_VSIG; + p->vsi = vsi_cur - hw->blk[blk].xlt2.vsis; + + list_add(&p->list_entry, chg); + + vsi_cur = tmp; + } while (vsi_cur); + + return ice_vsig_free(hw, blk, vsig); +} + +/** + * ice_rem_prof_id_vsig - remove a specific profile from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG to remove the profile from + * @hdl: profile handle indicating which profile to remove + * @chg: list to receive a record of changes + */ +static enum ice_status +ice_rem_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + struct list_head *chg) +{ + u16 idx = vsig & ICE_VSIG_IDX_M; + struct ice_vsig_prof *p, *t; + enum ice_status status; + + list_for_each_entry_safe(p, t, + &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) + if (p->profile_cookie == hdl) { + if (ice_vsig_prof_id_count(hw, blk, vsig) == 1) + /* this is the last profile, remove the VSIG */ + return ice_rem_vsig(hw, blk, vsig, chg); + + status = ice_rem_prof_id(hw, blk, p); + if (!status) { + list_del(&p->list); + devm_kfree(ice_hw_to_dev(hw), p); + } + return status; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_rem_flow_all - remove all flows with a particular profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + */ +static enum ice_status +ice_rem_flow_all(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_chs_chg *del, *tmp; + enum ice_status status; + struct list_head chg; + u16 i; + + INIT_LIST_HEAD(&chg); + + for (i = 1; i < ICE_MAX_VSIGS; i++) + if (hw->blk[blk].xlt2.vsig_tbl[i].in_use) { + if (ice_has_prof_vsig(hw, blk, i, id)) { + status = ice_rem_prof_id_vsig(hw, blk, i, id, + &chg); + if (status) + goto err_ice_rem_flow_all; + } + } + + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_flow_all: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + return status; +} + +/** + * ice_rem_prof - remove profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @id: profile tracking ID + * + * This will remove the profile specified by the ID parameter, which was + * previously created through ice_add_prof. If any existing entries + * are associated with this profile, they will be removed as well. + */ +enum ice_status ice_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 id) +{ + struct ice_prof_map *pmap; + enum ice_status status; + + mutex_lock(&hw->blk[blk].es.prof_map_lock); + + pmap = ice_search_prof_id_low(hw, blk, id); + if (!pmap) { + status = ICE_ERR_DOES_NOT_EXIST; + goto err_ice_rem_prof; + } + + /* remove all flows with this profile */ + status = ice_rem_flow_all(hw, blk, pmap->profile_cookie); + if (status) + goto err_ice_rem_prof; + + /* dereference profile, and possibly remove */ + ice_prof_dec_ref(hw, blk, pmap->prof_id); + + list_del(&pmap->list); + devm_kfree(ice_hw_to_dev(hw), pmap); + +err_ice_rem_prof: + mutex_unlock(&hw->blk[blk].es.prof_map_lock); + return status; +} + +/** + * ice_get_prof - get profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: profile handle + * @chg: change list + */ +static enum ice_status +ice_get_prof(struct ice_hw *hw, enum ice_block blk, u64 hdl, + struct list_head *chg) +{ + struct ice_prof_map *map; + struct ice_chs_chg *p; + u16 i; + + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, blk, hdl); + if (!map) + return ICE_ERR_DOES_NOT_EXIST; + + for (i = 0; i < map->ptg_cnt; i++) + if (!hw->blk[blk].es.written[map->prof_id]) { + /* add ES to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), + GFP_KERNEL); + if (!p) + goto err_ice_get_prof; + + p->type = ICE_PTG_ES_ADD; + p->ptype = 0; + p->ptg = map->ptg[i]; + p->add_ptg = 0; + + p->add_prof = 1; + p->prof_id = map->prof_id; + + hw->blk[blk].es.written[map->prof_id] = true; + + list_add(&p->list_entry, chg); + } + + return 0; + +err_ice_get_prof: + /* let caller clean up the change list */ + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_get_profs_vsig - get a copy of the list of profiles from a VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: VSIG from which to copy the list + * @lst: output list + * + * This routine makes a copy of the list of profiles in the specified VSIG. + */ +static enum ice_status +ice_get_profs_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *lst) +{ + struct ice_vsig_prof *ent1, *ent2; + u16 idx = vsig & ICE_VSIG_IDX_M; + + list_for_each_entry(ent1, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + struct ice_vsig_prof *p; + + /* copy to the input list */ + p = devm_kmemdup(ice_hw_to_dev(hw), ent1, sizeof(*p), + GFP_KERNEL); + if (!p) + goto err_ice_get_profs_vsig; + + list_add_tail(&p->list, lst); + } + + return 0; + +err_ice_get_profs_vsig: + list_for_each_entry_safe(ent1, ent2, lst, list) { + list_del(&ent1->list); + devm_kfree(ice_hw_to_dev(hw), ent1); + } + + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_add_prof_to_lst - add profile entry to a list + * @hw: pointer to the HW struct + * @blk: hardware block + * @lst: the list to be added to + * @hdl: profile handle of entry to add + */ +static enum ice_status +ice_add_prof_to_lst(struct ice_hw *hw, enum ice_block blk, + struct list_head *lst, u64 hdl) +{ + struct ice_prof_map *map; + struct ice_vsig_prof *p; + u16 i; + + map = ice_search_prof_id(hw, blk, hdl); + if (!map) + return ICE_ERR_DOES_NOT_EXIST; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + p->profile_cookie = map->profile_cookie; + p->prof_id = map->prof_id; + p->tcam_count = map->ptg_cnt; + + for (i = 0; i < map->ptg_cnt; i++) { + p->tcam[i].prof_id = map->prof_id; + p->tcam[i].tcam_idx = ICE_INVALID_TCAM; + p->tcam[i].ptg = map->ptg[i]; + } + + list_add(&p->list, lst); + + return 0; +} + +/** + * ice_move_vsi - move VSI to another VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to move + * @vsig: the VSIG to move the VSI to + * @chg: the change list + */ +static enum ice_status +ice_move_vsi(struct ice_hw *hw, enum ice_block blk, u16 vsi, u16 vsig, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + u16 orig_vsig; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + status = ice_vsig_find_vsi(hw, blk, vsi, &orig_vsig); + if (!status) + status = ice_vsig_add_mv_vsi(hw, blk, vsi, vsig); + + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + return status; + } + + p->type = ICE_VSI_MOVE; + p->vsi = vsi; + p->orig_vsig = orig_vsig; + p->vsig = vsig; + + list_add(&p->list_entry, chg); + + return 0; +} + +/** + * ice_prof_tcam_ena_dis - add enable or disable TCAM change + * @hw: pointer to the HW struct + * @blk: hardware block + * @enable: true to enable, false to disable + * @vsig: the VSIG of the TCAM entry + * @tcam: pointer the TCAM info structure of the TCAM to disable + * @chg: the change list + * + * This function appends an enable or disable TCAM entry in the change log + */ +static enum ice_status +ice_prof_tcam_ena_dis(struct ice_hw *hw, enum ice_block blk, bool enable, + u16 vsig, struct ice_tcam_inf *tcam, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + + /* Default: enable means change the low flag bit to don't care */ + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x01, 0x00, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x01, 0x00, 0x00, 0x00, 0x00 }; + + /* if disabling, free the TCAM */ + if (!enable) { + status = ice_free_tcam_ent(hw, blk, tcam->tcam_idx); + tcam->tcam_idx = 0; + tcam->in_use = 0; + return status; + } + + /* for re-enabling, reallocate a TCAM */ + status = ice_alloc_tcam_ent(hw, blk, &tcam->tcam_idx); + if (status) + return status; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + status = ice_tcam_write_entry(hw, blk, tcam->tcam_idx, tcam->prof_id, + tcam->ptg, vsig, 0, 0, vl_msk, dc_msk, + nm_msk); + if (status) + goto err_ice_prof_tcam_ena_dis; + + tcam->in_use = 1; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = tcam->prof_id; + p->ptg = tcam->ptg; + p->vsig = 0; + p->tcam_idx = tcam->tcam_idx; + + /* log change */ + list_add(&p->list_entry, chg); + + return 0; + +err_ice_prof_tcam_ena_dis: + devm_kfree(ice_hw_to_dev(hw), p); + return status; +} + +/** + * ice_adj_prof_priorities - adjust profile based on priorities + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG for which to adjust profile priorities + * @chg: the change list + */ +static enum ice_status +ice_adj_prof_priorities(struct ice_hw *hw, enum ice_block blk, u16 vsig, + struct list_head *chg) +{ + DECLARE_BITMAP(ptgs_used, ICE_XLT1_CNT); + struct ice_vsig_prof *t; + enum ice_status status; + u16 idx; + + bitmap_zero(ptgs_used, ICE_XLT1_CNT); + idx = vsig & ICE_VSIG_IDX_M; + + /* Priority is based on the order in which the profiles are added. The + * newest added profile has highest priority and the oldest added + * profile has the lowest priority. Since the profile property list for + * a VSIG is sorted from newest to oldest, this code traverses the list + * in order and enables the first of each PTG that it finds (that is not + * already enabled); it also disables any duplicate PTGs that it finds + * in the older profiles (that are currently enabled). + */ + + list_for_each_entry(t, &hw->blk[blk].xlt2.vsig_tbl[idx].prop_lst, + list) { + u16 i; + + for (i = 0; i < t->tcam_count; i++) { + /* Scan the priorities from newest to oldest. + * Make sure that the newest profiles take priority. + */ + if (test_bit(t->tcam[i].ptg, ptgs_used) && + t->tcam[i].in_use) { + /* need to mark this PTG as never match, as it + * was already in use and therefore duplicate + * (and lower priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, false, + vsig, + &t->tcam[i], + chg); + if (status) + return status; + } else if (!test_bit(t->tcam[i].ptg, ptgs_used) && + !t->tcam[i].in_use) { + /* need to enable this PTG, as it in not in use + * and not enabled (highest priority) + */ + status = ice_prof_tcam_ena_dis(hw, blk, true, + vsig, + &t->tcam[i], + chg); + if (status) + return status; + } + + /* keep track of used ptgs */ + set_bit(t->tcam[i].ptg, ptgs_used); + } + } + + return 0; +} + +/** + * ice_add_prof_id_vsig - add profile to VSIG + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsig: the VSIG to which this profile is to be added + * @hdl: the profile handle indicating the profile to add + * @chg: the change list + */ +static enum ice_status +ice_add_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsig, u64 hdl, + struct list_head *chg) +{ + /* Masks that ignore flags */ + u8 vl_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + u8 dc_msk[ICE_TCAM_KEY_VAL_SZ] = { 0xFF, 0xFF, 0x00, 0x00, 0x00 }; + u8 nm_msk[ICE_TCAM_KEY_VAL_SZ] = { 0x00, 0x00, 0x00, 0x00, 0x00 }; + struct ice_prof_map *map; + struct ice_vsig_prof *t; + struct ice_chs_chg *p; + u16 i; + + /* Get the details on the profile specified by the handle ID */ + map = ice_search_prof_id(hw, blk, hdl); + if (!map) + return ICE_ERR_DOES_NOT_EXIST; + + /* Error, if this VSIG already has this profile */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) + return ICE_ERR_ALREADY_EXISTS; + + /* new VSIG profile structure */ + t = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*t), GFP_KERNEL); + if (!t) + return ICE_ERR_NO_MEMORY; + + t->profile_cookie = map->profile_cookie; + t->prof_id = map->prof_id; + t->tcam_count = map->ptg_cnt; + + /* create TCAM entries */ + for (i = 0; i < map->ptg_cnt; i++) { + enum ice_status status; + u16 tcam_idx; + + /* add TCAM to change list */ + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + goto err_ice_add_prof_id_vsig; + + /* allocate the TCAM entry index */ + status = ice_alloc_tcam_ent(hw, blk, &tcam_idx); + if (status) { + devm_kfree(ice_hw_to_dev(hw), p); + goto err_ice_add_prof_id_vsig; + } + + t->tcam[i].ptg = map->ptg[i]; + t->tcam[i].prof_id = map->prof_id; + t->tcam[i].tcam_idx = tcam_idx; + t->tcam[i].in_use = true; + + p->type = ICE_TCAM_ADD; + p->add_tcam_idx = true; + p->prof_id = t->tcam[i].prof_id; + p->ptg = t->tcam[i].ptg; + p->vsig = vsig; + p->tcam_idx = t->tcam[i].tcam_idx; + + /* write the TCAM entry */ + status = ice_tcam_write_entry(hw, blk, t->tcam[i].tcam_idx, + t->tcam[i].prof_id, + t->tcam[i].ptg, vsig, 0, 0, + vl_msk, dc_msk, nm_msk); + if (status) + goto err_ice_add_prof_id_vsig; + + /* log change */ + list_add(&p->list_entry, chg); + } + + /* add profile to VSIG */ + list_add(&t->list, + &hw->blk[blk].xlt2.vsig_tbl[(vsig & ICE_VSIG_IDX_M)].prop_lst); + + return 0; + +err_ice_add_prof_id_vsig: + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), t); + return ICE_ERR_NO_MEMORY; +} + +/** + * ice_create_prof_id_vsig - add a new VSIG with a single profile + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in VSIG + * @hdl: the profile handle of the profile that will be added to the VSIG + * @chg: the change list + */ +static enum ice_status +ice_create_prof_id_vsig(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl, + struct list_head *chg) +{ + enum ice_status status; + struct ice_chs_chg *p; + u16 new_vsig; + + p = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*p), GFP_KERNEL); + if (!p) + return ICE_ERR_NO_MEMORY; + + new_vsig = ice_vsig_alloc(hw, blk); + if (!new_vsig) { + status = ICE_ERR_HW_TABLE; + goto err_ice_create_prof_id_vsig; + } + + status = ice_move_vsi(hw, blk, vsi, new_vsig, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + status = ice_add_prof_id_vsig(hw, blk, new_vsig, hdl, chg); + if (status) + goto err_ice_create_prof_id_vsig; + + p->type = ICE_VSIG_ADD; + p->vsi = vsi; + p->orig_vsig = ICE_DEFAULT_VSIG; + p->vsig = new_vsig; + + list_add(&p->list_entry, chg); + + return 0; + +err_ice_create_prof_id_vsig: + /* let caller clean up the change list */ + devm_kfree(ice_hw_to_dev(hw), p); + return status; +} + +/** + * ice_create_vsig_from_lst - create a new VSIG with a list of profiles + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the initial VSI that will be in VSIG + * @lst: the list of profile that will be added to the VSIG + * @chg: the change list + */ +static enum ice_status +ice_create_vsig_from_lst(struct ice_hw *hw, enum ice_block blk, u16 vsi, + struct list_head *lst, struct list_head *chg) +{ + struct ice_vsig_prof *t; + enum ice_status status; + u16 vsig; + + vsig = ice_vsig_alloc(hw, blk); + if (!vsig) + return ICE_ERR_HW_TABLE; + + status = ice_move_vsi(hw, blk, vsi, vsig, chg); + if (status) + return status; + + list_for_each_entry(t, lst, list) { + status = ice_add_prof_id_vsig(hw, blk, vsig, t->profile_cookie, + chg); + if (status) + return status; + } + + return 0; +} + +/** + * ice_find_prof_vsig - find a VSIG with a specific profile handle + * @hw: pointer to the HW struct + * @blk: hardware block + * @hdl: the profile handle of the profile to search for + * @vsig: returns the VSIG with the matching profile + */ +static bool +ice_find_prof_vsig(struct ice_hw *hw, enum ice_block blk, u64 hdl, u16 *vsig) +{ + struct ice_vsig_prof *t; + enum ice_status status; + struct list_head lst; + + INIT_LIST_HEAD(&lst); + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return false; + + t->profile_cookie = hdl; + list_add(&t->list, &lst); + + status = ice_find_dup_props_vsig(hw, blk, &lst, vsig); + + list_del(&t->list); + kfree(t); + + return !status; +} + +/** + * ice_add_prof_id_flow - add profile flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI to enable with the profile specified by ID + * @hdl: profile handle + * + * Calling this function will update the hardware tables to enable the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be enabled. + */ +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) +{ + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head union_lst; + enum ice_status status; + struct list_head chg; + u16 vsig; + + INIT_LIST_HEAD(&union_lst); + INIT_LIST_HEAD(&chg); + + /* Get profile */ + status = ice_get_prof(hw, blk, hdl, &chg); + if (status) + return status; + + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool only_vsi; + u16 or_vsig; + u16 ref; + + /* found in VSIG */ + or_vsig = vsig; + + /* make sure that there is no overlap/conflict between the new + * characteristics and the existing ones; we don't support that + * scenario + */ + if (ice_has_prof_vsig(hw, blk, vsig, hdl)) { + status = ICE_ERR_ALREADY_EXISTS; + goto err_ice_add_prof_id_flow; + } + + /* last VSI in the VSIG? */ + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_add_prof_id_flow; + only_vsi = (ref == 1); + + /* create a union of the current profiles and the one being + * added + */ + status = ice_get_profs_vsig(hw, blk, vsig, &union_lst); + if (status) + goto err_ice_add_prof_id_flow; + + status = ice_add_prof_to_lst(hw, blk, &union_lst, hdl); + if (status) + goto err_ice_add_prof_id_flow; + + /* search for an existing VSIG with an exact charc match */ + status = ice_find_dup_props_vsig(hw, blk, &union_lst, &vsig); + if (!status) { + /* move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* VSI has been moved out of or_vsig. If the or_vsig had + * only that VSI it is now empty and can be removed. + */ + if (only_vsi) { + status = ice_rem_vsig(hw, blk, or_vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else if (only_vsi) { + /* If the original VSIG only contains one VSI, then it + * will be the requesting VSI. In this case the VSI is + * not sharing entries and we can simply add the new + * profile to the VSIG. + */ + status = ice_add_prof_id_vsig(hw, blk, vsig, hdl, &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* No match, so we need a new VSIG */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + &union_lst, &chg); + if (status) + goto err_ice_add_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } else { + /* need to find or add a VSIG */ + /* search for an existing VSIG with an exact charc match */ + if (ice_find_prof_vsig(hw, blk, hdl, &vsig)) { + /* found an exact match */ + /* add or move VSI to the VSIG that matches */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_add_prof_id_flow; + } else { + /* we did not find an exact match */ + /* we need to add a VSIG */ + status = ice_create_prof_id_vsig(hw, blk, vsi, hdl, + &chg); + if (status) + goto err_ice_add_prof_id_flow; + } + } + + /* update hardware */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_add_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + list_for_each_entry_safe(del1, tmp1, &union_lst, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } + + return status; +} + +/** + * ice_rem_prof_from_list - remove a profile from list + * @hw: pointer to the HW struct + * @lst: list to remove the profile from + * @hdl: the profile handle indicating the profile to remove + */ +static enum ice_status +ice_rem_prof_from_list(struct ice_hw *hw, struct list_head *lst, u64 hdl) +{ + struct ice_vsig_prof *ent, *tmp; + + list_for_each_entry_safe(ent, tmp, lst, list) + if (ent->profile_cookie == hdl) { + list_del(&ent->list); + devm_kfree(ice_hw_to_dev(hw), ent); + return 0; + } + + return ICE_ERR_DOES_NOT_EXIST; +} + +/** + * ice_rem_prof_id_flow - remove flow + * @hw: pointer to the HW struct + * @blk: hardware block + * @vsi: the VSI from which to remove the profile specified by ID + * @hdl: profile tracking handle + * + * Calling this function will update the hardware tables to remove the + * profile indicated by the ID parameter for the VSIs specified in the VSI + * array. Once successfully called, the flow will be disabled. + */ +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl) +{ + struct ice_vsig_prof *tmp1, *del1; + struct ice_chs_chg *tmp, *del; + struct list_head chg, copy; + enum ice_status status; + u16 vsig; + + INIT_LIST_HEAD(©); + INIT_LIST_HEAD(&chg); + + /* determine if VSI is already part of a VSIG */ + status = ice_vsig_find_vsi(hw, blk, vsi, &vsig); + if (!status && vsig) { + bool last_profile; + bool only_vsi; + u16 ref; + + /* found in VSIG */ + last_profile = ice_vsig_prof_id_count(hw, blk, vsig) == 1; + status = ice_vsig_get_ref(hw, blk, vsig, &ref); + if (status) + goto err_ice_rem_prof_id_flow; + only_vsi = (ref == 1); + + if (only_vsi) { + /* If the original VSIG only contains one reference, + * which will be the requesting VSI, then the VSI is not + * sharing entries and we can simply remove the specific + * characteristics from the VSIG. + */ + + if (last_profile) { + /* If there are no profiles left for this VSIG, + * then simply remove the the VSIG. + */ + status = ice_rem_vsig(hw, blk, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + status = ice_rem_prof_id_vsig(hw, blk, vsig, + hdl, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + + } else { + /* Make a copy of the VSIG's list of Profiles */ + status = ice_get_profs_vsig(hw, blk, vsig, ©); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Remove specified profile entry from the list */ + status = ice_rem_prof_from_list(hw, ©, hdl); + if (status) + goto err_ice_rem_prof_id_flow; + + if (list_empty(©)) { + status = ice_move_vsi(hw, blk, vsi, + ICE_DEFAULT_VSIG, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + } else if (!ice_find_dup_props_vsig(hw, blk, ©, + &vsig)) { + /* found an exact match */ + /* add or move VSI to the VSIG that matches */ + /* Search for a VSIG with a matching profile + * list + */ + + /* Found match, move VSI to the matching VSIG */ + status = ice_move_vsi(hw, blk, vsi, vsig, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } else { + /* since no existing VSIG supports this + * characteristic pattern, we need to create a + * new VSIG and TCAM entries + */ + status = ice_create_vsig_from_lst(hw, blk, vsi, + ©, &chg); + if (status) + goto err_ice_rem_prof_id_flow; + + /* Adjust priorities */ + status = ice_adj_prof_priorities(hw, blk, vsig, + &chg); + if (status) + goto err_ice_rem_prof_id_flow; + } + } + } else { + status = ICE_ERR_DOES_NOT_EXIST; + } + + /* update hardware tables */ + if (!status) + status = ice_upd_prof_hw(hw, blk, &chg); + +err_ice_rem_prof_id_flow: + list_for_each_entry_safe(del, tmp, &chg, list_entry) { + list_del(&del->list_entry); + devm_kfree(ice_hw_to_dev(hw), del); + } + + list_for_each_entry_safe(del1, tmp1, ©, list) { + list_del(&del1->list); + devm_kfree(ice_hw_to_dev(hw), del1); + } + + return status; +} diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h index 37eb282742d1..c7b5e1a6ea2b 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.h @@ -18,6 +18,13 @@ #define ICE_PKG_CNT 4 +enum ice_status +ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], + struct ice_fv_word *es); +enum ice_status +ice_add_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); +enum ice_status +ice_rem_prof_id_flow(struct ice_hw *hw, enum ice_block blk, u16 vsi, u64 hdl); enum ice_status ice_init_pkg(struct ice_hw *hw, u8 *buff, u32 len); enum ice_status ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, u32 len); @@ -26,4 +33,6 @@ void ice_free_seg(struct ice_hw *hw); void ice_fill_blk_tbls(struct ice_hw *hw); void ice_clear_hw_tbls(struct ice_hw *hw); void ice_free_hw_tbls(struct ice_hw *hw); +enum ice_status +ice_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 id); #endif /* _ICE_FLEX_PIPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flex_type.h b/drivers/net/ethernet/intel/ice/ice_flex_type.h index 5d5a7eaffa30..0fb3fe3ff3ea 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_type.h +++ b/drivers/net/ethernet/intel/ice/ice_flex_type.h @@ -3,6 +3,9 @@ #ifndef _ICE_FLEX_TYPE_H_ #define _ICE_FLEX_TYPE_H_ + +#define ICE_FV_OFFSET_INVAL 0x1FF + /* Extraction Sequence (Field Vector) Table */ struct ice_fv_word { u8 prot_id; @@ -105,37 +108,57 @@ struct ice_buf_hdr { sizeof(struct ice_buf_hdr) - (hd_sz)) / (ent_sz)) /* ice package section IDs */ +#define ICE_SID_XLT0_SW 10 +#define ICE_SID_XLT_KEY_BUILDER_SW 11 #define ICE_SID_XLT1_SW 12 #define ICE_SID_XLT2_SW 13 #define ICE_SID_PROFID_TCAM_SW 14 #define ICE_SID_PROFID_REDIR_SW 15 #define ICE_SID_FLD_VEC_SW 16 +#define ICE_SID_CDID_KEY_BUILDER_SW 17 +#define ICE_SID_CDID_REDIR_SW 18 +#define ICE_SID_XLT0_ACL 20 +#define ICE_SID_XLT_KEY_BUILDER_ACL 21 #define ICE_SID_XLT1_ACL 22 #define ICE_SID_XLT2_ACL 23 #define ICE_SID_PROFID_TCAM_ACL 24 #define ICE_SID_PROFID_REDIR_ACL 25 #define ICE_SID_FLD_VEC_ACL 26 +#define ICE_SID_CDID_KEY_BUILDER_ACL 27 +#define ICE_SID_CDID_REDIR_ACL 28 +#define ICE_SID_XLT0_FD 30 +#define ICE_SID_XLT_KEY_BUILDER_FD 31 #define ICE_SID_XLT1_FD 32 #define ICE_SID_XLT2_FD 33 #define ICE_SID_PROFID_TCAM_FD 34 #define ICE_SID_PROFID_REDIR_FD 35 #define ICE_SID_FLD_VEC_FD 36 +#define ICE_SID_CDID_KEY_BUILDER_FD 37 +#define ICE_SID_CDID_REDIR_FD 38 +#define ICE_SID_XLT0_RSS 40 +#define ICE_SID_XLT_KEY_BUILDER_RSS 41 #define ICE_SID_XLT1_RSS 42 #define ICE_SID_XLT2_RSS 43 #define ICE_SID_PROFID_TCAM_RSS 44 #define ICE_SID_PROFID_REDIR_RSS 45 #define ICE_SID_FLD_VEC_RSS 46 +#define ICE_SID_CDID_KEY_BUILDER_RSS 47 +#define ICE_SID_CDID_REDIR_RSS 48 #define ICE_SID_RXPARSER_BOOST_TCAM 56 +#define ICE_SID_XLT0_PE 80 +#define ICE_SID_XLT_KEY_BUILDER_PE 81 #define ICE_SID_XLT1_PE 82 #define ICE_SID_XLT2_PE 83 #define ICE_SID_PROFID_TCAM_PE 84 #define ICE_SID_PROFID_REDIR_PE 85 #define ICE_SID_FLD_VEC_PE 86 +#define ICE_SID_CDID_KEY_BUILDER_PE 87 +#define ICE_SID_CDID_REDIR_PE 88 /* Label Metadata section IDs */ #define ICE_SID_LBL_FIRST 0x80000010 @@ -152,6 +175,19 @@ enum ice_block { ICE_BLK_COUNT }; +enum ice_sect { + ICE_XLT0 = 0, + ICE_XLT_KB, + ICE_XLT1, + ICE_XLT2, + ICE_PROF_TCAM, + ICE_PROF_REDIR, + ICE_VEC_TBL, + ICE_CDID_KB, + ICE_CDID_REDIR, + ICE_SECT_COUNT +}; + /* package labels */ struct ice_label { __le16 value; @@ -234,6 +270,13 @@ struct ice_prof_redir_section { u8 redir_value[1]; }; +/* package buffer building */ + +struct ice_buf_build { + struct ice_buf buf; + u16 reserved_section_table_entries; +}; + struct ice_pkg_enum { struct ice_buf_table *buf_table; u32 buf_idx; @@ -248,6 +291,12 @@ struct ice_pkg_enum { void *(*handler)(u32 sect_type, void *section, u32 index, u32 *offset); }; +struct ice_pkg_es { + __le16 count; + __le16 offset; + struct ice_fv_word es[1]; +}; + struct ice_es { u32 sid; u16 count; @@ -280,6 +329,35 @@ struct ice_ptg_ptype { u8 ptg; }; +#define ICE_MAX_TCAM_PER_PROFILE 32 +#define ICE_MAX_PTG_PER_PROFILE 32 + +struct ice_prof_map { + struct list_head list; + u64 profile_cookie; + u64 context; + u8 prof_id; + u8 ptg_cnt; + u8 ptg[ICE_MAX_PTG_PER_PROFILE]; +}; + +#define ICE_INVALID_TCAM 0xFFFF + +struct ice_tcam_inf { + u16 tcam_idx; + u8 ptg; + u8 prof_id; + u8 in_use; +}; + +struct ice_vsig_prof { + struct list_head list; + u64 profile_cookie; + u8 prof_id; + u8 tcam_count; + struct ice_tcam_inf tcam[ICE_MAX_TCAM_PER_PROFILE]; +}; + struct ice_vsig_entry { struct list_head prop_lst; struct ice_vsig_vsi *first_vsi; @@ -329,6 +407,13 @@ struct ice_xlt2 { u16 count; }; +/* Profile ID Management */ +struct ice_prof_id_key { + __le16 flags; + u8 xlt1; + __le16 xlt2_cdid; +} __packed; + /* Keys are made up of two values, each one-half the size of the key. * For TCAM, the entire key is 80 bits wide (or 2, 40-bit wide values) */ @@ -371,4 +456,31 @@ struct ice_blk_info { u8 is_list_init; }; +enum ice_chg_type { + ICE_TCAM_NONE = 0, + ICE_PTG_ES_ADD, + ICE_TCAM_ADD, + ICE_VSIG_ADD, + ICE_VSIG_REM, + ICE_VSI_MOVE, +}; + +struct ice_chs_chg { + struct list_head list_entry; + enum ice_chg_type type; + + u8 add_ptg; + u8 add_vsig; + u8 add_tcam_idx; + u8 add_prof; + u16 ptype; + u8 ptg; + u8 prof_id; + u16 vsi; + u16 vsig; + u16 orig_vsig; + u16 tcam_idx; +}; + +#define ICE_FLOW_PTYPE_MAX ICE_XLT1_CNT #endif /* _ICE_FLEX_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_flow.c b/drivers/net/ethernet/intel/ice/ice_flow.c new file mode 100644 index 000000000000..a05ceb59863b --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.c @@ -0,0 +1,1275 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019, Intel Corporation. */ + +#include "ice_common.h" +#include "ice_flow.h" + +/* Describe properties of a protocol header field */ +struct ice_flow_field_info { + enum ice_flow_seg_hdr hdr; + s16 off; /* Offset from start of a protocol header, in bits */ + u16 size; /* Size of fields in bits */ +}; + +#define ICE_FLOW_FLD_INFO(_hdr, _offset_bytes, _size_bytes) { \ + .hdr = _hdr, \ + .off = (_offset_bytes) * BITS_PER_BYTE, \ + .size = (_size_bytes) * BITS_PER_BYTE, \ +} + +/* Table containing properties of supported protocol header fields */ +static const +struct ice_flow_field_info ice_flds_info[ICE_FLOW_FIELD_IDX_MAX] = { + /* IPv4 / IPv6 */ + /* ICE_FLOW_FIELD_IDX_IPV4_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 12, sizeof(struct in_addr)), + /* ICE_FLOW_FIELD_IDX_IPV4_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV4, 16, sizeof(struct in_addr)), + /* ICE_FLOW_FIELD_IDX_IPV6_SA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 8, sizeof(struct in6_addr)), + /* ICE_FLOW_FIELD_IDX_IPV6_DA */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_IPV6, 24, sizeof(struct in6_addr)), + /* Transport */ + /* ICE_FLOW_FIELD_IDX_TCP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 0, sizeof(__be16)), + /* ICE_FLOW_FIELD_IDX_TCP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_TCP, 2, sizeof(__be16)), + /* ICE_FLOW_FIELD_IDX_UDP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 0, sizeof(__be16)), + /* ICE_FLOW_FIELD_IDX_UDP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_UDP, 2, sizeof(__be16)), + /* ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 0, sizeof(__be16)), + /* ICE_FLOW_FIELD_IDX_SCTP_DST_PORT */ + ICE_FLOW_FLD_INFO(ICE_FLOW_SEG_HDR_SCTP, 2, sizeof(__be16)), + +}; + +/* Bitmaps indicating relevant packet types for a particular protocol header + * + * Packet types for packets with an Outer/First/Single IPv4 header + */ +static const u32 ice_ptypes_ipv4_ofos[] = { + 0x1DC00000, 0x04000800, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv4 header */ +static const u32 ice_ptypes_ipv4_il[] = { + 0xE0000000, 0xB807700E, 0x80000003, 0xE01DC03B, + 0x0000000E, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Outer/First/Single IPv6 header */ +static const u32 ice_ptypes_ipv6_ofos[] = { + 0x00000000, 0x00000000, 0x77000000, 0x10002000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last IPv6 header */ +static const u32 ice_ptypes_ipv6_il[] = { + 0x00000000, 0x03B80770, 0x000001DC, 0x0EE00000, + 0x00000770, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* UDP Packet types for non-tunneled packets or tunneled + * packets with inner UDP. + */ +static const u32 ice_ptypes_udp_il[] = { + 0x81000000, 0x20204040, 0x04000010, 0x80810102, + 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last TCP header */ +static const u32 ice_ptypes_tcp_il[] = { + 0x04000000, 0x80810102, 0x10000040, 0x02040408, + 0x00000102, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Packet types for packets with an Innermost/Last SCTP header */ +static const u32 ice_ptypes_sctp_il[] = { + 0x08000000, 0x01020204, 0x20000081, 0x04080810, + 0x00000204, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +/* Manage parameters and info. used during the creation of a flow profile */ +struct ice_flow_prof_params { + enum ice_block blk; + u16 entry_length; /* # of bytes formatted entry will require */ + u8 es_cnt; + struct ice_flow_prof *prof; + + /* For ACL, the es[0] will have the data of ICE_RX_MDID_PKT_FLAGS_15_0 + * This will give us the direction flags. + */ + struct ice_fv_word es[ICE_MAX_FV_WORDS]; + DECLARE_BITMAP(ptypes, ICE_FLOW_PTYPE_MAX); +}; + +#define ICE_FLOW_SEG_HDRS_L3_MASK \ + (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6) +#define ICE_FLOW_SEG_HDRS_L4_MASK \ + (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP) + +/** + * ice_flow_val_hdrs - validates packet segments for valid protocol headers + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + */ +static enum ice_status +ice_flow_val_hdrs(struct ice_flow_seg_info *segs, u8 segs_cnt) +{ + u8 i; + + for (i = 0; i < segs_cnt; i++) { + /* Multiple L3 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L3_MASK)) + return ICE_ERR_PARAM; + + /* Multiple L4 headers */ + if (segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK && + !is_power_of_2(segs[i].hdrs & ICE_FLOW_SEG_HDRS_L4_MASK)) + return ICE_ERR_PARAM; + } + + return 0; +} + +/** + * ice_flow_proc_seg_hdrs - process protocol headers present in pkt segments + * @params: information about the flow to be processed + * + * This function identifies the packet types associated with the protocol + * headers being present in packet segments of the specified flow profile. + */ +static enum ice_status +ice_flow_proc_seg_hdrs(struct ice_flow_prof_params *params) +{ + struct ice_flow_prof *prof; + u8 i; + + memset(params->ptypes, 0xff, sizeof(params->ptypes)); + + prof = params->prof; + + for (i = 0; i < params->prof->segs_cnt; i++) { + const unsigned long *src; + u32 hdrs; + + hdrs = prof->segs[i].hdrs; + + if (hdrs & ICE_FLOW_SEG_HDR_IPV4) { + src = !i ? (const unsigned long *)ice_ptypes_ipv4_ofos : + (const unsigned long *)ice_ptypes_ipv4_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_IPV6) { + src = !i ? (const unsigned long *)ice_ptypes_ipv6_ofos : + (const unsigned long *)ice_ptypes_ipv6_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + + if (hdrs & ICE_FLOW_SEG_HDR_UDP) { + src = (const unsigned long *)ice_ptypes_udp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_TCP) { + bitmap_and(params->ptypes, params->ptypes, + (const unsigned long *)ice_ptypes_tcp_il, + ICE_FLOW_PTYPE_MAX); + } else if (hdrs & ICE_FLOW_SEG_HDR_SCTP) { + src = (const unsigned long *)ice_ptypes_sctp_il; + bitmap_and(params->ptypes, params->ptypes, src, + ICE_FLOW_PTYPE_MAX); + } + } + + return 0; +} + +/** + * ice_flow_xtract_fld - Create an extraction sequence entry for the given field + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * @seg: packet segment index of the field to be extracted + * @fld: ID of field to be extracted + * + * This function determines the protocol ID, offset, and size of the given + * field. It then allocates one or more extraction sequence entries for the + * given field, and fill the entries with protocol ID and offset information. + */ +static enum ice_status +ice_flow_xtract_fld(struct ice_hw *hw, struct ice_flow_prof_params *params, + u8 seg, enum ice_flow_field fld) +{ + enum ice_prot_id prot_id = ICE_PROT_ID_INVAL; + u8 fv_words = hw->blk[params->blk].es.fvw; + struct ice_flow_fld_info *flds; + u16 cnt, ese_bits, i; + u16 off; + + flds = params->prof->segs[seg].fields; + + switch (fld) { + case ICE_FLOW_FIELD_IDX_IPV4_SA: + case ICE_FLOW_FIELD_IDX_IPV4_DA: + prot_id = seg == 0 ? ICE_PROT_IPV4_OF_OR_S : ICE_PROT_IPV4_IL; + break; + case ICE_FLOW_FIELD_IDX_IPV6_SA: + case ICE_FLOW_FIELD_IDX_IPV6_DA: + prot_id = seg == 0 ? ICE_PROT_IPV6_OF_OR_S : ICE_PROT_IPV6_IL; + break; + case ICE_FLOW_FIELD_IDX_TCP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_TCP_DST_PORT: + prot_id = ICE_PROT_TCP_IL; + break; + case ICE_FLOW_FIELD_IDX_UDP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_UDP_DST_PORT: + prot_id = ICE_PROT_UDP_IL_OR_S; + break; + case ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT: + case ICE_FLOW_FIELD_IDX_SCTP_DST_PORT: + prot_id = ICE_PROT_SCTP_IL; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + /* Each extraction sequence entry is a word in size, and extracts a + * word-aligned offset from a protocol header. + */ + ese_bits = ICE_FLOW_FV_EXTRACT_SZ * BITS_PER_BYTE; + + flds[fld].xtrct.prot_id = prot_id; + flds[fld].xtrct.off = (ice_flds_info[fld].off / ese_bits) * + ICE_FLOW_FV_EXTRACT_SZ; + flds[fld].xtrct.disp = (u8)(ice_flds_info[fld].off % ese_bits); + flds[fld].xtrct.idx = params->es_cnt; + + /* Adjust the next field-entry index after accommodating the number of + * entries this field consumes + */ + cnt = DIV_ROUND_UP(flds[fld].xtrct.disp + ice_flds_info[fld].size, + ese_bits); + + /* Fill in the extraction sequence entries needed for this field */ + off = flds[fld].xtrct.off; + for (i = 0; i < cnt; i++) { + u8 idx; + + /* Make sure the number of extraction sequence required + * does not exceed the block's capability + */ + if (params->es_cnt >= fv_words) + return ICE_ERR_MAX_LIMIT; + + /* some blocks require a reversed field vector layout */ + if (hw->blk[params->blk].es.reverse) + idx = fv_words - params->es_cnt - 1; + else + idx = params->es_cnt; + + params->es[idx].prot_id = prot_id; + params->es[idx].off = off; + params->es_cnt++; + + off += ICE_FLOW_FV_EXTRACT_SZ; + } + + return 0; +} + +/** + * ice_flow_create_xtrct_seq - Create an extraction sequence for given segments + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + * + * This function iterates through all matched fields in the given segments, and + * creates an extraction sequence for the fields. + */ +static enum ice_status +ice_flow_create_xtrct_seq(struct ice_hw *hw, + struct ice_flow_prof_params *params) +{ + struct ice_flow_prof *prof = params->prof; + enum ice_status status = 0; + u8 i; + + for (i = 0; i < prof->segs_cnt; i++) { + u8 j; + + for_each_set_bit(j, (unsigned long *)&prof->segs[i].match, + ICE_FLOW_FIELD_IDX_MAX) { + status = ice_flow_xtract_fld(hw, params, i, + (enum ice_flow_field)j); + if (status) + return status; + } + } + + return status; +} + +/** + * ice_flow_proc_segs - process all packet segments associated with a profile + * @hw: pointer to the HW struct + * @params: information about the flow to be processed + */ +static enum ice_status +ice_flow_proc_segs(struct ice_hw *hw, struct ice_flow_prof_params *params) +{ + enum ice_status status; + + status = ice_flow_proc_seg_hdrs(params); + if (status) + return status; + + status = ice_flow_create_xtrct_seq(hw, params); + if (status) + return status; + + switch (params->blk) { + case ICE_BLK_RSS: + /* Only header information is provided for RSS configuration. + * No further processing is needed. + */ + status = 0; + break; + default: + return ICE_ERR_NOT_IMPL; + } + + return status; +} + +#define ICE_FLOW_FIND_PROF_CHK_FLDS 0x00000001 +#define ICE_FLOW_FIND_PROF_CHK_VSI 0x00000002 +#define ICE_FLOW_FIND_PROF_NOT_CHK_DIR 0x00000004 + +/** + * ice_flow_find_prof_conds - Find a profile matching headers and conditions + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @vsi_handle: software VSI handle to check VSI (ICE_FLOW_FIND_PROF_CHK_VSI) + * @conds: additional conditions to be checked (ICE_FLOW_FIND_PROF_CHK_*) + */ +static struct ice_flow_prof * +ice_flow_find_prof_conds(struct ice_hw *hw, enum ice_block blk, + enum ice_flow_dir dir, struct ice_flow_seg_info *segs, + u8 segs_cnt, u16 vsi_handle, u32 conds) +{ + struct ice_flow_prof *p, *prof = NULL; + + mutex_lock(&hw->fl_profs_locks[blk]); + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if ((p->dir == dir || conds & ICE_FLOW_FIND_PROF_NOT_CHK_DIR) && + segs_cnt && segs_cnt == p->segs_cnt) { + u8 i; + + /* Check for profile-VSI association if specified */ + if ((conds & ICE_FLOW_FIND_PROF_CHK_VSI) && + ice_is_vsi_valid(hw, vsi_handle) && + !test_bit(vsi_handle, p->vsis)) + continue; + + /* Protocol headers must be checked. Matched fields are + * checked if specified. + */ + for (i = 0; i < segs_cnt; i++) + if (segs[i].hdrs != p->segs[i].hdrs || + ((conds & ICE_FLOW_FIND_PROF_CHK_FLDS) && + segs[i].match != p->segs[i].match)) + break; + + /* A match is found if all segments are matched */ + if (i == segs_cnt) { + prof = p; + break; + } + } + mutex_unlock(&hw->fl_profs_locks[blk]); + + return prof; +} + +/** + * ice_flow_find_prof_id - Look up a profile with given profile ID + * @hw: pointer to the HW struct + * @blk: classification stage + * @prof_id: unique ID to identify this flow profile + */ +static struct ice_flow_prof * +ice_flow_find_prof_id(struct ice_hw *hw, enum ice_block blk, u64 prof_id) +{ + struct ice_flow_prof *p; + + list_for_each_entry(p, &hw->fl_profs[blk], l_entry) + if (p->id == prof_id) + return p; + + return NULL; +} + +/** + * ice_flow_add_prof_sync - Add a flow profile for packet segments and fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @prof_id: unique ID to identify this flow profile + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @prof: stores the returned flow profile added + * + * Assumption: the caller has acquired the lock to the profile list + */ +static enum ice_status +ice_flow_add_prof_sync(struct ice_hw *hw, enum ice_block blk, + enum ice_flow_dir dir, u64 prof_id, + struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_prof **prof) +{ + struct ice_flow_prof_params params; + enum ice_status status; + u8 i; + + if (!prof) + return ICE_ERR_BAD_PTR; + + memset(¶ms, 0, sizeof(params)); + params.prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*params.prof), + GFP_KERNEL); + if (!params.prof) + return ICE_ERR_NO_MEMORY; + + /* initialize extraction sequence to all invalid (0xff) */ + for (i = 0; i < ICE_MAX_FV_WORDS; i++) { + params.es[i].prot_id = ICE_PROT_INVALID; + params.es[i].off = ICE_FV_OFFSET_INVAL; + } + + params.blk = blk; + params.prof->id = prof_id; + params.prof->dir = dir; + params.prof->segs_cnt = segs_cnt; + + /* Make a copy of the segments that need to be persistent in the flow + * profile instance + */ + for (i = 0; i < segs_cnt; i++) + memcpy(¶ms.prof->segs[i], &segs[i], sizeof(*segs)); + + status = ice_flow_proc_segs(hw, ¶ms); + if (status) { + ice_debug(hw, ICE_DBG_FLOW, + "Error processing a flow's packet segments\n"); + goto out; + } + + /* Add a HW profile for this flow profile */ + status = ice_add_prof(hw, blk, prof_id, (u8 *)params.ptypes, params.es); + if (status) { + ice_debug(hw, ICE_DBG_FLOW, "Error adding a HW flow profile\n"); + goto out; + } + + INIT_LIST_HEAD(¶ms.prof->entries); + mutex_init(¶ms.prof->entries_lock); + *prof = params.prof; + +out: + if (status) + devm_kfree(ice_hw_to_dev(hw), params.prof); + + return status; +} + +/** + * ice_flow_rem_prof_sync - remove a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile to remove + * + * Assumption: the caller has acquired the lock to the profile list + */ +static enum ice_status +ice_flow_rem_prof_sync(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof) +{ + enum ice_status status; + + /* Remove all hardware profiles associated with this flow profile */ + status = ice_rem_prof(hw, blk, prof->id); + if (!status) { + list_del(&prof->l_entry); + mutex_destroy(&prof->entries_lock); + devm_kfree(ice_hw_to_dev(hw), prof); + } + + return status; +} + +/** + * ice_flow_assoc_prof - associate a VSI with a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_assoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (!test_bit(vsi_handle, prof->vsis)) { + status = ice_add_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + set_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, + "HW profile add failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_disassoc_prof - disassociate a VSI from a flow profile + * @hw: pointer to the hardware structure + * @blk: classification stage + * @prof: pointer to flow profile + * @vsi_handle: software VSI handle + * + * Assumption: the caller has acquired the lock to the profile list + * and the software VSI handle has been validated + */ +static enum ice_status +ice_flow_disassoc_prof(struct ice_hw *hw, enum ice_block blk, + struct ice_flow_prof *prof, u16 vsi_handle) +{ + enum ice_status status = 0; + + if (test_bit(vsi_handle, prof->vsis)) { + status = ice_rem_prof_id_flow(hw, blk, + ice_get_hw_vsi_num(hw, + vsi_handle), + prof->id); + if (!status) + clear_bit(vsi_handle, prof->vsis); + else + ice_debug(hw, ICE_DBG_FLOW, + "HW profile remove failed, %d\n", + status); + } + + return status; +} + +/** + * ice_flow_add_prof - Add a flow profile for packet segments and matched fields + * @hw: pointer to the HW struct + * @blk: classification stage + * @dir: flow direction + * @prof_id: unique ID to identify this flow profile + * @segs: array of one or more packet segments that describe the flow + * @segs_cnt: number of packet segments provided + * @prof: stores the returned flow profile added + */ +static enum ice_status +ice_flow_add_prof(struct ice_hw *hw, enum ice_block blk, enum ice_flow_dir dir, + u64 prof_id, struct ice_flow_seg_info *segs, u8 segs_cnt, + struct ice_flow_prof **prof) +{ + enum ice_status status; + + if (segs_cnt > ICE_FLOW_SEG_MAX) + return ICE_ERR_MAX_LIMIT; + + if (!segs_cnt) + return ICE_ERR_PARAM; + + if (!segs) + return ICE_ERR_BAD_PTR; + + status = ice_flow_val_hdrs(segs, segs_cnt); + if (status) + return status; + + mutex_lock(&hw->fl_profs_locks[blk]); + + status = ice_flow_add_prof_sync(hw, blk, dir, prof_id, segs, segs_cnt, + prof); + if (!status) + list_add(&(*prof)->l_entry, &hw->fl_profs[blk]); + + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_rem_prof - Remove a flow profile and all entries associated with it + * @hw: pointer to the HW struct + * @blk: the block for which the flow profile is to be removed + * @prof_id: unique ID of the flow profile to be removed + */ +static enum ice_status +ice_flow_rem_prof(struct ice_hw *hw, enum ice_block blk, u64 prof_id) +{ + struct ice_flow_prof *prof; + enum ice_status status; + + mutex_lock(&hw->fl_profs_locks[blk]); + + prof = ice_flow_find_prof_id(hw, blk, prof_id); + if (!prof) { + status = ICE_ERR_DOES_NOT_EXIST; + goto out; + } + + /* prof becomes invalid after the call */ + status = ice_flow_rem_prof_sync(hw, blk, prof); + +out: + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_flow_set_fld_ext - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @type: type of the field + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * + * This helper function stores information of a field being matched, including + * the type of the field and the locations of the value to match, the mask, and + * and the upper-bound value in the start of the input buffer for a flow entry. + * This function should only be used for fixed-size data structures. + * + * This function also opportunistically determines the protocol headers to be + * present based on the fields being set. Some fields cannot be used alone to + * determine the protocol headers present. Sometimes, fields for particular + * protocol headers are not matched. In those cases, the protocol headers + * must be explicitly set. + */ +static void +ice_flow_set_fld_ext(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + enum ice_flow_fld_match_type type, u16 val_loc, + u16 mask_loc, u16 last_loc) +{ + u64 bit = BIT_ULL(fld); + + seg->match |= bit; + if (type == ICE_FLOW_FLD_TYPE_RANGE) + seg->range |= bit; + + seg->fields[fld].type = type; + seg->fields[fld].src.val = val_loc; + seg->fields[fld].src.mask = mask_loc; + seg->fields[fld].src.last = last_loc; + + ICE_FLOW_SET_HDRS(seg, ice_flds_info[fld].hdr); +} + +/** + * ice_flow_set_fld - specifies locations of field from entry's input buffer + * @seg: packet segment the field being set belongs to + * @fld: field to be set + * @val_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of the value to match from + * entry's input buffer + * @mask_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of mask value from entry's + * input buffer + * @last_loc: if not ICE_FLOW_FLD_OFF_INVAL, location of last/upper value from + * entry's input buffer + * @range: indicate if field being matched is to be in a range + * + * This function specifies the locations, in the form of byte offsets from the + * start of the input buffer for a flow entry, from where the value to match, + * the mask value, and upper value can be extracted. These locations are then + * stored in the flow profile. When adding a flow entry associated with the + * flow profile, these locations will be used to quickly extract the values and + * create the content of a match entry. This function should only be used for + * fixed-size data structures. + */ +static void +ice_flow_set_fld(struct ice_flow_seg_info *seg, enum ice_flow_field fld, + u16 val_loc, u16 mask_loc, u16 last_loc, bool range) +{ + enum ice_flow_fld_match_type t = range ? + ICE_FLOW_FLD_TYPE_RANGE : ICE_FLOW_FLD_TYPE_REG; + + ice_flow_set_fld_ext(seg, fld, t, val_loc, mask_loc, last_loc); +} + +#define ICE_FLOW_RSS_SEG_HDR_L3_MASKS \ + (ICE_FLOW_SEG_HDR_IPV4 | ICE_FLOW_SEG_HDR_IPV6) + +#define ICE_FLOW_RSS_SEG_HDR_L4_MASKS \ + (ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_SCTP) + +#define ICE_FLOW_RSS_SEG_HDR_VAL_MASKS \ + (ICE_FLOW_RSS_SEG_HDR_L3_MASKS | \ + ICE_FLOW_RSS_SEG_HDR_L4_MASKS) + +/** + * ice_flow_set_rss_seg_info - setup packet segments for RSS + * @segs: pointer to the flow field segment(s) + * @hash_fields: fields to be hashed on for the segment(s) + * @flow_hdr: protocol header fields within a packet segment + * + * Helper function to extract fields from hash bitmap and use flow + * header value to set flow field segment for further use in flow + * profile entry or removal. + */ +static enum ice_status +ice_flow_set_rss_seg_info(struct ice_flow_seg_info *segs, u64 hash_fields, + u32 flow_hdr) +{ + u64 val; + u8 i; + + for_each_set_bit(i, (unsigned long *)&hash_fields, + ICE_FLOW_FIELD_IDX_MAX) + ice_flow_set_fld(segs, (enum ice_flow_field)i, + ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + + ICE_FLOW_SET_HDRS(segs, flow_hdr); + + if (segs->hdrs & ~ICE_FLOW_RSS_SEG_HDR_VAL_MASKS) + return ICE_ERR_PARAM; + + val = (u64)(segs->hdrs & ICE_FLOW_RSS_SEG_HDR_L3_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + val = (u64)(segs->hdrs & ICE_FLOW_RSS_SEG_HDR_L4_MASKS); + if (val && !is_power_of_2(val)) + return ICE_ERR_CFG; + + return 0; +} + +/** + * ice_rem_vsi_rss_list - remove VSI from RSS list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * Remove the VSI from all RSS configurations in the list. + */ +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle) +{ + struct ice_rss_cfg *r, *tmp; + + if (list_empty(&hw->rss_list_head)) + return; + + mutex_lock(&hw->rss_locks); + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (test_and_clear_bit(vsi_handle, r->vsis)) + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + mutex_unlock(&hw->rss_locks); +} + +/** + * ice_rem_vsi_rss_cfg - remove RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * + * This function will iterate through all flow profiles and disassociate + * the VSI from that profile. If the flow profile has no VSIs it will + * be removed. + */ +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *p, *t; + enum ice_status status = 0; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + if (list_empty(&hw->fl_profs[blk])) + return 0; + + mutex_lock(&hw->fl_profs_locks[blk]); + list_for_each_entry_safe(p, t, &hw->fl_profs[blk], l_entry) + if (test_bit(vsi_handle, p->vsis)) { + status = ice_flow_disassoc_prof(hw, blk, p, vsi_handle); + if (status) + break; + + if (bitmap_empty(p->vsis, ICE_MAX_VSI)) { + status = ice_flow_rem_prof_sync(hw, blk, p); + if (status) + break; + } + } + mutex_unlock(&hw->fl_profs_locks[blk]); + + return status; +} + +/** + * ice_rem_rss_list - remove RSS configuration from list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static void +ice_rem_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + struct ice_rss_cfg *r, *tmp; + + /* Search for RSS hash fields associated to the VSI that match the + * hash configurations associated to the flow profile. If found + * remove from the RSS entry list of the VSI context and delete entry. + */ + list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry) + if (r->hashed_flds == prof->segs[prof->segs_cnt - 1].match && + r->packet_hdr == prof->segs[prof->segs_cnt - 1].hdrs) { + clear_bit(vsi_handle, r->vsis); + if (bitmap_empty(r->vsis, ICE_MAX_VSI)) { + list_del(&r->l_entry); + devm_kfree(ice_hw_to_dev(hw), r); + } + return; + } +} + +/** + * ice_add_rss_list - add RSS configuration to list + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @prof: pointer to flow profile + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof) +{ + struct ice_rss_cfg *r, *rss_cfg; + + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (r->hashed_flds == prof->segs[prof->segs_cnt - 1].match && + r->packet_hdr == prof->segs[prof->segs_cnt - 1].hdrs) { + set_bit(vsi_handle, r->vsis); + return 0; + } + + rss_cfg = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*rss_cfg), + GFP_KERNEL); + if (!rss_cfg) + return ICE_ERR_NO_MEMORY; + + rss_cfg->hashed_flds = prof->segs[prof->segs_cnt - 1].match; + rss_cfg->packet_hdr = prof->segs[prof->segs_cnt - 1].hdrs; + set_bit(vsi_handle, rss_cfg->vsis); + + list_add_tail(&rss_cfg->l_entry, &hw->rss_list_head); + + return 0; +} + +#define ICE_FLOW_PROF_HASH_S 0 +#define ICE_FLOW_PROF_HASH_M (0xFFFFFFFFULL << ICE_FLOW_PROF_HASH_S) +#define ICE_FLOW_PROF_HDR_S 32 +#define ICE_FLOW_PROF_HDR_M (0x3FFFFFFFULL << ICE_FLOW_PROF_HDR_S) +#define ICE_FLOW_PROF_ENCAP_S 63 +#define ICE_FLOW_PROF_ENCAP_M (BIT_ULL(ICE_FLOW_PROF_ENCAP_S)) + +#define ICE_RSS_OUTER_HEADERS 1 + +/* Flow profile ID format: + * [0:31] - Packet match fields + * [32:62] - Protocol header + * [63] - Encapsulation flag, 0 if non-tunneled, 1 if tunneled + */ +#define ICE_FLOW_GEN_PROFID(hash, hdr, segs_cnt) \ + (u64)(((u64)(hash) & ICE_FLOW_PROF_HASH_M) | \ + (((u64)(hdr) << ICE_FLOW_PROF_HDR_S) & ICE_FLOW_PROF_HDR_M) | \ + ((u8)((segs_cnt) - 1) ? ICE_FLOW_PROF_ENCAP_M : 0)) + +/** + * ice_add_rss_cfg_sync - add an RSS configuration + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @hashed_flds: hash bit fields (ICE_FLOW_HASH_*) to configure + * @addl_hdrs: protocol header fields + * @segs_cnt: packet segment count + * + * Assumption: lock has already been acquired for RSS list + */ +static enum ice_status +ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds, + u32 addl_hdrs, u8 segs_cnt) +{ + const enum ice_block blk = ICE_BLK_RSS; + struct ice_flow_prof *prof = NULL; + struct ice_flow_seg_info *segs; + enum ice_status status; + + if (!segs_cnt || segs_cnt > ICE_FLOW_SEG_MAX) + return ICE_ERR_PARAM; + + segs = kcalloc(segs_cnt, sizeof(*segs), GFP_KERNEL); + if (!segs) + return ICE_ERR_NO_MEMORY; + + /* Construct the packet segment info from the hashed fields */ + status = ice_flow_set_rss_seg_info(&segs[segs_cnt - 1], hashed_flds, + addl_hdrs); + if (status) + goto exit; + + /* Search for a flow profile that has matching headers, hash fields + * and has the input VSI associated to it. If found, no further + * operations required and exit. + */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS | + ICE_FLOW_FIND_PROF_CHK_VSI); + if (prof) + goto exit; + + /* Check if a flow profile exists with the same protocol headers and + * associated with the input VSI. If so disassociate the VSI from + * this profile. The VSI will be added to a new profile created with + * the protocol header and new hash field configuration. + */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, ICE_FLOW_FIND_PROF_CHK_VSI); + if (prof) { + status = ice_flow_disassoc_prof(hw, blk, prof, vsi_handle); + if (!status) + ice_rem_rss_list(hw, vsi_handle, prof); + else + goto exit; + + /* Remove profile if it has no VSIs associated */ + if (bitmap_empty(prof->vsis, ICE_MAX_VSI)) { + status = ice_flow_rem_prof(hw, blk, prof->id); + if (status) + goto exit; + } + } + + /* Search for a profile that has same match fields only. If this + * exists then associate the VSI to this profile. + */ + prof = ice_flow_find_prof_conds(hw, blk, ICE_FLOW_RX, segs, segs_cnt, + vsi_handle, + ICE_FLOW_FIND_PROF_CHK_FLDS); + if (prof) { + status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle); + if (!status) + status = ice_add_rss_list(hw, vsi_handle, prof); + goto exit; + } + + /* Create a new flow profile with generated profile and packet + * segment information. + */ + status = ice_flow_add_prof(hw, blk, ICE_FLOW_RX, + ICE_FLOW_GEN_PROFID(hashed_flds, + segs[segs_cnt - 1].hdrs, + segs_cnt), + segs, segs_cnt, &prof); + if (status) + goto exit; + + status = ice_flow_assoc_prof(hw, blk, prof, vsi_handle); + /* If association to a new flow profile failed then this profile can + * be removed. + */ + if (status) { + ice_flow_rem_prof(hw, blk, prof->id); + goto exit; + } + + status = ice_add_rss_list(hw, vsi_handle, prof); + +exit: + kfree(segs); + return status; +} + +/** + * ice_add_rss_cfg - add an RSS configuration with specified hashed fields + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @hashed_flds: hash bit fields (ICE_FLOW_HASH_*) to configure + * @addl_hdrs: protocol header fields + * + * This function will generate a flow profile based on fields associated with + * the input fields to hash on, the flow type and use the VSI number to add + * a flow entry to the profile. + */ +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds, + u32 addl_hdrs) +{ + enum ice_status status; + + if (hashed_flds == ICE_HASH_INVALID || + !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + status = ice_add_rss_cfg_sync(hw, vsi_handle, hashed_flds, addl_hdrs, + ICE_RSS_OUTER_HEADERS); + mutex_unlock(&hw->rss_locks); + + return status; +} + +/* Mapping of AVF hash bit fields to an L3-L4 hash combination. + * As the ice_flow_avf_hdr_field represent individual bit shifts in a hash, + * convert its values to their appropriate flow L3, L4 values. + */ +#define ICE_FLOW_AVF_RSS_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4)) +#define ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP)) +#define ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS | \ + ICE_FLOW_AVF_RSS_IPV4_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) + +#define ICE_FLOW_AVF_RSS_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6)) +#define ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP)) +#define ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS \ + (BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP)) +#define ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS \ + (ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS | \ + ICE_FLOW_AVF_RSS_IPV6_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) + +/** + * ice_add_avf_rss_cfg - add an RSS configuration for AVF driver + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @avf_hash: hash bit fields (ICE_AVF_FLOW_FIELD_*) to configure + * + * This function will take the hash bitmap provided by the AVF driver via a + * message, convert it to ICE-compatible values, and configure RSS flow + * profiles. + */ +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 avf_hash) +{ + enum ice_status status = 0; + u64 hash_flds; + + if (avf_hash == ICE_AVF_FLOW_FIELD_INVALID || + !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + /* Make sure no unsupported bits are specified */ + if (avf_hash & ~(ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS | + ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS)) + return ICE_ERR_CFG; + + hash_flds = avf_hash; + + /* Always create an L3 RSS configuration for any L4 RSS configuration */ + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV4_MASKS; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) + hash_flds |= ICE_FLOW_AVF_RSS_IPV6_MASKS; + + /* Create the corresponding RSS configuration for each valid hash bit */ + while (hash_flds) { + u64 rss_hash = ICE_HASH_INVALID; + + if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV4 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP); + } + } else if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) { + if (hash_flds & ICE_FLOW_AVF_RSS_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6; + hash_flds &= ~ICE_FLOW_AVF_RSS_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_TCP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS; + } else if (hash_flds & + ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_UDP_PORT; + hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS; + } else if (hash_flds & + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) { + rss_hash = ICE_FLOW_HASH_IPV6 | + ICE_FLOW_HASH_SCTP_PORT; + hash_flds &= + ~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP); + } + } + + if (rss_hash == ICE_HASH_INVALID) + return ICE_ERR_OUT_OF_RANGE; + + status = ice_add_rss_cfg(hw, vsi_handle, rss_hash, + ICE_FLOW_SEG_HDR_NONE); + if (status) + break; + } + + return status; +} + +/** + * ice_replay_rss_cfg - replay RSS configurations associated with VSI + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + */ +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle) +{ + enum ice_status status = 0; + struct ice_rss_cfg *r; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) { + if (test_bit(vsi_handle, r->vsis)) { + status = ice_add_rss_cfg_sync(hw, vsi_handle, + r->hashed_flds, + r->packet_hdr, + ICE_RSS_OUTER_HEADERS); + if (status) + break; + } + } + mutex_unlock(&hw->rss_locks); + + return status; +} + +/** + * ice_get_rss_cfg - returns hashed fields for the given header types + * @hw: pointer to the hardware structure + * @vsi_handle: software VSI handle + * @hdrs: protocol header type + * + * This function will return the match fields of the first instance of flow + * profile having the given header types and containing input VSI + */ +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs) +{ + struct ice_rss_cfg *r, *rss_cfg = NULL; + + /* verify if the protocol header is non zero and VSI is valid */ + if (hdrs == ICE_FLOW_SEG_HDR_NONE || !ice_is_vsi_valid(hw, vsi_handle)) + return ICE_HASH_INVALID; + + mutex_lock(&hw->rss_locks); + list_for_each_entry(r, &hw->rss_list_head, l_entry) + if (test_bit(vsi_handle, r->vsis) && + r->packet_hdr == hdrs) { + rss_cfg = r; + break; + } + mutex_unlock(&hw->rss_locks); + + return rss_cfg ? rss_cfg->hashed_flds : ICE_HASH_INVALID; +} diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h new file mode 100644 index 000000000000..5558627bd5eb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_flow.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019, Intel Corporation. */ + +#ifndef _ICE_FLOW_H_ +#define _ICE_FLOW_H_ + +#define ICE_FLOW_ENTRY_HANDLE_INVAL 0 +#define ICE_FLOW_FLD_OFF_INVAL 0xffff + +/* Generate flow hash field from flow field type(s) */ +#define ICE_FLOW_HASH_IPV4 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)) +#define ICE_FLOW_HASH_IPV6 \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)) +#define ICE_FLOW_HASH_TCP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)) +#define ICE_FLOW_HASH_UDP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)) +#define ICE_FLOW_HASH_SCTP_PORT \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)) + +#define ICE_HASH_INVALID 0 +#define ICE_HASH_TCP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_TCP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_TCP_PORT) +#define ICE_HASH_UDP_IPV4 (ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_UDP_PORT) +#define ICE_HASH_UDP_IPV6 (ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_UDP_PORT) + +/* Protocol header fields within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization such as GRE, + * VxLAN, etc. + */ +enum ice_flow_seg_hdr { + ICE_FLOW_SEG_HDR_NONE = 0x00000000, + ICE_FLOW_SEG_HDR_IPV4 = 0x00000004, + ICE_FLOW_SEG_HDR_IPV6 = 0x00000008, + ICE_FLOW_SEG_HDR_TCP = 0x00000040, + ICE_FLOW_SEG_HDR_UDP = 0x00000080, + ICE_FLOW_SEG_HDR_SCTP = 0x00000100, +}; + +enum ice_flow_field { + /* L3 */ + ICE_FLOW_FIELD_IDX_IPV4_SA, + ICE_FLOW_FIELD_IDX_IPV4_DA, + ICE_FLOW_FIELD_IDX_IPV6_SA, + ICE_FLOW_FIELD_IDX_IPV6_DA, + /* L4 */ + ICE_FLOW_FIELD_IDX_TCP_SRC_PORT, + ICE_FLOW_FIELD_IDX_TCP_DST_PORT, + ICE_FLOW_FIELD_IDX_UDP_SRC_PORT, + ICE_FLOW_FIELD_IDX_UDP_DST_PORT, + ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT, + ICE_FLOW_FIELD_IDX_SCTP_DST_PORT, + /* The total number of enums must not exceed 64 */ + ICE_FLOW_FIELD_IDX_MAX +}; + +/* Flow headers and fields for AVF support */ +enum ice_flow_avf_hdr_field { + /* Values 0 - 28 are reserved for future use */ + ICE_AVF_FLOW_FIELD_INVALID = 0, + ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP = 29, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_UDP, + ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV4_TCP, + ICE_AVF_FLOW_FIELD_IPV4_SCTP, + ICE_AVF_FLOW_FIELD_IPV4_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV4, + /* Values 37-38 are reserved */ + ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP = 39, + ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_UDP, + ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK, + ICE_AVF_FLOW_FIELD_IPV6_TCP, + ICE_AVF_FLOW_FIELD_IPV6_SCTP, + ICE_AVF_FLOW_FIELD_IPV6_OTHER, + ICE_AVF_FLOW_FIELD_FRAG_IPV6, + ICE_AVF_FLOW_FIELD_RSVD47, + ICE_AVF_FLOW_FIELD_FCOE_OX, + ICE_AVF_FLOW_FIELD_FCOE_RX, + ICE_AVF_FLOW_FIELD_FCOE_OTHER, + /* Values 51-62 are reserved */ + ICE_AVF_FLOW_FIELD_L2_PAYLOAD = 63, + ICE_AVF_FLOW_FIELD_MAX +}; + +/* Supported RSS offloads This macro is defined to support + * VIRTCHNL_OP_GET_RSS_HENA_CAPS ops. PF driver sends the RSS hardware + * capabilities to the caller of this ops. + */ +#define ICE_DEFAULT_RSS_HENA ( \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \ + BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP)) + +enum ice_flow_dir { + ICE_FLOW_RX = 0x02, +}; + +enum ice_flow_priority { + ICE_FLOW_PRIO_LOW, + ICE_FLOW_PRIO_NORMAL, + ICE_FLOW_PRIO_HIGH +}; + +#define ICE_FLOW_SEG_MAX 2 +#define ICE_FLOW_FV_EXTRACT_SZ 2 + +#define ICE_FLOW_SET_HDRS(seg, val) ((seg)->hdrs |= (u32)(val)) + +struct ice_flow_seg_xtrct { + u8 prot_id; /* Protocol ID of extracted header field */ + u16 off; /* Starting offset of the field in header in bytes */ + u8 idx; /* Index of FV entry used */ + u8 disp; /* Displacement of field in bits fr. FV entry's start */ +}; + +enum ice_flow_fld_match_type { + ICE_FLOW_FLD_TYPE_REG, /* Value, mask */ + ICE_FLOW_FLD_TYPE_RANGE, /* Value, mask, last (upper bound) */ + ICE_FLOW_FLD_TYPE_PREFIX, /* IP address, prefix, size of prefix */ + ICE_FLOW_FLD_TYPE_SIZE, /* Value, mask, size of match */ +}; + +struct ice_flow_fld_loc { + /* Describe offsets of field information relative to the beginning of + * input buffer provided when adding flow entries. + */ + u16 val; /* Offset where the value is located */ + u16 mask; /* Offset where the mask/prefix value is located */ + u16 last; /* Length or offset where the upper value is located */ +}; + +struct ice_flow_fld_info { + enum ice_flow_fld_match_type type; + /* Location where to retrieve data from an input buffer */ + struct ice_flow_fld_loc src; + /* Location where to put the data into the final entry buffer */ + struct ice_flow_fld_loc entry; + struct ice_flow_seg_xtrct xtrct; +}; + +struct ice_flow_seg_info { + u32 hdrs; /* Bitmask indicating protocol headers present */ + u64 match; /* Bitmask indicating header fields to be matched */ + u64 range; /* Bitmask indicating header fields matched as ranges */ + + struct ice_flow_fld_info fields[ICE_FLOW_FIELD_IDX_MAX]; +}; + +struct ice_flow_prof { + struct list_head l_entry; + + u64 id; + enum ice_flow_dir dir; + u8 segs_cnt; + + /* Keep track of flow entries associated with this flow profile */ + struct mutex entries_lock; + struct list_head entries; + + struct ice_flow_seg_info segs[ICE_FLOW_SEG_MAX]; + + /* software VSI handles referenced by this flow profile */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); +}; + +struct ice_rss_cfg { + struct list_head l_entry; + /* bitmap of VSIs added to the RSS entry */ + DECLARE_BITMAP(vsis, ICE_MAX_VSI); + u64 hashed_flds; + u32 packet_hdr; +}; + +enum ice_status ice_flow_rem_entry(struct ice_hw *hw, u64 entry_h); +void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle); +enum ice_status ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds); +enum ice_status ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle); +enum ice_status +ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds, + u32 addl_hdrs); +u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs); +#endif /* _ICE_FLOW_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index e8f32350fed2..f2cababf2561 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -60,15 +60,6 @@ #define PRTDCB_GENS_DCBX_STATUS_M ICE_M(0x7, 0) #define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4)) #define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4)) -#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256)) -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S 0 -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M ICE_M(0x3F, 0) -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S 8 -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M ICE_M(0x3F, 8) -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S 16 -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M ICE_M(0x3F, 16) -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S 24 -#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M ICE_M(0x3F, 24) #define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045c800 + ((_i) * 4)) #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M ICE_M(0xFF, 0) diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 0997d352709b..878e125d8b42 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -199,6 +199,14 @@ enum ice_rxdid { /* Receive Flex Descriptor Rx opcode values */ #define ICE_RX_OPC_MDID 0x01 +/* Receive Descriptor MDID values that access packet flags */ +enum ice_flex_mdid_pkt_flags { + ICE_RX_MDID_PKT_FLAGS_15_0 = 20, + ICE_RX_MDID_PKT_FLAGS_31_16, + ICE_RX_MDID_PKT_FLAGS_47_32, + ICE_RX_MDID_PKT_FLAGS_63_48, +}; + /* Receive Descriptor MDID values */ enum ice_flex_rx_mdid { ICE_RX_MDID_FLOW_ID_LOWER = 5, diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index e7449248fab4..1874c9f51a32 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3,6 +3,7 @@ #include "ice.h" #include "ice_base.h" +#include "ice_flow.h" #include "ice_lib.h" #include "ice_dcb_lib.h" @@ -493,7 +494,28 @@ bool ice_is_safe_mode(struct ice_pf *pf) } /** - * ice_rss_clean - Delete RSS related VSI structures that hold user inputs + * ice_vsi_clean_rss_flow_fld - Delete RSS configuration + * @vsi: the VSI being cleaned up + * + * This function deletes RSS input set for all flows that were configured + * for this VSI + */ +static void ice_vsi_clean_rss_flow_fld(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + + if (ice_is_safe_mode(pf)) + return; + + status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx); + if (status) + dev_dbg(ice_pf_to_dev(pf), "ice_rem_vsi_rss_cfg failed for vsi = %d, error = %d\n", + vsi->vsi_num, status); +} + +/** + * ice_rss_clean - Delete RSS related VSI structures and configuration * @vsi: the VSI being removed */ static void ice_rss_clean(struct ice_vsi *vsi) @@ -507,6 +529,11 @@ static void ice_rss_clean(struct ice_vsi *vsi) devm_kfree(dev, vsi->rss_hkey_user); if (vsi->rss_lut_user) devm_kfree(dev, vsi->rss_lut_user); + + ice_vsi_clean_rss_flow_fld(vsi); + /* remove RSS replay list */ + if (!ice_is_safe_mode(pf)) + ice_rem_vsi_rss_list(&pf->hw, vsi->idx); } /** @@ -817,12 +844,23 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi) ctxt->info.valid_sections |= cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); - /* Enable MAC Antispoof with new VSI being initialized or updated */ - if (vsi->type == ICE_VSI_VF && pf->vf[vsi->vf_id].spoofchk) { + /* enable/disable MAC and VLAN anti-spoof when spoofchk is on/off + * respectively + */ + if (vsi->type == ICE_VSI_VF) { ctxt->info.valid_sections |= cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); - ctxt->info.sec_flags |= - ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; + if (pf->vf[vsi->vf_id].spoofchk) { + ctxt->info.sec_flags |= + ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + } else { + ctxt->info.sec_flags &= + ~(ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S)); + } } /* Allow control frames out of main VSI */ @@ -1076,6 +1114,115 @@ ice_vsi_cfg_rss_exit: } /** + * ice_vsi_set_vf_rss_flow_fld - Sets VF VSI RSS input set for different flows + * @vsi: VSI to be configured + * + * This function will only be called during the VF VSI setup. Upon successful + * completion of package download, this function will configure default RSS + * input sets for VF VSI. + */ +static void ice_vsi_set_vf_rss_flow_fld(struct ice_vsi *vsi) +{ + struct ice_pf *pf = vsi->back; + enum ice_status status; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi->vsi_num); + return; + } + + status = ice_add_avf_rss_cfg(&pf->hw, vsi->idx, ICE_DEFAULT_RSS_HENA); + if (status) + dev_dbg(dev, "ice_add_avf_rss_cfg failed for vsi = %d, error = %d\n", + vsi->vsi_num, status); +} + +/** + * ice_vsi_set_rss_flow_fld - Sets RSS input set for different flows + * @vsi: VSI to be configured + * + * This function will only be called after successful download package call + * during initialization of PF. Since the downloaded package will erase the + * RSS section, this function will configure RSS input sets for different + * flow types. The last profile added has the highest priority, therefore 2 + * tuple profiles (i.e. IPv4 src/dst) are added before 4 tuple profiles + * (i.e. IPv4 src/dst TCP src/dst port). + */ +static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi) +{ + u16 vsi_handle = vsi->idx, vsi_num = vsi->vsi_num; + struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + enum ice_status status; + struct device *dev; + + dev = ice_pf_to_dev(pf); + if (ice_is_safe_mode(pf)) { + dev_dbg(dev, "Advanced RSS disabled. Package download failed, vsi num = %d\n", + vsi_num); + return; + } + /* configure RSS for IPv4 with input set IP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV4, + ICE_FLOW_SEG_HDR_IPV4); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for ipv4 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for IPv6 with input set IPv6 src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV6, + ICE_FLOW_SEG_HDR_IPV6); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for ipv6 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for tcp4 with input set IP src/dst, TCP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_TCP_IPV4, + ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for tcp4 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for udp4 with input set IP src/dst, UDP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_UDP_IPV4, + ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for udp4 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for sctp4 with input set IP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV4, + ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for sctp4 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for tcp6 with input set IPv6 src/dst, TCP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_TCP_IPV6, + ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for tcp6 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for udp6 with input set IPv6 src/dst, UDP src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_UDP_IPV6, + ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for udp6 flow, vsi = %d, error = %d\n", + vsi_num, status); + + /* configure RSS for sctp6 with input set IPv6 src/dst */ + status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV6, + ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6); + if (status) + dev_dbg(dev, "ice_add_rss_cfg failed for sctp6 flow, vsi = %d, error = %d\n", + vsi_num, status); +} + +/** * ice_add_mac_to_list - Add a MAC address filter entry to the list * @vsi: the VSI to be forwarded to * @add_list: pointer to the list which contains MAC filter entries @@ -1636,22 +1783,14 @@ int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc) ctxt->info = vsi->info; - if (ena) { - ctxt->info.sec_flags |= - ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S; + if (ena) ctxt->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } else { - ctxt->info.sec_flags &= - ~(ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << - ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + else ctxt->info.sw_flags2 &= ~ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA; - } if (!vlan_promisc) ctxt->info.valid_sections = - cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID | - ICE_AQ_VSI_PROP_SW_VALID); + cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID); status = ice_update_vsi(&pf->hw, vsi->idx, ctxt, NULL); if (status) { @@ -1661,7 +1800,6 @@ int ice_cfg_vlan_pruning(struct ice_vsi *vsi, bool ena, bool vlan_promisc) goto err_out; } - vsi->info.sec_flags = ctxt->info.sec_flags; vsi->info.sw_flags2 = ctxt->info.sw_flags2; kfree(ctxt); @@ -1899,8 +2037,10 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * receive traffic on first queue. Hence no need to capture * return value */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_rss_flow_fld(vsi); + } break; case ICE_VSI_VF: /* VF driver will take care of creating netdev for this type and @@ -1924,8 +2064,10 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, * receive traffic on first queue. Hence no need to capture * return value */ - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) + if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) { ice_vsi_cfg_rss_lut_key(vsi); + ice_vsi_set_vf_rss_flow_fld(vsi); + } break; case ICE_VSI_LB: ret = ice_vsi_alloc_rings(vsi); @@ -2402,6 +2544,97 @@ int ice_vsi_release(struct ice_vsi *vsi) } /** + * ice_vsi_rebuild_update_coalesce - set coalesce for a q_vector + * @q_vector: pointer to q_vector which is being updated + * @coalesce: pointer to array of struct with stored coalesce + * + * Set coalesce param in q_vector and update these parameters in HW. + */ +static void +ice_vsi_rebuild_update_coalesce(struct ice_q_vector *q_vector, + struct ice_coalesce_stored *coalesce) +{ + struct ice_ring_container *rx_rc = &q_vector->rx; + struct ice_ring_container *tx_rc = &q_vector->tx; + struct ice_hw *hw = &q_vector->vsi->back->hw; + + tx_rc->itr_setting = coalesce->itr_tx; + rx_rc->itr_setting = coalesce->itr_rx; + + /* dynamic ITR values will be updated during Tx/Rx */ + if (!ITR_IS_DYNAMIC(tx_rc->itr_setting)) + wr32(hw, GLINT_ITR(tx_rc->itr_idx, q_vector->reg_idx), + ITR_REG_ALIGN(tx_rc->itr_setting) >> + ICE_ITR_GRAN_S); + if (!ITR_IS_DYNAMIC(rx_rc->itr_setting)) + wr32(hw, GLINT_ITR(rx_rc->itr_idx, q_vector->reg_idx), + ITR_REG_ALIGN(rx_rc->itr_setting) >> + ICE_ITR_GRAN_S); + + q_vector->intrl = coalesce->intrl; + wr32(hw, GLINT_RATE(q_vector->reg_idx), + ice_intrl_usec_to_reg(q_vector->intrl, hw->intrl_gran)); +} + +/** + * ice_vsi_rebuild_get_coalesce - get coalesce from all q_vectors + * @vsi: VSI connected with q_vectors + * @coalesce: array of struct with stored coalesce + * + * Returns array size. + */ +static int +ice_vsi_rebuild_get_coalesce(struct ice_vsi *vsi, + struct ice_coalesce_stored *coalesce) +{ + int i; + + ice_for_each_q_vector(vsi, i) { + struct ice_q_vector *q_vector = vsi->q_vectors[i]; + + coalesce[i].itr_tx = q_vector->tx.itr_setting; + coalesce[i].itr_rx = q_vector->rx.itr_setting; + coalesce[i].intrl = q_vector->intrl; + } + + return vsi->num_q_vectors; +} + +/** + * ice_vsi_rebuild_set_coalesce - set coalesce from earlier saved arrays + * @vsi: VSI connected with q_vectors + * @coalesce: pointer to array of struct with stored coalesce + * @size: size of coalesce array + * + * Before this function, ice_vsi_rebuild_get_coalesce should be called to save + * ITR params in arrays. If size is 0 or coalesce wasn't stored set coalesce + * to default value. + */ +static void +ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, + struct ice_coalesce_stored *coalesce, int size) +{ + int i; + + if ((size && !coalesce) || !vsi) + return; + + for (i = 0; i < size && i < vsi->num_q_vectors; i++) + ice_vsi_rebuild_update_coalesce(vsi->q_vectors[i], + &coalesce[i]); + + for (; i < vsi->num_q_vectors; i++) { + struct ice_coalesce_stored coalesce_dflt = { + .itr_tx = ICE_DFLT_TX_ITR, + .itr_rx = ICE_DFLT_RX_ITR, + .intrl = 0 + }; + ice_vsi_rebuild_update_coalesce(vsi->q_vectors[i], + &coalesce_dflt); + } +} + +/** * ice_vsi_rebuild - Rebuild VSI after reset * @vsi: VSI to be rebuild * @init_vsi: is this an initialization or a reconfigure of the VSI @@ -2411,6 +2644,8 @@ int ice_vsi_release(struct ice_vsi *vsi) int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; + struct ice_coalesce_stored *coalesce; + int prev_num_q_vectors = 0; struct ice_vf *vf = NULL; enum ice_status status; struct ice_pf *pf; @@ -2423,6 +2658,11 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) if (vsi->type == ICE_VSI_VF) vf = &pf->vf[vsi->vf_id]; + coalesce = kcalloc(vsi->num_q_vectors, + sizeof(struct ice_coalesce_stored), GFP_KERNEL); + if (coalesce) + prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, + coalesce); ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx); ice_vsi_free_q_vectors(vsi); @@ -2535,6 +2775,9 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) return ice_schedule_reset(pf, ICE_RESET_PFR); } } + ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors); + kfree(coalesce); + return 0; err_vectors: @@ -2549,6 +2792,7 @@ err_rings: err_vsi: ice_vsi_clear(vsi); set_bit(__ICE_RESET_FAILED, pf->state); + kfree(coalesce); return ret; } @@ -2740,3 +2984,121 @@ cfg_mac_fltr_exit: ice_free_fltr_list(&vsi->back->pdev->dev, &tmp_add_list); return status; } + +/** + * ice_is_dflt_vsi_in_use - check if the default forwarding VSI is being used + * @sw: switch to check if its default forwarding VSI is free + * + * Return true if the default forwarding VSI is already being used, else returns + * false signalling that it's available to use. + */ +bool ice_is_dflt_vsi_in_use(struct ice_sw *sw) +{ + return (sw->dflt_vsi && sw->dflt_vsi_ena); +} + +/** + * ice_is_vsi_dflt_vsi - check if the VSI passed in is the default VSI + * @sw: switch for the default forwarding VSI to compare against + * @vsi: VSI to compare against default forwarding VSI + * + * If this VSI passed in is the default forwarding VSI then return true, else + * return false + */ +bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) +{ + return (sw->dflt_vsi == vsi && sw->dflt_vsi_ena); +} + +/** + * ice_set_dflt_vsi - set the default forwarding VSI + * @sw: switch used to assign the default forwarding VSI + * @vsi: VSI getting set as the default forwarding VSI on the switch + * + * If the VSI passed in is already the default VSI and it's enabled just return + * success. + * + * If there is already a default VSI on the switch and it's enabled then return + * -EEXIST since there can only be one default VSI per switch. + * + * Otherwise try to set the VSI passed in as the switch's default VSI and + * return the result. + */ +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) +{ + enum ice_status status; + struct device *dev; + + if (!sw || !vsi) + return -EINVAL; + + dev = ice_pf_to_dev(vsi->back); + + /* the VSI passed in is already the default VSI */ + if (ice_is_vsi_dflt_vsi(sw, vsi)) { + dev_dbg(dev, "VSI %d passed in is already the default forwarding VSI, nothing to do\n", + vsi->vsi_num); + return 0; + } + + /* another VSI is already the default VSI for this switch */ + if (ice_is_dflt_vsi_in_use(sw)) { + dev_err(dev, + "Default forwarding VSI %d already in use, disable it and try again\n", + sw->dflt_vsi->vsi_num); + return -EEXIST; + } + + status = ice_cfg_dflt_vsi(&vsi->back->hw, vsi->idx, true, ICE_FLTR_RX); + if (status) { + dev_err(dev, + "Failed to set VSI %d as the default forwarding VSI, error %d\n", + vsi->vsi_num, status); + return -EIO; + } + + sw->dflt_vsi = vsi; + sw->dflt_vsi_ena = true; + + return 0; +} + +/** + * ice_clear_dflt_vsi - clear the default forwarding VSI + * @sw: switch used to clear the default VSI + * + * If the switch has no default VSI or it's not enabled then return error. + * + * Otherwise try to clear the default VSI and return the result. + */ +int ice_clear_dflt_vsi(struct ice_sw *sw) +{ + struct ice_vsi *dflt_vsi; + enum ice_status status; + struct device *dev; + + if (!sw) + return -EINVAL; + + dev = ice_pf_to_dev(sw->pf); + + dflt_vsi = sw->dflt_vsi; + + /* there is no default VSI configured */ + if (!ice_is_dflt_vsi_in_use(sw)) + return -ENODEV; + + status = ice_cfg_dflt_vsi(&dflt_vsi->back->hw, dflt_vsi->idx, false, + ICE_FLTR_RX); + if (status) { + dev_err(dev, + "Failed to clear the default forwarding VSI %d, error %d\n", + dflt_vsi->vsi_num, status); + return -EIO; + } + + sw->dflt_vsi = NULL; + sw->dflt_vsi_ena = false; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index 6e31e30aba39..68fd0d4505c2 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -103,4 +103,12 @@ enum ice_status ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set); bool ice_is_safe_mode(struct ice_pf *pf); + +bool ice_is_dflt_vsi_in_use(struct ice_sw *sw); + +bool ice_is_vsi_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); + +int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi); + +int ice_clear_dflt_vsi(struct ice_sw *sw); #endif /* !_ICE_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4d5220c9c721..5ae671609f98 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -13,7 +13,7 @@ #define DRV_VERSION_MAJOR 0 #define DRV_VERSION_MINOR 8 -#define DRV_VERSION_BUILD 1 +#define DRV_VERSION_BUILD 2 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ __stringify(DRV_VERSION_MINOR) "." \ @@ -379,25 +379,29 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) clear_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags); if (vsi->current_netdev_flags & IFF_PROMISC) { /* Apply Rx filter rule to get traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, true, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error setting default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags &= ~IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (!ice_is_dflt_vsi_in_use(pf->first_sw)) { + err = ice_set_dflt_vsi(pf->first_sw, vsi); + if (err && err != -EEXIST) { + netdev_err(netdev, + "Error %d setting default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags &= + ~IFF_PROMISC; + goto out_promisc; + } } } else { /* Clear Rx filter to remove traffic from wire */ - status = ice_cfg_dflt_vsi(hw, vsi->idx, false, - ICE_FLTR_RX); - if (status) { - netdev_err(netdev, "Error clearing default VSI %i Rx rule\n", - vsi->vsi_num); - vsi->current_netdev_flags |= IFF_PROMISC; - err = -EIO; - goto out_promisc; + if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) { + err = ice_clear_dflt_vsi(pf->first_sw); + if (err) { + netdev_err(netdev, + "Error %d clearing default VSI %i Rx rule\n", + err, vsi->vsi_num); + vsi->current_netdev_flags |= + IFF_PROMISC; + goto out_promisc; + } } } } @@ -472,7 +476,7 @@ ice_prepare_for_reset(struct ice_pf *pf) ice_vc_notify_reset(pf); /* Disable VFs until reset is completed */ - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) ice_set_vf_state_qs_dis(&pf->vf[i]); /* clear SW filtering DB */ @@ -840,8 +844,7 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, ice_vsi_link_event(vsi, link_up); ice_print_link_msg(vsi, link_up); - if (pf->num_alloc_vfs) - ice_vc_notify_link_state(pf); + ice_vc_notify_link_state(pf); return result; } @@ -1291,7 +1294,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf) } /* check to see if one of the VFs caused the MDD */ - for (i = 0; i < pf->num_alloc_vfs; i++) { + ice_for_each_vf(pf, i) { struct ice_vf *vf = &pf->vf[i]; bool vf_mdd_detected = false; @@ -2330,7 +2333,8 @@ static void ice_set_netdev_features(struct net_device *netdev) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; - tso_features = NETIF_F_TSO; + tso_features = NETIF_F_TSO | + NETIF_F_GSO_UDP_L4; /* set features that user can change */ netdev->hw_features = dflt_features | csumo_features | @@ -3568,6 +3572,15 @@ static const struct pci_device_id ice_pci_tbl[] = { { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_BACKPLANE), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_QSFP), 0 }, { PCI_VDEVICE(INTEL, ICE_DEV_ID_E810C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822C_SGMII), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822X_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_E822L_SGMII), 0 }, /* required last entry */ { 0, } }; @@ -4670,6 +4683,13 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) goto err_init_ctrlq; } + if (pf->first_sw->dflt_vsi_ena) + dev_info(dev, + "Clearing default VSI, re-enable after reset completes\n"); + /* clear the default VSI configuration if it exists */ + pf->first_sw->dflt_vsi = NULL; + pf->first_sw->dflt_vsi_ena = false; + ice_clear_pxe_mode(hw); ret = ice_get_caps(hw); @@ -4825,7 +4845,7 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) } } - netdev_info(netdev, "changed MTU to %d\n", new_mtu); + netdev_dbg(netdev, "changed MTU to %d\n", new_mtu); return 0; } @@ -5066,36 +5086,17 @@ static void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct ice_ring *tx_ring = NULL; struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - int hung_queue = -1; u32 i; pf->tx_timeout_count++; - /* find the stopped queue the same way dev_watchdog() does */ - for (i = 0; i < netdev->num_tx_queues; i++) { - unsigned long trans_start; - struct netdev_queue *q; - - q = netdev_get_tx_queue(netdev, i); - trans_start = q->trans_start; - if (netif_xmit_stopped(q) && - time_after(jiffies, - trans_start + netdev->watchdog_timeo)) { - hung_queue = i; - break; - } - } - - if (i == netdev->num_tx_queues) - netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); - else - /* now that we have an index, find the tx_ring struct */ - for (i = 0; i < vsi->num_txq; i++) - if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) - if (hung_queue == vsi->tx_rings[i]->q_index) { - tx_ring = vsi->tx_rings[i]; - break; - } + /* now that we have an index, find the tx_ring struct */ + for (i = 0; i < vsi->num_txq; i++) + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) + if (txqueue == vsi->tx_rings[i]->q_index) { + tx_ring = vsi->tx_rings[i]; + break; + } /* Reset recovery level if enough time has elapsed after last timeout. * Also ensure no new reset action happens before next timeout period. @@ -5110,19 +5111,19 @@ static void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct ice_hw *hw = &pf->hw; u32 head, val = 0; - head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[hung_queue])) & + head = (rd32(hw, QTX_COMM_HEAD(vsi->txq_map[txqueue])) & QTX_COMM_HEAD_HEAD_M) >> QTX_COMM_HEAD_HEAD_S; /* Read interrupt register */ val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %d, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", - vsi->vsi_num, hung_queue, tx_ring->next_to_clean, + vsi->vsi_num, txqueue, tx_ring->next_to_clean, head, tx_ring->next_to_use, val); } pf->tx_timeout_last_recovery = jiffies; - netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n", - pf->tx_timeout_recovery_level, hung_queue); + netdev_info(netdev, "tx_timeout recovery level %d, txqueue %d\n", + pf->tx_timeout_recovery_level, txqueue); switch (pf->tx_timeout_recovery_level) { case 1: diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index 57c73f613f32..7525ac50742e 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -289,6 +289,18 @@ enum ice_status ice_init_nvm(struct ice_hw *hw) nvm->eetrack = (eetrack_hi << 16) | eetrack_lo; + /* the following devices do not have boot_cfg_tlv yet */ + if (hw->device_id == ICE_DEV_ID_E822C_BACKPLANE || + hw->device_id == ICE_DEV_ID_E822C_QSFP || + hw->device_id == ICE_DEV_ID_E822C_10G_BASE_T || + hw->device_id == ICE_DEV_ID_E822C_SGMII || + hw->device_id == ICE_DEV_ID_E822C_SFP || + hw->device_id == ICE_DEV_ID_E822X_BACKPLANE || + hw->device_id == ICE_DEV_ID_E822L_SFP || + hw->device_id == ICE_DEV_ID_E822L_10G_BASE_T || + hw->device_id == ICE_DEV_ID_E822L_SGMII) + return status; + status = ice_get_pfa_module_tlv(hw, &boot_cfg_tlv, &boot_cfg_tlv_len, ICE_SR_BOOT_CFG_PTR); if (status) { diff --git a/drivers/net/ethernet/intel/ice/ice_protocol_type.h b/drivers/net/ethernet/intel/ice/ice_protocol_type.h new file mode 100644 index 000000000000..71647566964e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_protocol_type.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019, Intel Corporation. */ + +#ifndef _ICE_PROTOCOL_TYPE_H_ +#define _ICE_PROTOCOL_TYPE_H_ +/* Decoders for ice_prot_id: + * - F: First + * - I: Inner + * - L: Last + * - O: Outer + * - S: Single + */ +enum ice_prot_id { + ICE_PROT_ID_INVAL = 0, + ICE_PROT_IPV4_OF_OR_S = 32, + ICE_PROT_IPV4_IL = 33, + ICE_PROT_IPV6_OF_OR_S = 40, + ICE_PROT_IPV6_IL = 41, + ICE_PROT_TCP_IL = 49, + ICE_PROT_UDP_IL_OR_S = 53, + ICE_PROT_SCTP_IL = 96, + ICE_PROT_META_ID = 255, /* when offset == metadata */ + ICE_PROT_INVALID = 255 /* when offset == ICE_FV_OFFSET_INVAL */ +}; +#endif /* _ICE_PROTOCOL_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_status.h b/drivers/net/ethernet/intel/ice/ice_status.h index c01597885629..a9a8bc3aca42 100644 --- a/drivers/net/ethernet/intel/ice/ice_status.h +++ b/drivers/net/ethernet/intel/ice/ice_status.h @@ -26,6 +26,7 @@ enum ice_status { ICE_ERR_IN_USE = -16, ICE_ERR_MAX_LIMIT = -17, ICE_ERR_RESET_ONGOING = -18, + ICE_ERR_HW_TABLE = -19, ICE_ERR_NVM_CHECKSUM = -51, ICE_ERR_BUF_TOO_SHORT = -52, ICE_ERR_NVM_BLANK_MODE = -53, diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index b5a53f862a83..431266081a80 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -50,42 +50,6 @@ static const u8 dummy_eth_header[DUMMY_ETH_HDR_LEN] = { 0x2, 0, 0, 0, 0, 0, ((n) * sizeof(((struct ice_sw_rule_vsi_list *)0)->vsi))) /** - * ice_aq_alloc_free_res - command to allocate/free resources - * @hw: pointer to the HW struct - * @num_entries: number of resource entries in buffer - * @buf: Indirect buffer to hold data parameters and response - * @buf_size: size of buffer for indirect commands - * @opc: pass in the command opcode - * @cd: pointer to command details structure or NULL - * - * Helper function to allocate/free resources using the admin queue commands - */ -static enum ice_status -ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries, - struct ice_aqc_alloc_free_res_elem *buf, u16 buf_size, - enum ice_adminq_opc opc, struct ice_sq_cd *cd) -{ - struct ice_aqc_alloc_free_res_cmd *cmd; - struct ice_aq_desc desc; - - cmd = &desc.params.sw_res_ctrl; - - if (!buf) - return ICE_ERR_PARAM; - - if (buf_size < (num_entries * sizeof(buf->elem[0]))) - return ICE_ERR_PARAM; - - ice_fill_dflt_direct_cmd_desc(&desc, opc); - - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); - - cmd->num_entries = cpu_to_le16(num_entries); - - return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); -} - -/** * ice_init_def_sw_recp - initialize the recipe book keeping tables * @hw: pointer to the HW struct * diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 2c212f64d99f..fd17ace6b226 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1071,13 +1071,16 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) ice_put_rx_buf(rx_ring, rx_buf); continue; construct_skb: - if (skb) + if (skb) { ice_add_rx_frag(rx_ring, rx_buf, skb, size); - else if (ice_ring_uses_build_skb(rx_ring)) - skb = ice_build_skb(rx_ring, rx_buf, &xdp); - else + } else if (likely(xdp.data)) { + if (ice_ring_uses_build_skb(rx_ring)) + skb = ice_build_skb(rx_ring, rx_buf, &xdp); + else + skb = ice_construct_skb(rx_ring, rx_buf, &xdp); + } else { skb = ice_construct_skb(rx_ring, rx_buf, &xdp); - + } /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_buf_failed++; @@ -1925,6 +1928,7 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) } ip; union { struct tcphdr *tcp; + struct udphdr *udp; unsigned char *hdr; } l4; u64 cd_mss, cd_tso_len; @@ -1958,10 +1962,18 @@ int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) /* remove payload length from checksum */ paylen = skb->len - l4_start; - csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); - /* compute length of segmentation header */ - off->header_len = (l4.tcp->doff * 4) + l4_start; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { + csum_replace_by_diff(&l4.udp->check, + (__force __wsum)htonl(paylen)); + /* compute length of UDP segmentation header */ + off->header_len = sizeof(l4.udp) + l4_start; + } else { + csum_replace_by_diff(&l4.tcp->check, + (__force __wsum)htonl(paylen)); + /* compute length of TCP segmentation header */ + off->header_len = (l4.tcp->doff * 4) + l4_start; + } /* update gso_segs and bytecount */ first->gso_segs = skb_shinfo(skb)->gso_segs; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index a84cc0e6dd27..a86270696df1 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -341,6 +341,12 @@ struct ice_ring_container { u16 itr_setting; }; +struct ice_coalesce_stored { + u16 itr_tx; + u16 itr_rx; + u8 intrl; +}; + /* iterator for handling rings in ring container */ #define ice_for_each_ring(pos, head) \ for (pos = (head).ring; pos; pos = pos->next) diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index c4854a987130..b361ffabb0ca 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -13,6 +13,7 @@ #include "ice_controlq.h" #include "ice_lan_tx_rx.h" #include "ice_flex_type.h" +#include "ice_protocol_type.h" static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc) { @@ -41,6 +42,7 @@ static inline u32 ice_round_to_num(u32 N, u32 R) #define ICE_DBG_QCTX BIT_ULL(6) #define ICE_DBG_NVM BIT_ULL(7) #define ICE_DBG_LAN BIT_ULL(8) +#define ICE_DBG_FLOW BIT_ULL(9) #define ICE_DBG_SW BIT_ULL(13) #define ICE_DBG_SCHED BIT_ULL(14) #define ICE_DBG_PKG BIT_ULL(16) @@ -559,6 +561,10 @@ struct ice_hw { /* HW block tables */ struct ice_blk_info blk[ICE_BLK_COUNT]; + struct mutex fl_profs_locks[ICE_BLK_COUNT]; /* lock fltr profiles */ + struct list_head fl_profs[ICE_BLK_COUNT]; + struct mutex rss_locks; /* protect RSS configuration */ + struct list_head rss_list_head; }; /* Statistics collected by each port, VSI, VEB, and S-channel */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index edb374296d1f..82b1e7a4cb92 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -35,37 +35,6 @@ static int ice_check_vf_init(struct ice_pf *pf, struct ice_vf *vf) } /** - * ice_err_to_virt err - translate errors for VF return code - * @ice_err: error return code - */ -static enum virtchnl_status_code ice_err_to_virt_err(enum ice_status ice_err) -{ - switch (ice_err) { - case ICE_SUCCESS: - return VIRTCHNL_STATUS_SUCCESS; - case ICE_ERR_BAD_PTR: - case ICE_ERR_INVAL_SIZE: - case ICE_ERR_DEVICE_NOT_SUPPORTED: - case ICE_ERR_PARAM: - case ICE_ERR_CFG: - return VIRTCHNL_STATUS_ERR_PARAM; - case ICE_ERR_NO_MEMORY: - return VIRTCHNL_STATUS_ERR_NO_MEMORY; - case ICE_ERR_NOT_READY: - case ICE_ERR_RESET_FAILED: - case ICE_ERR_FW_API_VER: - case ICE_ERR_AQ_ERROR: - case ICE_ERR_AQ_TIMEOUT: - case ICE_ERR_AQ_FULL: - case ICE_ERR_AQ_NO_WORK: - case ICE_ERR_AQ_EMPTY: - return VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; - default: - return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; - } -} - -/** * ice_vc_vf_broadcast - Broadcast a message to all VFs on PF * @pf: pointer to the PF structure * @v_opcode: operation code @@ -78,10 +47,11 @@ ice_vc_vf_broadcast(struct ice_pf *pf, enum virtchnl_ops v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { struct ice_hw *hw = &pf->hw; - struct ice_vf *vf = pf->vf; int i; - for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + /* Not all vfs are enabled so skip the ones that are not */ if (!test_bit(ICE_VF_STATE_INIT, vf->vf_states) && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) @@ -121,26 +91,6 @@ ice_set_pfe_link(struct ice_vf *vf, struct virtchnl_pf_event *pfe, } /** - * ice_set_pfe_link_forced - Force the virtchnl_pf_event link speed/status - * @vf: pointer to the VF structure - * @pfe: pointer to the virtchnl_pf_event to set link speed/status for - * @link_up: whether or not to set the link up/down - */ -static void -ice_set_pfe_link_forced(struct ice_vf *vf, struct virtchnl_pf_event *pfe, - bool link_up) -{ - u16 link_speed; - - if (link_up) - link_speed = ICE_AQ_LINK_SPEED_100GB; - else - link_speed = ICE_AQ_LINK_SPEED_UNKNOWN; - - ice_set_pfe_link(vf, pfe, link_speed, link_up); -} - -/** * ice_vc_notify_vf_link_state - Inform a VF of link status * @vf: pointer to the VF structure * @@ -160,13 +110,17 @@ static void ice_vc_notify_vf_link_state(struct ice_vf *vf) pfe.severity = PF_EVENT_SEVERITY_INFO; /* Always report link is down if the VF queues aren't enabled */ - if (!vf->num_qs_ena) + if (!vf->num_qs_ena) { ice_set_pfe_link(vf, &pfe, ICE_AQ_LINK_SPEED_UNKNOWN, false); - else if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); - else - ice_set_pfe_link(vf, &pfe, ls->link_speed, ls->link_info & - ICE_AQ_LINK_UP); + } else if (vf->link_forced) { + u16 link_speed = vf->link_up ? + ls->link_speed : ICE_AQ_LINK_SPEED_UNKNOWN; + + ice_set_pfe_link(vf, &pfe, link_speed, vf->link_up); + } else { + ice_set_pfe_link(vf, &pfe, ls->link_speed, + ls->link_info & ICE_AQ_LINK_UP); + } ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, @@ -331,7 +285,7 @@ void ice_free_vfs(struct ice_pf *pf) usleep_range(1000, 2000); /* Avoid wait time by stopping all VFs at the same time */ - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) if (test_bit(ICE_VF_STATE_QS_ENA, pf->vf[i].vf_states)) ice_dis_vf_qs(&pf->vf[i]); @@ -991,10 +945,17 @@ static void ice_cleanup_and_realloc_vf(struct ice_vf *vf) /* reallocate VF resources to finish resetting the VSI state */ if (!ice_alloc_vf_res(vf)) { + struct ice_vsi *vsi; + ice_ena_vf_mappings(vf); set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); clear_bit(ICE_VF_STATE_DIS, vf->vf_states); - vf->num_vlan = 0; + + vsi = pf->vsi[vf->lan_vsi_idx]; + if (ice_vsi_add_vlan(vsi, 0)) + dev_warn(ice_pf_to_dev(pf), + "Failed to add VLAN 0 filter for VF %d, MDD events will trigger. Reset the VF, disable spoofchk, or enable 8021q module on the guest", + vf->vf_id); } /* Tell the VF driver the reset is done. This needs to be done only @@ -1023,7 +984,7 @@ ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m, struct ice_hw *hw; hw = &pf->hw; - if (vf->num_vlan) { + if (vsi->num_vlan) { status = ice_set_vlan_vsi_promisc(hw, vsi->idx, promisc_m, rm_promisc); } else if (vf->port_vlan_id) { @@ -1070,7 +1031,7 @@ static bool ice_config_res_vfs(struct ice_pf *pf) ice_irq_dynamic_ena(hw, NULL, NULL); /* Finish resetting each VF and allocate resources */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + ice_for_each_vf(pf, v) { struct ice_vf *vf = &pf->vf[v]; vf->num_vf_qs = pf->num_vf_qps; @@ -1113,10 +1074,10 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) return false; /* Begin reset on all VFs at once */ - for (v = 0; v < pf->num_alloc_vfs; v++) + ice_for_each_vf(pf, v) ice_trigger_vf_reset(&pf->vf[v], is_vflr, true); - for (v = 0; v < pf->num_alloc_vfs; v++) { + ice_for_each_vf(pf, v) { struct ice_vsi *vsi; vf = &pf->vf[v]; @@ -1161,7 +1122,7 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) dev_warn(dev, "VF reset check timeout\n"); /* free VF resources to begin resetting the VSI state */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + ice_for_each_vf(pf, v) { vf = &pf->vf[v]; ice_free_vf_res(vf); @@ -1273,7 +1234,7 @@ static bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) */ if (test_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states) || test_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states)) { - if (vf->port_vlan_id || vf->num_vlan) + if (vf->port_vlan_id || vsi->num_vlan) promisc_m = ICE_UCAST_VLAN_PROMISC_BITS; else promisc_m = ICE_UCAST_PROMISC_BITS; @@ -1301,7 +1262,7 @@ void ice_vc_notify_link_state(struct ice_pf *pf) { int i; - for (i = 0; i < pf->num_alloc_vfs; i++) + ice_for_each_vf(pf, i) ice_vc_notify_vf_link_state(&pf->vf[i]); } @@ -1385,9 +1346,10 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) goto err_pci_disable_sriov; } pf->vf = vfs; + pf->num_alloc_vfs = num_alloc_vfs; /* apply default profile */ - for (i = 0; i < num_alloc_vfs; i++) { + ice_for_each_vf(pf, i) { vfs[i].pf = pf; vfs[i].vf_sw_id = pf->first_sw; vfs[i].vf_id = i; @@ -1396,7 +1358,6 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); vfs[i].spoofchk = true; } - pf->num_alloc_vfs = num_alloc_vfs; /* VF resources get allocated with initialization */ if (!ice_config_res_vfs(pf)) { @@ -1535,7 +1496,7 @@ void ice_process_vflr_event(struct ice_pf *pf) !pf->num_alloc_vfs) return; - for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) { + ice_for_each_vf(pf, vf_id) { struct ice_vf *vf = &pf->vf[vf_id]; u32 reg_idx, bit_idx; @@ -1918,6 +1879,89 @@ error_param: } /** + * ice_set_vf_spoofchk + * @netdev: network interface device structure + * @vf_id: VF identifier + * @ena: flag to enable or disable feature + * + * Enable or disable VF spoof checking + */ +int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) +{ + struct ice_netdev_priv *np = netdev_priv(netdev); + struct ice_pf *pf = np->vsi->back; + struct ice_vsi_ctx *ctx; + struct ice_vsi *vf_vsi; + enum ice_status status; + struct device *dev; + struct ice_vf *vf; + int ret = 0; + + dev = ice_pf_to_dev(pf); + if (ice_validate_vf_id(pf, vf_id)) + return -EINVAL; + + vf = &pf->vf[vf_id]; + + if (ice_check_vf_init(pf, vf)) + return -EBUSY; + + vf_vsi = pf->vsi[vf->lan_vsi_idx]; + if (!vf_vsi) { + netdev_err(netdev, "VSI %d for VF %d is null\n", + vf->lan_vsi_idx, vf->vf_id); + return -EINVAL; + } + + if (vf_vsi->type != ICE_VSI_VF) { + netdev_err(netdev, + "Type %d of VSI %d for VF %d is no ICE_VSI_VF\n", + vf_vsi->type, vf_vsi->vsi_num, vf->vf_id); + return -ENODEV; + } + + if (ena == vf->spoofchk) { + dev_dbg(dev, "VF spoofchk already %s\n", ena ? "ON" : "OFF"); + return 0; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->info.sec_flags = vf_vsi->info.sec_flags; + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); + if (ena) { + ctx->info.sec_flags |= + ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S); + } else { + ctx->info.sec_flags &= + ~(ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF | + (ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA << + ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S)); + } + + status = ice_update_vsi(&pf->hw, vf_vsi->idx, ctx, NULL); + if (status) { + dev_err(dev, + "Failed to %sable spoofchk on VF %d VSI %d\n error %d", + ena ? "en" : "dis", vf->vf_id, vf_vsi->vsi_num, status); + ret = -EIO; + goto out; + } + + /* only update spoofchk state and VSI context on success */ + vf_vsi->info.sec_flags = ctx->info.sec_flags; + vf->spoofchk = ena; + +out: + kfree(ctx); + return ret; +} + +/** * ice_vc_get_stats_msg * @vf: pointer to the VF info * @msg: pointer to the msg buffer @@ -2409,6 +2453,83 @@ static bool ice_can_vf_change_mac(struct ice_vf *vf) } /** + * ice_vc_add_mac_addr - attempt to add the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @mac_addr: MAC address to add + */ +static int +ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, u8 *mac_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status; + + /* default unicast MAC already added */ + if (ether_addr_equal(mac_addr, vf->dflt_lan_addr.addr)) + return 0; + + if (is_unicast_ether_addr(mac_addr) && !ice_can_vf_change_mac(vf)) { + dev_err(dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + return -EPERM; + } + + status = ice_vsi_cfg_mac_fltr(vsi, mac_addr, true); + if (status == ICE_ERR_ALREADY_EXISTS) { + dev_err(dev, "MAC %pM already exists for VF %d\n", mac_addr, + vf->vf_id); + return -EEXIST; + } else if (status) { + dev_err(dev, "Failed to add MAC %pM for VF %d\n, error %d\n", + mac_addr, vf->vf_id, status); + return -EIO; + } + + /* only set dflt_lan_addr once */ + if (is_zero_ether_addr(vf->dflt_lan_addr.addr) && + is_unicast_ether_addr(mac_addr)) + ether_addr_copy(vf->dflt_lan_addr.addr, mac_addr); + + vf->num_mac++; + + return 0; +} + +/** + * ice_vc_del_mac_addr - attempt to delete the MAC address passed in + * @vf: pointer to the VF info + * @vsi: pointer to the VF's VSI + * @mac_addr: MAC address to delete + */ +static int +ice_vc_del_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, u8 *mac_addr) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status; + + if (!ice_can_vf_change_mac(vf) && + ether_addr_equal(mac_addr, vf->dflt_lan_addr.addr)) + return 0; + + status = ice_vsi_cfg_mac_fltr(vsi, mac_addr, false); + if (status == ICE_ERR_DOES_NOT_EXIST) { + dev_err(dev, "MAC %pM does not exist for VF %d\n", mac_addr, + vf->vf_id); + return -ENOENT; + } else if (status) { + dev_err(dev, "Failed to delete MAC %pM for VF %d, error %d\n", + mac_addr, vf->vf_id, status); + return -EIO; + } + + if (ether_addr_equal(mac_addr, vf->dflt_lan_addr.addr)) + eth_zero_addr(vf->dflt_lan_addr.addr); + + vf->num_mac--; + + return 0; +} + +/** * ice_vc_handle_mac_addr_msg * @vf: pointer to the VF info * @msg: pointer to the msg buffer @@ -2419,23 +2540,23 @@ static bool ice_can_vf_change_mac(struct ice_vf *vf) static int ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) { + int (*ice_vc_cfg_mac) + (struct ice_vf *vf, struct ice_vsi *vsi, u8 *mac_addr); enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; struct virtchnl_ether_addr_list *al = (struct virtchnl_ether_addr_list *)msg; struct ice_pf *pf = vf->pf; enum virtchnl_ops vc_op; - enum ice_status status; struct ice_vsi *vsi; - struct device *dev; - int mac_count = 0; int i; - dev = ice_pf_to_dev(pf); - - if (set) + if (set) { vc_op = VIRTCHNL_OP_ADD_ETH_ADDR; - else + ice_vc_cfg_mac = ice_vc_add_mac_addr; + } else { vc_op = VIRTCHNL_OP_DEL_ETH_ADDR; + ice_vc_cfg_mac = ice_vc_del_mac_addr; + } if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states) || !ice_vc_isvalid_vsi_id(vf, al->vsi_id)) { @@ -2443,14 +2564,15 @@ ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) goto handle_mac_exit; } + /* If this VF is not privileged, then we can't add more than a + * limited number of addresses. Check to make sure that the + * additions do not push us over the limit. + */ if (set && !ice_is_vf_trusted(vf) && (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { - dev_err(dev, + dev_err(ice_pf_to_dev(pf), "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", vf->vf_id); - /* There is no need to let VF know about not being trusted - * to add more MAC addr, so we can just return success message. - */ v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto handle_mac_exit; } @@ -2462,70 +2584,22 @@ ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) } for (i = 0; i < al->num_elements; i++) { - u8 *maddr = al->list[i].addr; + u8 *mac_addr = al->list[i].addr; + int result; - if (ether_addr_equal(maddr, vf->dflt_lan_addr.addr) || - is_broadcast_ether_addr(maddr)) { - if (set) { - /* VF is trying to add filters that the PF - * already added. Just continue. - */ - dev_info(dev, - "MAC %pM already set for VF %d\n", - maddr, vf->vf_id); - continue; - } else { - /* VF can't remove dflt_lan_addr/bcast MAC */ - dev_err(dev, - "VF can't remove default MAC address or MAC %pM programmed by PF for VF %d\n", - maddr, vf->vf_id); - continue; - } - } - - /* check for the invalid cases and bail if necessary */ - if (is_zero_ether_addr(maddr)) { - dev_err(dev, - "invalid MAC %pM provided for VF %d\n", - maddr, vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } - - if (is_unicast_ether_addr(maddr) && - !ice_can_vf_change_mac(vf)) { - dev_err(dev, - "can't change unicast MAC for untrusted VF %d\n", - vf->vf_id); - v_ret = VIRTCHNL_STATUS_ERR_PARAM; - goto handle_mac_exit; - } + if (is_broadcast_ether_addr(mac_addr) || + is_zero_ether_addr(mac_addr)) + continue; - /* program the updated filter list */ - status = ice_vsi_cfg_mac_fltr(vsi, maddr, set); - if (status == ICE_ERR_DOES_NOT_EXIST || - status == ICE_ERR_ALREADY_EXISTS) { - dev_info(dev, - "can't %s MAC filters %pM for VF %d, error %d\n", - set ? "add" : "remove", maddr, vf->vf_id, - status); - } else if (status) { - dev_err(dev, - "can't %s MAC filters for VF %d, error %d\n", - set ? "add" : "remove", vf->vf_id, status); - v_ret = ice_err_to_virt_err(status); + result = ice_vc_cfg_mac(vf, vsi, mac_addr); + if (result == -EEXIST || result == -ENOENT) { + continue; + } else if (result) { + v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR; goto handle_mac_exit; } - - mac_count++; } - /* Track number of MAC filters programmed for the VF VSI */ - if (set) - vf->num_mac += mac_count; - else - vf->num_mac -= mac_count; - handle_mac_exit: /* send the response to the VF */ return ice_vc_send_msg_to_vf(vf, vc_op, v_ret, NULL, 0); @@ -2744,17 +2818,6 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) goto error_param; } - if (add_v && !ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", - vf->vf_id); - /* There is no need to let VF know about being not trusted, - * so we can just return success message here - */ - goto error_param; - } - for (i = 0; i < vfl->num_elements; i++) { if (vfl->vlan_id[i] > ICE_MAX_VLANID) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; @@ -2771,6 +2834,17 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) goto error_param; } + if (add_v && !ice_is_vf_trusted(vf) && + vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) { + dev_info(dev, + "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + vf->vf_id); + /* There is no need to let VF know about being not trusted, + * so we can just return success message here + */ + goto error_param; + } + if (vsi->info.pvid) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -2785,7 +2859,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) u16 vid = vfl->vlan_id[i]; if (!ice_is_vf_trusted(vf) && - vf->num_vlan >= ICE_MAX_VLAN_PER_VF) { + vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) { dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", vf->vf_id); @@ -2796,12 +2870,20 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) goto error_param; } - if (ice_vsi_add_vlan(vsi, vid)) { + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't need to add it again here + */ + if (!vid) + continue; + + status = ice_vsi_add_vlan(vsi, vid); + if (status) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vf->num_vlan++; + vsi->num_vlan++; /* Enable VLAN pruning when VLAN is added */ if (!vlan_promisc) { status = ice_cfg_vlan_pruning(vsi, true, false); @@ -2837,21 +2919,29 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) */ int num_vf_vlan; - num_vf_vlan = vf->num_vlan; + num_vf_vlan = vsi->num_vlan; for (i = 0; i < vfl->num_elements && i < num_vf_vlan; i++) { u16 vid = vfl->vlan_id[i]; + /* we add VLAN 0 by default for each VF so we can enable + * Tx VLAN anti-spoof without triggering MDD events so + * we don't want a VIRTCHNL request to remove it + */ + if (!vid) + continue; + /* Make sure ice_vsi_kill_vlan is successful before * updating VLAN information */ - if (ice_vsi_kill_vlan(vsi, vid)) { + status = ice_vsi_kill_vlan(vsi, vid); + if (status) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - vf->num_vlan--; + vsi->num_vlan--; /* Disable VLAN pruning when the last VLAN is removed */ - if (!vf->num_vlan) + if (!vsi->num_vlan) ice_cfg_vlan_pruning(vsi, false, false); /* Disable Unicast/Multicast VLAN promiscuous mode */ @@ -3165,65 +3255,6 @@ ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi) } /** - * ice_set_vf_spoofchk - * @netdev: network interface device structure - * @vf_id: VF identifier - * @ena: flag to enable or disable feature - * - * Enable or disable VF spoof checking - */ -int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) -{ - struct ice_pf *pf = ice_netdev_to_pf(netdev); - struct ice_vsi *vsi = pf->vsi[0]; - struct ice_vsi_ctx *ctx; - enum ice_status status; - struct device *dev; - struct ice_vf *vf; - int ret = 0; - - dev = ice_pf_to_dev(pf); - if (ice_validate_vf_id(pf, vf_id)) - return -EINVAL; - - vf = &pf->vf[vf_id]; - if (ice_check_vf_init(pf, vf)) - return -EBUSY; - - if (ena == vf->spoofchk) { - dev_dbg(dev, "VF spoofchk already %s\n", - ena ? "ON" : "OFF"); - return 0; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SECURITY_VALID); - - if (ena) { - ctx->info.sec_flags |= ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF; - ctx->info.sw_flags2 |= ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M; - } - - status = ice_update_vsi(&pf->hw, vsi->idx, ctx, NULL); - if (status) { - dev_dbg(dev, - "Error %d, failed to update VSI* parameters\n", status); - ret = -EIO; - goto out; - } - - vf->spoofchk = ena; - vsi->info.sec_flags = ctx->info.sec_flags; - vsi->info.sw_flags2 = ctx->info.sw_flags2; -out: - kfree(ctx); - return ret; -} - -/** * ice_wait_on_vf_reset * @vf: The VF being resseting * @@ -3344,28 +3375,18 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) { struct ice_pf *pf = ice_netdev_to_pf(netdev); - struct virtchnl_pf_event pfe = { 0 }; - struct ice_link_status *ls; struct ice_vf *vf; - struct ice_hw *hw; if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; vf = &pf->vf[vf_id]; - hw = &pf->hw; - ls = &pf->hw.port_info->phy.link_info; - if (ice_check_vf_init(pf, vf)) return -EBUSY; - pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; - pfe.severity = PF_EVENT_SEVERITY_INFO; - switch (link_state) { case IFLA_VF_LINK_STATE_AUTO: vf->link_forced = false; - vf->link_up = ls->link_info & ICE_AQ_LINK_UP; break; case IFLA_VF_LINK_STATE_ENABLE: vf->link_forced = true; @@ -3379,15 +3400,7 @@ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) return -EINVAL; } - if (vf->link_forced) - ice_set_pfe_link_forced(vf, &pfe, vf->link_up); - else - ice_set_pfe_link(vf, &pfe, ls->link_speed, vf->link_up); - - /* Notify the VF of its new link state */ - ice_aq_send_msg_to_vf(hw, vf->vf_id, VIRTCHNL_OP_EVENT, - VIRTCHNL_STATUS_SUCCESS, (u8 *)&pfe, - sizeof(pfe), NULL); + ice_vc_notify_vf_link_state(vf); return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 88aa65d5cb31..4647d636ed36 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -40,6 +40,9 @@ #define ICE_DFLT_INTR_PER_VF (ICE_DFLT_QS_PER_VF + 1) #define ICE_MAX_VF_RESET_WAIT 15 +#define ice_for_each_vf(pf, i) \ + for ((i) = 0; (i) < (pf)->num_alloc_vfs; (i)++) + /* Specific VF states */ enum ice_vf_states { ICE_VF_STATE_INIT = 0, /* PF is initializing VF */ @@ -91,7 +94,6 @@ struct ice_vf { unsigned long vf_caps; /* VF's adv. capabilities */ u8 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; - u16 num_vlan; u16 num_vf_qs; /* num of queue configured per VF */ u16 num_qs_ena; /* total num of Tx/Rx queue enabled */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 0af1f0b951c0..149dca0012ba 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -414,7 +414,8 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) if (vsi->type != ICE_VSI_PF) return -EINVAL; - vsi->num_xsk_umems = min_t(u16, vsi->num_rxq, vsi->num_txq); + if (!vsi->num_xsk_umems) + vsi->num_xsk_umems = min_t(u16, vsi->num_rxq, vsi->num_txq); if (qid >= vsi->num_xsk_umems) return -EINVAL; @@ -1019,8 +1020,8 @@ bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget) s16 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; struct ice_tx_buf *tx_buf; - bool xmit_done = true; u32 xsk_frames = 0; + bool xmit_done; tx_desc = ICE_TX_DESC(xdp_ring, ntc); tx_buf = &xdp_ring->tx_buf[ntc]; diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index 8a6ef3514129..438b42ce2cd9 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -530,7 +530,7 @@ static s32 igb_set_sfp_media_type_82575(struct e1000_hw *hw) dev_spec->module_plugged = true; if (eth_flags->e1000_base_lx || eth_flags->e1000_base_sx) { hw->phy.media_type = e1000_media_type_internal_serdes; - } else if (eth_flags->e100_base_fx) { + } else if (eth_flags->e100_base_fx || eth_flags->e100_base_lx) { dev_spec->sgmii_active = true; hw->phy.media_type = e1000_media_type_internal_serdes; } else if (eth_flags->e1000_base_t) { @@ -657,14 +657,10 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw) break; } - /* do not change link mode for 100BaseFX */ - if (dev_spec->eth_flags.e100_base_fx) - break; - /* change current link mode setting */ ctrl_ext &= ~E1000_CTRL_EXT_LINK_MODE_MASK; - if (hw->phy.media_type == e1000_media_type_copper) + if (dev_spec->sgmii_active) ctrl_ext |= E1000_CTRL_EXT_LINK_MODE_SGMII; else ctrl_ext |= E1000_CTRL_EXT_LINK_MODE_PCIE_SERDES; diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 43c438365389..f96ffa83efbe 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -181,7 +181,7 @@ static int igb_get_link_ksettings(struct net_device *netdev, advertising &= ~ADVERTISED_1000baseKX_Full; } } - if (eth_flags->e100_base_fx) { + if (eth_flags->e100_base_fx || eth_flags->e100_base_lx) { supported |= SUPPORTED_100baseT_Full; advertising |= ADVERTISED_100baseT_Full; } diff --git a/drivers/net/ethernet/intel/igc/Makefile b/drivers/net/ethernet/intel/igc/Makefile index 88c6f88baac5..49fb1e1965cd 100644 --- a/drivers/net/ethernet/intel/igc/Makefile +++ b/drivers/net/ethernet/intel/igc/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_IGC) += igc.o igc-objs := igc_main.o igc_mac.o igc_i225.o igc_base.o igc_nvm.o igc_phy.o \ -igc_ethtool.o +igc_ethtool.o igc_ptp.o diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 612fe9ec81a4..52066bdbbad0 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -10,6 +10,9 @@ #include <linux/vmalloc.h> #include <linux/ethtool.h> #include <linux/sctp.h> +#include <linux/ptp_clock_kernel.h> +#include <linux/timecounter.h> +#include <linux/net_tstamp.h> #include "igc_hw.h" @@ -45,11 +48,15 @@ extern char igc_driver_version[]; #define IGC_REGS_LEN 740 #define IGC_RETA_SIZE 128 +/* flags controlling PTP/1588 function */ +#define IGC_PTP_ENABLED BIT(0) + /* Interrupt defines */ #define IGC_START_ITR 648 /* ~6000 ints/sec */ #define IGC_FLAG_HAS_MSI BIT(0) #define IGC_FLAG_QUEUE_PAIRS BIT(3) #define IGC_FLAG_DMAC BIT(4) +#define IGC_FLAG_PTP BIT(8) #define IGC_FLAG_NEED_LINK_UPDATE BIT(9) #define IGC_FLAG_MEDIA_RESET BIT(10) #define IGC_FLAG_MAS_ENABLE BIT(12) @@ -100,6 +107,20 @@ extern char igc_driver_version[]; #define AUTO_ALL_MODES 0 #define IGC_RX_HDR_LEN IGC_RXBUFFER_256 +/* Transmit and receive latency (for PTP timestamps) */ +/* FIXME: These values were estimated using the ones that i210 has as + * basis, they seem to provide good numbers with ptp4l/phc2sys, but we + * need to confirm them. + */ +#define IGC_I225_TX_LATENCY_10 9542 +#define IGC_I225_TX_LATENCY_100 1024 +#define IGC_I225_TX_LATENCY_1000 178 +#define IGC_I225_TX_LATENCY_2500 64 +#define IGC_I225_RX_LATENCY_10 20662 +#define IGC_I225_RX_LATENCY_100 2213 +#define IGC_I225_RX_LATENCY_1000 448 +#define IGC_I225_RX_LATENCY_2500 160 + /* RX and TX descriptor control thresholds. * PTHRESH - MAC will consider prefetch if it has fewer than this number of * descriptors available in its onboard memory. @@ -432,6 +453,20 @@ struct igc_adapter { unsigned long link_check_timeout; struct igc_info ei; + + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_caps; + struct work_struct ptp_tx_work; + struct sk_buff *ptp_tx_skb; + struct hwtstamp_config tstamp_config; + unsigned long ptp_tx_start; + unsigned long last_rx_ptp_check; + unsigned long last_rx_timestamp; + unsigned int ptp_flags; + /* System time value lock */ + spinlock_t tmreg_lock; + struct cyclecounter cc; + struct timecounter tc; }; /* igc_desc_unused - calculate if we have unused descriptors */ @@ -515,6 +550,16 @@ int igc_add_filter(struct igc_adapter *adapter, int igc_erase_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input); +void igc_ptp_init(struct igc_adapter *adapter); +void igc_ptp_reset(struct igc_adapter *adapter); +void igc_ptp_stop(struct igc_adapter *adapter); +void igc_ptp_rx_rgtstamp(struct igc_q_vector *q_vector, struct sk_buff *skb); +void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va, + struct sk_buff *skb); +int igc_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr); +int igc_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr); +void igc_ptp_tx_hang(struct igc_adapter *adapter); + #define igc_rx_pg_size(_ring) (PAGE_SIZE << igc_rx_pg_order(_ring)) #define IGC_TXD_DCMD (IGC_ADVTXD_DCMD_EOP | IGC_ADVTXD_DCMD_RS) diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c index db289bcce21d..5a506440560a 100644 --- a/drivers/net/ethernet/intel/igc/igc_base.c +++ b/drivers/net/ethernet/intel/igc/igc_base.c @@ -212,6 +212,7 @@ static s32 igc_get_invariants_base(struct igc_hw *hw) case IGC_DEV_ID_I225_I: case IGC_DEV_ID_I220_V: case IGC_DEV_ID_I225_K: + case IGC_DEV_ID_I225_BLANK_NVM: mac->type = igc_i225; break; default: diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 50dffd5db606..58efa7a02c68 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -218,6 +218,7 @@ #define IGC_ICR_RXDMT0 BIT(4) /* Rx desc min. threshold (0) */ #define IGC_ICR_RXO BIT(6) /* Rx overrun */ #define IGC_ICR_RXT0 BIT(7) /* Rx timer intr (ring 0) */ +#define IGC_ICR_TS BIT(19) /* Time Sync Interrupt */ #define IGC_ICR_DRSTA BIT(30) /* Device Reset Asserted */ /* If this bit asserted, the driver should claim the interrupt */ @@ -240,6 +241,7 @@ #define IGC_IMS_DRSTA IGC_ICR_DRSTA /* Device Reset Asserted */ #define IGC_IMS_RXT0 IGC_ICR_RXT0 /* Rx timer intr */ #define IGC_IMS_RXDMT0 IGC_ICR_RXDMT0 /* Rx desc min. threshold */ +#define IGC_IMS_TS IGC_ICR_TS /* Time Sync Interrupt */ #define IGC_QVECTOR_MASK 0x7FFC /* Q-vector mask */ #define IGC_ITR_VAL_MASK 0x04 /* ITR value mask */ @@ -280,6 +282,10 @@ #define IGC_TXD_STAT_TC 0x00000004 /* Tx Underrun */ #define IGC_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */ +/* IPSec Encrypt Enable */ +#define IGC_ADVTXD_L4LEN_SHIFT 8 /* Adv ctxt L4LEN shift */ +#define IGC_ADVTXD_MSS_SHIFT 16 /* Adv ctxt MSS shift */ + /* Transmit Control */ #define IGC_TCTL_EN 0x00000002 /* enable Tx */ #define IGC_TCTL_PSP 0x00000008 /* pad short packets */ @@ -312,12 +318,21 @@ #define IGC_RCTL_RDMTS_HALF 0x00000000 /* Rx desc min thresh size */ #define IGC_RCTL_BAM 0x00008000 /* broadcast enable */ +/* Split Replication Receive Control */ +#define IGC_SRRCTL_TIMESTAMP 0x40000000 +#define IGC_SRRCTL_TIMER1SEL(timer) (((timer) & 0x3) << 14) +#define IGC_SRRCTL_TIMER0SEL(timer) (((timer) & 0x3) << 17) + /* Receive Descriptor bit definitions */ #define IGC_RXD_STAT_EOP 0x02 /* End of Packet */ #define IGC_RXD_STAT_IXSM 0x04 /* Ignore checksum */ #define IGC_RXD_STAT_UDPCS 0x10 /* UDP xsum calculated */ #define IGC_RXD_STAT_TCPCS 0x20 /* TCP xsum calculated */ +/* Advanced Receive Descriptor bit definitions */ +#define IGC_RXDADV_STAT_TSIP 0x08000 /* timestamp in packet */ +#define IGC_RXDADV_STAT_TS 0x10000 /* Pkt was time stamped */ + #define IGC_RXDEXT_STATERR_CE 0x01000000 #define IGC_RXDEXT_STATERR_SE 0x02000000 #define IGC_RXDEXT_STATERR_SEQ 0x04000000 @@ -354,6 +369,61 @@ #define I225_RXPBSIZE_DEFAULT 0x000000A2 /* RXPBSIZE default */ #define I225_TXPBSIZE_DEFAULT 0x04000014 /* TXPBSIZE default */ +#define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ + +/* Time Sync Interrupt Causes */ +#define IGC_TSICR_SYS_WRAP BIT(0) /* SYSTIM Wrap around. */ +#define IGC_TSICR_TXTS BIT(1) /* Transmit Timestamp. */ +#define IGC_TSICR_TT0 BIT(3) /* Target Time 0 Trigger. */ +#define IGC_TSICR_TT1 BIT(4) /* Target Time 1 Trigger. */ +#define IGC_TSICR_AUTT0 BIT(5) /* Auxiliary Timestamp 0 Taken. */ +#define IGC_TSICR_AUTT1 BIT(6) /* Auxiliary Timestamp 1 Taken. */ + +#define IGC_TSICR_INTERRUPTS IGC_TSICR_TXTS + +/* PTP Queue Filter */ +#define IGC_ETQF_1588 BIT(30) + +#define IGC_FTQF_VF_BP 0x00008000 +#define IGC_FTQF_1588_TIME_STAMP 0x08000000 +#define IGC_FTQF_MASK 0xF0000000 +#define IGC_FTQF_MASK_PROTO_BP 0x10000000 + +/* Time Sync Receive Control bit definitions */ +#define IGC_TSYNCRXCTL_VALID 0x00000001 /* Rx timestamp valid */ +#define IGC_TSYNCRXCTL_TYPE_MASK 0x0000000E /* Rx type mask */ +#define IGC_TSYNCRXCTL_TYPE_L2_V2 0x00 +#define IGC_TSYNCRXCTL_TYPE_L4_V1 0x02 +#define IGC_TSYNCRXCTL_TYPE_L2_L4_V2 0x04 +#define IGC_TSYNCRXCTL_TYPE_ALL 0x08 +#define IGC_TSYNCRXCTL_TYPE_EVENT_V2 0x0A +#define IGC_TSYNCRXCTL_ENABLED 0x00000010 /* enable Rx timestamping */ +#define IGC_TSYNCRXCTL_SYSCFI 0x00000020 /* Sys clock frequency */ +#define IGC_TSYNCRXCTL_RXSYNSIG 0x00000400 /* Sample RX tstamp in PHY sop */ + +/* Time Sync Receive Configuration */ +#define IGC_TSYNCRXCFG_PTP_V1_CTRLT_MASK 0x000000FF +#define IGC_TSYNCRXCFG_PTP_V1_SYNC_MESSAGE 0x00 +#define IGC_TSYNCRXCFG_PTP_V1_DELAY_REQ_MESSAGE 0x01 + +/* Immediate Interrupt Receive */ +#define IGC_IMIR_CLEAR_MASK 0xF001FFFF /* IMIR Reg Clear Mask */ +#define IGC_IMIR_PORT_BYPASS 0x20000 /* IMIR Port Bypass Bit */ +#define IGC_IMIR_PRIORITY_SHIFT 29 /* IMIR Priority Shift */ +#define IGC_IMIREXT_CLEAR_MASK 0x7FFFF /* IMIREXT Reg Clear Mask */ + +/* Immediate Interrupt Receive Extended */ +#define IGC_IMIREXT_CTRL_BP 0x00080000 /* Bypass check of ctrl bits */ +#define IGC_IMIREXT_SIZE_BP 0x00001000 /* Packet size bypass */ + +/* Time Sync Transmit Control bit definitions */ +#define IGC_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ +#define IGC_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ +#define IGC_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */ +#define IGC_TSYNCTXCTL_SYNC_COMP_ERR 0x20000000 /* sync err */ +#define IGC_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */ +#define IGC_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ +#define IGC_TSYNCTXCTL_TXSYNSIG 0x00000020 /* Sample TX tstamp in PHY sop */ /* Receive Checksum Control */ #define IGC_RXCSUM_CRCOFL 0x00000800 /* CRC32 offload enable */ @@ -394,6 +464,7 @@ /* PHY Status Register */ #define MII_SR_LINK_STATUS 0x0004 /* Link Status 1 = link */ #define MII_SR_AUTONEG_COMPLETE 0x0020 /* Auto Neg Complete */ +#define IGC_PHY_RST_COMP 0x0100 /* Internal PHY reset completion */ /* PHY 1000 MII Register/Bit Definitions */ /* PHY Registers defined by IEEE */ diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 455c1cdceb6e..ee07011e13e9 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1600,6 +1600,39 @@ static int igc_set_channels(struct net_device *netdev, return 0; } +static int igc_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct igc_adapter *adapter = netdev_priv(dev); + + if (adapter->ptp_clock) + info->phc_index = ptp_clock_index(adapter->ptp_clock); + else + info->phc_index = -1; + + switch (adapter->hw.mac.type) { + case igc_i225: + info->so_timestamping = + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = + BIT(HWTSTAMP_TX_OFF) | + BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); + info->rx_filters |= BIT(HWTSTAMP_FILTER_ALL); + + return 0; + default: + return -EOPNOTSUPP; + } +} + static u32 igc_get_priv_flags(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -1847,6 +1880,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .get_rxfh_indir_size = igc_get_rxfh_indir_size, .get_rxfh = igc_get_rxfh, .set_rxfh = igc_set_rxfh, + .get_ts_info = igc_get_ts_info, .get_channels = igc_get_channels, .set_channels = igc_set_channels, .get_priv_flags = igc_get_priv_flags, diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index 20f710645746..90ac0e0144d8 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -21,8 +21,7 @@ #define IGC_DEV_ID_I225_I 0x15F8 #define IGC_DEV_ID_I220_V 0x15F7 #define IGC_DEV_ID_I225_K 0x3100 - -#define IGC_FUNC_0 0 +#define IGC_DEV_ID_I225_BLANK_NVM 0x15FD /* Function pointers for the MAC. */ struct igc_mac_operations { diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 25989c087aca..d9d5425fe8d9 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -45,31 +45,13 @@ static const struct pci_device_id igc_pci_tbl[] = { { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_I), board_base }, { PCI_VDEVICE(INTEL, IGC_DEV_ID_I220_V), board_base }, { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_K), board_base }, + { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_BLANK_NVM), board_base }, /* required last entry */ {0, } }; MODULE_DEVICE_TABLE(pci, igc_pci_tbl); -/* forward declaration */ -static void igc_clean_tx_ring(struct igc_ring *tx_ring); -static int igc_sw_init(struct igc_adapter *); -static void igc_configure(struct igc_adapter *adapter); -static void igc_power_down_link(struct igc_adapter *adapter); -static void igc_set_default_mac_filter(struct igc_adapter *adapter); -static void igc_set_rx_mode(struct net_device *netdev); -static void igc_write_itr(struct igc_q_vector *q_vector); -static void igc_assign_vector(struct igc_q_vector *q_vector, int msix_vector); -static void igc_free_q_vector(struct igc_adapter *adapter, int v_idx); -static void igc_set_interrupt_capability(struct igc_adapter *adapter, - bool msix); -static void igc_free_q_vectors(struct igc_adapter *adapter); -static void igc_irq_disable(struct igc_adapter *adapter); -static void igc_irq_enable(struct igc_adapter *adapter); -static void igc_configure_msix(struct igc_adapter *adapter); -static bool igc_alloc_mapped_page(struct igc_ring *rx_ring, - struct igc_rx_buffer *bi); - enum latency_range { lowest_latency = 0, low_latency = 1, @@ -77,6 +59,16 @@ enum latency_range { latency_invalid = 255 }; +/** + * igc_power_down_link - Power down the phy/serdes link + * @adapter: address of board private structure + */ +static void igc_power_down_link(struct igc_adapter *adapter) +{ + if (adapter->hw.phy.media_type == igc_media_type_copper) + igc_power_down_phy_copper_base(&adapter->hw); +} + void igc_reset(struct igc_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -111,6 +103,9 @@ void igc_reset(struct igc_adapter *adapter) if (!netif_running(adapter->netdev)) igc_power_down_link(adapter); + /* Re-enable PTP, where applicable. */ + igc_ptp_reset(adapter); + igc_get_phy_info(hw); } @@ -129,16 +124,6 @@ static void igc_power_up_link(struct igc_adapter *adapter) } /** - * igc_power_down_link - Power down the phy link - * @adapter: address of board private structure - */ -static void igc_power_down_link(struct igc_adapter *adapter) -{ - if (adapter->hw.phy.media_type == igc_media_type_copper) - igc_power_down_phy_copper_base(&adapter->hw); -} - -/** * igc_release_hw_control - release control of the h/w to f/w * @adapter: address of board private structure * @@ -177,43 +162,6 @@ static void igc_get_hw_control(struct igc_adapter *adapter) } /** - * igc_free_tx_resources - Free Tx Resources per Queue - * @tx_ring: Tx descriptor ring for a specific queue - * - * Free all transmit software resources - */ -void igc_free_tx_resources(struct igc_ring *tx_ring) -{ - igc_clean_tx_ring(tx_ring); - - vfree(tx_ring->tx_buffer_info); - tx_ring->tx_buffer_info = NULL; - - /* if not set, then don't free */ - if (!tx_ring->desc) - return; - - dma_free_coherent(tx_ring->dev, tx_ring->size, - tx_ring->desc, tx_ring->dma); - - tx_ring->desc = NULL; -} - -/** - * igc_free_all_tx_resources - Free Tx Resources for All Queues - * @adapter: board private structure - * - * Free all transmit software resources - */ -static void igc_free_all_tx_resources(struct igc_adapter *adapter) -{ - int i; - - for (i = 0; i < adapter->num_tx_queues; i++) - igc_free_tx_resources(adapter->tx_ring[i]); -} - -/** * igc_clean_tx_ring - Free Tx Buffers * @tx_ring: ring to be cleaned */ @@ -275,6 +223,43 @@ static void igc_clean_tx_ring(struct igc_ring *tx_ring) } /** + * igc_free_tx_resources - Free Tx Resources per Queue + * @tx_ring: Tx descriptor ring for a specific queue + * + * Free all transmit software resources + */ +void igc_free_tx_resources(struct igc_ring *tx_ring) +{ + igc_clean_tx_ring(tx_ring); + + vfree(tx_ring->tx_buffer_info); + tx_ring->tx_buffer_info = NULL; + + /* if not set, then don't free */ + if (!tx_ring->desc) + return; + + dma_free_coherent(tx_ring->dev, tx_ring->size, + tx_ring->desc, tx_ring->dma); + + tx_ring->desc = NULL; +} + +/** + * igc_free_all_tx_resources - Free Tx Resources for All Queues + * @adapter: board private structure + * + * Free all transmit software resources + */ +static void igc_free_all_tx_resources(struct igc_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) + igc_free_tx_resources(adapter->tx_ring[i]); +} + +/** * igc_clean_all_tx_rings - Free Tx Buffers for all queues * @adapter: board private structure */ @@ -772,6 +757,51 @@ static void igc_setup_tctl(struct igc_adapter *adapter) } /** + * igc_rar_set_index - Sync RAL[index] and RAH[index] registers with MAC table + * @adapter: address of board private structure + * @index: Index of the RAR entry which need to be synced with MAC table + */ +static void igc_rar_set_index(struct igc_adapter *adapter, u32 index) +{ + u8 *addr = adapter->mac_table[index].addr; + struct igc_hw *hw = &adapter->hw; + u32 rar_low, rar_high; + + /* HW expects these to be in network order when they are plugged + * into the registers which are little endian. In order to guarantee + * that ordering we need to do an leXX_to_cpup here in order to be + * ready for the byteswap that occurs with writel + */ + rar_low = le32_to_cpup((__le32 *)(addr)); + rar_high = le16_to_cpup((__le16 *)(addr + 4)); + + /* Indicate to hardware the Address is Valid. */ + if (adapter->mac_table[index].state & IGC_MAC_STATE_IN_USE) { + if (is_valid_ether_addr(addr)) + rar_high |= IGC_RAH_AV; + + rar_high |= IGC_RAH_POOL_1 << + adapter->mac_table[index].queue; + } + + wr32(IGC_RAL(index), rar_low); + wrfl(); + wr32(IGC_RAH(index), rar_high); + wrfl(); +} + +/* Set default MAC address for the PF in the first RAR entry */ +static void igc_set_default_mac_filter(struct igc_adapter *adapter) +{ + struct igc_mac_addr *mac_table = &adapter->mac_table[0]; + + ether_addr_copy(mac_table->addr, adapter->hw.mac.addr); + mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; + + igc_rar_set_index(adapter, 0); +} + +/** * igc_set_mac - Change the Ethernet Address of the NIC * @netdev: network interface device structure * @p: pointer to an address structure @@ -851,7 +881,7 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, /* set bits to identify this as an advanced context descriptor */ type_tucmd |= IGC_TXD_CMD_DEXT | IGC_ADVTXD_DTYP_CTXT; - /* For 82575, context index must be unique per ring. */ + /* For i225, context index must be unique per ring. */ if (test_bit(IGC_RING_FLAG_TX_CTX_IDX, &tx_ring->flags)) mss_l4len_idx |= tx_ring->reg_idx << 4; @@ -958,6 +988,11 @@ static inline int igc_maybe_stop_tx(struct igc_ring *tx_ring, const u16 size) return __igc_maybe_stop_tx(tx_ring, size); } +#define IGC_SET_FLAG(_input, _flag, _result) \ + (((_flag) <= (_result)) ? \ + ((u32)((_input) & (_flag)) * ((_result) / (_flag))) : \ + ((u32)((_input) & (_flag)) / ((_flag) / (_result)))) + static u32 igc_tx_cmd_type(struct sk_buff *skb, u32 tx_flags) { /* set type for advanced descriptor with frame checksum insertion */ @@ -965,6 +1000,14 @@ static u32 igc_tx_cmd_type(struct sk_buff *skb, u32 tx_flags) IGC_ADVTXD_DCMD_DEXT | IGC_ADVTXD_DCMD_IFCS; + /* set segmentation bits for TSO */ + cmd_type |= IGC_SET_FLAG(tx_flags, IGC_TX_FLAGS_TSO, + (IGC_ADVTXD_DCMD_TSE)); + + /* set timestamp bit if present */ + cmd_type |= IGC_SET_FLAG(tx_flags, IGC_TX_FLAGS_TSTAMP, + (IGC_ADVTXD_MAC_TSTAMP)); + return cmd_type; } @@ -1132,6 +1175,100 @@ dma_error: return -1; } +static int igc_tso(struct igc_ring *tx_ring, + struct igc_tx_buffer *first, + u8 *hdr_len) +{ + u32 vlan_macip_lens, type_tucmd, mss_l4len_idx; + struct sk_buff *skb = first->skb; + union { + struct iphdr *v4; + struct ipv6hdr *v6; + unsigned char *hdr; + } ip; + union { + struct tcphdr *tcp; + struct udphdr *udp; + unsigned char *hdr; + } l4; + u32 paylen, l4_offset; + int err; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + return 0; + + if (!skb_is_gso(skb)) + return 0; + + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + ip.hdr = skb_network_header(skb); + l4.hdr = skb_checksum_start(skb); + + /* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */ + type_tucmd = IGC_ADVTXD_TUCMD_L4T_TCP; + + /* initialize outer IP header fields */ + if (ip.v4->version == 4) { + unsigned char *csum_start = skb_checksum_start(skb); + unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4); + + /* IP header will have to cancel out any data that + * is not a part of the outer IP header + */ + ip.v4->check = csum_fold(csum_partial(trans_start, + csum_start - trans_start, + 0)); + type_tucmd |= IGC_ADVTXD_TUCMD_IPV4; + + ip.v4->tot_len = 0; + first->tx_flags |= IGC_TX_FLAGS_TSO | + IGC_TX_FLAGS_CSUM | + IGC_TX_FLAGS_IPV4; + } else { + ip.v6->payload_len = 0; + first->tx_flags |= IGC_TX_FLAGS_TSO | + IGC_TX_FLAGS_CSUM; + } + + /* determine offset of inner transport header */ + l4_offset = l4.hdr - skb->data; + + /* remove payload length from inner checksum */ + paylen = skb->len - l4_offset; + if (type_tucmd & IGC_ADVTXD_TUCMD_L4T_TCP) { + /* compute length of segmentation header */ + *hdr_len = (l4.tcp->doff * 4) + l4_offset; + csum_replace_by_diff(&l4.tcp->check, + (__force __wsum)htonl(paylen)); + } else { + /* compute length of segmentation header */ + *hdr_len = sizeof(*l4.udp) + l4_offset; + csum_replace_by_diff(&l4.udp->check, + (__force __wsum)htonl(paylen)); + } + + /* update gso size and bytecount with header size */ + first->gso_segs = skb_shinfo(skb)->gso_segs; + first->bytecount += (first->gso_segs - 1) * *hdr_len; + + /* MSS L4LEN IDX */ + mss_l4len_idx = (*hdr_len - l4_offset) << IGC_ADVTXD_L4LEN_SHIFT; + mss_l4len_idx |= skb_shinfo(skb)->gso_size << IGC_ADVTXD_MSS_SHIFT; + + /* VLAN MACLEN IPLEN */ + vlan_macip_lens = l4.hdr - ip.hdr; + vlan_macip_lens |= (ip.hdr - skb->data) << IGC_ADVTXD_MACLEN_SHIFT; + vlan_macip_lens |= first->tx_flags & IGC_TX_FLAGS_VLAN_MASK; + + igc_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, + type_tucmd, mss_l4len_idx); + + return 1; +} + static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, struct igc_ring *tx_ring) { @@ -1141,6 +1278,7 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, u32 tx_flags = 0; unsigned short f; u8 hdr_len = 0; + int tso = 0; /* need: 1 descriptor per page * PAGE_SIZE/IGC_MAX_DATA_PER_TXD, * + 1 desc for skb_headlen/IGC_MAX_DATA_PER_TXD, @@ -1163,15 +1301,45 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, first->bytecount = skb->len; first->gso_segs = 1; + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + struct igc_adapter *adapter = netdev_priv(tx_ring->netdev); + + /* FIXME: add support for retrieving timestamps from + * the other timer registers before skipping the + * timestamping request. + */ + if (adapter->tstamp_config.tx_type == HWTSTAMP_TX_ON && + !test_and_set_bit_lock(__IGC_PTP_TX_IN_PROGRESS, + &adapter->state)) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + tx_flags |= IGC_TX_FLAGS_TSTAMP; + + adapter->ptp_tx_skb = skb_get(skb); + adapter->ptp_tx_start = jiffies; + } else { + adapter->tx_hwtstamp_skipped++; + } + } + /* record initial flags and protocol */ first->tx_flags = tx_flags; first->protocol = protocol; - igc_tx_csum(tx_ring, first); + tso = igc_tso(tx_ring, first, &hdr_len); + if (tso < 0) + goto out_drop; + else if (!tso) + igc_tx_csum(tx_ring, first); igc_tx_map(tx_ring, first, hdr_len); return NETDEV_TX_OK; + +out_drop: + dev_kfree_skb_any(first->skb); + first->skb = NULL; + + return NETDEV_TX_OK; } static inline struct igc_ring *igc_tx_queue_mapping(struct igc_adapter *adapter, @@ -1270,6 +1438,10 @@ static void igc_process_skb_fields(struct igc_ring *rx_ring, igc_rx_checksum(rx_ring, rx_desc, skb); + if (igc_test_staterr(rx_desc, IGC_RXDADV_STAT_TS) && + !igc_test_staterr(rx_desc, IGC_RXDADV_STAT_TSIP)) + igc_ptp_rx_rgtstamp(rx_ring->q_vector, skb); + skb_record_rx_queue(skb, rx_ring->queue_index); skb->protocol = eth_type_trans(skb, rx_ring->netdev); @@ -1389,6 +1561,12 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring, if (unlikely(!skb)) return NULL; + if (unlikely(igc_test_staterr(rx_desc, IGC_RXDADV_STAT_TSIP))) { + igc_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); + va += IGC_TS_HDR_LEN; + size -= IGC_TS_HDR_LEN; + } + /* Determine available headroom for copy */ headlen = size; if (headlen > IGC_RX_HDR_LEN) @@ -1565,6 +1743,52 @@ static void igc_put_rx_buffer(struct igc_ring *rx_ring, rx_buffer->page = NULL; } +static inline unsigned int igc_rx_offset(struct igc_ring *rx_ring) +{ + return ring_uses_build_skb(rx_ring) ? IGC_SKB_PAD : 0; +} + +static bool igc_alloc_mapped_page(struct igc_ring *rx_ring, + struct igc_rx_buffer *bi) +{ + struct page *page = bi->page; + dma_addr_t dma; + + /* since we are recycling buffers we should seldom need to alloc */ + if (likely(page)) + return true; + + /* alloc new page for storage */ + page = dev_alloc_pages(igc_rx_pg_order(rx_ring)); + if (unlikely(!page)) { + rx_ring->rx_stats.alloc_failed++; + return false; + } + + /* map page for use */ + dma = dma_map_page_attrs(rx_ring->dev, page, 0, + igc_rx_pg_size(rx_ring), + DMA_FROM_DEVICE, + IGC_RX_DMA_ATTR); + + /* if mapping failed free memory back to system since + * there isn't much point in holding memory we can't use + */ + if (dma_mapping_error(rx_ring->dev, dma)) { + __free_page(page); + + rx_ring->rx_stats.alloc_failed++; + return false; + } + + bi->dma = dma; + bi->page = page; + bi->page_offset = igc_rx_offset(rx_ring); + bi->pagecnt_bias = 1; + + return true; +} + /** * igc_alloc_rx_buffers - Replace used receive buffers; packet split * @rx_ring: rx descriptor ring @@ -1726,52 +1950,6 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) return total_packets; } -static inline unsigned int igc_rx_offset(struct igc_ring *rx_ring) -{ - return ring_uses_build_skb(rx_ring) ? IGC_SKB_PAD : 0; -} - -static bool igc_alloc_mapped_page(struct igc_ring *rx_ring, - struct igc_rx_buffer *bi) -{ - struct page *page = bi->page; - dma_addr_t dma; - - /* since we are recycling buffers we should seldom need to alloc */ - if (likely(page)) - return true; - - /* alloc new page for storage */ - page = dev_alloc_pages(igc_rx_pg_order(rx_ring)); - if (unlikely(!page)) { - rx_ring->rx_stats.alloc_failed++; - return false; - } - - /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, - igc_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IGC_RX_DMA_ATTR); - - /* if mapping failed free memory back to system since - * there isn't much point in holding memory we can't use - */ - if (dma_mapping_error(rx_ring->dev, dma)) { - __free_page(page); - - rx_ring->rx_stats.alloc_failed++; - return false; - } - - bi->dma = dma; - bi->page = page; - bi->page_offset = igc_rx_offset(rx_ring); - bi->pagecnt_bias = 1; - - return true; -} - /** * igc_clean_tx_irq - Reclaim resources after transmit completes * @q_vector: pointer to q_vector containing needed info @@ -1943,6 +2121,1128 @@ static bool igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) return !!budget; } +static void igc_nfc_filter_restore(struct igc_adapter *adapter) +{ + struct igc_nfc_filter *rule; + + spin_lock(&adapter->nfc_lock); + + hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) + igc_add_filter(adapter, rule); + + spin_unlock(&adapter->nfc_lock); +} + +/* If the filter to be added and an already existing filter express + * the same address and address type, it should be possible to only + * override the other configurations, for example the queue to steer + * traffic. + */ +static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, + const u8 *addr, const u8 flags) +{ + if (!(entry->state & IGC_MAC_STATE_IN_USE)) + return true; + + if ((entry->state & IGC_MAC_STATE_SRC_ADDR) != + (flags & IGC_MAC_STATE_SRC_ADDR)) + return false; + + if (!ether_addr_equal(addr, entry->addr)) + return false; + + return true; +} + +/* Add a MAC filter for 'addr' directing matching traffic to 'queue', + * 'flags' is used to indicate what kind of match is made, match is by + * default for the destination address, if matching by source address + * is desired the flag IGC_MAC_STATE_SRC_ADDR can be used. + */ +static int igc_add_mac_filter(struct igc_adapter *adapter, + const u8 *addr, const u8 queue) +{ + struct igc_hw *hw = &adapter->hw; + int rar_entries = hw->mac.rar_entry_count; + int i; + + if (is_zero_ether_addr(addr)) + return -EINVAL; + + /* Search for the first empty entry in the MAC table. + * Do not touch entries at the end of the table reserved for the VF MAC + * addresses. + */ + for (i = 0; i < rar_entries; i++) { + if (!igc_mac_entry_can_be_used(&adapter->mac_table[i], + addr, 0)) + continue; + + ether_addr_copy(adapter->mac_table[i].addr, addr); + adapter->mac_table[i].queue = queue; + adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE; + + igc_rar_set_index(adapter, i); + return i; + } + + return -ENOSPC; +} + +/* Remove a MAC filter for 'addr' directing matching traffic to + * 'queue', 'flags' is used to indicate what kind of match need to be + * removed, match is by default for the destination address, if + * matching by source address is to be removed the flag + * IGC_MAC_STATE_SRC_ADDR can be used. + */ +static int igc_del_mac_filter(struct igc_adapter *adapter, + const u8 *addr, const u8 queue) +{ + struct igc_hw *hw = &adapter->hw; + int rar_entries = hw->mac.rar_entry_count; + int i; + + if (is_zero_ether_addr(addr)) + return -EINVAL; + + /* Search for matching entry in the MAC table based on given address + * and queue. Do not touch entries at the end of the table reserved + * for the VF MAC addresses. + */ + for (i = 0; i < rar_entries; i++) { + if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) + continue; + if (adapter->mac_table[i].state != 0) + continue; + if (adapter->mac_table[i].queue != queue) + continue; + if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) + continue; + + /* When a filter for the default address is "deleted", + * we return it to its initial configuration + */ + if (adapter->mac_table[i].state & IGC_MAC_STATE_DEFAULT) { + adapter->mac_table[i].state = + IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; + adapter->mac_table[i].queue = 0; + } else { + adapter->mac_table[i].state = 0; + adapter->mac_table[i].queue = 0; + memset(adapter->mac_table[i].addr, 0, ETH_ALEN); + } + + igc_rar_set_index(adapter, i); + return 0; + } + + return -ENOENT; +} + +static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + int ret; + + ret = igc_add_mac_filter(adapter, addr, adapter->num_rx_queues); + + return min_t(int, ret, 0); +} + +static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + + igc_del_mac_filter(adapter, addr, adapter->num_rx_queues); + + return 0; +} + +/** + * igc_set_rx_mode - Secondary Unicast, Multicast and Promiscuous mode set + * @netdev: network interface device structure + * + * The set_rx_mode entry point is called whenever the unicast or multicast + * address lists or the network interface flags are updated. This routine is + * responsible for configuring the hardware for proper unicast, multicast, + * promiscuous mode, and all-multi behavior. + */ +static void igc_set_rx_mode(struct net_device *netdev) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_hw *hw = &adapter->hw; + u32 rctl = 0, rlpml = MAX_JUMBO_FRAME_SIZE; + int count; + + /* Check for Promiscuous and All Multicast modes */ + if (netdev->flags & IFF_PROMISC) { + rctl |= IGC_RCTL_UPE | IGC_RCTL_MPE; + } else { + if (netdev->flags & IFF_ALLMULTI) { + rctl |= IGC_RCTL_MPE; + } else { + /* Write addresses to the MTA, if the attempt fails + * then we should just turn on promiscuous mode so + * that we can at least receive multicast traffic + */ + count = igc_write_mc_addr_list(netdev); + if (count < 0) + rctl |= IGC_RCTL_MPE; + } + } + + /* Write addresses to available RAR registers, if there is not + * sufficient space to store all the addresses then enable + * unicast promiscuous mode + */ + if (__dev_uc_sync(netdev, igc_uc_sync, igc_uc_unsync)) + rctl |= IGC_RCTL_UPE; + + /* update state of unicast and multicast */ + rctl |= rd32(IGC_RCTL) & ~(IGC_RCTL_UPE | IGC_RCTL_MPE); + wr32(IGC_RCTL, rctl); + +#if (PAGE_SIZE < 8192) + if (adapter->max_frame_size <= IGC_MAX_FRAME_BUILD_SKB) + rlpml = IGC_MAX_FRAME_BUILD_SKB; +#endif + wr32(IGC_RLPML, rlpml); +} + +/** + * igc_configure - configure the hardware for RX and TX + * @adapter: private board structure + */ +static void igc_configure(struct igc_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int i = 0; + + igc_get_hw_control(adapter); + igc_set_rx_mode(netdev); + + igc_setup_tctl(adapter); + igc_setup_mrqc(adapter); + igc_setup_rctl(adapter); + + igc_nfc_filter_restore(adapter); + igc_configure_tx(adapter); + igc_configure_rx(adapter); + + igc_rx_fifo_flush_base(&adapter->hw); + + /* call igc_desc_unused which always leaves + * at least 1 descriptor unused to make sure + * next_to_use != next_to_clean + */ + for (i = 0; i < adapter->num_rx_queues; i++) { + struct igc_ring *ring = adapter->rx_ring[i]; + + igc_alloc_rx_buffers(ring, igc_desc_unused(ring)); + } +} + +/** + * igc_write_ivar - configure ivar for given MSI-X vector + * @hw: pointer to the HW structure + * @msix_vector: vector number we are allocating to a given ring + * @index: row index of IVAR register to write within IVAR table + * @offset: column offset of in IVAR, should be multiple of 8 + * + * The IVAR table consists of 2 columns, + * each containing an cause allocation for an Rx and Tx ring, and a + * variable number of rows depending on the number of queues supported. + */ +static void igc_write_ivar(struct igc_hw *hw, int msix_vector, + int index, int offset) +{ + u32 ivar = array_rd32(IGC_IVAR0, index); + + /* clear any bits that are currently set */ + ivar &= ~((u32)0xFF << offset); + + /* write vector and valid bit */ + ivar |= (msix_vector | IGC_IVAR_VALID) << offset; + + array_wr32(IGC_IVAR0, index, ivar); +} + +static void igc_assign_vector(struct igc_q_vector *q_vector, int msix_vector) +{ + struct igc_adapter *adapter = q_vector->adapter; + struct igc_hw *hw = &adapter->hw; + int rx_queue = IGC_N0_QUEUE; + int tx_queue = IGC_N0_QUEUE; + + if (q_vector->rx.ring) + rx_queue = q_vector->rx.ring->reg_idx; + if (q_vector->tx.ring) + tx_queue = q_vector->tx.ring->reg_idx; + + switch (hw->mac.type) { + case igc_i225: + if (rx_queue > IGC_N0_QUEUE) + igc_write_ivar(hw, msix_vector, + rx_queue >> 1, + (rx_queue & 0x1) << 4); + if (tx_queue > IGC_N0_QUEUE) + igc_write_ivar(hw, msix_vector, + tx_queue >> 1, + ((tx_queue & 0x1) << 4) + 8); + q_vector->eims_value = BIT(msix_vector); + break; + default: + WARN_ONCE(hw->mac.type != igc_i225, "Wrong MAC type\n"); + break; + } + + /* add q_vector eims value to global eims_enable_mask */ + adapter->eims_enable_mask |= q_vector->eims_value; + + /* configure q_vector to set itr on first interrupt */ + q_vector->set_itr = 1; +} + +/** + * igc_configure_msix - Configure MSI-X hardware + * @adapter: Pointer to adapter structure + * + * igc_configure_msix sets up the hardware to properly + * generate MSI-X interrupts. + */ +static void igc_configure_msix(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + int i, vector = 0; + u32 tmp; + + adapter->eims_enable_mask = 0; + + /* set vector for other causes, i.e. link changes */ + switch (hw->mac.type) { + case igc_i225: + /* Turn on MSI-X capability first, or our settings + * won't stick. And it will take days to debug. + */ + wr32(IGC_GPIE, IGC_GPIE_MSIX_MODE | + IGC_GPIE_PBA | IGC_GPIE_EIAME | + IGC_GPIE_NSICR); + + /* enable msix_other interrupt */ + adapter->eims_other = BIT(vector); + tmp = (vector++ | IGC_IVAR_VALID) << 8; + + wr32(IGC_IVAR_MISC, tmp); + break; + default: + /* do nothing, since nothing else supports MSI-X */ + break; + } /* switch (hw->mac.type) */ + + adapter->eims_enable_mask |= adapter->eims_other; + + for (i = 0; i < adapter->num_q_vectors; i++) + igc_assign_vector(adapter->q_vector[i], vector++); + + wrfl(); +} + +/** + * igc_irq_enable - Enable default interrupt generation settings + * @adapter: board private structure + */ +static void igc_irq_enable(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + if (adapter->msix_entries) { + u32 ims = IGC_IMS_LSC | IGC_IMS_DOUTSYNC | IGC_IMS_DRSTA; + u32 regval = rd32(IGC_EIAC); + + wr32(IGC_EIAC, regval | adapter->eims_enable_mask); + regval = rd32(IGC_EIAM); + wr32(IGC_EIAM, regval | adapter->eims_enable_mask); + wr32(IGC_EIMS, adapter->eims_enable_mask); + wr32(IGC_IMS, ims); + } else { + wr32(IGC_IMS, IMS_ENABLE_MASK | IGC_IMS_DRSTA); + wr32(IGC_IAM, IMS_ENABLE_MASK | IGC_IMS_DRSTA); + } +} + +/** + * igc_irq_disable - Mask off interrupt generation on the NIC + * @adapter: board private structure + */ +static void igc_irq_disable(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + + if (adapter->msix_entries) { + u32 regval = rd32(IGC_EIAM); + + wr32(IGC_EIAM, regval & ~adapter->eims_enable_mask); + wr32(IGC_EIMC, adapter->eims_enable_mask); + regval = rd32(IGC_EIAC); + wr32(IGC_EIAC, regval & ~adapter->eims_enable_mask); + } + + wr32(IGC_IAM, 0); + wr32(IGC_IMC, ~0); + wrfl(); + + if (adapter->msix_entries) { + int vector = 0, i; + + synchronize_irq(adapter->msix_entries[vector++].vector); + + for (i = 0; i < adapter->num_q_vectors; i++) + synchronize_irq(adapter->msix_entries[vector++].vector); + } else { + synchronize_irq(adapter->pdev->irq); + } +} + +void igc_set_flag_queue_pairs(struct igc_adapter *adapter, + const u32 max_rss_queues) +{ + /* Determine if we need to pair queues. */ + /* If rss_queues > half of max_rss_queues, pair the queues in + * order to conserve interrupts due to limited supply. + */ + if (adapter->rss_queues > (max_rss_queues / 2)) + adapter->flags |= IGC_FLAG_QUEUE_PAIRS; + else + adapter->flags &= ~IGC_FLAG_QUEUE_PAIRS; +} + +unsigned int igc_get_max_rss_queues(struct igc_adapter *adapter) +{ + unsigned int max_rss_queues; + + /* Determine the maximum number of RSS queues supported. */ + max_rss_queues = IGC_MAX_RX_QUEUES; + + return max_rss_queues; +} + +static void igc_init_queue_configuration(struct igc_adapter *adapter) +{ + u32 max_rss_queues; + + max_rss_queues = igc_get_max_rss_queues(adapter); + adapter->rss_queues = min_t(u32, max_rss_queues, num_online_cpus()); + + igc_set_flag_queue_pairs(adapter, max_rss_queues); +} + +/** + * igc_reset_q_vector - Reset config for interrupt vector + * @adapter: board private structure to initialize + * @v_idx: Index of vector to be reset + * + * If NAPI is enabled it will delete any references to the + * NAPI struct. This is preparation for igc_free_q_vector. + */ +static void igc_reset_q_vector(struct igc_adapter *adapter, int v_idx) +{ + struct igc_q_vector *q_vector = adapter->q_vector[v_idx]; + + /* if we're coming from igc_set_interrupt_capability, the vectors are + * not yet allocated + */ + if (!q_vector) + return; + + if (q_vector->tx.ring) + adapter->tx_ring[q_vector->tx.ring->queue_index] = NULL; + + if (q_vector->rx.ring) + adapter->rx_ring[q_vector->rx.ring->queue_index] = NULL; + + netif_napi_del(&q_vector->napi); +} + +/** + * igc_free_q_vector - Free memory allocated for specific interrupt vector + * @adapter: board private structure to initialize + * @v_idx: Index of vector to be freed + * + * This function frees the memory allocated to the q_vector. + */ +static void igc_free_q_vector(struct igc_adapter *adapter, int v_idx) +{ + struct igc_q_vector *q_vector = adapter->q_vector[v_idx]; + + adapter->q_vector[v_idx] = NULL; + + /* igc_get_stats64() might access the rings on this vector, + * we must wait a grace period before freeing it. + */ + if (q_vector) + kfree_rcu(q_vector, rcu); +} + +/** + * igc_free_q_vectors - Free memory allocated for interrupt vectors + * @adapter: board private structure to initialize + * + * This function frees the memory allocated to the q_vectors. In addition if + * NAPI is enabled it will delete any references to the NAPI struct prior + * to freeing the q_vector. + */ +static void igc_free_q_vectors(struct igc_adapter *adapter) +{ + int v_idx = adapter->num_q_vectors; + + adapter->num_tx_queues = 0; + adapter->num_rx_queues = 0; + adapter->num_q_vectors = 0; + + while (v_idx--) { + igc_reset_q_vector(adapter, v_idx); + igc_free_q_vector(adapter, v_idx); + } +} + +/** + * igc_update_itr - update the dynamic ITR value based on statistics + * @q_vector: pointer to q_vector + * @ring_container: ring info to update the itr for + * + * Stores a new ITR value based on packets and byte + * counts during the last interrupt. The advantage of per interrupt + * computation is faster updates and more accurate ITR for the current + * traffic pattern. Constants in this function were computed + * based on theoretical maximum wire speed and thresholds were set based + * on testing data as well as attempting to minimize response time + * while increasing bulk throughput. + * NOTE: These calculations are only valid when operating in a single- + * queue environment. + */ +static void igc_update_itr(struct igc_q_vector *q_vector, + struct igc_ring_container *ring_container) +{ + unsigned int packets = ring_container->total_packets; + unsigned int bytes = ring_container->total_bytes; + u8 itrval = ring_container->itr; + + /* no packets, exit with status unchanged */ + if (packets == 0) + return; + + switch (itrval) { + case lowest_latency: + /* handle TSO and jumbo frames */ + if (bytes / packets > 8000) + itrval = bulk_latency; + else if ((packets < 5) && (bytes > 512)) + itrval = low_latency; + break; + case low_latency: /* 50 usec aka 20000 ints/s */ + if (bytes > 10000) { + /* this if handles the TSO accounting */ + if (bytes / packets > 8000) + itrval = bulk_latency; + else if ((packets < 10) || ((bytes / packets) > 1200)) + itrval = bulk_latency; + else if ((packets > 35)) + itrval = lowest_latency; + } else if (bytes / packets > 2000) { + itrval = bulk_latency; + } else if (packets <= 2 && bytes < 512) { + itrval = lowest_latency; + } + break; + case bulk_latency: /* 250 usec aka 4000 ints/s */ + if (bytes > 25000) { + if (packets > 35) + itrval = low_latency; + } else if (bytes < 1500) { + itrval = low_latency; + } + break; + } + + /* clear work counters since we have the values we need */ + ring_container->total_bytes = 0; + ring_container->total_packets = 0; + + /* write updated itr to ring container */ + ring_container->itr = itrval; +} + +static void igc_set_itr(struct igc_q_vector *q_vector) +{ + struct igc_adapter *adapter = q_vector->adapter; + u32 new_itr = q_vector->itr_val; + u8 current_itr = 0; + + /* for non-gigabit speeds, just fix the interrupt rate at 4000 */ + switch (adapter->link_speed) { + case SPEED_10: + case SPEED_100: + current_itr = 0; + new_itr = IGC_4K_ITR; + goto set_itr_now; + default: + break; + } + + igc_update_itr(q_vector, &q_vector->tx); + igc_update_itr(q_vector, &q_vector->rx); + + current_itr = max(q_vector->rx.itr, q_vector->tx.itr); + + /* conservative mode (itr 3) eliminates the lowest_latency setting */ + if (current_itr == lowest_latency && + ((q_vector->rx.ring && adapter->rx_itr_setting == 3) || + (!q_vector->rx.ring && adapter->tx_itr_setting == 3))) + current_itr = low_latency; + + switch (current_itr) { + /* counts and packets in update_itr are dependent on these numbers */ + case lowest_latency: + new_itr = IGC_70K_ITR; /* 70,000 ints/sec */ + break; + case low_latency: + new_itr = IGC_20K_ITR; /* 20,000 ints/sec */ + break; + case bulk_latency: + new_itr = IGC_4K_ITR; /* 4,000 ints/sec */ + break; + default: + break; + } + +set_itr_now: + if (new_itr != q_vector->itr_val) { + /* this attempts to bias the interrupt rate towards Bulk + * by adding intermediate steps when interrupt rate is + * increasing + */ + new_itr = new_itr > q_vector->itr_val ? + max((new_itr * q_vector->itr_val) / + (new_itr + (q_vector->itr_val >> 2)), + new_itr) : new_itr; + /* Don't write the value here; it resets the adapter's + * internal timer, and causes us to delay far longer than + * we should between interrupts. Instead, we write the ITR + * value at the beginning of the next interrupt so the timing + * ends up being correct. + */ + q_vector->itr_val = new_itr; + q_vector->set_itr = 1; + } +} + +static void igc_reset_interrupt_capability(struct igc_adapter *adapter) +{ + int v_idx = adapter->num_q_vectors; + + if (adapter->msix_entries) { + pci_disable_msix(adapter->pdev); + kfree(adapter->msix_entries); + adapter->msix_entries = NULL; + } else if (adapter->flags & IGC_FLAG_HAS_MSI) { + pci_disable_msi(adapter->pdev); + } + + while (v_idx--) + igc_reset_q_vector(adapter, v_idx); +} + +/** + * igc_set_interrupt_capability - set MSI or MSI-X if supported + * @adapter: Pointer to adapter structure + * @msix: boolean value for MSI-X capability + * + * Attempt to configure interrupts using the best available + * capabilities of the hardware and kernel. + */ +static void igc_set_interrupt_capability(struct igc_adapter *adapter, + bool msix) +{ + int numvecs, i; + int err; + + if (!msix) + goto msi_only; + adapter->flags |= IGC_FLAG_HAS_MSIX; + + /* Number of supported queues. */ + adapter->num_rx_queues = adapter->rss_queues; + + adapter->num_tx_queues = adapter->rss_queues; + + /* start with one vector for every Rx queue */ + numvecs = adapter->num_rx_queues; + + /* if Tx handler is separate add 1 for every Tx queue */ + if (!(adapter->flags & IGC_FLAG_QUEUE_PAIRS)) + numvecs += adapter->num_tx_queues; + + /* store the number of vectors reserved for queues */ + adapter->num_q_vectors = numvecs; + + /* add 1 vector for link status interrupts */ + numvecs++; + + adapter->msix_entries = kcalloc(numvecs, sizeof(struct msix_entry), + GFP_KERNEL); + + if (!adapter->msix_entries) + return; + + /* populate entry values */ + for (i = 0; i < numvecs; i++) + adapter->msix_entries[i].entry = i; + + err = pci_enable_msix_range(adapter->pdev, + adapter->msix_entries, + numvecs, + numvecs); + if (err > 0) + return; + + kfree(adapter->msix_entries); + adapter->msix_entries = NULL; + + igc_reset_interrupt_capability(adapter); + +msi_only: + adapter->flags &= ~IGC_FLAG_HAS_MSIX; + + adapter->rss_queues = 1; + adapter->flags |= IGC_FLAG_QUEUE_PAIRS; + adapter->num_rx_queues = 1; + adapter->num_tx_queues = 1; + adapter->num_q_vectors = 1; + if (!pci_enable_msi(adapter->pdev)) + adapter->flags |= IGC_FLAG_HAS_MSI; +} + +/** + * igc_update_ring_itr - update the dynamic ITR value based on packet size + * @q_vector: pointer to q_vector + * + * Stores a new ITR value based on strictly on packet size. This + * algorithm is less sophisticated than that used in igc_update_itr, + * due to the difficulty of synchronizing statistics across multiple + * receive rings. The divisors and thresholds used by this function + * were determined based on theoretical maximum wire speed and testing + * data, in order to minimize response time while increasing bulk + * throughput. + * NOTE: This function is called only when operating in a multiqueue + * receive environment. + */ +static void igc_update_ring_itr(struct igc_q_vector *q_vector) +{ + struct igc_adapter *adapter = q_vector->adapter; + int new_val = q_vector->itr_val; + int avg_wire_size = 0; + unsigned int packets; + + /* For non-gigabit speeds, just fix the interrupt rate at 4000 + * ints/sec - ITR timer value of 120 ticks. + */ + switch (adapter->link_speed) { + case SPEED_10: + case SPEED_100: + new_val = IGC_4K_ITR; + goto set_itr_val; + default: + break; + } + + packets = q_vector->rx.total_packets; + if (packets) + avg_wire_size = q_vector->rx.total_bytes / packets; + + packets = q_vector->tx.total_packets; + if (packets) + avg_wire_size = max_t(u32, avg_wire_size, + q_vector->tx.total_bytes / packets); + + /* if avg_wire_size isn't set no work was done */ + if (!avg_wire_size) + goto clear_counts; + + /* Add 24 bytes to size to account for CRC, preamble, and gap */ + avg_wire_size += 24; + + /* Don't starve jumbo frames */ + avg_wire_size = min(avg_wire_size, 3000); + + /* Give a little boost to mid-size frames */ + if (avg_wire_size > 300 && avg_wire_size < 1200) + new_val = avg_wire_size / 3; + else + new_val = avg_wire_size / 2; + + /* conservative mode (itr 3) eliminates the lowest_latency setting */ + if (new_val < IGC_20K_ITR && + ((q_vector->rx.ring && adapter->rx_itr_setting == 3) || + (!q_vector->rx.ring && adapter->tx_itr_setting == 3))) + new_val = IGC_20K_ITR; + +set_itr_val: + if (new_val != q_vector->itr_val) { + q_vector->itr_val = new_val; + q_vector->set_itr = 1; + } +clear_counts: + q_vector->rx.total_bytes = 0; + q_vector->rx.total_packets = 0; + q_vector->tx.total_bytes = 0; + q_vector->tx.total_packets = 0; +} + +static void igc_ring_irq_enable(struct igc_q_vector *q_vector) +{ + struct igc_adapter *adapter = q_vector->adapter; + struct igc_hw *hw = &adapter->hw; + + if ((q_vector->rx.ring && (adapter->rx_itr_setting & 3)) || + (!q_vector->rx.ring && (adapter->tx_itr_setting & 3))) { + if (adapter->num_q_vectors == 1) + igc_set_itr(q_vector); + else + igc_update_ring_itr(q_vector); + } + + if (!test_bit(__IGC_DOWN, &adapter->state)) { + if (adapter->msix_entries) + wr32(IGC_EIMS, q_vector->eims_value); + else + igc_irq_enable(adapter); + } +} + +static void igc_add_ring(struct igc_ring *ring, + struct igc_ring_container *head) +{ + head->ring = ring; + head->count++; +} + +/** + * igc_cache_ring_register - Descriptor ring to register mapping + * @adapter: board private structure to initialize + * + * Once we know the feature-set enabled for the device, we'll cache + * the register offset the descriptor ring is assigned to. + */ +static void igc_cache_ring_register(struct igc_adapter *adapter) +{ + int i = 0, j = 0; + + switch (adapter->hw.mac.type) { + case igc_i225: + /* Fall through */ + default: + for (; i < adapter->num_rx_queues; i++) + adapter->rx_ring[i]->reg_idx = i; + for (; j < adapter->num_tx_queues; j++) + adapter->tx_ring[j]->reg_idx = j; + break; + } +} + +/** + * igc_poll - NAPI Rx polling callback + * @napi: napi polling structure + * @budget: count of how many packets we should handle + */ +static int igc_poll(struct napi_struct *napi, int budget) +{ + struct igc_q_vector *q_vector = container_of(napi, + struct igc_q_vector, + napi); + bool clean_complete = true; + int work_done = 0; + + if (q_vector->tx.ring) + clean_complete = igc_clean_tx_irq(q_vector, budget); + + if (q_vector->rx.ring) { + int cleaned = igc_clean_rx_irq(q_vector, budget); + + work_done += cleaned; + if (cleaned >= budget) + clean_complete = false; + } + + /* If all work not completed, return budget and keep polling */ + if (!clean_complete) + return budget; + + /* Exit the polling mode, but don't re-enable interrupts if stack might + * poll us due to busy-polling + */ + if (likely(napi_complete_done(napi, work_done))) + igc_ring_irq_enable(q_vector); + + return min(work_done, budget - 1); +} + +/** + * igc_alloc_q_vector - Allocate memory for a single interrupt vector + * @adapter: board private structure to initialize + * @v_count: q_vectors allocated on adapter, used for ring interleaving + * @v_idx: index of vector in adapter struct + * @txr_count: total number of Tx rings to allocate + * @txr_idx: index of first Tx ring to allocate + * @rxr_count: total number of Rx rings to allocate + * @rxr_idx: index of first Rx ring to allocate + * + * We allocate one q_vector. If allocation fails we return -ENOMEM. + */ +static int igc_alloc_q_vector(struct igc_adapter *adapter, + unsigned int v_count, unsigned int v_idx, + unsigned int txr_count, unsigned int txr_idx, + unsigned int rxr_count, unsigned int rxr_idx) +{ + struct igc_q_vector *q_vector; + struct igc_ring *ring; + int ring_count; + + /* igc only supports 1 Tx and/or 1 Rx queue per vector */ + if (txr_count > 1 || rxr_count > 1) + return -ENOMEM; + + ring_count = txr_count + rxr_count; + + /* allocate q_vector and rings */ + q_vector = adapter->q_vector[v_idx]; + if (!q_vector) + q_vector = kzalloc(struct_size(q_vector, ring, ring_count), + GFP_KERNEL); + else + memset(q_vector, 0, struct_size(q_vector, ring, ring_count)); + if (!q_vector) + return -ENOMEM; + + /* initialize NAPI */ + netif_napi_add(adapter->netdev, &q_vector->napi, + igc_poll, 64); + + /* tie q_vector and adapter together */ + adapter->q_vector[v_idx] = q_vector; + q_vector->adapter = adapter; + + /* initialize work limits */ + q_vector->tx.work_limit = adapter->tx_work_limit; + + /* initialize ITR configuration */ + q_vector->itr_register = adapter->io_addr + IGC_EITR(0); + q_vector->itr_val = IGC_START_ITR; + + /* initialize pointer to rings */ + ring = q_vector->ring; + + /* initialize ITR */ + if (rxr_count) { + /* rx or rx/tx vector */ + if (!adapter->rx_itr_setting || adapter->rx_itr_setting > 3) + q_vector->itr_val = adapter->rx_itr_setting; + } else { + /* tx only vector */ + if (!adapter->tx_itr_setting || adapter->tx_itr_setting > 3) + q_vector->itr_val = adapter->tx_itr_setting; + } + + if (txr_count) { + /* assign generic ring traits */ + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + + /* configure backlink on ring */ + ring->q_vector = q_vector; + + /* update q_vector Tx values */ + igc_add_ring(ring, &q_vector->tx); + + /* apply Tx specific ring traits */ + ring->count = adapter->tx_ring_count; + ring->queue_index = txr_idx; + + /* assign ring to adapter */ + adapter->tx_ring[txr_idx] = ring; + + /* push pointer to next ring */ + ring++; + } + + if (rxr_count) { + /* assign generic ring traits */ + ring->dev = &adapter->pdev->dev; + ring->netdev = adapter->netdev; + + /* configure backlink on ring */ + ring->q_vector = q_vector; + + /* update q_vector Rx values */ + igc_add_ring(ring, &q_vector->rx); + + /* apply Rx specific ring traits */ + ring->count = adapter->rx_ring_count; + ring->queue_index = rxr_idx; + + /* assign ring to adapter */ + adapter->rx_ring[rxr_idx] = ring; + } + + return 0; +} + +/** + * igc_alloc_q_vectors - Allocate memory for interrupt vectors + * @adapter: board private structure to initialize + * + * We allocate one q_vector per queue interrupt. If allocation fails we + * return -ENOMEM. + */ +static int igc_alloc_q_vectors(struct igc_adapter *adapter) +{ + int rxr_remaining = adapter->num_rx_queues; + int txr_remaining = adapter->num_tx_queues; + int rxr_idx = 0, txr_idx = 0, v_idx = 0; + int q_vectors = adapter->num_q_vectors; + int err; + + if (q_vectors >= (rxr_remaining + txr_remaining)) { + for (; rxr_remaining; v_idx++) { + err = igc_alloc_q_vector(adapter, q_vectors, v_idx, + 0, 0, 1, rxr_idx); + + if (err) + goto err_out; + + /* update counts and index */ + rxr_remaining--; + rxr_idx++; + } + } + + for (; v_idx < q_vectors; v_idx++) { + int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx); + int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx); + + err = igc_alloc_q_vector(adapter, q_vectors, v_idx, + tqpv, txr_idx, rqpv, rxr_idx); + + if (err) + goto err_out; + + /* update counts and index */ + rxr_remaining -= rqpv; + txr_remaining -= tqpv; + rxr_idx++; + txr_idx++; + } + + return 0; + +err_out: + adapter->num_tx_queues = 0; + adapter->num_rx_queues = 0; + adapter->num_q_vectors = 0; + + while (v_idx--) + igc_free_q_vector(adapter, v_idx); + + return -ENOMEM; +} + +/** + * igc_init_interrupt_scheme - initialize interrupts, allocate queues/vectors + * @adapter: Pointer to adapter structure + * @msix: boolean for MSI-X capability + * + * This function initializes the interrupts and allocates all of the queues. + */ +static int igc_init_interrupt_scheme(struct igc_adapter *adapter, bool msix) +{ + struct pci_dev *pdev = adapter->pdev; + int err = 0; + + igc_set_interrupt_capability(adapter, msix); + + err = igc_alloc_q_vectors(adapter); + if (err) { + dev_err(&pdev->dev, "Unable to allocate memory for vectors\n"); + goto err_alloc_q_vectors; + } + + igc_cache_ring_register(adapter); + + return 0; + +err_alloc_q_vectors: + igc_reset_interrupt_capability(adapter); + return err; +} + +/** + * igc_sw_init - Initialize general software structures (struct igc_adapter) + * @adapter: board private structure to initialize + * + * igc_sw_init initializes the Adapter private data structure. + * Fields are initialized based on PCI device information and + * OS network device settings (MTU size). + */ +static int igc_sw_init(struct igc_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + struct igc_hw *hw = &adapter->hw; + + int size = sizeof(struct igc_mac_addr) * hw->mac.rar_entry_count; + + pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word); + + /* set default ring sizes */ + adapter->tx_ring_count = IGC_DEFAULT_TXD; + adapter->rx_ring_count = IGC_DEFAULT_RXD; + + /* set default ITR values */ + adapter->rx_itr_setting = IGC_DEFAULT_ITR; + adapter->tx_itr_setting = IGC_DEFAULT_ITR; + + /* set default work limits */ + adapter->tx_work_limit = IGC_DEFAULT_TX_WORK; + + /* adjust max frame to be at least the size of a standard frame */ + adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + + VLAN_HLEN; + adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN; + + spin_lock_init(&adapter->nfc_lock); + spin_lock_init(&adapter->stats64_lock); + /* Assume MSI-X interrupts, will be checked during IRQ allocation */ + adapter->flags |= IGC_FLAG_HAS_MSIX; + + adapter->mac_table = kzalloc(size, GFP_ATOMIC); + if (!adapter->mac_table) + return -ENOMEM; + + igc_init_queue_configuration(adapter); + + /* This call may decrease the number of queues */ + if (igc_init_interrupt_scheme(adapter, true)) { + dev_err(&pdev->dev, "Unable to allocate memory for queues\n"); + return -ENOMEM; + } + + /* Explicitly disable IRQ since the NIC can be in any state. */ + igc_irq_disable(adapter); + + set_bit(__IGC_DOWN, &adapter->state); + + return 0; +} + /** * igc_up - Open the interface and prepare it to handle traffic * @adapter: board private structure @@ -2164,18 +3464,6 @@ static void igc_nfc_filter_exit(struct igc_adapter *adapter) spin_unlock(&adapter->nfc_lock); } -static void igc_nfc_filter_restore(struct igc_adapter *adapter) -{ - struct igc_nfc_filter *rule; - - spin_lock(&adapter->nfc_lock); - - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) - igc_add_filter(adapter, rule); - - spin_unlock(&adapter->nfc_lock); -} - /** * igc_down - Close the interface * @adapter: board private structure @@ -2399,105 +3687,6 @@ igc_features_check(struct sk_buff *skb, struct net_device *dev, return features; } -/** - * igc_configure - configure the hardware for RX and TX - * @adapter: private board structure - */ -static void igc_configure(struct igc_adapter *adapter) -{ - struct net_device *netdev = adapter->netdev; - int i = 0; - - igc_get_hw_control(adapter); - igc_set_rx_mode(netdev); - - igc_setup_tctl(adapter); - igc_setup_mrqc(adapter); - igc_setup_rctl(adapter); - - igc_nfc_filter_restore(adapter); - igc_configure_tx(adapter); - igc_configure_rx(adapter); - - igc_rx_fifo_flush_base(&adapter->hw); - - /* call igc_desc_unused which always leaves - * at least 1 descriptor unused to make sure - * next_to_use != next_to_clean - */ - for (i = 0; i < adapter->num_rx_queues; i++) { - struct igc_ring *ring = adapter->rx_ring[i]; - - igc_alloc_rx_buffers(ring, igc_desc_unused(ring)); - } -} - -/** - * igc_rar_set_index - Sync RAL[index] and RAH[index] registers with MAC table - * @adapter: address of board private structure - * @index: Index of the RAR entry which need to be synced with MAC table - */ -static void igc_rar_set_index(struct igc_adapter *adapter, u32 index) -{ - u8 *addr = adapter->mac_table[index].addr; - struct igc_hw *hw = &adapter->hw; - u32 rar_low, rar_high; - - /* HW expects these to be in network order when they are plugged - * into the registers which are little endian. In order to guarantee - * that ordering we need to do an leXX_to_cpup here in order to be - * ready for the byteswap that occurs with writel - */ - rar_low = le32_to_cpup((__le32 *)(addr)); - rar_high = le16_to_cpup((__le16 *)(addr + 4)); - - /* Indicate to hardware the Address is Valid. */ - if (adapter->mac_table[index].state & IGC_MAC_STATE_IN_USE) { - if (is_valid_ether_addr(addr)) - rar_high |= IGC_RAH_AV; - - rar_high |= IGC_RAH_POOL_1 << - adapter->mac_table[index].queue; - } - - wr32(IGC_RAL(index), rar_low); - wrfl(); - wr32(IGC_RAH(index), rar_high); - wrfl(); -} - -/* Set default MAC address for the PF in the first RAR entry */ -static void igc_set_default_mac_filter(struct igc_adapter *adapter) -{ - struct igc_mac_addr *mac_table = &adapter->mac_table[0]; - - ether_addr_copy(mac_table->addr, adapter->hw.mac.addr); - mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - - igc_rar_set_index(adapter, 0); -} - -/* If the filter to be added and an already existing filter express - * the same address and address type, it should be possible to only - * override the other configurations, for example the queue to steer - * traffic. - */ -static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, - const u8 *addr, const u8 flags) -{ - if (!(entry->state & IGC_MAC_STATE_IN_USE)) - return true; - - if ((entry->state & IGC_MAC_STATE_SRC_ADDR) != - (flags & IGC_MAC_STATE_SRC_ADDR)) - return false; - - if (!ether_addr_equal(addr, entry->addr)) - return false; - - return true; -} - /* Add a MAC filter for 'addr' directing matching traffic to 'queue', * 'flags' is used to indicate what kind of match is made, match is by * default for the destination address, if matching by source address @@ -2598,159 +3787,20 @@ int igc_del_mac_steering_filter(struct igc_adapter *adapter, IGC_MAC_STATE_QUEUE_STEERING | flags); } -/* Add a MAC filter for 'addr' directing matching traffic to 'queue', - * 'flags' is used to indicate what kind of match is made, match is by - * default for the destination address, if matching by source address - * is desired the flag IGC_MAC_STATE_SRC_ADDR can be used. - */ -static int igc_add_mac_filter(struct igc_adapter *adapter, - const u8 *addr, const u8 queue) -{ - struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; - - if (is_zero_ether_addr(addr)) - return -EINVAL; - - /* Search for the first empty entry in the MAC table. - * Do not touch entries at the end of the table reserved for the VF MAC - * addresses. - */ - for (i = 0; i < rar_entries; i++) { - if (!igc_mac_entry_can_be_used(&adapter->mac_table[i], - addr, 0)) - continue; - - ether_addr_copy(adapter->mac_table[i].addr, addr); - adapter->mac_table[i].queue = queue; - adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE; - - igc_rar_set_index(adapter, i); - return i; - } - - return -ENOSPC; -} - -/* Remove a MAC filter for 'addr' directing matching traffic to - * 'queue', 'flags' is used to indicate what kind of match need to be - * removed, match is by default for the destination address, if - * matching by source address is to be removed the flag - * IGC_MAC_STATE_SRC_ADDR can be used. - */ -static int igc_del_mac_filter(struct igc_adapter *adapter, - const u8 *addr, const u8 queue) +static void igc_tsync_interrupt(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; + u32 tsicr = rd32(IGC_TSICR); + u32 ack = 0; - if (is_zero_ether_addr(addr)) - return -EINVAL; - - /* Search for matching entry in the MAC table based on given address - * and queue. Do not touch entries at the end of the table reserved - * for the VF MAC addresses. - */ - for (i = 0; i < rar_entries; i++) { - if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) - continue; - if (adapter->mac_table[i].state != 0) - continue; - if (adapter->mac_table[i].queue != queue) - continue; - if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) - continue; - - /* When a filter for the default address is "deleted", - * we return it to its initial configuration - */ - if (adapter->mac_table[i].state & IGC_MAC_STATE_DEFAULT) { - adapter->mac_table[i].state = - IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - adapter->mac_table[i].queue = 0; - } else { - adapter->mac_table[i].state = 0; - adapter->mac_table[i].queue = 0; - memset(adapter->mac_table[i].addr, 0, ETH_ALEN); - } - - igc_rar_set_index(adapter, i); - return 0; + if (tsicr & IGC_TSICR_TXTS) { + /* retrieve hardware timestamp */ + schedule_work(&adapter->ptp_tx_work); + ack |= IGC_TSICR_TXTS; } - return -ENOENT; -} - -static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) -{ - struct igc_adapter *adapter = netdev_priv(netdev); - int ret; - - ret = igc_add_mac_filter(adapter, addr, adapter->num_rx_queues); - - return min_t(int, ret, 0); -} - -static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) -{ - struct igc_adapter *adapter = netdev_priv(netdev); - - igc_del_mac_filter(adapter, addr, adapter->num_rx_queues); - - return 0; -} - -/** - * igc_set_rx_mode - Secondary Unicast, Multicast and Promiscuous mode set - * @netdev: network interface device structure - * - * The set_rx_mode entry point is called whenever the unicast or multicast - * address lists or the network interface flags are updated. This routine is - * responsible for configuring the hardware for proper unicast, multicast, - * promiscuous mode, and all-multi behavior. - */ -static void igc_set_rx_mode(struct net_device *netdev) -{ - struct igc_adapter *adapter = netdev_priv(netdev); - struct igc_hw *hw = &adapter->hw; - u32 rctl = 0, rlpml = MAX_JUMBO_FRAME_SIZE; - int count; - - /* Check for Promiscuous and All Multicast modes */ - if (netdev->flags & IFF_PROMISC) { - rctl |= IGC_RCTL_UPE | IGC_RCTL_MPE; - } else { - if (netdev->flags & IFF_ALLMULTI) { - rctl |= IGC_RCTL_MPE; - } else { - /* Write addresses to the MTA, if the attempt fails - * then we should just turn on promiscuous mode so - * that we can at least receive multicast traffic - */ - count = igc_write_mc_addr_list(netdev); - if (count < 0) - rctl |= IGC_RCTL_MPE; - } - } - - /* Write addresses to available RAR registers, if there is not - * sufficient space to store all the addresses then enable - * unicast promiscuous mode - */ - if (__dev_uc_sync(netdev, igc_uc_sync, igc_uc_unsync)) - rctl |= IGC_RCTL_UPE; - - /* update state of unicast and multicast */ - rctl |= rd32(IGC_RCTL) & ~(IGC_RCTL_UPE | IGC_RCTL_MPE); - wr32(IGC_RCTL, rctl); - -#if (PAGE_SIZE < 8192) - if (adapter->max_frame_size <= IGC_MAX_FRAME_BUILD_SKB) - rlpml = IGC_MAX_FRAME_BUILD_SKB; -#endif - wr32(IGC_RLPML, rlpml); + /* acknowledge the interrupts */ + wr32(IGC_TSICR, ack); } /** @@ -2780,114 +3830,28 @@ static irqreturn_t igc_msix_other(int irq, void *data) mod_timer(&adapter->watchdog_timer, jiffies + 1); } + if (icr & IGC_ICR_TS) + igc_tsync_interrupt(adapter); + wr32(IGC_EIMS, adapter->eims_other); return IRQ_HANDLED; } -/** - * igc_write_ivar - configure ivar for given MSI-X vector - * @hw: pointer to the HW structure - * @msix_vector: vector number we are allocating to a given ring - * @index: row index of IVAR register to write within IVAR table - * @offset: column offset of in IVAR, should be multiple of 8 - * - * The IVAR table consists of 2 columns, - * each containing an cause allocation for an Rx and Tx ring, and a - * variable number of rows depending on the number of queues supported. - */ -static void igc_write_ivar(struct igc_hw *hw, int msix_vector, - int index, int offset) -{ - u32 ivar = array_rd32(IGC_IVAR0, index); - - /* clear any bits that are currently set */ - ivar &= ~((u32)0xFF << offset); - - /* write vector and valid bit */ - ivar |= (msix_vector | IGC_IVAR_VALID) << offset; - - array_wr32(IGC_IVAR0, index, ivar); -} - -static void igc_assign_vector(struct igc_q_vector *q_vector, int msix_vector) -{ - struct igc_adapter *adapter = q_vector->adapter; - struct igc_hw *hw = &adapter->hw; - int rx_queue = IGC_N0_QUEUE; - int tx_queue = IGC_N0_QUEUE; - - if (q_vector->rx.ring) - rx_queue = q_vector->rx.ring->reg_idx; - if (q_vector->tx.ring) - tx_queue = q_vector->tx.ring->reg_idx; - - switch (hw->mac.type) { - case igc_i225: - if (rx_queue > IGC_N0_QUEUE) - igc_write_ivar(hw, msix_vector, - rx_queue >> 1, - (rx_queue & 0x1) << 4); - if (tx_queue > IGC_N0_QUEUE) - igc_write_ivar(hw, msix_vector, - tx_queue >> 1, - ((tx_queue & 0x1) << 4) + 8); - q_vector->eims_value = BIT(msix_vector); - break; - default: - WARN_ONCE(hw->mac.type != igc_i225, "Wrong MAC type\n"); - break; - } - - /* add q_vector eims value to global eims_enable_mask */ - adapter->eims_enable_mask |= q_vector->eims_value; - - /* configure q_vector to set itr on first interrupt */ - q_vector->set_itr = 1; -} - -/** - * igc_configure_msix - Configure MSI-X hardware - * @adapter: Pointer to adapter structure - * - * igc_configure_msix sets up the hardware to properly - * generate MSI-X interrupts. - */ -static void igc_configure_msix(struct igc_adapter *adapter) +static void igc_write_itr(struct igc_q_vector *q_vector) { - struct igc_hw *hw = &adapter->hw; - int i, vector = 0; - u32 tmp; - - adapter->eims_enable_mask = 0; - - /* set vector for other causes, i.e. link changes */ - switch (hw->mac.type) { - case igc_i225: - /* Turn on MSI-X capability first, or our settings - * won't stick. And it will take days to debug. - */ - wr32(IGC_GPIE, IGC_GPIE_MSIX_MODE | - IGC_GPIE_PBA | IGC_GPIE_EIAME | - IGC_GPIE_NSICR); - - /* enable msix_other interrupt */ - adapter->eims_other = BIT(vector); - tmp = (vector++ | IGC_IVAR_VALID) << 8; + u32 itr_val = q_vector->itr_val & IGC_QVECTOR_MASK; - wr32(IGC_IVAR_MISC, tmp); - break; - default: - /* do nothing, since nothing else supports MSI-X */ - break; - } /* switch (hw->mac.type) */ + if (!q_vector->set_itr) + return; - adapter->eims_enable_mask |= adapter->eims_other; + if (!itr_val) + itr_val = IGC_ITR_VAL_MASK; - for (i = 0; i < adapter->num_q_vectors; i++) - igc_assign_vector(adapter->q_vector[i], vector++); + itr_val |= IGC_EITR_CNT_IGNR; - wrfl(); + writel(itr_val, q_vector->itr_register); + q_vector->set_itr = 0; } static irqreturn_t igc_msix_ring(int irq, void *data) @@ -2962,49 +3926,6 @@ err_out: } /** - * igc_reset_q_vector - Reset config for interrupt vector - * @adapter: board private structure to initialize - * @v_idx: Index of vector to be reset - * - * If NAPI is enabled it will delete any references to the - * NAPI struct. This is preparation for igc_free_q_vector. - */ -static void igc_reset_q_vector(struct igc_adapter *adapter, int v_idx) -{ - struct igc_q_vector *q_vector = adapter->q_vector[v_idx]; - - /* if we're coming from igc_set_interrupt_capability, the vectors are - * not yet allocated - */ - if (!q_vector) - return; - - if (q_vector->tx.ring) - adapter->tx_ring[q_vector->tx.ring->queue_index] = NULL; - - if (q_vector->rx.ring) - adapter->rx_ring[q_vector->rx.ring->queue_index] = NULL; - - netif_napi_del(&q_vector->napi); -} - -static void igc_reset_interrupt_capability(struct igc_adapter *adapter) -{ - int v_idx = adapter->num_q_vectors; - - if (adapter->msix_entries) { - pci_disable_msix(adapter->pdev); - kfree(adapter->msix_entries); - adapter->msix_entries = NULL; - } else if (adapter->flags & IGC_FLAG_HAS_MSI) { - pci_disable_msi(adapter->pdev); - } - - while (v_idx--) - igc_reset_q_vector(adapter, v_idx); -} - -/** * igc_clear_interrupt_scheme - reset the device to a state of no interrupts * @adapter: Pointer to adapter structure * @@ -3017,48 +3938,6 @@ static void igc_clear_interrupt_scheme(struct igc_adapter *adapter) igc_reset_interrupt_capability(adapter); } -/** - * igc_free_q_vectors - Free memory allocated for interrupt vectors - * @adapter: board private structure to initialize - * - * This function frees the memory allocated to the q_vectors. In addition if - * NAPI is enabled it will delete any references to the NAPI struct prior - * to freeing the q_vector. - */ -static void igc_free_q_vectors(struct igc_adapter *adapter) -{ - int v_idx = adapter->num_q_vectors; - - adapter->num_tx_queues = 0; - adapter->num_rx_queues = 0; - adapter->num_q_vectors = 0; - - while (v_idx--) { - igc_reset_q_vector(adapter, v_idx); - igc_free_q_vector(adapter, v_idx); - } -} - -/** - * igc_free_q_vector - Free memory allocated for specific interrupt vector - * @adapter: board private structure to initialize - * @v_idx: Index of vector to be freed - * - * This function frees the memory allocated to the q_vector. - */ -static void igc_free_q_vector(struct igc_adapter *adapter, int v_idx) -{ - struct igc_q_vector *q_vector = adapter->q_vector[v_idx]; - - adapter->q_vector[v_idx] = NULL; - - /* igc_get_stats64() might access the rings on this vector, - * we must wait a grace period before freeing it. - */ - if (q_vector) - kfree_rcu(q_vector, rcu); -} - /* Need to wait a few seconds after link up to get diagnostic information from * the phy */ @@ -3283,6 +4162,8 @@ no_wait: wr32(IGC_ICS, IGC_ICS_RXDMT0); } + igc_ptp_tx_hang(adapter); + /* Reset the timer */ if (!test_bit(__IGC_DOWN, &adapter->state)) { if (adapter->flags & IGC_FLAG_NEED_LINK_UPDATE) @@ -3295,149 +4176,6 @@ no_wait: } /** - * igc_update_ring_itr - update the dynamic ITR value based on packet size - * @q_vector: pointer to q_vector - * - * Stores a new ITR value based on strictly on packet size. This - * algorithm is less sophisticated than that used in igc_update_itr, - * due to the difficulty of synchronizing statistics across multiple - * receive rings. The divisors and thresholds used by this function - * were determined based on theoretical maximum wire speed and testing - * data, in order to minimize response time while increasing bulk - * throughput. - * NOTE: This function is called only when operating in a multiqueue - * receive environment. - */ -static void igc_update_ring_itr(struct igc_q_vector *q_vector) -{ - struct igc_adapter *adapter = q_vector->adapter; - int new_val = q_vector->itr_val; - int avg_wire_size = 0; - unsigned int packets; - - /* For non-gigabit speeds, just fix the interrupt rate at 4000 - * ints/sec - ITR timer value of 120 ticks. - */ - switch (adapter->link_speed) { - case SPEED_10: - case SPEED_100: - new_val = IGC_4K_ITR; - goto set_itr_val; - default: - break; - } - - packets = q_vector->rx.total_packets; - if (packets) - avg_wire_size = q_vector->rx.total_bytes / packets; - - packets = q_vector->tx.total_packets; - if (packets) - avg_wire_size = max_t(u32, avg_wire_size, - q_vector->tx.total_bytes / packets); - - /* if avg_wire_size isn't set no work was done */ - if (!avg_wire_size) - goto clear_counts; - - /* Add 24 bytes to size to account for CRC, preamble, and gap */ - avg_wire_size += 24; - - /* Don't starve jumbo frames */ - avg_wire_size = min(avg_wire_size, 3000); - - /* Give a little boost to mid-size frames */ - if (avg_wire_size > 300 && avg_wire_size < 1200) - new_val = avg_wire_size / 3; - else - new_val = avg_wire_size / 2; - - /* conservative mode (itr 3) eliminates the lowest_latency setting */ - if (new_val < IGC_20K_ITR && - ((q_vector->rx.ring && adapter->rx_itr_setting == 3) || - (!q_vector->rx.ring && adapter->tx_itr_setting == 3))) - new_val = IGC_20K_ITR; - -set_itr_val: - if (new_val != q_vector->itr_val) { - q_vector->itr_val = new_val; - q_vector->set_itr = 1; - } -clear_counts: - q_vector->rx.total_bytes = 0; - q_vector->rx.total_packets = 0; - q_vector->tx.total_bytes = 0; - q_vector->tx.total_packets = 0; -} - -/** - * igc_update_itr - update the dynamic ITR value based on statistics - * @q_vector: pointer to q_vector - * @ring_container: ring info to update the itr for - * - * Stores a new ITR value based on packets and byte - * counts during the last interrupt. The advantage of per interrupt - * computation is faster updates and more accurate ITR for the current - * traffic pattern. Constants in this function were computed - * based on theoretical maximum wire speed and thresholds were set based - * on testing data as well as attempting to minimize response time - * while increasing bulk throughput. - * NOTE: These calculations are only valid when operating in a single- - * queue environment. - */ -static void igc_update_itr(struct igc_q_vector *q_vector, - struct igc_ring_container *ring_container) -{ - unsigned int packets = ring_container->total_packets; - unsigned int bytes = ring_container->total_bytes; - u8 itrval = ring_container->itr; - - /* no packets, exit with status unchanged */ - if (packets == 0) - return; - - switch (itrval) { - case lowest_latency: - /* handle TSO and jumbo frames */ - if (bytes / packets > 8000) - itrval = bulk_latency; - else if ((packets < 5) && (bytes > 512)) - itrval = low_latency; - break; - case low_latency: /* 50 usec aka 20000 ints/s */ - if (bytes > 10000) { - /* this if handles the TSO accounting */ - if (bytes / packets > 8000) - itrval = bulk_latency; - else if ((packets < 10) || ((bytes / packets) > 1200)) - itrval = bulk_latency; - else if ((packets > 35)) - itrval = lowest_latency; - } else if (bytes / packets > 2000) { - itrval = bulk_latency; - } else if (packets <= 2 && bytes < 512) { - itrval = lowest_latency; - } - break; - case bulk_latency: /* 250 usec aka 4000 ints/s */ - if (bytes > 25000) { - if (packets > 35) - itrval = low_latency; - } else if (bytes < 1500) { - itrval = low_latency; - } - break; - } - - /* clear work counters since we have the values we need */ - ring_container->total_bytes = 0; - ring_container->total_packets = 0; - - /* write updated itr to ring container */ - ring_container->itr = itrval; -} - -/** * igc_intr_msi - Interrupt Handler * @irq: interrupt number * @data: pointer to a network interface device structure @@ -3514,426 +4252,6 @@ static irqreturn_t igc_intr(int irq, void *data) return IRQ_HANDLED; } -static void igc_set_itr(struct igc_q_vector *q_vector) -{ - struct igc_adapter *adapter = q_vector->adapter; - u32 new_itr = q_vector->itr_val; - u8 current_itr = 0; - - /* for non-gigabit speeds, just fix the interrupt rate at 4000 */ - switch (adapter->link_speed) { - case SPEED_10: - case SPEED_100: - current_itr = 0; - new_itr = IGC_4K_ITR; - goto set_itr_now; - default: - break; - } - - igc_update_itr(q_vector, &q_vector->tx); - igc_update_itr(q_vector, &q_vector->rx); - - current_itr = max(q_vector->rx.itr, q_vector->tx.itr); - - /* conservative mode (itr 3) eliminates the lowest_latency setting */ - if (current_itr == lowest_latency && - ((q_vector->rx.ring && adapter->rx_itr_setting == 3) || - (!q_vector->rx.ring && adapter->tx_itr_setting == 3))) - current_itr = low_latency; - - switch (current_itr) { - /* counts and packets in update_itr are dependent on these numbers */ - case lowest_latency: - new_itr = IGC_70K_ITR; /* 70,000 ints/sec */ - break; - case low_latency: - new_itr = IGC_20K_ITR; /* 20,000 ints/sec */ - break; - case bulk_latency: - new_itr = IGC_4K_ITR; /* 4,000 ints/sec */ - break; - default: - break; - } - -set_itr_now: - if (new_itr != q_vector->itr_val) { - /* this attempts to bias the interrupt rate towards Bulk - * by adding intermediate steps when interrupt rate is - * increasing - */ - new_itr = new_itr > q_vector->itr_val ? - max((new_itr * q_vector->itr_val) / - (new_itr + (q_vector->itr_val >> 2)), - new_itr) : new_itr; - /* Don't write the value here; it resets the adapter's - * internal timer, and causes us to delay far longer than - * we should between interrupts. Instead, we write the ITR - * value at the beginning of the next interrupt so the timing - * ends up being correct. - */ - q_vector->itr_val = new_itr; - q_vector->set_itr = 1; - } -} - -static void igc_ring_irq_enable(struct igc_q_vector *q_vector) -{ - struct igc_adapter *adapter = q_vector->adapter; - struct igc_hw *hw = &adapter->hw; - - if ((q_vector->rx.ring && (adapter->rx_itr_setting & 3)) || - (!q_vector->rx.ring && (adapter->tx_itr_setting & 3))) { - if (adapter->num_q_vectors == 1) - igc_set_itr(q_vector); - else - igc_update_ring_itr(q_vector); - } - - if (!test_bit(__IGC_DOWN, &adapter->state)) { - if (adapter->msix_entries) - wr32(IGC_EIMS, q_vector->eims_value); - else - igc_irq_enable(adapter); - } -} - -/** - * igc_poll - NAPI Rx polling callback - * @napi: napi polling structure - * @budget: count of how many packets we should handle - */ -static int igc_poll(struct napi_struct *napi, int budget) -{ - struct igc_q_vector *q_vector = container_of(napi, - struct igc_q_vector, - napi); - bool clean_complete = true; - int work_done = 0; - - if (q_vector->tx.ring) - clean_complete = igc_clean_tx_irq(q_vector, budget); - - if (q_vector->rx.ring) { - int cleaned = igc_clean_rx_irq(q_vector, budget); - - work_done += cleaned; - if (cleaned >= budget) - clean_complete = false; - } - - /* If all work not completed, return budget and keep polling */ - if (!clean_complete) - return budget; - - /* Exit the polling mode, but don't re-enable interrupts if stack might - * poll us due to busy-polling - */ - if (likely(napi_complete_done(napi, work_done))) - igc_ring_irq_enable(q_vector); - - return min(work_done, budget - 1); -} - -/** - * igc_set_interrupt_capability - set MSI or MSI-X if supported - * @adapter: Pointer to adapter structure - * @msix: boolean value for MSI-X capability - * - * Attempt to configure interrupts using the best available - * capabilities of the hardware and kernel. - */ -static void igc_set_interrupt_capability(struct igc_adapter *adapter, - bool msix) -{ - int numvecs, i; - int err; - - if (!msix) - goto msi_only; - adapter->flags |= IGC_FLAG_HAS_MSIX; - - /* Number of supported queues. */ - adapter->num_rx_queues = adapter->rss_queues; - - adapter->num_tx_queues = adapter->rss_queues; - - /* start with one vector for every Rx queue */ - numvecs = adapter->num_rx_queues; - - /* if Tx handler is separate add 1 for every Tx queue */ - if (!(adapter->flags & IGC_FLAG_QUEUE_PAIRS)) - numvecs += adapter->num_tx_queues; - - /* store the number of vectors reserved for queues */ - adapter->num_q_vectors = numvecs; - - /* add 1 vector for link status interrupts */ - numvecs++; - - adapter->msix_entries = kcalloc(numvecs, sizeof(struct msix_entry), - GFP_KERNEL); - - if (!adapter->msix_entries) - return; - - /* populate entry values */ - for (i = 0; i < numvecs; i++) - adapter->msix_entries[i].entry = i; - - err = pci_enable_msix_range(adapter->pdev, - adapter->msix_entries, - numvecs, - numvecs); - if (err > 0) - return; - - kfree(adapter->msix_entries); - adapter->msix_entries = NULL; - - igc_reset_interrupt_capability(adapter); - -msi_only: - adapter->flags &= ~IGC_FLAG_HAS_MSIX; - - adapter->rss_queues = 1; - adapter->flags |= IGC_FLAG_QUEUE_PAIRS; - adapter->num_rx_queues = 1; - adapter->num_tx_queues = 1; - adapter->num_q_vectors = 1; - if (!pci_enable_msi(adapter->pdev)) - adapter->flags |= IGC_FLAG_HAS_MSI; -} - -static void igc_add_ring(struct igc_ring *ring, - struct igc_ring_container *head) -{ - head->ring = ring; - head->count++; -} - -/** - * igc_alloc_q_vector - Allocate memory for a single interrupt vector - * @adapter: board private structure to initialize - * @v_count: q_vectors allocated on adapter, used for ring interleaving - * @v_idx: index of vector in adapter struct - * @txr_count: total number of Tx rings to allocate - * @txr_idx: index of first Tx ring to allocate - * @rxr_count: total number of Rx rings to allocate - * @rxr_idx: index of first Rx ring to allocate - * - * We allocate one q_vector. If allocation fails we return -ENOMEM. - */ -static int igc_alloc_q_vector(struct igc_adapter *adapter, - unsigned int v_count, unsigned int v_idx, - unsigned int txr_count, unsigned int txr_idx, - unsigned int rxr_count, unsigned int rxr_idx) -{ - struct igc_q_vector *q_vector; - struct igc_ring *ring; - int ring_count; - - /* igc only supports 1 Tx and/or 1 Rx queue per vector */ - if (txr_count > 1 || rxr_count > 1) - return -ENOMEM; - - ring_count = txr_count + rxr_count; - - /* allocate q_vector and rings */ - q_vector = adapter->q_vector[v_idx]; - if (!q_vector) - q_vector = kzalloc(struct_size(q_vector, ring, ring_count), - GFP_KERNEL); - else - memset(q_vector, 0, struct_size(q_vector, ring, ring_count)); - if (!q_vector) - return -ENOMEM; - - /* initialize NAPI */ - netif_napi_add(adapter->netdev, &q_vector->napi, - igc_poll, 64); - - /* tie q_vector and adapter together */ - adapter->q_vector[v_idx] = q_vector; - q_vector->adapter = adapter; - - /* initialize work limits */ - q_vector->tx.work_limit = adapter->tx_work_limit; - - /* initialize ITR configuration */ - q_vector->itr_register = adapter->io_addr + IGC_EITR(0); - q_vector->itr_val = IGC_START_ITR; - - /* initialize pointer to rings */ - ring = q_vector->ring; - - /* initialize ITR */ - if (rxr_count) { - /* rx or rx/tx vector */ - if (!adapter->rx_itr_setting || adapter->rx_itr_setting > 3) - q_vector->itr_val = adapter->rx_itr_setting; - } else { - /* tx only vector */ - if (!adapter->tx_itr_setting || adapter->tx_itr_setting > 3) - q_vector->itr_val = adapter->tx_itr_setting; - } - - if (txr_count) { - /* assign generic ring traits */ - ring->dev = &adapter->pdev->dev; - ring->netdev = adapter->netdev; - - /* configure backlink on ring */ - ring->q_vector = q_vector; - - /* update q_vector Tx values */ - igc_add_ring(ring, &q_vector->tx); - - /* apply Tx specific ring traits */ - ring->count = adapter->tx_ring_count; - ring->queue_index = txr_idx; - - /* assign ring to adapter */ - adapter->tx_ring[txr_idx] = ring; - - /* push pointer to next ring */ - ring++; - } - - if (rxr_count) { - /* assign generic ring traits */ - ring->dev = &adapter->pdev->dev; - ring->netdev = adapter->netdev; - - /* configure backlink on ring */ - ring->q_vector = q_vector; - - /* update q_vector Rx values */ - igc_add_ring(ring, &q_vector->rx); - - /* apply Rx specific ring traits */ - ring->count = adapter->rx_ring_count; - ring->queue_index = rxr_idx; - - /* assign ring to adapter */ - adapter->rx_ring[rxr_idx] = ring; - } - - return 0; -} - -/** - * igc_alloc_q_vectors - Allocate memory for interrupt vectors - * @adapter: board private structure to initialize - * - * We allocate one q_vector per queue interrupt. If allocation fails we - * return -ENOMEM. - */ -static int igc_alloc_q_vectors(struct igc_adapter *adapter) -{ - int rxr_remaining = adapter->num_rx_queues; - int txr_remaining = adapter->num_tx_queues; - int rxr_idx = 0, txr_idx = 0, v_idx = 0; - int q_vectors = adapter->num_q_vectors; - int err; - - if (q_vectors >= (rxr_remaining + txr_remaining)) { - for (; rxr_remaining; v_idx++) { - err = igc_alloc_q_vector(adapter, q_vectors, v_idx, - 0, 0, 1, rxr_idx); - - if (err) - goto err_out; - - /* update counts and index */ - rxr_remaining--; - rxr_idx++; - } - } - - for (; v_idx < q_vectors; v_idx++) { - int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx); - int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx); - - err = igc_alloc_q_vector(adapter, q_vectors, v_idx, - tqpv, txr_idx, rqpv, rxr_idx); - - if (err) - goto err_out; - - /* update counts and index */ - rxr_remaining -= rqpv; - txr_remaining -= tqpv; - rxr_idx++; - txr_idx++; - } - - return 0; - -err_out: - adapter->num_tx_queues = 0; - adapter->num_rx_queues = 0; - adapter->num_q_vectors = 0; - - while (v_idx--) - igc_free_q_vector(adapter, v_idx); - - return -ENOMEM; -} - -/** - * igc_cache_ring_register - Descriptor ring to register mapping - * @adapter: board private structure to initialize - * - * Once we know the feature-set enabled for the device, we'll cache - * the register offset the descriptor ring is assigned to. - */ -static void igc_cache_ring_register(struct igc_adapter *adapter) -{ - int i = 0, j = 0; - - switch (adapter->hw.mac.type) { - case igc_i225: - /* Fall through */ - default: - for (; i < adapter->num_rx_queues; i++) - adapter->rx_ring[i]->reg_idx = i; - for (; j < adapter->num_tx_queues; j++) - adapter->tx_ring[j]->reg_idx = j; - break; - } -} - -/** - * igc_init_interrupt_scheme - initialize interrupts, allocate queues/vectors - * @adapter: Pointer to adapter structure - * @msix: boolean for MSI-X capability - * - * This function initializes the interrupts and allocates all of the queues. - */ -static int igc_init_interrupt_scheme(struct igc_adapter *adapter, bool msix) -{ - struct pci_dev *pdev = adapter->pdev; - int err = 0; - - igc_set_interrupt_capability(adapter, msix); - - err = igc_alloc_q_vectors(adapter); - if (err) { - dev_err(&pdev->dev, "Unable to allocate memory for vectors\n"); - goto err_alloc_q_vectors; - } - - igc_cache_ring_register(adapter); - - return 0; - -err_alloc_q_vectors: - igc_reset_interrupt_capability(adapter); - return err; -} - static void igc_free_irq(struct igc_adapter *adapter) { if (adapter->msix_entries) { @@ -3950,62 +4268,6 @@ static void igc_free_irq(struct igc_adapter *adapter) } /** - * igc_irq_disable - Mask off interrupt generation on the NIC - * @adapter: board private structure - */ -static void igc_irq_disable(struct igc_adapter *adapter) -{ - struct igc_hw *hw = &adapter->hw; - - if (adapter->msix_entries) { - u32 regval = rd32(IGC_EIAM); - - wr32(IGC_EIAM, regval & ~adapter->eims_enable_mask); - wr32(IGC_EIMC, adapter->eims_enable_mask); - regval = rd32(IGC_EIAC); - wr32(IGC_EIAC, regval & ~adapter->eims_enable_mask); - } - - wr32(IGC_IAM, 0); - wr32(IGC_IMC, ~0); - wrfl(); - - if (adapter->msix_entries) { - int vector = 0, i; - - synchronize_irq(adapter->msix_entries[vector++].vector); - - for (i = 0; i < adapter->num_q_vectors; i++) - synchronize_irq(adapter->msix_entries[vector++].vector); - } else { - synchronize_irq(adapter->pdev->irq); - } -} - -/** - * igc_irq_enable - Enable default interrupt generation settings - * @adapter: board private structure - */ -static void igc_irq_enable(struct igc_adapter *adapter) -{ - struct igc_hw *hw = &adapter->hw; - - if (adapter->msix_entries) { - u32 ims = IGC_IMS_LSC | IGC_IMS_DOUTSYNC | IGC_IMS_DRSTA; - u32 regval = rd32(IGC_EIAC); - - wr32(IGC_EIAC, regval | adapter->eims_enable_mask); - regval = rd32(IGC_EIAM); - wr32(IGC_EIAM, regval | adapter->eims_enable_mask); - wr32(IGC_EIMS, adapter->eims_enable_mask); - wr32(IGC_IMS, ims); - } else { - wr32(IGC_IMS, IMS_ENABLE_MASK | IGC_IMS_DRSTA); - wr32(IGC_IAM, IMS_ENABLE_MASK | IGC_IMS_DRSTA); - } -} - -/** * igc_request_irq - initialize interrupts * @adapter: Pointer to adapter structure * @@ -4059,22 +4321,6 @@ request_done: return err; } -static void igc_write_itr(struct igc_q_vector *q_vector) -{ - u32 itr_val = q_vector->itr_val & IGC_QVECTOR_MASK; - - if (!q_vector->set_itr) - return; - - if (!itr_val) - itr_val = IGC_ITR_VAL_MASK; - - itr_val |= IGC_EITR_CNT_IGNR; - - writel(itr_val, q_vector->itr_register); - q_vector->set_itr = 0; -} - /** * __igc_open - Called when a network interface is made active * @netdev: network interface device structure @@ -4204,6 +4450,24 @@ static int igc_close(struct net_device *netdev) return 0; } +/** + * igc_ioctl - Access the hwtstamp interface + * @netdev: network interface device structure + * @ifreq: interface request data + * @cmd: ioctl command + **/ +static int igc_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCGHWTSTAMP: + return igc_ptp_get_ts_config(netdev, ifr); + case SIOCSHWTSTAMP: + return igc_ptp_set_ts_config(netdev, ifr); + default: + return -EOPNOTSUPP; + } +} + static const struct net_device_ops igc_netdev_ops = { .ndo_open = igc_open, .ndo_stop = igc_close, @@ -4215,6 +4479,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_fix_features = igc_fix_features, .ndo_set_features = igc_set_features, .ndo_features_check = igc_features_check, + .ndo_do_ioctl = igc_ioctl, }; /* PCIe configuration access */ @@ -4433,6 +4698,8 @@ static int igc_probe(struct pci_dev *pdev, /* Add supported features to the features list*/ netdev->features |= NETIF_F_SG; + netdev->features |= NETIF_F_TSO; + netdev->features |= NETIF_F_TSO6; netdev->features |= NETIF_F_RXCSUM; netdev->features |= NETIF_F_HW_CSUM; netdev->features |= NETIF_F_SCTP_CRC; @@ -4515,6 +4782,9 @@ static int igc_probe(struct pci_dev *pdev, /* carrier off reporting is important to ethtool even BEFORE open */ netif_carrier_off(netdev); + /* do hw tstamp init after resetting */ + igc_ptp_init(adapter); + /* Check if Media Autosense is enabled */ adapter->ei = *ei; @@ -4556,6 +4826,8 @@ static void igc_remove(struct pci_dev *pdev) struct net_device *netdev = pci_get_drvdata(pdev); struct igc_adapter *adapter = netdev_priv(netdev); + igc_ptp_stop(adapter); + set_bit(__IGC_DOWN, &adapter->state); del_timer_sync(&adapter->watchdog_timer); @@ -4792,98 +5064,6 @@ static struct pci_driver igc_driver = { .shutdown = igc_shutdown, }; -void igc_set_flag_queue_pairs(struct igc_adapter *adapter, - const u32 max_rss_queues) -{ - /* Determine if we need to pair queues. */ - /* If rss_queues > half of max_rss_queues, pair the queues in - * order to conserve interrupts due to limited supply. - */ - if (adapter->rss_queues > (max_rss_queues / 2)) - adapter->flags |= IGC_FLAG_QUEUE_PAIRS; - else - adapter->flags &= ~IGC_FLAG_QUEUE_PAIRS; -} - -unsigned int igc_get_max_rss_queues(struct igc_adapter *adapter) -{ - unsigned int max_rss_queues; - - /* Determine the maximum number of RSS queues supported. */ - max_rss_queues = IGC_MAX_RX_QUEUES; - - return max_rss_queues; -} - -static void igc_init_queue_configuration(struct igc_adapter *adapter) -{ - u32 max_rss_queues; - - max_rss_queues = igc_get_max_rss_queues(adapter); - adapter->rss_queues = min_t(u32, max_rss_queues, num_online_cpus()); - - igc_set_flag_queue_pairs(adapter, max_rss_queues); -} - -/** - * igc_sw_init - Initialize general software structures (struct igc_adapter) - * @adapter: board private structure to initialize - * - * igc_sw_init initializes the Adapter private data structure. - * Fields are initialized based on PCI device information and - * OS network device settings (MTU size). - */ -static int igc_sw_init(struct igc_adapter *adapter) -{ - struct net_device *netdev = adapter->netdev; - struct pci_dev *pdev = adapter->pdev; - struct igc_hw *hw = &adapter->hw; - - int size = sizeof(struct igc_mac_addr) * hw->mac.rar_entry_count; - - pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word); - - /* set default ring sizes */ - adapter->tx_ring_count = IGC_DEFAULT_TXD; - adapter->rx_ring_count = IGC_DEFAULT_RXD; - - /* set default ITR values */ - adapter->rx_itr_setting = IGC_DEFAULT_ITR; - adapter->tx_itr_setting = IGC_DEFAULT_ITR; - - /* set default work limits */ - adapter->tx_work_limit = IGC_DEFAULT_TX_WORK; - - /* adjust max frame to be at least the size of a standard frame */ - adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + - VLAN_HLEN; - adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN; - - spin_lock_init(&adapter->nfc_lock); - spin_lock_init(&adapter->stats64_lock); - /* Assume MSI-X interrupts, will be checked during IRQ allocation */ - adapter->flags |= IGC_FLAG_HAS_MSIX; - - adapter->mac_table = kzalloc(size, GFP_ATOMIC); - if (!adapter->mac_table) - return -ENOMEM; - - igc_init_queue_configuration(adapter); - - /* This call may decrease the number of queues */ - if (igc_init_interrupt_scheme(adapter, true)) { - dev_err(&pdev->dev, "Unable to allocate memory for queues\n"); - return -ENOMEM; - } - - /* Explicitly disable IRQ since the NIC can be in any state. */ - igc_irq_disable(adapter); - - set_bit(__IGC_DOWN, &adapter->state); - - return 0; -} - /** * igc_reinit_queues - return error * @adapter: pointer to adapter structure diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index f4b05af0dd2f..8e1799508edc 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -173,6 +173,7 @@ s32 igc_check_downshift(struct igc_hw *hw) s32 igc_phy_hw_reset(struct igc_hw *hw) { struct igc_phy_info *phy = &hw->phy; + u32 phpm = 0, timeout = 10000; s32 ret_val; u32 ctrl; @@ -186,6 +187,8 @@ s32 igc_phy_hw_reset(struct igc_hw *hw) if (ret_val) goto out; + phpm = rd32(IGC_I225_PHPM); + ctrl = rd32(IGC_CTRL); wr32(IGC_CTRL, ctrl | IGC_CTRL_PHY_RST); wrfl(); @@ -195,7 +198,18 @@ s32 igc_phy_hw_reset(struct igc_hw *hw) wr32(IGC_CTRL, ctrl); wrfl(); - usleep_range(1500, 2000); + /* SW should guarantee 100us for the completion of the PHY reset */ + usleep_range(100, 150); + do { + phpm = rd32(IGC_I225_PHPM); + timeout--; + udelay(1); + } while (!(phpm & IGC_PHY_RST_COMP) && timeout); + + if (!timeout) + hw_dbg("Timeout is expired after a phy reset\n"); + + usleep_range(100, 150); phy->ops.release(hw); diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c new file mode 100644 index 000000000000..693506587198 --- /dev/null +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Intel Corporation */ + +#include "igc.h" + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/pci.h> +#include <linux/ptp_classify.h> +#include <linux/clocksource.h> + +#define INCVALUE_MASK 0x7fffffff +#define ISGN 0x80000000 + +#define IGC_SYSTIM_OVERFLOW_PERIOD (HZ * 60 * 9) +#define IGC_PTP_TX_TIMEOUT (HZ * 15) + +/* SYSTIM read access for I225 */ +static void igc_ptp_read_i225(struct igc_adapter *adapter, + struct timespec64 *ts) +{ + struct igc_hw *hw = &adapter->hw; + u32 sec, nsec; + + /* The timestamp latches on lowest register read. For I210/I211, the + * lowest register is SYSTIMR. Since we only need to provide nanosecond + * resolution, we can ignore it. + */ + rd32(IGC_SYSTIMR); + nsec = rd32(IGC_SYSTIML); + sec = rd32(IGC_SYSTIMH); + + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static void igc_ptp_write_i225(struct igc_adapter *adapter, + const struct timespec64 *ts) +{ + struct igc_hw *hw = &adapter->hw; + + /* Writing the SYSTIMR register is not necessary as it only + * provides sub-nanosecond resolution. + */ + wr32(IGC_SYSTIML, ts->tv_nsec); + wr32(IGC_SYSTIMH, ts->tv_sec); +} + +static int igc_ptp_adjfine_i225(struct ptp_clock_info *ptp, long scaled_ppm) +{ + struct igc_adapter *igc = container_of(ptp, struct igc_adapter, + ptp_caps); + struct igc_hw *hw = &igc->hw; + int neg_adj = 0; + u64 rate; + u32 inca; + + if (scaled_ppm < 0) { + neg_adj = 1; + scaled_ppm = -scaled_ppm; + } + rate = scaled_ppm; + rate <<= 14; + rate = div_u64(rate, 78125); + + inca = rate & INCVALUE_MASK; + if (neg_adj) + inca |= ISGN; + + wr32(IGC_TIMINCA, inca); + + return 0; +} + +static int igc_ptp_adjtime_i225(struct ptp_clock_info *ptp, s64 delta) +{ + struct igc_adapter *igc = container_of(ptp, struct igc_adapter, + ptp_caps); + struct timespec64 now, then = ns_to_timespec64(delta); + unsigned long flags; + + spin_lock_irqsave(&igc->tmreg_lock, flags); + + igc_ptp_read_i225(igc, &now); + now = timespec64_add(now, then); + igc_ptp_write_i225(igc, (const struct timespec64 *)&now); + + spin_unlock_irqrestore(&igc->tmreg_lock, flags); + + return 0; +} + +static int igc_ptp_gettimex64_i225(struct ptp_clock_info *ptp, + struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct igc_adapter *igc = container_of(ptp, struct igc_adapter, + ptp_caps); + struct igc_hw *hw = &igc->hw; + unsigned long flags; + + spin_lock_irqsave(&igc->tmreg_lock, flags); + + ptp_read_system_prets(sts); + rd32(IGC_SYSTIMR); + ptp_read_system_postts(sts); + ts->tv_nsec = rd32(IGC_SYSTIML); + ts->tv_sec = rd32(IGC_SYSTIMH); + + spin_unlock_irqrestore(&igc->tmreg_lock, flags); + + return 0; +} + +static int igc_ptp_settime_i225(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct igc_adapter *igc = container_of(ptp, struct igc_adapter, + ptp_caps); + unsigned long flags; + + spin_lock_irqsave(&igc->tmreg_lock, flags); + + igc_ptp_write_i225(igc, ts); + + spin_unlock_irqrestore(&igc->tmreg_lock, flags); + + return 0; +} + +static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +/** + * igc_ptp_systim_to_hwtstamp - convert system time value to HW timestamp + * @adapter: board private structure + * @hwtstamps: timestamp structure to update + * @systim: unsigned 64bit system time value + * + * We need to convert the system time value stored in the RX/TXSTMP registers + * into a hwtstamp which can be used by the upper level timestamping functions. + **/ +static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, + struct skb_shared_hwtstamps *hwtstamps, + u64 systim) +{ + switch (adapter->hw.mac.type) { + case igc_i225: + memset(hwtstamps, 0, sizeof(*hwtstamps)); + /* Upper 32 bits contain s, lower 32 bits contain ns. */ + hwtstamps->hwtstamp = ktime_set(systim >> 32, + systim & 0xFFFFFFFF); + break; + default: + break; + } +} + +/** + * igc_ptp_rx_pktstamp - retrieve Rx per packet timestamp + * @q_vector: Pointer to interrupt specific structure + * @va: Pointer to address containing Rx buffer + * @skb: Buffer containing timestamp and packet + * + * This function is meant to retrieve the first timestamp from the + * first buffer of an incoming frame. The value is stored in little + * endian format starting on byte 0. There's a second timestamp + * starting on byte 8. + **/ +void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va, + struct sk_buff *skb) +{ + struct igc_adapter *adapter = q_vector->adapter; + __le64 *regval = (__le64 *)va; + int adjust = 0; + + /* The timestamp is recorded in little endian format. + * DWORD: | 0 | 1 | 2 | 3 + * Field: | Timer0 Low | Timer0 High | Timer1 Low | Timer1 High + */ + igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), + le64_to_cpu(regval[0])); + + /* adjust timestamp for the RX latency based on link speed */ + if (adapter->hw.mac.type == igc_i225) { + switch (adapter->link_speed) { + case SPEED_10: + adjust = IGC_I225_RX_LATENCY_10; + break; + case SPEED_100: + adjust = IGC_I225_RX_LATENCY_100; + break; + case SPEED_1000: + adjust = IGC_I225_RX_LATENCY_1000; + break; + case SPEED_2500: + adjust = IGC_I225_RX_LATENCY_2500; + break; + } + } + skb_hwtstamps(skb)->hwtstamp = + ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust); +} + +/** + * igc_ptp_rx_rgtstamp - retrieve Rx timestamp stored in register + * @q_vector: Pointer to interrupt specific structure + * @skb: Buffer containing timestamp and packet + * + * This function is meant to retrieve a timestamp from the internal registers + * of the adapter and store it in the skb. + */ +void igc_ptp_rx_rgtstamp(struct igc_q_vector *q_vector, + struct sk_buff *skb) +{ + struct igc_adapter *adapter = q_vector->adapter; + struct igc_hw *hw = &adapter->hw; + u64 regval; + + /* If this bit is set, then the RX registers contain the time + * stamp. No other packet will be time stamped until we read + * these registers, so read the registers to make them + * available again. Because only one packet can be time + * stamped at a time, we know that the register values must + * belong to this one here and therefore we don't need to + * compare any of the additional attributes stored for it. + * + * If nothing went wrong, then it should have a shared + * tx_flags that we can turn into a skb_shared_hwtstamps. + */ + if (!(rd32(IGC_TSYNCRXCTL) & IGC_TSYNCRXCTL_VALID)) + return; + + regval = rd32(IGC_RXSTMPL); + regval |= (u64)rd32(IGC_RXSTMPH) << 32; + + igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), regval); + + /* Update the last_rx_timestamp timer in order to enable watchdog check + * for error case of latched timestamp on a dropped packet. + */ + adapter->last_rx_timestamp = jiffies; +} + +/** + * igc_ptp_enable_tstamp_rxqueue - Enable RX timestamp for a queue + * @rx_ring: Pointer to RX queue + * @timer: Index for timer + * + * This function enables RX timestamping for a queue, and selects + * which 1588 timer will provide the timestamp. + */ +static void igc_ptp_enable_tstamp_rxqueue(struct igc_adapter *adapter, + struct igc_ring *rx_ring, u8 timer) +{ + struct igc_hw *hw = &adapter->hw; + int reg_idx = rx_ring->reg_idx; + u32 srrctl = rd32(IGC_SRRCTL(reg_idx)); + + srrctl |= IGC_SRRCTL_TIMESTAMP; + srrctl |= IGC_SRRCTL_TIMER1SEL(timer); + srrctl |= IGC_SRRCTL_TIMER0SEL(timer); + + wr32(IGC_SRRCTL(reg_idx), srrctl); +} + +static void igc_ptp_enable_tstamp_all_rxqueues(struct igc_adapter *adapter, + u8 timer) +{ + int i; + + for (i = 0; i < adapter->num_rx_queues; i++) { + struct igc_ring *ring = adapter->rx_ring[i]; + + igc_ptp_enable_tstamp_rxqueue(adapter, ring, timer); + } +} + +/** + * igc_ptp_set_timestamp_mode - setup hardware for timestamping + * @adapter: networking device structure + * @config: hwtstamp configuration + * + * Outgoing time stamping can be enabled and disabled. Play nice and + * disable it when requested, although it shouldn't case any overhead + * when no packet needs it. At most one packet in the queue may be + * marked for time stamping, otherwise it would be impossible to tell + * for sure to which packet the hardware time stamp belongs. + * + * Incoming time stamping has to be configured via the hardware + * filters. Not all combinations are supported, in particular event + * type has to be specified. Matching the kind of event packet is + * not supported, with the exception of "all V2 events regardless of + * level 2 or 4". + * + */ +static int igc_ptp_set_timestamp_mode(struct igc_adapter *adapter, + struct hwtstamp_config *config) +{ + u32 tsync_tx_ctl = IGC_TSYNCTXCTL_ENABLED; + u32 tsync_rx_ctl = IGC_TSYNCRXCTL_ENABLED; + struct igc_hw *hw = &adapter->hw; + u32 tsync_rx_cfg = 0; + bool is_l4 = false; + bool is_l2 = false; + u32 regval; + + /* reserved for future extensions */ + if (config->flags) + return -EINVAL; + + switch (config->tx_type) { + case HWTSTAMP_TX_OFF: + tsync_tx_ctl = 0; + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + switch (config->rx_filter) { + case HWTSTAMP_FILTER_NONE: + tsync_rx_ctl = 0; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + tsync_rx_ctl |= IGC_TSYNCRXCTL_TYPE_L4_V1; + tsync_rx_cfg = IGC_TSYNCRXCFG_PTP_V1_SYNC_MESSAGE; + is_l4 = true; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + tsync_rx_ctl |= IGC_TSYNCRXCTL_TYPE_L4_V1; + tsync_rx_cfg = IGC_TSYNCRXCFG_PTP_V1_DELAY_REQ_MESSAGE; + is_l4 = true; + break; + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + tsync_rx_ctl |= IGC_TSYNCRXCTL_TYPE_EVENT_V2; + config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; + is_l2 = true; + is_l4 = true; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_NTP_ALL: + case HWTSTAMP_FILTER_ALL: + tsync_rx_ctl |= IGC_TSYNCRXCTL_TYPE_ALL; + config->rx_filter = HWTSTAMP_FILTER_ALL; + break; + /* fall through */ + default: + config->rx_filter = HWTSTAMP_FILTER_NONE; + return -ERANGE; + } + + /* Per-packet timestamping only works if all packets are + * timestamped, so enable timestamping in all packets as long + * as one Rx filter was configured. + */ + if (tsync_rx_ctl) { + tsync_rx_ctl = IGC_TSYNCRXCTL_ENABLED; + tsync_rx_ctl |= IGC_TSYNCRXCTL_TYPE_ALL; + tsync_rx_ctl |= IGC_TSYNCRXCTL_RXSYNSIG; + config->rx_filter = HWTSTAMP_FILTER_ALL; + is_l2 = true; + is_l4 = true; + + if (hw->mac.type == igc_i225) { + regval = rd32(IGC_RXPBS); + regval |= IGC_RXPBS_CFG_TS_EN; + wr32(IGC_RXPBS, regval); + + /* FIXME: For now, only support retrieving RX + * timestamps from timer 0 + */ + igc_ptp_enable_tstamp_all_rxqueues(adapter, 0); + } + } + + if (tsync_tx_ctl) { + tsync_tx_ctl = IGC_TSYNCTXCTL_ENABLED; + tsync_tx_ctl |= IGC_TSYNCTXCTL_TXSYNSIG; + } + + /* enable/disable TX */ + regval = rd32(IGC_TSYNCTXCTL); + regval &= ~IGC_TSYNCTXCTL_ENABLED; + regval |= tsync_tx_ctl; + wr32(IGC_TSYNCTXCTL, regval); + + /* enable/disable RX */ + regval = rd32(IGC_TSYNCRXCTL); + regval &= ~(IGC_TSYNCRXCTL_ENABLED | IGC_TSYNCRXCTL_TYPE_MASK); + regval |= tsync_rx_ctl; + wr32(IGC_TSYNCRXCTL, regval); + + /* define which PTP packets are time stamped */ + wr32(IGC_TSYNCRXCFG, tsync_rx_cfg); + + /* define ethertype filter for timestamped packets */ + if (is_l2) + wr32(IGC_ETQF(3), + (IGC_ETQF_FILTER_ENABLE | /* enable filter */ + IGC_ETQF_1588 | /* enable timestamping */ + ETH_P_1588)); /* 1588 eth protocol type */ + else + wr32(IGC_ETQF(3), 0); + + /* L4 Queue Filter[3]: filter by destination port and protocol */ + if (is_l4) { + u32 ftqf = (IPPROTO_UDP /* UDP */ + | IGC_FTQF_VF_BP /* VF not compared */ + | IGC_FTQF_1588_TIME_STAMP /* Enable Timestamp */ + | IGC_FTQF_MASK); /* mask all inputs */ + ftqf &= ~IGC_FTQF_MASK_PROTO_BP; /* enable protocol check */ + + wr32(IGC_IMIR(3), htons(PTP_EV_PORT)); + wr32(IGC_IMIREXT(3), + (IGC_IMIREXT_SIZE_BP | IGC_IMIREXT_CTRL_BP)); + wr32(IGC_FTQF(3), ftqf); + } else { + wr32(IGC_FTQF(3), IGC_FTQF_MASK); + } + wrfl(); + + /* clear TX/RX time stamp registers, just to be sure */ + regval = rd32(IGC_TXSTMPL); + regval = rd32(IGC_TXSTMPH); + regval = rd32(IGC_RXSTMPL); + regval = rd32(IGC_RXSTMPH); + + return 0; +} + +void igc_ptp_tx_hang(struct igc_adapter *adapter) +{ + bool timeout = time_is_before_jiffies(adapter->ptp_tx_start + + IGC_PTP_TX_TIMEOUT); + struct igc_hw *hw = &adapter->hw; + + if (!adapter->ptp_tx_skb) + return; + + if (!test_bit(__IGC_PTP_TX_IN_PROGRESS, &adapter->state)) + return; + + /* If we haven't received a timestamp within the timeout, it is + * reasonable to assume that it will never occur, so we can unlock the + * timestamp bit when this occurs. + */ + if (timeout) { + cancel_work_sync(&adapter->ptp_tx_work); + dev_kfree_skb_any(adapter->ptp_tx_skb); + adapter->ptp_tx_skb = NULL; + clear_bit_unlock(__IGC_PTP_TX_IN_PROGRESS, &adapter->state); + adapter->tx_hwtstamp_timeouts++; + /* Clear the Tx valid bit in TSYNCTXCTL register to enable + * interrupt + */ + rd32(IGC_TXSTMPH); + dev_warn(&adapter->pdev->dev, "clearing Tx timestamp hang\n"); + } +} + +/** + * igc_ptp_tx_hwtstamp - utility function which checks for TX time stamp + * @adapter: Board private structure + * + * If we were asked to do hardware stamping and such a time stamp is + * available, then it must have been for this skb here because we only + * allow only one such packet into the queue. + */ +static void igc_ptp_tx_hwtstamp(struct igc_adapter *adapter) +{ + struct sk_buff *skb = adapter->ptp_tx_skb; + struct skb_shared_hwtstamps shhwtstamps; + struct igc_hw *hw = &adapter->hw; + u64 regval; + + regval = rd32(IGC_TXSTMPL); + regval |= (u64)rd32(IGC_TXSTMPH) << 32; + igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval); + + /* Clear the lock early before calling skb_tstamp_tx so that + * applications are not woken up before the lock bit is clear. We use + * a copy of the skb pointer to ensure other threads can't change it + * while we're notifying the stack. + */ + adapter->ptp_tx_skb = NULL; + clear_bit_unlock(__IGC_PTP_TX_IN_PROGRESS, &adapter->state); + + /* Notify the stack and free the skb after we've unlocked */ + skb_tstamp_tx(skb, &shhwtstamps); + dev_kfree_skb_any(skb); +} + +/** + * igc_ptp_tx_work + * @work: pointer to work struct + * + * This work function polls the TSYNCTXCTL valid bit to determine when a + * timestamp has been taken for the current stored skb. + */ +void igc_ptp_tx_work(struct work_struct *work) +{ + struct igc_adapter *adapter = container_of(work, struct igc_adapter, + ptp_tx_work); + struct igc_hw *hw = &adapter->hw; + u32 tsynctxctl; + + if (!adapter->ptp_tx_skb) + return; + + if (time_is_before_jiffies(adapter->ptp_tx_start + + IGC_PTP_TX_TIMEOUT)) { + dev_kfree_skb_any(adapter->ptp_tx_skb); + adapter->ptp_tx_skb = NULL; + clear_bit_unlock(__IGC_PTP_TX_IN_PROGRESS, &adapter->state); + adapter->tx_hwtstamp_timeouts++; + /* Clear the tx valid bit in TSYNCTXCTL register to enable + * interrupt + */ + rd32(IGC_TXSTMPH); + dev_warn(&adapter->pdev->dev, "clearing Tx timestamp hang\n"); + return; + } + + tsynctxctl = rd32(IGC_TSYNCTXCTL); + if (tsynctxctl & IGC_TSYNCTXCTL_VALID) + igc_ptp_tx_hwtstamp(adapter); + else + /* reschedule to check later */ + schedule_work(&adapter->ptp_tx_work); +} + +/** + * igc_ptp_set_ts_config - set hardware time stamping config + * @netdev: network interface device structure + * @ifreq: interface request data + * + **/ +int igc_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct hwtstamp_config config; + int err; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + err = igc_ptp_set_timestamp_mode(adapter, &config); + if (err) + return err; + + /* save these settings for future reference */ + memcpy(&adapter->tstamp_config, &config, + sizeof(adapter->tstamp_config)); + + return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? + -EFAULT : 0; +} + +/** + * igc_ptp_get_ts_config - get hardware time stamping config + * @netdev: network interface device structure + * @ifreq: interface request data + * + * Get the hwtstamp_config settings to return to the user. Rather than attempt + * to deconstruct the settings from the registers, just return a shadow copy + * of the last known settings. + **/ +int igc_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct hwtstamp_config *config = &adapter->tstamp_config; + + return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? + -EFAULT : 0; +} + +/** + * igc_ptp_init - Initialize PTP functionality + * @adapter: Board private structure + * + * This function is called at device probe to initialize the PTP + * functionality. + */ +void igc_ptp_init(struct igc_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct igc_hw *hw = &adapter->hw; + + switch (hw->mac.type) { + case igc_i225: + snprintf(adapter->ptp_caps.name, 16, "%pm", netdev->dev_addr); + adapter->ptp_caps.owner = THIS_MODULE; + adapter->ptp_caps.max_adj = 62499999; + adapter->ptp_caps.adjfine = igc_ptp_adjfine_i225; + adapter->ptp_caps.adjtime = igc_ptp_adjtime_i225; + adapter->ptp_caps.gettimex64 = igc_ptp_gettimex64_i225; + adapter->ptp_caps.settime64 = igc_ptp_settime_i225; + adapter->ptp_caps.enable = igc_ptp_feature_enable_i225; + break; + default: + adapter->ptp_clock = NULL; + return; + } + + spin_lock_init(&adapter->tmreg_lock); + INIT_WORK(&adapter->ptp_tx_work, igc_ptp_tx_work); + + adapter->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; + adapter->tstamp_config.tx_type = HWTSTAMP_TX_OFF; + + igc_ptp_reset(adapter); + + adapter->ptp_clock = ptp_clock_register(&adapter->ptp_caps, + &adapter->pdev->dev); + if (IS_ERR(adapter->ptp_clock)) { + adapter->ptp_clock = NULL; + dev_err(&adapter->pdev->dev, "ptp_clock_register failed\n"); + } else if (adapter->ptp_clock) { + dev_info(&adapter->pdev->dev, "added PHC on %s\n", + adapter->netdev->name); + adapter->ptp_flags |= IGC_PTP_ENABLED; + } +} + +/** + * igc_ptp_suspend - Disable PTP work items and prepare for suspend + * @adapter: Board private structure + * + * This function stops the overflow check work and PTP Tx timestamp work, and + * will prepare the device for OS suspend. + */ +void igc_ptp_suspend(struct igc_adapter *adapter) +{ + if (!(adapter->ptp_flags & IGC_PTP_ENABLED)) + return; + + cancel_work_sync(&adapter->ptp_tx_work); + if (adapter->ptp_tx_skb) { + dev_kfree_skb_any(adapter->ptp_tx_skb); + adapter->ptp_tx_skb = NULL; + clear_bit_unlock(__IGC_PTP_TX_IN_PROGRESS, &adapter->state); + } +} + +/** + * igc_ptp_stop - Disable PTP device and stop the overflow check. + * @adapter: Board private structure. + * + * This function stops the PTP support and cancels the delayed work. + **/ +void igc_ptp_stop(struct igc_adapter *adapter) +{ + igc_ptp_suspend(adapter); + + if (adapter->ptp_clock) { + ptp_clock_unregister(adapter->ptp_clock); + dev_info(&adapter->pdev->dev, "removed PHC on %s\n", + adapter->netdev->name); + adapter->ptp_flags &= ~IGC_PTP_ENABLED; + } +} + +/** + * igc_ptp_reset - Re-enable the adapter for PTP following a reset. + * @adapter: Board private structure. + * + * This function handles the reset work required to re-enable the PTP device. + **/ +void igc_ptp_reset(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + unsigned long flags; + + /* reset the tstamp_config */ + igc_ptp_set_timestamp_mode(adapter, &adapter->tstamp_config); + + spin_lock_irqsave(&adapter->tmreg_lock, flags); + + switch (adapter->hw.mac.type) { + case igc_i225: + wr32(IGC_TSAUXC, 0x0); + wr32(IGC_TSSDP, 0x0); + wr32(IGC_TSIM, IGC_TSICR_INTERRUPTS); + wr32(IGC_IMS, IGC_IMS_TS); + break; + default: + /* No work to do. */ + goto out; + } + + /* Re-initialize the timer. */ + if (hw->mac.type == igc_i225) { + struct timespec64 ts64 = ktime_to_timespec64(ktime_get_real()); + + igc_ptp_write_i225(adapter, &ts64); + } else { + timecounter_init(&adapter->tc, &adapter->cc, + ktime_to_ns(ktime_get_real())); + } +out: + spin_unlock_irqrestore(&adapter->tmreg_lock, flags); + + wrfl(); +} diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 93a9139f08c5..c9029b549b90 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -12,6 +12,7 @@ #define IGC_MDIC 0x00020 /* MDI Control - RW */ #define IGC_MDICNFG 0x00E04 /* MDC/MDIO Configuration - RW */ #define IGC_CONNSW 0x00034 /* Copper/Fiber switch control - RW */ +#define IGC_I225_PHPM 0x00E14 /* I225 PHY Power Management */ /* Internal Packet Buffer Size Registers */ #define IGC_RXPBS 0x02404 /* Rx Packet Buffer Size - RW */ @@ -209,6 +210,33 @@ #define IGC_LENERRS 0x04138 /* Length Errors Count */ #define IGC_HRMPC 0x0A018 /* Header Redirection Missed Packet Count */ +/* Time sync registers */ +#define IGC_TSICR 0x0B66C /* Time Sync Interrupt Cause */ +#define IGC_TSIM 0x0B674 /* Time Sync Interrupt Mask Register */ +#define IGC_TSAUXC 0x0B640 /* Timesync Auxiliary Control register */ +#define IGC_TSYNCRXCTL 0x0B620 /* Rx Time Sync Control register - RW */ +#define IGC_TSYNCTXCTL 0x0B614 /* Tx Time Sync Control register - RW */ +#define IGC_TSYNCRXCFG 0x05F50 /* Time Sync Rx Configuration - RW */ +#define IGC_TSSDP 0x0003C /* Time Sync SDP Configuration Register - RW */ + +#define IGC_IMIR(_i) (0x05A80 + ((_i) * 4)) /* Immediate Interrupt */ +#define IGC_IMIREXT(_i) (0x05AA0 + ((_i) * 4)) /* Immediate INTR Ext*/ + +#define IGC_FTQF(_n) (0x059E0 + (4 * (_n))) /* 5-tuple Queue Fltr */ + +#define IGC_RXPBS 0x02404 /* Rx Packet Buffer Size - RW */ + +/* System Time Registers */ +#define IGC_SYSTIML 0x0B600 /* System time register Low - RO */ +#define IGC_SYSTIMH 0x0B604 /* System time register High - RO */ +#define IGC_SYSTIMR 0x0B6F8 /* System time register Residue */ +#define IGC_TIMINCA 0x0B608 /* Increment attributes register - RW */ + +#define IGC_RXSTMPL 0x0B624 /* Rx timestamp Low - RO */ +#define IGC_RXSTMPH 0x0B628 /* Rx timestamp High - RO */ +#define IGC_TXSTMPL 0x0B618 /* Tx timestamp value Low - RO */ +#define IGC_TXSTMPH 0x0B61C /* Tx timestamp value High - RO */ + /* Management registers */ #define IGC_MANC 0x05820 /* Management Control - RW */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4c13cca656b2..718931d951bc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5239,7 +5239,7 @@ static void ixgbe_fdir_filter_restore(struct ixgbe_adapter *adapter) struct ixgbe_hw *hw = &adapter->hw; struct hlist_node *node2; struct ixgbe_fdir_filter *filter; - u64 action; + u8 queue; spin_lock(&adapter->fdir_perfect_lock); @@ -5248,17 +5248,34 @@ static void ixgbe_fdir_filter_restore(struct ixgbe_adapter *adapter) hlist_for_each_entry_safe(filter, node2, &adapter->fdir_filter_list, fdir_node) { - action = filter->action; - if (action != IXGBE_FDIR_DROP_QUEUE && action != 0) - action = - (action >> ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF) - 1; + if (filter->action == IXGBE_FDIR_DROP_QUEUE) { + queue = IXGBE_FDIR_DROP_QUEUE; + } else { + u32 ring = ethtool_get_flow_spec_ring(filter->action); + u8 vf = ethtool_get_flow_spec_ring_vf(filter->action); + + if (!vf && (ring >= adapter->num_rx_queues)) { + e_err(drv, "FDIR restore failed without VF, ring: %u\n", + ring); + continue; + } else if (vf && + ((vf > adapter->num_vfs) || + ring >= adapter->num_rx_queues_per_pool)) { + e_err(drv, "FDIR restore failed with VF, vf: %hhu, ring: %u\n", + vf, ring); + continue; + } + + /* Map the ring onto the absolute queue index */ + if (!vf) + queue = adapter->rx_ring[ring]->reg_idx; + else + queue = ((vf - 1) * + adapter->num_rx_queues_per_pool) + ring; + } ixgbe_fdir_write_perfect_filter_82599(hw, - &filter->filter, - filter->sw_idx, - (action == IXGBE_FDIR_DROP_QUEUE) ? - IXGBE_FDIR_DROP_QUEUE : - adapter->rx_ring[action]->reg_idx); + &filter->filter, filter->sw_idx, queue); } spin_unlock(&adapter->fdir_perfect_lock); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index fa286694ac2c..4622c4ea2e46 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2081,11 +2081,6 @@ static int ixgbevf_write_uc_addr_list(struct net_device *netdev) struct ixgbe_hw *hw = &adapter->hw; int count = 0; - if ((netdev_uc_count(netdev)) > 10) { - pr_err("Too many unicast filters - No Space\n"); - return -ENOSPC; - } - if (!netdev_uc_empty(netdev)) { struct netdev_hw_addr *ha; diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 028e3e6222e9..578c31697cc0 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -510,13 +510,6 @@ ltq_etop_change_mtu(struct net_device *dev, int new_mtu) } static int -ltq_etop_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - /* TODO: mii-toll reports "No MII transceiver present!." ?!*/ - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - -static int ltq_etop_set_mac_address(struct net_device *dev, void *p) { int ret = eth_mac_addr(dev, p); @@ -616,7 +609,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = { .ndo_stop = ltq_etop_stop, .ndo_start_xmit = ltq_etop_tx, .ndo_change_mtu = ltq_etop_change_mtu, - .ndo_do_ioctl = ltq_etop_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_set_mac_address = ltq_etop_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = ltq_etop_set_multicast_list, diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 71a872d46bc4..2dfbfdff45a8 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2081,7 +2081,11 @@ static int mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, struct bpf_prog *prog, struct xdp_buff *xdp) { - u32 ret, act = bpf_prog_run_xdp(prog, xdp); + unsigned int len; + u32 ret, act; + + len = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; + act = bpf_prog_run_xdp(prog, xdp); switch (act) { case XDP_PASS: @@ -2094,9 +2098,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, if (err) { ret = MVNETA_XDP_DROPPED; __page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), - xdp->data_end - xdp->data_hard_start, - true); + virt_to_head_page(xdp->data), + len, true); } else { ret = MVNETA_XDP_REDIR; } @@ -2106,9 +2109,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, ret = mvneta_xdp_xmit_back(pp, xdp); if (ret != MVNETA_XDP_TX) __page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), - xdp->data_end - xdp->data_hard_start, - true); + virt_to_head_page(xdp->data), + len, true); break; default: bpf_warn_invalid_xdp_action(act); @@ -2119,8 +2121,7 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, case XDP_DROP: __page_pool_put_page(rxq->page_pool, virt_to_head_page(xdp->data), - xdp->data_end - xdp->data_hard_start, - true); + len, true); ret = MVNETA_XDP_DROPPED; break; } @@ -3071,7 +3072,7 @@ static int mvneta_create_page_pool(struct mvneta_port *pp, .order = 0, .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, .pool_size = size, - .nid = cpu_to_node(0), + .nid = NUMA_NO_NODE, .dev = pp->dev->dev.parent, .dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE, .offset = pp->rx_offset_correction, @@ -4225,6 +4226,12 @@ static int mvneta_xdp_setup(struct net_device *dev, struct bpf_prog *prog, return -EOPNOTSUPP; } + if (pp->bm_priv) { + NL_SET_ERR_MSG_MOD(extack, + "Hardware Buffer Management not supported on XDP"); + return -EOPNOTSUPP; + } + need_update = !!pp->xdp_prog != !!prog; if (running && need_update) mvneta_stop(dev); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index cd32c7196a7f..72133cbe55d4 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1114,7 +1114,7 @@ mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask) /* Port configuration routines */ static bool mvpp2_is_xlg(phy_interface_t interface) { - return interface == PHY_INTERFACE_MODE_10GKR || + return interface == PHY_INTERFACE_MODE_10GBASER || interface == PHY_INTERFACE_MODE_XAUI; } @@ -1200,7 +1200,7 @@ static int mvpp22_gop_init(struct mvpp2_port *port) case PHY_INTERFACE_MODE_2500BASEX: mvpp22_gop_init_sgmii(port); break; - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: if (port->gop_id != 0) goto invalid_conf; mvpp22_gop_init_10gkr(port); @@ -1649,7 +1649,7 @@ static void mvpp22_pcs_reset_deassert(struct mvpp2_port *port) xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id); switch (port->phy_interface) { - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: val = readl(mpcs + MVPP22_MPCS_CLK_RESET); val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX; @@ -4758,7 +4758,7 @@ static void mvpp2_phylink_validate(struct phylink_config *config, /* Invalid combinations */ switch (state->interface) { - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: case PHY_INTERFACE_MODE_XAUI: if (port->gop_id != 0) goto empty_set; @@ -4780,7 +4780,7 @@ static void mvpp2_phylink_validate(struct phylink_config *config, phylink_set(mask, Asym_Pause); switch (state->interface) { - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: case PHY_INTERFACE_MODE_XAUI: case PHY_INTERFACE_MODE_NA: if (port->gop_id == 0) { @@ -5247,6 +5247,15 @@ static int mvpp2_port_probe(struct platform_device *pdev, goto err_free_netdev; } + /* + * Rewrite 10GBASE-KR to 10GBASE-R for compatibility with existing DT. + * Existing usage of 10GBASE-KR is not correct; no backplane + * negotiation is done, and this driver does not actually support + * 10GBASE-KR. + */ + if (phy_mode == PHY_INTERFACE_MODE_10GKR) + phy_mode = PHY_INTERFACE_MODE_10GBASER; + if (port_node) { comphy = devm_of_phy_get(&pdev->dev, port_node, NULL); if (IS_ERR(comphy)) { diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 1a6877902dd6..7a0d785b826c 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1344,15 +1344,6 @@ static int pxa168_smi_write(struct mii_bus *bus, int phy_addr, int regnum, return 0; } -static int pxa168_eth_do_ioctl(struct net_device *dev, struct ifreq *ifr, - int cmd) -{ - if (dev->phydev) - return phy_mii_ioctl(dev->phydev, ifr, cmd); - - return -EOPNOTSUPP; -} - #ifdef CONFIG_NET_POLL_CONTROLLER static void pxa168_eth_netpoll(struct net_device *dev) { @@ -1387,7 +1378,7 @@ static const struct net_device_ops pxa168_eth_netdev_ops = { .ndo_set_rx_mode = pxa168_eth_set_rx_mode, .ndo_set_mac_address = pxa168_eth_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = pxa168_eth_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_change_mtu = pxa168_eth_change_mtu, .ndo_tx_timeout = pxa168_eth_tx_timeout, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c index eaf08f7ad128..64ed725aec28 100644 --- a/drivers/net/ethernet/mellanox/mlx4/crdump.c +++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c @@ -182,7 +182,7 @@ int mlx4_crdump_collect(struct mlx4_dev *dev) crdump_enable_crspace_access(dev, cr_space); /* Get the available snapshot ID for the dumps */ - id = devlink_region_shapshot_id_get(devlink); + id = devlink_region_snapshot_id_get(devlink); /* Try to capture dumps */ mlx4_crdump_collect_crspace(dev, cr_space, id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a6f390fdb971..d3e06cec8317 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -42,7 +42,7 @@ mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o # Core extra # mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ - ecpf.o rdma.o + ecpf.o rdma.o eswitch_offloads_chains.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 549f962cd86e..42198e64a7f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -71,8 +71,8 @@ static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev, return cpu_handle; } -int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, - struct mlx5_frag_buf *buf, int node) +static int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node) { dma_addr_t t; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 9c8427698238..220ef9f06f84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -135,7 +135,7 @@ struct page_pool; #define MLX5E_LOG_INDIR_RQT_SIZE 0x7 #define MLX5E_INDIR_RQT_SIZE BIT(MLX5E_LOG_INDIR_RQT_SIZE) #define MLX5E_MIN_NUM_CHANNELS 0x1 -#define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1) +#define MLX5E_MAX_NUM_CHANNELS MLX5E_INDIR_RQT_SIZE #define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC) #define MLX5E_TX_CQ_POLL_BUDGET 128 #define MLX5E_TX_XSK_POLL_BUDGET 64 @@ -892,6 +892,8 @@ struct mlx5e_profile { int (*update_rx)(struct mlx5e_priv *priv); void (*update_stats)(struct mlx5e_priv *priv); void (*update_carrier)(struct mlx5e_priv *priv); + unsigned int (*stats_grps_num)(struct mlx5e_priv *priv); + mlx5e_stats_grp_t *stats_grps; struct { mlx5e_fp_handle_rx_cqe handle_rx_cqe; mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe; @@ -964,7 +966,6 @@ struct sk_buff * mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt); -void mlx5e_update_stats(struct mlx5e_priv *priv); void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s); @@ -1175,11 +1176,11 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv); void mlx5e_detach_netdev(struct mlx5e_priv *priv); void mlx5e_destroy_netdev(struct mlx5e_priv *priv); void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv); -void mlx5e_build_nic_params(struct mlx5_core_dev *mdev, +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, struct mlx5e_rss_params *rss_params, struct mlx5e_params *params, - u16 max_channels, u16 mtu); + u16 mtu); void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); void mlx5e_build_rss_params(struct mlx5e_rss_params *rss_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 68d593074f6c..0416f7712109 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -21,6 +21,7 @@ struct mlx5e_tc_table { DECLARE_HASHTABLE(hairpin_tbl, 8); struct notifier_block netdevice_nb; + struct netdev_net_notifier netdevice_nn; }; struct mlx5e_flow_table { @@ -122,6 +123,22 @@ enum { #endif }; +#define MLX5E_TTC_NUM_GROUPS 3 +#define MLX5E_TTC_GROUP1_SIZE (BIT(3) + MLX5E_NUM_TUNNEL_TT) +#define MLX5E_TTC_GROUP2_SIZE BIT(1) +#define MLX5E_TTC_GROUP3_SIZE BIT(0) +#define MLX5E_TTC_TABLE_SIZE (MLX5E_TTC_GROUP1_SIZE +\ + MLX5E_TTC_GROUP2_SIZE +\ + MLX5E_TTC_GROUP3_SIZE) + +#define MLX5E_INNER_TTC_NUM_GROUPS 3 +#define MLX5E_INNER_TTC_GROUP1_SIZE BIT(3) +#define MLX5E_INNER_TTC_GROUP2_SIZE BIT(1) +#define MLX5E_INNER_TTC_GROUP3_SIZE BIT(0) +#define MLX5E_INNER_TTC_TABLE_SIZE (MLX5E_INNER_TTC_GROUP1_SIZE +\ + MLX5E_INNER_TTC_GROUP2_SIZE +\ + MLX5E_INNER_TTC_GROUP3_SIZE) + #ifdef CONFIG_MLX5_EN_RXNFC struct mlx5e_ethtool_table { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 1d6b58860da6..3a975641f902 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -197,9 +197,10 @@ int mlx5e_health_report(struct mlx5e_priv *priv, struct devlink_health_reporter *reporter, char *err_str, struct mlx5e_err_ctx *err_ctx) { - if (!reporter) { - netdev_err(priv->netdev, err_str); + netdev_err(priv->netdev, err_str); + + if (!reporter) return err_ctx->recover(&err_ctx->ctx); - } + return devlink_health_report(reporter, err_str, err_ctx); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 778dab1af8fc..f260dd96873b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -180,7 +180,7 @@ mlx5e_ktls_tx_post_param_wqes(struct mlx5e_txqsq *sq, struct tx_sync_info { u64 rcd_sn; - s32 sync_len; + u32 sync_len; int nr_frags; skb_frag_t frags[MAX_SKB_FRAGS]; }; @@ -193,13 +193,14 @@ enum mlx5e_ktls_sync_retval { static enum mlx5e_ktls_sync_retval tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx, - u32 tcp_seq, struct tx_sync_info *info) + u32 tcp_seq, int datalen, struct tx_sync_info *info) { struct tls_offload_context_tx *tx_ctx = priv_tx->tx_ctx; enum mlx5e_ktls_sync_retval ret = MLX5E_KTLS_SYNC_DONE; struct tls_record_info *record; int remaining, i = 0; unsigned long flags; + bool ends_before; spin_lock_irqsave(&tx_ctx->lock, flags); record = tls_get_record(tx_ctx, tcp_seq, &info->rcd_sn); @@ -209,9 +210,21 @@ tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx, goto out; } - if (unlikely(tcp_seq < tls_record_start_seq(record))) { - ret = tls_record_is_start_marker(record) ? - MLX5E_KTLS_SYNC_SKIP_NO_DATA : MLX5E_KTLS_SYNC_FAIL; + /* There are the following cases: + * 1. packet ends before start marker: bypass offload. + * 2. packet starts before start marker and ends after it: drop, + * not supported, breaks contract with kernel. + * 3. packet ends before tls record info starts: drop, + * this packet was already acknowledged and its record info + * was released. + */ + ends_before = before(tcp_seq + datalen, tls_record_start_seq(record)); + + if (unlikely(tls_record_is_start_marker(record))) { + ret = ends_before ? MLX5E_KTLS_SYNC_SKIP_NO_DATA : MLX5E_KTLS_SYNC_FAIL; + goto out; + } else if (ends_before) { + ret = MLX5E_KTLS_SYNC_FAIL; goto out; } @@ -337,7 +350,7 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, u8 num_wqebbs; int i = 0; - ret = tx_sync_info_get(priv_tx, seq, &info); + ret = tx_sync_info_get(priv_tx, seq, datalen, &info); if (unlikely(ret != MLX5E_KTLS_SYNC_DONE)) { if (ret == MLX5E_KTLS_SYNC_SKIP_NO_DATA) { stats->tls_skip_no_sync_data++; @@ -351,14 +364,6 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, goto err_out; } - if (unlikely(info.sync_len < 0)) { - if (likely(datalen <= -info.sync_len)) - return MLX5E_KTLS_SYNC_DONE; - - stats->tls_drop_bypass_req++; - goto err_out; - } - stats->tls_ooo++; tx_post_resync_params(sq, priv_tx, info.rcd_sn); @@ -378,8 +383,6 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, if (unlikely(contig_wqebbs_room < num_wqebbs)) mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); - tx_post_resync_params(sq, priv_tx, info.rcd_sn); - for (; i < info.nr_frags; i++) { unsigned int orig_fsz, frag_offset = 0, n = 0; skb_frag_t *f = &info.frags[i]; @@ -455,12 +458,18 @@ struct sk_buff *mlx5e_ktls_handle_tx_skb(struct net_device *netdev, enum mlx5e_ktls_sync_retval ret = mlx5e_ktls_tx_handle_ooo(priv_tx, sq, datalen, seq); - if (likely(ret == MLX5E_KTLS_SYNC_DONE)) + switch (ret) { + case MLX5E_KTLS_SYNC_DONE: *wqe = mlx5e_sq_fetch_wqe(sq, sizeof(**wqe), pi); - else if (ret == MLX5E_KTLS_SYNC_FAIL) + break; + case MLX5E_KTLS_SYNC_SKIP_NO_DATA: + if (likely(!skb->decrypted)) + goto out; + WARN_ON_ONCE(1); + /* fall-through */ + default: /* MLX5E_KTLS_SYNC_FAIL */ goto err_out; - else /* ret == MLX5E_KTLS_SYNC_SKIP_NO_DATA */ - goto out; + } } priv_tx->expected_seq = seq + datalen; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index c6776f308d5e..d674cb679895 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -218,13 +218,9 @@ static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS]; int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset) { - int i, num_stats = 0; - switch (sset) { case ETH_SS_STATS: - for (i = 0; i < mlx5e_num_stats_grps; i++) - num_stats += mlx5e_stats_grps[i].get_num_stats(priv); - return num_stats; + return mlx5e_stats_total_num(priv); case ETH_SS_PRIV_FLAGS: return MLX5E_NUM_PFLAGS; case ETH_SS_TEST: @@ -242,14 +238,6 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset) return mlx5e_ethtool_get_sset_count(priv, sset); } -static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, u8 *data) -{ - int i, idx = 0; - - for (i = 0; i < mlx5e_num_stats_grps; i++) - idx = mlx5e_stats_grps[i].fill_strings(priv, data, idx); -} - void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data) { int i; @@ -268,7 +256,7 @@ void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data) break; case ETH_SS_STATS: - mlx5e_fill_stats_strings(priv, data); + mlx5e_stats_fill_strings(priv, data); break; } } @@ -283,14 +271,13 @@ static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data) void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, struct ethtool_stats *stats, u64 *data) { - int i, idx = 0; + int idx = 0; mutex_lock(&priv->state_lock); - mlx5e_update_stats(priv); + mlx5e_stats_update(priv); mutex_unlock(&priv->state_lock); - for (i = 0; i < mlx5e_num_stats_grps; i++) - idx = mlx5e_stats_grps[i].fill_stats(priv, data, idx); + mlx5e_stats_fill(priv, data, idx); } static void mlx5e_get_ethtool_stats(struct net_device *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 15b7f0f1427c..73d3dc07331f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -904,22 +904,6 @@ del_rules: return err; } -#define MLX5E_TTC_NUM_GROUPS 3 -#define MLX5E_TTC_GROUP1_SIZE (BIT(3) + MLX5E_NUM_TUNNEL_TT) -#define MLX5E_TTC_GROUP2_SIZE BIT(1) -#define MLX5E_TTC_GROUP3_SIZE BIT(0) -#define MLX5E_TTC_TABLE_SIZE (MLX5E_TTC_GROUP1_SIZE +\ - MLX5E_TTC_GROUP2_SIZE +\ - MLX5E_TTC_GROUP3_SIZE) - -#define MLX5E_INNER_TTC_NUM_GROUPS 3 -#define MLX5E_INNER_TTC_GROUP1_SIZE BIT(3) -#define MLX5E_INNER_TTC_GROUP2_SIZE BIT(1) -#define MLX5E_INNER_TTC_GROUP3_SIZE BIT(0) -#define MLX5E_INNER_TTC_TABLE_SIZE (MLX5E_INNER_TTC_GROUP1_SIZE +\ - MLX5E_INNER_TTC_GROUP2_SIZE +\ - MLX5E_INNER_TTC_GROUP3_SIZE) - static int mlx5e_create_ttc_table_groups(struct mlx5e_ttc_table *ttc, bool use_ipv) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index acd946f2ddbe..3bc2ac3d53fc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -58,6 +58,7 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv, struct ethtool_rx_flow_spec *fs, int num_tuples) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5e_ethtool_table *eth_ft; struct mlx5_flow_namespace *ns; struct mlx5_flow_table *ft; @@ -102,9 +103,11 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv, table_size = min_t(u32, BIT(MLX5_CAP_FLOWTABLE(priv->mdev, flow_table_properties_nic_receive.log_max_ft_size)), MLX5E_ETHTOOL_NUM_ENTRIES); - ft = mlx5_create_auto_grouped_flow_table(ns, prio, - table_size, - MLX5E_ETHTOOL_NUM_GROUPS, 0, 0); + + ft_attr.prio = prio; + ft_attr.max_fte = table_size; + ft_attr.autogroup.max_num_groups = MLX5E_ETHTOOL_NUM_GROUPS; + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ft)) return (void *)ft; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 319b39f25592..454d3459bd8b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -159,23 +159,14 @@ static void mlx5e_update_carrier_work(struct work_struct *work) mutex_unlock(&priv->state_lock); } -void mlx5e_update_stats(struct mlx5e_priv *priv) -{ - int i; - - for (i = mlx5e_num_stats_grps - 1; i >= 0; i--) - if (mlx5e_stats_grps[i].update_stats) - mlx5e_stats_grps[i].update_stats(priv); -} - void mlx5e_update_ndo_stats(struct mlx5e_priv *priv) { int i; - for (i = mlx5e_num_stats_grps - 1; i >= 0; i--) - if (mlx5e_stats_grps[i].update_stats_mask & + for (i = mlx5e_nic_stats_grps_num(priv) - 1; i >= 0; i--) + if (mlx5e_nic_stats_grps[i]->update_stats_mask & MLX5E_NDO_UPDATE_STATS) - mlx5e_stats_grps[i].update_stats(priv); + mlx5e_nic_stats_grps[i]->update_stats(priv); } static void mlx5e_update_stats_work(struct work_struct *work) @@ -4739,17 +4730,19 @@ void mlx5e_build_rss_params(struct mlx5e_rss_params *rss_params, tirc_default_config[tt].rx_hash_fields; } -void mlx5e_build_nic_params(struct mlx5_core_dev *mdev, +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, struct mlx5e_rss_params *rss_params, struct mlx5e_params *params, - u16 max_channels, u16 mtu) + u16 mtu) { + struct mlx5_core_dev *mdev = priv->mdev; u8 rx_cq_period_mode; params->sw_mtu = mtu; params->hard_mtu = MLX5E_ETH_HARD_MTU; - params->num_channels = max_channels; + params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, + priv->max_nch); params->num_tc = 1; /* SQ */ @@ -4876,6 +4869,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM; netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; } if (mlx5e_tunnel_proto_supported(mdev, IPPROTO_GRE)) { @@ -4986,8 +4981,8 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, if (err) return err; - mlx5e_build_nic_params(mdev, &priv->xsk, rss, &priv->channels.params, - priv->max_nch, netdev->mtu); + mlx5e_build_nic_params(priv, &priv->xsk, rss, &priv->channels.params, + netdev->mtu); mlx5e_timestamp_init(priv); @@ -5149,6 +5144,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) static void mlx5e_nic_disable(struct mlx5e_priv *priv) { + struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; #ifdef CONFIG_MLX5_CORE_EN_DCB @@ -5169,7 +5165,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5e_monitor_counter_cleanup(priv); mlx5e_disable_async_events(priv); - mlx5_lag_remove(mdev); + mlx5_lag_remove(mdev, netdev); } int mlx5e_update_nic_rx(struct mlx5e_priv *priv) @@ -5193,6 +5189,8 @@ static const struct mlx5e_profile mlx5e_nic_profile = { .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, .max_tc = MLX5E_MAX_NUM_TC, .rq_groups = MLX5E_NUM_RQ_GROUPS(XSK), + .stats_grps = mlx5e_nic_stats_grps, + .stats_grps_num = mlx5e_nic_stats_grps_num, }; /* mlx5e generic netdev management API (move to en_common.c) */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index f175cb24bb67..7b48ccacebe2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -41,6 +41,7 @@ #include <net/ipv6_stubs.h> #include "eswitch.h" +#include "eswitch_offloads_chains.h" #include "en.h" #include "en_rep.h" #include "en_tc.h" @@ -116,24 +117,71 @@ static const struct counter_desc vport_rep_stats_desc[] = { #define NUM_VPORT_REP_SW_COUNTERS ARRAY_SIZE(sw_rep_stats_desc) #define NUM_VPORT_REP_HW_COUNTERS ARRAY_SIZE(vport_rep_stats_desc) -static void mlx5e_rep_get_strings(struct net_device *dev, - u32 stringset, uint8_t *data) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(sw_rep) { - int i, j; + return NUM_VPORT_REP_SW_COUNTERS; +} - switch (stringset) { - case ETH_SS_STATS: - for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) - strcpy(data + (i * ETH_GSTRING_LEN), - sw_rep_stats_desc[i].format); - for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++) - strcpy(data + (i * ETH_GSTRING_LEN), - vport_rep_stats_desc[j].format); - break; - } +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(sw_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + sw_rep_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(sw_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, + sw_rep_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw_rep) +{ + struct mlx5e_sw_stats *s = &priv->stats.sw; + struct rtnl_link_stats64 stats64 = {}; + + memset(s, 0, sizeof(*s)); + mlx5e_fold_sw_stats64(priv, &stats64); + + s->rx_packets = stats64.rx_packets; + s->rx_bytes = stats64.rx_bytes; + s->tx_packets = stats64.tx_packets; + s->tx_bytes = stats64.tx_bytes; + s->tx_queue_dropped = stats64.tx_dropped; +} + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vport_rep) +{ + return NUM_VPORT_REP_HW_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vport_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, vport_rep_stats_desc[i].format); + return idx; } -static void mlx5e_rep_update_hw_counters(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_HW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport, + vport_rep_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport_rep) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5e_rep_priv *rpriv = priv->ppriv; @@ -156,64 +204,33 @@ static void mlx5e_rep_update_hw_counters(struct mlx5e_priv *priv) vport_stats->tx_bytes = vf_stats.rx_bytes; } -static void mlx5e_uplink_rep_update_hw_counters(struct mlx5e_priv *priv) -{ - struct mlx5e_pport_stats *pstats = &priv->stats.pport; - struct rtnl_link_stats64 *vport_stats; - - mlx5e_grp_802_3_update_stats(priv); - - vport_stats = &priv->stats.vf_vport; - - vport_stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok); - vport_stats->rx_bytes = PPORT_802_3_GET(pstats, a_octets_received_ok); - vport_stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok); - vport_stats->tx_bytes = PPORT_802_3_GET(pstats, a_octets_transmitted_ok); -} - -static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv) +static void mlx5e_rep_get_strings(struct net_device *dev, + u32 stringset, uint8_t *data) { - struct mlx5e_sw_stats *s = &priv->stats.sw; - struct rtnl_link_stats64 stats64 = {}; - - memset(s, 0, sizeof(*s)); - mlx5e_fold_sw_stats64(priv, &stats64); + struct mlx5e_priv *priv = netdev_priv(dev); - s->rx_packets = stats64.rx_packets; - s->rx_bytes = stats64.rx_bytes; - s->tx_packets = stats64.tx_packets; - s->tx_bytes = stats64.tx_bytes; - s->tx_queue_dropped = stats64.tx_dropped; + switch (stringset) { + case ETH_SS_STATS: + mlx5e_stats_fill_strings(priv, data); + break; + } } static void mlx5e_rep_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct mlx5e_priv *priv = netdev_priv(dev); - int i, j; - - if (!data) - return; - - mutex_lock(&priv->state_lock); - mlx5e_rep_update_sw_counters(priv); - priv->profile->update_stats(priv); - mutex_unlock(&priv->state_lock); - for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) - data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, - sw_rep_stats_desc, i); - - for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++) - data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport, - vport_rep_stats_desc, j); + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); } static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset) { + struct mlx5e_priv *priv = netdev_priv(dev); + switch (sset) { case ETH_SS_STATS: - return NUM_VPORT_REP_SW_COUNTERS + NUM_VPORT_REP_HW_COUNTERS; + return mlx5e_stats_total_num(priv); default: return -EOPNOTSUPP; } @@ -1247,8 +1264,7 @@ static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { - struct flow_cls_offload *f = type_data; - struct flow_cls_offload cls_flower; + struct flow_cls_offload tmp, *f = type_data; struct mlx5e_priv *priv = cb_priv; struct mlx5_eswitch *esw; unsigned long flags; @@ -1261,16 +1277,30 @@ static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, switch (type) { case TC_SETUP_CLSFLOWER: - if (!mlx5_eswitch_prios_supported(esw) || f->common.chain_index) + memcpy(&tmp, f, sizeof(*f)); + + if (!mlx5_esw_chains_prios_supported(esw) || + tmp.common.chain_index) return -EOPNOTSUPP; /* Re-use tc offload path by moving the ft flow to the * reserved ft chain. + * + * FT offload can use prio range [0, INT_MAX], so we normalize + * it to range [1, mlx5_esw_chains_get_prio_range(esw)] + * as with tc, where prio 0 isn't supported. + * + * We only support chain 0 of FT offload. */ - memcpy(&cls_flower, f, sizeof(*f)); - cls_flower.common.chain_index = FDB_FT_CHAIN; - err = mlx5e_rep_setup_tc_cls_flower(priv, &cls_flower, flags); - memcpy(&f->stats, &cls_flower.stats, sizeof(f->stats)); + if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw)) + return -EOPNOTSUPP; + if (tmp.common.chain_index != 0) + return -EOPNOTSUPP; + + tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw); + tmp.common.prio++; + err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags); + memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); return err; default: return -EOPNOTSUPP; @@ -1660,10 +1690,65 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) mlx5e_close_drop_rq(&priv->drop_rq); } +static int mlx5e_init_ul_rep_rx(struct mlx5e_priv *priv) +{ + int err = mlx5e_init_rep_rx(priv); + + if (err) + return err; + + mlx5e_create_q_counters(priv); + return 0; +} + +static void mlx5e_cleanup_ul_rep_rx(struct mlx5e_priv *priv) +{ + mlx5e_destroy_q_counters(priv); + mlx5e_cleanup_rep_rx(priv); +} + +static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv; + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = rpriv->netdev; + priv = netdev_priv(netdev); + uplink_priv = &rpriv->uplink_priv; + + mutex_init(&uplink_priv->unready_flows_lock); + INIT_LIST_HEAD(&uplink_priv->unready_flows); + + /* init shared tc flow table */ + err = mlx5e_tc_esw_init(&uplink_priv->tc_ht); + if (err) + return err; + + mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); + + /* init indirect block notifications */ + INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); + uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; + err = register_netdevice_notifier_dev_net(rpriv->netdev, + &uplink_priv->netdevice_nb, + &uplink_priv->netdevice_nn); + if (err) { + mlx5_core_err(priv->mdev, "Failed to register netdev notifier\n"); + goto tc_esw_cleanup; + } + + return 0; + +tc_esw_cleanup: + mlx5e_tc_esw_cleanup(&uplink_priv->tc_ht); + return err; +} + static int mlx5e_init_rep_tx(struct mlx5e_priv *priv) { struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5_rep_uplink_priv *uplink_priv; int err; err = mlx5e_create_tises(priv); @@ -1673,52 +1758,41 @@ static int mlx5e_init_rep_tx(struct mlx5e_priv *priv) } if (rpriv->rep->vport == MLX5_VPORT_UPLINK) { - uplink_priv = &rpriv->uplink_priv; - - mutex_init(&uplink_priv->unready_flows_lock); - INIT_LIST_HEAD(&uplink_priv->unready_flows); - - /* init shared tc flow table */ - err = mlx5e_tc_esw_init(&uplink_priv->tc_ht); + err = mlx5e_init_uplink_rep_tx(rpriv); if (err) goto destroy_tises; - - mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); - - /* init indirect block notifications */ - INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); - uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; - err = register_netdevice_notifier(&uplink_priv->netdevice_nb); - if (err) { - mlx5_core_err(priv->mdev, "Failed to register netdev notifier\n"); - goto tc_esw_cleanup; - } } return 0; -tc_esw_cleanup: - mlx5e_tc_esw_cleanup(&uplink_priv->tc_ht); destroy_tises: mlx5e_destroy_tises(priv); return err; } +static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + + /* clean indirect TC block notifications */ + unregister_netdevice_notifier_dev_net(rpriv->netdev, + &uplink_priv->netdevice_nb, + &uplink_priv->netdevice_nn); + mlx5e_rep_indr_clean_block_privs(rpriv); + + /* delete shared tc flow table */ + mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht); + mutex_destroy(&rpriv->uplink_priv.unready_flows_lock); +} + static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv) { struct mlx5e_rep_priv *rpriv = priv->ppriv; mlx5e_destroy_tises(priv); - if (rpriv->rep->vport == MLX5_VPORT_UPLINK) { - /* clean indirect TC block notifications */ - unregister_netdevice_notifier(&rpriv->uplink_priv.netdevice_nb); - mlx5e_rep_indr_clean_block_privs(rpriv); - - /* delete shared tc flow table */ - mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht); - mutex_destroy(&rpriv->uplink_priv.unready_flows_lock); - } + if (rpriv->rep->vport == MLX5_VPORT_UPLINK) + mlx5e_cleanup_uplink_rep_tx(rpriv); } static void mlx5e_rep_enable(struct mlx5e_priv *priv) @@ -1787,6 +1861,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) { + struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_rep_priv *rpriv = priv->ppriv; @@ -1795,7 +1870,44 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) #endif mlx5_notifier_unregister(mdev, &priv->events_nb); cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work); - mlx5_lag_remove(mdev); + mlx5_lag_remove(mdev, netdev); +} + +static MLX5E_DEFINE_STATS_GRP(sw_rep, 0); +static MLX5E_DEFINE_STATS_GRP(vport_rep, MLX5E_NDO_UPDATE_STATS); + +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5e_rep_stats_grps[] = { + &MLX5E_STATS_GRP(sw_rep), + &MLX5E_STATS_GRP(vport_rep), +}; + +static unsigned int mlx5e_rep_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_rep_stats_grps); +} + +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5e_ul_rep_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(eth_ext), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), +}; + +static unsigned int mlx5e_ul_rep_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_ul_rep_stats_grps); } static const struct mlx5e_profile mlx5e_rep_profile = { @@ -1807,29 +1919,33 @@ static const struct mlx5e_profile mlx5e_rep_profile = { .cleanup_tx = mlx5e_cleanup_rep_tx, .enable = mlx5e_rep_enable, .update_rx = mlx5e_update_rep_rx, - .update_stats = mlx5e_rep_update_hw_counters, + .update_stats = mlx5e_update_ndo_stats, .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe_rep, .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, .max_tc = 1, .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), + .stats_grps = mlx5e_rep_stats_grps, + .stats_grps_num = mlx5e_rep_stats_grps_num, }; static const struct mlx5e_profile mlx5e_uplink_rep_profile = { .init = mlx5e_init_rep, .cleanup = mlx5e_cleanup_rep, - .init_rx = mlx5e_init_rep_rx, - .cleanup_rx = mlx5e_cleanup_rep_rx, + .init_rx = mlx5e_init_ul_rep_rx, + .cleanup_rx = mlx5e_cleanup_ul_rep_rx, .init_tx = mlx5e_init_rep_tx, .cleanup_tx = mlx5e_cleanup_rep_tx, .enable = mlx5e_uplink_rep_enable, .disable = mlx5e_uplink_rep_disable, .update_rx = mlx5e_update_rep_rx, - .update_stats = mlx5e_uplink_rep_update_hw_counters, + .update_stats = mlx5e_update_ndo_stats, .update_carrier = mlx5e_update_carrier, .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe_rep, .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, .max_tc = MLX5E_MAX_NUM_TC, .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), + .stats_grps = mlx5e_ul_rep_stats_grps, + .stats_grps_num = mlx5e_ul_rep_stats_grps_num, }; static bool diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 31f83c8adcc9..3f756d51435f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -73,6 +73,7 @@ struct mlx5_rep_uplink_priv { */ struct list_head tc_indr_block_priv_list; struct notifier_block netdevice_nb; + struct netdev_net_notifier netdevice_nn; struct mlx5_tun_entropy tun_entropy; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 9f09253f9f46..30b216d9284c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -35,6 +35,58 @@ #include "en_accel/ipsec.h" #include "en_accel/tls.h" +static unsigned int stats_grps_num(struct mlx5e_priv *priv) +{ + return !priv->profile->stats_grps_num ? 0 : + priv->profile->stats_grps_num(priv); +} + +unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + unsigned int total = 0; + int i; + + for (i = 0; i < num_stats_grps; i++) + total += stats_grps[i]->get_num_stats(priv); + + return total; +} + +void mlx5e_stats_update(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = num_stats_grps - 1; i >= 0; i--) + if (stats_grps[i]->update_stats) + stats_grps[i]->update_stats(priv); +} + +void mlx5e_stats_fill(struct mlx5e_priv *priv, u64 *data, int idx) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = 0; i < num_stats_grps; i++) + idx = stats_grps[i]->fill_stats(priv, data, idx); +} + +void mlx5e_stats_fill_strings(struct mlx5e_priv *priv, u8 *data) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i, idx = 0; + + for (i = 0; i < num_stats_grps; i++) + idx = stats_grps[i]->fill_strings(priv, data, idx); +} + +/* Concrete NIC Stats */ + static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) }, @@ -146,12 +198,12 @@ static const struct counter_desc sw_stats_desc[] = { #define NUM_SW_COUNTERS ARRAY_SIZE(sw_stats_desc) -static int mlx5e_grp_sw_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(sw) { return NUM_SW_COUNTERS; } -static int mlx5e_grp_sw_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(sw) { int i; @@ -160,7 +212,7 @@ static int mlx5e_grp_sw_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) return idx; } -static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(sw) { int i; @@ -169,7 +221,7 @@ static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) return idx; } -static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) { struct mlx5e_sw_stats *s = &priv->stats.sw; int i; @@ -297,6 +349,9 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv) s->tx_tls_drop_bypass_req += sq_stats->tls_drop_bypass_req; #endif s->tx_cqes += sq_stats->cqes; + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); } } } @@ -312,7 +367,7 @@ static const struct counter_desc drop_rq_stats_desc[] = { #define NUM_Q_COUNTERS ARRAY_SIZE(q_stats_desc) #define NUM_DROP_RQ_COUNTERS ARRAY_SIZE(drop_rq_stats_desc) -static int mlx5e_grp_q_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qcnt) { int num_stats = 0; @@ -325,7 +380,7 @@ static int mlx5e_grp_q_get_num_stats(struct mlx5e_priv *priv) return num_stats; } -static int mlx5e_grp_q_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qcnt) { int i; @@ -340,7 +395,7 @@ static int mlx5e_grp_q_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) return idx; } -static int mlx5e_grp_q_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qcnt) { int i; @@ -353,7 +408,7 @@ static int mlx5e_grp_q_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) return idx; } -static void mlx5e_grp_q_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qcnt) { struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt; u32 out[MLX5_ST_SZ_DW(query_q_counter_out)]; @@ -388,14 +443,13 @@ static const struct counter_desc vnic_env_stats_dev_oob_desc[] = { (MLX5_CAP_GEN(dev, vnic_env_int_rq_oob) ? \ ARRAY_SIZE(vnic_env_stats_dev_oob_desc) : 0) -static int mlx5e_grp_vnic_env_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vnic_env) { return NUM_VNIC_ENV_STEER_COUNTERS(priv->mdev) + NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); } -static int mlx5e_grp_vnic_env_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vnic_env) { int i; @@ -409,8 +463,7 @@ static int mlx5e_grp_vnic_env_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_vnic_env_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vnic_env) { int i; @@ -424,7 +477,7 @@ static int mlx5e_grp_vnic_env_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vnic_env) { u32 *out = (u32 *)priv->stats.vnic.query_vnic_env_out; int outlen = MLX5_ST_SZ_BYTES(query_vnic_env_out); @@ -487,13 +540,12 @@ static const struct counter_desc vport_stats_desc[] = { #define NUM_VPORT_COUNTERS ARRAY_SIZE(vport_stats_desc) -static int mlx5e_grp_vport_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vport) { return NUM_VPORT_COUNTERS; } -static int mlx5e_grp_vport_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vport) { int i; @@ -502,8 +554,7 @@ static int mlx5e_grp_vport_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_vport_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport) { int i; @@ -513,7 +564,7 @@ static int mlx5e_grp_vport_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_vport_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport) { int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u32 *out = (u32 *)priv->stats.vport.query_vport_out; @@ -552,13 +603,12 @@ static const struct counter_desc pport_802_3_stats_desc[] = { #define NUM_PPORT_802_3_COUNTERS ARRAY_SIZE(pport_802_3_stats_desc) -static int mlx5e_grp_802_3_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(802_3) { return NUM_PPORT_802_3_COUNTERS; } -static int mlx5e_grp_802_3_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(802_3) { int i; @@ -567,8 +617,7 @@ static int mlx5e_grp_802_3_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_802_3_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(802_3) { int i; @@ -581,7 +630,7 @@ static int mlx5e_grp_802_3_fill_stats(struct mlx5e_priv *priv, u64 *data, #define MLX5_BASIC_PPCNT_SUPPORTED(mdev) \ (MLX5_CAP_GEN(mdev, pcam_reg) ? MLX5_CAP_PCAM_REG(mdev, ppcnt) : 1) -void mlx5e_grp_802_3_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(802_3) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -609,13 +658,12 @@ static const struct counter_desc pport_2863_stats_desc[] = { #define NUM_PPORT_2863_COUNTERS ARRAY_SIZE(pport_2863_stats_desc) -static int mlx5e_grp_2863_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(2863) { return NUM_PPORT_2863_COUNTERS; } -static int mlx5e_grp_2863_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(2863) { int i; @@ -624,8 +672,7 @@ static int mlx5e_grp_2863_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_2863_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(2863) { int i; @@ -635,7 +682,7 @@ static int mlx5e_grp_2863_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_2863_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(2863) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -670,13 +717,12 @@ static const struct counter_desc pport_2819_stats_desc[] = { #define NUM_PPORT_2819_COUNTERS ARRAY_SIZE(pport_2819_stats_desc) -static int mlx5e_grp_2819_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(2819) { return NUM_PPORT_2819_COUNTERS; } -static int mlx5e_grp_2819_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(2819) { int i; @@ -685,8 +731,7 @@ static int mlx5e_grp_2819_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_2819_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(2819) { int i; @@ -696,7 +741,7 @@ static int mlx5e_grp_2819_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_2819_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(2819) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -734,7 +779,7 @@ pport_phy_statistical_err_lanes_stats_desc[] = { #define NUM_PPORT_PHY_STATISTICAL_PER_LANE_COUNTERS \ ARRAY_SIZE(pport_phy_statistical_err_lanes_stats_desc) -static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(phy) { struct mlx5_core_dev *mdev = priv->mdev; int num_stats; @@ -751,8 +796,7 @@ static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv) return num_stats; } -static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(phy) { struct mlx5_core_dev *mdev = priv->mdev; int i; @@ -774,7 +818,7 @@ static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(phy) { struct mlx5_core_dev *mdev = priv->mdev; int i; @@ -800,7 +844,7 @@ static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) return idx; } -static void mlx5e_grp_phy_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(phy) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -830,7 +874,7 @@ static const struct counter_desc pport_eth_ext_stats_desc[] = { #define NUM_PPORT_ETH_EXT_COUNTERS ARRAY_SIZE(pport_eth_ext_stats_desc) -static int mlx5e_grp_eth_ext_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(eth_ext) { if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) return NUM_PPORT_ETH_EXT_COUNTERS; @@ -838,8 +882,7 @@ static int mlx5e_grp_eth_ext_get_num_stats(struct mlx5e_priv *priv) return 0; } -static int mlx5e_grp_eth_ext_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(eth_ext) { int i; @@ -850,8 +893,7 @@ static int mlx5e_grp_eth_ext_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_eth_ext_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(eth_ext) { int i; @@ -863,7 +905,7 @@ static int mlx5e_grp_eth_ext_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_eth_ext_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(eth_ext) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -904,7 +946,7 @@ static const struct counter_desc pcie_perf_stall_stats_desc[] = { #define NUM_PCIE_PERF_COUNTERS64 ARRAY_SIZE(pcie_perf_stats_desc64) #define NUM_PCIE_PERF_STALL_COUNTERS ARRAY_SIZE(pcie_perf_stall_stats_desc) -static int mlx5e_grp_pcie_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie) { int num_stats = 0; @@ -920,8 +962,7 @@ static int mlx5e_grp_pcie_get_num_stats(struct mlx5e_priv *priv) return num_stats; } -static int mlx5e_grp_pcie_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie) { int i; @@ -942,8 +983,7 @@ static int mlx5e_grp_pcie_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_pcie_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie) { int i; @@ -967,7 +1007,7 @@ static int mlx5e_grp_pcie_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static void mlx5e_grp_pcie_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie) { struct mlx5e_pcie_stats *pcie_stats = &priv->stats.pcie; struct mlx5_core_dev *mdev = priv->mdev; @@ -1015,8 +1055,7 @@ static int mlx5e_grp_per_tc_prio_get_num_stats(struct mlx5e_priv *priv) return NUM_PPORT_PER_TC_PRIO_COUNTERS * NUM_PPORT_PRIO; } -static int mlx5e_grp_per_port_buffer_congest_fill_strings(struct mlx5e_priv *priv, - u8 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(per_port_buff_congest) { struct mlx5_core_dev *mdev = priv->mdev; int i, prio; @@ -1036,8 +1075,7 @@ static int mlx5e_grp_per_port_buffer_congest_fill_strings(struct mlx5e_priv *pri return idx; } -static int mlx5e_grp_per_port_buffer_congest_fill_stats(struct mlx5e_priv *priv, - u64 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(per_port_buff_congest) { struct mlx5e_pport_stats *pport = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -1112,13 +1150,13 @@ static void mlx5e_grp_per_tc_congest_prio_update_stats(struct mlx5e_priv *priv) } } -static int mlx5e_grp_per_port_buffer_congest_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(per_port_buff_congest) { return mlx5e_grp_per_tc_prio_get_num_stats(priv) + mlx5e_grp_per_tc_congest_prio_get_num_stats(priv); } -static void mlx5e_grp_per_port_buffer_congest_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(per_port_buff_congest) { mlx5e_grp_per_tc_prio_update_stats(priv); mlx5e_grp_per_tc_congest_prio_update_stats(priv); @@ -1130,6 +1168,7 @@ static void mlx5e_grp_per_port_buffer_congest_update_stats(struct mlx5e_priv *pr static const struct counter_desc pport_per_prio_traffic_stats_desc[] = { { "rx_prio%d_bytes", PPORT_PER_PRIO_OFF(rx_octets) }, { "rx_prio%d_packets", PPORT_PER_PRIO_OFF(rx_frames) }, + { "rx_prio%d_discards", PPORT_PER_PRIO_OFF(rx_discards) }, { "tx_prio%d_bytes", PPORT_PER_PRIO_OFF(tx_octets) }, { "tx_prio%d_packets", PPORT_PER_PRIO_OFF(tx_frames) }, }; @@ -1292,29 +1331,27 @@ static int mlx5e_grp_per_prio_pfc_fill_stats(struct mlx5e_priv *priv, return idx; } -static int mlx5e_grp_per_prio_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(per_prio) { return mlx5e_grp_per_prio_traffic_get_num_stats() + mlx5e_grp_per_prio_pfc_get_num_stats(priv); } -static int mlx5e_grp_per_prio_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(per_prio) { idx = mlx5e_grp_per_prio_traffic_fill_strings(priv, data, idx); idx = mlx5e_grp_per_prio_pfc_fill_strings(priv, data, idx); return idx; } -static int mlx5e_grp_per_prio_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(per_prio) { idx = mlx5e_grp_per_prio_traffic_fill_stats(priv, data, idx); idx = mlx5e_grp_per_prio_pfc_fill_stats(priv, data, idx); return idx; } -static void mlx5e_grp_per_prio_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(per_prio) { struct mlx5e_pport_stats *pstats = &priv->stats.pport; struct mlx5_core_dev *mdev = priv->mdev; @@ -1349,13 +1386,12 @@ static const struct counter_desc mlx5e_pme_error_desc[] = { #define NUM_PME_STATUS_STATS ARRAY_SIZE(mlx5e_pme_status_desc) #define NUM_PME_ERR_STATS ARRAY_SIZE(mlx5e_pme_error_desc) -static int mlx5e_grp_pme_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pme) { return NUM_PME_STATUS_STATS + NUM_PME_ERR_STATS; } -static int mlx5e_grp_pme_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pme) { int i; @@ -1368,8 +1404,7 @@ static int mlx5e_grp_pme_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_pme_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pme) { struct mlx5_pme_stats pme_stats; int i; @@ -1387,45 +1422,46 @@ static int mlx5e_grp_pme_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } -static int mlx5e_grp_ipsec_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pme) { return; } + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec) { return mlx5e_ipsec_get_count(priv); } -static int mlx5e_grp_ipsec_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec) { return idx + mlx5e_ipsec_get_strings(priv, data + idx * ETH_GSTRING_LEN); } -static int mlx5e_grp_ipsec_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec) { return idx + mlx5e_ipsec_get_stats(priv, data + idx); } -static void mlx5e_grp_ipsec_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec) { mlx5e_ipsec_update_stats(priv); } -static int mlx5e_grp_tls_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(tls) { return mlx5e_tls_get_count(priv); } -static int mlx5e_grp_tls_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(tls) { return idx + mlx5e_tls_get_strings(priv, data + idx * ETH_GSTRING_LEN); } -static int mlx5e_grp_tls_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(tls) { return idx + mlx5e_tls_get_stats(priv, data + idx); } +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(tls) { return; } + static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) }, @@ -1559,7 +1595,7 @@ static const struct counter_desc ch_stats_desc[] = { #define NUM_XSKSQ_STATS ARRAY_SIZE(xsksq_stats_desc) #define NUM_CH_STATS ARRAY_SIZE(ch_stats_desc) -static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) { int max_nch = priv->max_nch; @@ -1572,8 +1608,7 @@ static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv) (NUM_XSKSQ_STATS * max_nch * priv->xsk.ever_used); } -static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) { bool is_xsk = priv->xsk.ever_used; int max_nch = priv->max_nch; @@ -1615,8 +1650,7 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data, return idx; } -static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data, - int idx) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels) { bool is_xsk = priv->xsk.ever_used; int max_nch = priv->max_nch; @@ -1664,104 +1698,46 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data, return idx; } +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(channels) { return; } + +MLX5E_DEFINE_STATS_GRP(sw, 0); +MLX5E_DEFINE_STATS_GRP(qcnt, MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(vnic_env, 0); +MLX5E_DEFINE_STATS_GRP(vport, MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(802_3, MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(2863, 0); +MLX5E_DEFINE_STATS_GRP(2819, 0); +MLX5E_DEFINE_STATS_GRP(phy, 0); +MLX5E_DEFINE_STATS_GRP(pcie, 0); +MLX5E_DEFINE_STATS_GRP(per_prio, 0); +MLX5E_DEFINE_STATS_GRP(pme, 0); +MLX5E_DEFINE_STATS_GRP(channels, 0); +MLX5E_DEFINE_STATS_GRP(per_port_buff_congest, 0); +MLX5E_DEFINE_STATS_GRP(eth_ext, 0); +static MLX5E_DEFINE_STATS_GRP(ipsec, 0); +static MLX5E_DEFINE_STATS_GRP(tls, 0); + /* The stats groups order is opposite to the update_stats() order calls */ -const struct mlx5e_stats_grp mlx5e_stats_grps[] = { - { - .get_num_stats = mlx5e_grp_sw_get_num_stats, - .fill_strings = mlx5e_grp_sw_fill_strings, - .fill_stats = mlx5e_grp_sw_fill_stats, - .update_stats = mlx5e_grp_sw_update_stats, - }, - { - .get_num_stats = mlx5e_grp_q_get_num_stats, - .fill_strings = mlx5e_grp_q_fill_strings, - .fill_stats = mlx5e_grp_q_fill_stats, - .update_stats_mask = MLX5E_NDO_UPDATE_STATS, - .update_stats = mlx5e_grp_q_update_stats, - }, - { - .get_num_stats = mlx5e_grp_vnic_env_get_num_stats, - .fill_strings = mlx5e_grp_vnic_env_fill_strings, - .fill_stats = mlx5e_grp_vnic_env_fill_stats, - .update_stats = mlx5e_grp_vnic_env_update_stats, - }, - { - .get_num_stats = mlx5e_grp_vport_get_num_stats, - .fill_strings = mlx5e_grp_vport_fill_strings, - .fill_stats = mlx5e_grp_vport_fill_stats, - .update_stats_mask = MLX5E_NDO_UPDATE_STATS, - .update_stats = mlx5e_grp_vport_update_stats, - }, - { - .get_num_stats = mlx5e_grp_802_3_get_num_stats, - .fill_strings = mlx5e_grp_802_3_fill_strings, - .fill_stats = mlx5e_grp_802_3_fill_stats, - .update_stats_mask = MLX5E_NDO_UPDATE_STATS, - .update_stats = mlx5e_grp_802_3_update_stats, - }, - { - .get_num_stats = mlx5e_grp_2863_get_num_stats, - .fill_strings = mlx5e_grp_2863_fill_strings, - .fill_stats = mlx5e_grp_2863_fill_stats, - .update_stats = mlx5e_grp_2863_update_stats, - }, - { - .get_num_stats = mlx5e_grp_2819_get_num_stats, - .fill_strings = mlx5e_grp_2819_fill_strings, - .fill_stats = mlx5e_grp_2819_fill_stats, - .update_stats = mlx5e_grp_2819_update_stats, - }, - { - .get_num_stats = mlx5e_grp_phy_get_num_stats, - .fill_strings = mlx5e_grp_phy_fill_strings, - .fill_stats = mlx5e_grp_phy_fill_stats, - .update_stats = mlx5e_grp_phy_update_stats, - }, - { - .get_num_stats = mlx5e_grp_eth_ext_get_num_stats, - .fill_strings = mlx5e_grp_eth_ext_fill_strings, - .fill_stats = mlx5e_grp_eth_ext_fill_stats, - .update_stats = mlx5e_grp_eth_ext_update_stats, - }, - { - .get_num_stats = mlx5e_grp_pcie_get_num_stats, - .fill_strings = mlx5e_grp_pcie_fill_strings, - .fill_stats = mlx5e_grp_pcie_fill_stats, - .update_stats = mlx5e_grp_pcie_update_stats, - }, - { - .get_num_stats = mlx5e_grp_per_prio_get_num_stats, - .fill_strings = mlx5e_grp_per_prio_fill_strings, - .fill_stats = mlx5e_grp_per_prio_fill_stats, - .update_stats = mlx5e_grp_per_prio_update_stats, - }, - { - .get_num_stats = mlx5e_grp_pme_get_num_stats, - .fill_strings = mlx5e_grp_pme_fill_strings, - .fill_stats = mlx5e_grp_pme_fill_stats, - }, - { - .get_num_stats = mlx5e_grp_ipsec_get_num_stats, - .fill_strings = mlx5e_grp_ipsec_fill_strings, - .fill_stats = mlx5e_grp_ipsec_fill_stats, - .update_stats = mlx5e_grp_ipsec_update_stats, - }, - { - .get_num_stats = mlx5e_grp_tls_get_num_stats, - .fill_strings = mlx5e_grp_tls_fill_strings, - .fill_stats = mlx5e_grp_tls_fill_stats, - }, - { - .get_num_stats = mlx5e_grp_channels_get_num_stats, - .fill_strings = mlx5e_grp_channels_fill_strings, - .fill_stats = mlx5e_grp_channels_fill_stats, - }, - { - .get_num_stats = mlx5e_grp_per_port_buffer_congest_get_num_stats, - .fill_strings = mlx5e_grp_per_port_buffer_congest_fill_strings, - .fill_stats = mlx5e_grp_per_port_buffer_congest_fill_stats, - .update_stats = mlx5e_grp_per_port_buffer_congest_update_stats, - }, +mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(eth_ext), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), + &MLX5E_STATS_GRP(ipsec), + &MLX5E_STATS_GRP(tls), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), }; -const int mlx5e_num_stats_grps = ARRAY_SIZE(mlx5e_stats_grps); +unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_nic_stats_grps); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 869f3502f631..092b39ffa32a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -29,6 +29,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #ifndef __MLX5_EN_STATS_H__ #define __MLX5_EN_STATS_H__ @@ -55,6 +56,56 @@ struct counter_desc { size_t offset; /* Byte offset */ }; +enum { + MLX5E_NDO_UPDATE_STATS = BIT(0x1), +}; + +struct mlx5e_priv; +struct mlx5e_stats_grp { + u16 update_stats_mask; + int (*get_num_stats)(struct mlx5e_priv *priv); + int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx); + int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx); + void (*update_stats)(struct mlx5e_priv *priv); +}; + +typedef const struct mlx5e_stats_grp *const mlx5e_stats_grp_t; + +#define MLX5E_STATS_GRP_OP(grp, name) mlx5e_stats_grp_ ## grp ## _ ## name + +#define MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(grp) \ + int MLX5E_STATS_GRP_OP(grp, num_stats)(struct mlx5e_priv *priv) + +#define MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(grp) \ + void MLX5E_STATS_GRP_OP(grp, update_stats)(struct mlx5e_priv *priv) + +#define MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(grp) \ + int MLX5E_STATS_GRP_OP(grp, fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx) + +#define MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(grp) \ + int MLX5E_STATS_GRP_OP(grp, fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx) + +#define MLX5E_STATS_GRP(grp) mlx5e_stats_grp_ ## grp + +#define MLX5E_DECLARE_STATS_GRP(grp) \ + const struct mlx5e_stats_grp MLX5E_STATS_GRP(grp) + +#define MLX5E_DEFINE_STATS_GRP(grp, mask) \ +MLX5E_DECLARE_STATS_GRP(grp) = { \ + .get_num_stats = MLX5E_STATS_GRP_OP(grp, num_stats), \ + .fill_stats = MLX5E_STATS_GRP_OP(grp, fill_stats), \ + .fill_strings = MLX5E_STATS_GRP_OP(grp, fill_strings), \ + .update_stats = MLX5E_STATS_GRP_OP(grp, update_stats), \ + .update_stats_mask = mask, \ +} + +unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv); +void mlx5e_stats_update(struct mlx5e_priv *priv); +void mlx5e_stats_fill(struct mlx5e_priv *priv, u64 *data, int idx); +void mlx5e_stats_fill_strings(struct mlx5e_priv *priv, u8 *data); + +/* Concrete NIC Stats */ + struct mlx5e_sw_stats { u64 rx_packets; u64 rx_bytes; @@ -322,22 +373,22 @@ struct mlx5e_stats { struct mlx5e_pcie_stats pcie; }; -enum { - MLX5E_NDO_UPDATE_STATS = BIT(0x1), -}; - -struct mlx5e_priv; -struct mlx5e_stats_grp { - u16 update_stats_mask; - int (*get_num_stats)(struct mlx5e_priv *priv); - int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx); - int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx); - void (*update_stats)(struct mlx5e_priv *priv); -}; - -extern const struct mlx5e_stats_grp mlx5e_stats_grps[]; -extern const int mlx5e_num_stats_grps; +extern mlx5e_stats_grp_t mlx5e_nic_stats_grps[]; +unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv); -void mlx5e_grp_802_3_update_stats(struct mlx5e_priv *priv); +extern MLX5E_DECLARE_STATS_GRP(sw); +extern MLX5E_DECLARE_STATS_GRP(qcnt); +extern MLX5E_DECLARE_STATS_GRP(vnic_env); +extern MLX5E_DECLARE_STATS_GRP(vport); +extern MLX5E_DECLARE_STATS_GRP(802_3); +extern MLX5E_DECLARE_STATS_GRP(2863); +extern MLX5E_DECLARE_STATS_GRP(2819); +extern MLX5E_DECLARE_STATS_GRP(phy); +extern MLX5E_DECLARE_STATS_GRP(eth_ext); +extern MLX5E_DECLARE_STATS_GRP(pcie); +extern MLX5E_DECLARE_STATS_GRP(per_prio); +extern MLX5E_DECLARE_STATS_GRP(pme); +extern MLX5E_DECLARE_STATS_GRP(channels); +extern MLX5E_DECLARE_STATS_GRP(per_port_buff_congest); #endif /* __MLX5_EN_STATS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9b32a9c0f497..74091f72c9a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -51,6 +51,7 @@ #include "en_rep.h" #include "en_tc.h" #include "eswitch.h" +#include "eswitch_offloads_chains.h" #include "fs_core.h" #include "en/port.h" #include "en/tc_tun.h" @@ -592,7 +593,7 @@ static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp, for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) ttc_params->indir_tirn[tt] = hp->indir_tirn[tt]; - ft_attr->max_fte = MLX5E_NUM_TT; + ft_attr->max_fte = MLX5E_TTC_TABLE_SIZE; ft_attr->level = MLX5E_TC_TTC_FT_LEVEL; ft_attr->prio = MLX5E_TC_PRIO; } @@ -960,7 +961,8 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, mutex_lock(&priv->fs.tc.t_lock); if (IS_ERR_OR_NULL(priv->fs.tc.t)) { - int tc_grp_size, tc_tbl_size; + struct mlx5_flow_table_attr ft_attr = {}; + int tc_grp_size, tc_tbl_size, tc_num_grps; u32 max_flow_counter; max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | @@ -970,13 +972,15 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, tc_tbl_size = min_t(int, tc_grp_size * MLX5E_TC_TABLE_NUM_GROUPS, BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev, log_max_ft_size))); + tc_num_grps = MLX5E_TC_TABLE_NUM_GROUPS; + ft_attr.prio = MLX5E_TC_PRIO; + ft_attr.max_fte = tc_tbl_size; + ft_attr.level = MLX5E_TC_FT_LEVEL; + ft_attr.autogroup.max_num_groups = tc_num_grps; priv->fs.tc.t = mlx5_create_auto_grouped_flow_table(priv->fs.ns, - MLX5E_TC_PRIO, - tc_tbl_size, - MLX5E_TC_TABLE_NUM_GROUPS, - MLX5E_TC_FT_LEVEL, 0); + &ft_attr); if (IS_ERR(priv->fs.tc.t)) { mutex_unlock(&priv->fs.tc.t_lock); NL_SET_ERR_MSG_MOD(extack, @@ -1080,7 +1084,7 @@ mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw, memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr)); slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; slow_attr->split_count = 0; - slow_attr->dest_chain = FDB_TC_SLOW_PATH_CHAIN; + slow_attr->flags |= MLX5_ESW_ATTR_FLAG_SLOW_PATH; rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, slow_attr); if (!IS_ERR(rule)) @@ -1097,7 +1101,7 @@ mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr)); slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; slow_attr->split_count = 0; - slow_attr->dest_chain = FDB_TC_SLOW_PATH_CHAIN; + slow_attr->flags |= MLX5_ESW_ATTR_FLAG_SLOW_PATH; mlx5e_tc_unoffload_fdb_rules(esw, flow, slow_attr); flow_flag_clear(flow, SLOW); } @@ -1157,19 +1161,18 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct netlink_ext_ack *extack) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - u32 max_chain = mlx5_eswitch_get_chain_range(esw); struct mlx5_esw_flow_attr *attr = flow->esw_attr; struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; - u16 max_prio = mlx5_eswitch_get_prio_range(esw); struct net_device *out_dev, *encap_dev = NULL; struct mlx5_fc *counter = NULL; struct mlx5e_rep_priv *rpriv; struct mlx5e_priv *out_priv; bool encap_valid = true; + u32 max_prio, max_chain; int err = 0; int out_index; - if (!mlx5_eswitch_prios_supported(esw) && attr->prio != 1) { + if (!mlx5_esw_chains_prios_supported(esw) && attr->prio != 1) { NL_SET_ERR_MSG(extack, "E-switch priorities unsupported, upgrade FW"); return -EOPNOTSUPP; } @@ -1179,11 +1182,13 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, * FDB_FT_CHAIN which is outside tc range. * See mlx5e_rep_setup_ft_cb(). */ + max_chain = mlx5_esw_chains_get_chain_range(esw); if (!mlx5e_is_ft_flow(flow) && attr->chain > max_chain) { NL_SET_ERR_MSG(extack, "Requested chain is out of supported range"); return -EOPNOTSUPP; } + max_prio = mlx5_esw_chains_get_prio_range(esw); if (attr->prio > max_prio) { NL_SET_ERR_MSG(extack, "Requested priority is out of supported range"); return -EOPNOTSUPP; @@ -1805,6 +1810,40 @@ static void *get_match_headers_value(u32 flags, outer_headers); } +static int mlx5e_flower_parse_meta(struct net_device *filter_dev, + struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct net_device *ingress_dev; + struct flow_match_meta match; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) + return 0; + + flow_rule_match_meta(rule, &match); + if (match.mask->ingress_ifindex != 0xFFFFFFFF) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported ingress ifindex mask"); + return -EINVAL; + } + + ingress_dev = __dev_get_by_index(dev_net(filter_dev), + match.key->ingress_ifindex); + if (!ingress_dev) { + NL_SET_ERR_MSG_MOD(extack, + "Can't find the ingress port to match on"); + return -EINVAL; + } + + if (ingress_dev != filter_dev) { + NL_SET_ERR_MSG_MOD(extack, + "Can't match on the ingress filter port"); + return -EINVAL; + } + + return 0; +} + static int __parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, struct flow_cls_offload *f, @@ -1825,6 +1864,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, u16 addr_type = 0; u8 ip_proto = 0; u8 *match_level; + int err; match_level = outer_match_level; @@ -1868,6 +1908,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, spec); } + err = mlx5e_flower_parse_meta(filter_dev, f); + if (err) + return err; + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { struct flow_match_basic match; @@ -2842,6 +2886,10 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, flow_action_for_each(i, act, flow_action) { switch (act->id) { + case FLOW_ACTION_ACCEPT: + action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + break; case FLOW_ACTION_DROP: action |= MLX5_FLOW_CONTEXT_ACTION_DROP; if (MLX5_CAP_FLOWTABLE(priv->mdev, @@ -2999,6 +3047,25 @@ static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info return kmemdup(tun_info, tun_size, GFP_KERNEL); } +static bool is_duplicated_encap_entry(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + int out_index, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < out_index; i++) { + if (flow->encaps[i].e != e) + continue; + NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action"); + netdev_err(priv->netdev, "can't duplicate encap action\n"); + return true; + } + + return false; +} + static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct net_device *mirred_dev, @@ -3034,6 +3101,12 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, /* must verify if encap is valid or not */ if (e) { + /* Check that entry was not already attached to this flow */ + if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) { + err = -EOPNOTSUPP; + goto out_err; + } + mutex_unlock(&esw->offloads.encap_tbl_lock); wait_for_completion(&e->res_ready); @@ -3220,6 +3293,26 @@ bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, same_hw_devs(priv, netdev_priv(out_dev)); } +static bool is_duplicated_output_device(struct net_device *dev, + struct net_device *out_dev, + int *ifindexes, int if_count, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < if_count; i++) { + if (ifindexes[i] == out_dev->ifindex) { + NL_SET_ERR_MSG_MOD(extack, + "can't duplicate output to same device"); + netdev_err(dev, "can't duplicate output to same device: %s\n", + out_dev->name); + return true; + } + } + + return false; +} + static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct flow_action *flow_action, struct mlx5e_tc_flow *flow, @@ -3231,11 +3324,12 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; struct mlx5e_rep_priv *rpriv = priv->ppriv; const struct ip_tunnel_info *info = NULL; + int ifindexes[MLX5_MAX_FLOW_FWD_VPORTS]; bool ft_flow = mlx5e_is_ft_flow(flow); const struct flow_action_entry *act; + int err, i, if_count = 0; bool encap = false; u32 action = 0; - int err, i; if (!flow_action_has_entries(flow_action)) return -EINVAL; @@ -3312,6 +3406,16 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct net_device *uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); struct net_device *uplink_upper; + if (is_duplicated_output_device(priv->netdev, + out_dev, + ifindexes, + if_count, + extack)) + return -EOPNOTSUPP; + + ifindexes[if_count] = out_dev->ifindex; + if_count++; + rcu_read_lock(); uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev); @@ -3406,7 +3510,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, break; case FLOW_ACTION_GOTO: { u32 dest_chain = act->chain_index; - u32 max_chain = mlx5_eswitch_get_chain_range(esw); + u32 max_chain = mlx5_esw_chains_get_chain_range(esw); if (ft_flow) { NL_SET_ERR_MSG_MOD(extack, "Goto action is not supported"); @@ -3980,6 +4084,13 @@ static int apply_police_params(struct mlx5e_priv *priv, u32 rate, u32 rate_mbps; int err; + vport_num = rpriv->rep->vport; + if (vport_num >= MLX5_VPORT_ECPF) { + NL_SET_ERR_MSG_MOD(extack, + "Ingress rate limit is supported only for Eswitch ports connected to VFs"); + return -EOPNOTSUPP; + } + esw = priv->mdev->priv.eswitch; /* rate is given in bytes/sec. * First convert to bits/sec and then round to the nearest mbit/secs. @@ -3988,8 +4099,6 @@ static int apply_police_params(struct mlx5e_priv *priv, u32 rate, * 1 mbit/sec. */ rate_mbps = rate ? max_t(u32, (rate * 8 + 500000) / 1000000, 1) : 0; - vport_num = rpriv->rep->vport; - err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); if (err) NL_SET_ERR_MSG_MOD(extack, "failed applying action to hardware"); @@ -4142,7 +4251,10 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv) return err; tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event; - if (register_netdevice_notifier(&tc->netdevice_nb)) { + err = register_netdevice_notifier_dev_net(priv->netdev, + &tc->netdevice_nb, + &tc->netdevice_nn); + if (err) { tc->netdevice_nb.notifier_call = NULL; mlx5_core_warn(priv->mdev, "Failed to register netdev notifier\n"); } @@ -4164,7 +4276,9 @@ void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) struct mlx5e_tc_table *tc = &priv->fs.tc; if (tc->netdevice_nb.notifier_call) - unregister_netdevice_notifier(&tc->netdevice_nb); + unregister_netdevice_notifier_dev_net(priv->netdev, + &tc->netdevice_nb, + &tc->netdevice_nn); mutex_destroy(&tc->mod_hdr.lock); mutex_destroy(&tc->hairpin_tbl_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 580c71cb9dfa..cccea3a8eddd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -156,7 +156,8 @@ static int mlx5_eq_comp_int(struct notifier_block *nb, cq->comp(cq, eqe); mlx5_cq_put(cq); } else { - mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn); + dev_dbg_ratelimited(eq->dev->device, + "Completion event for bogus CQ 0x%x\n", cqn); } ++eq->cons_index; @@ -563,6 +564,39 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4]) gather_user_async_events(dev, mask); } +static int +setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq, + struct mlx5_eq_param *param, const char *name) +{ + int err; + + eq->irq_nb.notifier_call = mlx5_eq_async_int; + + err = create_async_eq(dev, &eq->core, param); + if (err) { + mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err); + return err; + } + err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb); + if (err) { + mlx5_core_warn(dev, "failed to enable %s EQ %d\n", name, err); + destroy_async_eq(dev, &eq->core); + } + return err; +} + +static void cleanup_async_eq(struct mlx5_core_dev *dev, + struct mlx5_eq_async *eq, const char *name) +{ + int err; + + mlx5_eq_disable(dev, &eq->core, &eq->irq_nb); + err = destroy_async_eq(dev, &eq->core); + if (err) + mlx5_core_err(dev, "failed to destroy %s eq, err(%d)\n", + name, err); +} + static int create_async_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = dev->priv.eq_table; @@ -572,77 +606,45 @@ static int create_async_eqs(struct mlx5_core_dev *dev) MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR); mlx5_eq_notifier_register(dev, &table->cq_err_nb); - table->cmd_eq.irq_nb.notifier_call = mlx5_eq_async_int; param = (struct mlx5_eq_param) { .irq_index = 0, .nent = MLX5_NUM_CMD_EQE, + .mask[0] = 1ull << MLX5_EVENT_TYPE_CMD, }; - - param.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD; - err = create_async_eq(dev, &table->cmd_eq.core, ¶m); - if (err) { - mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err); - goto err0; - } - err = mlx5_eq_enable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb); - if (err) { - mlx5_core_warn(dev, "failed to enable cmd EQ %d\n", err); + err = setup_async_eq(dev, &table->cmd_eq, ¶m, "cmd"); + if (err) goto err1; - } + mlx5_cmd_use_events(dev); - table->async_eq.irq_nb.notifier_call = mlx5_eq_async_int; param = (struct mlx5_eq_param) { .irq_index = 0, .nent = MLX5_NUM_ASYNC_EQE, }; gather_async_events_mask(dev, param.mask); - err = create_async_eq(dev, &table->async_eq.core, ¶m); - if (err) { - mlx5_core_warn(dev, "failed to create async EQ %d\n", err); + err = setup_async_eq(dev, &table->async_eq, ¶m, "async"); + if (err) goto err2; - } - err = mlx5_eq_enable(dev, &table->async_eq.core, - &table->async_eq.irq_nb); - if (err) { - mlx5_core_warn(dev, "failed to enable async EQ %d\n", err); - goto err3; - } - table->pages_eq.irq_nb.notifier_call = mlx5_eq_async_int; param = (struct mlx5_eq_param) { .irq_index = 0, .nent = /* TODO: sriov max_vf + */ 1, + .mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST, }; - param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST; - err = create_async_eq(dev, &table->pages_eq.core, ¶m); - if (err) { - mlx5_core_warn(dev, "failed to create pages EQ %d\n", err); - goto err4; - } - err = mlx5_eq_enable(dev, &table->pages_eq.core, - &table->pages_eq.irq_nb); - if (err) { - mlx5_core_warn(dev, "failed to enable pages EQ %d\n", err); - goto err5; - } + err = setup_async_eq(dev, &table->pages_eq, ¶m, "pages"); + if (err) + goto err3; - return err; + return 0; -err5: - destroy_async_eq(dev, &table->pages_eq.core); -err4: - mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb); err3: - destroy_async_eq(dev, &table->async_eq.core); + cleanup_async_eq(dev, &table->async_eq, "async"); err2: mlx5_cmd_use_polling(dev); - mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb); + cleanup_async_eq(dev, &table->cmd_eq, "cmd"); err1: - destroy_async_eq(dev, &table->cmd_eq.core); -err0: mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); return err; } @@ -650,28 +652,11 @@ err0: static void destroy_async_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = dev->priv.eq_table; - int err; - - mlx5_eq_disable(dev, &table->pages_eq.core, &table->pages_eq.irq_nb); - err = destroy_async_eq(dev, &table->pages_eq.core); - if (err) - mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n", - err); - - mlx5_eq_disable(dev, &table->async_eq.core, &table->async_eq.irq_nb); - err = destroy_async_eq(dev, &table->async_eq.core); - if (err) - mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n", - err); + cleanup_async_eq(dev, &table->pages_eq, "pages"); + cleanup_async_eq(dev, &table->async_eq, "async"); mlx5_cmd_use_polling(dev); - - mlx5_eq_disable(dev, &table->cmd_eq.core, &table->cmd_eq.irq_nb); - err = destroy_async_eq(dev, &table->cmd_eq.core); - if (err) - mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n", - err); - + cleanup_async_eq(dev, &table->cmd_eq, "cmd"); mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 2c965ad0d744..5acf60b1bbfe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -277,6 +277,7 @@ enum { static int esw_create_legacy_vepa_table(struct mlx5_eswitch *esw) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb; @@ -289,8 +290,10 @@ static int esw_create_legacy_vepa_table(struct mlx5_eswitch *esw) } /* num FTE 2, num FG 2 */ - fdb = mlx5_create_auto_grouped_flow_table(root_ns, LEGACY_VEPA_PRIO, - 2, 2, 0, 0); + ft_attr.prio = LEGACY_VEPA_PRIO; + ft_attr.max_fte = 2; + ft_attr.autogroup.max_num_groups = 2; + fdb = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); if (IS_ERR(fdb)) { err = PTR_ERR(fdb); esw_warn(dev, "Failed to create VEPA FDB err %d\n", err); @@ -1928,8 +1931,10 @@ static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw) struct mlx5_vport *vport; int i; - mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { memset(&vport->info, 0, sizeof(vport->info)); + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + } } /* Public E-Switch API */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index ffcff3ba3701..4472710ccc9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -157,7 +157,7 @@ enum offloads_fdb_flags { ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED = BIT(0), }; -extern const unsigned int ESW_POOLS[4]; +struct mlx5_esw_chains_priv; struct mlx5_eswitch_fdb { union { @@ -182,14 +182,7 @@ struct mlx5_eswitch_fdb { struct mlx5_flow_handle *miss_rule_multi; int vlan_push_pop_refcount; - struct { - struct mlx5_flow_table *fdb; - u32 num_rules; - } fdb_prio[FDB_NUM_CHAINS][FDB_TC_MAX_PRIO + 1][FDB_TC_LEVELS_PER_PRIO]; - /* Protects fdb_prio table */ - struct mutex fdb_prio_lock; - - int fdb_left[ARRAY_SIZE(ESW_POOLS)]; + struct mlx5_esw_chains_priv *esw_chains_priv; } offloads; }; u32 flags; @@ -355,15 +348,6 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw, struct mlx5_flow_handle *rule, struct mlx5_esw_flow_attr *attr); -bool -mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw); - -u16 -mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw); - -u32 -mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw); - struct mlx5_flow_handle * mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, u16 vport, struct mlx5_flow_destination *dest); @@ -388,6 +372,11 @@ enum { MLX5_ESW_DEST_ENCAP_VALID = BIT(1), }; +enum { + MLX5_ESW_ATTR_FLAG_VLAN_HANDLED = BIT(0), + MLX5_ESW_ATTR_FLAG_SLOW_PATH = BIT(1), +}; + struct mlx5_esw_flow_attr { struct mlx5_eswitch_rep *in_rep; struct mlx5_core_dev *in_mdev; @@ -401,7 +390,6 @@ struct mlx5_esw_flow_attr { u16 vlan_vid[MLX5_FS_VLAN_DEPTH]; u8 vlan_prio[MLX5_FS_VLAN_DEPTH]; u8 total_vlan; - bool vlan_handled; struct { u32 flags; struct mlx5_eswitch_rep *rep; @@ -416,6 +404,7 @@ struct mlx5_esw_flow_attr { u32 chain; u16 prio; u32 dest_chain; + u32 flags; struct mlx5e_tc_flow_parse_attr *parse_attr; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 243a5440867e..979f13bdc203 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -37,6 +37,7 @@ #include <linux/mlx5/fs.h> #include "mlx5_core.h" #include "eswitch.h" +#include "eswitch_offloads_chains.h" #include "rdma.h" #include "en.h" #include "fs_core.h" @@ -47,10 +48,6 @@ * one for multicast. */ #define MLX5_ESW_MISS_FLOWS (2) - -#define fdb_prio_table(esw, chain, prio, level) \ - (esw)->fdb_table.offloads.fdb_prio[(chain)][(prio)][(level)] - #define UPLINK_REP_INDEX 0 static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, @@ -62,32 +59,6 @@ static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, return &esw->offloads.vport_reps[idx]; } -static struct mlx5_flow_table * -esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level); -static void -esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level); - -bool mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw) -{ - return (!!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)); -} - -u32 mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw) -{ - if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED) - return FDB_TC_MAX_CHAIN; - - return 0; -} - -u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw) -{ - if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED) - return FDB_TC_MAX_PRIO; - - return 1; -} - static bool esw_check_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, const struct mlx5_vport *vport) @@ -175,10 +146,17 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, } if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { - if (attr->dest_chain) { - struct mlx5_flow_table *ft; + struct mlx5_flow_table *ft; - ft = esw_get_prio_table(esw, attr->dest_chain, 1, 0); + if (attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) { + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = mlx5_esw_chains_get_tc_end_ft(esw); + i++; + } else if (attr->dest_chain) { + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + ft = mlx5_esw_chains_get_table(esw, attr->dest_chain, + 1, 0); if (IS_ERR(ft)) { rule = ERR_CAST(ft); goto err_create_goto_table; @@ -223,7 +201,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) flow_act.modify_hdr = attr->modify_hdr; - fdb = esw_get_prio_table(esw, attr->chain, attr->prio, !!split); + fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, + !!split); if (IS_ERR(fdb)) { rule = ERR_CAST(fdb); goto err_esw_get; @@ -242,10 +221,10 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, return rule; err_add_rule: - esw_put_prio_table(esw, attr->chain, attr->prio, !!split); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, !!split); err_esw_get: - if (attr->dest_chain) - esw_put_prio_table(esw, attr->dest_chain, 1, 0); + if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) && attr->dest_chain) + mlx5_esw_chains_put_table(esw, attr->dest_chain, 1, 0); err_create_goto_table: return rule; } @@ -262,13 +241,13 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, struct mlx5_flow_handle *rule; int i; - fast_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 0); + fast_fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, 0); if (IS_ERR(fast_fdb)) { rule = ERR_CAST(fast_fdb); goto err_get_fast; } - fwd_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 1); + fwd_fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, 1); if (IS_ERR(fwd_fdb)) { rule = ERR_CAST(fwd_fdb); goto err_get_fwd; @@ -296,6 +275,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, if (attr->outer_match_level != MLX5_MATCH_NONE) spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i); if (IS_ERR(rule)) @@ -305,9 +285,9 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, return rule; add_err: - esw_put_prio_table(esw, attr->chain, attr->prio, 1); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 1); err_get_fwd: - esw_put_prio_table(esw, attr->chain, attr->prio, 0); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 0); err_get_fast: return rule; } @@ -332,12 +312,13 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw, atomic64_dec(&esw->offloads.num_flows); if (fwd_rule) { - esw_put_prio_table(esw, attr->chain, attr->prio, 1); - esw_put_prio_table(esw, attr->chain, attr->prio, 0); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 1); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 0); } else { - esw_put_prio_table(esw, attr->chain, attr->prio, !!split); + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, + !!split); if (attr->dest_chain) - esw_put_prio_table(esw, attr->dest_chain, 1, 0); + mlx5_esw_chains_put_table(esw, attr->dest_chain, 1, 0); } } @@ -451,7 +432,7 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, if (err) goto unlock; - attr->vlan_handled = false; + attr->flags &= ~MLX5_ESW_ATTR_FLAG_VLAN_HANDLED; vport = esw_vlan_action_get_vport(attr, push, pop); @@ -459,7 +440,7 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, /* tracks VF --> wire rules without vlan push action */ if (attr->dests[0].rep->vport == MLX5_VPORT_UPLINK) { vport->vlan_refcount++; - attr->vlan_handled = true; + attr->flags |= MLX5_ESW_ATTR_FLAG_VLAN_HANDLED; } goto unlock; @@ -490,7 +471,7 @@ skip_set_push: } out: if (!err) - attr->vlan_handled = true; + attr->flags |= MLX5_ESW_ATTR_FLAG_VLAN_HANDLED; unlock: mutex_unlock(&esw->state_lock); return err; @@ -508,7 +489,7 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1)) return 0; - if (!attr->vlan_handled) + if (!(attr->flags & MLX5_ESW_ATTR_FLAG_VLAN_HANDLED)) return 0; push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH); @@ -582,8 +563,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport, dest.vport.num = vport; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, - &flow_act, &dest, 1); + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule)); out: @@ -824,8 +805,8 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw) dest.vport.num = esw->manager_vport; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, - &flow_act, &dest, 1); + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) { err = PTR_ERR(flow_rule); esw_warn(esw->dev, "FDB: Failed to add unicast miss flow rule err %d\n", err); @@ -839,8 +820,8 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw) dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16); dmac_v[0] = 0x01; - flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, - &flow_act, &dest, 1); + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) { err = PTR_ERR(flow_rule); esw_warn(esw->dev, "FDB: Failed to add multicast miss flow rule err %d\n", err); @@ -855,174 +836,6 @@ out: return err; } -#define ESW_OFFLOADS_NUM_GROUPS 4 - -/* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS), - * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated - * for each flow table pool. We can allocate up to 16M of each pool, - * and we keep track of how much we used via put/get_sz_to_pool. - * Firmware doesn't report any of this for now. - * ESW_POOL is expected to be sorted from large to small - */ -#define ESW_SIZE (16 * 1024 * 1024) -const unsigned int ESW_POOLS[4] = { 4 * 1024 * 1024, 1 * 1024 * 1024, - 64 * 1024, 4 * 1024 }; - -static int -get_sz_from_pool(struct mlx5_eswitch *esw) -{ - int sz = 0, i; - - for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) { - if (esw->fdb_table.offloads.fdb_left[i]) { - --esw->fdb_table.offloads.fdb_left[i]; - sz = ESW_POOLS[i]; - break; - } - } - - return sz; -} - -static void -put_sz_to_pool(struct mlx5_eswitch *esw, int sz) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) { - if (sz >= ESW_POOLS[i]) { - ++esw->fdb_table.offloads.fdb_left[i]; - break; - } - } -} - -static struct mlx5_flow_table * -create_next_size_table(struct mlx5_eswitch *esw, - struct mlx5_flow_namespace *ns, - u16 table_prio, - int level, - u32 flags) -{ - struct mlx5_flow_table *fdb; - int sz; - - sz = get_sz_from_pool(esw); - if (!sz) - return ERR_PTR(-ENOSPC); - - fdb = mlx5_create_auto_grouped_flow_table(ns, - table_prio, - sz, - ESW_OFFLOADS_NUM_GROUPS, - level, - flags); - if (IS_ERR(fdb)) { - esw_warn(esw->dev, "Failed to create FDB Table err %d (table prio: %d, level: %d, size: %d)\n", - (int)PTR_ERR(fdb), table_prio, level, sz); - put_sz_to_pool(esw, sz); - } - - return fdb; -} - -static struct mlx5_flow_table * -esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level) -{ - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_table *fdb = NULL; - struct mlx5_flow_namespace *ns; - int table_prio, l = 0; - u32 flags = 0; - - if (chain == FDB_TC_SLOW_PATH_CHAIN) - return esw->fdb_table.offloads.slow_fdb; - - mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock); - - fdb = fdb_prio_table(esw, chain, prio, level).fdb; - if (fdb) { - /* take ref on earlier levels as well */ - while (level >= 0) - fdb_prio_table(esw, chain, prio, level--).num_rules++; - mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock); - return fdb; - } - - ns = mlx5_get_fdb_sub_ns(dev, chain); - if (!ns) { - esw_warn(dev, "Failed to get FDB sub namespace\n"); - mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock); - return ERR_PTR(-EOPNOTSUPP); - } - - if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) - flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | - MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); - - table_prio = prio - 1; - - /* create earlier levels for correct fs_core lookup when - * connecting tables - */ - for (l = 0; l <= level; l++) { - if (fdb_prio_table(esw, chain, prio, l).fdb) { - fdb_prio_table(esw, chain, prio, l).num_rules++; - continue; - } - - fdb = create_next_size_table(esw, ns, table_prio, l, flags); - if (IS_ERR(fdb)) { - l--; - goto err_create_fdb; - } - - fdb_prio_table(esw, chain, prio, l).fdb = fdb; - fdb_prio_table(esw, chain, prio, l).num_rules = 1; - } - - mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock); - return fdb; - -err_create_fdb: - mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock); - if (l >= 0) - esw_put_prio_table(esw, chain, prio, l); - - return fdb; -} - -static void -esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level) -{ - int l; - - if (chain == FDB_TC_SLOW_PATH_CHAIN) - return; - - mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock); - - for (l = level; l >= 0; l--) { - if (--(fdb_prio_table(esw, chain, prio, l).num_rules) > 0) - continue; - - put_sz_to_pool(esw, fdb_prio_table(esw, chain, prio, l).fdb->max_fte); - mlx5_destroy_flow_table(fdb_prio_table(esw, chain, prio, l).fdb); - fdb_prio_table(esw, chain, prio, l).fdb = NULL; - } - - mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock); -} - -static void esw_destroy_offloads_fast_fdb_tables(struct mlx5_eswitch *esw) -{ - /* If lazy creation isn't supported, deref the fast path tables */ - if (!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)) { - esw_put_prio_table(esw, 0, 1, 1); - esw_put_prio_table(esw, 0, 1, 0); - } -} - #define MAX_PF_SQ 256 #define MAX_SQ_NVPORTS 32 @@ -1055,16 +868,16 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports) int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; - u32 *flow_group_in, max_flow_counter; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb = NULL; - int table_size, ix, err = 0, i; + u32 flags = 0, *flow_group_in; + int table_size, ix, err = 0; struct mlx5_flow_group *g; - u32 flags = 0, fdb_max; void *match_criteria; u8 *dmac; esw_debug(esw->dev, "Create offloads FDB Tables\n"); + flow_group_in = kvzalloc(inlen, GFP_KERNEL); if (!flow_group_in) return -ENOMEM; @@ -1083,19 +896,6 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports) goto ns_err; } - max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | - MLX5_CAP_GEN(dev, max_flow_counter_15_0); - fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size); - - esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d), groups(%d), max flow table size(%d))\n", - MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size), - max_flow_counter, ESW_OFFLOADS_NUM_GROUPS, - fdb_max); - - for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) - esw->fdb_table.offloads.fdb_left[i] = - ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0; - table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + MLX5_ESW_MISS_FLOWS + esw->total_vports; @@ -1118,16 +918,10 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports) } esw->fdb_table.offloads.slow_fdb = fdb; - /* If lazy creation isn't supported, open the fast path tables now */ - if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) && - esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) { - esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED; - esw_warn(dev, "Lazy creation of flow tables isn't supported, ignoring priorities\n"); - esw_get_prio_table(esw, 0, 1, 0); - esw_get_prio_table(esw, 0, 1, 1); - } else { - esw_debug(dev, "Lazy creation of flow tables supported, deferring table opening\n"); - esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED; + err = mlx5_esw_chains_create(esw); + if (err) { + esw_warn(dev, "Failed to create fdb chains err(%d)\n", err); + goto fdb_chains_err; } /* create send-to-vport group */ @@ -1218,7 +1012,8 @@ miss_err: peer_miss_err: mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); send_vport_err: - esw_destroy_offloads_fast_fdb_tables(esw); + mlx5_esw_chains_destroy(esw); +fdb_chains_err: mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb); slow_fdb_err: /* Holds true only as long as DMFS is the default */ @@ -1240,8 +1035,8 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw) mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp); mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); + mlx5_esw_chains_destroy(esw); mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb); - esw_destroy_offloads_fast_fdb_tables(esw); /* Holds true only as long as DMFS is the default */ mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns, MLX5_FLOW_STEERING_MODE_DMFS); @@ -1377,7 +1172,7 @@ static int esw_offloads_start(struct mlx5_eswitch *esw, return -EINVAL; } - mlx5_eswitch_disable(esw, false); + mlx5_eswitch_disable(esw, true); mlx5_eswitch_update_num_of_vfs(esw, esw->dev->priv.sriov.num_vfs); err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_OFFLOADS); if (err) { @@ -2111,7 +1906,6 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) total_vports = num_vfs + MLX5_SPECIAL_VPORTS(esw->dev); memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb)); - mutex_init(&esw->fdb_table.offloads.fdb_prio_lock); err = esw_create_uplink_offloads_acl_tables(esw); if (err) @@ -2220,7 +2014,8 @@ int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type int esw_offloads_enable(struct mlx5_eswitch *esw) { - int err; + struct mlx5_vport *vport; + int err, i; if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat) && MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, decap)) @@ -2237,6 +2032,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_vport_metadata; + /* Representor will control the vport link state */ + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN; + err = mlx5_eswitch_enable_pf_vf_vports(esw, MLX5_VPORT_UC_ADDR_CHANGE); if (err) goto err_vports; @@ -2266,7 +2065,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw, { int err, err1; - mlx5_eswitch_disable(esw, false); + mlx5_eswitch_disable(esw, true); err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_LEGACY); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c new file mode 100644 index 000000000000..c5a446e295aa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c @@ -0,0 +1,758 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2020 Mellanox Technologies. + +#include <linux/mlx5/driver.h> +#include <linux/mlx5/mlx5_ifc.h> +#include <linux/mlx5/fs.h> + +#include "eswitch_offloads_chains.h" +#include "mlx5_core.h" +#include "fs_core.h" +#include "eswitch.h" +#include "en.h" + +#define esw_chains_priv(esw) ((esw)->fdb_table.offloads.esw_chains_priv) +#define esw_chains_lock(esw) (esw_chains_priv(esw)->lock) +#define esw_chains_ht(esw) (esw_chains_priv(esw)->chains_ht) +#define esw_prios_ht(esw) (esw_chains_priv(esw)->prios_ht) +#define fdb_pool_left(esw) (esw_chains_priv(esw)->fdb_left) +#define tc_slow_fdb(esw) ((esw)->fdb_table.offloads.slow_fdb) +#define tc_end_fdb(esw) (esw_chains_priv(esw)->tc_end_fdb) +#define fdb_ignore_flow_level_supported(esw) \ + (MLX5_CAP_ESW_FLOWTABLE_FDB((esw)->dev, ignore_flow_level)) + +#define ESW_OFFLOADS_NUM_GROUPS 4 + +/* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS), + * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated + * for each flow table pool. We can allocate up to 16M of each pool, + * and we keep track of how much we used via get_next_avail_sz_from_pool. + * Firmware doesn't report any of this for now. + * ESW_POOL is expected to be sorted from large to small and match firmware + * pools. + */ +#define ESW_SIZE (16 * 1024 * 1024) +static const unsigned int ESW_POOLS[] = { 4 * 1024 * 1024, + 1 * 1024 * 1024, + 64 * 1024, + 4 * 1024, }; + +struct mlx5_esw_chains_priv { + struct rhashtable chains_ht; + struct rhashtable prios_ht; + /* Protects above chains_ht and prios_ht */ + struct mutex lock; + + struct mlx5_flow_table *tc_end_fdb; + + int fdb_left[ARRAY_SIZE(ESW_POOLS)]; +}; + +struct fdb_chain { + struct rhash_head node; + + u32 chain; + + int ref; + + struct mlx5_eswitch *esw; + struct list_head prios_list; +}; + +struct fdb_prio_key { + u32 chain; + u32 prio; + u32 level; +}; + +struct fdb_prio { + struct rhash_head node; + struct list_head list; + + struct fdb_prio_key key; + + int ref; + + struct fdb_chain *fdb_chain; + struct mlx5_flow_table *fdb; + struct mlx5_flow_table *next_fdb; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_handle *miss_rule; +}; + +static const struct rhashtable_params chain_params = { + .head_offset = offsetof(struct fdb_chain, node), + .key_offset = offsetof(struct fdb_chain, chain), + .key_len = sizeof_field(struct fdb_chain, chain), + .automatic_shrinking = true, +}; + +static const struct rhashtable_params prio_params = { + .head_offset = offsetof(struct fdb_prio, node), + .key_offset = offsetof(struct fdb_prio, key), + .key_len = sizeof_field(struct fdb_prio, key), + .automatic_shrinking = true, +}; + +bool mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw) +{ + return esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED; +} + +u32 mlx5_esw_chains_get_chain_range(struct mlx5_eswitch *esw) +{ + if (!mlx5_esw_chains_prios_supported(esw)) + return 1; + + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX - 1; + + return FDB_TC_MAX_CHAIN; +} + +u32 mlx5_esw_chains_get_ft_chain(struct mlx5_eswitch *esw) +{ + return mlx5_esw_chains_get_chain_range(esw) + 1; +} + +u32 mlx5_esw_chains_get_prio_range(struct mlx5_eswitch *esw) +{ + if (!mlx5_esw_chains_prios_supported(esw)) + return 1; + + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX; + + return FDB_TC_MAX_PRIO; +} + +static unsigned int mlx5_esw_chains_get_level_range(struct mlx5_eswitch *esw) +{ + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX; + + return FDB_TC_LEVELS_PER_PRIO; +} + +#define POOL_NEXT_SIZE 0 +static int +mlx5_esw_chains_get_avail_sz_from_pool(struct mlx5_eswitch *esw, + int desired_size) +{ + int i, found_i = -1; + + for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--) { + if (fdb_pool_left(esw)[i] && ESW_POOLS[i] > desired_size) { + found_i = i; + if (desired_size != POOL_NEXT_SIZE) + break; + } + } + + if (found_i != -1) { + --fdb_pool_left(esw)[found_i]; + return ESW_POOLS[found_i]; + } + + return 0; +} + +static void +mlx5_esw_chains_put_sz_to_pool(struct mlx5_eswitch *esw, int sz) +{ + int i; + + for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--) { + if (sz == ESW_POOLS[i]) { + ++fdb_pool_left(esw)[i]; + return; + } + } + + WARN_ONCE(1, "Couldn't find size %d in fdb size pool", sz); +} + +static void +mlx5_esw_chains_init_sz_pool(struct mlx5_eswitch *esw) +{ + u32 fdb_max; + int i; + + fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, log_max_ft_size); + + for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--) + fdb_pool_left(esw)[i] = + ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0; +} + +static struct mlx5_flow_table * +mlx5_esw_chains_create_fdb_table(struct mlx5_eswitch *esw, + u32 chain, u32 prio, u32 level) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *fdb; + int sz; + + if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) + ft_attr.flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + + sz = mlx5_esw_chains_get_avail_sz_from_pool(esw, POOL_NEXT_SIZE); + if (!sz) + return ERR_PTR(-ENOSPC); + ft_attr.max_fte = sz; + + /* We use tc_slow_fdb(esw) as the table's next_ft till + * ignore_flow_level is allowed on FT creation and not just for FTEs. + * Instead caller should add an explicit miss rule if needed. + */ + ft_attr.next_ft = tc_slow_fdb(esw); + + /* The root table(chain 0, prio 1, level 0) is required to be + * connected to the previous prio (FDB_BYPASS_PATH if exists). + * We always create it, as a managed table, in order to align with + * fs_core logic. + */ + if (!fdb_ignore_flow_level_supported(esw) || + (chain == 0 && prio == 1 && level == 0)) { + ft_attr.level = level; + ft_attr.prio = prio - 1; + ns = mlx5_get_fdb_sub_ns(esw->dev, chain); + } else { + ft_attr.flags |= MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.prio = FDB_TC_OFFLOAD; + /* Firmware doesn't allow us to create another level 0 table, + * so we create all unmanaged tables as level 1. + * + * To connect them, we use explicit miss rules with + * ignore_flow_level. Caller is responsible to create + * these rules (if needed). + */ + ft_attr.level = 1; + ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_FDB); + } + + ft_attr.autogroup.num_reserved_entries = 2; + ft_attr.autogroup.max_num_groups = ESW_OFFLOADS_NUM_GROUPS; + fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(fdb)) { + esw_warn(esw->dev, + "Failed to create FDB table err %d (chain: %d, prio: %d, level: %d, size: %d)\n", + (int)PTR_ERR(fdb), chain, prio, level, sz); + mlx5_esw_chains_put_sz_to_pool(esw, sz); + return fdb; + } + + return fdb; +} + +static void +mlx5_esw_chains_destroy_fdb_table(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb) +{ + mlx5_esw_chains_put_sz_to_pool(esw, fdb->max_fte); + mlx5_destroy_flow_table(fdb); +} + +static struct fdb_chain * +mlx5_esw_chains_create_fdb_chain(struct mlx5_eswitch *esw, u32 chain) +{ + struct fdb_chain *fdb_chain = NULL; + int err; + + fdb_chain = kvzalloc(sizeof(*fdb_chain), GFP_KERNEL); + if (!fdb_chain) + return ERR_PTR(-ENOMEM); + + fdb_chain->esw = esw; + fdb_chain->chain = chain; + INIT_LIST_HEAD(&fdb_chain->prios_list); + + err = rhashtable_insert_fast(&esw_chains_ht(esw), &fdb_chain->node, + chain_params); + if (err) + goto err_insert; + + return fdb_chain; + +err_insert: + kvfree(fdb_chain); + return ERR_PTR(err); +} + +static void +mlx5_esw_chains_destroy_fdb_chain(struct fdb_chain *fdb_chain) +{ + struct mlx5_eswitch *esw = fdb_chain->esw; + + rhashtable_remove_fast(&esw_chains_ht(esw), &fdb_chain->node, + chain_params); + kvfree(fdb_chain); +} + +static struct fdb_chain * +mlx5_esw_chains_get_fdb_chain(struct mlx5_eswitch *esw, u32 chain) +{ + struct fdb_chain *fdb_chain; + + fdb_chain = rhashtable_lookup_fast(&esw_chains_ht(esw), &chain, + chain_params); + if (!fdb_chain) { + fdb_chain = mlx5_esw_chains_create_fdb_chain(esw, chain); + if (IS_ERR(fdb_chain)) + return fdb_chain; + } + + fdb_chain->ref++; + + return fdb_chain; +} + +static struct mlx5_flow_handle * +mlx5_esw_chains_add_miss_rule(struct mlx5_flow_table *fdb, + struct mlx5_flow_table *next_fdb) +{ + static const struct mlx5_flow_spec spec = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act act = {}; + + act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL | FLOW_ACT_NO_APPEND; + act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = next_fdb; + + return mlx5_add_flow_rules(fdb, &spec, &act, &dest, 1); +} + +static int +mlx5_esw_chains_update_prio_prevs(struct fdb_prio *fdb_prio, + struct mlx5_flow_table *next_fdb) +{ + struct mlx5_flow_handle *miss_rules[FDB_TC_LEVELS_PER_PRIO + 1] = {}; + struct fdb_chain *fdb_chain = fdb_prio->fdb_chain; + struct fdb_prio *pos; + int n = 0, err; + + if (fdb_prio->key.level) + return 0; + + /* Iterate in reverse order until reaching the level 0 rule of + * the previous priority, adding all the miss rules first, so we can + * revert them if any of them fails. + */ + pos = fdb_prio; + list_for_each_entry_continue_reverse(pos, + &fdb_chain->prios_list, + list) { + miss_rules[n] = mlx5_esw_chains_add_miss_rule(pos->fdb, + next_fdb); + if (IS_ERR(miss_rules[n])) { + err = PTR_ERR(miss_rules[n]); + goto err_prev_rule; + } + + n++; + if (!pos->key.level) + break; + } + + /* Success, delete old miss rules, and update the pointers. */ + n = 0; + pos = fdb_prio; + list_for_each_entry_continue_reverse(pos, + &fdb_chain->prios_list, + list) { + mlx5_del_flow_rules(pos->miss_rule); + + pos->miss_rule = miss_rules[n]; + pos->next_fdb = next_fdb; + + n++; + if (!pos->key.level) + break; + } + + return 0; + +err_prev_rule: + while (--n >= 0) + mlx5_del_flow_rules(miss_rules[n]); + + return err; +} + +static void +mlx5_esw_chains_put_fdb_chain(struct fdb_chain *fdb_chain) +{ + if (--fdb_chain->ref == 0) + mlx5_esw_chains_destroy_fdb_chain(fdb_chain); +} + +static struct fdb_prio * +mlx5_esw_chains_create_fdb_prio(struct mlx5_eswitch *esw, + u32 chain, u32 prio, u32 level) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_handle *miss_rule = NULL; + struct mlx5_flow_group *miss_group; + struct fdb_prio *fdb_prio = NULL; + struct mlx5_flow_table *next_fdb; + struct fdb_chain *fdb_chain; + struct mlx5_flow_table *fdb; + struct list_head *pos; + u32 *flow_group_in; + int err; + + fdb_chain = mlx5_esw_chains_get_fdb_chain(esw, chain); + if (IS_ERR(fdb_chain)) + return ERR_CAST(fdb_chain); + + fdb_prio = kvzalloc(sizeof(*fdb_prio), GFP_KERNEL); + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!fdb_prio || !flow_group_in) { + err = -ENOMEM; + goto err_alloc; + } + + /* Chain's prio list is sorted by prio and level. + * And all levels of some prio point to the next prio's level 0. + * Example list (prio, level): + * (3,0)->(3,1)->(5,0)->(5,1)->(6,1)->(7,0) + * In hardware, we will we have the following pointers: + * (3,0) -> (5,0) -> (7,0) -> Slow path + * (3,1) -> (5,0) + * (5,1) -> (7,0) + * (6,1) -> (7,0) + */ + + /* Default miss for each chain: */ + next_fdb = (chain == mlx5_esw_chains_get_ft_chain(esw)) ? + tc_slow_fdb(esw) : + tc_end_fdb(esw); + list_for_each(pos, &fdb_chain->prios_list) { + struct fdb_prio *p = list_entry(pos, struct fdb_prio, list); + + /* exit on first pos that is larger */ + if (prio < p->key.prio || (prio == p->key.prio && + level < p->key.level)) { + /* Get next level 0 table */ + next_fdb = p->key.level == 0 ? p->fdb : p->next_fdb; + break; + } + } + + fdb = mlx5_esw_chains_create_fdb_table(esw, chain, prio, level); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + goto err_create; + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + fdb->max_fte - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + fdb->max_fte - 1); + miss_group = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(miss_group)) { + err = PTR_ERR(miss_group); + goto err_group; + } + + /* Add miss rule to next_fdb */ + miss_rule = mlx5_esw_chains_add_miss_rule(fdb, next_fdb); + if (IS_ERR(miss_rule)) { + err = PTR_ERR(miss_rule); + goto err_miss_rule; + } + + fdb_prio->miss_group = miss_group; + fdb_prio->miss_rule = miss_rule; + fdb_prio->next_fdb = next_fdb; + fdb_prio->fdb_chain = fdb_chain; + fdb_prio->key.chain = chain; + fdb_prio->key.prio = prio; + fdb_prio->key.level = level; + fdb_prio->fdb = fdb; + + err = rhashtable_insert_fast(&esw_prios_ht(esw), &fdb_prio->node, + prio_params); + if (err) + goto err_insert; + + list_add(&fdb_prio->list, pos->prev); + + /* Table is ready, connect it */ + err = mlx5_esw_chains_update_prio_prevs(fdb_prio, fdb); + if (err) + goto err_update; + + kvfree(flow_group_in); + return fdb_prio; + +err_update: + list_del(&fdb_prio->list); + rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node, + prio_params); +err_insert: + mlx5_del_flow_rules(miss_rule); +err_miss_rule: + mlx5_destroy_flow_group(miss_group); +err_group: + mlx5_esw_chains_destroy_fdb_table(esw, fdb); +err_create: +err_alloc: + kvfree(fdb_prio); + kvfree(flow_group_in); + mlx5_esw_chains_put_fdb_chain(fdb_chain); + return ERR_PTR(err); +} + +static void +mlx5_esw_chains_destroy_fdb_prio(struct mlx5_eswitch *esw, + struct fdb_prio *fdb_prio) +{ + struct fdb_chain *fdb_chain = fdb_prio->fdb_chain; + + WARN_ON(mlx5_esw_chains_update_prio_prevs(fdb_prio, + fdb_prio->next_fdb)); + + list_del(&fdb_prio->list); + rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node, + prio_params); + mlx5_del_flow_rules(fdb_prio->miss_rule); + mlx5_destroy_flow_group(fdb_prio->miss_group); + mlx5_esw_chains_destroy_fdb_table(esw, fdb_prio->fdb); + mlx5_esw_chains_put_fdb_chain(fdb_chain); + kvfree(fdb_prio); +} + +struct mlx5_flow_table * +mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level) +{ + struct mlx5_flow_table *prev_fts; + struct fdb_prio *fdb_prio; + struct fdb_prio_key key; + int l = 0; + + if ((chain > mlx5_esw_chains_get_chain_range(esw) && + chain != mlx5_esw_chains_get_ft_chain(esw)) || + prio > mlx5_esw_chains_get_prio_range(esw) || + level > mlx5_esw_chains_get_level_range(esw)) + return ERR_PTR(-EOPNOTSUPP); + + /* create earlier levels for correct fs_core lookup when + * connecting tables. + */ + for (l = 0; l < level; l++) { + prev_fts = mlx5_esw_chains_get_table(esw, chain, prio, l); + if (IS_ERR(prev_fts)) { + fdb_prio = ERR_CAST(prev_fts); + goto err_get_prevs; + } + } + + key.chain = chain; + key.prio = prio; + key.level = level; + + mutex_lock(&esw_chains_lock(esw)); + fdb_prio = rhashtable_lookup_fast(&esw_prios_ht(esw), &key, + prio_params); + if (!fdb_prio) { + fdb_prio = mlx5_esw_chains_create_fdb_prio(esw, chain, + prio, level); + if (IS_ERR(fdb_prio)) + goto err_create_prio; + } + + ++fdb_prio->ref; + mutex_unlock(&esw_chains_lock(esw)); + + return fdb_prio->fdb; + +err_create_prio: + mutex_unlock(&esw_chains_lock(esw)); +err_get_prevs: + while (--l >= 0) + mlx5_esw_chains_put_table(esw, chain, prio, l); + return ERR_CAST(fdb_prio); +} + +void +mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level) +{ + struct fdb_prio *fdb_prio; + struct fdb_prio_key key; + + key.chain = chain; + key.prio = prio; + key.level = level; + + mutex_lock(&esw_chains_lock(esw)); + fdb_prio = rhashtable_lookup_fast(&esw_prios_ht(esw), &key, + prio_params); + if (!fdb_prio) + goto err_get_prio; + + if (--fdb_prio->ref == 0) + mlx5_esw_chains_destroy_fdb_prio(esw, fdb_prio); + mutex_unlock(&esw_chains_lock(esw)); + + while (level-- > 0) + mlx5_esw_chains_put_table(esw, chain, prio, level); + + return; + +err_get_prio: + mutex_unlock(&esw_chains_lock(esw)); + WARN_ONCE(1, + "Couldn't find table: (chain: %d prio: %d level: %d)", + chain, prio, level); +} + +struct mlx5_flow_table * +mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw) +{ + return tc_end_fdb(esw); +} + +static int +mlx5_esw_chains_init(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_chains_priv *chains_priv; + struct mlx5_core_dev *dev = esw->dev; + u32 max_flow_counter, fdb_max; + int err; + + chains_priv = kzalloc(sizeof(*chains_priv), GFP_KERNEL); + if (!chains_priv) + return -ENOMEM; + esw_chains_priv(esw) = chains_priv; + + max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); + fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size); + + esw_debug(dev, + "Init esw offloads chains, max counters(%d), groups(%d), max flow table size(%d)\n", + max_flow_counter, ESW_OFFLOADS_NUM_GROUPS, fdb_max); + + mlx5_esw_chains_init_sz_pool(esw); + + if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) && + esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) { + esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED; + esw_warn(dev, "Tc chains and priorities offload aren't supported, update firmware if needed\n"); + } else { + esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED; + esw_info(dev, "Supported tc offload range - chains: %u, prios: %u\n", + mlx5_esw_chains_get_chain_range(esw), + mlx5_esw_chains_get_prio_range(esw)); + } + + err = rhashtable_init(&esw_chains_ht(esw), &chain_params); + if (err) + goto init_chains_ht_err; + + err = rhashtable_init(&esw_prios_ht(esw), &prio_params); + if (err) + goto init_prios_ht_err; + + mutex_init(&esw_chains_lock(esw)); + + return 0; + +init_prios_ht_err: + rhashtable_destroy(&esw_chains_ht(esw)); +init_chains_ht_err: + kfree(chains_priv); + return err; +} + +static void +mlx5_esw_chains_cleanup(struct mlx5_eswitch *esw) +{ + mutex_destroy(&esw_chains_lock(esw)); + rhashtable_destroy(&esw_prios_ht(esw)); + rhashtable_destroy(&esw_chains_ht(esw)); + + kfree(esw_chains_priv(esw)); +} + +static int +mlx5_esw_chains_open(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table *ft; + int err; + + /* Create tc_end_fdb(esw) which is the always created ft chain */ + ft = mlx5_esw_chains_get_table(esw, mlx5_esw_chains_get_ft_chain(esw), + 1, 0); + if (IS_ERR(ft)) + return PTR_ERR(ft); + + tc_end_fdb(esw) = ft; + + /* Always open the root for fast path */ + ft = mlx5_esw_chains_get_table(esw, 0, 1, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto level_0_err; + } + + /* Open level 1 for split rules now if prios isn't supported */ + if (!mlx5_esw_chains_prios_supported(esw)) { + ft = mlx5_esw_chains_get_table(esw, 0, 1, 1); + + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto level_1_err; + } + } + + return 0; + +level_1_err: + mlx5_esw_chains_put_table(esw, 0, 1, 0); +level_0_err: + mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0); + return err; +} + +static void +mlx5_esw_chains_close(struct mlx5_eswitch *esw) +{ + if (!mlx5_esw_chains_prios_supported(esw)) + mlx5_esw_chains_put_table(esw, 0, 1, 1); + mlx5_esw_chains_put_table(esw, 0, 1, 0); + mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0); +} + +int +mlx5_esw_chains_create(struct mlx5_eswitch *esw) +{ + int err; + + err = mlx5_esw_chains_init(esw); + if (err) + return err; + + err = mlx5_esw_chains_open(esw); + if (err) + goto err_open; + + return 0; + +err_open: + mlx5_esw_chains_cleanup(esw); + return err; +} + +void +mlx5_esw_chains_destroy(struct mlx5_eswitch *esw) +{ + mlx5_esw_chains_close(esw); + mlx5_esw_chains_cleanup(esw); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h new file mode 100644 index 000000000000..2e13097fe348 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. */ + +#ifndef __ML5_ESW_CHAINS_H__ +#define __ML5_ESW_CHAINS_H__ + +bool +mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw); +u32 +mlx5_esw_chains_get_prio_range(struct mlx5_eswitch *esw); +u32 +mlx5_esw_chains_get_chain_range(struct mlx5_eswitch *esw); +u32 +mlx5_esw_chains_get_ft_chain(struct mlx5_eswitch *esw); + +struct mlx5_flow_table * +mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level); +void +mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level); + +struct mlx5_flow_table * +mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw); + +int mlx5_esw_chains_create(struct mlx5_eswitch *esw); +void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw); + +#endif /* __ML5_ESW_CHAINS_H__ */ + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c index 366bda1bb1c3..dc08ed9339ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c @@ -50,8 +50,8 @@ mlx5_eswitch_termtbl_create(struct mlx5_core_dev *dev, struct mlx5_flow_act *flow_act) { static const struct mlx5_flow_spec spec = {}; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *root_ns; - int prio, flags; int err; root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); @@ -63,10 +63,11 @@ mlx5_eswitch_termtbl_create(struct mlx5_core_dev *dev, /* As this is the terminating action then the termination table is the * same prio as the slow path */ - prio = FDB_SLOW_PATH; - flags = MLX5_FLOW_TABLE_TERMINATION; - tt->termtbl = mlx5_create_auto_grouped_flow_table(root_ns, prio, 1, 1, - 0, flags); + ft_attr.flags = MLX5_FLOW_TABLE_TERMINATION; + ft_attr.prio = FDB_SLOW_PATH; + ft_attr.max_fte = 1; + ft_attr.autogroup.max_num_groups = 1; + tt->termtbl = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); if (IS_ERR(tt->termtbl)) { esw_warn(dev, "Failed to create termination table\n"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 3c816e81f8d9..b25465d9e030 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -432,6 +432,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(set_fte_in, in, table_type, ft->type); MLX5_SET(set_fte_in, in, table_id, ft->id); MLX5_SET(set_fte_in, in, flow_index, fte->index); + MLX5_SET(set_fte_in, in, ignore_flow_level, + !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL)); + if (ft->vport) { MLX5_SET(set_fte_in, in, vport_number, ft->vport); MLX5_SET(set_fte_in, in, other_vport, 1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 9a48c4310887..c7a16ae05fa8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -531,16 +531,9 @@ static void del_hw_fte(struct fs_node *node) } } -static void del_sw_fte_rcu(struct rcu_head *head) -{ - struct fs_fte *fte = container_of(head, struct fs_fte, rcu); - struct mlx5_flow_steering *steering = get_steering(&fte->node); - - kmem_cache_free(steering->ftes_cache, fte); -} - static void del_sw_fte(struct fs_node *node) { + struct mlx5_flow_steering *steering = get_steering(node); struct mlx5_flow_group *fg; struct fs_fte *fte; int err; @@ -553,8 +546,7 @@ static void del_sw_fte(struct fs_node *node) rhash_fte); WARN_ON(err); ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index); - - call_rcu(&fte->rcu, del_sw_fte_rcu); + kmem_cache_free(steering->ftes_cache, fte); } static void del_hw_flow_group(struct fs_node *node) @@ -587,7 +579,9 @@ static void del_sw_flow_group(struct fs_node *node) rhashtable_destroy(&fg->ftes_hash); ida_destroy(&fg->fte_allocator); - if (ft->autogroup.active && fg->max_ftes == ft->autogroup.group_size) + if (ft->autogroup.active && + fg->max_ftes == ft->autogroup.group_size && + fg->start_index < ft->autogroup.max_fte) ft->autogroup.num_groups--; err = rhltable_remove(&ft->fgs_hash, &fg->hash, @@ -1014,7 +1008,8 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa u16 vport) { struct mlx5_flow_root_namespace *root = find_root(&ns->node); - struct mlx5_flow_table *next_ft = NULL; + bool unmanaged = ft_attr->flags & MLX5_FLOW_TABLE_UNMANAGED; + struct mlx5_flow_table *next_ft; struct fs_prio *fs_prio = NULL; struct mlx5_flow_table *ft; int log_table_sz; @@ -1031,14 +1026,21 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa err = -EINVAL; goto unlock_root; } - if (ft_attr->level >= fs_prio->num_levels) { - err = -ENOSPC; - goto unlock_root; + if (!unmanaged) { + /* The level is related to the + * priority level range. + */ + if (ft_attr->level >= fs_prio->num_levels) { + err = -ENOSPC; + goto unlock_root; + } + + ft_attr->level += fs_prio->start_level; } + /* The level is related to the * priority level range. */ - ft_attr->level += fs_prio->start_level; ft = alloc_flow_table(ft_attr->level, vport, ft_attr->max_fte ? roundup_pow_of_two(ft_attr->max_fte) : 0, @@ -1051,19 +1053,27 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table); log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0; - next_ft = find_next_chained_ft(fs_prio); + next_ft = unmanaged ? ft_attr->next_ft : + find_next_chained_ft(fs_prio); ft->def_miss_action = ns->def_miss_action; err = root->cmds->create_flow_table(root, ft, log_table_sz, next_ft); if (err) goto free_ft; - err = connect_flow_table(root->dev, ft, fs_prio); - if (err) - goto destroy_ft; + if (!unmanaged) { + err = connect_flow_table(root->dev, ft, fs_prio); + if (err) + goto destroy_ft; + } + ft->node.active = true; down_write_ref_node(&fs_prio->node, false); - tree_add_node(&ft->node, &fs_prio->node); - list_add_flow_table(ft, fs_prio); + if (!unmanaged) { + tree_add_node(&ft->node, &fs_prio->node); + list_add_flow_table(ft, fs_prio); + } else { + ft->node.root = fs_prio->node.root; + } fs_prio->num_ft++; up_write_ref_node(&fs_prio->node, false); mutex_unlock(&root->chain_lock); @@ -1111,31 +1121,27 @@ EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); struct mlx5_flow_table* mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - int max_num_groups, - u32 level, - u32 flags) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; + int num_reserved_entries = ft_attr->autogroup.num_reserved_entries; + int autogroups_max_fte = ft_attr->max_fte - num_reserved_entries; + int max_num_groups = ft_attr->autogroup.max_num_groups; struct mlx5_flow_table *ft; - if (max_num_groups > num_flow_table_entries) + if (max_num_groups > autogroups_max_fte) + return ERR_PTR(-EINVAL); + if (num_reserved_entries > ft_attr->max_fte) return ERR_PTR(-EINVAL); - ft_attr.max_fte = num_flow_table_entries; - ft_attr.prio = prio; - ft_attr.level = level; - ft_attr.flags = flags; - - ft = mlx5_create_flow_table(ns, &ft_attr); + ft = mlx5_create_flow_table(ns, ft_attr); if (IS_ERR(ft)) return ft; ft->autogroup.active = true; ft->autogroup.required_groups = max_num_groups; + ft->autogroup.max_fte = autogroups_max_fte; /* We save place for flow groups in addition to max types */ - ft->autogroup.group_size = ft->max_fte / (max_num_groups + 1); + ft->autogroup.group_size = autogroups_max_fte / (max_num_groups + 1); return ft; } @@ -1157,7 +1163,7 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, struct mlx5_flow_group *fg; int err; - if (ft->autogroup.active) + if (ft->autogroup.active && start_index < ft->autogroup.max_fte) return ERR_PTR(-EPERM); down_write_ref_node(&ft->node, false); @@ -1330,9 +1336,10 @@ static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft const struct mlx5_flow_spec *spec) { struct list_head *prev = &ft->node.children; - struct mlx5_flow_group *fg; + u32 max_fte = ft->autogroup.max_fte; unsigned int candidate_index = 0; unsigned int group_size = 0; + struct mlx5_flow_group *fg; if (!ft->autogroup.active) return ERR_PTR(-ENOENT); @@ -1340,7 +1347,7 @@ static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft if (ft->autogroup.num_groups < ft->autogroup.required_groups) group_size = ft->autogroup.group_size; - /* ft->max_fte == ft->autogroup.max_types */ + /* max_fte == ft->autogroup.max_types */ if (group_size == 0) group_size = 1; @@ -1353,7 +1360,7 @@ static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft prev = &fg->node.list; } - if (candidate_index + group_size > ft->max_fte) + if (candidate_index + group_size > max_fte) return ERR_PTR(-ENOSPC); fg = alloc_insert_flow_group(ft, @@ -1537,18 +1544,30 @@ static bool counter_is_valid(u32 action) } static bool dest_is_valid(struct mlx5_flow_destination *dest, - u32 action, + struct mlx5_flow_act *flow_act, struct mlx5_flow_table *ft) { + bool ignore_level = flow_act->flags & FLOW_ACT_IGNORE_FLOW_LEVEL; + u32 action = flow_act->action; + if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)) return counter_is_valid(action); if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) return true; + if (ignore_level) { + if (ft->type != FS_FT_FDB) + return false; + + if (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + dest->ft->type != FS_FT_FDB) + return false; + } + if (!dest || ((dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) && - (dest->ft->level <= ft->level))) + (dest->ft->level <= ft->level && !ignore_level))) return false; return true; } @@ -1633,47 +1652,22 @@ static u64 matched_fgs_get_version(struct list_head *match_head) } static struct fs_fte * -lookup_fte_for_write_locked(struct mlx5_flow_group *g, const u32 *match_value) -{ - struct fs_fte *fte_tmp; - - nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); - - fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, match_value, rhash_fte); - if (!fte_tmp || !tree_get_node(&fte_tmp->node)) { - fte_tmp = NULL; - goto out; - } - - if (!fte_tmp->node.active) { - tree_put_node(&fte_tmp->node, false); - fte_tmp = NULL; - goto out; - } - nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); - -out: - up_write_ref_node(&g->node, false); - return fte_tmp; -} - -static struct fs_fte * -lookup_fte_for_read_locked(struct mlx5_flow_group *g, const u32 *match_value) +lookup_fte_locked(struct mlx5_flow_group *g, + const u32 *match_value, + bool take_write) { struct fs_fte *fte_tmp; - if (!tree_get_node(&g->node)) - return NULL; - - rcu_read_lock(); - fte_tmp = rhashtable_lookup(&g->ftes_hash, match_value, rhash_fte); + if (take_write) + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + else + nested_down_read_ref_node(&g->node, FS_LOCK_PARENT); + fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, match_value, + rhash_fte); if (!fte_tmp || !tree_get_node(&fte_tmp->node)) { - rcu_read_unlock(); fte_tmp = NULL; goto out; } - rcu_read_unlock(); - if (!fte_tmp->node.active) { tree_put_node(&fte_tmp->node, false); fte_tmp = NULL; @@ -1681,19 +1675,12 @@ lookup_fte_for_read_locked(struct mlx5_flow_group *g, const u32 *match_value) } nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); - out: - tree_put_node(&g->node, false); - return fte_tmp; -} - -static struct fs_fte * -lookup_fte_locked(struct mlx5_flow_group *g, const u32 *match_value, bool write) -{ - if (write) - return lookup_fte_for_write_locked(g, match_value); + if (take_write) + up_write_ref_node(&g->node, false); else - return lookup_fte_for_read_locked(g, match_value); + up_read_ref_node(&g->node); + return fte_tmp; } static struct mlx5_flow_handle * @@ -1810,7 +1797,7 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft, return ERR_PTR(-EINVAL); for (i = 0; i < dest_num; i++) { - if (!dest_is_valid(&dest[i], flow_act->action, ft)) + if (!dest_is_valid(&dest[i], flow_act, ft)) return ERR_PTR(-EINVAL); } nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT); @@ -2073,7 +2060,8 @@ int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) int err = 0; mutex_lock(&root->chain_lock); - err = disconnect_flow_table(ft); + if (!(ft->flags & MLX5_FLOW_TABLE_UNMANAGED)) + err = disconnect_flow_table(ft); if (err) { mutex_unlock(&root->chain_lock); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index e8cd997f413e..be5f5e32c1e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -164,6 +164,7 @@ struct mlx5_flow_table { unsigned int required_groups; unsigned int group_size; unsigned int num_groups; + unsigned int max_fte; } autogroup; /* Protect fwd_rules */ struct mutex lock; @@ -203,7 +204,6 @@ struct fs_fte { enum fs_fte_status status; struct mlx5_fc *counter; struct rhash_head hash; - struct rcu_head rcu; int modify_mask; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index a19790dee7b2..d89ff1d09119 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -131,11 +131,11 @@ static int mlx5_get_pcam_reg(struct mlx5_core_dev *dev) MLX5_PCAM_REGS_5000_TO_507F); } -static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev) +static int mlx5_get_mcam_access_reg_group(struct mlx5_core_dev *dev, + enum mlx5_mcam_reg_groups group) { - return mlx5_query_mcam_reg(dev, dev->caps.mcam, - MLX5_MCAM_FEATURE_ENHANCED_FEATURES, - MLX5_MCAM_REGS_FIRST_128); + return mlx5_query_mcam_reg(dev, dev->caps.mcam[group], + MLX5_MCAM_FEATURE_ENHANCED_FEATURES, group); } static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev) @@ -221,8 +221,11 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, pcam_reg)) mlx5_get_pcam_reg(dev); - if (MLX5_CAP_GEN(dev, mcam_reg)) - mlx5_get_mcam_reg(dev); + if (MLX5_CAP_GEN(dev, mcam_reg)) { + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_FIRST_128); + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_0x9080_0x90FF); + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_0x9100_0x917F); + } if (MLX5_CAP_GEN(dev, qcam_reg)) mlx5_get_qcam_reg(dev); @@ -245,6 +248,13 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) { + err = mlx5_core_get_caps(dev, MLX5_CAP_VDPA_EMULATION); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 3ed8ab2d703d..56078b23f1a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -87,8 +87,8 @@ int mlx5i_init(struct mlx5_core_dev *mdev, mlx5e_set_netdev_mtu_boundaries(priv); netdev->mtu = netdev->max_mtu; - mlx5e_build_nic_params(mdev, NULL, &priv->rss_params, &priv->channels.params, - priv->max_nch, netdev->mtu); + mlx5e_build_nic_params(priv, NULL, &priv->rss_params, &priv->channels.params, + netdev->mtu); mlx5i_build_nic_params(mdev, &priv->channels.params); mlx5e_timestamp_init(priv); @@ -419,6 +419,28 @@ static void mlx5i_cleanup_rx(struct mlx5e_priv *priv) mlx5e_destroy_q_counters(priv); } +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5i_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), +}; + +static unsigned int mlx5i_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5i_stats_grps); +} + static const struct mlx5e_profile mlx5i_nic_profile = { .init = mlx5i_init, .cleanup = mlx5i_cleanup, @@ -435,6 +457,8 @@ static const struct mlx5e_profile mlx5i_nic_profile = { .rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */ .max_tc = MLX5I_MAX_NUM_TC, .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), + .stats_grps = mlx5i_stats_grps, + .stats_grps_num = mlx5i_stats_grps_num, }; /* mlx5i netdev NDos */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index fc0d9583475d..b91eabc09fbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -586,7 +586,8 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) if (!ldev->nb.notifier_call) { ldev->nb.notifier_call = mlx5_lag_netdev_event; - if (register_netdevice_notifier(&ldev->nb)) { + if (register_netdevice_notifier_dev_net(netdev, &ldev->nb, + &ldev->nn)) { ldev->nb.notifier_call = NULL; mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); } @@ -599,7 +600,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) } /* Must be called with intf_mutex held */ -void mlx5_lag_remove(struct mlx5_core_dev *dev) +void mlx5_lag_remove(struct mlx5_core_dev *dev, struct net_device *netdev) { struct mlx5_lag *ldev; int i; @@ -619,7 +620,8 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev) if (i == MLX5_MAX_PORTS) { if (ldev->nb.notifier_call) - unregister_netdevice_notifier(&ldev->nb); + unregister_netdevice_notifier_dev_net(netdev, &ldev->nb, + &ldev->nn); mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); mlx5_lag_dev_free(ldev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag.h index f1068aac6406..316ab09e2664 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h @@ -44,6 +44,7 @@ struct mlx5_lag { struct workqueue_struct *wq; struct delayed_work bond_work; struct notifier_block nb; + struct netdev_net_notifier nn; struct lag_mp lag_mp; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 173e2c12e1c7..f554cfddcf4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1193,6 +1193,12 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) if (err) goto err_load; + if (boot) { + err = mlx5_devlink_register(priv_to_devlink(dev), dev->device); + if (err) + goto err_devlink_reg; + } + if (mlx5_device_registered(dev)) { mlx5_attach_device(dev); } else { @@ -1210,6 +1216,9 @@ out: return err; err_reg_dev: + if (boot) + mlx5_devlink_unregister(priv_to_devlink(dev)); +err_devlink_reg: mlx5_unload(dev); err_load: if (boot) @@ -1347,10 +1356,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) request_module_nowait(MLX5_IB_MOD); - err = mlx5_devlink_register(devlink, &pdev->dev); - if (err) - goto clean_load; - err = mlx5_crdump_enable(dev); if (err) dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); @@ -1358,9 +1363,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_save_state(pdev); return 0; -clean_load: - mlx5_unload_one(dev, true); - err_load_one: mlx5_pci_close(dev); pci_init_err: @@ -1561,6 +1563,7 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101d) }, /* ConnectX-6 Dx */ { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ { PCI_VDEVICE(MELLANOX, 0x101f) }, /* ConnectX-6 LX */ + { PCI_VDEVICE(MELLANOX, 0x1021) }, /* ConnectX-7 */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index da67b28d6e23..fcce9e0fc82c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -157,7 +157,7 @@ int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, u8 feature_group, u8 access_reg_group); void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev); -void mlx5_lag_remove(struct mlx5_core_dev *dev); +void mlx5_lag_remove(struct mlx5_core_dev *dev, struct net_device *netdev); int mlx5_irq_table_init(struct mlx5_core_dev *dev); void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 004c56c2fc0c..6dec2a550a10 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -677,9 +677,12 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, goto out_invalid_arg; } if (action->dest_tbl.tbl->level <= matcher->tbl->level) { + mlx5_core_warn_once(dmn->mdev, + "Connecting table to a lower/same level destination table\n"); mlx5dr_dbg(dmn, - "Destination table level should be higher than source table\n"); - goto out_invalid_arg; + "Connecting table at level %d to a destination table at level %d\n", + matcher->tbl->level, + action->dest_tbl.tbl->level); } attr.final_icm_addr = rx_rule ? action->dest_tbl.tbl->rx.s_anchor->chunk->icm_addr : @@ -690,9 +693,9 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, /* get the relevant addresses */ if (!action->dest_tbl.fw_tbl.rx_icm_addr) { - ret = mlx5dr_cmd_query_flow_table(action->dest_tbl.fw_tbl.mdev, - action->dest_tbl.fw_tbl.ft->type, - action->dest_tbl.fw_tbl.ft->id, + ret = mlx5dr_cmd_query_flow_table(dmn->mdev, + action->dest_tbl.fw_tbl.type, + action->dest_tbl.fw_tbl.id, &output); if (!ret) { action->dest_tbl.fw_tbl.tx_icm_addr = @@ -982,8 +985,106 @@ dec_ref: } struct mlx5dr_action * -mlx5dr_create_action_dest_flow_fw_table(struct mlx5_flow_table *ft, - struct mlx5_core_dev *mdev) +mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_action_dest *dests, + u32 num_of_dests) +{ + struct mlx5dr_cmd_flow_destination_hw_info *hw_dests; + struct mlx5dr_action **ref_actions; + struct mlx5dr_action *action; + bool reformat_req = false; + u32 num_of_ref = 0; + int ret; + int i; + + if (dmn->type != MLX5DR_DOMAIN_TYPE_FDB) { + mlx5dr_err(dmn, "Multiple destination support is for FDB only\n"); + return NULL; + } + + hw_dests = kzalloc(sizeof(*hw_dests) * num_of_dests, GFP_KERNEL); + if (!hw_dests) + return NULL; + + ref_actions = kzalloc(sizeof(*ref_actions) * num_of_dests * 2, GFP_KERNEL); + if (!ref_actions) + goto free_hw_dests; + + for (i = 0; i < num_of_dests; i++) { + struct mlx5dr_action *reformat_action = dests[i].reformat; + struct mlx5dr_action *dest_action = dests[i].dest; + + ref_actions[num_of_ref++] = dest_action; + + switch (dest_action->action_type) { + case DR_ACTION_TYP_VPORT: + hw_dests[i].vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + hw_dests[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + hw_dests[i].vport.num = dest_action->vport.caps->num; + hw_dests[i].vport.vhca_id = dest_action->vport.caps->vhca_gvmi; + if (reformat_action) { + reformat_req = true; + hw_dests[i].vport.reformat_id = + reformat_action->reformat.reformat_id; + ref_actions[num_of_ref++] = reformat_action; + hw_dests[i].vport.flags |= MLX5_FLOW_DEST_VPORT_REFORMAT_ID; + } + break; + + case DR_ACTION_TYP_FT: + hw_dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + if (dest_action->dest_tbl.is_fw_tbl) + hw_dests[i].ft_id = dest_action->dest_tbl.fw_tbl.id; + else + hw_dests[i].ft_id = dest_action->dest_tbl.tbl->table_id; + break; + + default: + mlx5dr_dbg(dmn, "Invalid multiple destinations action\n"); + goto free_ref_actions; + } + } + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + goto free_ref_actions; + + ret = mlx5dr_fw_create_md_tbl(dmn, + hw_dests, + num_of_dests, + reformat_req, + &action->dest_tbl.fw_tbl.id, + &action->dest_tbl.fw_tbl.group_id); + if (ret) + goto free_action; + + refcount_inc(&dmn->refcount); + + for (i = 0; i < num_of_ref; i++) + refcount_inc(&ref_actions[i]->refcount); + + action->dest_tbl.is_fw_tbl = true; + action->dest_tbl.fw_tbl.dmn = dmn; + action->dest_tbl.fw_tbl.type = FS_FT_FDB; + action->dest_tbl.fw_tbl.ref_actions = ref_actions; + action->dest_tbl.fw_tbl.num_of_ref_actions = num_of_ref; + + kfree(hw_dests); + + return action; + +free_action: + kfree(action); +free_ref_actions: + kfree(ref_actions); +free_hw_dests: + kfree(hw_dests); + return NULL; +} + +struct mlx5dr_action * +mlx5dr_action_create_dest_flow_fw_table(struct mlx5dr_domain *dmn, + struct mlx5_flow_table *ft) { struct mlx5dr_action *action; @@ -992,8 +1093,11 @@ mlx5dr_create_action_dest_flow_fw_table(struct mlx5_flow_table *ft, return NULL; action->dest_tbl.is_fw_tbl = 1; - action->dest_tbl.fw_tbl.ft = ft; - action->dest_tbl.fw_tbl.mdev = mdev; + action->dest_tbl.fw_tbl.type = ft->type; + action->dest_tbl.fw_tbl.id = ft->id; + action->dest_tbl.fw_tbl.dmn = dmn; + + refcount_inc(&dmn->refcount); return action; } @@ -1213,58 +1317,85 @@ not_found: } static int -dr_action_modify_sw_to_hw(struct mlx5dr_domain *dmn, - __be64 *sw_action, - __be64 *hw_action, - const struct dr_action_modify_field_conv **ret_hw_info) +dr_action_modify_sw_to_hw_add(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_hw_info) { const struct dr_action_modify_field_conv *hw_action_info; - u8 offset, length, max_length, action; + u8 max_length; u16 sw_field; - u8 hw_opcode; u32 data; /* Get SW modify action data */ - action = MLX5_GET(set_action_in, sw_action, action_type); - length = MLX5_GET(set_action_in, sw_action, length); - offset = MLX5_GET(set_action_in, sw_action, offset); sw_field = MLX5_GET(set_action_in, sw_action, field); data = MLX5_GET(set_action_in, sw_action, data); /* Convert SW data to HW modify action format */ hw_action_info = dr_action_modify_get_hw_info(sw_field); if (!hw_action_info) { - mlx5dr_dbg(dmn, "Modify action invalid field given\n"); + mlx5dr_dbg(dmn, "Modify add action invalid field given\n"); return -EINVAL; } max_length = hw_action_info->end - hw_action_info->start + 1; - switch (action) { - case MLX5_ACTION_TYPE_SET: - hw_opcode = MLX5DR_ACTION_MDFY_HW_OP_SET; - /* PRM defines that length zero specific length of 32bits */ - if (!length) - length = 32; + MLX5_SET(dr_action_hw_set, hw_action, + opcode, MLX5DR_ACTION_MDFY_HW_OP_ADD); - if (length + offset > max_length) { - mlx5dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); - return -EINVAL; - } - break; + MLX5_SET(dr_action_hw_set, hw_action, destination_field_code, + hw_action_info->hw_field); - case MLX5_ACTION_TYPE_ADD: - hw_opcode = MLX5DR_ACTION_MDFY_HW_OP_ADD; - offset = 0; - length = max_length; - break; + MLX5_SET(dr_action_hw_set, hw_action, destination_left_shifter, + hw_action_info->start); - default: - mlx5dr_info(dmn, "Unsupported action_type for modify action\n"); - return -EOPNOTSUPP; + /* PRM defines that length zero specific length of 32bits */ + MLX5_SET(dr_action_hw_set, hw_action, destination_length, + max_length == 32 ? 0 : max_length); + + MLX5_SET(dr_action_hw_set, hw_action, inline_data, data); + + *ret_hw_info = hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw_set(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_hw_info) +{ + const struct dr_action_modify_field_conv *hw_action_info; + u8 offset, length, max_length; + u16 sw_field; + u32 data; + + /* Get SW modify action data */ + length = MLX5_GET(set_action_in, sw_action, length); + offset = MLX5_GET(set_action_in, sw_action, offset); + sw_field = MLX5_GET(set_action_in, sw_action, field); + data = MLX5_GET(set_action_in, sw_action, data); + + /* Convert SW data to HW modify action format */ + hw_action_info = dr_action_modify_get_hw_info(sw_field); + if (!hw_action_info) { + mlx5dr_dbg(dmn, "Modify set action invalid field given\n"); + return -EINVAL; + } + + /* PRM defines that length zero specific length of 32bits */ + length = length ? length : 32; + + max_length = hw_action_info->end - hw_action_info->start + 1; + + if (length + offset > max_length) { + mlx5dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); + return -EINVAL; } - MLX5_SET(dr_action_hw_set, hw_action, opcode, hw_opcode); + MLX5_SET(dr_action_hw_set, hw_action, + opcode, MLX5DR_ACTION_MDFY_HW_OP_SET); MLX5_SET(dr_action_hw_set, hw_action, destination_field_code, hw_action_info->hw_field); @@ -1283,48 +1414,236 @@ dr_action_modify_sw_to_hw(struct mlx5dr_domain *dmn, } static int -dr_action_modify_check_field_limitation(struct mlx5dr_domain *dmn, - const __be64 *sw_action) +dr_action_modify_sw_to_hw_copy(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_dst_hw_info, + const struct dr_action_modify_field_conv **ret_src_hw_info) +{ + u8 src_offset, dst_offset, src_max_length, dst_max_length, length; + const struct dr_action_modify_field_conv *hw_dst_action_info; + const struct dr_action_modify_field_conv *hw_src_action_info; + u16 src_field, dst_field; + + /* Get SW modify action data */ + src_field = MLX5_GET(copy_action_in, sw_action, src_field); + dst_field = MLX5_GET(copy_action_in, sw_action, dst_field); + src_offset = MLX5_GET(copy_action_in, sw_action, src_offset); + dst_offset = MLX5_GET(copy_action_in, sw_action, dst_offset); + length = MLX5_GET(copy_action_in, sw_action, length); + + /* Convert SW data to HW modify action format */ + hw_src_action_info = dr_action_modify_get_hw_info(src_field); + hw_dst_action_info = dr_action_modify_get_hw_info(dst_field); + if (!hw_src_action_info || !hw_dst_action_info) { + mlx5dr_dbg(dmn, "Modify copy action invalid field given\n"); + return -EINVAL; + } + + /* PRM defines that length zero specific length of 32bits */ + length = length ? length : 32; + + src_max_length = hw_src_action_info->end - + hw_src_action_info->start + 1; + dst_max_length = hw_dst_action_info->end - + hw_dst_action_info->start + 1; + + if (length + src_offset > src_max_length || + length + dst_offset > dst_max_length) { + mlx5dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); + return -EINVAL; + } + + MLX5_SET(dr_action_hw_copy, hw_action, + opcode, MLX5DR_ACTION_MDFY_HW_OP_COPY); + + MLX5_SET(dr_action_hw_copy, hw_action, destination_field_code, + hw_dst_action_info->hw_field); + + MLX5_SET(dr_action_hw_copy, hw_action, destination_left_shifter, + hw_dst_action_info->start + dst_offset); + + MLX5_SET(dr_action_hw_copy, hw_action, destination_length, + length == 32 ? 0 : length); + + MLX5_SET(dr_action_hw_copy, hw_action, source_field_code, + hw_src_action_info->hw_field); + + MLX5_SET(dr_action_hw_copy, hw_action, source_left_shifter, + hw_src_action_info->start + dst_offset); + + *ret_dst_hw_info = hw_dst_action_info; + *ret_src_hw_info = hw_src_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_dst_hw_info, + const struct dr_action_modify_field_conv **ret_src_hw_info) { - u16 sw_field; u8 action; + int ret; - sw_field = MLX5_GET(set_action_in, sw_action, field); + *hw_action = 0; + *ret_src_hw_info = NULL; + + /* Get SW modify action type */ action = MLX5_GET(set_action_in, sw_action, action_type); - /* Check if SW field is supported in current domain (RX/TX) */ - if (action == MLX5_ACTION_TYPE_SET) { - if (sw_field == MLX5_ACTION_IN_FIELD_METADATA_REG_A) { + switch (action) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_sw_to_hw_set(dmn, sw_action, + hw_action, + ret_dst_hw_info); + break; + + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_sw_to_hw_add(dmn, sw_action, + hw_action, + ret_dst_hw_info); + break; + + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_sw_to_hw_copy(dmn, sw_action, + hw_action, + ret_dst_hw_info, + ret_src_hw_info); + break; + + default: + mlx5dr_info(dmn, "Unsupported action_type for modify action\n"); + ret = -EOPNOTSUPP; + } + + return ret; +} + +static int +dr_action_modify_check_set_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + u16 sw_field = MLX5_GET(set_action_in, sw_action, field); + struct mlx5dr_domain *dmn = action->rewrite.dmn; + + if (sw_field == MLX5_ACTION_IN_FIELD_METADATA_REG_A) { + action->rewrite.allow_rx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_TX) { + mlx5dr_dbg(dmn, "Unsupported field %d for RX/FDB set action\n", + sw_field); + return -EINVAL; + } + } else if (sw_field == MLX5_ACTION_IN_FIELD_METADATA_REG_B) { + action->rewrite.allow_tx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_RX) { + mlx5dr_dbg(dmn, "Unsupported field %d for TX/FDB set action\n", + sw_field); + return -EINVAL; + } + } + + if (!action->rewrite.allow_rx && !action->rewrite.allow_tx) { + mlx5dr_dbg(dmn, "Modify SET actions not supported on both RX and TX\n"); + return -EINVAL; + } + + return 0; +} + +static int +dr_action_modify_check_add_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + u16 sw_field = MLX5_GET(set_action_in, sw_action, field); + struct mlx5dr_domain *dmn = action->rewrite.dmn; + + if (sw_field != MLX5_ACTION_IN_FIELD_OUT_IP_TTL && + sw_field != MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM) { + mlx5dr_dbg(dmn, "Unsupported field %d for add action\n", + sw_field); + return -EINVAL; + } + + return 0; +} + +static int +dr_action_modify_check_copy_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + struct mlx5dr_domain *dmn = action->rewrite.dmn; + u16 sw_fields[2]; + int i; + + sw_fields[0] = MLX5_GET(copy_action_in, sw_action, src_field); + sw_fields[1] = MLX5_GET(copy_action_in, sw_action, dst_field); + + for (i = 0; i < 2; i++) { + if (sw_fields[i] == MLX5_ACTION_IN_FIELD_METADATA_REG_A) { + action->rewrite.allow_rx = 0; if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_TX) { mlx5dr_dbg(dmn, "Unsupported field %d for RX/FDB set action\n", - sw_field); + sw_fields[i]); return -EINVAL; } - } - - if (sw_field == MLX5_ACTION_IN_FIELD_METADATA_REG_B) { + } else if (sw_fields[i] == MLX5_ACTION_IN_FIELD_METADATA_REG_B) { + action->rewrite.allow_tx = 0; if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_RX) { mlx5dr_dbg(dmn, "Unsupported field %d for TX/FDB set action\n", - sw_field); + sw_fields[i]); return -EINVAL; } } - } else if (action == MLX5_ACTION_TYPE_ADD) { - if (sw_field != MLX5_ACTION_IN_FIELD_OUT_IP_TTL && - sw_field != MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT && - sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM && - sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM) { - mlx5dr_dbg(dmn, "Unsupported field %d for add action\n", sw_field); - return -EINVAL; - } - } else { - mlx5dr_info(dmn, "Unsupported action %d modify action\n", action); - return -EOPNOTSUPP; + } + + if (!action->rewrite.allow_rx && !action->rewrite.allow_tx) { + mlx5dr_dbg(dmn, "Modify copy actions not supported on both RX and TX\n"); + return -EINVAL; } return 0; } +static int +dr_action_modify_check_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + struct mlx5dr_domain *dmn = action->rewrite.dmn; + u8 action_type; + int ret; + + action_type = MLX5_GET(set_action_in, sw_action, action_type); + + switch (action_type) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_check_set_field_limitation(action, + sw_action); + break; + + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_check_add_field_limitation(action, + sw_action); + break; + + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_check_copy_field_limitation(action, + sw_action); + break; + + default: + mlx5dr_info(dmn, "Unsupported action %d modify action\n", + action_type); + ret = -EOPNOTSUPP; + } + + return ret; +} + static bool dr_action_modify_check_is_ttl_modify(const u64 *sw_action) { @@ -1333,7 +1652,7 @@ dr_action_modify_check_is_ttl_modify(const u64 *sw_action) return sw_field == MLX5_ACTION_IN_FIELD_OUT_IP_TTL; } -static int dr_actions_convert_modify_header(struct mlx5dr_domain *dmn, +static int dr_actions_convert_modify_header(struct mlx5dr_action *action, u32 max_hw_actions, u32 num_sw_actions, __be64 sw_actions[], @@ -1341,20 +1660,26 @@ static int dr_actions_convert_modify_header(struct mlx5dr_domain *dmn, u32 *num_hw_actions, bool *modify_ttl) { - const struct dr_action_modify_field_conv *hw_action_info; + const struct dr_action_modify_field_conv *hw_dst_action_info; + const struct dr_action_modify_field_conv *hw_src_action_info; u16 hw_field = MLX5DR_ACTION_MDFY_HW_FLD_RESERVED; u32 l3_type = MLX5DR_ACTION_MDFY_HW_HDR_L3_NONE; u32 l4_type = MLX5DR_ACTION_MDFY_HW_HDR_L4_NONE; + struct mlx5dr_domain *dmn = action->rewrite.dmn; int ret, i, hw_idx = 0; __be64 *sw_action; __be64 hw_action; *modify_ttl = false; + action->rewrite.allow_rx = 1; + action->rewrite.allow_tx = 1; + for (i = 0; i < num_sw_actions; i++) { sw_action = &sw_actions[i]; - ret = dr_action_modify_check_field_limitation(dmn, sw_action); + ret = dr_action_modify_check_field_limitation(action, + sw_action); if (ret) return ret; @@ -1365,32 +1690,35 @@ static int dr_actions_convert_modify_header(struct mlx5dr_domain *dmn, ret = dr_action_modify_sw_to_hw(dmn, sw_action, &hw_action, - &hw_action_info); + &hw_dst_action_info, + &hw_src_action_info); if (ret) return ret; /* Due to a HW limitation we cannot modify 2 different L3 types */ - if (l3_type && hw_action_info->l3_type && - hw_action_info->l3_type != l3_type) { + if (l3_type && hw_dst_action_info->l3_type && + hw_dst_action_info->l3_type != l3_type) { mlx5dr_dbg(dmn, "Action list can't support two different L3 types\n"); return -EINVAL; } - if (hw_action_info->l3_type) - l3_type = hw_action_info->l3_type; + if (hw_dst_action_info->l3_type) + l3_type = hw_dst_action_info->l3_type; /* Due to a HW limitation we cannot modify two different L4 types */ - if (l4_type && hw_action_info->l4_type && - hw_action_info->l4_type != l4_type) { + if (l4_type && hw_dst_action_info->l4_type && + hw_dst_action_info->l4_type != l4_type) { mlx5dr_dbg(dmn, "Action list can't support two different L4 types\n"); return -EINVAL; } - if (hw_action_info->l4_type) - l4_type = hw_action_info->l4_type; + if (hw_dst_action_info->l4_type) + l4_type = hw_dst_action_info->l4_type; /* HW reads and executes two actions at once this means we * need to create a gap if two actions access the same field */ - if ((hw_idx % 2) && hw_field == hw_action_info->hw_field) { + if ((hw_idx % 2) && (hw_field == hw_dst_action_info->hw_field || + (hw_src_action_info && + hw_field == hw_src_action_info->hw_field))) { /* Check if after gap insertion the total number of HW * modify actions doesn't exceeds the limit */ @@ -1400,7 +1728,7 @@ static int dr_actions_convert_modify_header(struct mlx5dr_domain *dmn, return -EINVAL; } } - hw_field = hw_action_info->hw_field; + hw_field = hw_dst_action_info->hw_field; hw_actions[hw_idx] = hw_action; hw_idx++; @@ -1443,7 +1771,7 @@ static int dr_action_create_modify_action(struct mlx5dr_domain *dmn, goto free_chunk; } - ret = dr_actions_convert_modify_header(dmn, + ret = dr_actions_convert_modify_header(action, max_hw_actions, num_sw_actions, actions, @@ -1559,8 +1887,26 @@ int mlx5dr_action_destroy(struct mlx5dr_action *action) switch (action->action_type) { case DR_ACTION_TYP_FT: - if (!action->dest_tbl.is_fw_tbl) + if (action->dest_tbl.is_fw_tbl) + refcount_dec(&action->dest_tbl.fw_tbl.dmn->refcount); + else refcount_dec(&action->dest_tbl.tbl->refcount); + + if (action->dest_tbl.is_fw_tbl && + action->dest_tbl.fw_tbl.num_of_ref_actions) { + struct mlx5dr_action **ref_actions; + int i; + + ref_actions = action->dest_tbl.fw_tbl.ref_actions; + for (i = 0; i < action->dest_tbl.fw_tbl.num_of_ref_actions; i++) + refcount_dec(&ref_actions[i]->refcount); + + kfree(ref_actions); + + mlx5dr_fw_destroy_md_tbl(action->dest_tbl.fw_tbl.dmn, + action->dest_tbl.fw_tbl.id, + action->dest_tbl.fw_tbl.group_id); + } break; case DR_ACTION_TYP_TNL_L2_TO_L2: refcount_dec(&action->reformat.dmn->refcount); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 41662c4e2664..461b39376daf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -320,12 +320,7 @@ int mlx5dr_cmd_destroy_flow_group(struct mlx5_core_dev *mdev, } int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, - u32 table_type, - u64 icm_addr_rx, - u64 icm_addr_tx, - u8 level, - bool sw_owner, - bool term_tbl, + struct mlx5dr_cmd_create_flow_table_attr *attr, u64 *fdb_rx_icm_addr, u32 *table_id) { @@ -335,37 +330,43 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, int err; MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); - MLX5_SET(create_flow_table_in, in, table_type, table_type); + MLX5_SET(create_flow_table_in, in, table_type, attr->table_type); ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); - MLX5_SET(flow_table_context, ft_mdev, termination_table, term_tbl); - MLX5_SET(flow_table_context, ft_mdev, sw_owner, sw_owner); - MLX5_SET(flow_table_context, ft_mdev, level, level); + MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl); + MLX5_SET(flow_table_context, ft_mdev, sw_owner, attr->sw_owner); + MLX5_SET(flow_table_context, ft_mdev, level, attr->level); - if (sw_owner) { + if (attr->sw_owner) { /* icm_addr_0 used for FDB RX / NIC TX / NIC_RX * icm_addr_1 used for FDB TX */ - if (table_type == MLX5_FLOW_TABLE_TYPE_NIC_RX) { + if (attr->table_type == MLX5_FLOW_TABLE_TYPE_NIC_RX) { MLX5_SET64(flow_table_context, ft_mdev, - sw_owner_icm_root_0, icm_addr_rx); - } else if (table_type == MLX5_FLOW_TABLE_TYPE_NIC_TX) { + sw_owner_icm_root_0, attr->icm_addr_rx); + } else if (attr->table_type == MLX5_FLOW_TABLE_TYPE_NIC_TX) { MLX5_SET64(flow_table_context, ft_mdev, - sw_owner_icm_root_0, icm_addr_tx); - } else if (table_type == MLX5_FLOW_TABLE_TYPE_FDB) { + sw_owner_icm_root_0, attr->icm_addr_tx); + } else if (attr->table_type == MLX5_FLOW_TABLE_TYPE_FDB) { MLX5_SET64(flow_table_context, ft_mdev, - sw_owner_icm_root_0, icm_addr_rx); + sw_owner_icm_root_0, attr->icm_addr_rx); MLX5_SET64(flow_table_context, ft_mdev, - sw_owner_icm_root_1, icm_addr_tx); + sw_owner_icm_root_1, attr->icm_addr_tx); } } + MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, + attr->decap_en); + MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en, + attr->reformat_en); + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (err) return err; *table_id = MLX5_GET(create_flow_table_out, out, table_id); - if (!sw_owner && table_type == MLX5_FLOW_TABLE_TYPE_FDB) + if (!attr->sw_owner && attr->table_type == MLX5_FLOW_TABLE_TYPE_FDB && + fdb_rx_icm_addr) *fdb_rx_icm_addr = (u64)MLX5_GET(create_flow_table_out, out, icm_address_31_0) | (u64)MLX5_GET(create_flow_table_out, out, icm_address_39_32) << 32 | @@ -478,3 +479,208 @@ int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, return 0; } + +static int mlx5dr_cmd_set_extended_dest(struct mlx5_core_dev *dev, + struct mlx5dr_cmd_fte_info *fte, + bool *extended_dest) +{ + int fw_log_max_fdb_encap_uplink = MLX5_CAP_ESW(dev, log_max_fdb_encap_uplink); + int num_fwd_destinations = 0; + int num_encap = 0; + int i; + + *extended_dest = false; + if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return 0; + for (i = 0; i < fte->dests_size; i++) { + if (fte->dest_arr[i].type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + if (fte->dest_arr[i].type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + fte->dest_arr[i].vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) + num_encap++; + num_fwd_destinations++; + } + + if (num_fwd_destinations > 1 && num_encap > 0) + *extended_dest = true; + + if (*extended_dest && !fw_log_max_fdb_encap_uplink) { + mlx5_core_warn(dev, "FW does not support extended destination"); + return -EOPNOTSUPP; + } + if (num_encap > (1 << fw_log_max_fdb_encap_uplink)) { + mlx5_core_warn(dev, "FW does not support more than %d encaps", + 1 << fw_log_max_fdb_encap_uplink); + return -EOPNOTSUPP; + } + + return 0; +} + +int mlx5dr_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5dr_cmd_ft_info *ft, + u32 group_id, + struct mlx5dr_cmd_fte_info *fte) +{ + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {}; + void *in_flow_context, *vlan; + bool extended_dest = false; + void *in_match_value; + unsigned int inlen; + int dst_cnt_size; + void *in_dests; + u32 *in; + int err; + int i; + + if (mlx5dr_cmd_set_extended_dest(dev, fte, &extended_dest)) + return -EOPNOTSUPP; + + if (!extended_dest) + dst_cnt_size = MLX5_ST_SZ_BYTES(dest_format_struct); + else + dst_cnt_size = MLX5_ST_SZ_BYTES(extended_dest_format); + + inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fte->dests_size * dst_cnt_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, op_mod, opmod); + MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask); + MLX5_SET(set_fte_in, in, table_type, ft->type); + MLX5_SET(set_fte_in, in, table_id, ft->id); + MLX5_SET(set_fte_in, in, flow_index, fte->index); + if (ft->vport) { + MLX5_SET(set_fte_in, in, vport_number, ft->vport); + MLX5_SET(set_fte_in, in, other_vport, 1); + } + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + + MLX5_SET(flow_context, in_flow_context, flow_tag, + fte->flow_context.flow_tag); + MLX5_SET(flow_context, in_flow_context, flow_source, + fte->flow_context.flow_source); + + MLX5_SET(flow_context, in_flow_context, extended_destination, + extended_dest); + if (extended_dest) { + u32 action; + + action = fte->action.action & + ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + MLX5_SET(flow_context, in_flow_context, action, action); + } else { + MLX5_SET(flow_context, in_flow_context, action, + fte->action.action); + if (fte->action.pkt_reformat) + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, + fte->action.pkt_reformat->id); + } + if (fte->action.modify_hdr) + MLX5_SET(flow_context, in_flow_context, modify_header_id, + fte->action.modify_hdr->id); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[0].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[0].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[0].prio); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan_2); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[1].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[1].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[1].prio); + + in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, + match_value); + memcpy(in_match_value, fte->val, sizeof(u32) * MLX5_ST_SZ_DW_MATCH_PARAM); + + in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + int list_size = 0; + + for (i = 0; i < fte->dests_size; i++) { + unsigned int id, type = fte->dest_arr[i].type; + + if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + switch (type) { + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + id = fte->dest_arr[i].ft_num; + type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + id = fte->dest_arr[i].ft_id; + break; + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + id = fte->dest_arr[i].vport.num; + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id_valid, + !!(fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_VHCA_ID)); + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id, + fte->dest_arr[i].vport.vhca_id); + if (extended_dest && (fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID)) { + MLX5_SET(dest_format_struct, in_dests, + packet_reformat, + !!(fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID)); + MLX5_SET(extended_dest_format, in_dests, + packet_reformat_id, + fte->dest_arr[i].vport.reformat_id); + } + break; + default: + id = fte->dest_arr[i].tir_num; + } + + MLX5_SET(dest_format_struct, in_dests, destination_type, + type); + MLX5_SET(dest_format_struct, in_dests, destination_id, id); + in_dests += dst_cnt_size; + list_size++; + } + + MLX5_SET(flow_context, in_flow_context, destination_list_size, + list_size); + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); + int list_size = 0; + + for (i = 0; i < fte->dests_size; i++) { + if (fte->dest_arr[i].type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + MLX5_SET(flow_counter_list, in_dests, flow_counter_id, + fte->dest_arr[i].counter_id); + in_dests += dst_cnt_size; + list_size++; + } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } + + MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, + list_size); + } + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: + kvfree(in); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c index 60ef6e6171e3..1fbcd012bb85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c @@ -7,6 +7,7 @@ struct mlx5dr_fw_recalc_cs_ft * mlx5dr_fw_create_recalc_cs_ft(struct mlx5dr_domain *dmn, u32 vport_num) { + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft; u32 table_id, group_id, modify_hdr_id; u64 rx_icm_addr, modify_ttl_action; @@ -16,9 +17,14 @@ mlx5dr_fw_create_recalc_cs_ft(struct mlx5dr_domain *dmn, u32 vport_num) if (!recalc_cs_ft) return NULL; - ret = mlx5dr_cmd_create_flow_table(dmn->mdev, MLX5_FLOW_TABLE_TYPE_FDB, - 0, 0, dmn->info.caps.max_ft_level - 1, - false, true, &rx_icm_addr, &table_id); + ft_attr.table_type = MLX5_FLOW_TABLE_TYPE_FDB; + ft_attr.level = dmn->info.caps.max_ft_level - 1; + ft_attr.term_tbl = true; + + ret = mlx5dr_cmd_create_flow_table(dmn->mdev, + &ft_attr, + &rx_icm_addr, + &table_id); if (ret) { mlx5dr_err(dmn, "Failed creating TTL W/A FW flow table %d\n", ret); goto free_ttl_tbl; @@ -91,3 +97,70 @@ void mlx5dr_fw_destroy_recalc_cs_ft(struct mlx5dr_domain *dmn, kfree(recalc_cs_ft); } + +int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_cmd_flow_destination_hw_info *dest, + int num_dest, + bool reformat_req, + u32 *tbl_id, + u32 *group_id) +{ + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; + struct mlx5dr_cmd_fte_info fte_info = {}; + u32 val[MLX5_ST_SZ_DW_MATCH_PARAM] = {}; + struct mlx5dr_cmd_ft_info ft_info = {}; + int ret; + + ft_attr.table_type = MLX5_FLOW_TABLE_TYPE_FDB; + ft_attr.level = dmn->info.caps.max_ft_level - 2; + ft_attr.reformat_en = reformat_req; + ft_attr.decap_en = reformat_req; + + ret = mlx5dr_cmd_create_flow_table(dmn->mdev, &ft_attr, NULL, tbl_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating multi dest FW flow table %d\n", ret); + return ret; + } + + ret = mlx5dr_cmd_create_empty_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + *tbl_id, group_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating multi dest FW flow group %d\n", ret); + goto free_flow_table; + } + + ft_info.id = *tbl_id; + ft_info.type = FS_FT_FDB; + fte_info.action.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + fte_info.dests_size = num_dest; + fte_info.val = val; + fte_info.dest_arr = dest; + + ret = mlx5dr_cmd_set_fte(dmn->mdev, 0, 0, &ft_info, *group_id, &fte_info); + if (ret) { + mlx5dr_err(dmn, "Failed setting fte into table %d\n", ret); + goto free_flow_group; + } + + return 0; + +free_flow_group: + mlx5dr_cmd_destroy_flow_group(dmn->mdev, MLX5_FLOW_TABLE_TYPE_FDB, + *tbl_id, *group_id); +free_flow_table: + mlx5dr_cmd_destroy_flow_table(dmn->mdev, *tbl_id, + MLX5_FLOW_TABLE_TYPE_FDB); + return ret; +} + +void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, + u32 tbl_id, u32 group_id) +{ + mlx5dr_cmd_del_flow_table_entry(dmn->mdev, FS_FT_FDB, tbl_id); + mlx5dr_cmd_destroy_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + tbl_id, group_id); + mlx5dr_cmd_destroy_flow_table(dmn->mdev, tbl_id, + MLX5_FLOW_TABLE_TYPE_FDB); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index 32e94d2ee5e4..e4cff7abb348 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -209,7 +209,7 @@ static void dr_rule_rehash_copy_ste_ctrl(struct mlx5dr_matcher *matcher, /* We need to copy the refcount since this ste * may have been traversed several times */ - refcount_set(&new_ste->refcount, refcount_read(&cur_ste->refcount)); + new_ste->refcount = cur_ste->refcount; /* Link old STEs rule_mem list to the new ste */ mlx5dr_rule_update_rule_member(cur_ste, new_ste); @@ -638,6 +638,9 @@ static int dr_rule_add_member(struct mlx5dr_rule_rx_tx *nic_rule, if (!rule_mem) return -ENOMEM; + INIT_LIST_HEAD(&rule_mem->list); + INIT_LIST_HEAD(&rule_mem->use_ste_list); + rule_mem->ste = ste; list_add_tail(&rule_mem->list, &nic_rule->rule_members_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 51803eef13dd..c7f10d4f8f8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ +#include <linux/smp.h> #include "dr_types.h" #define QUEUE_SIZE 128 @@ -729,7 +730,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, if (!in) goto err_cqwq; - vector = smp_processor_id() % mlx5_comp_vectors_count(mdev); + vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn); if (err) { kvfree(in); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index a5a266983dd3..c6c7d1defbd7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -348,7 +348,7 @@ static void dr_ste_replace(struct mlx5dr_ste *dst, struct mlx5dr_ste *src) if (dst->next_htbl) dst->next_htbl->pointing_ste = dst; - refcount_set(&dst->refcount, refcount_read(&src->refcount)); + dst->refcount = src->refcount; INIT_LIST_HEAD(&dst->rule_list); list_splice_tail_init(&src->rule_list, &dst->rule_list); @@ -565,7 +565,7 @@ bool mlx5dr_ste_is_not_valid_entry(u8 *p_hw_ste) bool mlx5dr_ste_not_used_ste(struct mlx5dr_ste *ste) { - return !refcount_read(&ste->refcount); + return !ste->refcount; } /* Init one ste as a pattern for ste data array */ @@ -689,14 +689,14 @@ struct mlx5dr_ste_htbl *mlx5dr_ste_htbl_alloc(struct mlx5dr_icm_pool *pool, htbl->ste_arr = chunk->ste_arr; htbl->hw_ste_arr = chunk->hw_ste_arr; htbl->miss_list = chunk->miss_list; - refcount_set(&htbl->refcount, 0); + htbl->refcount = 0; for (i = 0; i < chunk->num_of_entries; i++) { struct mlx5dr_ste *ste = &htbl->ste_arr[i]; ste->hw_ste = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED; ste->htbl = htbl; - refcount_set(&ste->refcount, 0); + ste->refcount = 0; INIT_LIST_HEAD(&ste->miss_list_node); INIT_LIST_HEAD(&htbl->miss_list[i]); INIT_LIST_HEAD(&ste->rule_list); @@ -713,7 +713,7 @@ out_free_htbl: int mlx5dr_ste_htbl_free(struct mlx5dr_ste_htbl *htbl) { - if (refcount_read(&htbl->refcount)) + if (htbl->refcount) return -EBUSY; mlx5dr_icm_free_chunk(htbl->chunk); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c index e178d8d3dbc9..14ce2d7dbb66 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c @@ -211,6 +211,9 @@ static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl) static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) { + bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); + bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; u64 icm_addr_rx = 0; u64 icm_addr_tx = 0; int ret; @@ -221,18 +224,21 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) if (tbl->tx.s_anchor) icm_addr_tx = tbl->tx.s_anchor->chunk->icm_addr; - ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, - tbl->table_type, - icm_addr_rx, - icm_addr_tx, - tbl->dmn->info.caps.max_ft_level - 1, - true, false, NULL, - &tbl->table_id); + ft_attr.table_type = tbl->table_type; + ft_attr.icm_addr_rx = icm_addr_rx; + ft_attr.icm_addr_tx = icm_addr_tx; + ft_attr.level = tbl->dmn->info.caps.max_ft_level - 1; + ft_attr.sw_owner = true; + ft_attr.decap_en = en_decap; + ft_attr.reformat_en = en_encap; + + ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr, + NULL, &tbl->table_id); return ret; } -struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level) +struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags) { struct mlx5dr_table *tbl; int ret; @@ -245,6 +251,7 @@ struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level) tbl->dmn = dmn; tbl->level = level; + tbl->flags = flags; refcount_set(&tbl->refcount, 1); ret = dr_table_init(tbl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 290fe61c33d0..dffe35145d19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -123,7 +123,7 @@ struct mlx5dr_matcher_rx_tx; struct mlx5dr_ste { u8 *hw_ste; /* refcount: indicates the num of rules that using this ste */ - refcount_t refcount; + u32 refcount; /* attached to the miss_list head at each htbl entry */ struct list_head miss_list_node; @@ -155,7 +155,7 @@ struct mlx5dr_ste_htbl_ctrl { struct mlx5dr_ste_htbl { u8 lu_type; u16 byte_mask; - refcount_t refcount; + u32 refcount; struct mlx5dr_icm_chunk *chunk; struct mlx5dr_ste *ste_arr; u8 *hw_ste_arr; @@ -206,13 +206,14 @@ int mlx5dr_ste_htbl_free(struct mlx5dr_ste_htbl *htbl); static inline void mlx5dr_htbl_put(struct mlx5dr_ste_htbl *htbl) { - if (refcount_dec_and_test(&htbl->refcount)) + htbl->refcount--; + if (!htbl->refcount) mlx5dr_ste_htbl_free(htbl); } static inline void mlx5dr_htbl_get(struct mlx5dr_ste_htbl *htbl) { - refcount_inc(&htbl->refcount); + htbl->refcount++; } /* STE utils */ @@ -254,14 +255,15 @@ static inline void mlx5dr_ste_put(struct mlx5dr_ste *ste, struct mlx5dr_matcher *matcher, struct mlx5dr_matcher_rx_tx *nic_matcher) { - if (refcount_dec_and_test(&ste->refcount)) + ste->refcount--; + if (!ste->refcount) mlx5dr_ste_free(ste, matcher, nic_matcher); } /* initial as 0, increased only when ste appears in a new rule */ static inline void mlx5dr_ste_get(struct mlx5dr_ste *ste) { - refcount_inc(&ste->refcount); + ste->refcount++; } void mlx5dr_ste_set_hit_addr_by_next_htbl(u8 *hw_ste, @@ -679,6 +681,7 @@ struct mlx5dr_table { u32 level; u32 table_type; u32 table_id; + u32 flags; struct list_head matcher_list; struct mlx5dr_action *miss_action; refcount_t refcount; @@ -742,10 +745,14 @@ struct mlx5dr_action { union { struct mlx5dr_table *tbl; struct { - struct mlx5_flow_table *ft; + struct mlx5dr_domain *dmn; + u32 id; + u32 group_id; + enum fs_flow_table_type type; u64 rx_icm_addr; u64 tx_icm_addr; - struct mlx5_core_dev *mdev; + struct mlx5dr_action **ref_actions; + u32 num_of_ref_actions; } fw_tbl; }; } dest_tbl; @@ -867,6 +874,17 @@ struct mlx5dr_cmd_query_flow_table_details { u64 sw_owner_icm_root_0; }; +struct mlx5dr_cmd_create_flow_table_attr { + u32 table_type; + u64 icm_addr_rx; + u64 icm_addr_tx; + u8 level; + bool sw_owner; + bool term_tbl; + bool decap_en; + bool reformat_en; +}; + /* internal API functions */ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, struct mlx5dr_cmd_caps *caps); @@ -904,12 +922,7 @@ int mlx5dr_cmd_destroy_flow_group(struct mlx5_core_dev *mdev, u32 table_id, u32 group_id); int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, - u32 table_type, - u64 icm_addr_rx, - u64 icm_addr_tx, - u8 level, - bool sw_owner, - bool term_tbl, + struct mlx5dr_cmd_create_flow_table_attr *attr, u64 *fdb_rx_icm_addr, u32 *table_id); int mlx5dr_cmd_destroy_flow_table(struct mlx5_core_dev *mdev, @@ -1051,6 +1064,43 @@ int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn, int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn, struct mlx5dr_action *action); +struct mlx5dr_cmd_ft_info { + u32 id; + u16 vport; + enum fs_flow_table_type type; +}; + +struct mlx5dr_cmd_flow_destination_hw_info { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + u32 ft_num; + u32 ft_id; + u32 counter_id; + struct { + u16 num; + u16 vhca_id; + u32 reformat_id; + u8 flags; + } vport; + }; +}; + +struct mlx5dr_cmd_fte_info { + u32 dests_size; + u32 index; + struct mlx5_flow_context flow_context; + u32 *val; + struct mlx5_flow_act action; + struct mlx5dr_cmd_flow_destination_hw_info *dest_arr; +}; + +int mlx5dr_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5dr_cmd_ft_info *ft, + u32 group_id, + struct mlx5dr_cmd_fte_info *fte); + struct mlx5dr_fw_recalc_cs_ft { u64 rx_icm_addr; u32 table_id; @@ -1065,4 +1115,12 @@ void mlx5dr_fw_destroy_recalc_cs_ft(struct mlx5dr_domain *dmn, int mlx5dr_domain_cache_get_recalc_cs_ft_addr(struct mlx5dr_domain *dmn, u32 vport_num, u64 *rx_icm_addr); +int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_cmd_flow_destination_hw_info *dest, + int num_dest, + bool reformat_req, + u32 *tbl_id, + u32 *group_id); +void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, u32 tbl_id, + u32 group_id); #endif /* _DR_TYPES_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 3d587d0bdbbe..3abfc8125926 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -74,7 +74,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, next_ft); tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, - ft->level); + ft->level, ft->flags); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); return -EINVAL; @@ -184,13 +184,13 @@ static struct mlx5dr_action *create_vport_action(struct mlx5dr_domain *domain, dest_attr->vport.vhca_id); } -static struct mlx5dr_action *create_ft_action(struct mlx5_core_dev *dev, +static struct mlx5dr_action *create_ft_action(struct mlx5dr_domain *domain, struct mlx5_flow_rule *dst) { struct mlx5_flow_table *dest_ft = dst->dest_attr.ft; if (mlx5_dr_is_fw_table(dest_ft->flags)) - return mlx5dr_create_action_dest_flow_fw_table(dest_ft, dev); + return mlx5dr_action_create_dest_flow_fw_table(domain, dest_ft); return mlx5dr_action_create_dest_table(dest_ft->fs_dr_table.dr_table); } @@ -206,6 +206,12 @@ static struct mlx5dr_action *create_action_push_vlan(struct mlx5dr_domain *domai return mlx5dr_action_create_push_vlan(domain, htonl(vlan_hdr)); } +static bool contain_vport_reformat_action(struct mlx5_flow_rule *dst) +{ + return dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + dst->dest_attr.vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID; +} + #define MLX5_FLOW_CONTEXT_ACTION_MAX 20 static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, @@ -213,7 +219,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, struct fs_fte *fte) { struct mlx5dr_domain *domain = ns->fs_dr_domain.dr_domain; - struct mlx5dr_action *term_action = NULL; + struct mlx5dr_action_dest *term_actions; struct mlx5dr_match_parameters params; struct mlx5_core_dev *dev = ns->dev; struct mlx5dr_action **fs_dr_actions; @@ -223,6 +229,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, struct mlx5dr_rule *rule; struct mlx5_flow_rule *dst; int fs_dr_num_actions = 0; + int num_term_actions = 0; int num_actions = 0; size_t match_sz; int err = 0; @@ -233,18 +240,38 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(*actions), GFP_KERNEL); - if (!actions) - return -ENOMEM; + if (!actions) { + err = -ENOMEM; + goto out_err; + } fs_dr_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(*fs_dr_actions), GFP_KERNEL); if (!fs_dr_actions) { - kfree(actions); - return -ENOMEM; + err = -ENOMEM; + goto free_actions_alloc; + } + + term_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, + sizeof(*term_actions), GFP_KERNEL); + if (!term_actions) { + err = -ENOMEM; + goto free_fs_dr_actions_alloc; } match_sz = sizeof(fte->val); + /* Drop reformat action bit if destination vport set with reformat */ + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + list_for_each_entry(dst, &fte->node.children, node.list) { + if (!contain_vport_reformat_action(dst)) + continue; + + fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + break; + } + } + /* The order of the actions are must to be keep, only the following * order is supported by SW steering: * TX: push vlan -> modify header -> encap @@ -335,7 +362,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, goto free_actions; } fs_dr_actions[fs_dr_num_actions++] = tmp_action; - term_action = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; } if (fte->flow_context.flow_tag) { @@ -352,34 +379,25 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { list_for_each_entry(dst, &fte->node.children, node.list) { enum mlx5_flow_destination_type type = dst->dest_attr.type; - u32 id; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + num_term_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { err = -ENOSPC; goto free_actions; } - switch (type) { - case MLX5_FLOW_DESTINATION_TYPE_COUNTER: - id = dst->dest_attr.counter_id; + if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; - tmp_action = - mlx5dr_action_create_flow_counter(id); - if (!tmp_action) { - err = -ENOMEM; - goto free_actions; - } - fs_dr_actions[fs_dr_num_actions++] = tmp_action; - actions[num_actions++] = tmp_action; - break; + switch (type) { case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: - tmp_action = create_ft_action(dev, dst); + tmp_action = create_ft_action(domain, dst); if (!tmp_action) { err = -ENOMEM; goto free_actions; } fs_dr_actions[fs_dr_num_actions++] = tmp_action; - term_action = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; break; case MLX5_FLOW_DESTINATION_TYPE_VPORT: tmp_action = create_vport_action(domain, dst); @@ -388,7 +406,14 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, goto free_actions; } fs_dr_actions[fs_dr_num_actions++] = tmp_action; - term_action = tmp_action; + term_actions[num_term_actions].dest = tmp_action; + + if (dst->dest_attr.vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID) + term_actions[num_term_actions].reformat = + dst->dest_attr.vport.pkt_reformat->action.dr_action; + + num_term_actions++; break; default: err = -EOPNOTSUPP; @@ -397,11 +422,50 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, } } + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + list_for_each_entry(dst, &fte->node.children, node.list) { + u32 id; + + if (dst->dest_attr.type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -ENOSPC; + goto free_actions; + } + + id = dst->dest_attr.counter_id; + tmp_action = + mlx5dr_action_create_flow_counter(id); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + } + params.match_sz = match_sz; params.match_buf = (u64 *)fte->val; - - if (term_action) - actions[num_actions++] = term_action; + if (num_term_actions == 1) { + if (term_actions->reformat) + actions[num_actions++] = term_actions->reformat; + + actions[num_actions++] = term_actions->dest; + } else if (num_term_actions > 1) { + tmp_action = mlx5dr_action_create_mult_dest_tbl(domain, + term_actions, + num_term_actions); + if (!tmp_action) { + err = -EOPNOTSUPP; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } rule = mlx5dr_rule_create(group->fs_dr_matcher.dr_matcher, ¶ms, @@ -412,7 +476,9 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, goto free_actions; } + kfree(term_actions); kfree(actions); + fte->fs_dr_rule.dr_rule = rule; fte->fs_dr_rule.num_actions = fs_dr_num_actions; fte->fs_dr_rule.dr_actions = fs_dr_actions; @@ -420,13 +486,18 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, return 0; free_actions: - for (i = 0; i < fs_dr_num_actions; i++) + /* Free in reverse order to handle action dependencies */ + for (i = fs_dr_num_actions - 1; i >= 0; i--) if (!IS_ERR_OR_NULL(fs_dr_actions[i])) mlx5dr_action_destroy(fs_dr_actions[i]); - mlx5_core_err(dev, "Failed to create dr rule err(%d)\n", err); - kfree(actions); + kfree(term_actions); +free_fs_dr_actions_alloc: kfree(fs_dr_actions); +free_actions_alloc: + kfree(actions); +out_err: + mlx5_core_err(dev, "Failed to create dr rule err(%d)\n", err); return err; } @@ -533,7 +604,8 @@ static int mlx5_cmd_dr_delete_fte(struct mlx5_flow_root_namespace *ns, if (err) return err; - for (i = 0; i < rule->num_actions; i++) + /* Free in reverse order to handle action dependencies */ + for (i = rule->num_actions - 1; i >= 0; i--) if (!IS_ERR_OR_NULL(rule->dr_actions[i])) mlx5dr_action_destroy(rule->dr_actions[i]); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h index 1722f4668269..e01c3766c7de 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h @@ -32,6 +32,7 @@ enum { }; enum { + MLX5DR_ACTION_MDFY_HW_OP_COPY = 0x1, MLX5DR_ACTION_MDFY_HW_OP_SET = 0x2, MLX5DR_ACTION_MDFY_HW_OP_ADD = 0x3, }; @@ -625,4 +626,19 @@ struct mlx5_ifc_dr_action_hw_set_bits { u8 inline_data[0x20]; }; +struct mlx5_ifc_dr_action_hw_copy_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_field_code[0x8]; + u8 reserved_at_30[0x2]; + u8 source_left_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + #endif /* MLX5_IFC_DR_H */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index adda9cbfba45..e1edc9c247b7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -33,6 +33,11 @@ struct mlx5dr_match_parameters { u64 *match_buf; /* Device spec format */ }; +struct mlx5dr_action_dest { + struct mlx5dr_action *dest; + struct mlx5dr_action *reformat; +}; + #ifdef CONFIG_MLX5_SW_STEERING struct mlx5dr_domain * @@ -46,7 +51,7 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn); struct mlx5dr_table * -mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level); +mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags); int mlx5dr_table_destroy(struct mlx5dr_table *table); @@ -75,14 +80,19 @@ struct mlx5dr_action * mlx5dr_action_create_dest_table(struct mlx5dr_table *table); struct mlx5dr_action * -mlx5dr_create_action_dest_flow_fw_table(struct mlx5_flow_table *ft, - struct mlx5_core_dev *mdev); +mlx5dr_action_create_dest_flow_fw_table(struct mlx5dr_domain *domain, + struct mlx5_flow_table *ft); struct mlx5dr_action * mlx5dr_action_create_dest_vport(struct mlx5dr_domain *domain, u32 vport, u8 vhca_id_valid, u16 vhca_id); +struct mlx5dr_action * +mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_action_dest *dests, + u32 num_of_dests); + struct mlx5dr_action *mlx5dr_action_create_drop(void); struct mlx5dr_action *mlx5dr_action_create_tag(u32 tag_value); @@ -131,7 +141,7 @@ mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn) { } static inline struct mlx5dr_table * -mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level) { return NULL; } +mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags) { return NULL; } static inline int mlx5dr_table_destroy(struct mlx5dr_table *table) { return 0; } @@ -165,8 +175,8 @@ static inline struct mlx5dr_action * mlx5dr_action_create_dest_table(struct mlx5dr_table *table) { return NULL; } static inline struct mlx5dr_action * -mlx5dr_create_action_dest_flow_fw_table(struct mlx5_flow_table *ft, - struct mlx5_core_dev *mdev) { return NULL; } +mlx5dr_action_create_dest_flow_fw_table(struct mlx5dr_domain *domain, + struct mlx5_flow_table *ft) { return NULL; } static inline struct mlx5dr_action * mlx5dr_action_create_dest_vport(struct mlx5dr_domain *domain, @@ -174,6 +184,11 @@ mlx5dr_action_create_dest_vport(struct mlx5dr_domain *domain, u16 vhca_id) { return NULL; } static inline struct mlx5dr_action * +mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_action_dest *dests, + u32 num_of_dests) { return NULL; } + +static inline struct mlx5dr_action * mlx5dr_action_create_drop(void) { return NULL; } static inline struct mlx5dr_action * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index f2a0e72285ba..02f7e4a39578 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -89,7 +89,7 @@ void mlx5_wq_cyc_wqe_dump(struct mlx5_wq_cyc *wq, u16 ix, u8 nstrides) len = nstrides << wq->fbc.log_stride; wqe = mlx5_wq_cyc_get_wqe(wq, ix); - pr_info("WQE DUMP: WQ size %d WQ cur size %d, WQE index 0x%x, len: %ld\n", + pr_info("WQE DUMP: WQ size %d WQ cur size %d, WQE index 0x%x, len: %zu\n", mlx5_wq_cyc_get_size(wq), wq->cur_sz, ix, len); print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, wqe, len, false); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index 2b543911ae00..c4caeeadcba9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -213,8 +213,8 @@ mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u8 local_port, u8 module) err_register_netdev: mlxsw_m->ports[local_port] = NULL; - free_netdev(dev); err_dev_addr_get: + free_netdev(dev); err_alloc_etherdev: mlxsw_core_port_fini(mlxsw_m->core, local_port); return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index fd59280cf979..dd6685156396 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3563,8 +3563,8 @@ MLXSW_ITEM32(reg, qeec, min_shaper_rate, 0x0C, 0, 28); */ MLXSW_ITEM32(reg, qeec, mase, 0x10, 31, 1); -/* A large max rate will disable the max shaper. */ -#define MLXSW_REG_QEEC_MAS_DIS 200000000 /* Kbps */ +/* The largest max shaper value possible to disable the shaper. */ +#define MLXSW_REG_QEEC_MAS_DIS ((1u << 31) - 1) /* Kbps */ /* reg_qeec_max_shaper_rate * Max shaper information rate. @@ -3602,6 +3602,21 @@ MLXSW_ITEM32(reg, qeec, dwrr, 0x18, 15, 1); */ MLXSW_ITEM32(reg, qeec, dwrr_weight, 0x18, 0, 8); +/* reg_qeec_max_shaper_bs + * Max shaper burst size + * Burst size is 2^max_shaper_bs * 512 bits + * For Spectrum-1: Range is: 5..25 + * For Spectrum-2: Range is: 11..25 + * Reserved when ptps = 1 + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, max_shaper_bs, 0x1C, 0, 6); + +#define MLXSW_REG_QEEC_HIGHEST_SHAPER_BS 25 +#define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP1 5 +#define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP2 11 +#define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP3 5 + static inline void mlxsw_reg_qeec_pack(char *payload, u8 local_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index) @@ -5513,6 +5528,7 @@ enum mlxsw_reg_htgt_discard_trap_group { MLXSW_REG_HTGT_DISCARD_TRAP_GROUP_BASE = MLXSW_REG_HTGT_TRAP_GROUP_MAX, MLXSW_REG_HTGT_TRAP_GROUP_SP_L2_DISCARDS, MLXSW_REG_HTGT_TRAP_GROUP_SP_L3_DISCARDS, + MLXSW_REG_HTGT_TRAP_GROUP_SP_TUNNEL_DISCARDS, }; /* reg_htgt_trap_group @@ -10140,6 +10156,92 @@ static inline void mlxsw_reg_tigcr_pack(char *payload, bool ttlc, u8 ttl_uc) mlxsw_reg_tigcr_ttl_uc_set(payload, ttl_uc); } +/* TIEEM - Tunneling IPinIP Encapsulation ECN Mapping Register + * ----------------------------------------------------------- + * The TIEEM register maps ECN of the IP header at the ingress to the + * encapsulation to the ECN of the underlay network. + */ +#define MLXSW_REG_TIEEM_ID 0xA812 +#define MLXSW_REG_TIEEM_LEN 0x0C + +MLXSW_REG_DEFINE(tieem, MLXSW_REG_TIEEM_ID, MLXSW_REG_TIEEM_LEN); + +/* reg_tieem_overlay_ecn + * ECN of the IP header in the overlay network. + * Access: Index + */ +MLXSW_ITEM32(reg, tieem, overlay_ecn, 0x04, 24, 2); + +/* reg_tineem_underlay_ecn + * ECN of the IP header in the underlay network. + * Access: RW + */ +MLXSW_ITEM32(reg, tieem, underlay_ecn, 0x04, 16, 2); + +static inline void mlxsw_reg_tieem_pack(char *payload, u8 overlay_ecn, + u8 underlay_ecn) +{ + MLXSW_REG_ZERO(tieem, payload); + mlxsw_reg_tieem_overlay_ecn_set(payload, overlay_ecn); + mlxsw_reg_tieem_underlay_ecn_set(payload, underlay_ecn); +} + +/* TIDEM - Tunneling IPinIP Decapsulation ECN Mapping Register + * ----------------------------------------------------------- + * The TIDEM register configures the actions that are done in the + * decapsulation. + */ +#define MLXSW_REG_TIDEM_ID 0xA813 +#define MLXSW_REG_TIDEM_LEN 0x0C + +MLXSW_REG_DEFINE(tidem, MLXSW_REG_TIDEM_ID, MLXSW_REG_TIDEM_LEN); + +/* reg_tidem_underlay_ecn + * ECN field of the IP header in the underlay network. + * Access: Index + */ +MLXSW_ITEM32(reg, tidem, underlay_ecn, 0x04, 24, 2); + +/* reg_tidem_overlay_ecn + * ECN field of the IP header in the overlay network. + * Access: Index + */ +MLXSW_ITEM32(reg, tidem, overlay_ecn, 0x04, 16, 2); + +/* reg_tidem_eip_ecn + * Egress IP ECN. ECN field of the IP header of the packet which goes out + * from the decapsulation. + * Access: RW + */ +MLXSW_ITEM32(reg, tidem, eip_ecn, 0x04, 8, 2); + +/* reg_tidem_trap_en + * Trap enable: + * 0 - No trap due to decap ECN + * 1 - Trap enable with trap_id + * Access: RW + */ +MLXSW_ITEM32(reg, tidem, trap_en, 0x08, 28, 4); + +/* reg_tidem_trap_id + * Trap ID. Either DECAP_ECN0 or DECAP_ECN1. + * Reserved when trap_en is '0'. + * Access: RW + */ +MLXSW_ITEM32(reg, tidem, trap_id, 0x08, 0, 9); + +static inline void mlxsw_reg_tidem_pack(char *payload, u8 underlay_ecn, + u8 overlay_ecn, u8 eip_ecn, + bool trap_en, u16 trap_id) +{ + MLXSW_REG_ZERO(tidem, payload); + mlxsw_reg_tidem_underlay_ecn_set(payload, underlay_ecn); + mlxsw_reg_tidem_overlay_ecn_set(payload, overlay_ecn); + mlxsw_reg_tidem_eip_ecn_set(payload, eip_ecn); + mlxsw_reg_tidem_trap_en_set(payload, trap_en); + mlxsw_reg_tidem_trap_id_set(payload, trap_id); +} + /* SBPR - Shared Buffer Pools Register * ----------------------------------- * The SBPR configures and retrieves the shared buffer pools and configuration. @@ -10684,6 +10786,8 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(tndem), MLXSW_REG(tnpc), MLXSW_REG(tigcr), + MLXSW_REG(tieem), + MLXSW_REG(tidem), MLXSW_REG(sbpr), MLXSW_REG(sbcm), MLXSW_REG(sbpm), diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index fe02ffb1e3a8..7358b5bc7eb6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -45,11 +45,9 @@ #include "spectrum_ptp.h" #include "../mlxfw/mlxfw.h" -#define MLXSW_SP_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100) - #define MLXSW_SP1_FWREV_MAJOR 13 #define MLXSW_SP1_FWREV_MINOR 2000 -#define MLXSW_SP1_FWREV_SUBMINOR 2308 +#define MLXSW_SP1_FWREV_SUBMINOR 2714 #define MLXSW_SP1_FWREV_CAN_RESET_MINOR 1702 static const struct mlxsw_fw_rev mlxsw_sp1_fw_rev = { @@ -66,7 +64,7 @@ static const struct mlxsw_fw_rev mlxsw_sp1_fw_rev = { #define MLXSW_SP2_FWREV_MAJOR 29 #define MLXSW_SP2_FWREV_MINOR 2000 -#define MLXSW_SP2_FWREV_SUBMINOR 2308 +#define MLXSW_SP2_FWREV_SUBMINOR 2714 static const struct mlxsw_fw_rev mlxsw_sp2_fw_rev = { .major = MLXSW_SP2_FWREV_MAJOR, @@ -197,6 +195,10 @@ struct mlxsw_sp_ptp_ops { u64 *data, int data_index); }; +struct mlxsw_sp_span_ops { + u32 (*buffsize_get)(int mtu, u32 speed); +}; + static int mlxsw_sp_component_query(struct mlxfw_dev *mlxfw_dev, u16 component_index, u32 *p_max_size, u8 *p_align_bits, u16 *p_max_write_size) @@ -423,13 +425,12 @@ static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp) rev->major, req_rev->major); return -EINVAL; } - if (MLXSW_SP_FWREV_MINOR_TO_BRANCH(rev->minor) == - MLXSW_SP_FWREV_MINOR_TO_BRANCH(req_rev->minor) && - mlxsw_core_fw_rev_minor_subminor_validate(rev, req_rev)) + if (mlxsw_core_fw_rev_minor_subminor_validate(rev, req_rev)) return 0; - dev_info(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is incompatible with the driver\n", - rev->major, rev->minor, rev->subminor); + dev_err(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is incompatible with the driver (required >= %d.%d.%d)\n", + rev->major, rev->minor, rev->subminor, req_rev->major, + req_rev->minor, req_rev->subminor); dev_info(mlxsw_sp->bus_info->dev, "Flashing firmware using file %s\n", fw_filename); @@ -860,23 +861,17 @@ static netdev_tx_t mlxsw_sp_port_xmit(struct sk_buff *skb, u64 len; int err; + if (skb_cow_head(skb, MLXSW_TXHDR_LEN)) { + this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + memset(skb->cb, 0, sizeof(struct mlxsw_skb_cb)); if (mlxsw_core_skb_transmit_busy(mlxsw_sp->core, &tx_info)) return NETDEV_TX_BUSY; - if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) { - struct sk_buff *skb_orig = skb; - - skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN); - if (!skb) { - this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); - dev_kfree_skb_any(skb_orig); - return NETDEV_TX_OK; - } - dev_consume_skb_any(skb_orig); - } - if (eth_skb_pad(skb)) { this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); return NETDEV_TX_OK; @@ -1215,6 +1210,9 @@ static void update_stats_cache(struct work_struct *work) periodic_hw_stats.update_dw.work); if (!netif_carrier_ok(mlxsw_sp_port->dev)) + /* Note: mlxsw_sp_port_down_wipe_counters() clears the cache as + * necessary when port goes down. + */ goto out; mlxsw_sp_port_get_hw_stats(mlxsw_sp_port->dev, @@ -1798,6 +1796,8 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, return mlxsw_sp_setup_tc_prio(mlxsw_sp_port, type_data); case TC_SETUP_QDISC_ETS: return mlxsw_sp_setup_tc_ets(mlxsw_sp_port, type_data); + case TC_SETUP_QDISC_TBF: + return mlxsw_sp_setup_tc_tbf(mlxsw_sp_port, type_data); default: return -EOPNOTSUPP; } @@ -3541,6 +3541,27 @@ mlxsw_sp_port_speed_by_width_set(struct mlxsw_sp_port *mlxsw_sp_port) return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); } +int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed) +{ + const struct mlxsw_sp_port_type_speed_ops *port_type_speed_ops; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 eth_proto_oper; + int err; + + port_type_speed_ops = mlxsw_sp->port_type_speed_ops; + port_type_speed_ops->reg_ptys_eth_pack(mlxsw_sp, ptys_pl, + mlxsw_sp_port->local_port, 0, + false); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); + if (err) + return err; + port_type_speed_ops->reg_ptys_eth_unpack(mlxsw_sp, ptys_pl, NULL, NULL, + ð_proto_oper); + *speed = port_type_speed_ops->from_ptys_speed(mlxsw_sp, eth_proto_oper); + return 0; +} + int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, bool dwrr, u8 dwrr_weight) @@ -3558,7 +3579,7 @@ int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, - u8 next_index, u32 maxrate) + u8 next_index, u32 maxrate, u8 burst_size) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; char qeec_pl[MLXSW_REG_QEEC_LEN]; @@ -3567,6 +3588,7 @@ int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, next_index); mlxsw_reg_qeec_mase_set(qeec_pl, true); mlxsw_reg_qeec_max_shaper_rate_set(qeec_pl, maxrate); + mlxsw_reg_qeec_max_shaper_bs_set(qeec_pl, burst_size); return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl); } @@ -3635,14 +3657,14 @@ static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) */ err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_PORT, 0, 0, - MLXSW_REG_QEEC_MAS_DIS); + MLXSW_REG_QEEC_MAS_DIS, 0); if (err) return err; for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_SUBGROUP, i, 0, - MLXSW_REG_QEEC_MAS_DIS); + MLXSW_REG_QEEC_MAS_DIS, 0); if (err) return err; } @@ -3650,14 +3672,14 @@ static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_TC, i, i, - MLXSW_REG_QEEC_MAS_DIS); + MLXSW_REG_QEEC_MAS_DIS, 0); if (err) return err; err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_TC, i + 8, i, - MLXSW_REG_QEEC_MAS_DIS); + MLXSW_REG_QEEC_MAS_DIS, 0); if (err) return err; } @@ -3889,6 +3911,8 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, INIT_DELAYED_WORK(&mlxsw_sp_port->ptp.shaper_dw, mlxsw_sp->ptp_ops->shaper_work); + INIT_DELAYED_WORK(&mlxsw_sp_port->span.speed_update_dw, + mlxsw_sp_span_speed_update_work); mlxsw_sp->ports[local_port] = mlxsw_sp_port; err = register_netdev(dev); @@ -3945,6 +3969,7 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port) struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; cancel_delayed_work_sync(&mlxsw_sp_port->periodic_hw_stats.update_dw); + cancel_delayed_work_sync(&mlxsw_sp_port->span.speed_update_dw); cancel_delayed_work_sync(&mlxsw_sp_port->ptp.shaper_dw); mlxsw_sp_port_ptp_clear(mlxsw_sp_port); mlxsw_core_port_clear(mlxsw_sp->core, local_port, mlxsw_sp); @@ -4325,6 +4350,15 @@ static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port, return 0; } +static void +mlxsw_sp_port_down_wipe_counters(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int i; + + for (i = 0; i < TC_MAX_QUEUE; i++) + mlxsw_sp_port->periodic_hw_stats.xstats.backlog[i] = 0; +} + static void mlxsw_sp_pude_event_func(const struct mlxsw_reg_info *reg, char *pude_pl, void *priv) { @@ -4343,9 +4377,11 @@ static void mlxsw_sp_pude_event_func(const struct mlxsw_reg_info *reg, netdev_info(mlxsw_sp_port->dev, "link up\n"); netif_carrier_on(mlxsw_sp_port->dev); mlxsw_core_schedule_dw(&mlxsw_sp_port->ptp.shaper_dw, 0); + mlxsw_core_schedule_dw(&mlxsw_sp_port->span.speed_update_dw, 0); } else { netdev_info(mlxsw_sp_port->dev, "link down\n"); netif_carrier_off(mlxsw_sp_port->dev); + mlxsw_sp_port_down_wipe_counters(mlxsw_sp_port); } } @@ -4541,10 +4577,16 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { false), MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(DECAP_ECN0, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, VRRP, false), MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, VRRP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_CLASS_E, FORWARD, + ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_MC_DMAC, FORWARD, + ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_DIP, FORWARD, + ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD, + ROUTER_EXP, false), /* PKT Sample trap */ MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, false, SP_IP2ME, DISCARD), @@ -4883,6 +4925,33 @@ static const struct mlxsw_sp_ptp_ops mlxsw_sp2_ptp_ops = { .get_stats = mlxsw_sp2_get_stats, }; +static u32 mlxsw_sp1_span_buffsize_get(int mtu, u32 speed) +{ + return mtu * 5 / 2; +} + +static const struct mlxsw_sp_span_ops mlxsw_sp1_span_ops = { + .buffsize_get = mlxsw_sp1_span_buffsize_get, +}; + +#define MLXSW_SP2_SPAN_EG_MIRROR_BUFFER_FACTOR 38 + +static u32 mlxsw_sp2_span_buffsize_get(int mtu, u32 speed) +{ + return 3 * mtu + MLXSW_SP2_SPAN_EG_MIRROR_BUFFER_FACTOR * speed / 1000; +} + +static const struct mlxsw_sp_span_ops mlxsw_sp2_span_ops = { + .buffsize_get = mlxsw_sp2_span_buffsize_get, +}; + +u32 mlxsw_sp_span_buffsize_get(struct mlxsw_sp *mlxsw_sp, int mtu, u32 speed) +{ + u32 buffsize = mlxsw_sp->span_ops->buffsize_get(speed, mtu); + + return mlxsw_sp_bytes_cells(mlxsw_sp, buffsize) + 1; +} + static int mlxsw_sp_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr); @@ -5104,8 +5173,10 @@ static int mlxsw_sp1_init(struct mlxsw_core *mlxsw_core, mlxsw_sp->sb_vals = &mlxsw_sp1_sb_vals; mlxsw_sp->port_type_speed_ops = &mlxsw_sp1_port_type_speed_ops; mlxsw_sp->ptp_ops = &mlxsw_sp1_ptp_ops; + mlxsw_sp->span_ops = &mlxsw_sp1_span_ops; mlxsw_sp->listeners = mlxsw_sp1_listener; mlxsw_sp->listeners_count = ARRAY_SIZE(mlxsw_sp1_listener); + mlxsw_sp->lowest_shaper_bs = MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP1; return mlxsw_sp_init(mlxsw_core, mlxsw_bus_info, extack); } @@ -5129,6 +5200,31 @@ static int mlxsw_sp2_init(struct mlxsw_core *mlxsw_core, mlxsw_sp->sb_vals = &mlxsw_sp2_sb_vals; mlxsw_sp->port_type_speed_ops = &mlxsw_sp2_port_type_speed_ops; mlxsw_sp->ptp_ops = &mlxsw_sp2_ptp_ops; + mlxsw_sp->span_ops = &mlxsw_sp2_span_ops; + mlxsw_sp->lowest_shaper_bs = MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP2; + + return mlxsw_sp_init(mlxsw_core, mlxsw_bus_info, extack); +} + +static int mlxsw_sp3_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + + mlxsw_sp->kvdl_ops = &mlxsw_sp2_kvdl_ops; + mlxsw_sp->afa_ops = &mlxsw_sp2_act_afa_ops; + mlxsw_sp->afk_ops = &mlxsw_sp2_afk_ops; + mlxsw_sp->mr_tcam_ops = &mlxsw_sp2_mr_tcam_ops; + mlxsw_sp->acl_tcam_ops = &mlxsw_sp2_acl_tcam_ops; + mlxsw_sp->nve_ops_arr = mlxsw_sp2_nve_ops_arr; + mlxsw_sp->mac_mask = mlxsw_sp2_mac_mask; + mlxsw_sp->rif_ops_arr = mlxsw_sp2_rif_ops_arr; + mlxsw_sp->sb_vals = &mlxsw_sp2_sb_vals; + mlxsw_sp->port_type_speed_ops = &mlxsw_sp2_port_type_speed_ops; + mlxsw_sp->ptp_ops = &mlxsw_sp2_ptp_ops; + mlxsw_sp->span_ops = &mlxsw_sp2_span_ops; + mlxsw_sp->lowest_shaper_bs = MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP3; return mlxsw_sp_init(mlxsw_core, mlxsw_bus_info, extack); } @@ -5635,7 +5731,7 @@ static struct mlxsw_driver mlxsw_sp2_driver = { static struct mlxsw_driver mlxsw_sp3_driver = { .kind = mlxsw_sp3_driver_name, .priv_size = sizeof(struct mlxsw_sp), - .init = mlxsw_sp2_init, + .init = mlxsw_sp3_init, .fini = mlxsw_sp_fini, .basic_trap_groups_set = mlxsw_sp_basic_trap_groups_set, .port_split = mlxsw_sp_port_split, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 948ef4720d40..a0f1f9dceec5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -140,6 +140,7 @@ struct mlxsw_sp_sb_vals; struct mlxsw_sp_port_type_speed_ops; struct mlxsw_sp_ptp_state; struct mlxsw_sp_ptp_ops; +struct mlxsw_sp_span_ops; struct mlxsw_sp_port_mapping { u8 module; @@ -185,8 +186,10 @@ struct mlxsw_sp { const struct mlxsw_sp_sb_vals *sb_vals; const struct mlxsw_sp_port_type_speed_ops *port_type_speed_ops; const struct mlxsw_sp_ptp_ops *ptp_ops; + const struct mlxsw_sp_span_ops *span_ops; const struct mlxsw_listener *listeners; size_t listeners_count; + u32 lowest_shaper_bs; }; static inline struct mlxsw_sp_upper * @@ -292,6 +295,9 @@ struct mlxsw_sp_port { struct mlxsw_sp_ptp_port_stats stats; } ptp; u8 split_base_local_port; + struct { + struct delayed_work speed_update_dw; + } span; }; struct mlxsw_sp_port_type_speed_ops { @@ -471,6 +477,7 @@ extern struct notifier_block mlxsw_sp_switchdev_notifier; /* spectrum.c */ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u8 local_port, void *priv); +int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed); int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, bool dwrr, u8 dwrr_weight); @@ -481,7 +488,7 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, struct ieee_pfc *my_pfc); int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, - u8 next_index, u32 maxrate); + u8 next_index, u32 maxrate, u8 burst_size); enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 stp_state); int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid, u8 state); @@ -501,6 +508,7 @@ int mlxsw_sp_flow_counter_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int *p_counter_index); void mlxsw_sp_flow_counter_free(struct mlxsw_sp *mlxsw_sp, unsigned int counter_index); +u32 mlxsw_sp_span_buffsize_get(struct mlxsw_sp *mlxsw_sp, int mtu, u32 speed); bool mlxsw_sp_port_dev_check(const struct net_device *dev); struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev); struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev); @@ -854,6 +862,8 @@ int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_prio_qopt_offload *p); int mlxsw_sp_setup_tc_ets(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_ets_qopt_offload *p); +int mlxsw_sp_setup_tc_tbf(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_tbf_qopt_offload *p); /* spectrum_fid.c */ bool mlxsw_sp_fid_is_dummy(struct mlxsw_sp *mlxsw_sp, u16 fid_index); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 150b3a144b83..3d3cca596116 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -8,6 +8,7 @@ #include <linux/string.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> +#include <linux/mutex.h> #include <net/net_namespace.h> #include <net/tc_act/tc_vlan.h> @@ -25,6 +26,7 @@ struct mlxsw_sp_acl { struct mlxsw_sp_fid *dummy_fid; struct rhashtable ruleset_ht; struct list_head rules; + struct mutex rules_lock; /* Protects rules list */ struct { struct delayed_work dw; unsigned long interval; /* ms */ @@ -701,7 +703,9 @@ int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp, goto err_ruleset_block_bind; } + mutex_lock(&mlxsw_sp->acl->rules_lock); list_add_tail(&rule->list, &mlxsw_sp->acl->rules); + mutex_unlock(&mlxsw_sp->acl->rules_lock); block->rule_count++; block->egress_blocker_rule_count += rule->rulei->egress_bind_blocker; return 0; @@ -723,7 +727,9 @@ void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, block->egress_blocker_rule_count -= rule->rulei->egress_bind_blocker; ruleset->ht_key.block->rule_count--; + mutex_lock(&mlxsw_sp->acl->rules_lock); list_del(&rule->list); + mutex_unlock(&mlxsw_sp->acl->rules_lock); if (!ruleset->ht_key.chain_index && mlxsw_sp_acl_ruleset_is_singular(ruleset)) mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, @@ -783,19 +789,18 @@ static int mlxsw_sp_acl_rules_activity_update(struct mlxsw_sp_acl *acl) struct mlxsw_sp_acl_rule *rule; int err; - /* Protect internal structures from changes */ - rtnl_lock(); + mutex_lock(&acl->rules_lock); list_for_each_entry(rule, &acl->rules, list) { err = mlxsw_sp_acl_rule_activity_update(acl->mlxsw_sp, rule); if (err) goto err_rule_update; } - rtnl_unlock(); + mutex_unlock(&acl->rules_lock); return 0; err_rule_update: - rtnl_unlock(); + mutex_unlock(&acl->rules_lock); return err; } @@ -880,6 +885,7 @@ int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp) acl->dummy_fid = fid; INIT_LIST_HEAD(&acl->rules); + mutex_init(&acl->rules_lock); err = mlxsw_sp_acl_tcam_init(mlxsw_sp, &acl->tcam); if (err) goto err_acl_ops_init; @@ -892,6 +898,7 @@ int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp) return 0; err_acl_ops_init: + mutex_destroy(&acl->rules_lock); mlxsw_sp_fid_put(fid); err_fid_get: rhashtable_destroy(&acl->ruleset_ht); @@ -908,6 +915,7 @@ void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp) cancel_delayed_work_sync(&mlxsw_sp->acl->rule_activity_update.dw); mlxsw_sp_acl_tcam_fini(mlxsw_sp, &acl->tcam); + mutex_destroy(&acl->rules_lock); WARN_ON(!list_empty(&acl->rules)); mlxsw_sp_fid_put(acl->dummy_fid); rhashtable_destroy(&acl->ruleset_ht); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c index db66f2b56a6d..49a72a8f1f57 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c @@ -526,7 +526,7 @@ static int mlxsw_sp_dcbnl_ieee_setmaxrate(struct net_device *dev, err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_SUBGROUP, i, 0, - maxrate->tc_maxrate[i]); + maxrate->tc_maxrate[i], 0); if (err) { netdev_err(dev, "Failed to set maxrate for TC %d\n", i); goto err_port_ets_maxrate_set; @@ -541,7 +541,8 @@ err_port_ets_maxrate_set: for (i--; i >= 0; i--) mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, MLXSW_REG_QEEC_HR_SUBGROUP, - i, 0, my_maxrate->tc_maxrate[i]); + i, 0, + my_maxrate->tc_maxrate[i], 0); return err; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c index 6400cd644b7a..a8525992528f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c @@ -3,8 +3,10 @@ #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +#include <net/inet_ecn.h> #include "spectrum_ipip.h" +#include "reg.h" struct ip_tunnel_parm mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev) @@ -338,3 +340,61 @@ static const struct mlxsw_sp_ipip_ops mlxsw_sp_ipip_gre4_ops = { const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[] = { [MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops, }; + +static int mlxsw_sp_ipip_ecn_encap_init_one(struct mlxsw_sp *mlxsw_sp, + u8 inner_ecn, u8 outer_ecn) +{ + char tieem_pl[MLXSW_REG_TIEEM_LEN]; + + mlxsw_reg_tieem_pack(tieem_pl, inner_ecn, outer_ecn); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(tieem), tieem_pl); +} + +int mlxsw_sp_ipip_ecn_encap_init(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + /* Iterate over inner ECN values */ + for (i = INET_ECN_NOT_ECT; i <= INET_ECN_CE; i++) { + u8 outer_ecn = INET_ECN_encapsulate(0, i); + int err; + + err = mlxsw_sp_ipip_ecn_encap_init_one(mlxsw_sp, i, outer_ecn); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_sp_ipip_ecn_decap_init_one(struct mlxsw_sp *mlxsw_sp, + u8 inner_ecn, u8 outer_ecn) +{ + char tidem_pl[MLXSW_REG_TIDEM_LEN]; + bool trap_en, set_ce = false; + u8 new_inner_ecn; + + trap_en = __INET_ECN_decapsulate(outer_ecn, inner_ecn, &set_ce); + new_inner_ecn = set_ce ? INET_ECN_CE : inner_ecn; + + mlxsw_reg_tidem_pack(tidem_pl, outer_ecn, inner_ecn, new_inner_ecn, + trap_en, trap_en ? MLXSW_TRAP_ID_DECAP_ECN0 : 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(tidem), tidem_pl); +} + +int mlxsw_sp_ipip_ecn_decap_init(struct mlxsw_sp *mlxsw_sp) +{ + int i, j, err; + + /* Iterate over inner ECN values */ + for (i = INET_ECN_NOT_ECT; i <= INET_ECN_CE; i++) { + /* Iterate over outer ECN values */ + for (j = INET_ECN_NOT_ECT; j <= INET_ECN_CE; j++) { + err = mlxsw_sp_ipip_ecn_decap_init_one(mlxsw_sp, i, j); + if (err) + return err; + } + } + + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c index 4aaaa4937b1a..34f7c3501b08 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c @@ -1016,27 +1016,17 @@ mlxsw_sp1_ptp_port_shaper_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable) static int mlxsw_sp1_ptp_port_shaper_check(struct mlxsw_sp_port *mlxsw_sp_port) { - const struct mlxsw_sp_port_type_speed_ops *port_type_speed_ops; - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - char ptys_pl[MLXSW_REG_PTYS_LEN]; - u32 eth_proto_oper, speed; bool ptps = false; int err, i; + u32 speed; if (!mlxsw_sp1_ptp_hwtstamp_enabled(mlxsw_sp_port)) return mlxsw_sp1_ptp_port_shaper_set(mlxsw_sp_port, false); - port_type_speed_ops = mlxsw_sp->port_type_speed_ops; - port_type_speed_ops->reg_ptys_eth_pack(mlxsw_sp, ptys_pl, - mlxsw_sp_port->local_port, 0, - false); - err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); + err = mlxsw_sp_port_speed_get(mlxsw_sp_port, &speed); if (err) return err; - port_type_speed_ops->reg_ptys_eth_unpack(mlxsw_sp, ptys_pl, NULL, NULL, - ð_proto_oper); - speed = port_type_speed_ops->from_ptys_speed(mlxsw_sp, eth_proto_oper); for (i = 0; i < MLXSW_SP1_PTP_SHAPER_PARAMS_LEN; i++) { if (mlxsw_sp1_ptp_shaper_params[i].ethtool_speed == speed) { ptps = true; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c index 81a2c087f534..79a2801d59f6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -19,6 +19,7 @@ enum mlxsw_sp_qdisc_type { MLXSW_SP_QDISC_RED, MLXSW_SP_QDISC_PRIO, MLXSW_SP_QDISC_ETS, + MLXSW_SP_QDISC_TBF, }; struct mlxsw_sp_qdisc_ops { @@ -196,6 +197,20 @@ mlxsw_sp_qdisc_get_xstats(struct mlxsw_sp_port *mlxsw_sp_port, return -EOPNOTSUPP; } +static u64 +mlxsw_sp_xstats_backlog(struct mlxsw_sp_port_xstats *xstats, int tclass_num) +{ + return xstats->backlog[tclass_num] + + xstats->backlog[tclass_num + 8]; +} + +static u64 +mlxsw_sp_xstats_tail_drop(struct mlxsw_sp_port_xstats *xstats, int tclass_num) +{ + return xstats->tail_drop[tclass_num] + + xstats->tail_drop[tclass_num + 8]; +} + static void mlxsw_sp_qdisc_bstats_per_priority_get(struct mlxsw_sp_port_xstats *xstats, u8 prio_bitmap, u64 *tx_packets, @@ -213,6 +228,70 @@ mlxsw_sp_qdisc_bstats_per_priority_get(struct mlxsw_sp_port_xstats *xstats, } } +static void +mlxsw_sp_qdisc_collect_tc_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + u64 *p_tx_bytes, u64 *p_tx_packets, + u64 *p_drops, u64 *p_backlog) +{ + u8 tclass_num = mlxsw_sp_qdisc->tclass_num; + struct mlxsw_sp_port_xstats *xstats; + u64 tx_bytes, tx_packets; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + mlxsw_sp_qdisc_bstats_per_priority_get(xstats, + mlxsw_sp_qdisc->prio_bitmap, + &tx_packets, &tx_bytes); + + *p_tx_packets += tx_packets; + *p_tx_bytes += tx_bytes; + *p_drops += xstats->wred_drop[tclass_num] + + mlxsw_sp_xstats_tail_drop(xstats, tclass_num); + *p_backlog += mlxsw_sp_xstats_backlog(xstats, tclass_num); +} + +static void +mlxsw_sp_qdisc_update_stats(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + u64 tx_bytes, u64 tx_packets, + u64 drops, u64 backlog, + struct tc_qopt_offload_stats *stats_ptr) +{ + struct mlxsw_sp_qdisc_stats *stats_base = &mlxsw_sp_qdisc->stats_base; + + tx_bytes -= stats_base->tx_bytes; + tx_packets -= stats_base->tx_packets; + drops -= stats_base->drops; + backlog -= stats_base->backlog; + + _bstats_update(stats_ptr->bstats, tx_bytes, tx_packets); + stats_ptr->qstats->drops += drops; + stats_ptr->qstats->backlog += mlxsw_sp_cells_bytes(mlxsw_sp, backlog); + + stats_base->backlog += backlog; + stats_base->drops += drops; + stats_base->tx_bytes += tx_bytes; + stats_base->tx_packets += tx_packets; +} + +static void +mlxsw_sp_qdisc_get_tc_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr) +{ + u64 tx_packets = 0; + u64 tx_bytes = 0; + u64 backlog = 0; + u64 drops = 0; + + mlxsw_sp_qdisc_collect_tc_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + &tx_bytes, &tx_packets, + &drops, &backlog); + mlxsw_sp_qdisc_update_stats(mlxsw_sp_port->mlxsw_sp, mlxsw_sp_qdisc, + tx_bytes, tx_packets, drops, backlog, + stats_ptr); +} + static int mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, int tclass_num, u32 min, u32 max, @@ -270,7 +349,7 @@ mlxsw_sp_setup_tc_qdisc_red_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port, &stats_base->tx_bytes); red_base->prob_mark = xstats->ecn; red_base->prob_drop = xstats->wred_drop[tclass_num]; - red_base->pdrop = xstats->tail_drop[tclass_num]; + red_base->pdrop = mlxsw_sp_xstats_tail_drop(xstats, tclass_num); stats_base->overlimits = red_base->prob_drop + red_base->prob_mark; stats_base->drops = red_base->prob_drop + red_base->pdrop; @@ -343,19 +422,28 @@ mlxsw_sp_qdisc_red_replace(struct mlxsw_sp_port *mlxsw_sp_port, } static void -mlxsw_sp_qdisc_red_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, - void *params) +mlxsw_sp_qdisc_leaf_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct gnet_stats_queue *qstats) { - struct tc_red_qopt_offload_params *p = params; u64 backlog; backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, mlxsw_sp_qdisc->stats_base.backlog); - p->qstats->backlog -= backlog; + qstats->backlog -= backlog; mlxsw_sp_qdisc->stats_base.backlog = 0; } +static void +mlxsw_sp_qdisc_red_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_red_qopt_offload_params *p = params; + + mlxsw_sp_qdisc_leaf_unoffload(mlxsw_sp_port, mlxsw_sp_qdisc, p->qstats); +} + static int mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, @@ -371,7 +459,8 @@ mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port, early_drops = xstats->wred_drop[tclass_num] - xstats_base->prob_drop; marks = xstats->ecn - xstats_base->prob_mark; - pdrops = xstats->tail_drop[tclass_num] - xstats_base->pdrop; + pdrops = mlxsw_sp_xstats_tail_drop(xstats, tclass_num) - + xstats_base->pdrop; res->pdrop += pdrops; res->prob_drop += early_drops; @@ -388,40 +477,21 @@ mlxsw_sp_qdisc_get_red_stats(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, struct tc_qopt_offload_stats *stats_ptr) { - u64 tx_bytes, tx_packets, overlimits, drops, backlog; u8 tclass_num = mlxsw_sp_qdisc->tclass_num; struct mlxsw_sp_qdisc_stats *stats_base; struct mlxsw_sp_port_xstats *xstats; + u64 overlimits; xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; stats_base = &mlxsw_sp_qdisc->stats_base; - mlxsw_sp_qdisc_bstats_per_priority_get(xstats, - mlxsw_sp_qdisc->prio_bitmap, - &tx_packets, &tx_bytes); - tx_bytes = tx_bytes - stats_base->tx_bytes; - tx_packets = tx_packets - stats_base->tx_packets; - + mlxsw_sp_qdisc_get_tc_stats(mlxsw_sp_port, mlxsw_sp_qdisc, stats_ptr); overlimits = xstats->wred_drop[tclass_num] + xstats->ecn - stats_base->overlimits; - drops = xstats->wred_drop[tclass_num] + xstats->tail_drop[tclass_num] - - stats_base->drops; - backlog = xstats->backlog[tclass_num]; - _bstats_update(stats_ptr->bstats, tx_bytes, tx_packets); stats_ptr->qstats->overlimits += overlimits; - stats_ptr->qstats->drops += drops; - stats_ptr->qstats->backlog += - mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, - backlog) - - mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, - stats_base->backlog); - - stats_base->backlog = backlog; - stats_base->drops += drops; stats_base->overlimits += overlimits; - stats_base->tx_bytes += tx_bytes; - stats_base->tx_packets += tx_packets; + return 0; } @@ -471,6 +541,204 @@ int mlxsw_sp_setup_tc_red(struct mlxsw_sp_port *mlxsw_sp_port, } } +static void +mlxsw_sp_setup_tc_qdisc_leaf_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + u64 backlog_cells = 0; + u64 tx_packets = 0; + u64 tx_bytes = 0; + u64 drops = 0; + + mlxsw_sp_qdisc_collect_tc_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + &tx_bytes, &tx_packets, + &drops, &backlog_cells); + + mlxsw_sp_qdisc->stats_base.tx_packets = tx_packets; + mlxsw_sp_qdisc->stats_base.tx_bytes = tx_bytes; + mlxsw_sp_qdisc->stats_base.drops = drops; + mlxsw_sp_qdisc->stats_base.backlog = 0; +} + +static int +mlxsw_sp_qdisc_tbf_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + struct mlxsw_sp_qdisc *root_qdisc = mlxsw_sp_port->root_qdisc; + + if (root_qdisc != mlxsw_sp_qdisc) + root_qdisc->stats_base.backlog -= + mlxsw_sp_qdisc->stats_base.backlog; + + return mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HR_SUBGROUP, + mlxsw_sp_qdisc->tclass_num, 0, + MLXSW_REG_QEEC_MAS_DIS, 0); +} + +static int +mlxsw_sp_qdisc_tbf_bs(struct mlxsw_sp_port *mlxsw_sp_port, + u32 max_size, u8 *p_burst_size) +{ + /* TBF burst size is configured in bytes. The ASIC burst size value is + * ((2 ^ bs) * 512 bits. Convert the TBF bytes to 512-bit units. + */ + u32 bs512 = max_size / 64; + u8 bs = fls(bs512); + + if (!bs) + return -EINVAL; + --bs; + + /* Demand a power of two. */ + if ((1 << bs) != bs512) + return -EINVAL; + + if (bs < mlxsw_sp_port->mlxsw_sp->lowest_shaper_bs || + bs > MLXSW_REG_QEEC_HIGHEST_SHAPER_BS) + return -EINVAL; + + *p_burst_size = bs; + return 0; +} + +static u32 +mlxsw_sp_qdisc_tbf_max_size(u8 bs) +{ + return (1U << bs) * 64; +} + +static u64 +mlxsw_sp_qdisc_tbf_rate_kbps(struct tc_tbf_qopt_offload_replace_params *p) +{ + /* TBF interface is in bytes/s, whereas Spectrum ASIC is configured in + * Kbits/s. + */ + return p->rate.rate_bytes_ps / 1000 * 8; +} + +static int +mlxsw_sp_qdisc_tbf_check_params(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_tbf_qopt_offload_replace_params *p = params; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u64 rate_kbps = mlxsw_sp_qdisc_tbf_rate_kbps(p); + u8 burst_size; + int err; + + if (rate_kbps >= MLXSW_REG_QEEC_MAS_DIS) { + dev_err(mlxsw_sp_port->mlxsw_sp->bus_info->dev, + "spectrum: TBF: rate of %lluKbps must be below %u\n", + rate_kbps, MLXSW_REG_QEEC_MAS_DIS); + return -EINVAL; + } + + err = mlxsw_sp_qdisc_tbf_bs(mlxsw_sp_port, p->max_size, &burst_size); + if (err) { + u8 highest_shaper_bs = MLXSW_REG_QEEC_HIGHEST_SHAPER_BS; + + dev_err(mlxsw_sp->bus_info->dev, + "spectrum: TBF: invalid burst size of %u, must be a power of two between %u and %u", + p->max_size, + mlxsw_sp_qdisc_tbf_max_size(mlxsw_sp->lowest_shaper_bs), + mlxsw_sp_qdisc_tbf_max_size(highest_shaper_bs)); + return -EINVAL; + } + + return 0; +} + +static int +mlxsw_sp_qdisc_tbf_replace(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_tbf_qopt_offload_replace_params *p = params; + u64 rate_kbps = mlxsw_sp_qdisc_tbf_rate_kbps(p); + u8 burst_size; + int err; + + err = mlxsw_sp_qdisc_tbf_bs(mlxsw_sp_port, p->max_size, &burst_size); + if (WARN_ON_ONCE(err)) + /* check_params above was supposed to reject this value. */ + return -EINVAL; + + /* Configure subgroup shaper, so that both UC and MC traffic is subject + * to shaping. That is unlike RED, however UC queue lengths are going to + * be different than MC ones due to different pool and quota + * configurations, so the configuration is not applicable. For shaper on + * the other hand, subjecting the overall stream to the configured + * shaper makes sense. Also note that that is what we do for + * ieee_setmaxrate(). + */ + return mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HR_SUBGROUP, + mlxsw_sp_qdisc->tclass_num, 0, + rate_kbps, burst_size); +} + +static void +mlxsw_sp_qdisc_tbf_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_tbf_qopt_offload_replace_params *p = params; + + mlxsw_sp_qdisc_leaf_unoffload(mlxsw_sp_port, mlxsw_sp_qdisc, p->qstats); +} + +static int +mlxsw_sp_qdisc_get_tbf_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr) +{ + mlxsw_sp_qdisc_get_tc_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + stats_ptr); + return 0; +} + +static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_tbf = { + .type = MLXSW_SP_QDISC_TBF, + .check_params = mlxsw_sp_qdisc_tbf_check_params, + .replace = mlxsw_sp_qdisc_tbf_replace, + .unoffload = mlxsw_sp_qdisc_tbf_unoffload, + .destroy = mlxsw_sp_qdisc_tbf_destroy, + .get_stats = mlxsw_sp_qdisc_get_tbf_stats, + .clean_stats = mlxsw_sp_setup_tc_qdisc_leaf_clean_stats, +}; + +int mlxsw_sp_setup_tc_tbf(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_tbf_qopt_offload *p) +{ + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc; + + mlxsw_sp_qdisc = mlxsw_sp_qdisc_find(mlxsw_sp_port, p->parent, false); + if (!mlxsw_sp_qdisc) + return -EOPNOTSUPP; + + if (p->command == TC_TBF_REPLACE) + return mlxsw_sp_qdisc_replace(mlxsw_sp_port, p->handle, + mlxsw_sp_qdisc, + &mlxsw_sp_qdisc_ops_tbf, + &p->replace_params); + + if (!mlxsw_sp_qdisc_compare(mlxsw_sp_qdisc, p->handle, + MLXSW_SP_QDISC_TBF)) + return -EOPNOTSUPP; + + switch (p->command) { + case TC_TBF_DESTROY: + return mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); + case TC_TBF_STATS: + return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + &p->stats); + default: + return -EOPNOTSUPP; + } +} + static int __mlxsw_sp_qdisc_ets_destroy(struct mlxsw_sp_port *mlxsw_sp_port) { @@ -612,37 +880,23 @@ mlxsw_sp_qdisc_get_prio_stats(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, struct tc_qopt_offload_stats *stats_ptr) { - u64 tx_bytes, tx_packets, drops = 0, backlog = 0; - struct mlxsw_sp_qdisc_stats *stats_base; - struct mlxsw_sp_port_xstats *xstats; - struct rtnl_link_stats64 *stats; + struct mlxsw_sp_qdisc *tc_qdisc; + u64 tx_packets = 0; + u64 tx_bytes = 0; + u64 backlog = 0; + u64 drops = 0; int i; - xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; - stats = &mlxsw_sp_port->periodic_hw_stats.stats; - stats_base = &mlxsw_sp_qdisc->stats_base; - - tx_bytes = stats->tx_bytes - stats_base->tx_bytes; - tx_packets = stats->tx_packets - stats_base->tx_packets; - for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - drops += xstats->tail_drop[i]; - drops += xstats->wred_drop[i]; - backlog += xstats->backlog[i]; + tc_qdisc = &mlxsw_sp_port->tclass_qdiscs[i]; + mlxsw_sp_qdisc_collect_tc_stats(mlxsw_sp_port, tc_qdisc, + &tx_bytes, &tx_packets, + &drops, &backlog); } - drops = drops - stats_base->drops; - _bstats_update(stats_ptr->bstats, tx_bytes, tx_packets); - stats_ptr->qstats->drops += drops; - stats_ptr->qstats->backlog += - mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, - backlog) - - mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, - stats_base->backlog); - stats_base->backlog = backlog; - stats_base->drops += drops; - stats_base->tx_bytes += tx_bytes; - stats_base->tx_packets += tx_packets; + mlxsw_sp_qdisc_update_stats(mlxsw_sp_port->mlxsw_sp, mlxsw_sp_qdisc, + tx_bytes, tx_packets, drops, backlog, + stats_ptr); return 0; } @@ -664,7 +918,7 @@ mlxsw_sp_setup_tc_qdisc_prio_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port, stats_base->drops = 0; for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - stats_base->drops += xstats->tail_drop[i]; + stats_base->drops += mlxsw_sp_xstats_tail_drop(xstats, i); stats_base->drops += xstats->wred_drop[i]; } @@ -767,6 +1021,13 @@ __mlxsw_sp_qdisc_ets_graft(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_port->tclass_qdiscs[tclass_num].handle == child_handle) return 0; + if (!child_handle) { + /* This is an invisible FIFO replacing the original Qdisc. + * Ignore it--the original Qdisc's destroy will follow. + */ + return 0; + } + /* See if the grafted qdisc is already offloaded on any tclass. If so, * unoffload it. */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 7f70aa799064..ce707723f8cf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3235,20 +3235,6 @@ mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp, return 0; } -static void -mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry, - enum mlxsw_reg_ralue_op op, int err); - -static void -mlxsw_sp_nexthop_fib_entries_refresh(struct mlxsw_sp_nexthop_group *nh_grp) -{ - enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_WRITE; - struct mlxsw_sp_fib_entry *fib_entry; - - list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) - mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, 0); -} - static void mlxsw_sp_adj_grp_size_round_up(u16 *p_adj_grp_size) { /* Valid sizes for an adjacency group are: @@ -3352,6 +3338,73 @@ mlxsw_sp_nexthop_group_rebalance(struct mlxsw_sp_nexthop_group *nh_grp) } } +static struct mlxsw_sp_nexthop * +mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, + const struct mlxsw_sp_rt6 *mlxsw_sp_rt6); + +static void +mlxsw_sp_nexthop4_group_offload_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + int i; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + + if (nh->offloaded) + nh->key.fib_nh->fib_nh_flags |= RTNH_F_OFFLOAD; + else + nh->key.fib_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void +__mlxsw_sp_nexthop6_group_offload_refresh(struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; + struct mlxsw_sp_nexthop *nh; + + nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); + if (nh && nh->offloaded) + fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD; + else + fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void +mlxsw_sp_nexthop6_group_offload_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + + /* Unfortunately, in IPv6 the route and the nexthop are described by + * the same struct, so we need to iterate over all the routes using the + * nexthop group and set / clear the offload indication for them. + */ + list_for_each_entry(fib6_entry, &nh_grp->fib_list, + common.nexthop_group_node) + __mlxsw_sp_nexthop6_group_offload_refresh(nh_grp, fib6_entry); +} + +static void +mlxsw_sp_nexthop_group_offload_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + switch (mlxsw_sp_nexthop_group_type(nh_grp)) { + case AF_INET: + mlxsw_sp_nexthop4_group_offload_refresh(mlxsw_sp, nh_grp); + break; + case AF_INET6: + mlxsw_sp_nexthop6_group_offload_refresh(mlxsw_sp, nh_grp); + break; + } +} + static void mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp) @@ -3425,6 +3478,8 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, goto set_trap; } + mlxsw_sp_nexthop_group_offload_refresh(mlxsw_sp, nh_grp); + if (!old_adj_index_valid) { /* The trap was set for fib entries, so we have to call * fib entry update to unset it and use adjacency index. @@ -3446,9 +3501,6 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, goto set_trap; } - /* Offload state within the group changed, so update the flags. */ - mlxsw_sp_nexthop_fib_entries_refresh(nh_grp); - return; set_trap: @@ -3461,6 +3513,7 @@ set_trap: err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); if (err) dev_warn(mlxsw_sp->bus_info->dev, "Failed to set traps for fib entries.\n"); + mlxsw_sp_nexthop_group_offload_refresh(mlxsw_sp, nh_grp); if (old_adj_index_valid) mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ, nh_grp->ecmp_size, nh_grp->adj_index); @@ -4043,131 +4096,128 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, } static void -mlxsw_sp_fib4_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +mlxsw_sp_fib4_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { - struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; - int i; - - if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL || - fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_BLACKHOLE || - fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP || - fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP) { - nh_grp->nexthops->key.fib_nh->fib_nh_flags |= RTNH_F_OFFLOAD; - return; - } - - for (i = 0; i < nh_grp->count; i++) { - struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + struct fib_info *fi = mlxsw_sp_nexthop4_group_fi(fib_entry->nh_group); + u32 *p_dst = (u32 *) fib_entry->fib_node->key.addr; + int dst_len = fib_entry->fib_node->key.prefix_len; + struct mlxsw_sp_fib4_entry *fib4_entry; + struct fib_rt_info fri; + bool should_offload; - if (nh->offloaded) - nh->key.fib_nh->fib_nh_flags |= RTNH_F_OFFLOAD; - else - nh->key.fib_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; - } + should_offload = mlxsw_sp_fib_entry_should_offload(fib_entry); + fib4_entry = container_of(fib_entry, struct mlxsw_sp_fib4_entry, + common); + fri.fi = fi; + fri.tb_id = fib4_entry->tb_id; + fri.dst = cpu_to_be32(*p_dst); + fri.dst_len = dst_len; + fri.tos = fib4_entry->tos; + fri.type = fib4_entry->type; + fri.offload = should_offload; + fri.trap = !should_offload; + fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri); } static void -mlxsw_sp_fib4_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +mlxsw_sp_fib4_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { - struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; - int i; - - if (!list_is_singular(&nh_grp->fib_list)) - return; - - for (i = 0; i < nh_grp->count; i++) { - struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + struct fib_info *fi = mlxsw_sp_nexthop4_group_fi(fib_entry->nh_group); + u32 *p_dst = (u32 *) fib_entry->fib_node->key.addr; + int dst_len = fib_entry->fib_node->key.prefix_len; + struct mlxsw_sp_fib4_entry *fib4_entry; + struct fib_rt_info fri; - nh->key.fib_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; - } + fib4_entry = container_of(fib_entry, struct mlxsw_sp_fib4_entry, + common); + fri.fi = fi; + fri.tb_id = fib4_entry->tb_id; + fri.dst = cpu_to_be32(*p_dst); + fri.dst_len = dst_len; + fri.tos = fib4_entry->tos; + fri.type = fib4_entry->type; + fri.offload = false; + fri.trap = false; + fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri); } static void -mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +mlxsw_sp_fib6_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + bool should_offload; + should_offload = mlxsw_sp_fib_entry_should_offload(fib_entry); + + /* In IPv6 a multipath route is represented using multiple routes, so + * we need to set the flags on all of them. + */ fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); - - if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL || - fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_BLACKHOLE) { - list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD; - return; - } - - list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; - struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; - struct mlxsw_sp_nexthop *nh; - - nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); - if (nh && nh->offloaded) - fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD; - else - fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; - } + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) + fib6_info_hw_flags_set(mlxsw_sp_rt6->rt, should_offload, + !should_offload); } static void -mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +mlxsw_sp_fib6_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_rt6 *mlxsw_sp_rt6; fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); - list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - struct fib6_info *rt = mlxsw_sp_rt6->rt; - - rt->fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; - } + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) + fib6_info_hw_flags_set(mlxsw_sp_rt6->rt, false, false); } -static void mlxsw_sp_fib_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +static void +mlxsw_sp_fib_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { switch (fib_entry->fib_node->fib->proto) { case MLXSW_SP_L3_PROTO_IPV4: - mlxsw_sp_fib4_entry_offload_set(fib_entry); + mlxsw_sp_fib4_entry_hw_flags_set(mlxsw_sp, fib_entry); break; case MLXSW_SP_L3_PROTO_IPV6: - mlxsw_sp_fib6_entry_offload_set(fib_entry); + mlxsw_sp_fib6_entry_hw_flags_set(mlxsw_sp, fib_entry); break; } } static void -mlxsw_sp_fib_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +mlxsw_sp_fib_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) { switch (fib_entry->fib_node->fib->proto) { case MLXSW_SP_L3_PROTO_IPV4: - mlxsw_sp_fib4_entry_offload_unset(fib_entry); + mlxsw_sp_fib4_entry_hw_flags_clear(mlxsw_sp, fib_entry); break; case MLXSW_SP_L3_PROTO_IPV6: - mlxsw_sp_fib6_entry_offload_unset(fib_entry); + mlxsw_sp_fib6_entry_hw_flags_clear(mlxsw_sp, fib_entry); break; } } static void -mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry, - enum mlxsw_reg_ralue_op op, int err) +mlxsw_sp_fib_entry_hw_flags_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) { switch (op) { - case MLXSW_REG_RALUE_OP_WRITE_DELETE: - return mlxsw_sp_fib_entry_offload_unset(fib_entry); case MLXSW_REG_RALUE_OP_WRITE_WRITE: - if (err) - return; - if (mlxsw_sp_fib_entry_should_offload(fib_entry)) - mlxsw_sp_fib_entry_offload_set(fib_entry); - else - mlxsw_sp_fib_entry_offload_unset(fib_entry); - return; + mlxsw_sp_fib_entry_hw_flags_set(mlxsw_sp, fib_entry); + break; + case MLXSW_REG_RALUE_OP_WRITE_DELETE: + mlxsw_sp_fib_entry_hw_flags_clear(mlxsw_sp, fib_entry); + break; default: - return; + break; } } @@ -4394,7 +4444,10 @@ static int mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp, { int err = __mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry, op); - mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, err); + if (err) + return err; + + mlxsw_sp_fib_entry_hw_flags_refresh(mlxsw_sp, fib_entry, op); return err; } @@ -4830,7 +4883,7 @@ mlxsw_sp_router_fib4_replace(struct mlxsw_sp *mlxsw_sp, if (!replaced) return 0; - mlxsw_sp_fib_entry_offload_unset(replaced); + mlxsw_sp_fib_entry_hw_flags_clear(mlxsw_sp, replaced); fib4_replaced = container_of(replaced, struct mlxsw_sp_fib4_entry, common); mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_replaced); @@ -5113,6 +5166,11 @@ static int mlxsw_sp_nexthop6_group_get(struct mlxsw_sp *mlxsw_sp, &nh_grp->fib_list); fib6_entry->common.nh_group = nh_grp; + /* The route and the nexthop are described by the same struct, so we + * need to the update the nexthop offload indication for the new route. + */ + __mlxsw_sp_nexthop6_group_offload_refresh(nh_grp, fib6_entry); + return 0; } @@ -5393,7 +5451,7 @@ static int mlxsw_sp_router_fib6_replace(struct mlxsw_sp *mlxsw_sp, if (!replaced) return 0; - mlxsw_sp_fib_entry_offload_unset(replaced); + mlxsw_sp_fib_entry_hw_flags_clear(mlxsw_sp, replaced); fib6_replaced = container_of(replaced, struct mlxsw_sp_fib6_entry, common); mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_replaced); @@ -7733,8 +7791,18 @@ mlxsw_sp_ipip_config_tigcr(struct mlxsw_sp *mlxsw_sp) static int mlxsw_sp_ipips_init(struct mlxsw_sp *mlxsw_sp) { + int err; + mlxsw_sp->router->ipip_ops_arr = mlxsw_sp_ipip_ops_arr; INIT_LIST_HEAD(&mlxsw_sp->router->ipip_list); + + err = mlxsw_sp_ipip_ecn_encap_init(mlxsw_sp); + if (err) + return err; + err = mlxsw_sp_ipip_ecn_decap_init(mlxsw_sp); + if (err) + return err; + return mlxsw_sp_ipip_config_tigcr(mlxsw_sp); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index cc1de91e8217..c9b94f435cdd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -104,4 +104,7 @@ static inline bool mlxsw_sp_l3addr_eq(const union mlxsw_sp_l3addr *addr1, return !memcmp(addr1, addr2, sizeof(*addr1)); } +int mlxsw_sp_ipip_ecn_encap_init(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_ipip_ecn_decap_init(struct mlxsw_sp *mlxsw_sp); + #endif /* _MLXSW_ROUTER_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 200d324e6d99..0cdd7954a085 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -748,33 +748,50 @@ static bool mlxsw_sp_span_is_egress_mirror(struct mlxsw_sp_port *port) return false; } -static int mlxsw_sp_span_mtu_to_buffsize(const struct mlxsw_sp *mlxsw_sp, - int mtu) +static int +mlxsw_sp_span_port_buffsize_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) { - return mlxsw_sp_bytes_cells(mlxsw_sp, mtu * 5 / 2) + 1; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char sbib_pl[MLXSW_REG_SBIB_LEN]; + u32 buffsize; + u32 speed; + int err; + + err = mlxsw_sp_port_speed_get(mlxsw_sp_port, &speed); + if (err) + return err; + if (speed == SPEED_UNKNOWN) + speed = 0; + + buffsize = mlxsw_sp_span_buffsize_get(mlxsw_sp, speed, mtu); + mlxsw_reg_sbib_pack(sbib_pl, mlxsw_sp_port->local_port, buffsize); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); } int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu) { - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; - char sbib_pl[MLXSW_REG_SBIB_LEN]; - int err; - /* If port is egress mirrored, the shared buffer size should be * updated according to the mtu value */ - if (mlxsw_sp_span_is_egress_mirror(port)) { - u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp, mtu); + if (mlxsw_sp_span_is_egress_mirror(port)) + return mlxsw_sp_span_port_buffsize_update(port, mtu); + return 0; +} - mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize); - err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); - if (err) { - netdev_err(port->dev, "Could not update shared buffer for mirroring\n"); - return err; - } - } +void mlxsw_sp_span_speed_update_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mlxsw_sp_port *mlxsw_sp_port; - return 0; + mlxsw_sp_port = container_of(dwork, struct mlxsw_sp_port, + span.speed_update_dw); + + /* If port is egress mirrored, the shared buffer size should be + * updated according to the speed value. + */ + if (mlxsw_sp_span_is_egress_mirror(mlxsw_sp_port)) + mlxsw_sp_span_port_buffsize_update(mlxsw_sp_port, + mlxsw_sp_port->dev->mtu); } static struct mlxsw_sp_span_inspected_port * @@ -836,15 +853,9 @@ mlxsw_sp_span_inspected_port_add(struct mlxsw_sp_port *port, /* if it is an egress SPAN, bind a shared buffer to it */ if (type == MLXSW_SP_SPAN_EGRESS) { - u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp, - port->dev->mtu); - - mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize); - err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); - if (err) { - netdev_err(port->dev, "Could not create shared buffer for mirroring\n"); + err = mlxsw_sp_span_port_buffsize_update(port, port->dev->mtu); + if (err) return err; - } } if (bind) { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index 5e04252f2a11..59724335525f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -74,5 +74,6 @@ void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_span_entry *span_entry); int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu); +void mlxsw_sp_span_speed_update_work(struct work_struct *work); #endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index e0d7c49ffae0..60205aa3f6a5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -9,6 +9,20 @@ #include "reg.h" #include "spectrum.h" +/* All driver-specific traps must be documented in + * Documentation/networking/devlink/mlxsw.rst + */ +enum { + DEVLINK_MLXSW_TRAP_ID_BASE = DEVLINK_TRAP_GENERIC_ID_MAX, + DEVLINK_MLXSW_TRAP_ID_IRIF_DISABLED, + DEVLINK_MLXSW_TRAP_ID_ERIF_DISABLED, +}; + +#define DEVLINK_MLXSW_TRAP_NAME_IRIF_DISABLED \ + "irif_disabled" +#define DEVLINK_MLXSW_TRAP_NAME_ERIF_DISABLED \ + "erif_disabled" + #define MLXSW_SP_TRAP_METADATA DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT static void mlxsw_sp_rx_drop_listener(struct sk_buff *skb, u8 local_port, @@ -21,6 +35,12 @@ static void mlxsw_sp_rx_exception_listener(struct sk_buff *skb, u8 local_port, DEVLINK_TRAP_GROUP_GENERIC(_group_id), \ MLXSW_SP_TRAP_METADATA) +#define MLXSW_SP_TRAP_DRIVER_DROP(_id, _group_id) \ + DEVLINK_TRAP_DRIVER(DROP, DROP, DEVLINK_MLXSW_TRAP_ID_##_id, \ + DEVLINK_MLXSW_TRAP_NAME_##_id, \ + DEVLINK_TRAP_GROUP_GENERIC(_group_id), \ + MLXSW_SP_TRAP_METADATA) + #define MLXSW_SP_TRAP_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_GENERIC(EXCEPTION, TRAP, _id, \ DEVLINK_TRAP_GROUP_GENERIC(_group_id), \ @@ -58,6 +78,11 @@ static struct devlink_trap mlxsw_sp_traps_arr[] = { MLXSW_SP_TRAP_EXCEPTION(UNRESOLVED_NEIGH, L3_DROPS), MLXSW_SP_TRAP_EXCEPTION(IPV4_LPM_UNICAST_MISS, L3_DROPS), MLXSW_SP_TRAP_EXCEPTION(IPV6_LPM_UNICAST_MISS, L3_DROPS), + MLXSW_SP_TRAP_DRIVER_DROP(IRIF_DISABLED, L3_DROPS), + MLXSW_SP_TRAP_DRIVER_DROP(ERIF_DISABLED, L3_DROPS), + MLXSW_SP_TRAP_DROP(NON_ROUTABLE, L3_DROPS), + MLXSW_SP_TRAP_EXCEPTION(DECAP_ERROR, TUNNEL_DROPS), + MLXSW_SP_TRAP_DROP(OVERLAY_SMAC_MC, TUNNEL_DROPS), }; static struct mlxsw_listener mlxsw_sp_listeners_arr[] = { @@ -90,6 +115,15 @@ static struct mlxsw_listener mlxsw_sp_listeners_arr[] = { TRAP_EXCEPTION_TO_CPU), MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER_LPM6, ROUTER_EXP, TRAP_EXCEPTION_TO_CPU), + MLXSW_SP_RXL_DISCARD(ROUTER_IRIF_EN, L3_DISCARDS), + MLXSW_SP_RXL_DISCARD(ROUTER_ERIF_EN, L3_DISCARDS), + MLXSW_SP_RXL_DISCARD(NON_ROUTABLE, L3_DISCARDS), + MLXSW_SP_RXL_EXCEPTION(DECAP_ECN0, ROUTER_EXP, TRAP_EXCEPTION_TO_CPU), + MLXSW_SP_RXL_EXCEPTION(IPIP_DECAP_ERROR, ROUTER_EXP, + TRAP_EXCEPTION_TO_CPU), + MLXSW_SP_RXL_EXCEPTION(DISCARD_DEC_PKT, TUNNEL_DISCARDS, + TRAP_EXCEPTION_TO_CPU), + MLXSW_SP_RXL_DISCARD(OVERLAY_SMAC_MC, TUNNEL_DISCARDS), }; /* Mapping between hardware trap and devlink trap. Multiple hardware traps can @@ -123,6 +157,13 @@ static u16 mlxsw_sp_listener_devlink_map[] = { DEVLINK_TRAP_GENERIC_ID_UNRESOLVED_NEIGH, DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, + DEVLINK_MLXSW_TRAP_ID_IRIF_DISABLED, + DEVLINK_MLXSW_TRAP_ID_ERIF_DISABLED, + DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, }; static int mlxsw_sp_rx_listener(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, @@ -304,8 +345,9 @@ mlxsw_sp_trap_group_policer_init(struct mlxsw_sp *mlxsw_sp, u32 rate; switch (group->id) { - case DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS:/* fall through */ - case DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS: + case DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS: /* fall through */ + case DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS: /* fall through */ + case DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS: policer_id = MLXSW_SP_DISCARD_POLICER_ID; ir_units = MLXSW_REG_QPCR_IR_UNITS_M; is_bytes = false; @@ -342,6 +384,12 @@ __mlxsw_sp_trap_group_init(struct mlxsw_sp *mlxsw_sp, priority = 0; tc = 1; break; + case DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS: + group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_TUNNEL_DISCARDS; + policer_id = MLXSW_SP_DISCARD_POLICER_ID; + priority = 0; + tc = 1; + break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c index de6cb22f68b1..f0e98ec8f1ee 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -299,22 +299,17 @@ static netdev_tx_t mlxsw_sx_port_xmit(struct sk_buff *skb, u64 len; int err; + if (skb_cow_head(skb, MLXSW_TXHDR_LEN)) { + this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + memset(skb->cb, 0, sizeof(struct mlxsw_skb_cb)); if (mlxsw_core_skb_transmit_busy(mlxsw_sx->core, &tx_info)) return NETDEV_TX_BUSY; - if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) { - struct sk_buff *skb_orig = skb; - - skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN); - if (!skb) { - this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped); - dev_kfree_skb_any(skb_orig); - return NETDEV_TX_OK; - } - dev_consume_skb_any(skb_orig); - } mlxsw_sx_txhdr_construct(skb, &tx_info); /* TX header is consumed by HW on the way so we shouldn't count its * bytes as being sent. diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 0c1c142bb6b0..12e1fa998d42 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -67,6 +67,7 @@ enum { MLXSW_TRAP_ID_NVE_ENCAP_ARP = 0xBD, MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6, MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7, + MLXSW_TRAP_ID_DISCARD_NON_ROUTABLE = 0x11A, MLXSW_TRAP_ID_DISCARD_ROUTER2 = 0x130, MLXSW_TRAP_ID_DISCARD_ROUTER3 = 0x131, MLXSW_TRAP_ID_DISCARD_ING_PACKET_SMAC_MC = 0x140, @@ -80,12 +81,20 @@ enum { MLXSW_TRAP_ID_DISCARD_ING_ROUTER_UC_DIP_MC_DMAC = 0x161, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LB = 0x162, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_MC = 0x163, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_CLASS_E = 0x164, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_LB = 0x165, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_CORRUPTED_IP_HDR = 0x167, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_MC_DMAC = 0x168, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_DIP = 0x169, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_SIP_BC = 0x16A, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_DIP_LOCAL_NET = 0x16B, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LINK_LOCAL = 0x16C, + MLXSW_TRAP_ID_DISCARD_ROUTER_IRIF_EN = 0x178, + MLXSW_TRAP_ID_DISCARD_ROUTER_ERIF_EN = 0x179, MLXSW_TRAP_ID_DISCARD_ROUTER_LPM4 = 0x17B, MLXSW_TRAP_ID_DISCARD_ROUTER_LPM6 = 0x17C, + MLXSW_TRAP_ID_DISCARD_DEC_PKT = 0x188, + MLXSW_TRAP_ID_DISCARD_OVERLAY_SMAC_MC = 0x190, MLXSW_TRAP_ID_DISCARD_IPV6_MC_DIP_RESERVED_SCOPE = 0x1B0, MLXSW_TRAP_ID_DISCARD_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE = 0x1B1, MLXSW_TRAP_ID_ACL0 = 0x1C0, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 985b46d7e3d1..86d543ab1ab9 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -500,13 +500,14 @@ EXPORT_SYMBOL(ocelot_port_enable); static int ocelot_port_open(struct net_device *dev) { struct ocelot_port_private *priv = netdev_priv(dev); - struct ocelot *ocelot = priv->port.ocelot; + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; int port = priv->chip_port; int err; if (priv->serdes) { err = phy_set_mode_ext(priv->serdes, PHY_MODE_ETHERNET, - priv->phy_mode); + ocelot_port->phy_mode); if (err) { netdev_err(dev, "Could not set mode of SerDes\n"); return err; @@ -514,7 +515,7 @@ static int ocelot_port_open(struct net_device *dev) } err = phy_connect_direct(dev, priv->phy, &ocelot_port_adjust_link, - priv->phy_mode); + ocelot_port->phy_mode); if (err) { netdev_err(dev, "Could not attach to PHY\n"); return err; diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index c259114c48fd..04372ba72fec 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -18,11 +18,11 @@ #include <linux/ptp_clock_kernel.h> #include <linux/regmap.h> +#include <soc/mscc/ocelot_qsys.h> #include <soc/mscc/ocelot_sys.h> +#include <soc/mscc/ocelot_dev.h> +#include <soc/mscc/ocelot_ana.h> #include <soc/mscc/ocelot.h> -#include "ocelot_ana.h" -#include "ocelot_dev.h" -#include "ocelot_qsys.h" #include "ocelot_rew.h" #include "ocelot_qs.h" #include "ocelot_tc.h" @@ -68,7 +68,6 @@ struct ocelot_port_private { u8 vlan_aware; - phy_interface_t phy_mode; struct phy *serdes; struct ocelot_port_tc tc; diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index 2da8eee27e98..b38820849faa 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -402,9 +402,9 @@ static int mscc_ocelot_probe(struct platform_device *pdev) of_get_phy_mode(portnp, &phy_mode); - priv->phy_mode = phy_mode; + ocelot_port->phy_mode = phy_mode; - switch (priv->phy_mode) { + switch (ocelot_port->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; case PHY_INTERFACE_MODE_SGMII: diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index c979f38a2e0c..2ee0d0be113a 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -2892,7 +2892,7 @@ drop: static netdev_tx_t myri10ge_sw_tso(struct sk_buff *skb, struct net_device *dev) { - struct sk_buff *segs, *curr; + struct sk_buff *segs, *curr, *next; struct myri10ge_priv *mgp = netdev_priv(dev); struct myri10ge_slice_state *ss; netdev_tx_t status; @@ -2901,10 +2901,8 @@ static netdev_tx_t myri10ge_sw_tso(struct sk_buff *skb, if (IS_ERR(segs)) goto drop; - while (segs) { - curr = segs; - segs = segs->next; - curr->next = NULL; + skb_list_walk_safe(segs, curr, next) { + skb_mark_not_on_list(curr); status = myri10ge_xmit(curr, dev); if (status != 0) { dev_kfree_skb_any(curr); diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c index fdebc8598b22..31be3ba66877 100644 --- a/drivers/net/ethernet/natsemi/sonic.c +++ b/drivers/net/ethernet/natsemi/sonic.c @@ -64,6 +64,8 @@ static int sonic_open(struct net_device *dev) netif_dbg(lp, ifup, dev, "%s: initializing sonic driver\n", __func__); + spin_lock_init(&lp->lock); + for (i = 0; i < SONIC_NUM_RRS; i++) { struct sk_buff *skb = netdev_alloc_skb(dev, SONIC_RBSIZE + 2); if (skb == NULL) { @@ -114,6 +116,24 @@ static int sonic_open(struct net_device *dev) return 0; } +/* Wait for the SONIC to become idle. */ +static void sonic_quiesce(struct net_device *dev, u16 mask) +{ + struct sonic_local * __maybe_unused lp = netdev_priv(dev); + int i; + u16 bits; + + for (i = 0; i < 1000; ++i) { + bits = SONIC_READ(SONIC_CMD) & mask; + if (!bits) + return; + if (irqs_disabled() || in_interrupt()) + udelay(20); + else + usleep_range(100, 200); + } + WARN_ONCE(1, "command deadline expired! 0x%04x\n", bits); +} /* * Close the SONIC device @@ -130,6 +150,9 @@ static int sonic_close(struct net_device *dev) /* * stop the SONIC, disable interrupts */ + SONIC_WRITE(SONIC_CMD, SONIC_CR_RXDIS); + sonic_quiesce(dev, SONIC_CR_ALL); + SONIC_WRITE(SONIC_IMR, 0); SONIC_WRITE(SONIC_ISR, 0x7fff); SONIC_WRITE(SONIC_CMD, SONIC_CR_RST); @@ -169,6 +192,9 @@ static void sonic_tx_timeout(struct net_device *dev, unsigned int txqueue) * put the Sonic into software-reset mode and * disable all interrupts before releasing DMA buffers */ + SONIC_WRITE(SONIC_CMD, SONIC_CR_RXDIS); + sonic_quiesce(dev, SONIC_CR_ALL); + SONIC_WRITE(SONIC_IMR, 0); SONIC_WRITE(SONIC_ISR, 0x7fff); SONIC_WRITE(SONIC_CMD, SONIC_CR_RST); @@ -206,8 +232,6 @@ static void sonic_tx_timeout(struct net_device *dev, unsigned int txqueue) * wake the tx queue * Concurrently with all of this, the SONIC is potentially writing to * the status flags of the TDs. - * Until some mutual exclusion is added, this code will not work with SMP. However, - * MIPS Jazz machines and m68k Macs were all uni-processor machines. */ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) @@ -215,7 +239,8 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) struct sonic_local *lp = netdev_priv(dev); dma_addr_t laddr; int length; - int entry = lp->next_tx; + int entry; + unsigned long flags; netif_dbg(lp, tx_queued, dev, "%s: skb=%p\n", __func__, skb); @@ -237,6 +262,10 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + spin_lock_irqsave(&lp->lock, flags); + + entry = lp->next_tx; + sonic_tda_put(dev, entry, SONIC_TD_STATUS, 0); /* clear status */ sonic_tda_put(dev, entry, SONIC_TD_FRAG_COUNT, 1); /* single fragment */ sonic_tda_put(dev, entry, SONIC_TD_PKTSIZE, length); /* length of packet */ @@ -246,10 +275,6 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) sonic_tda_put(dev, entry, SONIC_TD_LINK, sonic_tda_get(dev, entry, SONIC_TD_LINK) | SONIC_EOL); - /* - * Must set tx_skb[entry] only after clearing status, and - * before clearing EOL and before stopping queue - */ wmb(); lp->tx_len[entry] = length; lp->tx_laddr[entry] = laddr; @@ -272,6 +297,8 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) SONIC_WRITE(SONIC_CMD, SONIC_CR_TXP); + spin_unlock_irqrestore(&lp->lock, flags); + return NETDEV_TX_OK; } @@ -284,15 +311,28 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) struct net_device *dev = dev_id; struct sonic_local *lp = netdev_priv(dev); int status; + unsigned long flags; + + /* The lock has two purposes. Firstly, it synchronizes sonic_interrupt() + * with sonic_send_packet() so that the two functions can share state. + * Secondly, it makes sonic_interrupt() re-entrant, as that is required + * by macsonic which must use two IRQs with different priority levels. + */ + spin_lock_irqsave(&lp->lock, flags); + + status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT; + if (!status) { + spin_unlock_irqrestore(&lp->lock, flags); - if (!(status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT)) return IRQ_NONE; + } do { + SONIC_WRITE(SONIC_ISR, status); /* clear the interrupt(s) */ + if (status & SONIC_INT_PKTRX) { netif_dbg(lp, intr, dev, "%s: packet rx\n", __func__); sonic_rx(dev); /* got packet(s) */ - SONIC_WRITE(SONIC_ISR, SONIC_INT_PKTRX); /* clear the interrupt */ } if (status & SONIC_INT_TXDN) { @@ -300,11 +340,12 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) int td_status; int freed_some = 0; - /* At this point, cur_tx is the index of a TD that is one of: - * unallocated/freed (status set & tx_skb[entry] clear) - * allocated and sent (status set & tx_skb[entry] set ) - * allocated and not yet sent (status clear & tx_skb[entry] set ) - * still being allocated by sonic_send_packet (status clear & tx_skb[entry] clear) + /* The state of a Transmit Descriptor may be inferred + * from { tx_skb[entry], td_status } as follows. + * { clear, clear } => the TD has never been used + * { set, clear } => the TD was handed to SONIC + * { set, set } => the TD was handed back + * { clear, set } => the TD is available for re-use */ netif_dbg(lp, intr, dev, "%s: tx done\n", __func__); @@ -313,18 +354,19 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) if ((td_status = sonic_tda_get(dev, entry, SONIC_TD_STATUS)) == 0) break; - if (td_status & 0x0001) { + if (td_status & SONIC_TCR_PTX) { lp->stats.tx_packets++; lp->stats.tx_bytes += sonic_tda_get(dev, entry, SONIC_TD_PKTSIZE); } else { - lp->stats.tx_errors++; - if (td_status & 0x0642) + if (td_status & (SONIC_TCR_EXD | + SONIC_TCR_EXC | SONIC_TCR_BCM)) lp->stats.tx_aborted_errors++; - if (td_status & 0x0180) + if (td_status & + (SONIC_TCR_NCRS | SONIC_TCR_CRLS)) lp->stats.tx_carrier_errors++; - if (td_status & 0x0020) + if (td_status & SONIC_TCR_OWC) lp->stats.tx_window_errors++; - if (td_status & 0x0004) + if (td_status & SONIC_TCR_FU) lp->stats.tx_fifo_errors++; } @@ -346,7 +388,6 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) if (freed_some || lp->tx_skb[entry] == NULL) netif_wake_queue(dev); /* The ring is no longer full */ lp->cur_tx = entry; - SONIC_WRITE(SONIC_ISR, SONIC_INT_TXDN); /* clear the interrupt */ } /* @@ -355,42 +396,37 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) if (status & SONIC_INT_RFO) { netif_dbg(lp, rx_err, dev, "%s: rx fifo overrun\n", __func__); - lp->stats.rx_fifo_errors++; - SONIC_WRITE(SONIC_ISR, SONIC_INT_RFO); /* clear the interrupt */ } if (status & SONIC_INT_RDE) { netif_dbg(lp, rx_err, dev, "%s: rx descriptors exhausted\n", __func__); - lp->stats.rx_dropped++; - SONIC_WRITE(SONIC_ISR, SONIC_INT_RDE); /* clear the interrupt */ } if (status & SONIC_INT_RBAE) { netif_dbg(lp, rx_err, dev, "%s: rx buffer area exceeded\n", __func__); - lp->stats.rx_dropped++; - SONIC_WRITE(SONIC_ISR, SONIC_INT_RBAE); /* clear the interrupt */ } /* counter overruns; all counters are 16bit wide */ - if (status & SONIC_INT_FAE) { + if (status & SONIC_INT_FAE) lp->stats.rx_frame_errors += 65536; - SONIC_WRITE(SONIC_ISR, SONIC_INT_FAE); /* clear the interrupt */ - } - if (status & SONIC_INT_CRC) { + if (status & SONIC_INT_CRC) lp->stats.rx_crc_errors += 65536; - SONIC_WRITE(SONIC_ISR, SONIC_INT_CRC); /* clear the interrupt */ - } - if (status & SONIC_INT_MP) { + if (status & SONIC_INT_MP) lp->stats.rx_missed_errors += 65536; - SONIC_WRITE(SONIC_ISR, SONIC_INT_MP); /* clear the interrupt */ - } /* transmit error */ if (status & SONIC_INT_TXER) { - if (SONIC_READ(SONIC_TCR) & SONIC_TCR_FU) - netif_dbg(lp, tx_err, dev, "%s: tx fifo underrun\n", - __func__); - SONIC_WRITE(SONIC_ISR, SONIC_INT_TXER); /* clear the interrupt */ + u16 tcr = SONIC_READ(SONIC_TCR); + + netif_dbg(lp, tx_err, dev, "%s: TXER intr, TCR %04x\n", + __func__, tcr); + + if (tcr & (SONIC_TCR_EXD | SONIC_TCR_EXC | + SONIC_TCR_FU | SONIC_TCR_BCM)) { + /* Aborted transmission. Try again. */ + netif_stop_queue(dev); + SONIC_WRITE(SONIC_CMD, SONIC_CR_TXP); + } } /* bus retry */ @@ -400,107 +436,164 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id) /* ... to help debug DMA problems causing endless interrupts. */ /* Bounce the eth interface to turn on the interrupt again. */ SONIC_WRITE(SONIC_IMR, 0); - SONIC_WRITE(SONIC_ISR, SONIC_INT_BR); /* clear the interrupt */ } - /* load CAM done */ - if (status & SONIC_INT_LCD) - SONIC_WRITE(SONIC_ISR, SONIC_INT_LCD); /* clear the interrupt */ - } while((status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT)); + status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT; + } while (status); + + spin_unlock_irqrestore(&lp->lock, flags); + return IRQ_HANDLED; } +/* Return the array index corresponding to a given Receive Buffer pointer. */ +static int index_from_addr(struct sonic_local *lp, dma_addr_t addr, + unsigned int last) +{ + unsigned int i = last; + + do { + i = (i + 1) & SONIC_RRS_MASK; + if (addr == lp->rx_laddr[i]) + return i; + } while (i != last); + + return -ENOENT; +} + +/* Allocate and map a new skb to be used as a receive buffer. */ +static bool sonic_alloc_rb(struct net_device *dev, struct sonic_local *lp, + struct sk_buff **new_skb, dma_addr_t *new_addr) +{ + *new_skb = netdev_alloc_skb(dev, SONIC_RBSIZE + 2); + if (!*new_skb) + return false; + + if (SONIC_BUS_SCALE(lp->dma_bitmode) == 2) + skb_reserve(*new_skb, 2); + + *new_addr = dma_map_single(lp->device, skb_put(*new_skb, SONIC_RBSIZE), + SONIC_RBSIZE, DMA_FROM_DEVICE); + if (!*new_addr) { + dev_kfree_skb(*new_skb); + *new_skb = NULL; + return false; + } + + return true; +} + +/* Place a new receive resource in the Receive Resource Area and update RWP. */ +static void sonic_update_rra(struct net_device *dev, struct sonic_local *lp, + dma_addr_t old_addr, dma_addr_t new_addr) +{ + unsigned int entry = sonic_rr_entry(dev, SONIC_READ(SONIC_RWP)); + unsigned int end = sonic_rr_entry(dev, SONIC_READ(SONIC_RRP)); + u32 buf; + + /* The resources in the range [RRP, RWP) belong to the SONIC. This loop + * scans the other resources in the RRA, those in the range [RWP, RRP). + */ + do { + buf = (sonic_rra_get(dev, entry, SONIC_RR_BUFADR_H) << 16) | + sonic_rra_get(dev, entry, SONIC_RR_BUFADR_L); + + if (buf == old_addr) + break; + + entry = (entry + 1) & SONIC_RRS_MASK; + } while (entry != end); + + WARN_ONCE(buf != old_addr, "failed to find resource!\n"); + + sonic_rra_put(dev, entry, SONIC_RR_BUFADR_H, new_addr >> 16); + sonic_rra_put(dev, entry, SONIC_RR_BUFADR_L, new_addr & 0xffff); + + entry = (entry + 1) & SONIC_RRS_MASK; + + SONIC_WRITE(SONIC_RWP, sonic_rr_addr(dev, entry)); +} + /* * We have a good packet(s), pass it/them up the network stack. */ static void sonic_rx(struct net_device *dev) { struct sonic_local *lp = netdev_priv(dev); - int status; int entry = lp->cur_rx; + int prev_entry = lp->eol_rx; + bool rbe = false; while (sonic_rda_get(dev, entry, SONIC_RD_IN_USE) == 0) { - struct sk_buff *used_skb; - struct sk_buff *new_skb; - dma_addr_t new_laddr; - u16 bufadr_l; - u16 bufadr_h; - int pkt_len; - - status = sonic_rda_get(dev, entry, SONIC_RD_STATUS); - if (status & SONIC_RCR_PRX) { - /* Malloc up new buffer. */ - new_skb = netdev_alloc_skb(dev, SONIC_RBSIZE + 2); - if (new_skb == NULL) { - lp->stats.rx_dropped++; + u16 status = sonic_rda_get(dev, entry, SONIC_RD_STATUS); + + /* If the RD has LPKT set, the chip has finished with the RB */ + if ((status & SONIC_RCR_PRX) && (status & SONIC_RCR_LPKT)) { + struct sk_buff *new_skb; + dma_addr_t new_laddr; + u32 addr = (sonic_rda_get(dev, entry, + SONIC_RD_PKTPTR_H) << 16) | + sonic_rda_get(dev, entry, SONIC_RD_PKTPTR_L); + int i = index_from_addr(lp, addr, entry); + + if (i < 0) { + WARN_ONCE(1, "failed to find buffer!\n"); break; } - /* provide 16 byte IP header alignment unless DMA requires otherwise */ - if(SONIC_BUS_SCALE(lp->dma_bitmode) == 2) - skb_reserve(new_skb, 2); - - new_laddr = dma_map_single(lp->device, skb_put(new_skb, SONIC_RBSIZE), - SONIC_RBSIZE, DMA_FROM_DEVICE); - if (!new_laddr) { - dev_kfree_skb(new_skb); - printk(KERN_ERR "%s: Failed to map rx buffer, dropping packet.\n", dev->name); + + if (sonic_alloc_rb(dev, lp, &new_skb, &new_laddr)) { + struct sk_buff *used_skb = lp->rx_skb[i]; + int pkt_len; + + /* Pass the used buffer up the stack */ + dma_unmap_single(lp->device, addr, SONIC_RBSIZE, + DMA_FROM_DEVICE); + + pkt_len = sonic_rda_get(dev, entry, + SONIC_RD_PKTLEN); + skb_trim(used_skb, pkt_len); + used_skb->protocol = eth_type_trans(used_skb, + dev); + netif_rx(used_skb); + lp->stats.rx_packets++; + lp->stats.rx_bytes += pkt_len; + + lp->rx_skb[i] = new_skb; + lp->rx_laddr[i] = new_laddr; + } else { + /* Failed to obtain a new buffer so re-use it */ + new_laddr = addr; lp->stats.rx_dropped++; - break; } - - /* now we have a new skb to replace it, pass the used one up the stack */ - dma_unmap_single(lp->device, lp->rx_laddr[entry], SONIC_RBSIZE, DMA_FROM_DEVICE); - used_skb = lp->rx_skb[entry]; - pkt_len = sonic_rda_get(dev, entry, SONIC_RD_PKTLEN); - skb_trim(used_skb, pkt_len); - used_skb->protocol = eth_type_trans(used_skb, dev); - netif_rx(used_skb); - lp->stats.rx_packets++; - lp->stats.rx_bytes += pkt_len; - - /* and insert the new skb */ - lp->rx_laddr[entry] = new_laddr; - lp->rx_skb[entry] = new_skb; - - bufadr_l = (unsigned long)new_laddr & 0xffff; - bufadr_h = (unsigned long)new_laddr >> 16; - sonic_rra_put(dev, entry, SONIC_RR_BUFADR_L, bufadr_l); - sonic_rra_put(dev, entry, SONIC_RR_BUFADR_H, bufadr_h); - } else { - /* This should only happen, if we enable accepting broken packets. */ - lp->stats.rx_errors++; - if (status & SONIC_RCR_FAER) - lp->stats.rx_frame_errors++; - if (status & SONIC_RCR_CRCR) - lp->stats.rx_crc_errors++; - } - if (status & SONIC_RCR_LPKT) { - /* - * this was the last packet out of the current receive buffer - * give the buffer back to the SONIC + /* If RBE is already asserted when RWP advances then + * it's safe to clear RBE after processing this packet. */ - lp->cur_rwp += SIZEOF_SONIC_RR * SONIC_BUS_SCALE(lp->dma_bitmode); - if (lp->cur_rwp >= lp->rra_end) lp->cur_rwp = lp->rra_laddr & 0xffff; - SONIC_WRITE(SONIC_RWP, lp->cur_rwp); - if (SONIC_READ(SONIC_ISR) & SONIC_INT_RBE) { - netif_dbg(lp, rx_err, dev, "%s: rx buffer exhausted\n", - __func__); - SONIC_WRITE(SONIC_ISR, SONIC_INT_RBE); /* clear the flag */ - } - } else - printk(KERN_ERR "%s: rx desc without RCR_LPKT. Shouldn't happen !?\n", - dev->name); + rbe = rbe || SONIC_READ(SONIC_ISR) & SONIC_INT_RBE; + sonic_update_rra(dev, lp, addr, new_laddr); + } /* * give back the descriptor */ - sonic_rda_put(dev, entry, SONIC_RD_LINK, - sonic_rda_get(dev, entry, SONIC_RD_LINK) | SONIC_EOL); + sonic_rda_put(dev, entry, SONIC_RD_STATUS, 0); sonic_rda_put(dev, entry, SONIC_RD_IN_USE, 1); - sonic_rda_put(dev, lp->eol_rx, SONIC_RD_LINK, - sonic_rda_get(dev, lp->eol_rx, SONIC_RD_LINK) & ~SONIC_EOL); - lp->eol_rx = entry; - lp->cur_rx = entry = (entry + 1) & SONIC_RDS_MASK; + + prev_entry = entry; + entry = (entry + 1) & SONIC_RDS_MASK; + } + + lp->cur_rx = entry; + + if (prev_entry != lp->eol_rx) { + /* Advance the EOL flag to put descriptors back into service */ + sonic_rda_put(dev, prev_entry, SONIC_RD_LINK, SONIC_EOL | + sonic_rda_get(dev, prev_entry, SONIC_RD_LINK)); + sonic_rda_put(dev, lp->eol_rx, SONIC_RD_LINK, ~SONIC_EOL & + sonic_rda_get(dev, lp->eol_rx, SONIC_RD_LINK)); + lp->eol_rx = prev_entry; } + + if (rbe) + SONIC_WRITE(SONIC_ISR, SONIC_INT_RBE); /* * If any worth-while packets have been received, netif_rx() * has done a mark_bh(NET_BH) for us and will work on them @@ -550,6 +643,8 @@ static void sonic_multicast_list(struct net_device *dev) (netdev_mc_count(dev) > 15)) { rcr |= SONIC_RCR_AMC; } else { + unsigned long flags; + netif_dbg(lp, ifup, dev, "%s: mc_count %d\n", __func__, netdev_mc_count(dev)); sonic_set_cam_enable(dev, 1); /* always enable our own address */ @@ -563,9 +658,14 @@ static void sonic_multicast_list(struct net_device *dev) i++; } SONIC_WRITE(SONIC_CDC, 16); - /* issue Load CAM command */ SONIC_WRITE(SONIC_CDP, lp->cda_laddr & 0xffff); + + /* LCAM and TXP commands can't be used simultaneously */ + spin_lock_irqsave(&lp->lock, flags); + sonic_quiesce(dev, SONIC_CR_TXP); SONIC_WRITE(SONIC_CMD, SONIC_CR_LCAM); + sonic_quiesce(dev, SONIC_CR_LCAM); + spin_unlock_irqrestore(&lp->lock, flags); } } @@ -580,7 +680,6 @@ static void sonic_multicast_list(struct net_device *dev) */ static int sonic_init(struct net_device *dev) { - unsigned int cmd; struct sonic_local *lp = netdev_priv(dev); int i; @@ -592,12 +691,16 @@ static int sonic_init(struct net_device *dev) SONIC_WRITE(SONIC_ISR, 0x7fff); SONIC_WRITE(SONIC_CMD, SONIC_CR_RST); + /* While in reset mode, clear CAM Enable register */ + SONIC_WRITE(SONIC_CE, 0); + /* * clear software reset flag, disable receiver, clear and * enable interrupts, then completely initialize the SONIC */ SONIC_WRITE(SONIC_CMD, 0); - SONIC_WRITE(SONIC_CMD, SONIC_CR_RXDIS); + SONIC_WRITE(SONIC_CMD, SONIC_CR_RXDIS | SONIC_CR_STP); + sonic_quiesce(dev, SONIC_CR_ALL); /* * initialize the receive resource area @@ -615,15 +718,10 @@ static int sonic_init(struct net_device *dev) } /* initialize all RRA registers */ - lp->rra_end = (lp->rra_laddr + SONIC_NUM_RRS * SIZEOF_SONIC_RR * - SONIC_BUS_SCALE(lp->dma_bitmode)) & 0xffff; - lp->cur_rwp = (lp->rra_laddr + (SONIC_NUM_RRS - 1) * SIZEOF_SONIC_RR * - SONIC_BUS_SCALE(lp->dma_bitmode)) & 0xffff; - - SONIC_WRITE(SONIC_RSA, lp->rra_laddr & 0xffff); - SONIC_WRITE(SONIC_REA, lp->rra_end); - SONIC_WRITE(SONIC_RRP, lp->rra_laddr & 0xffff); - SONIC_WRITE(SONIC_RWP, lp->cur_rwp); + SONIC_WRITE(SONIC_RSA, sonic_rr_addr(dev, 0)); + SONIC_WRITE(SONIC_REA, sonic_rr_addr(dev, SONIC_NUM_RRS)); + SONIC_WRITE(SONIC_RRP, sonic_rr_addr(dev, 0)); + SONIC_WRITE(SONIC_RWP, sonic_rr_addr(dev, SONIC_NUM_RRS - 1)); SONIC_WRITE(SONIC_URRA, lp->rra_laddr >> 16); SONIC_WRITE(SONIC_EOBC, (SONIC_RBSIZE >> 1) - (lp->dma_bitmode ? 2 : 1)); @@ -631,14 +729,7 @@ static int sonic_init(struct net_device *dev) netif_dbg(lp, ifup, dev, "%s: issuing RRRA command\n", __func__); SONIC_WRITE(SONIC_CMD, SONIC_CR_RRRA); - i = 0; - while (i++ < 100) { - if (SONIC_READ(SONIC_CMD) & SONIC_CR_RRRA) - break; - } - - netif_dbg(lp, ifup, dev, "%s: status=%x, i=%d\n", __func__, - SONIC_READ(SONIC_CMD), i); + sonic_quiesce(dev, SONIC_CR_RRRA); /* * Initialize the receive descriptors so that they @@ -713,28 +804,17 @@ static int sonic_init(struct net_device *dev) * load the CAM */ SONIC_WRITE(SONIC_CMD, SONIC_CR_LCAM); - - i = 0; - while (i++ < 100) { - if (SONIC_READ(SONIC_ISR) & SONIC_INT_LCD) - break; - } - netif_dbg(lp, ifup, dev, "%s: CMD=%x, ISR=%x, i=%d\n", __func__, - SONIC_READ(SONIC_CMD), SONIC_READ(SONIC_ISR), i); + sonic_quiesce(dev, SONIC_CR_LCAM); /* * enable receiver, disable loopback * and enable all interrupts */ - SONIC_WRITE(SONIC_CMD, SONIC_CR_RXEN | SONIC_CR_STP); SONIC_WRITE(SONIC_RCR, SONIC_RCR_DEFAULT); SONIC_WRITE(SONIC_TCR, SONIC_TCR_DEFAULT); SONIC_WRITE(SONIC_ISR, 0x7fff); SONIC_WRITE(SONIC_IMR, SONIC_IMR_DEFAULT); - - cmd = SONIC_READ(SONIC_CMD); - if ((cmd & SONIC_CR_RXEN) == 0 || (cmd & SONIC_CR_STP) == 0) - printk(KERN_ERR "sonic_init: failed, status=%x\n", cmd); + SONIC_WRITE(SONIC_CMD, SONIC_CR_RXEN); netif_dbg(lp, ifup, dev, "%s: new status=%x\n", __func__, SONIC_READ(SONIC_CMD)); diff --git a/drivers/net/ethernet/natsemi/sonic.h b/drivers/net/ethernet/natsemi/sonic.h index f1544481aac1..e0e4cba6f6f6 100644 --- a/drivers/net/ethernet/natsemi/sonic.h +++ b/drivers/net/ethernet/natsemi/sonic.h @@ -110,6 +110,9 @@ #define SONIC_CR_TXP 0x0002 #define SONIC_CR_HTX 0x0001 +#define SONIC_CR_ALL (SONIC_CR_LCAM | SONIC_CR_RRRA | \ + SONIC_CR_RXEN | SONIC_CR_TXP) + /* * SONIC data configuration bits */ @@ -175,6 +178,7 @@ #define SONIC_TCR_NCRS 0x0100 #define SONIC_TCR_CRLS 0x0080 #define SONIC_TCR_EXC 0x0040 +#define SONIC_TCR_OWC 0x0020 #define SONIC_TCR_PMB 0x0008 #define SONIC_TCR_FU 0x0004 #define SONIC_TCR_BCM 0x0002 @@ -274,8 +278,9 @@ #define SONIC_NUM_RDS SONIC_NUM_RRS /* number of receive descriptors */ #define SONIC_NUM_TDS 16 /* number of transmit descriptors */ -#define SONIC_RDS_MASK (SONIC_NUM_RDS-1) -#define SONIC_TDS_MASK (SONIC_NUM_TDS-1) +#define SONIC_RRS_MASK (SONIC_NUM_RRS - 1) +#define SONIC_RDS_MASK (SONIC_NUM_RDS - 1) +#define SONIC_TDS_MASK (SONIC_NUM_TDS - 1) #define SONIC_RBSIZE 1520 /* size of one resource buffer */ @@ -312,8 +317,6 @@ struct sonic_local { u32 rda_laddr; /* logical DMA address of RDA */ dma_addr_t rx_laddr[SONIC_NUM_RRS]; /* logical DMA addresses of rx skbuffs */ dma_addr_t tx_laddr[SONIC_NUM_TDS]; /* logical DMA addresses of tx skbuffs */ - unsigned int rra_end; - unsigned int cur_rwp; unsigned int cur_rx; unsigned int cur_tx; /* first unacked transmit packet */ unsigned int eol_rx; @@ -322,6 +325,7 @@ struct sonic_local { int msg_enable; struct device *device; /* generic device */ struct net_device_stats stats; + spinlock_t lock; }; #define TX_TIMEOUT (3 * HZ) @@ -344,30 +348,30 @@ static void sonic_msg_init(struct net_device *dev); as far as we can tell. */ /* OpenBSD calls this "SWO". I'd like to think that sonic_buf_put() is a much better name. */ -static inline void sonic_buf_put(void* base, int bitmode, +static inline void sonic_buf_put(u16 *base, int bitmode, int offset, __u16 val) { if (bitmode) #ifdef __BIG_ENDIAN - ((__u16 *) base + (offset*2))[1] = val; + __raw_writew(val, base + (offset * 2) + 1); #else - ((__u16 *) base + (offset*2))[0] = val; + __raw_writew(val, base + (offset * 2) + 0); #endif else - ((__u16 *) base)[offset] = val; + __raw_writew(val, base + (offset * 1) + 0); } -static inline __u16 sonic_buf_get(void* base, int bitmode, +static inline __u16 sonic_buf_get(u16 *base, int bitmode, int offset) { if (bitmode) #ifdef __BIG_ENDIAN - return ((volatile __u16 *) base + (offset*2))[1]; + return __raw_readw(base + (offset * 2) + 1); #else - return ((volatile __u16 *) base + (offset*2))[0]; + return __raw_readw(base + (offset * 2) + 0); #endif else - return ((volatile __u16 *) base)[offset]; + return __raw_readw(base + (offset * 1) + 0); } /* Inlines that you should actually use for reading/writing DMA buffers */ @@ -447,6 +451,22 @@ static inline __u16 sonic_rra_get(struct net_device* dev, int entry, (entry * SIZEOF_SONIC_RR) + offset); } +static inline u16 sonic_rr_addr(struct net_device *dev, int entry) +{ + struct sonic_local *lp = netdev_priv(dev); + + return lp->rra_laddr + + entry * SIZEOF_SONIC_RR * SONIC_BUS_SCALE(lp->dma_bitmode); +} + +static inline u16 sonic_rr_entry(struct net_device *dev, u16 addr) +{ + struct sonic_local *lp = netdev_priv(dev); + + return (addr - (u16)lp->rra_laddr) / (SIZEOF_SONIC_RR * + SONIC_BUS_SCALE(lp->dma_bitmode)); +} + static const char version[] = "sonic.c:v0.92 20.9.98 tsbogend@alpha.franken.de\n"; diff --git a/drivers/net/ethernet/netronome/Kconfig b/drivers/net/ethernet/netronome/Kconfig index bac5be4d4f43..a3f68a718813 100644 --- a/drivers/net/ethernet/netronome/Kconfig +++ b/drivers/net/ethernet/netronome/Kconfig @@ -31,6 +31,7 @@ config NFP_APP_FLOWER bool "NFP4000/NFP6000 TC Flower offload support" depends on NFP depends on NET_SWITCHDEV + depends on IPV6!=m || NFP=m default y ---help--- Enable driver support for TC Flower offload on NFP4000 and NFP6000. diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index 656169214cdb..d20cf03a3ea0 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1149,19 +1149,6 @@ static void lpc_eth_set_multicast_list(struct net_device *ndev) spin_unlock_irqrestore(&pldat->lock, flags); } -static int lpc_eth_ioctl(struct net_device *ndev, struct ifreq *req, int cmd) -{ - struct phy_device *phydev = ndev->phydev; - - if (!netif_running(ndev)) - return -EINVAL; - - if (!phydev) - return -ENODEV; - - return phy_mii_ioctl(phydev, req, cmd); -} - static int lpc_eth_open(struct net_device *ndev) { struct netdata_local *pldat = netdev_priv(ndev); @@ -1229,7 +1216,7 @@ static const struct net_device_ops lpc_netdev_ops = { .ndo_stop = lpc_eth_close, .ndo_start_xmit = lpc_eth_hard_start_xmit, .ndo_set_rx_mode = lpc_eth_set_multicast_list, - .ndo_do_ioctl = lpc_eth_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_mac_address = lpc_set_mac_address, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 98e102af7756..bb106a32f416 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -12,19 +12,27 @@ struct ionic_lif; #define IONIC_DRV_NAME "ionic" #define IONIC_DRV_DESCRIPTION "Pensando Ethernet NIC Driver" -#define IONIC_DRV_VERSION "0.18.0-k" +#define IONIC_DRV_VERSION "0.20.0-k" #define PCI_VENDOR_ID_PENSANDO 0x1dd8 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF 0x1002 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003 -#define IONIC_SUBDEV_ID_NAPLES_25 0x4000 -#define IONIC_SUBDEV_ID_NAPLES_100_4 0x4001 -#define IONIC_SUBDEV_ID_NAPLES_100_8 0x4002 - #define DEVCMD_TIMEOUT 10 +struct ionic_vf { + u16 index; + u8 macaddr[6]; + __le32 maxrate; + __le16 vlanid; + u8 spoofchk; + u8 trusted; + u8 linkstate; + dma_addr_t stats_pa; + struct ionic_lif_stats stats; +}; + struct ionic { struct pci_dev *pdev; struct device *dev; @@ -46,6 +54,9 @@ struct ionic { DECLARE_BITMAP(intrs, IONIC_INTR_CTRL_REGS_MAX); struct work_struct nb_work; struct notifier_block nb; + struct rw_semaphore vf_op_lock; /* lock for VF operations */ + struct ionic_vf *vfs; + int num_vfs; struct timer_list watchdog_timer; int watchdog_period; }; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 9a9ab8cb2cb3..448d7b23b2f7 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -104,10 +104,112 @@ void ionic_bus_unmap_dbpage(struct ionic *ionic, void __iomem *page) iounmap(page); } +static void ionic_vf_dealloc_locked(struct ionic *ionic) +{ + struct ionic_vf *v; + dma_addr_t dma = 0; + int i; + + if (!ionic->vfs) + return; + + for (i = ionic->num_vfs - 1; i >= 0; i--) { + v = &ionic->vfs[i]; + + if (v->stats_pa) { + (void)ionic_set_vf_config(ionic, i, + IONIC_VF_ATTR_STATSADDR, + (u8 *)&dma); + dma_unmap_single(ionic->dev, v->stats_pa, + sizeof(v->stats), DMA_FROM_DEVICE); + v->stats_pa = 0; + } + } + + kfree(ionic->vfs); + ionic->vfs = NULL; + ionic->num_vfs = 0; +} + +static void ionic_vf_dealloc(struct ionic *ionic) +{ + down_write(&ionic->vf_op_lock); + ionic_vf_dealloc_locked(ionic); + up_write(&ionic->vf_op_lock); +} + +static int ionic_vf_alloc(struct ionic *ionic, int num_vfs) +{ + struct ionic_vf *v; + int err = 0; + int i; + + down_write(&ionic->vf_op_lock); + + ionic->vfs = kcalloc(num_vfs, sizeof(struct ionic_vf), GFP_KERNEL); + if (!ionic->vfs) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < num_vfs; i++) { + v = &ionic->vfs[i]; + v->stats_pa = dma_map_single(ionic->dev, &v->stats, + sizeof(v->stats), DMA_FROM_DEVICE); + if (dma_mapping_error(ionic->dev, v->stats_pa)) { + v->stats_pa = 0; + err = -ENODEV; + goto out; + } + + /* ignore failures from older FW, we just won't get stats */ + (void)ionic_set_vf_config(ionic, i, IONIC_VF_ATTR_STATSADDR, + (u8 *)&v->stats_pa); + ionic->num_vfs++; + } + +out: + if (err) + ionic_vf_dealloc_locked(ionic); + up_write(&ionic->vf_op_lock); + return err; +} + +static int ionic_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct ionic *ionic = pci_get_drvdata(pdev); + struct device *dev = ionic->dev; + int ret = 0; + + if (num_vfs > 0) { + ret = pci_enable_sriov(pdev, num_vfs); + if (ret) { + dev_err(dev, "Cannot enable SRIOV: %d\n", ret); + goto out; + } + + ret = ionic_vf_alloc(ionic, num_vfs); + if (ret) { + dev_err(dev, "Cannot alloc VFs: %d\n", ret); + pci_disable_sriov(pdev); + goto out; + } + + ret = num_vfs; + } else { + pci_disable_sriov(pdev); + ionic_vf_dealloc(ionic); + } + +out: + return ret; +} + static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct device *dev = &pdev->dev; struct ionic *ionic; + int num_vfs; int err; ionic = ionic_devlink_alloc(dev); @@ -206,6 +308,15 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_free_lifs; } + init_rwsem(&ionic->vf_op_lock); + num_vfs = pci_num_vf(pdev); + if (num_vfs) { + dev_info(dev, "%d VFs found already enabled\n", num_vfs); + err = ionic_vf_alloc(ionic, num_vfs); + if (err) + dev_err(dev, "Cannot enable existing VFs: %d\n", err); + } + err = ionic_lifs_register(ionic); if (err) { dev_err(dev, "Cannot register LIFs: %d, aborting\n", err); @@ -223,6 +334,7 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_out_deregister_lifs: ionic_lifs_unregister(ionic); err_out_deinit_lifs: + ionic_vf_dealloc(ionic); ionic_lifs_deinit(ionic); err_out_free_lifs: ionic_lifs_free(ionic); @@ -279,6 +391,7 @@ static struct pci_driver ionic_driver = { .id_table = ionic_id_table, .probe = ionic_probe, .remove = ionic_remove, + .sriov_configure = ionic_sriov_configure, }; int ionic_bus_register_driver(void) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 5f9d2ec70446..87f82f36812f 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -286,6 +286,64 @@ void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type) ionic_dev_cmd_go(idev, &cmd); } +/* VF commands */ +int ionic_set_vf_config(struct ionic *ionic, int vf, u8 attr, u8 *data) +{ + union ionic_dev_cmd cmd = { + .vf_setattr.opcode = IONIC_CMD_VF_SETATTR, + .vf_setattr.attr = attr, + .vf_setattr.vf_index = vf, + }; + int err; + + switch (attr) { + case IONIC_VF_ATTR_SPOOFCHK: + cmd.vf_setattr.spoofchk = *data; + dev_dbg(ionic->dev, "%s: vf %d spoof %d\n", + __func__, vf, *data); + break; + case IONIC_VF_ATTR_TRUST: + cmd.vf_setattr.trust = *data; + dev_dbg(ionic->dev, "%s: vf %d trust %d\n", + __func__, vf, *data); + break; + case IONIC_VF_ATTR_LINKSTATE: + cmd.vf_setattr.linkstate = *data; + dev_dbg(ionic->dev, "%s: vf %d linkstate %d\n", + __func__, vf, *data); + break; + case IONIC_VF_ATTR_MAC: + ether_addr_copy(cmd.vf_setattr.macaddr, data); + dev_dbg(ionic->dev, "%s: vf %d macaddr %pM\n", + __func__, vf, data); + break; + case IONIC_VF_ATTR_VLAN: + cmd.vf_setattr.vlanid = cpu_to_le16(*(u16 *)data); + dev_dbg(ionic->dev, "%s: vf %d vlan %d\n", + __func__, vf, *(u16 *)data); + break; + case IONIC_VF_ATTR_RATE: + cmd.vf_setattr.maxrate = cpu_to_le32(*(u32 *)data); + dev_dbg(ionic->dev, "%s: vf %d maxrate %d\n", + __func__, vf, *(u32 *)data); + break; + case IONIC_VF_ATTR_STATSADDR: + cmd.vf_setattr.stats_pa = cpu_to_le64(*(u64 *)data); + dev_dbg(ionic->dev, "%s: vf %d stats_pa 0x%08llx\n", + __func__, vf, *(u64 *)data); + break; + default: + return -EINVAL; + } + + mutex_lock(&ionic->dev_cmd_lock); + ionic_dev_cmd_go(&ionic->idev, &cmd); + err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT); + mutex_unlock(&ionic->dev_cmd_lock); + + return err; +} + /* LIF commands */ void ionic_dev_cmd_lif_identify(struct ionic_dev *idev, u8 type, u8 ver) { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index 4665c5dc5324..7838e342c4fd 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -113,6 +113,12 @@ static_assert(sizeof(struct ionic_rxq_desc) == 16); static_assert(sizeof(struct ionic_rxq_sg_desc) == 128); static_assert(sizeof(struct ionic_rxq_comp) == 16); +/* SR/IOV */ +static_assert(sizeof(struct ionic_vf_setattr_cmd) == 64); +static_assert(sizeof(struct ionic_vf_setattr_comp) == 16); +static_assert(sizeof(struct ionic_vf_getattr_cmd) == 64); +static_assert(sizeof(struct ionic_vf_getattr_comp) == 16); + struct ionic_devinfo { u8 asic_type; u8 asic_rev; @@ -275,6 +281,7 @@ void ionic_dev_cmd_port_autoneg(struct ionic_dev *idev, u8 an_enable); void ionic_dev_cmd_port_fec(struct ionic_dev *idev, u8 fec_type); void ionic_dev_cmd_port_pause(struct ionic_dev *idev, u8 pause_type); +int ionic_set_vf_config(struct ionic *ionic, int vf, u8 attr, u8 *data); void ionic_dev_cmd_lif_identify(struct ionic_dev *idev, u8 type, u8 ver); void ionic_dev_cmd_lif_init(struct ionic_dev *idev, u16 lif_index, dma_addr_t addr); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 39317cdfa6cf..f131adad96e3 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -51,6 +51,10 @@ enum ionic_cmd_opcode { IONIC_CMD_RDMA_CREATE_CQ = 52, IONIC_CMD_RDMA_CREATE_ADMINQ = 53, + /* SR/IOV commands */ + IONIC_CMD_VF_GETATTR = 60, + IONIC_CMD_VF_SETATTR = 61, + /* QoS commands */ IONIC_CMD_QOS_CLASS_IDENTIFY = 240, IONIC_CMD_QOS_CLASS_INIT = 241, @@ -1639,6 +1643,93 @@ enum ionic_qos_sched_type { IONIC_QOS_SCHED_TYPE_DWRR = 1, /* Deficit weighted round-robin */ }; +enum ionic_vf_attr { + IONIC_VF_ATTR_SPOOFCHK = 1, + IONIC_VF_ATTR_TRUST = 2, + IONIC_VF_ATTR_MAC = 3, + IONIC_VF_ATTR_LINKSTATE = 4, + IONIC_VF_ATTR_VLAN = 5, + IONIC_VF_ATTR_RATE = 6, + IONIC_VF_ATTR_STATSADDR = 7, +}; + +/** + * VF link status + */ +enum ionic_vf_link_status { + IONIC_VF_LINK_STATUS_AUTO = 0, /* link state of the uplink */ + IONIC_VF_LINK_STATUS_UP = 1, /* link is always up */ + IONIC_VF_LINK_STATUS_DOWN = 2, /* link is always down */ +}; + +/** + * struct ionic_vf_setattr_cmd - Set VF attributes on the NIC + * @opcode: Opcode + * @index: VF index + * @attr: Attribute type (enum ionic_vf_attr) + * macaddr mac address + * vlanid vlan ID + * maxrate max Tx rate in Mbps + * spoofchk enable address spoof checking + * trust enable VF trust + * linkstate set link up or down + * stats_pa set DMA address for VF stats + */ +struct ionic_vf_setattr_cmd { + u8 opcode; + u8 attr; + __le16 vf_index; + union { + u8 macaddr[6]; + __le16 vlanid; + __le32 maxrate; + u8 spoofchk; + u8 trust; + u8 linkstate; + __le64 stats_pa; + u8 pad[60]; + }; +}; + +struct ionic_vf_setattr_comp { + u8 status; + u8 attr; + __le16 vf_index; + __le16 comp_index; + u8 rsvd[9]; + u8 color; +}; + +/** + * struct ionic_vf_getattr_cmd - Get VF attributes from the NIC + * @opcode: Opcode + * @index: VF index + * @attr: Attribute type (enum ionic_vf_attr) + */ +struct ionic_vf_getattr_cmd { + u8 opcode; + u8 attr; + __le16 vf_index; + u8 rsvd[60]; +}; + +struct ionic_vf_getattr_comp { + u8 status; + u8 attr; + __le16 vf_index; + union { + u8 macaddr[6]; + __le16 vlanid; + __le32 maxrate; + u8 spoofchk; + u8 trust; + u8 linkstate; + __le64 stats_pa; + u8 pad[11]; + }; + u8 color; +}; + /** * union ionic_qos_config - Qos configuration structure * @flags: Configuration flags @@ -2289,6 +2380,9 @@ union ionic_dev_cmd { struct ionic_port_getattr_cmd port_getattr; struct ionic_port_setattr_cmd port_setattr; + struct ionic_vf_setattr_cmd vf_setattr; + struct ionic_vf_getattr_cmd vf_getattr; + struct ionic_lif_identify_cmd lif_identify; struct ionic_lif_init_cmd lif_init; struct ionic_lif_reset_cmd lif_reset; @@ -2318,6 +2412,9 @@ union ionic_dev_cmd_comp { struct ionic_port_getattr_comp port_getattr; struct ionic_port_setattr_comp port_setattr; + struct ionic_vf_setattr_comp vf_setattr; + struct ionic_vf_getattr_comp vf_getattr; + struct ionic_lif_identify_comp lif_identify; struct ionic_lif_init_comp lif_init; ionic_lif_reset_comp lif_reset; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index a76108a9c7db..191271f6260d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1619,6 +1619,227 @@ int ionic_stop(struct net_device *netdev) return err; } +static int ionic_get_vf_config(struct net_device *netdev, + int vf, struct ifla_vf_info *ivf) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + int ret = 0; + + down_read(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ivf->vf = vf; + ivf->vlan = ionic->vfs[vf].vlanid; + ivf->qos = 0; + ivf->spoofchk = ionic->vfs[vf].spoofchk; + ivf->linkstate = ionic->vfs[vf].linkstate; + ivf->max_tx_rate = ionic->vfs[vf].maxrate; + ivf->trusted = ionic->vfs[vf].trusted; + ether_addr_copy(ivf->mac, ionic->vfs[vf].macaddr); + } + + up_read(&ionic->vf_op_lock); + return ret; +} + +static int ionic_get_vf_stats(struct net_device *netdev, int vf, + struct ifla_vf_stats *vf_stats) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + struct ionic_lif_stats *vs; + int ret = 0; + + down_read(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + memset(vf_stats, 0, sizeof(*vf_stats)); + vs = &ionic->vfs[vf].stats; + + vf_stats->rx_packets = le64_to_cpu(vs->rx_ucast_packets); + vf_stats->tx_packets = le64_to_cpu(vs->tx_ucast_packets); + vf_stats->rx_bytes = le64_to_cpu(vs->rx_ucast_bytes); + vf_stats->tx_bytes = le64_to_cpu(vs->tx_ucast_bytes); + vf_stats->broadcast = le64_to_cpu(vs->rx_bcast_packets); + vf_stats->multicast = le64_to_cpu(vs->rx_mcast_packets); + vf_stats->rx_dropped = le64_to_cpu(vs->rx_ucast_drop_packets) + + le64_to_cpu(vs->rx_mcast_drop_packets) + + le64_to_cpu(vs->rx_bcast_drop_packets); + vf_stats->tx_dropped = le64_to_cpu(vs->tx_ucast_drop_packets) + + le64_to_cpu(vs->tx_mcast_drop_packets) + + le64_to_cpu(vs->tx_bcast_drop_packets); + } + + up_read(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + int ret; + + if (!(is_zero_ether_addr(mac) || is_valid_ether_addr(mac))) + return -EINVAL; + + down_read(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, IONIC_VF_ATTR_MAC, mac); + if (!ret) + ether_addr_copy(ionic->vfs[vf].macaddr, mac); + } + + up_read(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, + u8 qos, __be16 proto) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + int ret; + + /* until someday when we support qos */ + if (qos) + return -EINVAL; + + if (vlan > 4095) + return -EINVAL; + + if (proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + down_read(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, + IONIC_VF_ATTR_VLAN, (u8 *)&vlan); + if (!ret) + ionic->vfs[vf].vlanid = vlan; + } + + up_read(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_rate(struct net_device *netdev, int vf, + int tx_min, int tx_max) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + int ret; + + /* setting the min just seems silly */ + if (tx_min) + return -EINVAL; + + down_write(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, + IONIC_VF_ATTR_RATE, (u8 *)&tx_max); + if (!ret) + lif->ionic->vfs[vf].maxrate = tx_max; + } + + up_write(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_spoofchk(struct net_device *netdev, int vf, bool set) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + u8 data = set; /* convert to u8 for config */ + int ret; + + down_write(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, + IONIC_VF_ATTR_SPOOFCHK, &data); + if (!ret) + ionic->vfs[vf].spoofchk = data; + } + + up_write(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_trust(struct net_device *netdev, int vf, bool set) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + u8 data = set; /* convert to u8 for config */ + int ret; + + down_write(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, + IONIC_VF_ATTR_TRUST, &data); + if (!ret) + ionic->vfs[vf].trusted = data; + } + + up_write(&ionic->vf_op_lock); + return ret; +} + +static int ionic_set_vf_link_state(struct net_device *netdev, int vf, int set) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic *ionic = lif->ionic; + u8 data; + int ret; + + switch (set) { + case IFLA_VF_LINK_STATE_ENABLE: + data = IONIC_VF_LINK_STATUS_UP; + break; + case IFLA_VF_LINK_STATE_DISABLE: + data = IONIC_VF_LINK_STATUS_DOWN; + break; + case IFLA_VF_LINK_STATE_AUTO: + data = IONIC_VF_LINK_STATUS_AUTO; + break; + default: + return -EINVAL; + } + + down_write(&ionic->vf_op_lock); + + if (vf >= pci_num_vf(ionic->pdev) || !ionic->vfs) { + ret = -EINVAL; + } else { + ret = ionic_set_vf_config(ionic, vf, + IONIC_VF_ATTR_LINKSTATE, &data); + if (!ret) + ionic->vfs[vf].linkstate = set; + } + + up_write(&ionic->vf_op_lock); + return ret; +} + static const struct net_device_ops ionic_netdev_ops = { .ndo_open = ionic_open, .ndo_stop = ionic_stop, @@ -1632,6 +1853,14 @@ static const struct net_device_ops ionic_netdev_ops = { .ndo_change_mtu = ionic_change_mtu, .ndo_vlan_rx_add_vid = ionic_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ionic_vlan_rx_kill_vid, + .ndo_set_vf_vlan = ionic_set_vf_vlan, + .ndo_set_vf_trust = ionic_set_vf_trust, + .ndo_set_vf_mac = ionic_set_vf_mac, + .ndo_set_vf_rate = ionic_set_vf_rate, + .ndo_set_vf_spoofchk = ionic_set_vf_spoofchk, + .ndo_get_vf_config = ionic_get_vf_config, + .ndo_set_vf_link_state = ionic_set_vf_link_state, + .ndo_get_vf_stats = ionic_get_vf_stats, }; int ionic_reset_queues(struct ionic_lif *lif) @@ -1965,18 +2194,22 @@ static int ionic_station_set(struct ionic_lif *lif) if (err) return err; + if (is_zero_ether_addr(ctx.comp.lif_getattr.mac)) + return 0; + memcpy(addr.sa_data, ctx.comp.lif_getattr.mac, netdev->addr_len); addr.sa_family = AF_INET; err = eth_prepare_mac_addr_change(netdev, &addr); - if (err) - return err; - - if (!is_zero_ether_addr(netdev->dev_addr)) { - netdev_dbg(lif->netdev, "deleting station MAC addr %pM\n", - netdev->dev_addr); - ionic_lif_addr(lif, netdev->dev_addr, false); + if (err) { + netdev_warn(lif->netdev, "ignoring bad MAC addr from NIC %pM\n", + addr.sa_data); + return 0; } + netdev_dbg(lif->netdev, "deleting station MAC addr %pM\n", + netdev->dev_addr); + ionic_lif_addr(lif, netdev->dev_addr, false); + eth_commit_mac_addr_change(netdev, &addr); netdev_dbg(lif->netdev, "adding station MAC addr %pM\n", netdev->dev_addr); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index a55fd1f8c31b..9c5a7dd45f9d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -37,6 +37,7 @@ struct ionic_rx_stats { u64 csum_complete; u64 csum_error; u64 buffers_posted; + u64 dropped; }; #define IONIC_QCQ_F_INITED BIT(0) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 3590ea7fd88a..a8e3fb73b465 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -165,6 +165,10 @@ static const char *ionic_opcode_to_str(enum ionic_cmd_opcode opcode) return "IONIC_CMD_FW_DOWNLOAD"; case IONIC_CMD_FW_CONTROL: return "IONIC_CMD_FW_CONTROL"; + case IONIC_CMD_VF_GETATTR: + return "IONIC_CMD_VF_GETATTR"; + case IONIC_CMD_VF_SETATTR: + return "IONIC_CMD_VF_SETATTR"; default: return "DEVCMD_UNKNOWN"; } @@ -326,9 +330,9 @@ int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds) unsigned long max_wait; unsigned long duration; int opcode; + int hb = 0; int done; int err; - int hb; WARN_ON(in_interrupt()); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_stats.c b/drivers/net/ethernet/pensando/ionic/ionic_stats.c index 03916b6d47f2..a1e9796a660a 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_stats.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_stats.c @@ -39,6 +39,7 @@ static const struct ionic_stat_desc ionic_rx_stats_desc[] = { IONIC_RX_STAT_DESC(csum_none), IONIC_RX_STAT_DESC(csum_complete), IONIC_RX_STAT_DESC(csum_error), + IONIC_RX_STAT_DESC(dropped), }; static const struct ionic_stat_desc ionic_txq_stats_desc[] = { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 97e79949b359..e452f4242ba0 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -152,12 +152,16 @@ static void ionic_rx_clean(struct ionic_queue *q, struct ionic_desc_info *desc_i stats = q_to_rx_stats(q); netdev = q->lif->netdev; - if (comp->status) + if (comp->status) { + stats->dropped++; return; + } /* no packet processing while resetting */ - if (unlikely(test_bit(IONIC_LIF_QUEUE_RESET, q->lif->state))) + if (unlikely(test_bit(IONIC_LIF_QUEUE_RESET, q->lif->state))) { + stats->dropped++; return; + } stats->pkts++; stats->bytes += le16_to_cpu(comp->len); @@ -167,8 +171,10 @@ static void ionic_rx_clean(struct ionic_queue *q, struct ionic_desc_info *desc_i else skb = ionic_rx_frags(q, desc_info, cq_info); - if (unlikely(!skb)) + if (unlikely(!skb)) { + stats->dropped++; return; + } skb_record_rx_queue(skb, q->index); @@ -337,6 +343,8 @@ void ionic_rx_fill(struct ionic_queue *q) struct ionic_rxq_sg_desc *sg_desc; struct ionic_rxq_sg_elem *sg_elem; struct ionic_rxq_desc *desc; + unsigned int remain_len; + unsigned int seg_len; unsigned int nfrags; bool ring_doorbell; unsigned int i, j; @@ -346,6 +354,7 @@ void ionic_rx_fill(struct ionic_queue *q) nfrags = round_up(len, PAGE_SIZE) / PAGE_SIZE; for (i = ionic_q_space_avail(q); i; i--) { + remain_len = len; desc_info = q->head; desc = desc_info->desc; sg_desc = desc_info->sg_desc; @@ -369,7 +378,9 @@ void ionic_rx_fill(struct ionic_queue *q) return; } desc->addr = cpu_to_le64(page_info->dma_addr); - desc->len = cpu_to_le16(PAGE_SIZE); + seg_len = min_t(unsigned int, PAGE_SIZE, len); + desc->len = cpu_to_le16(seg_len); + remain_len -= seg_len; page_info++; /* fill sg descriptors - pages[1..n] */ @@ -385,7 +396,9 @@ void ionic_rx_fill(struct ionic_queue *q) return; } sg_elem->addr = cpu_to_le64(page_info->dma_addr); - sg_elem->len = cpu_to_le16(PAGE_SIZE); + seg_len = min_t(unsigned int, PAGE_SIZE, remain_len); + sg_elem->len = cpu_to_le16(seg_len); + remain_len -= seg_len; page_info++; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index a496390b8632..07f9067affc6 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -2043,6 +2043,7 @@ static void qlcnic_83xx_exec_template_cmd(struct qlcnic_adapter *p_dev, break; } entry += p_hdr->size; + cond_resched(); } p_dev->ahw->reset.seq_index = index; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index afa10a163da1..f34ae8c75bc5 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -703,6 +703,7 @@ static u32 qlcnic_read_memory_test_agent(struct qlcnic_adapter *adapter, addr += 16; reg_read -= 16; ret += 16; + cond_resched(); } out: mutex_unlock(&adapter->ahw->mem_lock); @@ -1383,6 +1384,7 @@ int qlcnic_dump_fw(struct qlcnic_adapter *adapter) buf_offset += entry->hdr.cap_size; entry_offset += entry->hdr.offset; buffer = fw_dump->data + buf_offset; + cond_resched(); } fw_dump->clr = 1; diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 522fad4cb2cd..18b0c7a2d6dc 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -289,18 +289,6 @@ static void emac_tx_timeout(struct net_device *netdev, unsigned int txqueue) schedule_work(&adpt->work_thread); } -/* IOCTL support for the interface */ -static int emac_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) -{ - if (!netif_running(netdev)) - return -EINVAL; - - if (!netdev->phydev) - return -ENODEV; - - return phy_mii_ioctl(netdev->phydev, ifr, cmd); -} - /** * emac_update_hw_stats - read the EMAC stat registers * @@ -387,7 +375,7 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_start_xmit = emac_start_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = emac_change_mtu, - .ndo_do_ioctl = emac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_tx_timeout = emac_tx_timeout, .ndo_get_stats64 = emac_get_stats64, .ndo_set_features = emac_set_features, diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c index c23cb61bbd30..f5ecc410ff85 100644 --- a/drivers/net/ethernet/rdc/r6040.c +++ b/drivers/net/ethernet/rdc/r6040.c @@ -498,14 +498,6 @@ static int r6040_close(struct net_device *dev) return 0; } -static int r6040_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!dev->phydev) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - static int r6040_rx(struct net_device *dev, int limit) { struct r6040_private *priv = netdev_priv(dev); @@ -957,7 +949,7 @@ static const struct net_device_ops r6040_netdev_ops = { .ndo_set_rx_mode = r6040_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_do_ioctl = r6040_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_tx_timeout = r6040_tx_timeout, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = r6040_poll_controller, diff --git a/drivers/net/ethernet/realtek/Makefile b/drivers/net/ethernet/realtek/Makefile index d5304bad2372..2e1d78b106b0 100644 --- a/drivers/net/ethernet/realtek/Makefile +++ b/drivers/net/ethernet/realtek/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_8139CP) += 8139cp.o obj-$(CONFIG_8139TOO) += 8139too.o obj-$(CONFIG_ATP) += atp.o -r8169-objs += r8169_main.o r8169_firmware.o +r8169-objs += r8169_main.o r8169_firmware.o r8169_phy_config.o obj-$(CONFIG_R8169) += r8169.o diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h new file mode 100644 index 000000000000..22a6a057b11e --- /dev/null +++ b/drivers/net/ethernet/realtek/r8169.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* r8169.h: RealTek 8169/8168/8101 ethernet driver. + * + * Copyright (c) 2002 ShuChen <shuchen@realtek.com.tw> + * Copyright (c) 2003 - 2007 Francois Romieu <romieu@fr.zoreil.com> + * Copyright (c) a lot of people too. Please respect their work. + * + * See MAINTAINERS file for support contact information. + */ + +#include <linux/types.h> +#include <linux/phy.h> + +enum mac_version { + /* support for ancient RTL_GIGA_MAC_VER_01 has been removed */ + RTL_GIGA_MAC_VER_02, + RTL_GIGA_MAC_VER_03, + RTL_GIGA_MAC_VER_04, + RTL_GIGA_MAC_VER_05, + RTL_GIGA_MAC_VER_06, + RTL_GIGA_MAC_VER_07, + RTL_GIGA_MAC_VER_08, + RTL_GIGA_MAC_VER_09, + RTL_GIGA_MAC_VER_10, + RTL_GIGA_MAC_VER_11, + RTL_GIGA_MAC_VER_12, + RTL_GIGA_MAC_VER_13, + RTL_GIGA_MAC_VER_14, + RTL_GIGA_MAC_VER_15, + RTL_GIGA_MAC_VER_16, + RTL_GIGA_MAC_VER_17, + RTL_GIGA_MAC_VER_18, + RTL_GIGA_MAC_VER_19, + RTL_GIGA_MAC_VER_20, + RTL_GIGA_MAC_VER_21, + RTL_GIGA_MAC_VER_22, + RTL_GIGA_MAC_VER_23, + RTL_GIGA_MAC_VER_24, + RTL_GIGA_MAC_VER_25, + RTL_GIGA_MAC_VER_26, + RTL_GIGA_MAC_VER_27, + RTL_GIGA_MAC_VER_28, + RTL_GIGA_MAC_VER_29, + RTL_GIGA_MAC_VER_30, + RTL_GIGA_MAC_VER_31, + RTL_GIGA_MAC_VER_32, + RTL_GIGA_MAC_VER_33, + RTL_GIGA_MAC_VER_34, + RTL_GIGA_MAC_VER_35, + RTL_GIGA_MAC_VER_36, + RTL_GIGA_MAC_VER_37, + RTL_GIGA_MAC_VER_38, + RTL_GIGA_MAC_VER_39, + RTL_GIGA_MAC_VER_40, + RTL_GIGA_MAC_VER_41, + RTL_GIGA_MAC_VER_42, + RTL_GIGA_MAC_VER_43, + RTL_GIGA_MAC_VER_44, + RTL_GIGA_MAC_VER_45, + RTL_GIGA_MAC_VER_46, + RTL_GIGA_MAC_VER_47, + RTL_GIGA_MAC_VER_48, + RTL_GIGA_MAC_VER_49, + RTL_GIGA_MAC_VER_50, + RTL_GIGA_MAC_VER_51, + RTL_GIGA_MAC_VER_52, + RTL_GIGA_MAC_VER_60, + RTL_GIGA_MAC_VER_61, + RTL_GIGA_MAC_NONE +}; + +struct rtl8169_private; + +void r8169_apply_firmware(struct rtl8169_private *tp); +u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp); +u8 rtl8168d_efuse_read(struct rtl8169_private *tp, int reg_addr); +void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, + enum mac_version ver); diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0161d839fa87..aaa316be6183 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -31,6 +31,7 @@ #include <linux/ipv6.h> #include <net/ip6_checksum.h> +#include "r8169.h" #include "r8169_firmware.h" #define MODULENAME "r8169" @@ -84,65 +85,6 @@ #define RTL_R16(tp, reg) readw(tp->mmio_addr + (reg)) #define RTL_R32(tp, reg) readl(tp->mmio_addr + (reg)) -enum mac_version { - /* support for ancient RTL_GIGA_MAC_VER_01 has been removed */ - RTL_GIGA_MAC_VER_02, - RTL_GIGA_MAC_VER_03, - RTL_GIGA_MAC_VER_04, - RTL_GIGA_MAC_VER_05, - RTL_GIGA_MAC_VER_06, - RTL_GIGA_MAC_VER_07, - RTL_GIGA_MAC_VER_08, - RTL_GIGA_MAC_VER_09, - RTL_GIGA_MAC_VER_10, - RTL_GIGA_MAC_VER_11, - RTL_GIGA_MAC_VER_12, - RTL_GIGA_MAC_VER_13, - RTL_GIGA_MAC_VER_14, - RTL_GIGA_MAC_VER_15, - RTL_GIGA_MAC_VER_16, - RTL_GIGA_MAC_VER_17, - RTL_GIGA_MAC_VER_18, - RTL_GIGA_MAC_VER_19, - RTL_GIGA_MAC_VER_20, - RTL_GIGA_MAC_VER_21, - RTL_GIGA_MAC_VER_22, - RTL_GIGA_MAC_VER_23, - RTL_GIGA_MAC_VER_24, - RTL_GIGA_MAC_VER_25, - RTL_GIGA_MAC_VER_26, - RTL_GIGA_MAC_VER_27, - RTL_GIGA_MAC_VER_28, - RTL_GIGA_MAC_VER_29, - RTL_GIGA_MAC_VER_30, - RTL_GIGA_MAC_VER_31, - RTL_GIGA_MAC_VER_32, - RTL_GIGA_MAC_VER_33, - RTL_GIGA_MAC_VER_34, - RTL_GIGA_MAC_VER_35, - RTL_GIGA_MAC_VER_36, - RTL_GIGA_MAC_VER_37, - RTL_GIGA_MAC_VER_38, - RTL_GIGA_MAC_VER_39, - RTL_GIGA_MAC_VER_40, - RTL_GIGA_MAC_VER_41, - RTL_GIGA_MAC_VER_42, - RTL_GIGA_MAC_VER_43, - RTL_GIGA_MAC_VER_44, - RTL_GIGA_MAC_VER_45, - RTL_GIGA_MAC_VER_46, - RTL_GIGA_MAC_VER_47, - RTL_GIGA_MAC_VER_48, - RTL_GIGA_MAC_VER_49, - RTL_GIGA_MAC_VER_50, - RTL_GIGA_MAC_VER_51, - RTL_GIGA_MAC_VER_52, - RTL_GIGA_MAC_VER_60, - RTL_GIGA_MAC_VER_61, - RTL_GIGA_MAC_NONE -}; - -#define JUMBO_1K ETH_DATA_LEN #define JUMBO_4K (4*1024 - ETH_HLEN - 2) #define JUMBO_6K (6*1024 - ETH_HLEN - 2) #define JUMBO_7K (7*1024 - ETH_HLEN - 2) @@ -492,6 +434,7 @@ enum rtl_register_content { /* CPlusCmd p.31 */ EnableBist = (1 << 15), // 8168 8101 Mac_dbgo_oe = (1 << 14), // 8168 8101 + EnAnaPLL = (1 << 14), // 8169 Normal_mode = (1 << 13), // unused Force_half_dup = (1 << 12), // 8168 8101 Force_rxflow_en = (1 << 11), // 8168 8101 @@ -1078,52 +1021,6 @@ static int rtl_readphy(struct rtl8169_private *tp, int location) } } -static void rtl_patchphy(struct rtl8169_private *tp, int reg_addr, int value) -{ - rtl_writephy(tp, reg_addr, rtl_readphy(tp, reg_addr) | value); -} - -static void rtl_w0w1_phy(struct rtl8169_private *tp, int reg_addr, int p, int m) -{ - int val; - - val = rtl_readphy(tp, reg_addr); - rtl_writephy(tp, reg_addr, (val & ~m) | p); -} - -static void r8168d_modify_extpage(struct phy_device *phydev, int extpage, - int reg, u16 mask, u16 val) -{ - int oldpage = phy_select_page(phydev, 0x0007); - - __phy_write(phydev, 0x1e, extpage); - __phy_modify(phydev, reg, mask, val); - - phy_restore_page(phydev, oldpage, 0); -} - -static void r8168d_phy_param(struct phy_device *phydev, u16 parm, - u16 mask, u16 val) -{ - int oldpage = phy_select_page(phydev, 0x0005); - - __phy_write(phydev, 0x05, parm); - __phy_modify(phydev, 0x06, mask, val); - - phy_restore_page(phydev, oldpage, 0); -} - -static void r8168g_phy_param(struct phy_device *phydev, u16 parm, - u16 mask, u16 val) -{ - int oldpage = phy_select_page(phydev, 0x0a43); - - __phy_write(phydev, 0x13, parm); - __phy_modify(phydev, 0x14, mask, val); - - phy_restore_page(phydev, oldpage, 0); -} - DECLARE_RTL_COND(rtl_ephyar_cond) { return RTL_R32(tp, EPHYAR) & EPHYAR_FLAG; @@ -1372,7 +1269,7 @@ DECLARE_RTL_COND(rtl_efusear_cond) return RTL_R32(tp, EFUSEAR) & EFUSEAR_FLAG; } -static u8 rtl8168d_efuse_read(struct rtl8169_private *tp, int reg_addr) +u8 rtl8168d_efuse_read(struct rtl8169_private *tp, int reg_addr) { RTL_W32(tp, EFUSEAR, (reg_addr & EFUSEAR_REG_MASK) << EFUSEAR_REG_SHIFT); @@ -1596,7 +1493,7 @@ static netdev_features_t rtl8169_fix_features(struct net_device *dev, if (dev->mtu > TD_MSS_MAX) features &= ~NETIF_F_ALL_TSO; - if (dev->mtu > JUMBO_1K && + if (dev->mtu > ETH_DATA_LEN && tp->mac_version > RTL_GIGA_MAC_VER_06) features &= ~(NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO); @@ -2268,22 +2165,6 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp) } } -struct phy_reg { - u16 reg; - u16 val; -}; - -static void __rtl_writephy_batch(struct rtl8169_private *tp, - const struct phy_reg *regs, int len) -{ - while (len-- > 0) { - rtl_writephy(tp, regs->reg, regs->val); - regs++; - } -} - -#define rtl_writephy_batch(tp, a) __rtl_writephy_batch(tp, a, ARRAY_SIZE(a)) - static void rtl_release_firmware(struct rtl8169_private *tp) { if (tp->rtl_fw) { @@ -2293,7 +2174,7 @@ static void rtl_release_firmware(struct rtl8169_private *tp) } } -static void rtl_apply_firmware(struct rtl8169_private *tp) +void r8169_apply_firmware(struct rtl8169_private *tp) { /* TODO: release firmware if rtl_fw_write_firmware signals failure. */ if (tp->rtl_fw) @@ -2315,594 +2196,6 @@ static void rtl8125_config_eee_mac(struct rtl8169_private *tp) r8168_mac_ocp_modify(tp, 0xeb62, 0, BIT(2) | BIT(1)); } -static void rtl8168f_config_eee_phy(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - r8168d_modify_extpage(phydev, 0x0020, 0x15, 0, BIT(8)); - r8168d_phy_param(phydev, 0x8b85, 0, BIT(13)); -} - -static void rtl8168g_config_eee_phy(struct rtl8169_private *tp) -{ - phy_modify_paged(tp->phydev, 0x0a43, 0x11, 0, BIT(4)); -} - -static void rtl8168h_config_eee_phy(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl8168g_config_eee_phy(tp); - - phy_modify_paged(phydev, 0xa4a, 0x11, 0x0000, 0x0200); - phy_modify_paged(phydev, 0xa42, 0x14, 0x0000, 0x0080); -} - -static void rtl8125_config_eee_phy(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl8168h_config_eee_phy(tp); - - phy_modify_paged(phydev, 0xa6d, 0x12, 0x0001, 0x0000); - phy_modify_paged(phydev, 0xa6d, 0x14, 0x0010, 0x0000); -} - -static void rtl8169s_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x06, 0x006e }, - { 0x08, 0x0708 }, - { 0x15, 0x4000 }, - { 0x18, 0x65c7 }, - - { 0x1f, 0x0001 }, - { 0x03, 0x00a1 }, - { 0x02, 0x0008 }, - { 0x01, 0x0120 }, - { 0x00, 0x1000 }, - { 0x04, 0x0800 }, - { 0x04, 0x0000 }, - - { 0x03, 0xff41 }, - { 0x02, 0xdf60 }, - { 0x01, 0x0140 }, - { 0x00, 0x0077 }, - { 0x04, 0x7800 }, - { 0x04, 0x7000 }, - - { 0x03, 0x802f }, - { 0x02, 0x4f02 }, - { 0x01, 0x0409 }, - { 0x00, 0xf0f9 }, - { 0x04, 0x9800 }, - { 0x04, 0x9000 }, - - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0xff95 }, - { 0x00, 0xba00 }, - { 0x04, 0xa800 }, - { 0x04, 0xa000 }, - - { 0x03, 0xff41 }, - { 0x02, 0xdf20 }, - { 0x01, 0x0140 }, - { 0x00, 0x00bb }, - { 0x04, 0xb800 }, - { 0x04, 0xb000 }, - - { 0x03, 0xdf41 }, - { 0x02, 0xdc60 }, - { 0x01, 0x6340 }, - { 0x00, 0x007d }, - { 0x04, 0xd800 }, - { 0x04, 0xd000 }, - - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0x100a }, - { 0x00, 0xa0ff }, - { 0x04, 0xf800 }, - { 0x04, 0xf000 }, - - { 0x1f, 0x0000 }, - { 0x0b, 0x0000 }, - { 0x00, 0x9200 } - }; - - rtl_writephy_batch(tp, phy_reg_init); -} - -static void rtl8169sb_hw_phy_config(struct rtl8169_private *tp) -{ - phy_write_paged(tp->phydev, 0x0002, 0x01, 0x90d0); -} - -static void rtl8169scd_hw_phy_config_quirk(struct rtl8169_private *tp) -{ - struct pci_dev *pdev = tp->pci_dev; - - if ((pdev->subsystem_vendor != PCI_VENDOR_ID_GIGABYTE) || - (pdev->subsystem_device != 0xe000)) - return; - - phy_write_paged(tp->phydev, 0x0001, 0x10, 0xf01b); -} - -static void rtl8169scd_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x04, 0x0000 }, - { 0x03, 0x00a1 }, - { 0x02, 0x0008 }, - { 0x01, 0x0120 }, - { 0x00, 0x1000 }, - { 0x04, 0x0800 }, - { 0x04, 0x9000 }, - { 0x03, 0x802f }, - { 0x02, 0x4f02 }, - { 0x01, 0x0409 }, - { 0x00, 0xf099 }, - { 0x04, 0x9800 }, - { 0x04, 0xa000 }, - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0xff95 }, - { 0x00, 0xba00 }, - { 0x04, 0xa800 }, - { 0x04, 0xf000 }, - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0x101a }, - { 0x00, 0xa0ff }, - { 0x04, 0xf800 }, - { 0x04, 0x0000 }, - { 0x1f, 0x0000 }, - - { 0x1f, 0x0001 }, - { 0x10, 0xf41b }, - { 0x14, 0xfb54 }, - { 0x18, 0xf5c7 }, - { 0x1f, 0x0000 }, - - { 0x1f, 0x0001 }, - { 0x17, 0x0cc0 }, - { 0x1f, 0x0000 } - }; - - rtl_writephy_batch(tp, phy_reg_init); - - rtl8169scd_hw_phy_config_quirk(tp); -} - -static void rtl8169sce_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x04, 0x0000 }, - { 0x03, 0x00a1 }, - { 0x02, 0x0008 }, - { 0x01, 0x0120 }, - { 0x00, 0x1000 }, - { 0x04, 0x0800 }, - { 0x04, 0x9000 }, - { 0x03, 0x802f }, - { 0x02, 0x4f02 }, - { 0x01, 0x0409 }, - { 0x00, 0xf099 }, - { 0x04, 0x9800 }, - { 0x04, 0xa000 }, - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0xff95 }, - { 0x00, 0xba00 }, - { 0x04, 0xa800 }, - { 0x04, 0xf000 }, - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0x101a }, - { 0x00, 0xa0ff }, - { 0x04, 0xf800 }, - { 0x04, 0x0000 }, - { 0x1f, 0x0000 }, - - { 0x1f, 0x0001 }, - { 0x0b, 0x8480 }, - { 0x1f, 0x0000 }, - - { 0x1f, 0x0001 }, - { 0x18, 0x67c7 }, - { 0x04, 0x2000 }, - { 0x03, 0x002f }, - { 0x02, 0x4360 }, - { 0x01, 0x0109 }, - { 0x00, 0x3022 }, - { 0x04, 0x2800 }, - { 0x1f, 0x0000 }, - - { 0x1f, 0x0001 }, - { 0x17, 0x0cc0 }, - { 0x1f, 0x0000 } - }; - - rtl_writephy_batch(tp, phy_reg_init); -} - -static void rtl8168bb_hw_phy_config(struct rtl8169_private *tp) -{ - rtl_writephy(tp, 0x1f, 0x0001); - rtl_patchphy(tp, 0x16, 1 << 0); - rtl_writephy(tp, 0x10, 0xf41b); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8168bef_hw_phy_config(struct rtl8169_private *tp) -{ - phy_write_paged(tp->phydev, 0x0001, 0x10, 0xf41b); -} - -static void rtl8168cp_1_hw_phy_config(struct rtl8169_private *tp) -{ - phy_write(tp->phydev, 0x1d, 0x0f00); - phy_write_paged(tp->phydev, 0x0002, 0x0c, 0x1ec8); -} - -static void rtl8168cp_2_hw_phy_config(struct rtl8169_private *tp) -{ - phy_set_bits(tp->phydev, 0x14, BIT(5)); - phy_set_bits(tp->phydev, 0x0d, BIT(5)); - phy_write_paged(tp->phydev, 0x0001, 0x1d, 0x3d98); -} - -static void rtl8168c_1_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x12, 0x2300 }, - { 0x1f, 0x0002 }, - { 0x00, 0x88d4 }, - { 0x01, 0x82b1 }, - { 0x03, 0x7002 }, - { 0x08, 0x9e30 }, - { 0x09, 0x01f0 }, - { 0x0a, 0x5500 }, - { 0x0c, 0x00c8 }, - { 0x1f, 0x0003 }, - { 0x12, 0xc096 }, - { 0x16, 0x000a }, - { 0x1f, 0x0000 }, - { 0x1f, 0x0000 }, - { 0x09, 0x2000 }, - { 0x09, 0x0000 } - }; - - rtl_writephy_batch(tp, phy_reg_init); - - rtl_patchphy(tp, 0x14, 1 << 5); - rtl_patchphy(tp, 0x0d, 1 << 5); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8168c_2_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x12, 0x2300 }, - { 0x03, 0x802f }, - { 0x02, 0x4f02 }, - { 0x01, 0x0409 }, - { 0x00, 0xf099 }, - { 0x04, 0x9800 }, - { 0x04, 0x9000 }, - { 0x1d, 0x3d98 }, - { 0x1f, 0x0002 }, - { 0x0c, 0x7eb8 }, - { 0x06, 0x0761 }, - { 0x1f, 0x0003 }, - { 0x16, 0x0f0a }, - { 0x1f, 0x0000 } - }; - - rtl_writephy_batch(tp, phy_reg_init); - - rtl_patchphy(tp, 0x16, 1 << 0); - rtl_patchphy(tp, 0x14, 1 << 5); - rtl_patchphy(tp, 0x0d, 1 << 5); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8168c_3_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0001 }, - { 0x12, 0x2300 }, - { 0x1d, 0x3d98 }, - { 0x1f, 0x0002 }, - { 0x0c, 0x7eb8 }, - { 0x06, 0x5461 }, - { 0x1f, 0x0003 }, - { 0x16, 0x0f0a }, - { 0x1f, 0x0000 } - }; - - rtl_writephy_batch(tp, phy_reg_init); - - rtl_patchphy(tp, 0x16, 1 << 0); - rtl_patchphy(tp, 0x14, 1 << 5); - rtl_patchphy(tp, 0x0d, 1 << 5); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static const struct phy_reg rtl8168d_1_phy_reg_init_0[] = { - /* Channel Estimation */ - { 0x1f, 0x0001 }, - { 0x06, 0x4064 }, - { 0x07, 0x2863 }, - { 0x08, 0x059c }, - { 0x09, 0x26b4 }, - { 0x0a, 0x6a19 }, - { 0x0b, 0xdcc8 }, - { 0x10, 0xf06d }, - { 0x14, 0x7f68 }, - { 0x18, 0x7fd9 }, - { 0x1c, 0xf0ff }, - { 0x1d, 0x3d9c }, - { 0x1f, 0x0003 }, - { 0x12, 0xf49f }, - { 0x13, 0x070b }, - { 0x1a, 0x05ad }, - { 0x14, 0x94c0 }, - - /* - * Tx Error Issue - * Enhance line driver power - */ - { 0x1f, 0x0002 }, - { 0x06, 0x5561 }, - { 0x1f, 0x0005 }, - { 0x05, 0x8332 }, - { 0x06, 0x5561 }, - - /* - * Can not link to 1Gbps with bad cable - * Decrease SNR threshold form 21.07dB to 19.04dB - */ - { 0x1f, 0x0001 }, - { 0x17, 0x0cc0 }, - - { 0x1f, 0x0000 }, - { 0x0d, 0xf880 } -}; - -static const struct phy_reg rtl8168d_1_phy_reg_init_1[] = { - { 0x1f, 0x0002 }, - { 0x05, 0x669a }, - { 0x1f, 0x0005 }, - { 0x05, 0x8330 }, - { 0x06, 0x669a }, - { 0x1f, 0x0002 } -}; - -static void rtl8168d_apply_firmware_cond(struct rtl8169_private *tp, u16 val) -{ - u16 reg_val; - - rtl_writephy(tp, 0x1f, 0x0005); - rtl_writephy(tp, 0x05, 0x001b); - reg_val = rtl_readphy(tp, 0x06); - rtl_writephy(tp, 0x1f, 0x0000); - - if (reg_val != val) - netif_warn(tp, hw, tp->dev, "chipset not ready for firmware\n"); - else - rtl_apply_firmware(tp); -} - -static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp) -{ - rtl_writephy_batch(tp, rtl8168d_1_phy_reg_init_0); - - /* - * Rx Error Issue - * Fine Tune Switching regulator parameter - */ - rtl_writephy(tp, 0x1f, 0x0002); - rtl_w0w1_phy(tp, 0x0b, 0x0010, 0x00ef); - rtl_w0w1_phy(tp, 0x0c, 0xa200, 0x5d00); - - if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) { - int val; - - rtl_writephy_batch(tp, rtl8168d_1_phy_reg_init_1); - - val = rtl_readphy(tp, 0x0d); - - if ((val & 0x00ff) != 0x006c) { - static const u32 set[] = { - 0x0065, 0x0066, 0x0067, 0x0068, - 0x0069, 0x006a, 0x006b, 0x006c - }; - int i; - - rtl_writephy(tp, 0x1f, 0x0002); - - val &= 0xff00; - for (i = 0; i < ARRAY_SIZE(set); i++) - rtl_writephy(tp, 0x0d, val | set[i]); - } - } else { - phy_write_paged(tp->phydev, 0x0002, 0x05, 0x6662); - r8168d_phy_param(tp->phydev, 0x8330, 0xffff, 0x6662); - } - - /* RSET couple improve */ - rtl_writephy(tp, 0x1f, 0x0002); - rtl_patchphy(tp, 0x0d, 0x0300); - rtl_patchphy(tp, 0x0f, 0x0010); - - /* Fine tune PLL performance */ - rtl_writephy(tp, 0x1f, 0x0002); - rtl_w0w1_phy(tp, 0x02, 0x0100, 0x0600); - rtl_w0w1_phy(tp, 0x03, 0x0000, 0xe000); - rtl_writephy(tp, 0x1f, 0x0000); - - rtl8168d_apply_firmware_cond(tp, 0xbf00); -} - -static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp) -{ - rtl_writephy_batch(tp, rtl8168d_1_phy_reg_init_0); - - if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) { - int val; - - rtl_writephy_batch(tp, rtl8168d_1_phy_reg_init_1); - - val = rtl_readphy(tp, 0x0d); - if ((val & 0x00ff) != 0x006c) { - static const u32 set[] = { - 0x0065, 0x0066, 0x0067, 0x0068, - 0x0069, 0x006a, 0x006b, 0x006c - }; - int i; - - rtl_writephy(tp, 0x1f, 0x0002); - - val &= 0xff00; - for (i = 0; i < ARRAY_SIZE(set); i++) - rtl_writephy(tp, 0x0d, val | set[i]); - } - } else { - phy_write_paged(tp->phydev, 0x0002, 0x05, 0x2642); - r8168d_phy_param(tp->phydev, 0x8330, 0xffff, 0x2642); - } - - /* Fine tune PLL performance */ - rtl_writephy(tp, 0x1f, 0x0002); - rtl_w0w1_phy(tp, 0x02, 0x0100, 0x0600); - rtl_w0w1_phy(tp, 0x03, 0x0000, 0xe000); - - /* Switching regulator Slew rate */ - rtl_writephy(tp, 0x1f, 0x0002); - rtl_patchphy(tp, 0x0f, 0x0017); - rtl_writephy(tp, 0x1f, 0x0000); - - rtl8168d_apply_firmware_cond(tp, 0xb300); -} - -static void rtl8168d_3_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0002 }, - { 0x10, 0x0008 }, - { 0x0d, 0x006c }, - - { 0x1f, 0x0000 }, - { 0x0d, 0xf880 }, - - { 0x1f, 0x0001 }, - { 0x17, 0x0cc0 }, - - { 0x1f, 0x0001 }, - { 0x0b, 0xa4d8 }, - { 0x09, 0x281c }, - { 0x07, 0x2883 }, - { 0x0a, 0x6b35 }, - { 0x1d, 0x3da4 }, - { 0x1c, 0xeffd }, - { 0x14, 0x7f52 }, - { 0x18, 0x7fc6 }, - { 0x08, 0x0601 }, - { 0x06, 0x4063 }, - { 0x10, 0xf074 }, - { 0x1f, 0x0003 }, - { 0x13, 0x0789 }, - { 0x12, 0xf4bd }, - { 0x1a, 0x04fd }, - { 0x14, 0x84b0 }, - { 0x1f, 0x0000 }, - { 0x00, 0x9200 }, - - { 0x1f, 0x0005 }, - { 0x01, 0x0340 }, - { 0x1f, 0x0001 }, - { 0x04, 0x4000 }, - { 0x03, 0x1d21 }, - { 0x02, 0x0c32 }, - { 0x01, 0x0200 }, - { 0x00, 0x5554 }, - { 0x04, 0x4800 }, - { 0x04, 0x4000 }, - { 0x04, 0xf000 }, - { 0x03, 0xdf01 }, - { 0x02, 0xdf20 }, - { 0x01, 0x101a }, - { 0x00, 0xa0ff }, - { 0x04, 0xf800 }, - { 0x04, 0xf000 }, - { 0x1f, 0x0000 }, - }; - - rtl_writephy_batch(tp, phy_reg_init); - - r8168d_modify_extpage(tp->phydev, 0x0023, 0x16, 0xffff, 0x0000); -} - -static void rtl8168d_4_hw_phy_config(struct rtl8169_private *tp) -{ - phy_write_paged(tp->phydev, 0x0001, 0x17, 0x0cc0); - r8168d_modify_extpage(tp->phydev, 0x002d, 0x18, 0xffff, 0x0040); - phy_set_bits(tp->phydev, 0x0d, BIT(5)); -} - -static void rtl8168e_1_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - /* Channel estimation fine tune */ - { 0x1f, 0x0001 }, - { 0x0b, 0x6c20 }, - { 0x07, 0x2872 }, - { 0x1c, 0xefff }, - { 0x1f, 0x0003 }, - { 0x14, 0x6420 }, - { 0x1f, 0x0000 }, - }; - struct phy_device *phydev = tp->phydev; - - rtl_apply_firmware(tp); - - /* Enable Delay cap */ - r8168d_phy_param(phydev, 0x8b80, 0xffff, 0xc896); - - rtl_writephy_batch(tp, phy_reg_init); - - /* Update PFM & 10M TX idle timer */ - r8168d_modify_extpage(phydev, 0x002f, 0x15, 0xffff, 0x1919); - - r8168d_modify_extpage(phydev, 0x00ac, 0x18, 0xffff, 0x0006); - - /* DCO enable for 10M IDLE Power */ - r8168d_modify_extpage(phydev, 0x0023, 0x17, 0x0000, 0x0006); - - /* For impedance matching */ - phy_modify_paged(phydev, 0x0002, 0x08, 0x7f00, 0x8000); - - /* PHY auto speed down */ - r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0050); - phy_set_bits(phydev, 0x14, BIT(15)); - - r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); - r8168d_phy_param(phydev, 0x8b85, 0x2000, 0x0000); - - r8168d_modify_extpage(phydev, 0x0020, 0x15, 0x1100, 0x0000); - phy_write_paged(phydev, 0x0006, 0x00, 0x5a00); - - phy_write_mmd(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV, 0x0000); -} - static void rtl_rar_exgmac_set(struct rtl8169_private *tp, u8 *addr) { const u16 w[] = { @@ -2917,286 +2210,7 @@ static void rtl_rar_exgmac_set(struct rtl8169_private *tp, u8 *addr) rtl_eri_write(tp, 0xf4, ERIAR_MASK_1111, w[1] | (w[2] << 16)); } -static void rtl8168e_2_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl_apply_firmware(tp); - - /* Enable Delay cap */ - r8168d_modify_extpage(phydev, 0x00ac, 0x18, 0xffff, 0x0006); - - /* Channel estimation fine tune */ - phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); - - /* Green Setting */ - r8168d_phy_param(phydev, 0x8b5b, 0xffff, 0x9222); - r8168d_phy_param(phydev, 0x8b6d, 0xffff, 0x8000); - r8168d_phy_param(phydev, 0x8b76, 0xffff, 0x8000); - - /* For 4-corner performance improve */ - rtl_writephy(tp, 0x1f, 0x0005); - rtl_writephy(tp, 0x05, 0x8b80); - rtl_w0w1_phy(tp, 0x17, 0x0006, 0x0000); - rtl_writephy(tp, 0x1f, 0x0000); - - /* PHY auto speed down */ - r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0010); - phy_set_bits(phydev, 0x14, BIT(15)); - - /* improve 10M EEE waveform */ - r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); - - /* Improve 2-pair detection performance */ - r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); - - rtl8168f_config_eee_phy(tp); - - /* Green feature */ - rtl_writephy(tp, 0x1f, 0x0003); - rtl_w0w1_phy(tp, 0x19, 0x0001, 0x0000); - rtl_w0w1_phy(tp, 0x10, 0x0400, 0x0000); - rtl_writephy(tp, 0x1f, 0x0000); - rtl_writephy(tp, 0x1f, 0x0005); - rtl_w0w1_phy(tp, 0x01, 0x0100, 0x0000); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8168f_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - /* For 4-corner performance improve */ - r8168d_phy_param(phydev, 0x8b80, 0x0000, 0x0006); - - /* PHY auto speed down */ - r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0010); - phy_set_bits(phydev, 0x14, BIT(15)); - - /* Improve 10M EEE waveform */ - r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); - - rtl8168f_config_eee_phy(tp); -} - -static void rtl8168f_1_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl_apply_firmware(tp); - - /* Channel estimation fine tune */ - phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); - - /* Modify green table for giga & fnet */ - r8168d_phy_param(phydev, 0x8b55, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b5e, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b67, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b70, 0xffff, 0x0000); - r8168d_modify_extpage(phydev, 0x0078, 0x17, 0xffff, 0x0000); - r8168d_modify_extpage(phydev, 0x0078, 0x19, 0xffff, 0x00fb); - - /* Modify green table for 10M */ - r8168d_phy_param(phydev, 0x8b79, 0xffff, 0xaa00); - - /* Disable hiimpedance detection (RTCT) */ - phy_write_paged(phydev, 0x0003, 0x01, 0x328a); - - rtl8168f_hw_phy_config(tp); - - /* Improve 2-pair detection performance */ - r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); -} - -static void rtl8168f_2_hw_phy_config(struct rtl8169_private *tp) -{ - rtl_apply_firmware(tp); - - rtl8168f_hw_phy_config(tp); -} - -static void rtl8411_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl_apply_firmware(tp); - - rtl8168f_hw_phy_config(tp); - - /* Improve 2-pair detection performance */ - r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); - - /* Channel estimation fine tune */ - phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); - - /* Modify green table for giga & fnet */ - r8168d_phy_param(phydev, 0x8b55, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b5e, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b67, 0xffff, 0x0000); - r8168d_phy_param(phydev, 0x8b70, 0xffff, 0x0000); - r8168d_modify_extpage(phydev, 0x0078, 0x17, 0xffff, 0x0000); - r8168d_modify_extpage(phydev, 0x0078, 0x19, 0xffff, 0x00aa); - - /* Modify green table for 10M */ - r8168d_phy_param(phydev, 0x8b79, 0xffff, 0xaa00); - - /* Disable hiimpedance detection (RTCT) */ - phy_write_paged(phydev, 0x0003, 0x01, 0x328a); - - /* Modify green table for giga */ - r8168d_phy_param(phydev, 0x8b54, 0x0800, 0x0000); - r8168d_phy_param(phydev, 0x8b5d, 0x0800, 0x0000); - r8168d_phy_param(phydev, 0x8a7c, 0x0100, 0x0000); - r8168d_phy_param(phydev, 0x8a7f, 0x0000, 0x0100); - r8168d_phy_param(phydev, 0x8a82, 0x0100, 0x0000); - r8168d_phy_param(phydev, 0x8a85, 0x0100, 0x0000); - r8168d_phy_param(phydev, 0x8a88, 0x0100, 0x0000); - - /* uc same-seed solution */ - r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x8000); - - /* Green feature */ - rtl_writephy(tp, 0x1f, 0x0003); - rtl_w0w1_phy(tp, 0x19, 0x0000, 0x0001); - rtl_w0w1_phy(tp, 0x10, 0x0000, 0x0400); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8168g_disable_aldps(struct rtl8169_private *tp) -{ - phy_modify_paged(tp->phydev, 0x0a43, 0x10, BIT(2), 0); -} - -static void rtl8168g_phy_adjust_10m_aldps(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - phy_modify_paged(phydev, 0x0bcc, 0x14, BIT(8), 0); - phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(7) | BIT(6)); - r8168g_phy_param(phydev, 0x8084, 0x6000, 0x0000); - phy_modify_paged(phydev, 0x0a43, 0x10, 0x0000, 0x1003); -} - -static void rtl8168g_1_hw_phy_config(struct rtl8169_private *tp) -{ - int ret; - - rtl_apply_firmware(tp); - - ret = phy_read_paged(tp->phydev, 0x0a46, 0x10); - if (ret & BIT(8)) - phy_modify_paged(tp->phydev, 0x0bcc, 0x12, BIT(15), 0); - else - phy_modify_paged(tp->phydev, 0x0bcc, 0x12, 0, BIT(15)); - - ret = phy_read_paged(tp->phydev, 0x0a46, 0x13); - if (ret & BIT(8)) - phy_modify_paged(tp->phydev, 0x0c41, 0x15, 0, BIT(1)); - else - phy_modify_paged(tp->phydev, 0x0c41, 0x15, BIT(1), 0); - - /* Enable PHY auto speed down */ - phy_modify_paged(tp->phydev, 0x0a44, 0x11, 0, BIT(3) | BIT(2)); - - rtl8168g_phy_adjust_10m_aldps(tp); - - /* EEE auto-fallback function */ - phy_modify_paged(tp->phydev, 0x0a4b, 0x11, 0, BIT(2)); - - /* Enable UC LPF tune function */ - r8168g_phy_param(tp->phydev, 0x8012, 0x0000, 0x8000); - - phy_modify_paged(tp->phydev, 0x0c42, 0x11, BIT(13), BIT(14)); - - /* Improve SWR Efficiency */ - rtl_writephy(tp, 0x1f, 0x0bcd); - rtl_writephy(tp, 0x14, 0x5065); - rtl_writephy(tp, 0x14, 0xd065); - rtl_writephy(tp, 0x1f, 0x0bc8); - rtl_writephy(tp, 0x11, 0x5655); - rtl_writephy(tp, 0x1f, 0x0bcd); - rtl_writephy(tp, 0x14, 0x1065); - rtl_writephy(tp, 0x14, 0x9065); - rtl_writephy(tp, 0x14, 0x1065); - rtl_writephy(tp, 0x1f, 0x0000); - - rtl8168g_disable_aldps(tp); - rtl8168g_config_eee_phy(tp); -} - -static void rtl8168g_2_hw_phy_config(struct rtl8169_private *tp) -{ - rtl_apply_firmware(tp); - rtl8168g_config_eee_phy(tp); -} - -static void rtl8168h_1_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - u16 dout_tapbin; - u32 data; - - rtl_apply_firmware(tp); - - /* CHN EST parameters adjust - giga master */ - r8168g_phy_param(phydev, 0x809b, 0xf800, 0x8000); - r8168g_phy_param(phydev, 0x80a2, 0xff00, 0x8000); - r8168g_phy_param(phydev, 0x80a4, 0xff00, 0x8500); - r8168g_phy_param(phydev, 0x809c, 0xff00, 0xbd00); - - /* CHN EST parameters adjust - giga slave */ - r8168g_phy_param(phydev, 0x80ad, 0xf800, 0x7000); - r8168g_phy_param(phydev, 0x80b4, 0xff00, 0x5000); - r8168g_phy_param(phydev, 0x80ac, 0xff00, 0x4000); - - /* CHN EST parameters adjust - fnet */ - r8168g_phy_param(phydev, 0x808e, 0xff00, 0x1200); - r8168g_phy_param(phydev, 0x8090, 0xff00, 0xe500); - r8168g_phy_param(phydev, 0x8092, 0xff00, 0x9f00); - - /* enable R-tune & PGA-retune function */ - dout_tapbin = 0; - data = phy_read_paged(phydev, 0x0a46, 0x13); - data &= 3; - data <<= 2; - dout_tapbin |= data; - data = phy_read_paged(phydev, 0x0a46, 0x12); - data &= 0xc000; - data >>= 14; - dout_tapbin |= data; - dout_tapbin = ~(dout_tapbin^0x08); - dout_tapbin <<= 12; - dout_tapbin &= 0xf000; - - r8168g_phy_param(phydev, 0x827a, 0xf000, dout_tapbin); - r8168g_phy_param(phydev, 0x827b, 0xf000, dout_tapbin); - r8168g_phy_param(phydev, 0x827c, 0xf000, dout_tapbin); - r8168g_phy_param(phydev, 0x827d, 0xf000, dout_tapbin); - r8168g_phy_param(phydev, 0x0811, 0x0000, 0x0800); - phy_modify_paged(phydev, 0x0a42, 0x16, 0x0000, 0x0002); - - /* enable GPHY 10M */ - phy_modify_paged(tp->phydev, 0x0a44, 0x11, 0, BIT(11)); - - /* SAR ADC performance */ - phy_modify_paged(tp->phydev, 0x0bca, 0x17, BIT(12) | BIT(13), BIT(14)); - - r8168g_phy_param(phydev, 0x803f, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x8047, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x804f, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x8057, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x805f, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x8067, 0x3000, 0x0000); - r8168g_phy_param(phydev, 0x806f, 0x3000, 0x0000); - - /* disable phy pfm mode */ - phy_modify_paged(tp->phydev, 0x0a44, 0x11, BIT(7), 0); - - rtl8168g_disable_aldps(tp); - rtl8168h_config_eee_phy(tp); -} - -static u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp) +u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp) { u16 data1, data2, ioffset; @@ -3212,410 +2226,28 @@ static u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp) return ioffset; } -static void rtl8168h_2_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - u16 ioffset, rlen; - u32 data; - - rtl_apply_firmware(tp); - - /* CHIN EST parameter update */ - r8168g_phy_param(phydev, 0x808a, 0x003f, 0x000a); - - /* enable R-tune & PGA-retune function */ - r8168g_phy_param(phydev, 0x0811, 0x0000, 0x0800); - phy_modify_paged(phydev, 0x0a42, 0x16, 0x0000, 0x0002); - - /* enable GPHY 10M */ - phy_modify_paged(tp->phydev, 0x0a44, 0x11, 0, BIT(11)); - - ioffset = rtl8168h_2_get_adc_bias_ioffset(tp); - if (ioffset != 0xffff) - phy_write_paged(phydev, 0x0bcf, 0x16, ioffset); - - /* Modify rlen (TX LPF corner frequency) level */ - data = phy_read_paged(phydev, 0x0bcd, 0x16); - data &= 0x000f; - rlen = 0; - if (data > 3) - rlen = data - 3; - data = rlen | (rlen<<4) | (rlen<<8) | (rlen<<12); - phy_write_paged(phydev, 0x0bcd, 0x17, data); - - /* disable phy pfm mode */ - phy_modify_paged(phydev, 0x0a44, 0x11, BIT(7), 0); - - rtl8168g_disable_aldps(tp); - rtl8168g_config_eee_phy(tp); -} - -static void rtl8168ep_1_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - /* Enable PHY auto speed down */ - phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(3) | BIT(2)); - - rtl8168g_phy_adjust_10m_aldps(tp); - - /* Enable EEE auto-fallback function */ - phy_modify_paged(phydev, 0x0a4b, 0x11, 0, BIT(2)); - - /* Enable UC LPF tune function */ - r8168g_phy_param(phydev, 0x8012, 0x0000, 0x8000); - - /* set rg_sel_sdm_rate */ - phy_modify_paged(phydev, 0x0c42, 0x11, BIT(13), BIT(14)); - - rtl8168g_disable_aldps(tp); - rtl8168g_config_eee_phy(tp); -} - -static void rtl8168ep_2_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - rtl8168g_phy_adjust_10m_aldps(tp); - - /* Enable UC LPF tune function */ - r8168g_phy_param(phydev, 0x8012, 0x0000, 0x8000); - - /* Set rg_sel_sdm_rate */ - phy_modify_paged(tp->phydev, 0x0c42, 0x11, BIT(13), BIT(14)); - - /* Channel estimation parameters */ - r8168g_phy_param(phydev, 0x80f3, 0xff00, 0x8b00); - r8168g_phy_param(phydev, 0x80f0, 0xff00, 0x3a00); - r8168g_phy_param(phydev, 0x80ef, 0xff00, 0x0500); - r8168g_phy_param(phydev, 0x80f6, 0xff00, 0x6e00); - r8168g_phy_param(phydev, 0x80ec, 0xff00, 0x6800); - r8168g_phy_param(phydev, 0x80ed, 0xff00, 0x7c00); - r8168g_phy_param(phydev, 0x80f2, 0xff00, 0xf400); - r8168g_phy_param(phydev, 0x80f4, 0xff00, 0x8500); - r8168g_phy_param(phydev, 0x8110, 0xff00, 0xa800); - r8168g_phy_param(phydev, 0x810f, 0xff00, 0x1d00); - r8168g_phy_param(phydev, 0x8111, 0xff00, 0xf500); - r8168g_phy_param(phydev, 0x8113, 0xff00, 0x6100); - r8168g_phy_param(phydev, 0x8115, 0xff00, 0x9200); - r8168g_phy_param(phydev, 0x810e, 0xff00, 0x0400); - r8168g_phy_param(phydev, 0x810c, 0xff00, 0x7c00); - r8168g_phy_param(phydev, 0x810b, 0xff00, 0x5a00); - r8168g_phy_param(phydev, 0x80d1, 0xff00, 0xff00); - r8168g_phy_param(phydev, 0x80cd, 0xff00, 0x9e00); - r8168g_phy_param(phydev, 0x80d3, 0xff00, 0x0e00); - r8168g_phy_param(phydev, 0x80d5, 0xff00, 0xca00); - r8168g_phy_param(phydev, 0x80d7, 0xff00, 0x8400); - - /* Force PWM-mode */ - rtl_writephy(tp, 0x1f, 0x0bcd); - rtl_writephy(tp, 0x14, 0x5065); - rtl_writephy(tp, 0x14, 0xd065); - rtl_writephy(tp, 0x1f, 0x0bc8); - rtl_writephy(tp, 0x12, 0x00ed); - rtl_writephy(tp, 0x1f, 0x0bcd); - rtl_writephy(tp, 0x14, 0x1065); - rtl_writephy(tp, 0x14, 0x9065); - rtl_writephy(tp, 0x14, 0x1065); - rtl_writephy(tp, 0x1f, 0x0000); - - rtl8168g_disable_aldps(tp); - rtl8168g_config_eee_phy(tp); -} - -static void rtl8117_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - /* CHN EST parameters adjust - fnet */ - r8168g_phy_param(phydev, 0x808e, 0xff00, 0x4800); - r8168g_phy_param(phydev, 0x8090, 0xff00, 0xcc00); - r8168g_phy_param(phydev, 0x8092, 0xff00, 0xb000); - - r8168g_phy_param(phydev, 0x8088, 0xff00, 0x6000); - r8168g_phy_param(phydev, 0x808b, 0x3f00, 0x0b00); - r8168g_phy_param(phydev, 0x808d, 0x1f00, 0x0600); - r8168g_phy_param(phydev, 0x808c, 0xff00, 0xb000); - r8168g_phy_param(phydev, 0x80a0, 0xff00, 0x2800); - r8168g_phy_param(phydev, 0x80a2, 0xff00, 0x5000); - r8168g_phy_param(phydev, 0x809b, 0xf800, 0xb000); - r8168g_phy_param(phydev, 0x809a, 0xff00, 0x4b00); - r8168g_phy_param(phydev, 0x809d, 0x3f00, 0x0800); - r8168g_phy_param(phydev, 0x80a1, 0xff00, 0x7000); - r8168g_phy_param(phydev, 0x809f, 0x1f00, 0x0300); - r8168g_phy_param(phydev, 0x809e, 0xff00, 0x8800); - r8168g_phy_param(phydev, 0x80b2, 0xff00, 0x2200); - r8168g_phy_param(phydev, 0x80ad, 0xf800, 0x9800); - r8168g_phy_param(phydev, 0x80af, 0x3f00, 0x0800); - r8168g_phy_param(phydev, 0x80b3, 0xff00, 0x6f00); - r8168g_phy_param(phydev, 0x80b1, 0x1f00, 0x0300); - r8168g_phy_param(phydev, 0x80b0, 0xff00, 0x9300); - - r8168g_phy_param(phydev, 0x8011, 0x0000, 0x0800); - - /* enable GPHY 10M */ - phy_modify_paged(tp->phydev, 0x0a44, 0x11, 0, BIT(11)); - - r8168g_phy_param(phydev, 0x8016, 0x0000, 0x0400); - - rtl8168g_disable_aldps(tp); - rtl8168h_config_eee_phy(tp); -} - -static void rtl8102e_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0003 }, - { 0x08, 0x441d }, - { 0x01, 0x9100 }, - { 0x1f, 0x0000 } - }; - - rtl_writephy(tp, 0x1f, 0x0000); - rtl_patchphy(tp, 0x11, 1 << 12); - rtl_patchphy(tp, 0x19, 1 << 13); - rtl_patchphy(tp, 0x10, 1 << 15); - - rtl_writephy_batch(tp, phy_reg_init); -} - -static void rtl8105e_hw_phy_config(struct rtl8169_private *tp) -{ - /* Disable ALDPS before ram code */ - phy_write(tp->phydev, 0x18, 0x0310); - msleep(100); - - rtl_apply_firmware(tp); - - phy_write_paged(tp->phydev, 0x0005, 0x1a, 0x0000); - phy_write_paged(tp->phydev, 0x0004, 0x1c, 0x0000); - phy_write_paged(tp->phydev, 0x0001, 0x15, 0x7701); -} - -static void rtl8402_hw_phy_config(struct rtl8169_private *tp) -{ - /* Disable ALDPS before setting firmware */ - phy_write(tp->phydev, 0x18, 0x0310); - msleep(20); - - rtl_apply_firmware(tp); - - /* EEE setting */ - rtl_eri_write(tp, 0x1b0, ERIAR_MASK_0011, 0x0000); - rtl_writephy(tp, 0x1f, 0x0004); - rtl_writephy(tp, 0x10, 0x401f); - rtl_writephy(tp, 0x19, 0x7030); - rtl_writephy(tp, 0x1f, 0x0000); -} - -static void rtl8106e_hw_phy_config(struct rtl8169_private *tp) -{ - static const struct phy_reg phy_reg_init[] = { - { 0x1f, 0x0004 }, - { 0x10, 0xc07f }, - { 0x19, 0x7030 }, - { 0x1f, 0x0000 } - }; - - /* Disable ALDPS before ram code */ - phy_write(tp->phydev, 0x18, 0x0310); - msleep(100); - - rtl_apply_firmware(tp); - - rtl_eri_write(tp, 0x1b0, ERIAR_MASK_0011, 0x0000); - rtl_writephy_batch(tp, phy_reg_init); - - rtl_eri_write(tp, 0x1d0, ERIAR_MASK_0011, 0x0000); -} - -static void rtl8125_1_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - - phy_modify_paged(phydev, 0xad4, 0x10, 0x03ff, 0x0084); - phy_modify_paged(phydev, 0xad4, 0x17, 0x0000, 0x0010); - phy_modify_paged(phydev, 0xad1, 0x13, 0x03ff, 0x0006); - phy_modify_paged(phydev, 0xad3, 0x11, 0x003f, 0x0006); - phy_modify_paged(phydev, 0xac0, 0x14, 0x0000, 0x1100); - phy_modify_paged(phydev, 0xac8, 0x15, 0xf000, 0x7000); - phy_modify_paged(phydev, 0xad1, 0x14, 0x0000, 0x0400); - phy_modify_paged(phydev, 0xad1, 0x15, 0x0000, 0x03ff); - phy_modify_paged(phydev, 0xad1, 0x16, 0x0000, 0x03ff); - - r8168g_phy_param(phydev, 0x80ea, 0xff00, 0xc400); - r8168g_phy_param(phydev, 0x80eb, 0x0700, 0x0300); - r8168g_phy_param(phydev, 0x80f8, 0xff00, 0x1c00); - r8168g_phy_param(phydev, 0x80f1, 0xff00, 0x3000); - r8168g_phy_param(phydev, 0x80fe, 0xff00, 0xa500); - r8168g_phy_param(phydev, 0x8102, 0xff00, 0x5000); - r8168g_phy_param(phydev, 0x8105, 0xff00, 0x3300); - r8168g_phy_param(phydev, 0x8100, 0xff00, 0x7000); - r8168g_phy_param(phydev, 0x8104, 0xff00, 0xf000); - r8168g_phy_param(phydev, 0x8106, 0xff00, 0x6500); - r8168g_phy_param(phydev, 0x80dc, 0xff00, 0xed00); - r8168g_phy_param(phydev, 0x80df, 0x0000, 0x0100); - r8168g_phy_param(phydev, 0x80e1, 0x0100, 0x0000); - - phy_modify_paged(phydev, 0xbf0, 0x13, 0x003f, 0x0038); - r8168g_phy_param(phydev, 0x819f, 0xffff, 0xd0b6); - - phy_write_paged(phydev, 0xbc3, 0x12, 0x5555); - phy_modify_paged(phydev, 0xbf0, 0x15, 0x0e00, 0x0a00); - phy_modify_paged(phydev, 0xa5c, 0x10, 0x0400, 0x0000); - phy_modify_paged(phydev, 0xa44, 0x11, 0x0000, 0x0800); - - rtl8125_config_eee_phy(tp); -} - -static void rtl8125_2_hw_phy_config(struct rtl8169_private *tp) -{ - struct phy_device *phydev = tp->phydev; - int i; - - phy_modify_paged(phydev, 0xad4, 0x17, 0x0000, 0x0010); - phy_modify_paged(phydev, 0xad1, 0x13, 0x03ff, 0x03ff); - phy_modify_paged(phydev, 0xad3, 0x11, 0x003f, 0x0006); - phy_modify_paged(phydev, 0xac0, 0x14, 0x1100, 0x0000); - phy_modify_paged(phydev, 0xacc, 0x10, 0x0003, 0x0002); - phy_modify_paged(phydev, 0xad4, 0x10, 0x00e7, 0x0044); - phy_modify_paged(phydev, 0xac1, 0x12, 0x0080, 0x0000); - phy_modify_paged(phydev, 0xac8, 0x10, 0x0300, 0x0000); - phy_modify_paged(phydev, 0xac5, 0x17, 0x0007, 0x0002); - phy_write_paged(phydev, 0xad4, 0x16, 0x00a8); - phy_write_paged(phydev, 0xac5, 0x16, 0x01ff); - phy_modify_paged(phydev, 0xac8, 0x15, 0x00f0, 0x0030); - - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x80a2); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x16, 0x809c); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x1f, 0x0000); - - phy_write(phydev, 0x1f, 0x0a43); - phy_write(phydev, 0x13, 0x81B3); - phy_write(phydev, 0x14, 0x0043); - phy_write(phydev, 0x14, 0x00A7); - phy_write(phydev, 0x14, 0x00D6); - phy_write(phydev, 0x14, 0x00EC); - phy_write(phydev, 0x14, 0x00F6); - phy_write(phydev, 0x14, 0x00FB); - phy_write(phydev, 0x14, 0x00FD); - phy_write(phydev, 0x14, 0x00FF); - phy_write(phydev, 0x14, 0x00BB); - phy_write(phydev, 0x14, 0x0058); - phy_write(phydev, 0x14, 0x0029); - phy_write(phydev, 0x14, 0x0013); - phy_write(phydev, 0x14, 0x0009); - phy_write(phydev, 0x14, 0x0004); - phy_write(phydev, 0x14, 0x0002); - for (i = 0; i < 25; i++) - phy_write(phydev, 0x14, 0x0000); - phy_write(phydev, 0x1f, 0x0000); - - r8168g_phy_param(phydev, 0x8257, 0xffff, 0x020F); - r8168g_phy_param(phydev, 0x80ea, 0xffff, 0x7843); - - rtl_apply_firmware(tp); - - phy_modify_paged(phydev, 0xd06, 0x14, 0x0000, 0x2000); - - r8168g_phy_param(phydev, 0x81a2, 0x0000, 0x0100); - - phy_modify_paged(phydev, 0xb54, 0x16, 0xff00, 0xdb00); - phy_modify_paged(phydev, 0xa45, 0x12, 0x0001, 0x0000); - phy_modify_paged(phydev, 0xa5d, 0x12, 0x0000, 0x0020); - phy_modify_paged(phydev, 0xad4, 0x17, 0x0010, 0x0000); - phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000); - phy_modify_paged(phydev, 0xa44, 0x11, 0x0000, 0x0800); - - rtl8125_config_eee_phy(tp); -} - -static void rtl_hw_phy_config(struct net_device *dev) -{ - static const rtl_generic_fct phy_configs[] = { - /* PCI devices. */ - [RTL_GIGA_MAC_VER_02] = rtl8169s_hw_phy_config, - [RTL_GIGA_MAC_VER_03] = rtl8169s_hw_phy_config, - [RTL_GIGA_MAC_VER_04] = rtl8169sb_hw_phy_config, - [RTL_GIGA_MAC_VER_05] = rtl8169scd_hw_phy_config, - [RTL_GIGA_MAC_VER_06] = rtl8169sce_hw_phy_config, - /* PCI-E devices. */ - [RTL_GIGA_MAC_VER_07] = rtl8102e_hw_phy_config, - [RTL_GIGA_MAC_VER_08] = rtl8102e_hw_phy_config, - [RTL_GIGA_MAC_VER_09] = rtl8102e_hw_phy_config, - [RTL_GIGA_MAC_VER_10] = NULL, - [RTL_GIGA_MAC_VER_11] = rtl8168bb_hw_phy_config, - [RTL_GIGA_MAC_VER_12] = rtl8168bef_hw_phy_config, - [RTL_GIGA_MAC_VER_13] = NULL, - [RTL_GIGA_MAC_VER_14] = NULL, - [RTL_GIGA_MAC_VER_15] = NULL, - [RTL_GIGA_MAC_VER_16] = NULL, - [RTL_GIGA_MAC_VER_17] = rtl8168bef_hw_phy_config, - [RTL_GIGA_MAC_VER_18] = rtl8168cp_1_hw_phy_config, - [RTL_GIGA_MAC_VER_19] = rtl8168c_1_hw_phy_config, - [RTL_GIGA_MAC_VER_20] = rtl8168c_2_hw_phy_config, - [RTL_GIGA_MAC_VER_21] = rtl8168c_3_hw_phy_config, - [RTL_GIGA_MAC_VER_22] = rtl8168c_3_hw_phy_config, - [RTL_GIGA_MAC_VER_23] = rtl8168cp_2_hw_phy_config, - [RTL_GIGA_MAC_VER_24] = rtl8168cp_2_hw_phy_config, - [RTL_GIGA_MAC_VER_25] = rtl8168d_1_hw_phy_config, - [RTL_GIGA_MAC_VER_26] = rtl8168d_2_hw_phy_config, - [RTL_GIGA_MAC_VER_27] = rtl8168d_3_hw_phy_config, - [RTL_GIGA_MAC_VER_28] = rtl8168d_4_hw_phy_config, - [RTL_GIGA_MAC_VER_29] = rtl8105e_hw_phy_config, - [RTL_GIGA_MAC_VER_30] = rtl8105e_hw_phy_config, - [RTL_GIGA_MAC_VER_31] = NULL, - [RTL_GIGA_MAC_VER_32] = rtl8168e_1_hw_phy_config, - [RTL_GIGA_MAC_VER_33] = rtl8168e_1_hw_phy_config, - [RTL_GIGA_MAC_VER_34] = rtl8168e_2_hw_phy_config, - [RTL_GIGA_MAC_VER_35] = rtl8168f_1_hw_phy_config, - [RTL_GIGA_MAC_VER_36] = rtl8168f_2_hw_phy_config, - [RTL_GIGA_MAC_VER_37] = rtl8402_hw_phy_config, - [RTL_GIGA_MAC_VER_38] = rtl8411_hw_phy_config, - [RTL_GIGA_MAC_VER_39] = rtl8106e_hw_phy_config, - [RTL_GIGA_MAC_VER_40] = rtl8168g_1_hw_phy_config, - [RTL_GIGA_MAC_VER_41] = NULL, - [RTL_GIGA_MAC_VER_42] = rtl8168g_2_hw_phy_config, - [RTL_GIGA_MAC_VER_43] = rtl8168g_2_hw_phy_config, - [RTL_GIGA_MAC_VER_44] = rtl8168g_2_hw_phy_config, - [RTL_GIGA_MAC_VER_45] = rtl8168h_1_hw_phy_config, - [RTL_GIGA_MAC_VER_46] = rtl8168h_2_hw_phy_config, - [RTL_GIGA_MAC_VER_47] = rtl8168h_1_hw_phy_config, - [RTL_GIGA_MAC_VER_48] = rtl8168h_2_hw_phy_config, - [RTL_GIGA_MAC_VER_49] = rtl8168ep_1_hw_phy_config, - [RTL_GIGA_MAC_VER_50] = rtl8168ep_2_hw_phy_config, - [RTL_GIGA_MAC_VER_51] = rtl8168ep_2_hw_phy_config, - [RTL_GIGA_MAC_VER_52] = rtl8117_hw_phy_config, - [RTL_GIGA_MAC_VER_60] = rtl8125_1_hw_phy_config, - [RTL_GIGA_MAC_VER_61] = rtl8125_2_hw_phy_config, - }; - struct rtl8169_private *tp = netdev_priv(dev); - - if (phy_configs[tp->mac_version]) - phy_configs[tp->mac_version](tp); -} - static void rtl_schedule_task(struct rtl8169_private *tp, enum rtl_flag flag) { if (!test_and_set_bit(flag, tp->wk.flags)) schedule_work(&tp->wk.work); } -static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp) +static void rtl8169_init_phy(struct rtl8169_private *tp) { - rtl_hw_phy_config(dev); + r8169_hw_phy_config(tp, tp->phydev, tp->mac_version); if (tp->mac_version <= RTL_GIGA_MAC_VER_06) { pci_write_config_byte(tp->pci_dev, PCI_LATENCY_TIMER, 0x40); pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08); - netif_dbg(tp, drv, dev, - "Set MAC Reg C+CR Offset 0x82h = 0x01h\n"); + /* set undocumented MAC Reg C+CR Offset 0x82h */ RTL_W8(tp, 0x82, 0x01); } + if (tp->mac_version == RTL_GIGA_MAC_VER_05 && + tp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_GIGABYTE && + tp->pci_dev->subsystem_device == 0xe000) + phy_write_paged(tp->phydev, 0x0001, 0x10, 0xf01b); + /* We may have called phy_speed_down before */ phy_speed_up(tp->phydev); @@ -3665,16 +2297,6 @@ static int rtl_set_mac_address(struct net_device *dev, void *p) return 0; } -static int rtl8169_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - struct rtl8169_private *tp = netdev_priv(dev); - - if (!netif_running(dev)) - return -ENODEV; - - return phy_mii_ioctl(tp->phydev, ifr, cmd); -} - static void rtl_wol_suspend_quirk(struct rtl8169_private *tp) { switch (tp->mac_version) { @@ -4700,9 +3322,7 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp) rtl_pcie_state_l2l3_disable(tp); - rtl_writephy(tp, 0x1f, 0x0c42); - rg_saw_cnt = (rtl_readphy(tp, 0x13) & 0x3fff); - rtl_writephy(tp, 0x1f, 0x0000); + rg_saw_cnt = phy_read_paged(tp->phydev, 0x0c42, 0x13) & 0x3fff; if (rg_saw_cnt > 0) { u16 sw_cnt_1ms_ini; @@ -4877,7 +3497,7 @@ static void rtl_hw_start_8117(struct rtl8169_private *tp) r8168_mac_ocp_write(tp, 0xc09e, 0x0000); /* firmware is for MAC only */ - rtl_apply_firmware(tp); + r8169_apply_firmware(tp); rtl_hw_aspm_clkreq_enable(tp, true); } @@ -4981,6 +3601,9 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp) rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000); rtl_w0w1_eri(tp, 0x0d4, ERIAR_MASK_0011, 0x0e00, 0xff00); + /* disable EEE */ + rtl_eri_write(tp, 0x1b0, ERIAR_MASK_0011, 0x0000); + rtl_pcie_state_l2l3_disable(tp); } @@ -4995,6 +3618,11 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp) RTL_W8(tp, MCU, RTL_R8(tp, MCU) | EN_NDP | EN_OOB_RESET); RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN); + rtl_eri_write(tp, 0x1d0, ERIAR_MASK_0011, 0x0000); + + /* disable EEE */ + rtl_eri_write(tp, 0x1b0, ERIAR_MASK_0011, 0x0000); + rtl_pcie_state_l2l3_disable(tp); rtl_hw_aspm_clkreq_enable(tp, true); } @@ -5212,11 +3840,8 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp) tp->cp_cmd |= PCIMulRW; if (tp->mac_version == RTL_GIGA_MAC_VER_02 || - tp->mac_version == RTL_GIGA_MAC_VER_03) { - netif_dbg(tp, drv, tp->dev, - "Set MAC Reg C+CR Offset 0xe0. Bit 3 and Bit 14 MUST be 1\n"); - tp->cp_cmd |= (1 << 14); - } + tp->mac_version == RTL_GIGA_MAC_VER_03) + tp->cp_cmd |= EnAnaPLL; RTL_W16(tp, CPlusCmd, tp->cp_cmd); @@ -6230,7 +4855,7 @@ static int rtl_open(struct net_device *dev) napi_enable(&tp->napi); - rtl8169_init_phy(dev, tp); + rtl8169_init_phy(tp); rtl_pll_power_up(tp); @@ -6361,7 +4986,7 @@ static void __rtl8169_resume(struct net_device *dev) netif_device_attach(dev); rtl_pll_power_up(tp); - rtl8169_init_phy(dev, tp); + rtl8169_init_phy(tp); phy_start(tp->phydev); @@ -6532,7 +5157,7 @@ static const struct net_device_ops rtl_netdev_ops = { .ndo_fix_features = rtl8169_fix_features, .ndo_set_features = rtl8169_set_features, .ndo_set_mac_address = rtl_set_mac_address, - .ndo_do_ioctl = rtl8169_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_rx_mode = rtl_set_rx_mode, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = rtl8169_netpoll, @@ -6734,7 +5359,7 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) { /* Non-GBit versions don't support jumbo frames */ if (!tp->supports_gmii) - return JUMBO_1K; + return 0; switch (tp->mac_version) { /* RTL8169 */ @@ -6965,10 +5590,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->hw_features |= NETIF_F_RXALL; dev->hw_features |= NETIF_F_RXFCS; - /* MTU range: 60 - hw-specific max */ - dev->min_mtu = ETH_ZLEN; jumbo_max = rtl_jumbo_max(tp); - dev->max_mtu = jumbo_max; + if (jumbo_max) + dev->max_mtu = jumbo_max; rtl_set_irq_mask(tp); @@ -6998,7 +5622,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) (RTL_R32(tp, TxConfig) >> 20) & 0xfcf, pci_irq_vector(pdev, 0)); - if (jumbo_max > JUMBO_1K) + if (jumbo_max) netif_info(tp, probe, dev, "jumbo features [frames: %d bytes, tx checksumming: %s]\n", jumbo_max, tp->mac_version <= RTL_GIGA_MAC_VER_06 ? diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c new file mode 100644 index 000000000000..e367e77c773b --- /dev/null +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -0,0 +1,1307 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * r8169_phy_config.c: RealTek 8169/8168/8101 ethernet driver. + * + * Copyright (c) 2002 ShuChen <shuchen@realtek.com.tw> + * Copyright (c) 2003 - 2007 Francois Romieu <romieu@fr.zoreil.com> + * Copyright (c) a lot of people too. Please respect their work. + * + * See MAINTAINERS file for support contact information. + */ + +#include <linux/delay.h> +#include <linux/phy.h> + +#include "r8169.h" + +typedef void (*rtl_phy_cfg_fct)(struct rtl8169_private *tp, + struct phy_device *phydev); + +static void r8168d_modify_extpage(struct phy_device *phydev, int extpage, + int reg, u16 mask, u16 val) +{ + int oldpage = phy_select_page(phydev, 0x0007); + + __phy_write(phydev, 0x1e, extpage); + __phy_modify(phydev, reg, mask, val); + + phy_restore_page(phydev, oldpage, 0); +} + +static void r8168d_phy_param(struct phy_device *phydev, u16 parm, + u16 mask, u16 val) +{ + int oldpage = phy_select_page(phydev, 0x0005); + + __phy_write(phydev, 0x05, parm); + __phy_modify(phydev, 0x06, mask, val); + + phy_restore_page(phydev, oldpage, 0); +} + +static void r8168g_phy_param(struct phy_device *phydev, u16 parm, + u16 mask, u16 val) +{ + int oldpage = phy_select_page(phydev, 0x0a43); + + __phy_write(phydev, 0x13, parm); + __phy_modify(phydev, 0x14, mask, val); + + phy_restore_page(phydev, oldpage, 0); +} + +struct phy_reg { + u16 reg; + u16 val; +}; + +static void __rtl_writephy_batch(struct phy_device *phydev, + const struct phy_reg *regs, int len) +{ + phy_lock_mdio_bus(phydev); + + while (len-- > 0) { + __phy_write(phydev, regs->reg, regs->val); + regs++; + } + + phy_unlock_mdio_bus(phydev); +} + +#define rtl_writephy_batch(p, a) __rtl_writephy_batch(p, a, ARRAY_SIZE(a)) + +static void rtl8168f_config_eee_phy(struct phy_device *phydev) +{ + r8168d_modify_extpage(phydev, 0x0020, 0x15, 0, BIT(8)); + r8168d_phy_param(phydev, 0x8b85, 0, BIT(13)); +} + +static void rtl8168g_config_eee_phy(struct phy_device *phydev) +{ + phy_modify_paged(phydev, 0x0a43, 0x11, 0, BIT(4)); +} + +static void rtl8168h_config_eee_phy(struct phy_device *phydev) +{ + rtl8168g_config_eee_phy(phydev); + + phy_modify_paged(phydev, 0xa4a, 0x11, 0x0000, 0x0200); + phy_modify_paged(phydev, 0xa42, 0x14, 0x0000, 0x0080); +} + +static void rtl8125_config_eee_phy(struct phy_device *phydev) +{ + rtl8168h_config_eee_phy(phydev); + + phy_modify_paged(phydev, 0xa6d, 0x12, 0x0001, 0x0000); + phy_modify_paged(phydev, 0xa6d, 0x14, 0x0010, 0x0000); +} + +static void rtl8169s_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x06, 0x006e }, + { 0x08, 0x0708 }, + { 0x15, 0x4000 }, + { 0x18, 0x65c7 }, + + { 0x1f, 0x0001 }, + { 0x03, 0x00a1 }, + { 0x02, 0x0008 }, + { 0x01, 0x0120 }, + { 0x00, 0x1000 }, + { 0x04, 0x0800 }, + { 0x04, 0x0000 }, + + { 0x03, 0xff41 }, + { 0x02, 0xdf60 }, + { 0x01, 0x0140 }, + { 0x00, 0x0077 }, + { 0x04, 0x7800 }, + { 0x04, 0x7000 }, + + { 0x03, 0x802f }, + { 0x02, 0x4f02 }, + { 0x01, 0x0409 }, + { 0x00, 0xf0f9 }, + { 0x04, 0x9800 }, + { 0x04, 0x9000 }, + + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0xff95 }, + { 0x00, 0xba00 }, + { 0x04, 0xa800 }, + { 0x04, 0xa000 }, + + { 0x03, 0xff41 }, + { 0x02, 0xdf20 }, + { 0x01, 0x0140 }, + { 0x00, 0x00bb }, + { 0x04, 0xb800 }, + { 0x04, 0xb000 }, + + { 0x03, 0xdf41 }, + { 0x02, 0xdc60 }, + { 0x01, 0x6340 }, + { 0x00, 0x007d }, + { 0x04, 0xd800 }, + { 0x04, 0xd000 }, + + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0x100a }, + { 0x00, 0xa0ff }, + { 0x04, 0xf800 }, + { 0x04, 0xf000 }, + + { 0x1f, 0x0000 }, + { 0x0b, 0x0000 }, + { 0x00, 0x9200 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); +} + +static void rtl8169sb_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_write_paged(phydev, 0x0002, 0x01, 0x90d0); +} + +static void rtl8169scd_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x04, 0x0000 }, + { 0x03, 0x00a1 }, + { 0x02, 0x0008 }, + { 0x01, 0x0120 }, + { 0x00, 0x1000 }, + { 0x04, 0x0800 }, + { 0x04, 0x9000 }, + { 0x03, 0x802f }, + { 0x02, 0x4f02 }, + { 0x01, 0x0409 }, + { 0x00, 0xf099 }, + { 0x04, 0x9800 }, + { 0x04, 0xa000 }, + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0xff95 }, + { 0x00, 0xba00 }, + { 0x04, 0xa800 }, + { 0x04, 0xf000 }, + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0x101a }, + { 0x00, 0xa0ff }, + { 0x04, 0xf800 }, + { 0x04, 0x0000 }, + { 0x1f, 0x0000 }, + + { 0x1f, 0x0001 }, + { 0x10, 0xf41b }, + { 0x14, 0xfb54 }, + { 0x18, 0xf5c7 }, + { 0x1f, 0x0000 }, + + { 0x1f, 0x0001 }, + { 0x17, 0x0cc0 }, + { 0x1f, 0x0000 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); +} + +static void rtl8169sce_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x04, 0x0000 }, + { 0x03, 0x00a1 }, + { 0x02, 0x0008 }, + { 0x01, 0x0120 }, + { 0x00, 0x1000 }, + { 0x04, 0x0800 }, + { 0x04, 0x9000 }, + { 0x03, 0x802f }, + { 0x02, 0x4f02 }, + { 0x01, 0x0409 }, + { 0x00, 0xf099 }, + { 0x04, 0x9800 }, + { 0x04, 0xa000 }, + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0xff95 }, + { 0x00, 0xba00 }, + { 0x04, 0xa800 }, + { 0x04, 0xf000 }, + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0x101a }, + { 0x00, 0xa0ff }, + { 0x04, 0xf800 }, + { 0x04, 0x0000 }, + { 0x1f, 0x0000 }, + + { 0x1f, 0x0001 }, + { 0x0b, 0x8480 }, + { 0x1f, 0x0000 }, + + { 0x1f, 0x0001 }, + { 0x18, 0x67c7 }, + { 0x04, 0x2000 }, + { 0x03, 0x002f }, + { 0x02, 0x4360 }, + { 0x01, 0x0109 }, + { 0x00, 0x3022 }, + { 0x04, 0x2800 }, + { 0x1f, 0x0000 }, + + { 0x1f, 0x0001 }, + { 0x17, 0x0cc0 }, + { 0x1f, 0x0000 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); +} + +static void rtl8168bb_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_write(phydev, 0x1f, 0x0001); + phy_set_bits(phydev, 0x16, BIT(0)); + phy_write(phydev, 0x10, 0xf41b); + phy_write(phydev, 0x1f, 0x0000); +} + +static void rtl8168bef_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_write_paged(phydev, 0x0001, 0x10, 0xf41b); +} + +static void rtl8168cp_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_write(phydev, 0x1d, 0x0f00); + phy_write_paged(phydev, 0x0002, 0x0c, 0x1ec8); +} + +static void rtl8168cp_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_set_bits(phydev, 0x14, BIT(5)); + phy_set_bits(phydev, 0x0d, BIT(5)); + phy_write_paged(phydev, 0x0001, 0x1d, 0x3d98); +} + +static void rtl8168c_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x12, 0x2300 }, + { 0x1f, 0x0002 }, + { 0x00, 0x88d4 }, + { 0x01, 0x82b1 }, + { 0x03, 0x7002 }, + { 0x08, 0x9e30 }, + { 0x09, 0x01f0 }, + { 0x0a, 0x5500 }, + { 0x0c, 0x00c8 }, + { 0x1f, 0x0003 }, + { 0x12, 0xc096 }, + { 0x16, 0x000a }, + { 0x1f, 0x0000 }, + { 0x1f, 0x0000 }, + { 0x09, 0x2000 }, + { 0x09, 0x0000 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); + + phy_set_bits(phydev, 0x14, BIT(5)); + phy_set_bits(phydev, 0x0d, BIT(5)); +} + +static void rtl8168c_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x12, 0x2300 }, + { 0x03, 0x802f }, + { 0x02, 0x4f02 }, + { 0x01, 0x0409 }, + { 0x00, 0xf099 }, + { 0x04, 0x9800 }, + { 0x04, 0x9000 }, + { 0x1d, 0x3d98 }, + { 0x1f, 0x0002 }, + { 0x0c, 0x7eb8 }, + { 0x06, 0x0761 }, + { 0x1f, 0x0003 }, + { 0x16, 0x0f0a }, + { 0x1f, 0x0000 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); + + phy_set_bits(phydev, 0x16, BIT(0)); + phy_set_bits(phydev, 0x14, BIT(5)); + phy_set_bits(phydev, 0x0d, BIT(5)); +} + +static void rtl8168c_3_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0001 }, + { 0x12, 0x2300 }, + { 0x1d, 0x3d98 }, + { 0x1f, 0x0002 }, + { 0x0c, 0x7eb8 }, + { 0x06, 0x5461 }, + { 0x1f, 0x0003 }, + { 0x16, 0x0f0a }, + { 0x1f, 0x0000 } + }; + + rtl_writephy_batch(phydev, phy_reg_init); + + phy_set_bits(phydev, 0x16, BIT(0)); + phy_set_bits(phydev, 0x14, BIT(5)); + phy_set_bits(phydev, 0x0d, BIT(5)); +} + +static const struct phy_reg rtl8168d_1_phy_reg_init_0[] = { + /* Channel Estimation */ + { 0x1f, 0x0001 }, + { 0x06, 0x4064 }, + { 0x07, 0x2863 }, + { 0x08, 0x059c }, + { 0x09, 0x26b4 }, + { 0x0a, 0x6a19 }, + { 0x0b, 0xdcc8 }, + { 0x10, 0xf06d }, + { 0x14, 0x7f68 }, + { 0x18, 0x7fd9 }, + { 0x1c, 0xf0ff }, + { 0x1d, 0x3d9c }, + { 0x1f, 0x0003 }, + { 0x12, 0xf49f }, + { 0x13, 0x070b }, + { 0x1a, 0x05ad }, + { 0x14, 0x94c0 }, + + /* + * Tx Error Issue + * Enhance line driver power + */ + { 0x1f, 0x0002 }, + { 0x06, 0x5561 }, + { 0x1f, 0x0005 }, + { 0x05, 0x8332 }, + { 0x06, 0x5561 }, + + /* + * Can not link to 1Gbps with bad cable + * Decrease SNR threshold form 21.07dB to 19.04dB + */ + { 0x1f, 0x0001 }, + { 0x17, 0x0cc0 }, + + { 0x1f, 0x0000 }, + { 0x0d, 0xf880 } +}; + +static const struct phy_reg rtl8168d_1_phy_reg_init_1[] = { + { 0x1f, 0x0002 }, + { 0x05, 0x669a }, + { 0x1f, 0x0005 }, + { 0x05, 0x8330 }, + { 0x06, 0x669a }, + { 0x1f, 0x0002 } +}; + +static void rtl8168d_apply_firmware_cond(struct rtl8169_private *tp, + struct phy_device *phydev, + u16 val) +{ + u16 reg_val; + + phy_write(phydev, 0x1f, 0x0005); + phy_write(phydev, 0x05, 0x001b); + reg_val = phy_read(phydev, 0x06); + phy_write(phydev, 0x1f, 0x0000); + + if (reg_val != val) + phydev_warn(phydev, "chipset not ready for firmware\n"); + else + r8169_apply_firmware(tp); +} + +static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_0); + + /* + * Rx Error Issue + * Fine Tune Switching regulator parameter + */ + phy_write(phydev, 0x1f, 0x0002); + phy_modify(phydev, 0x0b, 0x00ef, 0x0010); + phy_modify(phydev, 0x0c, 0x5d00, 0xa200); + + if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) { + int val; + + rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_1); + + val = phy_read(phydev, 0x0d); + + if ((val & 0x00ff) != 0x006c) { + static const u32 set[] = { + 0x0065, 0x0066, 0x0067, 0x0068, + 0x0069, 0x006a, 0x006b, 0x006c + }; + int i; + + phy_write(phydev, 0x1f, 0x0002); + + val &= 0xff00; + for (i = 0; i < ARRAY_SIZE(set); i++) + phy_write(phydev, 0x0d, val | set[i]); + } + } else { + phy_write_paged(phydev, 0x0002, 0x05, 0x6662); + r8168d_phy_param(phydev, 0x8330, 0xffff, 0x6662); + } + + /* RSET couple improve */ + phy_write(phydev, 0x1f, 0x0002); + phy_set_bits(phydev, 0x0d, 0x0300); + phy_set_bits(phydev, 0x0f, 0x0010); + + /* Fine tune PLL performance */ + phy_write(phydev, 0x1f, 0x0002); + phy_modify(phydev, 0x02, 0x0600, 0x0100); + phy_clear_bits(phydev, 0x03, 0xe000); + phy_write(phydev, 0x1f, 0x0000); + + rtl8168d_apply_firmware_cond(tp, phydev, 0xbf00); +} + +static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_0); + + if (rtl8168d_efuse_read(tp, 0x01) == 0xb1) { + int val; + + rtl_writephy_batch(phydev, rtl8168d_1_phy_reg_init_1); + + val = phy_read(phydev, 0x0d); + if ((val & 0x00ff) != 0x006c) { + static const u32 set[] = { + 0x0065, 0x0066, 0x0067, 0x0068, + 0x0069, 0x006a, 0x006b, 0x006c + }; + int i; + + phy_write(phydev, 0x1f, 0x0002); + + val &= 0xff00; + for (i = 0; i < ARRAY_SIZE(set); i++) + phy_write(phydev, 0x0d, val | set[i]); + } + } else { + phy_write_paged(phydev, 0x0002, 0x05, 0x2642); + r8168d_phy_param(phydev, 0x8330, 0xffff, 0x2642); + } + + /* Fine tune PLL performance */ + phy_write(phydev, 0x1f, 0x0002); + phy_modify(phydev, 0x02, 0x0600, 0x0100); + phy_clear_bits(phydev, 0x03, 0xe000); + phy_write(phydev, 0x1f, 0x0000); + + /* Switching regulator Slew rate */ + phy_modify_paged(phydev, 0x0002, 0x0f, 0x0000, 0x0017); + + rtl8168d_apply_firmware_cond(tp, phydev, 0xb300); +} + +static void rtl8168d_3_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0002 }, + { 0x10, 0x0008 }, + { 0x0d, 0x006c }, + + { 0x1f, 0x0000 }, + { 0x0d, 0xf880 }, + + { 0x1f, 0x0001 }, + { 0x17, 0x0cc0 }, + + { 0x1f, 0x0001 }, + { 0x0b, 0xa4d8 }, + { 0x09, 0x281c }, + { 0x07, 0x2883 }, + { 0x0a, 0x6b35 }, + { 0x1d, 0x3da4 }, + { 0x1c, 0xeffd }, + { 0x14, 0x7f52 }, + { 0x18, 0x7fc6 }, + { 0x08, 0x0601 }, + { 0x06, 0x4063 }, + { 0x10, 0xf074 }, + { 0x1f, 0x0003 }, + { 0x13, 0x0789 }, + { 0x12, 0xf4bd }, + { 0x1a, 0x04fd }, + { 0x14, 0x84b0 }, + { 0x1f, 0x0000 }, + { 0x00, 0x9200 }, + + { 0x1f, 0x0005 }, + { 0x01, 0x0340 }, + { 0x1f, 0x0001 }, + { 0x04, 0x4000 }, + { 0x03, 0x1d21 }, + { 0x02, 0x0c32 }, + { 0x01, 0x0200 }, + { 0x00, 0x5554 }, + { 0x04, 0x4800 }, + { 0x04, 0x4000 }, + { 0x04, 0xf000 }, + { 0x03, 0xdf01 }, + { 0x02, 0xdf20 }, + { 0x01, 0x101a }, + { 0x00, 0xa0ff }, + { 0x04, 0xf800 }, + { 0x04, 0xf000 }, + { 0x1f, 0x0000 }, + }; + + rtl_writephy_batch(phydev, phy_reg_init); + r8168d_modify_extpage(phydev, 0x0023, 0x16, 0xffff, 0x0000); +} + +static void rtl8168d_4_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_write_paged(phydev, 0x0001, 0x17, 0x0cc0); + r8168d_modify_extpage(phydev, 0x002d, 0x18, 0xffff, 0x0040); + phy_set_bits(phydev, 0x0d, BIT(5)); +} + +static void rtl8168e_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + /* Channel estimation fine tune */ + { 0x1f, 0x0001 }, + { 0x0b, 0x6c20 }, + { 0x07, 0x2872 }, + { 0x1c, 0xefff }, + { 0x1f, 0x0003 }, + { 0x14, 0x6420 }, + { 0x1f, 0x0000 }, + }; + + r8169_apply_firmware(tp); + + /* Enable Delay cap */ + r8168d_phy_param(phydev, 0x8b80, 0xffff, 0xc896); + + rtl_writephy_batch(phydev, phy_reg_init); + + /* Update PFM & 10M TX idle timer */ + r8168d_modify_extpage(phydev, 0x002f, 0x15, 0xffff, 0x1919); + + r8168d_modify_extpage(phydev, 0x00ac, 0x18, 0xffff, 0x0006); + + /* DCO enable for 10M IDLE Power */ + r8168d_modify_extpage(phydev, 0x0023, 0x17, 0x0000, 0x0006); + + /* For impedance matching */ + phy_modify_paged(phydev, 0x0002, 0x08, 0x7f00, 0x8000); + + /* PHY auto speed down */ + r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0050); + phy_set_bits(phydev, 0x14, BIT(15)); + + r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); + r8168d_phy_param(phydev, 0x8b85, 0x2000, 0x0000); + + r8168d_modify_extpage(phydev, 0x0020, 0x15, 0x1100, 0x0000); + phy_write_paged(phydev, 0x0006, 0x00, 0x5a00); + + phy_write_mmd(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV, 0x0000); +} + +static void rtl8168e_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + + /* Enable Delay cap */ + r8168d_modify_extpage(phydev, 0x00ac, 0x18, 0xffff, 0x0006); + + /* Channel estimation fine tune */ + phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); + + /* Green Setting */ + r8168d_phy_param(phydev, 0x8b5b, 0xffff, 0x9222); + r8168d_phy_param(phydev, 0x8b6d, 0xffff, 0x8000); + r8168d_phy_param(phydev, 0x8b76, 0xffff, 0x8000); + + /* For 4-corner performance improve */ + phy_write(phydev, 0x1f, 0x0005); + phy_write(phydev, 0x05, 0x8b80); + phy_set_bits(phydev, 0x17, 0x0006); + phy_write(phydev, 0x1f, 0x0000); + + /* PHY auto speed down */ + r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0010); + phy_set_bits(phydev, 0x14, BIT(15)); + + /* improve 10M EEE waveform */ + r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); + + /* Improve 2-pair detection performance */ + r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); + + rtl8168f_config_eee_phy(phydev); + + /* Green feature */ + phy_write(phydev, 0x1f, 0x0003); + phy_set_bits(phydev, 0x19, BIT(0)); + phy_set_bits(phydev, 0x10, BIT(10)); + phy_write(phydev, 0x1f, 0x0000); + phy_modify_paged(phydev, 0x0005, 0x01, 0, BIT(8)); +} + +static void rtl8168f_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + /* For 4-corner performance improve */ + r8168d_phy_param(phydev, 0x8b80, 0x0000, 0x0006); + + /* PHY auto speed down */ + r8168d_modify_extpage(phydev, 0x002d, 0x18, 0x0000, 0x0010); + phy_set_bits(phydev, 0x14, BIT(15)); + + /* Improve 10M EEE waveform */ + r8168d_phy_param(phydev, 0x8b86, 0x0000, 0x0001); + + rtl8168f_config_eee_phy(phydev); +} + +static void rtl8168f_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + + /* Channel estimation fine tune */ + phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); + + /* Modify green table for giga & fnet */ + r8168d_phy_param(phydev, 0x8b55, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b5e, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b67, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b70, 0xffff, 0x0000); + r8168d_modify_extpage(phydev, 0x0078, 0x17, 0xffff, 0x0000); + r8168d_modify_extpage(phydev, 0x0078, 0x19, 0xffff, 0x00fb); + + /* Modify green table for 10M */ + r8168d_phy_param(phydev, 0x8b79, 0xffff, 0xaa00); + + /* Disable hiimpedance detection (RTCT) */ + phy_write_paged(phydev, 0x0003, 0x01, 0x328a); + + rtl8168f_hw_phy_config(tp, phydev); + + /* Improve 2-pair detection performance */ + r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); +} + +static void rtl8168f_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + + rtl8168f_hw_phy_config(tp, phydev); +} + +static void rtl8411_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + + rtl8168f_hw_phy_config(tp, phydev); + + /* Improve 2-pair detection performance */ + r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x4000); + + /* Channel estimation fine tune */ + phy_write_paged(phydev, 0x0003, 0x09, 0xa20f); + + /* Modify green table for giga & fnet */ + r8168d_phy_param(phydev, 0x8b55, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b5e, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b67, 0xffff, 0x0000); + r8168d_phy_param(phydev, 0x8b70, 0xffff, 0x0000); + r8168d_modify_extpage(phydev, 0x0078, 0x17, 0xffff, 0x0000); + r8168d_modify_extpage(phydev, 0x0078, 0x19, 0xffff, 0x00aa); + + /* Modify green table for 10M */ + r8168d_phy_param(phydev, 0x8b79, 0xffff, 0xaa00); + + /* Disable hiimpedance detection (RTCT) */ + phy_write_paged(phydev, 0x0003, 0x01, 0x328a); + + /* Modify green table for giga */ + r8168d_phy_param(phydev, 0x8b54, 0x0800, 0x0000); + r8168d_phy_param(phydev, 0x8b5d, 0x0800, 0x0000); + r8168d_phy_param(phydev, 0x8a7c, 0x0100, 0x0000); + r8168d_phy_param(phydev, 0x8a7f, 0x0000, 0x0100); + r8168d_phy_param(phydev, 0x8a82, 0x0100, 0x0000); + r8168d_phy_param(phydev, 0x8a85, 0x0100, 0x0000); + r8168d_phy_param(phydev, 0x8a88, 0x0100, 0x0000); + + /* uc same-seed solution */ + r8168d_phy_param(phydev, 0x8b85, 0x0000, 0x8000); + + /* Green feature */ + phy_write(phydev, 0x1f, 0x0003); + phy_clear_bits(phydev, 0x19, BIT(0)); + phy_clear_bits(phydev, 0x10, BIT(10)); + phy_write(phydev, 0x1f, 0x0000); +} + +static void rtl8168g_disable_aldps(struct phy_device *phydev) +{ + phy_modify_paged(phydev, 0x0a43, 0x10, BIT(2), 0); +} + +static void rtl8168g_phy_adjust_10m_aldps(struct phy_device *phydev) +{ + phy_modify_paged(phydev, 0x0bcc, 0x14, BIT(8), 0); + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(7) | BIT(6)); + r8168g_phy_param(phydev, 0x8084, 0x6000, 0x0000); + phy_modify_paged(phydev, 0x0a43, 0x10, 0x0000, 0x1003); +} + +static void rtl8168g_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + int ret; + + r8169_apply_firmware(tp); + + ret = phy_read_paged(phydev, 0x0a46, 0x10); + if (ret & BIT(8)) + phy_modify_paged(phydev, 0x0bcc, 0x12, BIT(15), 0); + else + phy_modify_paged(phydev, 0x0bcc, 0x12, 0, BIT(15)); + + ret = phy_read_paged(phydev, 0x0a46, 0x13); + if (ret & BIT(8)) + phy_modify_paged(phydev, 0x0c41, 0x15, 0, BIT(1)); + else + phy_modify_paged(phydev, 0x0c41, 0x15, BIT(1), 0); + + /* Enable PHY auto speed down */ + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(3) | BIT(2)); + + rtl8168g_phy_adjust_10m_aldps(phydev); + + /* EEE auto-fallback function */ + phy_modify_paged(phydev, 0x0a4b, 0x11, 0, BIT(2)); + + /* Enable UC LPF tune function */ + r8168g_phy_param(phydev, 0x8012, 0x0000, 0x8000); + + phy_modify_paged(phydev, 0x0c42, 0x11, BIT(13), BIT(14)); + + /* Improve SWR Efficiency */ + phy_write(phydev, 0x1f, 0x0bcd); + phy_write(phydev, 0x14, 0x5065); + phy_write(phydev, 0x14, 0xd065); + phy_write(phydev, 0x1f, 0x0bc8); + phy_write(phydev, 0x11, 0x5655); + phy_write(phydev, 0x1f, 0x0bcd); + phy_write(phydev, 0x14, 0x1065); + phy_write(phydev, 0x14, 0x9065); + phy_write(phydev, 0x14, 0x1065); + phy_write(phydev, 0x1f, 0x0000); + + rtl8168g_disable_aldps(phydev); + rtl8168g_config_eee_phy(phydev); +} + +static void rtl8168g_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + rtl8168g_config_eee_phy(phydev); +} + +static void rtl8168h_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + u16 dout_tapbin; + u32 data; + + r8169_apply_firmware(tp); + + /* CHN EST parameters adjust - giga master */ + r8168g_phy_param(phydev, 0x809b, 0xf800, 0x8000); + r8168g_phy_param(phydev, 0x80a2, 0xff00, 0x8000); + r8168g_phy_param(phydev, 0x80a4, 0xff00, 0x8500); + r8168g_phy_param(phydev, 0x809c, 0xff00, 0xbd00); + + /* CHN EST parameters adjust - giga slave */ + r8168g_phy_param(phydev, 0x80ad, 0xf800, 0x7000); + r8168g_phy_param(phydev, 0x80b4, 0xff00, 0x5000); + r8168g_phy_param(phydev, 0x80ac, 0xff00, 0x4000); + + /* CHN EST parameters adjust - fnet */ + r8168g_phy_param(phydev, 0x808e, 0xff00, 0x1200); + r8168g_phy_param(phydev, 0x8090, 0xff00, 0xe500); + r8168g_phy_param(phydev, 0x8092, 0xff00, 0x9f00); + + /* enable R-tune & PGA-retune function */ + dout_tapbin = 0; + data = phy_read_paged(phydev, 0x0a46, 0x13); + data &= 3; + data <<= 2; + dout_tapbin |= data; + data = phy_read_paged(phydev, 0x0a46, 0x12); + data &= 0xc000; + data >>= 14; + dout_tapbin |= data; + dout_tapbin = ~(dout_tapbin ^ 0x08); + dout_tapbin <<= 12; + dout_tapbin &= 0xf000; + + r8168g_phy_param(phydev, 0x827a, 0xf000, dout_tapbin); + r8168g_phy_param(phydev, 0x827b, 0xf000, dout_tapbin); + r8168g_phy_param(phydev, 0x827c, 0xf000, dout_tapbin); + r8168g_phy_param(phydev, 0x827d, 0xf000, dout_tapbin); + r8168g_phy_param(phydev, 0x0811, 0x0000, 0x0800); + phy_modify_paged(phydev, 0x0a42, 0x16, 0x0000, 0x0002); + + /* enable GPHY 10M */ + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(11)); + + /* SAR ADC performance */ + phy_modify_paged(phydev, 0x0bca, 0x17, BIT(12) | BIT(13), BIT(14)); + + r8168g_phy_param(phydev, 0x803f, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x8047, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x804f, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x8057, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x805f, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x8067, 0x3000, 0x0000); + r8168g_phy_param(phydev, 0x806f, 0x3000, 0x0000); + + /* disable phy pfm mode */ + phy_modify_paged(phydev, 0x0a44, 0x11, BIT(7), 0); + + rtl8168g_disable_aldps(phydev); + rtl8168h_config_eee_phy(phydev); +} + +static void rtl8168h_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + u16 ioffset, rlen; + u32 data; + + r8169_apply_firmware(tp); + + /* CHIN EST parameter update */ + r8168g_phy_param(phydev, 0x808a, 0x003f, 0x000a); + + /* enable R-tune & PGA-retune function */ + r8168g_phy_param(phydev, 0x0811, 0x0000, 0x0800); + phy_modify_paged(phydev, 0x0a42, 0x16, 0x0000, 0x0002); + + /* enable GPHY 10M */ + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(11)); + + ioffset = rtl8168h_2_get_adc_bias_ioffset(tp); + if (ioffset != 0xffff) + phy_write_paged(phydev, 0x0bcf, 0x16, ioffset); + + /* Modify rlen (TX LPF corner frequency) level */ + data = phy_read_paged(phydev, 0x0bcd, 0x16); + data &= 0x000f; + rlen = 0; + if (data > 3) + rlen = data - 3; + data = rlen | (rlen << 4) | (rlen << 8) | (rlen << 12); + phy_write_paged(phydev, 0x0bcd, 0x17, data); + + /* disable phy pfm mode */ + phy_modify_paged(phydev, 0x0a44, 0x11, BIT(7), 0); + + rtl8168g_disable_aldps(phydev); + rtl8168g_config_eee_phy(phydev); +} + +static void rtl8168ep_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + /* Enable PHY auto speed down */ + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(3) | BIT(2)); + + rtl8168g_phy_adjust_10m_aldps(phydev); + + /* Enable EEE auto-fallback function */ + phy_modify_paged(phydev, 0x0a4b, 0x11, 0, BIT(2)); + + /* Enable UC LPF tune function */ + r8168g_phy_param(phydev, 0x8012, 0x0000, 0x8000); + + /* set rg_sel_sdm_rate */ + phy_modify_paged(phydev, 0x0c42, 0x11, BIT(13), BIT(14)); + + rtl8168g_disable_aldps(phydev); + rtl8168g_config_eee_phy(phydev); +} + +static void rtl8168ep_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + rtl8168g_phy_adjust_10m_aldps(phydev); + + /* Enable UC LPF tune function */ + r8168g_phy_param(phydev, 0x8012, 0x0000, 0x8000); + + /* Set rg_sel_sdm_rate */ + phy_modify_paged(phydev, 0x0c42, 0x11, BIT(13), BIT(14)); + + /* Channel estimation parameters */ + r8168g_phy_param(phydev, 0x80f3, 0xff00, 0x8b00); + r8168g_phy_param(phydev, 0x80f0, 0xff00, 0x3a00); + r8168g_phy_param(phydev, 0x80ef, 0xff00, 0x0500); + r8168g_phy_param(phydev, 0x80f6, 0xff00, 0x6e00); + r8168g_phy_param(phydev, 0x80ec, 0xff00, 0x6800); + r8168g_phy_param(phydev, 0x80ed, 0xff00, 0x7c00); + r8168g_phy_param(phydev, 0x80f2, 0xff00, 0xf400); + r8168g_phy_param(phydev, 0x80f4, 0xff00, 0x8500); + r8168g_phy_param(phydev, 0x8110, 0xff00, 0xa800); + r8168g_phy_param(phydev, 0x810f, 0xff00, 0x1d00); + r8168g_phy_param(phydev, 0x8111, 0xff00, 0xf500); + r8168g_phy_param(phydev, 0x8113, 0xff00, 0x6100); + r8168g_phy_param(phydev, 0x8115, 0xff00, 0x9200); + r8168g_phy_param(phydev, 0x810e, 0xff00, 0x0400); + r8168g_phy_param(phydev, 0x810c, 0xff00, 0x7c00); + r8168g_phy_param(phydev, 0x810b, 0xff00, 0x5a00); + r8168g_phy_param(phydev, 0x80d1, 0xff00, 0xff00); + r8168g_phy_param(phydev, 0x80cd, 0xff00, 0x9e00); + r8168g_phy_param(phydev, 0x80d3, 0xff00, 0x0e00); + r8168g_phy_param(phydev, 0x80d5, 0xff00, 0xca00); + r8168g_phy_param(phydev, 0x80d7, 0xff00, 0x8400); + + /* Force PWM-mode */ + phy_write(phydev, 0x1f, 0x0bcd); + phy_write(phydev, 0x14, 0x5065); + phy_write(phydev, 0x14, 0xd065); + phy_write(phydev, 0x1f, 0x0bc8); + phy_write(phydev, 0x12, 0x00ed); + phy_write(phydev, 0x1f, 0x0bcd); + phy_write(phydev, 0x14, 0x1065); + phy_write(phydev, 0x14, 0x9065); + phy_write(phydev, 0x14, 0x1065); + phy_write(phydev, 0x1f, 0x0000); + + rtl8168g_disable_aldps(phydev); + rtl8168g_config_eee_phy(phydev); +} + +static void rtl8117_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + /* CHN EST parameters adjust - fnet */ + r8168g_phy_param(phydev, 0x808e, 0xff00, 0x4800); + r8168g_phy_param(phydev, 0x8090, 0xff00, 0xcc00); + r8168g_phy_param(phydev, 0x8092, 0xff00, 0xb000); + + r8168g_phy_param(phydev, 0x8088, 0xff00, 0x6000); + r8168g_phy_param(phydev, 0x808b, 0x3f00, 0x0b00); + r8168g_phy_param(phydev, 0x808d, 0x1f00, 0x0600); + r8168g_phy_param(phydev, 0x808c, 0xff00, 0xb000); + r8168g_phy_param(phydev, 0x80a0, 0xff00, 0x2800); + r8168g_phy_param(phydev, 0x80a2, 0xff00, 0x5000); + r8168g_phy_param(phydev, 0x809b, 0xf800, 0xb000); + r8168g_phy_param(phydev, 0x809a, 0xff00, 0x4b00); + r8168g_phy_param(phydev, 0x809d, 0x3f00, 0x0800); + r8168g_phy_param(phydev, 0x80a1, 0xff00, 0x7000); + r8168g_phy_param(phydev, 0x809f, 0x1f00, 0x0300); + r8168g_phy_param(phydev, 0x809e, 0xff00, 0x8800); + r8168g_phy_param(phydev, 0x80b2, 0xff00, 0x2200); + r8168g_phy_param(phydev, 0x80ad, 0xf800, 0x9800); + r8168g_phy_param(phydev, 0x80af, 0x3f00, 0x0800); + r8168g_phy_param(phydev, 0x80b3, 0xff00, 0x6f00); + r8168g_phy_param(phydev, 0x80b1, 0x1f00, 0x0300); + r8168g_phy_param(phydev, 0x80b0, 0xff00, 0x9300); + + r8168g_phy_param(phydev, 0x8011, 0x0000, 0x0800); + + /* enable GPHY 10M */ + phy_modify_paged(phydev, 0x0a44, 0x11, 0, BIT(11)); + + r8168g_phy_param(phydev, 0x8016, 0x0000, 0x0400); + + rtl8168g_disable_aldps(phydev); + rtl8168h_config_eee_phy(phydev); +} + +static void rtl8102e_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0003 }, + { 0x08, 0x441d }, + { 0x01, 0x9100 }, + { 0x1f, 0x0000 } + }; + + phy_set_bits(phydev, 0x11, BIT(12)); + phy_set_bits(phydev, 0x19, BIT(13)); + phy_set_bits(phydev, 0x10, BIT(15)); + + rtl_writephy_batch(phydev, phy_reg_init); +} + +static void rtl8105e_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + /* Disable ALDPS before ram code */ + phy_write(phydev, 0x18, 0x0310); + msleep(100); + + r8169_apply_firmware(tp); + + phy_write_paged(phydev, 0x0005, 0x1a, 0x0000); + phy_write_paged(phydev, 0x0004, 0x1c, 0x0000); + phy_write_paged(phydev, 0x0001, 0x15, 0x7701); +} + +static void rtl8402_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + /* Disable ALDPS before setting firmware */ + phy_write(phydev, 0x18, 0x0310); + msleep(20); + + r8169_apply_firmware(tp); + + /* EEE setting */ + phy_write(phydev, 0x1f, 0x0004); + phy_write(phydev, 0x10, 0x401f); + phy_write(phydev, 0x19, 0x7030); + phy_write(phydev, 0x1f, 0x0000); +} + +static void rtl8106e_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + static const struct phy_reg phy_reg_init[] = { + { 0x1f, 0x0004 }, + { 0x10, 0xc07f }, + { 0x19, 0x7030 }, + { 0x1f, 0x0000 } + }; + + /* Disable ALDPS before ram code */ + phy_write(phydev, 0x18, 0x0310); + msleep(100); + + r8169_apply_firmware(tp); + + rtl_writephy_batch(phydev, phy_reg_init); +} + +static void rtl8125_1_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + phy_modify_paged(phydev, 0xad4, 0x10, 0x03ff, 0x0084); + phy_modify_paged(phydev, 0xad4, 0x17, 0x0000, 0x0010); + phy_modify_paged(phydev, 0xad1, 0x13, 0x03ff, 0x0006); + phy_modify_paged(phydev, 0xad3, 0x11, 0x003f, 0x0006); + phy_modify_paged(phydev, 0xac0, 0x14, 0x0000, 0x1100); + phy_modify_paged(phydev, 0xac8, 0x15, 0xf000, 0x7000); + phy_modify_paged(phydev, 0xad1, 0x14, 0x0000, 0x0400); + phy_modify_paged(phydev, 0xad1, 0x15, 0x0000, 0x03ff); + phy_modify_paged(phydev, 0xad1, 0x16, 0x0000, 0x03ff); + + r8168g_phy_param(phydev, 0x80ea, 0xff00, 0xc400); + r8168g_phy_param(phydev, 0x80eb, 0x0700, 0x0300); + r8168g_phy_param(phydev, 0x80f8, 0xff00, 0x1c00); + r8168g_phy_param(phydev, 0x80f1, 0xff00, 0x3000); + r8168g_phy_param(phydev, 0x80fe, 0xff00, 0xa500); + r8168g_phy_param(phydev, 0x8102, 0xff00, 0x5000); + r8168g_phy_param(phydev, 0x8105, 0xff00, 0x3300); + r8168g_phy_param(phydev, 0x8100, 0xff00, 0x7000); + r8168g_phy_param(phydev, 0x8104, 0xff00, 0xf000); + r8168g_phy_param(phydev, 0x8106, 0xff00, 0x6500); + r8168g_phy_param(phydev, 0x80dc, 0xff00, 0xed00); + r8168g_phy_param(phydev, 0x80df, 0x0000, 0x0100); + r8168g_phy_param(phydev, 0x80e1, 0x0100, 0x0000); + + phy_modify_paged(phydev, 0xbf0, 0x13, 0x003f, 0x0038); + r8168g_phy_param(phydev, 0x819f, 0xffff, 0xd0b6); + + phy_write_paged(phydev, 0xbc3, 0x12, 0x5555); + phy_modify_paged(phydev, 0xbf0, 0x15, 0x0e00, 0x0a00); + phy_modify_paged(phydev, 0xa5c, 0x10, 0x0400, 0x0000); + phy_modify_paged(phydev, 0xa44, 0x11, 0x0000, 0x0800); + + rtl8125_config_eee_phy(phydev); +} + +static void rtl8125_2_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + int i; + + phy_modify_paged(phydev, 0xad4, 0x17, 0x0000, 0x0010); + phy_modify_paged(phydev, 0xad1, 0x13, 0x03ff, 0x03ff); + phy_modify_paged(phydev, 0xad3, 0x11, 0x003f, 0x0006); + phy_modify_paged(phydev, 0xac0, 0x14, 0x1100, 0x0000); + phy_modify_paged(phydev, 0xacc, 0x10, 0x0003, 0x0002); + phy_modify_paged(phydev, 0xad4, 0x10, 0x00e7, 0x0044); + phy_modify_paged(phydev, 0xac1, 0x12, 0x0080, 0x0000); + phy_modify_paged(phydev, 0xac8, 0x10, 0x0300, 0x0000); + phy_modify_paged(phydev, 0xac5, 0x17, 0x0007, 0x0002); + phy_write_paged(phydev, 0xad4, 0x16, 0x00a8); + phy_write_paged(phydev, 0xac5, 0x16, 0x01ff); + phy_modify_paged(phydev, 0xac8, 0x15, 0x00f0, 0x0030); + + phy_write(phydev, 0x1f, 0x0b87); + phy_write(phydev, 0x16, 0x80a2); + phy_write(phydev, 0x17, 0x0153); + phy_write(phydev, 0x16, 0x809c); + phy_write(phydev, 0x17, 0x0153); + phy_write(phydev, 0x1f, 0x0000); + + phy_write(phydev, 0x1f, 0x0a43); + phy_write(phydev, 0x13, 0x81B3); + phy_write(phydev, 0x14, 0x0043); + phy_write(phydev, 0x14, 0x00A7); + phy_write(phydev, 0x14, 0x00D6); + phy_write(phydev, 0x14, 0x00EC); + phy_write(phydev, 0x14, 0x00F6); + phy_write(phydev, 0x14, 0x00FB); + phy_write(phydev, 0x14, 0x00FD); + phy_write(phydev, 0x14, 0x00FF); + phy_write(phydev, 0x14, 0x00BB); + phy_write(phydev, 0x14, 0x0058); + phy_write(phydev, 0x14, 0x0029); + phy_write(phydev, 0x14, 0x0013); + phy_write(phydev, 0x14, 0x0009); + phy_write(phydev, 0x14, 0x0004); + phy_write(phydev, 0x14, 0x0002); + for (i = 0; i < 25; i++) + phy_write(phydev, 0x14, 0x0000); + phy_write(phydev, 0x1f, 0x0000); + + r8168g_phy_param(phydev, 0x8257, 0xffff, 0x020F); + r8168g_phy_param(phydev, 0x80ea, 0xffff, 0x7843); + + r8169_apply_firmware(tp); + + phy_modify_paged(phydev, 0xd06, 0x14, 0x0000, 0x2000); + + r8168g_phy_param(phydev, 0x81a2, 0x0000, 0x0100); + + phy_modify_paged(phydev, 0xb54, 0x16, 0xff00, 0xdb00); + phy_modify_paged(phydev, 0xa45, 0x12, 0x0001, 0x0000); + phy_modify_paged(phydev, 0xa5d, 0x12, 0x0000, 0x0020); + phy_modify_paged(phydev, 0xad4, 0x17, 0x0010, 0x0000); + phy_modify_paged(phydev, 0xa86, 0x15, 0x0001, 0x0000); + phy_modify_paged(phydev, 0xa44, 0x11, 0x0000, 0x0800); + + rtl8125_config_eee_phy(phydev); +} + +void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, + enum mac_version ver) +{ + static const rtl_phy_cfg_fct phy_configs[] = { + /* PCI devices. */ + [RTL_GIGA_MAC_VER_02] = rtl8169s_hw_phy_config, + [RTL_GIGA_MAC_VER_03] = rtl8169s_hw_phy_config, + [RTL_GIGA_MAC_VER_04] = rtl8169sb_hw_phy_config, + [RTL_GIGA_MAC_VER_05] = rtl8169scd_hw_phy_config, + [RTL_GIGA_MAC_VER_06] = rtl8169sce_hw_phy_config, + /* PCI-E devices. */ + [RTL_GIGA_MAC_VER_07] = rtl8102e_hw_phy_config, + [RTL_GIGA_MAC_VER_08] = rtl8102e_hw_phy_config, + [RTL_GIGA_MAC_VER_09] = rtl8102e_hw_phy_config, + [RTL_GIGA_MAC_VER_10] = NULL, + [RTL_GIGA_MAC_VER_11] = rtl8168bb_hw_phy_config, + [RTL_GIGA_MAC_VER_12] = rtl8168bef_hw_phy_config, + [RTL_GIGA_MAC_VER_13] = NULL, + [RTL_GIGA_MAC_VER_14] = NULL, + [RTL_GIGA_MAC_VER_15] = NULL, + [RTL_GIGA_MAC_VER_16] = NULL, + [RTL_GIGA_MAC_VER_17] = rtl8168bef_hw_phy_config, + [RTL_GIGA_MAC_VER_18] = rtl8168cp_1_hw_phy_config, + [RTL_GIGA_MAC_VER_19] = rtl8168c_1_hw_phy_config, + [RTL_GIGA_MAC_VER_20] = rtl8168c_2_hw_phy_config, + [RTL_GIGA_MAC_VER_21] = rtl8168c_3_hw_phy_config, + [RTL_GIGA_MAC_VER_22] = rtl8168c_3_hw_phy_config, + [RTL_GIGA_MAC_VER_23] = rtl8168cp_2_hw_phy_config, + [RTL_GIGA_MAC_VER_24] = rtl8168cp_2_hw_phy_config, + [RTL_GIGA_MAC_VER_25] = rtl8168d_1_hw_phy_config, + [RTL_GIGA_MAC_VER_26] = rtl8168d_2_hw_phy_config, + [RTL_GIGA_MAC_VER_27] = rtl8168d_3_hw_phy_config, + [RTL_GIGA_MAC_VER_28] = rtl8168d_4_hw_phy_config, + [RTL_GIGA_MAC_VER_29] = rtl8105e_hw_phy_config, + [RTL_GIGA_MAC_VER_30] = rtl8105e_hw_phy_config, + [RTL_GIGA_MAC_VER_31] = NULL, + [RTL_GIGA_MAC_VER_32] = rtl8168e_1_hw_phy_config, + [RTL_GIGA_MAC_VER_33] = rtl8168e_1_hw_phy_config, + [RTL_GIGA_MAC_VER_34] = rtl8168e_2_hw_phy_config, + [RTL_GIGA_MAC_VER_35] = rtl8168f_1_hw_phy_config, + [RTL_GIGA_MAC_VER_36] = rtl8168f_2_hw_phy_config, + [RTL_GIGA_MAC_VER_37] = rtl8402_hw_phy_config, + [RTL_GIGA_MAC_VER_38] = rtl8411_hw_phy_config, + [RTL_GIGA_MAC_VER_39] = rtl8106e_hw_phy_config, + [RTL_GIGA_MAC_VER_40] = rtl8168g_1_hw_phy_config, + [RTL_GIGA_MAC_VER_41] = NULL, + [RTL_GIGA_MAC_VER_42] = rtl8168g_2_hw_phy_config, + [RTL_GIGA_MAC_VER_43] = rtl8168g_2_hw_phy_config, + [RTL_GIGA_MAC_VER_44] = rtl8168g_2_hw_phy_config, + [RTL_GIGA_MAC_VER_45] = rtl8168h_1_hw_phy_config, + [RTL_GIGA_MAC_VER_46] = rtl8168h_2_hw_phy_config, + [RTL_GIGA_MAC_VER_47] = rtl8168h_1_hw_phy_config, + [RTL_GIGA_MAC_VER_48] = rtl8168h_2_hw_phy_config, + [RTL_GIGA_MAC_VER_49] = rtl8168ep_1_hw_phy_config, + [RTL_GIGA_MAC_VER_50] = rtl8168ep_2_hw_phy_config, + [RTL_GIGA_MAC_VER_51] = rtl8168ep_2_hw_phy_config, + [RTL_GIGA_MAC_VER_52] = rtl8117_hw_phy_config, + [RTL_GIGA_MAC_VER_60] = rtl8125_1_hw_phy_config, + [RTL_GIGA_MAC_VER_61] = rtl8125_2_hw_phy_config, + }; + + if (phy_configs[ver]) + phy_configs[ver](tp, phydev); +} diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index cdd8ab2eb910..58ca126518a2 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2204,24 +2204,28 @@ static size_t __sh_eth_get_regs(struct net_device *ndev, u32 *buf) if (cd->tsu) { add_tsu_reg(ARSTR); add_tsu_reg(TSU_CTRST); - add_tsu_reg(TSU_FWEN0); - add_tsu_reg(TSU_FWEN1); - add_tsu_reg(TSU_FCM); - add_tsu_reg(TSU_BSYSL0); - add_tsu_reg(TSU_BSYSL1); - add_tsu_reg(TSU_PRISL0); - add_tsu_reg(TSU_PRISL1); - add_tsu_reg(TSU_FWSL0); - add_tsu_reg(TSU_FWSL1); + if (cd->dual_port) { + add_tsu_reg(TSU_FWEN0); + add_tsu_reg(TSU_FWEN1); + add_tsu_reg(TSU_FCM); + add_tsu_reg(TSU_BSYSL0); + add_tsu_reg(TSU_BSYSL1); + add_tsu_reg(TSU_PRISL0); + add_tsu_reg(TSU_PRISL1); + add_tsu_reg(TSU_FWSL0); + add_tsu_reg(TSU_FWSL1); + } add_tsu_reg(TSU_FWSLC); - add_tsu_reg(TSU_QTAGM0); - add_tsu_reg(TSU_QTAGM1); - add_tsu_reg(TSU_FWSR); - add_tsu_reg(TSU_FWINMK); - add_tsu_reg(TSU_ADQT0); - add_tsu_reg(TSU_ADQT1); - add_tsu_reg(TSU_VTAG0); - add_tsu_reg(TSU_VTAG1); + if (cd->dual_port) { + add_tsu_reg(TSU_QTAGM0); + add_tsu_reg(TSU_QTAGM1); + add_tsu_reg(TSU_FWSR); + add_tsu_reg(TSU_FWINMK); + add_tsu_reg(TSU_ADQT0); + add_tsu_reg(TSU_ADQT1); + add_tsu_reg(TSU_VTAG0); + add_tsu_reg(TSU_VTAG1); + } add_tsu_reg(TSU_ADSBSY); add_tsu_reg(TSU_TEN); add_tsu_reg(TSU_POST1); @@ -2643,20 +2647,6 @@ static int sh_eth_close(struct net_device *ndev) return 0; } -/* ioctl to device function */ -static int sh_eth_do_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) -{ - struct phy_device *phydev = ndev->phydev; - - if (!netif_running(ndev)) - return -EINVAL; - - if (!phydev) - return -ENODEV; - - return phy_mii_ioctl(phydev, rq, cmd); -} - static int sh_eth_change_mtu(struct net_device *ndev, int new_mtu) { if (netif_running(ndev)) @@ -3155,7 +3145,7 @@ static const struct net_device_ops sh_eth_netdev_ops = { .ndo_get_stats = sh_eth_get_stats, .ndo_set_rx_mode = sh_eth_set_rx_mode, .ndo_tx_timeout = sh_eth_tx_timeout, - .ndo_do_ioctl = sh_eth_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_change_mtu = sh_eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, @@ -3171,7 +3161,7 @@ static const struct net_device_ops sh_eth_netdev_ops_tsu = { .ndo_vlan_rx_add_vid = sh_eth_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = sh_eth_vlan_rx_kill_vid, .ndo_tx_timeout = sh_eth_tx_timeout, - .ndo_do_ioctl = sh_eth_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_change_mtu = sh_eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index cd6e0de48248..c705743d69f7 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -1939,9 +1939,7 @@ static int sxgbe_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - if (!dev->phydev) - return -EINVAL; - ret = phy_mii_ioctl(dev->phydev, rq, cmd); + ret = phy_do_ioctl(dev, rq, cmd); break; default: break; @@ -2296,7 +2294,7 @@ __setup("sxgbeeth=", sxgbe_cmdline_opt); -MODULE_DESCRIPTION("SAMSUNG 10G/2.5G/1G Ethernet PLATFORM driver"); +MODULE_DESCRIPTION("Samsung 10G/2.5G/1G Ethernet PLATFORM driver"); MODULE_PARM_DESC(debug, "Message Level (-1: default, 0: no output, 16: all)"); MODULE_PARM_DESC(eee_timer, "EEE-LPI Default LS timer value"); diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile index c5c297e78d06..890fd65caa2d 100644 --- a/drivers/net/ethernet/sfc/Makefile +++ b/drivers/net/ethernet/sfc/Makefile @@ -1,7 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -sfc-y += efx.o nic.o farch.o siena.o ef10.o tx.o rx.o \ - selftest.o ethtool.o ptp.o tx_tso.o \ - mcdi.o mcdi_port.o mcdi_mon.o +sfc-y += efx.o efx_common.o efx_channels.o nic.o \ + farch.o siena.o ef10.o \ + tx.o tx_common.o tx_tso.o rx.o rx_common.o \ + selftest.o ethtool.o ethtool_common.o ptp.o \ + mcdi.o mcdi_port.o mcdi_port_common.o \ + mcdi_functions.o mcdi_mon.o sfc-$(CONFIG_SFC_MTD) += mtd.o sfc-$(CONFIG_SFC_SRIOV) += sriov.o siena_sriov.o ef10_sriov.o diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 4d9bbccc6f89..4997f61de3d6 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -5,10 +5,13 @@ */ #include "net_driver.h" +#include "rx_common.h" #include "ef10_regs.h" #include "io.h" #include "mcdi.h" #include "mcdi_pcol.h" +#include "mcdi_port_common.h" +#include "mcdi_functions.h" #include "nic.h" #include "workarounds.h" #include "selftest.h" @@ -185,24 +188,6 @@ static bool efx_ef10_is_vf(struct efx_nic *efx) return efx->type->is_vf; } -static int efx_ef10_get_pf_index(struct efx_nic *efx) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_FUNCTION_INFO_OUT_LEN); - struct efx_ef10_nic_data *nic_data = efx->nic_data; - size_t outlen; - int rc; - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_FUNCTION_INFO, NULL, 0, outbuf, - sizeof(outbuf), &outlen); - if (rc) - return rc; - if (outlen < sizeof(outbuf)) - return -EIO; - - nic_data->pf_index = MCDI_DWORD(outbuf, GET_FUNCTION_INFO_OUT_PF); - return 0; -} - #ifdef CONFIG_SFC_SRIOV static int efx_ef10_get_vf_index(struct efx_nic *efx) { @@ -273,24 +258,9 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx) u8 vi_window_mode = MCDI_BYTE(outbuf, GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE); - switch (vi_window_mode) { - case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_8K: - efx->vi_stride = 8192; - break; - case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_16K: - efx->vi_stride = 16384; - break; - case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_64K: - efx->vi_stride = 65536; - break; - default: - netif_err(efx, probe, efx->net_dev, - "Unrecognised VI window mode %d\n", - vi_window_mode); - return -EIO; - } - netif_dbg(efx, probe, efx->net_dev, "vi_stride = %u\n", - efx->vi_stride); + rc = efx_mcdi_window_mode_to_stride(efx, vi_window_mode); + if (rc) + return rc; } else { /* keep default VI stride */ netif_dbg(efx, probe, efx->net_dev, @@ -689,7 +659,7 @@ static int efx_ef10_probe(struct efx_nic *efx) } nic_data->warm_boot_count = rc; - efx->rss_context.context_id = EFX_EF10_RSS_CONTEXT_INVALID; + efx->rss_context.context_id = EFX_MCDI_RSS_CONTEXT_INVALID; nic_data->vport_id = EVB_PORT_ID_ASSIGNED; @@ -725,7 +695,7 @@ static int efx_ef10_probe(struct efx_nic *efx) if (rc) goto fail4; - rc = efx_ef10_get_pf_index(efx); + rc = efx_get_pf_index(efx, &nic_data->pf_index); if (rc) goto fail5; @@ -831,22 +801,6 @@ fail1: return rc; } -static int efx_ef10_free_vis(struct efx_nic *efx) -{ - MCDI_DECLARE_BUF_ERR(outbuf); - size_t outlen; - int rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FREE_VIS, NULL, 0, - outbuf, sizeof(outbuf), &outlen); - - /* -EALREADY means nothing to free, so ignore */ - if (rc == -EALREADY) - rc = 0; - if (rc) - efx_mcdi_display_error(efx, MC_CMD_FREE_VIS, 0, outbuf, outlen, - rc); - return rc; -} - #ifdef EFX_USE_PIO static void efx_ef10_free_piobufs(struct efx_nic *efx) @@ -1089,7 +1043,7 @@ static void efx_ef10_remove(struct efx_nic *efx) if (nic_data->wc_membase) iounmap(nic_data->wc_membase); - rc = efx_ef10_free_vis(efx); + rc = efx_mcdi_free_vis(efx); WARN_ON(rc != 0); if (!nic_data->must_restore_piobufs) @@ -1260,28 +1214,10 @@ static int efx_ef10_probe_vf(struct efx_nic *efx __attribute__ ((unused))) static int efx_ef10_alloc_vis(struct efx_nic *efx, unsigned int min_vis, unsigned int max_vis) { - MCDI_DECLARE_BUF(inbuf, MC_CMD_ALLOC_VIS_IN_LEN); - MCDI_DECLARE_BUF(outbuf, MC_CMD_ALLOC_VIS_OUT_LEN); struct efx_ef10_nic_data *nic_data = efx->nic_data; - size_t outlen; - int rc; - - MCDI_SET_DWORD(inbuf, ALLOC_VIS_IN_MIN_VI_COUNT, min_vis); - MCDI_SET_DWORD(inbuf, ALLOC_VIS_IN_MAX_VI_COUNT, max_vis); - rc = efx_mcdi_rpc(efx, MC_CMD_ALLOC_VIS, inbuf, sizeof(inbuf), - outbuf, sizeof(outbuf), &outlen); - if (rc != 0) - return rc; - - if (outlen < MC_CMD_ALLOC_VIS_OUT_LEN) - return -EIO; - netif_dbg(efx, drv, efx->net_dev, "base VI is A0x%03x\n", - MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_BASE)); - - nic_data->vi_base = MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_BASE); - nic_data->n_allocated_vis = MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_COUNT); - return 0; + return efx_mcdi_alloc_vis(efx, min_vis, max_vis, &nic_data->vi_base, + &nic_data->n_allocated_vis); } /* Note that the failure path of this function does not free @@ -1363,7 +1299,7 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) } /* In case the last attached driver failed to free VIs, do it now */ - rc = efx_ef10_free_vis(efx); + rc = efx_mcdi_free_vis(efx); if (rc != 0) return rc; @@ -1384,7 +1320,7 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) efx->max_tx_channels = nic_data->n_allocated_vis / EFX_TXQ_TYPES; - efx_ef10_free_vis(efx); + efx_mcdi_free_vis(efx); return -EAGAIN; } @@ -1503,7 +1439,7 @@ static void efx_ef10_reset_mc_allocations(struct efx_nic *efx) nic_data->must_restore_filters = true; nic_data->must_restore_piobufs = true; efx_ef10_forget_old_piobufs(efx); - efx->rss_context.context_id = EFX_EF10_RSS_CONTEXT_INVALID; + efx->rss_context.context_id = EFX_MCDI_RSS_CONTEXT_INVALID; /* Driver-created vswitches and vports must be re-created */ nic_data->must_probe_vswitching = true; @@ -2408,20 +2344,15 @@ static u32 efx_ef10_tso_versions(struct efx_nic *efx) static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue) { - MCDI_DECLARE_BUF(inbuf, MC_CMD_INIT_TXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / - EFX_BUF_SIZE)); bool csum_offload = tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD; - size_t entries = tx_queue->txd.buf.len / EFX_BUF_SIZE; struct efx_channel *channel = tx_queue->channel; struct efx_nic *efx = tx_queue->efx; - struct efx_ef10_nic_data *nic_data = efx->nic_data; + struct efx_ef10_nic_data *nic_data; bool tso_v2 = false; - size_t inlen; - dma_addr_t dma_addr; efx_qword_t *txd; int rc; - int i; - BUILD_BUG_ON(MC_CMD_INIT_TXQ_OUT_LEN != 0); + + nic_data = efx->nic_data; /* Only attempt to enable TX timestamping if we have the license for it, * otherwise TXQ init will fail @@ -2448,51 +2379,9 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue) channel->channel); } - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_SIZE, tx_queue->ptr_mask + 1); - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_TARGET_EVQ, channel->channel); - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_LABEL, tx_queue->queue); - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_INSTANCE, tx_queue->queue); - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_OWNER_ID, 0); - MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_PORT_ID, nic_data->vport_id); - - dma_addr = tx_queue->txd.buf.dma_addr; - - netif_dbg(efx, hw, efx->net_dev, "pushing TXQ %d. %zu entries (%llx)\n", - tx_queue->queue, entries, (u64)dma_addr); - - for (i = 0; i < entries; ++i) { - MCDI_SET_ARRAY_QWORD(inbuf, INIT_TXQ_IN_DMA_ADDR, i, dma_addr); - dma_addr += EFX_BUF_SIZE; - } - - inlen = MC_CMD_INIT_TXQ_IN_LEN(entries); - - do { - MCDI_POPULATE_DWORD_4(inbuf, INIT_TXQ_IN_FLAGS, - /* This flag was removed from mcdi_pcol.h for - * the non-_EXT version of INIT_TXQ. However, - * firmware still honours it. - */ - INIT_TXQ_EXT_IN_FLAG_TSOV2_EN, tso_v2, - INIT_TXQ_IN_FLAG_IP_CSUM_DIS, !csum_offload, - INIT_TXQ_IN_FLAG_TCP_CSUM_DIS, !csum_offload, - INIT_TXQ_EXT_IN_FLAG_TIMESTAMP, - tx_queue->timestamping); - - rc = efx_mcdi_rpc_quiet(efx, MC_CMD_INIT_TXQ, inbuf, inlen, - NULL, 0, NULL); - if (rc == -ENOSPC && tso_v2) { - /* Retry without TSOv2 if we're short on contexts. */ - tso_v2 = false; - netif_warn(efx, probe, efx->net_dev, - "TSOv2 context not available to segment in hardware. TCP performance may be reduced.\n"); - } else if (rc) { - efx_mcdi_display_error(efx, MC_CMD_INIT_TXQ, - MC_CMD_INIT_TXQ_EXT_IN_LEN, - NULL, 0, rc); - goto fail; - } - } while (rc); + rc = efx_mcdi_tx_init(tx_queue, tso_v2); + if (rc) + goto fail; /* A previous user of this TX queue might have set us up the * bomb by writing a descriptor to the TX push collector but @@ -2530,35 +2419,6 @@ fail: tx_queue->queue); } -static void efx_ef10_tx_fini(struct efx_tx_queue *tx_queue) -{ - MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_TXQ_IN_LEN); - MCDI_DECLARE_BUF_ERR(outbuf); - struct efx_nic *efx = tx_queue->efx; - size_t outlen; - int rc; - - MCDI_SET_DWORD(inbuf, FINI_TXQ_IN_INSTANCE, - tx_queue->queue); - - rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_TXQ, inbuf, sizeof(inbuf), - outbuf, sizeof(outbuf), &outlen); - - if (rc && rc != -EALREADY) - goto fail; - - return; - -fail: - efx_mcdi_display_error(efx, MC_CMD_FINI_TXQ, MC_CMD_FINI_TXQ_IN_LEN, - outbuf, outlen, rc); -} - -static void efx_ef10_tx_remove(struct efx_tx_queue *tx_queue) -{ - efx_nic_free_buffer(tx_queue->efx, &tx_queue->txd.buf); -} - /* This writes to the TX_DESC_WPTR; write pointer for TX descriptor ring */ static inline void efx_ef10_notify_tx_desc(struct efx_tx_queue *tx_queue) { @@ -2737,7 +2597,7 @@ static int efx_ef10_alloc_rss_context(struct efx_nic *efx, bool exclusive, EFX_EF10_MAX_SHARED_RSS_CONTEXT_SIZE); if (!exclusive && rss_spread == 1) { - ctx->context_id = EFX_EF10_RSS_CONTEXT_INVALID; + ctx->context_id = EFX_MCDI_RSS_CONTEXT_INVALID; if (context_size) *context_size = 1; return 0; @@ -2824,11 +2684,11 @@ static void efx_ef10_rx_free_indir_table(struct efx_nic *efx) { int rc; - if (efx->rss_context.context_id != EFX_EF10_RSS_CONTEXT_INVALID) { + if (efx->rss_context.context_id != EFX_MCDI_RSS_CONTEXT_INVALID) { rc = efx_ef10_free_rss_context(efx, efx->rss_context.context_id); WARN_ON(rc != 0); } - efx->rss_context.context_id = EFX_EF10_RSS_CONTEXT_INVALID; + efx->rss_context.context_id = EFX_MCDI_RSS_CONTEXT_INVALID; } static int efx_ef10_rx_push_shared_rss_config(struct efx_nic *efx, @@ -2854,7 +2714,7 @@ static int efx_ef10_rx_push_exclusive_rss_config(struct efx_nic *efx, struct efx_ef10_nic_data *nic_data = efx->nic_data; int rc; - if (efx->rss_context.context_id == EFX_EF10_RSS_CONTEXT_INVALID || + if (efx->rss_context.context_id == EFX_MCDI_RSS_CONTEXT_INVALID || !nic_data->rx_rss_context_exclusive) { rc = efx_ef10_alloc_rss_context(efx, true, &efx->rss_context, NULL); @@ -2870,7 +2730,7 @@ static int efx_ef10_rx_push_exclusive_rss_config(struct efx_nic *efx, goto fail2; if (efx->rss_context.context_id != old_rx_rss_context && - old_rx_rss_context != EFX_EF10_RSS_CONTEXT_INVALID) + old_rx_rss_context != EFX_MCDI_RSS_CONTEXT_INVALID) WARN_ON(efx_ef10_free_rss_context(efx, old_rx_rss_context) != 0); nic_data->rx_rss_context_exclusive = true; if (rx_indir_table != efx->rss_context.rx_indir_table) @@ -2901,7 +2761,7 @@ static int efx_ef10_rx_push_rss_context_config(struct efx_nic *efx, WARN_ON(!mutex_is_locked(&efx->rss_lock)); - if (ctx->context_id == EFX_EF10_RSS_CONTEXT_INVALID) { + if (ctx->context_id == EFX_MCDI_RSS_CONTEXT_INVALID) { rc = efx_ef10_alloc_rss_context(efx, true, ctx, NULL); if (rc) return rc; @@ -2936,7 +2796,7 @@ static int efx_ef10_rx_pull_rss_context_config(struct efx_nic *efx, BUILD_BUG_ON(MC_CMD_RSS_CONTEXT_GET_TABLE_IN_LEN != MC_CMD_RSS_CONTEXT_GET_KEY_IN_LEN); - if (ctx->context_id == EFX_EF10_RSS_CONTEXT_INVALID) + if (ctx->context_id == EFX_MCDI_RSS_CONTEXT_INVALID) return -ENOENT; MCDI_SET_DWORD(inbuf, RSS_CONTEXT_GET_TABLE_IN_RSS_CONTEXT_ID, @@ -2997,7 +2857,7 @@ static void efx_ef10_rx_restore_rss_contexts(struct efx_nic *efx) list_for_each_entry(ctx, &efx->rss_context.list, list) { /* previous NIC RSS context is gone */ - ctx->context_id = EFX_EF10_RSS_CONTEXT_INVALID; + ctx->context_id = EFX_MCDI_RSS_CONTEXT_INVALID; /* so try to allocate a new one */ rc = efx_ef10_rx_push_rss_context_config(efx, ctx, ctx->rx_indir_table, @@ -3068,96 +2928,11 @@ static int efx_ef10_vf_rx_push_rss_config(struct efx_nic *efx, bool user, { if (user) return -EOPNOTSUPP; - if (efx->rss_context.context_id != EFX_EF10_RSS_CONTEXT_INVALID) + if (efx->rss_context.context_id != EFX_MCDI_RSS_CONTEXT_INVALID) return 0; return efx_ef10_rx_push_shared_rss_config(efx, NULL); } -static int efx_ef10_rx_probe(struct efx_rx_queue *rx_queue) -{ - return efx_nic_alloc_buffer(rx_queue->efx, &rx_queue->rxd.buf, - (rx_queue->ptr_mask + 1) * - sizeof(efx_qword_t), - GFP_KERNEL); -} - -static void efx_ef10_rx_init(struct efx_rx_queue *rx_queue) -{ - MCDI_DECLARE_BUF(inbuf, - MC_CMD_INIT_RXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / - EFX_BUF_SIZE)); - struct efx_channel *channel = efx_rx_queue_channel(rx_queue); - size_t entries = rx_queue->rxd.buf.len / EFX_BUF_SIZE; - struct efx_nic *efx = rx_queue->efx; - struct efx_ef10_nic_data *nic_data = efx->nic_data; - size_t inlen; - dma_addr_t dma_addr; - int rc; - int i; - BUILD_BUG_ON(MC_CMD_INIT_RXQ_OUT_LEN != 0); - - rx_queue->scatter_n = 0; - rx_queue->scatter_len = 0; - - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_SIZE, rx_queue->ptr_mask + 1); - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_TARGET_EVQ, channel->channel); - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_LABEL, efx_rx_queue_index(rx_queue)); - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_INSTANCE, - efx_rx_queue_index(rx_queue)); - MCDI_POPULATE_DWORD_2(inbuf, INIT_RXQ_IN_FLAGS, - INIT_RXQ_IN_FLAG_PREFIX, 1, - INIT_RXQ_IN_FLAG_TIMESTAMP, 1); - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_OWNER_ID, 0); - MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_PORT_ID, nic_data->vport_id); - - dma_addr = rx_queue->rxd.buf.dma_addr; - - netif_dbg(efx, hw, efx->net_dev, "pushing RXQ %d. %zu entries (%llx)\n", - efx_rx_queue_index(rx_queue), entries, (u64)dma_addr); - - for (i = 0; i < entries; ++i) { - MCDI_SET_ARRAY_QWORD(inbuf, INIT_RXQ_IN_DMA_ADDR, i, dma_addr); - dma_addr += EFX_BUF_SIZE; - } - - inlen = MC_CMD_INIT_RXQ_IN_LEN(entries); - - rc = efx_mcdi_rpc(efx, MC_CMD_INIT_RXQ, inbuf, inlen, - NULL, 0, NULL); - if (rc) - netdev_WARN(efx->net_dev, "failed to initialise RXQ %d\n", - efx_rx_queue_index(rx_queue)); -} - -static void efx_ef10_rx_fini(struct efx_rx_queue *rx_queue) -{ - MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_RXQ_IN_LEN); - MCDI_DECLARE_BUF_ERR(outbuf); - struct efx_nic *efx = rx_queue->efx; - size_t outlen; - int rc; - - MCDI_SET_DWORD(inbuf, FINI_RXQ_IN_INSTANCE, - efx_rx_queue_index(rx_queue)); - - rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_RXQ, inbuf, sizeof(inbuf), - outbuf, sizeof(outbuf), &outlen); - - if (rc && rc != -EALREADY) - goto fail; - - return; - -fail: - efx_mcdi_display_error(efx, MC_CMD_FINI_RXQ, MC_CMD_FINI_RXQ_IN_LEN, - outbuf, outlen, rc); -} - -static void efx_ef10_rx_remove(struct efx_rx_queue *rx_queue) -{ - efx_nic_free_buffer(rx_queue->efx, &rx_queue->rxd.buf); -} - /* This creates an entry in the RX descriptor queue */ static inline void efx_ef10_build_rx_desc(struct efx_rx_queue *rx_queue, unsigned int index) @@ -3229,106 +3004,20 @@ efx_ef10_rx_defer_refill_complete(struct efx_nic *efx, unsigned long cookie, /* nothing to do */ } -static int efx_ef10_ev_probe(struct efx_channel *channel) -{ - return efx_nic_alloc_buffer(channel->efx, &channel->eventq.buf, - (channel->eventq_mask + 1) * - sizeof(efx_qword_t), - GFP_KERNEL); -} - -static void efx_ef10_ev_fini(struct efx_channel *channel) -{ - MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_EVQ_IN_LEN); - MCDI_DECLARE_BUF_ERR(outbuf); - struct efx_nic *efx = channel->efx; - size_t outlen; - int rc; - - MCDI_SET_DWORD(inbuf, FINI_EVQ_IN_INSTANCE, channel->channel); - - rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_EVQ, inbuf, sizeof(inbuf), - outbuf, sizeof(outbuf), &outlen); - - if (rc && rc != -EALREADY) - goto fail; - - return; - -fail: - efx_mcdi_display_error(efx, MC_CMD_FINI_EVQ, MC_CMD_FINI_EVQ_IN_LEN, - outbuf, outlen, rc); -} - static int efx_ef10_ev_init(struct efx_channel *channel) { - MCDI_DECLARE_BUF(inbuf, - MC_CMD_INIT_EVQ_V2_IN_LEN(EFX_MAX_EVQ_SIZE * 8 / - EFX_BUF_SIZE)); - MCDI_DECLARE_BUF(outbuf, MC_CMD_INIT_EVQ_V2_OUT_LEN); - size_t entries = channel->eventq.buf.len / EFX_BUF_SIZE; struct efx_nic *efx = channel->efx; struct efx_ef10_nic_data *nic_data; - size_t inlen, outlen; unsigned int enabled, implemented; - dma_addr_t dma_addr; + bool use_v2, cut_thru; int rc; - int i; nic_data = efx->nic_data; - - /* Fill event queue with all ones (i.e. empty events) */ - memset(channel->eventq.buf.addr, 0xff, channel->eventq.buf.len); - - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_SIZE, channel->eventq_mask + 1); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_INSTANCE, channel->channel); - /* INIT_EVQ expects index in vector table, not absolute */ - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_IRQ_NUM, channel->channel); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_MODE, - MC_CMD_INIT_EVQ_IN_TMR_MODE_DIS); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_LOAD, 0); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_RELOAD, 0); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_COUNT_MODE, - MC_CMD_INIT_EVQ_IN_COUNT_MODE_DIS); - MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_COUNT_THRSHLD, 0); - - if (nic_data->datapath_caps2 & - 1 << MC_CMD_GET_CAPABILITIES_V2_OUT_INIT_EVQ_V2_LBN) { - /* Use the new generic approach to specifying event queue - * configuration, requesting lower latency or higher throughput. - * The options that actually get used appear in the output. - */ - MCDI_POPULATE_DWORD_2(inbuf, INIT_EVQ_V2_IN_FLAGS, - INIT_EVQ_V2_IN_FLAG_INTERRUPTING, 1, - INIT_EVQ_V2_IN_FLAG_TYPE, - MC_CMD_INIT_EVQ_V2_IN_FLAG_TYPE_AUTO); - } else { - bool cut_thru = !(nic_data->datapath_caps & - 1 << MC_CMD_GET_CAPABILITIES_OUT_RX_BATCHING_LBN); - - MCDI_POPULATE_DWORD_4(inbuf, INIT_EVQ_IN_FLAGS, - INIT_EVQ_IN_FLAG_INTERRUPTING, 1, - INIT_EVQ_IN_FLAG_RX_MERGE, 1, - INIT_EVQ_IN_FLAG_TX_MERGE, 1, - INIT_EVQ_IN_FLAG_CUT_THRU, cut_thru); - } - - dma_addr = channel->eventq.buf.dma_addr; - for (i = 0; i < entries; ++i) { - MCDI_SET_ARRAY_QWORD(inbuf, INIT_EVQ_IN_DMA_ADDR, i, dma_addr); - dma_addr += EFX_BUF_SIZE; - } - - inlen = MC_CMD_INIT_EVQ_IN_LEN(entries); - - rc = efx_mcdi_rpc(efx, MC_CMD_INIT_EVQ, inbuf, inlen, - outbuf, sizeof(outbuf), &outlen); - - if (outlen >= MC_CMD_INIT_EVQ_V2_OUT_LEN) - netif_dbg(efx, drv, efx->net_dev, - "Channel %d using event queue flags %08x\n", - channel->channel, - MCDI_DWORD(outbuf, INIT_EVQ_V2_OUT_FLAGS)); + use_v2 = nic_data->datapath_caps2 & + 1 << MC_CMD_GET_CAPABILITIES_V2_OUT_INIT_EVQ_V2_LBN; + cut_thru = !(nic_data->datapath_caps & + 1 << MC_CMD_GET_CAPABILITIES_OUT_RX_BATCHING_LBN); + rc = efx_mcdi_ev_init(channel, cut_thru, use_v2); /* IRQ return is ignored */ if (channel->channel || rc) @@ -3386,15 +3075,10 @@ static int efx_ef10_ev_init(struct efx_channel *channel) return 0; fail: - efx_ef10_ev_fini(channel); + efx_mcdi_ev_fini(channel); return rc; } -static void efx_ef10_ev_remove(struct efx_channel *channel) -{ - efx_nic_free_buffer(channel->efx, &channel->eventq.buf); -} - static void efx_ef10_handle_rx_wrong_queue(struct efx_rx_queue *rx_queue, unsigned int rx_queue_label) { @@ -3976,9 +3660,9 @@ static int efx_ef10_fini_dmaq(struct efx_nic *efx) if (efx->state != STATE_RECOVERY) { efx_for_each_channel(channel, efx) { efx_for_each_channel_rx_queue(rx_queue, channel) - efx_ef10_rx_fini(rx_queue); + efx_mcdi_rx_fini(rx_queue); efx_for_each_channel_tx_queue(tx_queue, channel) - efx_ef10_tx_fini(tx_queue); + efx_mcdi_tx_fini(tx_queue); } wait_event_timeout(efx->flush_wq, @@ -4165,7 +3849,7 @@ static void efx_ef10_filter_push_prep(struct efx_nic *efx, */ if (WARN_ON_ONCE(!ctx)) flags &= ~EFX_FILTER_FLAG_RX_RSS; - else if (WARN_ON_ONCE(ctx->context_id == EFX_EF10_RSS_CONTEXT_INVALID)) + else if (WARN_ON_ONCE(ctx->context_id == EFX_MCDI_RSS_CONTEXT_INVALID)) flags &= ~EFX_FILTER_FLAG_RX_RSS; } @@ -4344,7 +4028,7 @@ static s32 efx_ef10_filter_insert_locked(struct efx_nic *efx, rc = -ENOENT; goto out_unlock; } - if (ctx->context_id == EFX_EF10_RSS_CONTEXT_INVALID) { + if (ctx->context_id == EFX_MCDI_RSS_CONTEXT_INVALID) { rc = -EOPNOTSUPP; goto out_unlock; } @@ -5085,7 +4769,7 @@ static void efx_ef10_filter_table_restore(struct efx_nic *efx) invalid_filters++; goto not_restored; } - if (ctx->context_id == EFX_EF10_RSS_CONTEXT_INVALID) { + if (ctx->context_id == EFX_MCDI_RSS_CONTEXT_INVALID) { netif_warn(efx, drv, efx->net_dev, "Warning: unable to restore a filter with RSS context %u as it was not created.\n", spec->rss_context); @@ -6650,20 +6334,20 @@ const struct efx_nic_type efx_hunt_a0_vf_nic_type = { .irq_handle_legacy = efx_ef10_legacy_interrupt, .tx_probe = efx_ef10_tx_probe, .tx_init = efx_ef10_tx_init, - .tx_remove = efx_ef10_tx_remove, + .tx_remove = efx_mcdi_tx_remove, .tx_write = efx_ef10_tx_write, .tx_limit_len = efx_ef10_tx_limit_len, .rx_push_rss_config = efx_ef10_vf_rx_push_rss_config, .rx_pull_rss_config = efx_ef10_rx_pull_rss_config, - .rx_probe = efx_ef10_rx_probe, - .rx_init = efx_ef10_rx_init, - .rx_remove = efx_ef10_rx_remove, + .rx_probe = efx_mcdi_rx_probe, + .rx_init = efx_mcdi_rx_init, + .rx_remove = efx_mcdi_rx_remove, .rx_write = efx_ef10_rx_write, .rx_defer_refill = efx_ef10_rx_defer_refill, - .ev_probe = efx_ef10_ev_probe, + .ev_probe = efx_mcdi_ev_probe, .ev_init = efx_ef10_ev_init, - .ev_fini = efx_ef10_ev_fini, - .ev_remove = efx_ef10_ev_remove, + .ev_fini = efx_mcdi_ev_fini, + .ev_remove = efx_mcdi_ev_remove, .ev_process = efx_ef10_ev_process, .ev_read_ack = efx_ef10_ev_read_ack, .ev_test_generate = efx_ef10_ev_test_generate, @@ -6759,7 +6443,7 @@ const struct efx_nic_type efx_hunt_a0_nic_type = { .irq_handle_legacy = efx_ef10_legacy_interrupt, .tx_probe = efx_ef10_tx_probe, .tx_init = efx_ef10_tx_init, - .tx_remove = efx_ef10_tx_remove, + .tx_remove = efx_mcdi_tx_remove, .tx_write = efx_ef10_tx_write, .tx_limit_len = efx_ef10_tx_limit_len, .rx_push_rss_config = efx_ef10_pf_rx_push_rss_config, @@ -6767,15 +6451,15 @@ const struct efx_nic_type efx_hunt_a0_nic_type = { .rx_push_rss_context_config = efx_ef10_rx_push_rss_context_config, .rx_pull_rss_context_config = efx_ef10_rx_pull_rss_context_config, .rx_restore_rss_contexts = efx_ef10_rx_restore_rss_contexts, - .rx_probe = efx_ef10_rx_probe, - .rx_init = efx_ef10_rx_init, - .rx_remove = efx_ef10_rx_remove, + .rx_probe = efx_mcdi_rx_probe, + .rx_init = efx_mcdi_rx_init, + .rx_remove = efx_mcdi_rx_remove, .rx_write = efx_ef10_rx_write, .rx_defer_refill = efx_ef10_rx_defer_refill, - .ev_probe = efx_ef10_ev_probe, + .ev_probe = efx_mcdi_ev_probe, .ev_init = efx_ef10_ev_init, - .ev_fini = efx_ef10_ev_fini, - .ev_remove = efx_ef10_ev_remove, + .ev_fini = efx_mcdi_ev_fini, + .ev_remove = efx_mcdi_ev_remove, .ev_process = efx_ef10_ev_process, .ev_read_ack = efx_ef10_ev_read_ack, .ev_test_generate = efx_ef10_ev_test_generate, diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 033907e6fdb0..4481f21a1f43 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -23,6 +23,10 @@ #include <net/gre.h> #include <net/udp_tunnel.h> #include "efx.h" +#include "efx_common.h" +#include "efx_channels.h" +#include "rx_common.h" +#include "tx_common.h" #include "nic.h" #include "io.h" #include "selftest.h" @@ -39,56 +43,6 @@ ************************************************************************** */ -/* Loopback mode names (see LOOPBACK_MODE()) */ -const unsigned int efx_loopback_mode_max = LOOPBACK_MAX; -const char *const efx_loopback_mode_names[] = { - [LOOPBACK_NONE] = "NONE", - [LOOPBACK_DATA] = "DATAPATH", - [LOOPBACK_GMAC] = "GMAC", - [LOOPBACK_XGMII] = "XGMII", - [LOOPBACK_XGXS] = "XGXS", - [LOOPBACK_XAUI] = "XAUI", - [LOOPBACK_GMII] = "GMII", - [LOOPBACK_SGMII] = "SGMII", - [LOOPBACK_XGBR] = "XGBR", - [LOOPBACK_XFI] = "XFI", - [LOOPBACK_XAUI_FAR] = "XAUI_FAR", - [LOOPBACK_GMII_FAR] = "GMII_FAR", - [LOOPBACK_SGMII_FAR] = "SGMII_FAR", - [LOOPBACK_XFI_FAR] = "XFI_FAR", - [LOOPBACK_GPHY] = "GPHY", - [LOOPBACK_PHYXS] = "PHYXS", - [LOOPBACK_PCS] = "PCS", - [LOOPBACK_PMAPMD] = "PMA/PMD", - [LOOPBACK_XPORT] = "XPORT", - [LOOPBACK_XGMII_WS] = "XGMII_WS", - [LOOPBACK_XAUI_WS] = "XAUI_WS", - [LOOPBACK_XAUI_WS_FAR] = "XAUI_WS_FAR", - [LOOPBACK_XAUI_WS_NEAR] = "XAUI_WS_NEAR", - [LOOPBACK_GMII_WS] = "GMII_WS", - [LOOPBACK_XFI_WS] = "XFI_WS", - [LOOPBACK_XFI_WS_FAR] = "XFI_WS_FAR", - [LOOPBACK_PHYXS_WS] = "PHYXS_WS", -}; - -const unsigned int efx_reset_type_max = RESET_TYPE_MAX; -const char *const efx_reset_type_names[] = { - [RESET_TYPE_INVISIBLE] = "INVISIBLE", - [RESET_TYPE_ALL] = "ALL", - [RESET_TYPE_RECOVER_OR_ALL] = "RECOVER_OR_ALL", - [RESET_TYPE_WORLD] = "WORLD", - [RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE", - [RESET_TYPE_DATAPATH] = "DATAPATH", - [RESET_TYPE_MC_BIST] = "MC_BIST", - [RESET_TYPE_DISABLE] = "DISABLE", - [RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG", - [RESET_TYPE_INT_ERROR] = "INT_ERROR", - [RESET_TYPE_DMA_ERROR] = "DMA_ERROR", - [RESET_TYPE_TX_SKIP] = "TX_SKIP", - [RESET_TYPE_MC_FAILURE] = "MC_FAILURE", - [RESET_TYPE_MCDI_TIMEOUT] = "MCDI_TIMEOUT (FLR)", -}; - /* UDP tunnel type names */ static const char *const efx_udp_tunnel_type_names[] = { [TUNNEL_ENCAP_UDP_PORT_ENTRY_VXLAN] = "vxlan", @@ -104,18 +58,6 @@ void efx_get_udp_tunnel_type_name(u16 type, char *buf, size_t buflen) snprintf(buf, buflen, "type %d", type); } -/* Reset workqueue. If any NIC has a hardware failure then a reset will be - * queued onto this work queue. This is not a per-nic work queue, because - * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised. - */ -static struct workqueue_struct *reset_workqueue; - -/* How often and how many times to poll for a reset while waiting for a - * BIST that another function started to complete. - */ -#define BIST_WAIT_DELAY_MS 100 -#define BIST_WAIT_DELAY_COUNT 100 - /************************************************************************** * * Configurable values @@ -135,21 +77,6 @@ module_param(efx_separate_tx_channels, bool, 0444); MODULE_PARM_DESC(efx_separate_tx_channels, "Use separate channels for TX and RX"); -/* This is the weight assigned to each of the (per-channel) virtual - * NAPI devices. - */ -static int napi_weight = 64; - -/* This is the time (in jiffies) between invocations of the hardware - * monitor. - * On Falcon-based NICs, this will: - * - Check the on-board hardware monitor; - * - Poll the link state and reconfigure the hardware as necessary. - * On Siena-based NICs for power systems with EEH support, this will give EEH a - * chance to start. - */ -static unsigned int efx_monitor_interval = 1 * HZ; - /* Initial interrupt moderation settings. They can be modified after * module load with ethtool. * @@ -169,38 +96,10 @@ static unsigned int rx_irq_mod_usec = 60; */ static unsigned int tx_irq_mod_usec = 150; -/* This is the first interrupt mode to try out of: - * 0 => MSI-X - * 1 => MSI - * 2 => legacy - */ -static unsigned int interrupt_mode; - -/* This is the requested number of CPUs to use for Receive-Side Scaling (RSS), - * i.e. the number of CPUs among which we may distribute simultaneous - * interrupt handling. - * - * Cards without MSI-X will only target one CPU via legacy or MSI interrupt. - * The default (0) means to assign an interrupt to each core. - */ -static unsigned int rss_cpus; -module_param(rss_cpus, uint, 0444); -MODULE_PARM_DESC(rss_cpus, "Number of CPUs to use for Receive-Side Scaling"); - static bool phy_flash_cfg; module_param(phy_flash_cfg, bool, 0644); MODULE_PARM_DESC(phy_flash_cfg, "Set PHYs into reflash mode initially"); -static unsigned irq_adapt_low_thresh = 8000; -module_param(irq_adapt_low_thresh, uint, 0644); -MODULE_PARM_DESC(irq_adapt_low_thresh, - "Threshold score for reducing IRQ moderation"); - -static unsigned irq_adapt_high_thresh = 16000; -module_param(irq_adapt_high_thresh, uint, 0644); -MODULE_PARM_DESC(irq_adapt_high_thresh, - "Threshold score for increasing IRQ moderation"); - static unsigned debug = (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP | NETIF_MSG_RX_ERR | @@ -214,18 +113,8 @@ MODULE_PARM_DESC(debug, "Bitmapped debugging message enable value"); * *************************************************************************/ -static int efx_soft_enable_interrupts(struct efx_nic *efx); -static void efx_soft_disable_interrupts(struct efx_nic *efx); -static void efx_remove_channel(struct efx_channel *channel); -static void efx_remove_channels(struct efx_nic *efx); static const struct efx_channel_type efx_default_channel_type; static void efx_remove_port(struct efx_nic *efx); -static void efx_init_napi_channel(struct efx_channel *channel); -static void efx_fini_napi(struct efx_nic *efx); -static void efx_fini_napi_channel(struct efx_channel *channel); -static void efx_fini_struct(struct efx_nic *efx); -static void efx_start_all(struct efx_nic *efx); -static void efx_stop_all(struct efx_nic *efx); static int efx_xdp_setup_prog(struct efx_nic *efx, struct bpf_prog *prog); static int efx_xdp(struct net_device *dev, struct netdev_bpf *xdp); static int efx_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **xdpfs, @@ -239,776 +128,12 @@ static int efx_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **xdpfs, ASSERT_RTNL(); \ } while (0) -static int efx_check_disabled(struct efx_nic *efx) -{ - if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) { - netif_err(efx, drv, efx->net_dev, - "device is disabled due to earlier errors\n"); - return -EIO; - } - return 0; -} - -/************************************************************************** - * - * Event queue processing - * - *************************************************************************/ - -/* Process channel's event queue - * - * This function is responsible for processing the event queue of a - * single channel. The caller must guarantee that this function will - * never be concurrently called more than once on the same channel, - * though different channels may be being processed concurrently. - */ -static int efx_process_channel(struct efx_channel *channel, int budget) -{ - struct efx_tx_queue *tx_queue; - struct list_head rx_list; - int spent; - - if (unlikely(!channel->enabled)) - return 0; - - /* Prepare the batch receive list */ - EFX_WARN_ON_PARANOID(channel->rx_list != NULL); - INIT_LIST_HEAD(&rx_list); - channel->rx_list = &rx_list; - - efx_for_each_channel_tx_queue(tx_queue, channel) { - tx_queue->pkts_compl = 0; - tx_queue->bytes_compl = 0; - } - - spent = efx_nic_process_eventq(channel, budget); - if (spent && efx_channel_has_rx_queue(channel)) { - struct efx_rx_queue *rx_queue = - efx_channel_get_rx_queue(channel); - - efx_rx_flush_packet(channel); - efx_fast_push_rx_descriptors(rx_queue, true); - } - - /* Update BQL */ - efx_for_each_channel_tx_queue(tx_queue, channel) { - if (tx_queue->bytes_compl) { - netdev_tx_completed_queue(tx_queue->core_txq, - tx_queue->pkts_compl, tx_queue->bytes_compl); - } - } - - /* Receive any packets we queued up */ - netif_receive_skb_list(channel->rx_list); - channel->rx_list = NULL; - - return spent; -} - -/* NAPI poll handler - * - * NAPI guarantees serialisation of polls of the same device, which - * provides the guarantee required by efx_process_channel(). - */ -static void efx_update_irq_mod(struct efx_nic *efx, struct efx_channel *channel) -{ - int step = efx->irq_mod_step_us; - - if (channel->irq_mod_score < irq_adapt_low_thresh) { - if (channel->irq_moderation_us > step) { - channel->irq_moderation_us -= step; - efx->type->push_irq_moderation(channel); - } - } else if (channel->irq_mod_score > irq_adapt_high_thresh) { - if (channel->irq_moderation_us < - efx->irq_rx_moderation_us) { - channel->irq_moderation_us += step; - efx->type->push_irq_moderation(channel); - } - } - - channel->irq_count = 0; - channel->irq_mod_score = 0; -} - -static int efx_poll(struct napi_struct *napi, int budget) -{ - struct efx_channel *channel = - container_of(napi, struct efx_channel, napi_str); - struct efx_nic *efx = channel->efx; - int spent; - - netif_vdbg(efx, intr, efx->net_dev, - "channel %d NAPI poll executing on CPU %d\n", - channel->channel, raw_smp_processor_id()); - - spent = efx_process_channel(channel, budget); - - xdp_do_flush_map(); - - if (spent < budget) { - if (efx_channel_has_rx_queue(channel) && - efx->irq_rx_adaptive && - unlikely(++channel->irq_count == 1000)) { - efx_update_irq_mod(efx, channel); - } - -#ifdef CONFIG_RFS_ACCEL - /* Perhaps expire some ARFS filters */ - mod_delayed_work(system_wq, &channel->filter_work, 0); -#endif - - /* There is no race here; although napi_disable() will - * only wait for napi_complete(), this isn't a problem - * since efx_nic_eventq_read_ack() will have no effect if - * interrupts have already been disabled. - */ - if (napi_complete_done(napi, spent)) - efx_nic_eventq_read_ack(channel); - } - - return spent; -} - -/* Create event queue - * Event queue memory allocations are done only once. If the channel - * is reset, the memory buffer will be reused; this guards against - * errors during channel reset and also simplifies interrupt handling. - */ -static int efx_probe_eventq(struct efx_channel *channel) -{ - struct efx_nic *efx = channel->efx; - unsigned long entries; - - netif_dbg(efx, probe, efx->net_dev, - "chan %d create event queue\n", channel->channel); - - /* Build an event queue with room for one event per tx and rx buffer, - * plus some extra for link state events and MCDI completions. */ - entries = roundup_pow_of_two(efx->rxq_entries + efx->txq_entries + 128); - EFX_WARN_ON_PARANOID(entries > EFX_MAX_EVQ_SIZE); - channel->eventq_mask = max(entries, EFX_MIN_EVQ_SIZE) - 1; - - return efx_nic_probe_eventq(channel); -} - -/* Prepare channel's event queue */ -static int efx_init_eventq(struct efx_channel *channel) -{ - struct efx_nic *efx = channel->efx; - int rc; - - EFX_WARN_ON_PARANOID(channel->eventq_init); - - netif_dbg(efx, drv, efx->net_dev, - "chan %d init event queue\n", channel->channel); - - rc = efx_nic_init_eventq(channel); - if (rc == 0) { - efx->type->push_irq_moderation(channel); - channel->eventq_read_ptr = 0; - channel->eventq_init = true; - } - return rc; -} - -/* Enable event queue processing and NAPI */ -void efx_start_eventq(struct efx_channel *channel) -{ - netif_dbg(channel->efx, ifup, channel->efx->net_dev, - "chan %d start event queue\n", channel->channel); - - /* Make sure the NAPI handler sees the enabled flag set */ - channel->enabled = true; - smp_wmb(); - - napi_enable(&channel->napi_str); - efx_nic_eventq_read_ack(channel); -} - -/* Disable event queue processing and NAPI */ -void efx_stop_eventq(struct efx_channel *channel) -{ - if (!channel->enabled) - return; - - napi_disable(&channel->napi_str); - channel->enabled = false; -} - -static void efx_fini_eventq(struct efx_channel *channel) -{ - if (!channel->eventq_init) - return; - - netif_dbg(channel->efx, drv, channel->efx->net_dev, - "chan %d fini event queue\n", channel->channel); - - efx_nic_fini_eventq(channel); - channel->eventq_init = false; -} - -static void efx_remove_eventq(struct efx_channel *channel) -{ - netif_dbg(channel->efx, drv, channel->efx->net_dev, - "chan %d remove event queue\n", channel->channel); - - efx_nic_remove_eventq(channel); -} - -/************************************************************************** - * - * Channel handling - * - *************************************************************************/ - -/* Allocate and initialise a channel structure. */ -static struct efx_channel * -efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel) -{ - struct efx_channel *channel; - struct efx_rx_queue *rx_queue; - struct efx_tx_queue *tx_queue; - int j; - - channel = kzalloc(sizeof(*channel), GFP_KERNEL); - if (!channel) - return NULL; - - channel->efx = efx; - channel->channel = i; - channel->type = &efx_default_channel_type; - - for (j = 0; j < EFX_TXQ_TYPES; j++) { - tx_queue = &channel->tx_queue[j]; - tx_queue->efx = efx; - tx_queue->queue = i * EFX_TXQ_TYPES + j; - tx_queue->channel = channel; - } - -#ifdef CONFIG_RFS_ACCEL - INIT_DELAYED_WORK(&channel->filter_work, efx_filter_rfs_expire); -#endif - - rx_queue = &channel->rx_queue; - rx_queue->efx = efx; - timer_setup(&rx_queue->slow_fill, efx_rx_slow_fill, 0); - - return channel; -} - -/* Allocate and initialise a channel structure, copying parameters - * (but not resources) from an old channel structure. - */ -static struct efx_channel * -efx_copy_channel(const struct efx_channel *old_channel) -{ - struct efx_channel *channel; - struct efx_rx_queue *rx_queue; - struct efx_tx_queue *tx_queue; - int j; - - channel = kmalloc(sizeof(*channel), GFP_KERNEL); - if (!channel) - return NULL; - - *channel = *old_channel; - - channel->napi_dev = NULL; - INIT_HLIST_NODE(&channel->napi_str.napi_hash_node); - channel->napi_str.napi_id = 0; - channel->napi_str.state = 0; - memset(&channel->eventq, 0, sizeof(channel->eventq)); - - for (j = 0; j < EFX_TXQ_TYPES; j++) { - tx_queue = &channel->tx_queue[j]; - if (tx_queue->channel) - tx_queue->channel = channel; - tx_queue->buffer = NULL; - memset(&tx_queue->txd, 0, sizeof(tx_queue->txd)); - } - - rx_queue = &channel->rx_queue; - rx_queue->buffer = NULL; - memset(&rx_queue->rxd, 0, sizeof(rx_queue->rxd)); - timer_setup(&rx_queue->slow_fill, efx_rx_slow_fill, 0); -#ifdef CONFIG_RFS_ACCEL - INIT_DELAYED_WORK(&channel->filter_work, efx_filter_rfs_expire); -#endif - - return channel; -} - -static int efx_probe_channel(struct efx_channel *channel) -{ - struct efx_tx_queue *tx_queue; - struct efx_rx_queue *rx_queue; - int rc; - - netif_dbg(channel->efx, probe, channel->efx->net_dev, - "creating channel %d\n", channel->channel); - - rc = channel->type->pre_probe(channel); - if (rc) - goto fail; - - rc = efx_probe_eventq(channel); - if (rc) - goto fail; - - efx_for_each_channel_tx_queue(tx_queue, channel) { - rc = efx_probe_tx_queue(tx_queue); - if (rc) - goto fail; - } - - efx_for_each_channel_rx_queue(rx_queue, channel) { - rc = efx_probe_rx_queue(rx_queue); - if (rc) - goto fail; - } - - channel->rx_list = NULL; - - return 0; - -fail: - efx_remove_channel(channel); - return rc; -} - -static void -efx_get_channel_name(struct efx_channel *channel, char *buf, size_t len) -{ - struct efx_nic *efx = channel->efx; - const char *type; - int number; - - number = channel->channel; - - if (number >= efx->xdp_channel_offset && - !WARN_ON_ONCE(!efx->n_xdp_channels)) { - type = "-xdp"; - number -= efx->xdp_channel_offset; - } else if (efx->tx_channel_offset == 0) { - type = ""; - } else if (number < efx->tx_channel_offset) { - type = "-rx"; - } else { - type = "-tx"; - number -= efx->tx_channel_offset; - } - snprintf(buf, len, "%s%s-%d", efx->name, type, number); -} - -static void efx_set_channel_names(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - channel->type->get_name(channel, - efx->msi_context[channel->channel].name, - sizeof(efx->msi_context[0].name)); -} - -static int efx_probe_channels(struct efx_nic *efx) -{ - struct efx_channel *channel; - int rc; - - /* Restart special buffer allocation */ - efx->next_buffer_table = 0; - - /* Probe channels in reverse, so that any 'extra' channels - * use the start of the buffer table. This allows the traffic - * channels to be resized without moving them or wasting the - * entries before them. - */ - efx_for_each_channel_rev(channel, efx) { - rc = efx_probe_channel(channel); - if (rc) { - netif_err(efx, probe, efx->net_dev, - "failed to create channel %d\n", - channel->channel); - goto fail; - } - } - efx_set_channel_names(efx); - - return 0; - -fail: - efx_remove_channels(efx); - return rc; -} - -/* Channels are shutdown and reinitialised whilst the NIC is running - * to propagate configuration changes (mtu, checksum offload), or - * to clear hardware error conditions - */ -static void efx_start_datapath(struct efx_nic *efx) -{ - netdev_features_t old_features = efx->net_dev->features; - bool old_rx_scatter = efx->rx_scatter; - struct efx_tx_queue *tx_queue; - struct efx_rx_queue *rx_queue; - struct efx_channel *channel; - size_t rx_buf_len; - - /* Calculate the rx buffer allocation parameters required to - * support the current MTU, including padding for header - * alignment and overruns. - */ - efx->rx_dma_len = (efx->rx_prefix_size + - EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + - efx->type->rx_buffer_padding); - rx_buf_len = (sizeof(struct efx_rx_page_state) + XDP_PACKET_HEADROOM + - efx->rx_ip_align + efx->rx_dma_len); - if (rx_buf_len <= PAGE_SIZE) { - efx->rx_scatter = efx->type->always_rx_scatter; - efx->rx_buffer_order = 0; - } else if (efx->type->can_rx_scatter) { - BUILD_BUG_ON(EFX_RX_USR_BUF_SIZE % L1_CACHE_BYTES); - BUILD_BUG_ON(sizeof(struct efx_rx_page_state) + - 2 * ALIGN(NET_IP_ALIGN + EFX_RX_USR_BUF_SIZE, - EFX_RX_BUF_ALIGNMENT) > - PAGE_SIZE); - efx->rx_scatter = true; - efx->rx_dma_len = EFX_RX_USR_BUF_SIZE; - efx->rx_buffer_order = 0; - } else { - efx->rx_scatter = false; - efx->rx_buffer_order = get_order(rx_buf_len); - } - - efx_rx_config_page_split(efx); - if (efx->rx_buffer_order) - netif_dbg(efx, drv, efx->net_dev, - "RX buf len=%u; page order=%u batch=%u\n", - efx->rx_dma_len, efx->rx_buffer_order, - efx->rx_pages_per_batch); - else - netif_dbg(efx, drv, efx->net_dev, - "RX buf len=%u step=%u bpp=%u; page batch=%u\n", - efx->rx_dma_len, efx->rx_page_buf_step, - efx->rx_bufs_per_page, efx->rx_pages_per_batch); - - /* Restore previously fixed features in hw_features and remove - * features which are fixed now - */ - efx->net_dev->hw_features |= efx->net_dev->features; - efx->net_dev->hw_features &= ~efx->fixed_features; - efx->net_dev->features |= efx->fixed_features; - if (efx->net_dev->features != old_features) - netdev_features_change(efx->net_dev); - - /* RX filters may also have scatter-enabled flags */ - if (efx->rx_scatter != old_rx_scatter) - efx->type->filter_update_rx_scatter(efx); - - /* We must keep at least one descriptor in a TX ring empty. - * We could avoid this when the queue size does not exactly - * match the hardware ring size, but it's not that important. - * Therefore we stop the queue when one more skb might fill - * the ring completely. We wake it when half way back to - * empty. - */ - efx->txq_stop_thresh = efx->txq_entries - efx_tx_max_skb_descs(efx); - efx->txq_wake_thresh = efx->txq_stop_thresh / 2; - - /* Initialise the channels */ - efx_for_each_channel(channel, efx) { - efx_for_each_channel_tx_queue(tx_queue, channel) { - efx_init_tx_queue(tx_queue); - atomic_inc(&efx->active_queues); - } - - efx_for_each_channel_rx_queue(rx_queue, channel) { - efx_init_rx_queue(rx_queue); - atomic_inc(&efx->active_queues); - efx_stop_eventq(channel); - efx_fast_push_rx_descriptors(rx_queue, false); - efx_start_eventq(channel); - } - - WARN_ON(channel->rx_pkt_n_frags); - } - - efx_ptp_start_datapath(efx); - - if (netif_device_present(efx->net_dev)) - netif_tx_wake_all_queues(efx->net_dev); -} - -static void efx_stop_datapath(struct efx_nic *efx) -{ - struct efx_channel *channel; - struct efx_tx_queue *tx_queue; - struct efx_rx_queue *rx_queue; - int rc; - - EFX_ASSERT_RESET_SERIALISED(efx); - BUG_ON(efx->port_enabled); - - efx_ptp_stop_datapath(efx); - - /* Stop RX refill */ - efx_for_each_channel(channel, efx) { - efx_for_each_channel_rx_queue(rx_queue, channel) - rx_queue->refill_enabled = false; - } - - efx_for_each_channel(channel, efx) { - /* RX packet processing is pipelined, so wait for the - * NAPI handler to complete. At least event queue 0 - * might be kept active by non-data events, so don't - * use napi_synchronize() but actually disable NAPI - * temporarily. - */ - if (efx_channel_has_rx_queue(channel)) { - efx_stop_eventq(channel); - efx_start_eventq(channel); - } - } - - rc = efx->type->fini_dmaq(efx); - if (rc) { - netif_err(efx, drv, efx->net_dev, "failed to flush queues\n"); - } else { - netif_dbg(efx, drv, efx->net_dev, - "successfully flushed all queues\n"); - } - - efx_for_each_channel(channel, efx) { - efx_for_each_channel_rx_queue(rx_queue, channel) - efx_fini_rx_queue(rx_queue); - efx_for_each_possible_channel_tx_queue(tx_queue, channel) - efx_fini_tx_queue(tx_queue); - } - efx->xdp_rxq_info_failed = false; -} - -static void efx_remove_channel(struct efx_channel *channel) -{ - struct efx_tx_queue *tx_queue; - struct efx_rx_queue *rx_queue; - - netif_dbg(channel->efx, drv, channel->efx->net_dev, - "destroy chan %d\n", channel->channel); - - efx_for_each_channel_rx_queue(rx_queue, channel) - efx_remove_rx_queue(rx_queue); - efx_for_each_possible_channel_tx_queue(tx_queue, channel) - efx_remove_tx_queue(tx_queue); - efx_remove_eventq(channel); - channel->type->post_remove(channel); -} - -static void efx_remove_channels(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - efx_remove_channel(channel); - - kfree(efx->xdp_tx_queues); -} - -int -efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) -{ - struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel; - u32 old_rxq_entries, old_txq_entries; - unsigned i, next_buffer_table = 0; - int rc, rc2; - - rc = efx_check_disabled(efx); - if (rc) - return rc; - - /* Not all channels should be reallocated. We must avoid - * reallocating their buffer table entries. - */ - efx_for_each_channel(channel, efx) { - struct efx_rx_queue *rx_queue; - struct efx_tx_queue *tx_queue; - - if (channel->type->copy) - continue; - next_buffer_table = max(next_buffer_table, - channel->eventq.index + - channel->eventq.entries); - efx_for_each_channel_rx_queue(rx_queue, channel) - next_buffer_table = max(next_buffer_table, - rx_queue->rxd.index + - rx_queue->rxd.entries); - efx_for_each_channel_tx_queue(tx_queue, channel) - next_buffer_table = max(next_buffer_table, - tx_queue->txd.index + - tx_queue->txd.entries); - } - - efx_device_detach_sync(efx); - efx_stop_all(efx); - efx_soft_disable_interrupts(efx); - - /* Clone channels (where possible) */ - memset(other_channel, 0, sizeof(other_channel)); - for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - if (channel->type->copy) - channel = channel->type->copy(channel); - if (!channel) { - rc = -ENOMEM; - goto out; - } - other_channel[i] = channel; - } - - /* Swap entry counts and channel pointers */ - old_rxq_entries = efx->rxq_entries; - old_txq_entries = efx->txq_entries; - efx->rxq_entries = rxq_entries; - efx->txq_entries = txq_entries; - for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - efx->channel[i] = other_channel[i]; - other_channel[i] = channel; - } - - /* Restart buffer table allocation */ - efx->next_buffer_table = next_buffer_table; - - for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - if (!channel->type->copy) - continue; - rc = efx_probe_channel(channel); - if (rc) - goto rollback; - efx_init_napi_channel(efx->channel[i]); - } - -out: - /* Destroy unused channel structures */ - for (i = 0; i < efx->n_channels; i++) { - channel = other_channel[i]; - if (channel && channel->type->copy) { - efx_fini_napi_channel(channel); - efx_remove_channel(channel); - kfree(channel); - } - } - - rc2 = efx_soft_enable_interrupts(efx); - if (rc2) { - rc = rc ? rc : rc2; - netif_err(efx, drv, efx->net_dev, - "unable to restart interrupts on channel reallocation\n"); - efx_schedule_reset(efx, RESET_TYPE_DISABLE); - } else { - efx_start_all(efx); - efx_device_attach_if_not_resetting(efx); - } - return rc; - -rollback: - /* Swap back */ - efx->rxq_entries = old_rxq_entries; - efx->txq_entries = old_txq_entries; - for (i = 0; i < efx->n_channels; i++) { - channel = efx->channel[i]; - efx->channel[i] = other_channel[i]; - other_channel[i] = channel; - } - goto out; -} - -void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) -{ - mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(10)); -} - -static bool efx_default_channel_want_txqs(struct efx_channel *channel) -{ - return channel->channel - channel->efx->tx_channel_offset < - channel->efx->n_tx_channels; -} - -static const struct efx_channel_type efx_default_channel_type = { - .pre_probe = efx_channel_dummy_op_int, - .post_remove = efx_channel_dummy_op_void, - .get_name = efx_get_channel_name, - .copy = efx_copy_channel, - .want_txqs = efx_default_channel_want_txqs, - .keep_eventq = false, - .want_pio = true, -}; - -int efx_channel_dummy_op_int(struct efx_channel *channel) -{ - return 0; -} - -void efx_channel_dummy_op_void(struct efx_channel *channel) -{ -} - /************************************************************************** * * Port handling * **************************************************************************/ -/* This ensures that the kernel is kept informed (via - * netif_carrier_on/off) of the link status, and also maintains the - * link status's stop on the port's TX queue. - */ -void efx_link_status_changed(struct efx_nic *efx) -{ - struct efx_link_state *link_state = &efx->link_state; - - /* SFC Bug 5356: A net_dev notifier is registered, so we must ensure - * that no events are triggered between unregister_netdev() and the - * driver unloading. A more general condition is that NETDEV_CHANGE - * can only be generated between NETDEV_UP and NETDEV_DOWN */ - if (!netif_running(efx->net_dev)) - return; - - if (link_state->up != netif_carrier_ok(efx->net_dev)) { - efx->n_link_state_changes++; - - if (link_state->up) - netif_carrier_on(efx->net_dev); - else - netif_carrier_off(efx->net_dev); - } - - /* Status message for kernel log */ - if (link_state->up) - netif_info(efx, link, efx->net_dev, - "link up at %uMbps %s-duplex (MTU %d)\n", - link_state->speed, link_state->fd ? "full" : "half", - efx->net_dev->mtu); - else - netif_info(efx, link, efx->net_dev, "link down\n"); -} - -void efx_link_set_advertising(struct efx_nic *efx, - const unsigned long *advertising) -{ - memcpy(efx->link_advertising, advertising, - sizeof(__ETHTOOL_DECLARE_LINK_MODE_MASK())); - - efx->link_advertising[0] |= ADVERTISED_Autoneg; - if (advertising[0] & ADVERTISED_Pause) - efx->wanted_fc |= (EFX_FC_TX | EFX_FC_RX); - else - efx->wanted_fc &= ~(EFX_FC_TX | EFX_FC_RX); - if (advertising[0] & ADVERTISED_Asym_Pause) - efx->wanted_fc ^= EFX_FC_TX; -} - /* Equivalent to efx_link_set_advertising with all-zeroes, except does not * force the Autoneg bit on. */ @@ -1035,73 +160,6 @@ void efx_link_set_wanted_fc(struct efx_nic *efx, u8 wanted_fc) static void efx_fini_port(struct efx_nic *efx); -/* We assume that efx->type->reconfigure_mac will always try to sync RX - * filters and therefore needs to read-lock the filter table against freeing - */ -void efx_mac_reconfigure(struct efx_nic *efx) -{ - down_read(&efx->filter_sem); - efx->type->reconfigure_mac(efx); - up_read(&efx->filter_sem); -} - -/* Push loopback/power/transmit disable settings to the PHY, and reconfigure - * the MAC appropriately. All other PHY configuration changes are pushed - * through phy_op->set_settings(), and pushed asynchronously to the MAC - * through efx_monitor(). - * - * Callers must hold the mac_lock - */ -int __efx_reconfigure_port(struct efx_nic *efx) -{ - enum efx_phy_mode phy_mode; - int rc; - - WARN_ON(!mutex_is_locked(&efx->mac_lock)); - - /* Disable PHY transmit in mac level loopbacks */ - phy_mode = efx->phy_mode; - if (LOOPBACK_INTERNAL(efx)) - efx->phy_mode |= PHY_MODE_TX_DISABLED; - else - efx->phy_mode &= ~PHY_MODE_TX_DISABLED; - - rc = efx->type->reconfigure_port(efx); - - if (rc) - efx->phy_mode = phy_mode; - - return rc; -} - -/* Reinitialise the MAC to pick up new PHY settings, even if the port is - * disabled. */ -int efx_reconfigure_port(struct efx_nic *efx) -{ - int rc; - - EFX_ASSERT_RESET_SERIALISED(efx); - - mutex_lock(&efx->mac_lock); - rc = __efx_reconfigure_port(efx); - mutex_unlock(&efx->mac_lock); - - return rc; -} - -/* Asynchronous work item for changing MAC promiscuity and multicast - * hash. Avoid a drain/rx_ingress enable by reconfiguring the current - * MAC directly. */ -static void efx_mac_work(struct work_struct *data) -{ - struct efx_nic *efx = container_of(data, struct efx_nic, mac_work); - - mutex_lock(&efx->mac_lock); - if (efx->port_enabled) - efx_mac_reconfigure(efx); - mutex_unlock(&efx->mac_lock); -} - static int efx_probe_port(struct efx_nic *efx) { int rc; @@ -1155,44 +213,6 @@ fail1: return rc; } -static void efx_start_port(struct efx_nic *efx) -{ - netif_dbg(efx, ifup, efx->net_dev, "start port\n"); - BUG_ON(efx->port_enabled); - - mutex_lock(&efx->mac_lock); - efx->port_enabled = true; - - /* Ensure MAC ingress/egress is enabled */ - efx_mac_reconfigure(efx); - - mutex_unlock(&efx->mac_lock); -} - -/* Cancel work for MAC reconfiguration, periodic hardware monitoring - * and the async self-test, wait for them to finish and prevent them - * being scheduled again. This doesn't cover online resets, which - * should only be cancelled when removing the device. - */ -static void efx_stop_port(struct efx_nic *efx) -{ - netif_dbg(efx, ifdown, efx->net_dev, "stop port\n"); - - EFX_ASSERT_RESET_SERIALISED(efx); - - mutex_lock(&efx->mac_lock); - efx->port_enabled = false; - mutex_unlock(&efx->mac_lock); - - /* Serialise against efx_set_multicast_list() */ - netif_addr_lock_bh(efx->net_dev); - netif_addr_unlock_bh(efx->net_dev); - - cancel_delayed_work_sync(&efx->monitor_work); - efx_selftest_async_cancel(efx); - cancel_work_sync(&efx->mac_work); -} - static void efx_fini_port(struct efx_nic *efx) { netif_dbg(efx, drv, efx->net_dev, "shut down port\n"); @@ -1291,582 +311,6 @@ static void efx_dissociate(struct efx_nic *efx) } } -/* This configures the PCI device to enable I/O and DMA. */ -static int efx_init_io(struct efx_nic *efx) -{ - struct pci_dev *pci_dev = efx->pci_dev; - dma_addr_t dma_mask = efx->type->max_dma_mask; - unsigned int mem_map_size = efx->type->mem_map_size(efx); - int rc, bar; - - netif_dbg(efx, probe, efx->net_dev, "initialising I/O\n"); - - bar = efx->type->mem_bar(efx); - - rc = pci_enable_device(pci_dev); - if (rc) { - netif_err(efx, probe, efx->net_dev, - "failed to enable PCI device\n"); - goto fail1; - } - - pci_set_master(pci_dev); - - /* Set the PCI DMA mask. Try all possibilities from our genuine mask - * down to 32 bits, because some architectures will allow 40 bit - * masks event though they reject 46 bit masks. - */ - while (dma_mask > 0x7fffffffUL) { - rc = dma_set_mask_and_coherent(&pci_dev->dev, dma_mask); - if (rc == 0) - break; - dma_mask >>= 1; - } - if (rc) { - netif_err(efx, probe, efx->net_dev, - "could not find a suitable DMA mask\n"); - goto fail2; - } - netif_dbg(efx, probe, efx->net_dev, - "using DMA mask %llx\n", (unsigned long long) dma_mask); - - efx->membase_phys = pci_resource_start(efx->pci_dev, bar); - rc = pci_request_region(pci_dev, bar, "sfc"); - if (rc) { - netif_err(efx, probe, efx->net_dev, - "request for memory BAR failed\n"); - rc = -EIO; - goto fail3; - } - efx->membase = ioremap_nocache(efx->membase_phys, mem_map_size); - if (!efx->membase) { - netif_err(efx, probe, efx->net_dev, - "could not map memory BAR at %llx+%x\n", - (unsigned long long)efx->membase_phys, mem_map_size); - rc = -ENOMEM; - goto fail4; - } - netif_dbg(efx, probe, efx->net_dev, - "memory BAR at %llx+%x (virtual %p)\n", - (unsigned long long)efx->membase_phys, mem_map_size, - efx->membase); - - return 0; - - fail4: - pci_release_region(efx->pci_dev, bar); - fail3: - efx->membase_phys = 0; - fail2: - pci_disable_device(efx->pci_dev); - fail1: - return rc; -} - -static void efx_fini_io(struct efx_nic *efx) -{ - int bar; - - netif_dbg(efx, drv, efx->net_dev, "shutting down I/O\n"); - - if (efx->membase) { - iounmap(efx->membase); - efx->membase = NULL; - } - - if (efx->membase_phys) { - bar = efx->type->mem_bar(efx); - pci_release_region(efx->pci_dev, bar); - efx->membase_phys = 0; - } - - /* Don't disable bus-mastering if VFs are assigned */ - if (!pci_vfs_assigned(efx->pci_dev)) - pci_disable_device(efx->pci_dev); -} - -void efx_set_default_rx_indir_table(struct efx_nic *efx, - struct efx_rss_context *ctx) -{ - size_t i; - - for (i = 0; i < ARRAY_SIZE(ctx->rx_indir_table); i++) - ctx->rx_indir_table[i] = - ethtool_rxfh_indir_default(i, efx->rss_spread); -} - -static unsigned int efx_wanted_parallelism(struct efx_nic *efx) -{ - cpumask_var_t thread_mask; - unsigned int count; - int cpu; - - if (rss_cpus) { - count = rss_cpus; - } else { - if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) { - netif_warn(efx, probe, efx->net_dev, - "RSS disabled due to allocation failure\n"); - return 1; - } - - count = 0; - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, thread_mask)) { - ++count; - cpumask_or(thread_mask, thread_mask, - topology_sibling_cpumask(cpu)); - } - } - - free_cpumask_var(thread_mask); - } - - if (count > EFX_MAX_RX_QUEUES) { - netif_cond_dbg(efx, probe, efx->net_dev, !rss_cpus, warn, - "Reducing number of rx queues from %u to %u.\n", - count, EFX_MAX_RX_QUEUES); - count = EFX_MAX_RX_QUEUES; - } - - /* If RSS is requested for the PF *and* VFs then we can't write RSS - * table entries that are inaccessible to VFs - */ -#ifdef CONFIG_SFC_SRIOV - if (efx->type->sriov_wanted) { - if (efx->type->sriov_wanted(efx) && efx_vf_size(efx) > 1 && - count > efx_vf_size(efx)) { - netif_warn(efx, probe, efx->net_dev, - "Reducing number of RSS channels from %u to %u for " - "VF support. Increase vf-msix-limit to use more " - "channels on the PF.\n", - count, efx_vf_size(efx)); - count = efx_vf_size(efx); - } - } -#endif - - return count; -} - -static int efx_allocate_msix_channels(struct efx_nic *efx, - unsigned int max_channels, - unsigned int extra_channels, - unsigned int parallelism) -{ - unsigned int n_channels = parallelism; - int vec_count; - int n_xdp_tx; - int n_xdp_ev; - - if (efx_separate_tx_channels) - n_channels *= 2; - n_channels += extra_channels; - - /* To allow XDP transmit to happen from arbitrary NAPI contexts - * we allocate a TX queue per CPU. We share event queues across - * multiple tx queues, assuming tx and ev queues are both - * maximum size. - */ - - n_xdp_tx = num_possible_cpus(); - n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, EFX_TXQ_TYPES); - - vec_count = pci_msix_vec_count(efx->pci_dev); - if (vec_count < 0) - return vec_count; - - max_channels = min_t(unsigned int, vec_count, max_channels); - - /* Check resources. - * We need a channel per event queue, plus a VI per tx queue. - * This may be more pessimistic than it needs to be. - */ - if (n_channels + n_xdp_ev > max_channels) { - netif_err(efx, drv, efx->net_dev, - "Insufficient resources for %d XDP event queues (%d other channels, max %d)\n", - n_xdp_ev, n_channels, max_channels); - efx->n_xdp_channels = 0; - efx->xdp_tx_per_channel = 0; - efx->xdp_tx_queue_count = 0; - } else { - efx->n_xdp_channels = n_xdp_ev; - efx->xdp_tx_per_channel = EFX_TXQ_TYPES; - efx->xdp_tx_queue_count = n_xdp_tx; - n_channels += n_xdp_ev; - netif_dbg(efx, drv, efx->net_dev, - "Allocating %d TX and %d event queues for XDP\n", - n_xdp_tx, n_xdp_ev); - } - - if (vec_count < n_channels) { - netif_err(efx, drv, efx->net_dev, - "WARNING: Insufficient MSI-X vectors available (%d < %u).\n", - vec_count, n_channels); - netif_err(efx, drv, efx->net_dev, - "WARNING: Performance may be reduced.\n"); - n_channels = vec_count; - } - - n_channels = min(n_channels, max_channels); - - efx->n_channels = n_channels; - - /* Ignore XDP tx channels when creating rx channels. */ - n_channels -= efx->n_xdp_channels; - - if (efx_separate_tx_channels) { - efx->n_tx_channels = - min(max(n_channels / 2, 1U), - efx->max_tx_channels); - efx->tx_channel_offset = - n_channels - efx->n_tx_channels; - efx->n_rx_channels = - max(n_channels - - efx->n_tx_channels, 1U); - } else { - efx->n_tx_channels = min(n_channels, efx->max_tx_channels); - efx->tx_channel_offset = 0; - efx->n_rx_channels = n_channels; - } - - efx->n_rx_channels = min(efx->n_rx_channels, parallelism); - efx->n_tx_channels = min(efx->n_tx_channels, parallelism); - - efx->xdp_channel_offset = n_channels; - - netif_dbg(efx, drv, efx->net_dev, - "Allocating %u RX channels\n", - efx->n_rx_channels); - - return efx->n_channels; -} - -/* Probe the number and type of interrupts we are able to obtain, and - * the resulting numbers of channels and RX queues. - */ -static int efx_probe_interrupts(struct efx_nic *efx) -{ - unsigned int extra_channels = 0; - unsigned int rss_spread; - unsigned int i, j; - int rc; - - for (i = 0; i < EFX_MAX_EXTRA_CHANNELS; i++) - if (efx->extra_channel_type[i]) - ++extra_channels; - - if (efx->interrupt_mode == EFX_INT_MODE_MSIX) { - unsigned int parallelism = efx_wanted_parallelism(efx); - struct msix_entry xentries[EFX_MAX_CHANNELS]; - unsigned int n_channels; - - rc = efx_allocate_msix_channels(efx, efx->max_channels, - extra_channels, parallelism); - if (rc >= 0) { - n_channels = rc; - for (i = 0; i < n_channels; i++) - xentries[i].entry = i; - rc = pci_enable_msix_range(efx->pci_dev, xentries, 1, - n_channels); - } - if (rc < 0) { - /* Fall back to single channel MSI */ - netif_err(efx, drv, efx->net_dev, - "could not enable MSI-X\n"); - if (efx->type->min_interrupt_mode >= EFX_INT_MODE_MSI) - efx->interrupt_mode = EFX_INT_MODE_MSI; - else - return rc; - } else if (rc < n_channels) { - netif_err(efx, drv, efx->net_dev, - "WARNING: Insufficient MSI-X vectors" - " available (%d < %u).\n", rc, n_channels); - netif_err(efx, drv, efx->net_dev, - "WARNING: Performance may be reduced.\n"); - n_channels = rc; - } - - if (rc > 0) { - for (i = 0; i < efx->n_channels; i++) - efx_get_channel(efx, i)->irq = - xentries[i].vector; - } - } - - /* Try single interrupt MSI */ - if (efx->interrupt_mode == EFX_INT_MODE_MSI) { - efx->n_channels = 1; - efx->n_rx_channels = 1; - efx->n_tx_channels = 1; - efx->n_xdp_channels = 0; - efx->xdp_channel_offset = efx->n_channels; - rc = pci_enable_msi(efx->pci_dev); - if (rc == 0) { - efx_get_channel(efx, 0)->irq = efx->pci_dev->irq; - } else { - netif_err(efx, drv, efx->net_dev, - "could not enable MSI\n"); - if (efx->type->min_interrupt_mode >= EFX_INT_MODE_LEGACY) - efx->interrupt_mode = EFX_INT_MODE_LEGACY; - else - return rc; - } - } - - /* Assume legacy interrupts */ - if (efx->interrupt_mode == EFX_INT_MODE_LEGACY) { - efx->n_channels = 1 + (efx_separate_tx_channels ? 1 : 0); - efx->n_rx_channels = 1; - efx->n_tx_channels = 1; - efx->n_xdp_channels = 0; - efx->xdp_channel_offset = efx->n_channels; - efx->legacy_irq = efx->pci_dev->irq; - } - - /* Assign extra channels if possible, before XDP channels */ - efx->n_extra_tx_channels = 0; - j = efx->xdp_channel_offset; - for (i = 0; i < EFX_MAX_EXTRA_CHANNELS; i++) { - if (!efx->extra_channel_type[i]) - continue; - if (j <= efx->tx_channel_offset + efx->n_tx_channels) { - efx->extra_channel_type[i]->handle_no_channel(efx); - } else { - --j; - efx_get_channel(efx, j)->type = - efx->extra_channel_type[i]; - if (efx_channel_has_tx_queues(efx_get_channel(efx, j))) - efx->n_extra_tx_channels++; - } - } - - rss_spread = efx->n_rx_channels; - /* RSS might be usable on VFs even if it is disabled on the PF */ -#ifdef CONFIG_SFC_SRIOV - if (efx->type->sriov_wanted) { - efx->rss_spread = ((rss_spread > 1 || - !efx->type->sriov_wanted(efx)) ? - rss_spread : efx_vf_size(efx)); - return 0; - } -#endif - efx->rss_spread = rss_spread; - - return 0; -} - -#if defined(CONFIG_SMP) -static void efx_set_interrupt_affinity(struct efx_nic *efx) -{ - struct efx_channel *channel; - unsigned int cpu; - - efx_for_each_channel(channel, efx) { - cpu = cpumask_local_spread(channel->channel, - pcibus_to_node(efx->pci_dev->bus)); - irq_set_affinity_hint(channel->irq, cpumask_of(cpu)); - } -} - -static void efx_clear_interrupt_affinity(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - irq_set_affinity_hint(channel->irq, NULL); -} -#else -static void -efx_set_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused))) -{ -} - -static void -efx_clear_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused))) -{ -} -#endif /* CONFIG_SMP */ - -static int efx_soft_enable_interrupts(struct efx_nic *efx) -{ - struct efx_channel *channel, *end_channel; - int rc; - - BUG_ON(efx->state == STATE_DISABLED); - - efx->irq_soft_enabled = true; - smp_wmb(); - - efx_for_each_channel(channel, efx) { - if (!channel->type->keep_eventq) { - rc = efx_init_eventq(channel); - if (rc) - goto fail; - } - efx_start_eventq(channel); - } - - efx_mcdi_mode_event(efx); - - return 0; -fail: - end_channel = channel; - efx_for_each_channel(channel, efx) { - if (channel == end_channel) - break; - efx_stop_eventq(channel); - if (!channel->type->keep_eventq) - efx_fini_eventq(channel); - } - - return rc; -} - -static void efx_soft_disable_interrupts(struct efx_nic *efx) -{ - struct efx_channel *channel; - - if (efx->state == STATE_DISABLED) - return; - - efx_mcdi_mode_poll(efx); - - efx->irq_soft_enabled = false; - smp_wmb(); - - if (efx->legacy_irq) - synchronize_irq(efx->legacy_irq); - - efx_for_each_channel(channel, efx) { - if (channel->irq) - synchronize_irq(channel->irq); - - efx_stop_eventq(channel); - if (!channel->type->keep_eventq) - efx_fini_eventq(channel); - } - - /* Flush the asynchronous MCDI request queue */ - efx_mcdi_flush_async(efx); -} - -static int efx_enable_interrupts(struct efx_nic *efx) -{ - struct efx_channel *channel, *end_channel; - int rc; - - BUG_ON(efx->state == STATE_DISABLED); - - if (efx->eeh_disabled_legacy_irq) { - enable_irq(efx->legacy_irq); - efx->eeh_disabled_legacy_irq = false; - } - - efx->type->irq_enable_master(efx); - - efx_for_each_channel(channel, efx) { - if (channel->type->keep_eventq) { - rc = efx_init_eventq(channel); - if (rc) - goto fail; - } - } - - rc = efx_soft_enable_interrupts(efx); - if (rc) - goto fail; - - return 0; - -fail: - end_channel = channel; - efx_for_each_channel(channel, efx) { - if (channel == end_channel) - break; - if (channel->type->keep_eventq) - efx_fini_eventq(channel); - } - - efx->type->irq_disable_non_ev(efx); - - return rc; -} - -static void efx_disable_interrupts(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_soft_disable_interrupts(efx); - - efx_for_each_channel(channel, efx) { - if (channel->type->keep_eventq) - efx_fini_eventq(channel); - } - - efx->type->irq_disable_non_ev(efx); -} - -static void efx_remove_interrupts(struct efx_nic *efx) -{ - struct efx_channel *channel; - - /* Remove MSI/MSI-X interrupts */ - efx_for_each_channel(channel, efx) - channel->irq = 0; - pci_disable_msi(efx->pci_dev); - pci_disable_msix(efx->pci_dev); - - /* Remove legacy interrupt */ - efx->legacy_irq = 0; -} - -static int efx_set_channels(struct efx_nic *efx) -{ - struct efx_channel *channel; - struct efx_tx_queue *tx_queue; - int xdp_queue_number; - - efx->tx_channel_offset = - efx_separate_tx_channels ? - efx->n_channels - efx->n_tx_channels : 0; - - if (efx->xdp_tx_queue_count) { - EFX_WARN_ON_PARANOID(efx->xdp_tx_queues); - - /* Allocate array for XDP TX queue lookup. */ - efx->xdp_tx_queues = kcalloc(efx->xdp_tx_queue_count, - sizeof(*efx->xdp_tx_queues), - GFP_KERNEL); - if (!efx->xdp_tx_queues) - return -ENOMEM; - } - - /* We need to mark which channels really have RX and TX - * queues, and adjust the TX queue numbers if we have separate - * RX-only and TX-only channels. - */ - xdp_queue_number = 0; - efx_for_each_channel(channel, efx) { - if (channel->channel < efx->n_rx_channels) - channel->rx_queue.core_index = channel->channel; - else - channel->rx_queue.core_index = -1; - - efx_for_each_channel_tx_queue(tx_queue, channel) { - tx_queue->queue -= (efx->tx_channel_offset * - EFX_TXQ_TYPES); - - if (efx_channel_is_xdp_tx(channel) && - xdp_queue_number < efx->xdp_tx_queue_count) { - efx->xdp_tx_queues[xdp_queue_number] = tx_queue; - xdp_queue_number++; - } - } - } - return 0; -} - static int efx_probe_nic(struct efx_nic *efx) { int rc; @@ -1939,70 +383,6 @@ static void efx_remove_nic(struct efx_nic *efx) efx->type->remove(efx); } -static int efx_probe_filters(struct efx_nic *efx) -{ - int rc; - - init_rwsem(&efx->filter_sem); - mutex_lock(&efx->mac_lock); - down_write(&efx->filter_sem); - rc = efx->type->filter_table_probe(efx); - if (rc) - goto out_unlock; - -#ifdef CONFIG_RFS_ACCEL - if (efx->type->offload_features & NETIF_F_NTUPLE) { - struct efx_channel *channel; - int i, success = 1; - - efx_for_each_channel(channel, efx) { - channel->rps_flow_id = - kcalloc(efx->type->max_rx_ip_filters, - sizeof(*channel->rps_flow_id), - GFP_KERNEL); - if (!channel->rps_flow_id) - success = 0; - else - for (i = 0; - i < efx->type->max_rx_ip_filters; - ++i) - channel->rps_flow_id[i] = - RPS_FLOW_ID_INVALID; - channel->rfs_expire_index = 0; - channel->rfs_filter_count = 0; - } - - if (!success) { - efx_for_each_channel(channel, efx) - kfree(channel->rps_flow_id); - efx->type->filter_table_remove(efx); - rc = -ENOMEM; - goto out_unlock; - } - } -#endif -out_unlock: - up_write(&efx->filter_sem); - mutex_unlock(&efx->mac_lock); - return rc; -} - -static void efx_remove_filters(struct efx_nic *efx) -{ -#ifdef CONFIG_RFS_ACCEL - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) { - cancel_delayed_work_sync(&channel->filter_work); - kfree(channel->rps_flow_id); - } -#endif - down_write(&efx->filter_sem); - efx->type->filter_table_remove(efx); - up_write(&efx->filter_sem); -} - - /************************************************************************** * * NIC startup/shutdown @@ -2067,81 +447,6 @@ static int efx_probe_all(struct efx_nic *efx) return rc; } -/* If the interface is supposed to be running but is not, start - * the hardware and software data path, regular activity for the port - * (MAC statistics, link polling, etc.) and schedule the port to be - * reconfigured. Interrupts must already be enabled. This function - * is safe to call multiple times, so long as the NIC is not disabled. - * Requires the RTNL lock. - */ -static void efx_start_all(struct efx_nic *efx) -{ - EFX_ASSERT_RESET_SERIALISED(efx); - BUG_ON(efx->state == STATE_DISABLED); - - /* Check that it is appropriate to restart the interface. All - * of these flags are safe to read under just the rtnl lock */ - if (efx->port_enabled || !netif_running(efx->net_dev) || - efx->reset_pending) - return; - - efx_start_port(efx); - efx_start_datapath(efx); - - /* Start the hardware monitor if there is one */ - if (efx->type->monitor != NULL) - queue_delayed_work(efx->workqueue, &efx->monitor_work, - efx_monitor_interval); - - /* Link state detection is normally event-driven; we have - * to poll now because we could have missed a change - */ - mutex_lock(&efx->mac_lock); - if (efx->phy_op->poll(efx)) - efx_link_status_changed(efx); - mutex_unlock(&efx->mac_lock); - - efx->type->start_stats(efx); - efx->type->pull_stats(efx); - spin_lock_bh(&efx->stats_lock); - efx->type->update_stats(efx, NULL, NULL); - spin_unlock_bh(&efx->stats_lock); -} - -/* Quiesce the hardware and software data path, and regular activity - * for the port without bringing the link down. Safe to call multiple - * times with the NIC in almost any state, but interrupts should be - * enabled. Requires the RTNL lock. - */ -static void efx_stop_all(struct efx_nic *efx) -{ - EFX_ASSERT_RESET_SERIALISED(efx); - - /* port_enabled can be read safely under the rtnl lock */ - if (!efx->port_enabled) - return; - - /* update stats before we go down so we can accurately count - * rx_nodesc_drops - */ - efx->type->pull_stats(efx); - spin_lock_bh(&efx->stats_lock); - efx->type->update_stats(efx, NULL, NULL); - spin_unlock_bh(&efx->stats_lock); - efx->type->stop_stats(efx); - efx_stop_port(efx); - - /* Stop the kernel transmit interface. This is only valid if - * the device is stopped or detached; otherwise the watchdog - * may fire immediately. - */ - WARN_ON(netif_running(efx->net_dev) && - netif_device_present(efx->net_dev)); - netif_tx_disable(efx->net_dev); - - efx_stop_datapath(efx); -} - static void efx_remove_all(struct efx_nic *efx) { rtnl_lock(); @@ -2237,36 +542,6 @@ void efx_get_irq_moderation(struct efx_nic *efx, unsigned int *tx_usecs, /************************************************************************** * - * Hardware monitor - * - **************************************************************************/ - -/* Run periodically off the general workqueue */ -static void efx_monitor(struct work_struct *data) -{ - struct efx_nic *efx = container_of(data, struct efx_nic, - monitor_work.work); - - netif_vdbg(efx, timer, efx->net_dev, - "hardware monitor executing on CPU %d\n", - raw_smp_processor_id()); - BUG_ON(efx->type->monitor == NULL); - - /* If the mac_lock is already held then it is likely a port - * reconfiguration is already in place, which will likely do - * most of the work of monitor() anyway. */ - if (mutex_trylock(&efx->mac_lock)) { - if (efx->port_enabled) - efx->type->monitor(efx); - mutex_unlock(&efx->mac_lock); - } - - queue_delayed_work(efx->workqueue, &efx->monitor_work, - efx_monitor_interval); -} - -/************************************************************************** - * * ioctls * *************************************************************************/ @@ -2294,45 +569,6 @@ static int efx_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd) /************************************************************************** * - * NAPI interface - * - **************************************************************************/ - -static void efx_init_napi_channel(struct efx_channel *channel) -{ - struct efx_nic *efx = channel->efx; - - channel->napi_dev = efx->net_dev; - netif_napi_add(channel->napi_dev, &channel->napi_str, - efx_poll, napi_weight); -} - -static void efx_init_napi(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - efx_init_napi_channel(channel); -} - -static void efx_fini_napi_channel(struct efx_channel *channel) -{ - if (channel->napi_dev) - netif_napi_del(&channel->napi_str); - - channel->napi_dev = NULL; -} - -static void efx_fini_napi(struct efx_nic *efx) -{ - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - efx_fini_napi_channel(channel); -} - -/************************************************************************** - * * Kernel net device interface * *************************************************************************/ @@ -2382,17 +618,6 @@ int efx_net_stop(struct net_device *net_dev) return 0; } -/* Context: process, dev_base_lock or RTNL held, non-blocking. */ -static void efx_net_stats(struct net_device *net_dev, - struct rtnl_link_stats64 *stats) -{ - struct efx_nic *efx = netdev_priv(net_dev); - - spin_lock_bh(&efx->stats_lock); - efx->type->update_stats(efx, NULL, stats); - spin_unlock_bh(&efx->stats_lock); -} - /* Context: netif_tx_lock held, BHs disabled. */ static void efx_watchdog(struct net_device *net_dev, unsigned int txqueue) { @@ -2405,51 +630,6 @@ static void efx_watchdog(struct net_device *net_dev, unsigned int txqueue) efx_schedule_reset(efx, RESET_TYPE_TX_WATCHDOG); } -static unsigned int efx_xdp_max_mtu(struct efx_nic *efx) -{ - /* The maximum MTU that we can fit in a single page, allowing for - * framing, overhead and XDP headroom. - */ - int overhead = EFX_MAX_FRAME_LEN(0) + sizeof(struct efx_rx_page_state) + - efx->rx_prefix_size + efx->type->rx_buffer_padding + - efx->rx_ip_align + XDP_PACKET_HEADROOM; - - return PAGE_SIZE - overhead; -} - -/* Context: process, rtnl_lock() held. */ -static int efx_change_mtu(struct net_device *net_dev, int new_mtu) -{ - struct efx_nic *efx = netdev_priv(net_dev); - int rc; - - rc = efx_check_disabled(efx); - if (rc) - return rc; - - if (rtnl_dereference(efx->xdp_prog) && - new_mtu > efx_xdp_max_mtu(efx)) { - netif_err(efx, drv, efx->net_dev, - "Requested MTU of %d too big for XDP (max: %d)\n", - new_mtu, efx_xdp_max_mtu(efx)); - return -EINVAL; - } - - netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu); - - efx_device_detach_sync(efx); - efx_stop_all(efx); - - mutex_lock(&efx->mac_lock); - net_dev->mtu = new_mtu; - efx_mac_reconfigure(efx); - mutex_unlock(&efx->mac_lock); - - efx_start_all(efx); - efx_device_attach_if_not_resetting(efx); - return 0; -} - static int efx_set_mac_address(struct net_device *net_dev, void *data) { struct efx_nic *efx = netdev_priv(net_dev); @@ -2726,28 +906,6 @@ show_phy_type(struct device *dev, struct device_attribute *attr, char *buf) } static DEVICE_ATTR(phy_type, 0444, show_phy_type, NULL); -#ifdef CONFIG_SFC_MCDI_LOGGING -static ssize_t show_mcdi_log(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct efx_nic *efx = dev_get_drvdata(dev); - struct efx_mcdi_iface *mcdi = efx_mcdi(efx); - - return scnprintf(buf, PAGE_SIZE, "%d\n", mcdi->logging_enabled); -} -static ssize_t set_mcdi_log(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct efx_nic *efx = dev_get_drvdata(dev); - struct efx_mcdi_iface *mcdi = efx_mcdi(efx); - bool enable = count > 0 && *buf != '0'; - - mcdi->logging_enabled = enable; - return count; -} -static DEVICE_ATTR(mcdi_logging, 0644, show_mcdi_log, set_mcdi_log); -#endif - static int efx_register_netdev(struct efx_nic *efx) { struct net_device *net_dev = efx->net_dev; @@ -2807,21 +965,11 @@ static int efx_register_netdev(struct efx_nic *efx) "failed to init net dev attributes\n"); goto fail_registered; } -#ifdef CONFIG_SFC_MCDI_LOGGING - rc = device_create_file(&efx->pci_dev->dev, &dev_attr_mcdi_logging); - if (rc) { - netif_err(efx, drv, efx->net_dev, - "failed to init net dev attributes\n"); - goto fail_attr_mcdi_logging; - } -#endif + + efx_init_mcdi_logging(efx); return 0; -#ifdef CONFIG_SFC_MCDI_LOGGING -fail_attr_mcdi_logging: - device_remove_file(&efx->pci_dev->dev, &dev_attr_phy_type); -#endif fail_registered: rtnl_lock(); efx_dissociate(efx); @@ -2842,9 +990,7 @@ static void efx_unregister_netdev(struct efx_nic *efx) if (efx_dev_registered(efx)) { strlcpy(efx->name, pci_name(efx->pci_dev), sizeof(efx->name)); -#ifdef CONFIG_SFC_MCDI_LOGGING - device_remove_file(&efx->pci_dev->dev, &dev_attr_mcdi_logging); -#endif + efx_fini_mcdi_logging(efx); device_remove_file(&efx->pci_dev->dev, &dev_attr_phy_type); unregister_netdev(efx->net_dev); } @@ -2852,292 +998,6 @@ static void efx_unregister_netdev(struct efx_nic *efx) /************************************************************************** * - * Device reset and suspend - * - **************************************************************************/ - -/* Tears down the entire software state and most of the hardware state - * before reset. */ -void efx_reset_down(struct efx_nic *efx, enum reset_type method) -{ - EFX_ASSERT_RESET_SERIALISED(efx); - - if (method == RESET_TYPE_MCDI_TIMEOUT) - efx->type->prepare_flr(efx); - - efx_stop_all(efx); - efx_disable_interrupts(efx); - - mutex_lock(&efx->mac_lock); - down_write(&efx->filter_sem); - mutex_lock(&efx->rss_lock); - if (efx->port_initialized && method != RESET_TYPE_INVISIBLE && - method != RESET_TYPE_DATAPATH) - efx->phy_op->fini(efx); - efx->type->fini(efx); -} - -/* This function will always ensure that the locks acquired in - * efx_reset_down() are released. A failure return code indicates - * that we were unable to reinitialise the hardware, and the - * driver should be disabled. If ok is false, then the rx and tx - * engines are not restarted, pending a RESET_DISABLE. */ -int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok) -{ - int rc; - - EFX_ASSERT_RESET_SERIALISED(efx); - - if (method == RESET_TYPE_MCDI_TIMEOUT) - efx->type->finish_flr(efx); - - /* Ensure that SRAM is initialised even if we're disabling the device */ - rc = efx->type->init(efx); - if (rc) { - netif_err(efx, drv, efx->net_dev, "failed to initialise NIC\n"); - goto fail; - } - - if (!ok) - goto fail; - - if (efx->port_initialized && method != RESET_TYPE_INVISIBLE && - method != RESET_TYPE_DATAPATH) { - rc = efx->phy_op->init(efx); - if (rc) - goto fail; - rc = efx->phy_op->reconfigure(efx); - if (rc && rc != -EPERM) - netif_err(efx, drv, efx->net_dev, - "could not restore PHY settings\n"); - } - - rc = efx_enable_interrupts(efx); - if (rc) - goto fail; - -#ifdef CONFIG_SFC_SRIOV - rc = efx->type->vswitching_restore(efx); - if (rc) /* not fatal; the PF will still work fine */ - netif_warn(efx, probe, efx->net_dev, - "failed to restore vswitching rc=%d;" - " VFs may not function\n", rc); -#endif - - if (efx->type->rx_restore_rss_contexts) - efx->type->rx_restore_rss_contexts(efx); - mutex_unlock(&efx->rss_lock); - efx->type->filter_table_restore(efx); - up_write(&efx->filter_sem); - if (efx->type->sriov_reset) - efx->type->sriov_reset(efx); - - mutex_unlock(&efx->mac_lock); - - efx_start_all(efx); - - if (efx->type->udp_tnl_push_ports) - efx->type->udp_tnl_push_ports(efx); - - return 0; - -fail: - efx->port_initialized = false; - - mutex_unlock(&efx->rss_lock); - up_write(&efx->filter_sem); - mutex_unlock(&efx->mac_lock); - - return rc; -} - -/* Reset the NIC using the specified method. Note that the reset may - * fail, in which case the card will be left in an unusable state. - * - * Caller must hold the rtnl_lock. - */ -int efx_reset(struct efx_nic *efx, enum reset_type method) -{ - int rc, rc2; - bool disabled; - - netif_info(efx, drv, efx->net_dev, "resetting (%s)\n", - RESET_TYPE(method)); - - efx_device_detach_sync(efx); - efx_reset_down(efx, method); - - rc = efx->type->reset(efx, method); - if (rc) { - netif_err(efx, drv, efx->net_dev, "failed to reset hardware\n"); - goto out; - } - - /* Clear flags for the scopes we covered. We assume the NIC and - * driver are now quiescent so that there is no race here. - */ - if (method < RESET_TYPE_MAX_METHOD) - efx->reset_pending &= -(1 << (method + 1)); - else /* it doesn't fit into the well-ordered scope hierarchy */ - __clear_bit(method, &efx->reset_pending); - - /* Reinitialise bus-mastering, which may have been turned off before - * the reset was scheduled. This is still appropriate, even in the - * RESET_TYPE_DISABLE since this driver generally assumes the hardware - * can respond to requests. */ - pci_set_master(efx->pci_dev); - -out: - /* Leave device stopped if necessary */ - disabled = rc || - method == RESET_TYPE_DISABLE || - method == RESET_TYPE_RECOVER_OR_DISABLE; - rc2 = efx_reset_up(efx, method, !disabled); - if (rc2) { - disabled = true; - if (!rc) - rc = rc2; - } - - if (disabled) { - dev_close(efx->net_dev); - netif_err(efx, drv, efx->net_dev, "has been disabled\n"); - efx->state = STATE_DISABLED; - } else { - netif_dbg(efx, drv, efx->net_dev, "reset complete\n"); - efx_device_attach_if_not_resetting(efx); - } - return rc; -} - -/* Try recovery mechanisms. - * For now only EEH is supported. - * Returns 0 if the recovery mechanisms are unsuccessful. - * Returns a non-zero value otherwise. - */ -int efx_try_recovery(struct efx_nic *efx) -{ -#ifdef CONFIG_EEH - /* A PCI error can occur and not be seen by EEH because nothing - * happens on the PCI bus. In this case the driver may fail and - * schedule a 'recover or reset', leading to this recovery handler. - * Manually call the eeh failure check function. - */ - struct eeh_dev *eehdev = pci_dev_to_eeh_dev(efx->pci_dev); - if (eeh_dev_check_failure(eehdev)) { - /* The EEH mechanisms will handle the error and reset the - * device if necessary. - */ - return 1; - } -#endif - return 0; -} - -static void efx_wait_for_bist_end(struct efx_nic *efx) -{ - int i; - - for (i = 0; i < BIST_WAIT_DELAY_COUNT; ++i) { - if (efx_mcdi_poll_reboot(efx)) - goto out; - msleep(BIST_WAIT_DELAY_MS); - } - - netif_err(efx, drv, efx->net_dev, "Warning: No MC reboot after BIST mode\n"); -out: - /* Either way unset the BIST flag. If we found no reboot we probably - * won't recover, but we should try. - */ - efx->mc_bist_for_other_fn = false; -} - -/* The worker thread exists so that code that cannot sleep can - * schedule a reset for later. - */ -static void efx_reset_work(struct work_struct *data) -{ - struct efx_nic *efx = container_of(data, struct efx_nic, reset_work); - unsigned long pending; - enum reset_type method; - - pending = READ_ONCE(efx->reset_pending); - method = fls(pending) - 1; - - if (method == RESET_TYPE_MC_BIST) - efx_wait_for_bist_end(efx); - - if ((method == RESET_TYPE_RECOVER_OR_DISABLE || - method == RESET_TYPE_RECOVER_OR_ALL) && - efx_try_recovery(efx)) - return; - - if (!pending) - return; - - rtnl_lock(); - - /* We checked the state in efx_schedule_reset() but it may - * have changed by now. Now that we have the RTNL lock, - * it cannot change again. - */ - if (efx->state == STATE_READY) - (void)efx_reset(efx, method); - - rtnl_unlock(); -} - -void efx_schedule_reset(struct efx_nic *efx, enum reset_type type) -{ - enum reset_type method; - - if (efx->state == STATE_RECOVERY) { - netif_dbg(efx, drv, efx->net_dev, - "recovering: skip scheduling %s reset\n", - RESET_TYPE(type)); - return; - } - - switch (type) { - case RESET_TYPE_INVISIBLE: - case RESET_TYPE_ALL: - case RESET_TYPE_RECOVER_OR_ALL: - case RESET_TYPE_WORLD: - case RESET_TYPE_DISABLE: - case RESET_TYPE_RECOVER_OR_DISABLE: - case RESET_TYPE_DATAPATH: - case RESET_TYPE_MC_BIST: - case RESET_TYPE_MCDI_TIMEOUT: - method = type; - netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n", - RESET_TYPE(method)); - break; - default: - method = efx->type->map_reset_reason(type); - netif_dbg(efx, drv, efx->net_dev, - "scheduling %s reset for %s\n", - RESET_TYPE(method), RESET_TYPE(type)); - break; - } - - set_bit(method, &efx->reset_pending); - smp_mb(); /* ensure we change reset_pending before checking state */ - - /* If we're not READY then just leave the flags set as the cue - * to abort probing or reschedule the reset later. - */ - if (READ_ONCE(efx->state) != STATE_READY) - return; - - /* efx_process_channel() will no longer read events once a - * reset is scheduled. So switch back to poll'd MCDI completions. */ - efx_mcdi_mode_poll(efx); - - queue_work(reset_workqueue, &efx->reset_work); -} - -/************************************************************************** - * * List of NICs we support * **************************************************************************/ @@ -3169,139 +1029,10 @@ static const struct pci_device_id efx_pci_table[] = { /************************************************************************** * - * Dummy PHY/MAC operations - * - * Can be used for some unimplemented operations - * Needed so all function pointers are valid and do not have to be tested - * before use - * - **************************************************************************/ -int efx_port_dummy_op_int(struct efx_nic *efx) -{ - return 0; -} -void efx_port_dummy_op_void(struct efx_nic *efx) {} - -static bool efx_port_dummy_op_poll(struct efx_nic *efx) -{ - return false; -} - -static const struct efx_phy_operations efx_dummy_phy_operations = { - .init = efx_port_dummy_op_int, - .reconfigure = efx_port_dummy_op_int, - .poll = efx_port_dummy_op_poll, - .fini = efx_port_dummy_op_void, -}; - -/************************************************************************** - * * Data housekeeping * **************************************************************************/ -/* This zeroes out and then fills in the invariants in a struct - * efx_nic (including all sub-structures). - */ -static int efx_init_struct(struct efx_nic *efx, - struct pci_dev *pci_dev, struct net_device *net_dev) -{ - int rc = -ENOMEM, i; - - /* Initialise common structures */ - INIT_LIST_HEAD(&efx->node); - INIT_LIST_HEAD(&efx->secondary_list); - spin_lock_init(&efx->biu_lock); -#ifdef CONFIG_SFC_MTD - INIT_LIST_HEAD(&efx->mtd_list); -#endif - INIT_WORK(&efx->reset_work, efx_reset_work); - INIT_DELAYED_WORK(&efx->monitor_work, efx_monitor); - INIT_DELAYED_WORK(&efx->selftest_work, efx_selftest_async_work); - efx->pci_dev = pci_dev; - efx->msg_enable = debug; - efx->state = STATE_UNINIT; - strlcpy(efx->name, pci_name(pci_dev), sizeof(efx->name)); - - efx->net_dev = net_dev; - efx->rx_prefix_size = efx->type->rx_prefix_size; - efx->rx_ip_align = - NET_IP_ALIGN ? (efx->rx_prefix_size + NET_IP_ALIGN) % 4 : 0; - efx->rx_packet_hash_offset = - efx->type->rx_hash_offset - efx->type->rx_prefix_size; - efx->rx_packet_ts_offset = - efx->type->rx_ts_offset - efx->type->rx_prefix_size; - INIT_LIST_HEAD(&efx->rss_context.list); - mutex_init(&efx->rss_lock); - spin_lock_init(&efx->stats_lock); - efx->vi_stride = EFX_DEFAULT_VI_STRIDE; - efx->num_mac_stats = MC_CMD_MAC_NSTATS; - BUILD_BUG_ON(MC_CMD_MAC_NSTATS - 1 != MC_CMD_MAC_GENERATION_END); - mutex_init(&efx->mac_lock); -#ifdef CONFIG_RFS_ACCEL - mutex_init(&efx->rps_mutex); - spin_lock_init(&efx->rps_hash_lock); - /* Failure to allocate is not fatal, but may degrade ARFS performance */ - efx->rps_hash_table = kcalloc(EFX_ARFS_HASH_TABLE_SIZE, - sizeof(*efx->rps_hash_table), GFP_KERNEL); -#endif - efx->phy_op = &efx_dummy_phy_operations; - efx->mdio.dev = net_dev; - INIT_WORK(&efx->mac_work, efx_mac_work); - init_waitqueue_head(&efx->flush_wq); - - for (i = 0; i < EFX_MAX_CHANNELS; i++) { - efx->channel[i] = efx_alloc_channel(efx, i, NULL); - if (!efx->channel[i]) - goto fail; - efx->msi_context[i].efx = efx; - efx->msi_context[i].index = i; - } - - /* Higher numbered interrupt modes are less capable! */ - if (WARN_ON_ONCE(efx->type->max_interrupt_mode > - efx->type->min_interrupt_mode)) { - rc = -EIO; - goto fail; - } - efx->interrupt_mode = max(efx->type->max_interrupt_mode, - interrupt_mode); - efx->interrupt_mode = min(efx->type->min_interrupt_mode, - interrupt_mode); - - /* Would be good to use the net_dev name, but we're too early */ - snprintf(efx->workqueue_name, sizeof(efx->workqueue_name), "sfc%s", - pci_name(pci_dev)); - efx->workqueue = create_singlethread_workqueue(efx->workqueue_name); - if (!efx->workqueue) - goto fail; - - return 0; - -fail: - efx_fini_struct(efx); - return rc; -} - -static void efx_fini_struct(struct efx_nic *efx) -{ - int i; - -#ifdef CONFIG_RFS_ACCEL - kfree(efx->rps_hash_table); -#endif - - for (i = 0; i < EFX_MAX_CHANNELS; i++) - kfree(efx->channel[i]); - - kfree(efx->vpd_sn); - - if (efx->workqueue) { - destroy_workqueue(efx->workqueue); - efx->workqueue = NULL; - } -} - void efx_update_sw_stats(struct efx_nic *efx, u64 *stats) { u64 n_rx_nodesc_trunc = 0; @@ -3313,197 +1044,6 @@ void efx_update_sw_stats(struct efx_nic *efx, u64 *stats) stats[GENERIC_STAT_rx_noskb_drops] = atomic_read(&efx->n_rx_noskb_drops); } -bool efx_filter_spec_equal(const struct efx_filter_spec *left, - const struct efx_filter_spec *right) -{ - if ((left->match_flags ^ right->match_flags) | - ((left->flags ^ right->flags) & - (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX))) - return false; - - return memcmp(&left->outer_vid, &right->outer_vid, - sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) == 0; -} - -u32 efx_filter_spec_hash(const struct efx_filter_spec *spec) -{ - BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3); - return jhash2((const u32 *)&spec->outer_vid, - (sizeof(struct efx_filter_spec) - - offsetof(struct efx_filter_spec, outer_vid)) / 4, - 0); -} - -#ifdef CONFIG_RFS_ACCEL -bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, - bool *force) -{ - if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) { - /* ARFS is currently updating this entry, leave it */ - return false; - } - if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) { - /* ARFS tried and failed to update this, so it's probably out - * of date. Remove the filter and the ARFS rule entry. - */ - rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING; - *force = true; - return true; - } else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */ - /* ARFS has moved on, so old filter is not needed. Since we did - * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will - * not be removed by efx_rps_hash_del() subsequently. - */ - *force = true; - return true; - } - /* Remove it iff ARFS wants to. */ - return true; -} - -static -struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx, - const struct efx_filter_spec *spec) -{ - u32 hash = efx_filter_spec_hash(spec); - - lockdep_assert_held(&efx->rps_hash_lock); - if (!efx->rps_hash_table) - return NULL; - return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE]; -} - -struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, - const struct efx_filter_spec *spec) -{ - struct efx_arfs_rule *rule; - struct hlist_head *head; - struct hlist_node *node; - - head = efx_rps_hash_bucket(efx, spec); - if (!head) - return NULL; - hlist_for_each(node, head) { - rule = container_of(node, struct efx_arfs_rule, node); - if (efx_filter_spec_equal(spec, &rule->spec)) - return rule; - } - return NULL; -} - -struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, - const struct efx_filter_spec *spec, - bool *new) -{ - struct efx_arfs_rule *rule; - struct hlist_head *head; - struct hlist_node *node; - - head = efx_rps_hash_bucket(efx, spec); - if (!head) - return NULL; - hlist_for_each(node, head) { - rule = container_of(node, struct efx_arfs_rule, node); - if (efx_filter_spec_equal(spec, &rule->spec)) { - *new = false; - return rule; - } - } - rule = kmalloc(sizeof(*rule), GFP_ATOMIC); - *new = true; - if (rule) { - memcpy(&rule->spec, spec, sizeof(rule->spec)); - hlist_add_head(&rule->node, head); - } - return rule; -} - -void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec) -{ - struct efx_arfs_rule *rule; - struct hlist_head *head; - struct hlist_node *node; - - head = efx_rps_hash_bucket(efx, spec); - if (WARN_ON(!head)) - return; - hlist_for_each(node, head) { - rule = container_of(node, struct efx_arfs_rule, node); - if (efx_filter_spec_equal(spec, &rule->spec)) { - /* Someone already reused the entry. We know that if - * this check doesn't fire (i.e. filter_id == REMOVING) - * then the REMOVING mark was put there by our caller, - * because caller is holding a lock on filter table and - * only holders of that lock set REMOVING. - */ - if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING) - return; - hlist_del(node); - kfree(rule); - return; - } - } - /* We didn't find it. */ - WARN_ON(1); -} -#endif - -/* RSS contexts. We're using linked lists and crappy O(n) algorithms, because - * (a) this is an infrequent control-plane operation and (b) n is small (max 64) - */ -struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx) -{ - struct list_head *head = &efx->rss_context.list; - struct efx_rss_context *ctx, *new; - u32 id = 1; /* Don't use zero, that refers to the master RSS context */ - - WARN_ON(!mutex_is_locked(&efx->rss_lock)); - - /* Search for first gap in the numbering */ - list_for_each_entry(ctx, head, list) { - if (ctx->user_id != id) - break; - id++; - /* Check for wrap. If this happens, we have nearly 2^32 - * allocated RSS contexts, which seems unlikely. - */ - if (WARN_ON_ONCE(!id)) - return NULL; - } - - /* Create the new entry */ - new = kmalloc(sizeof(struct efx_rss_context), GFP_KERNEL); - if (!new) - return NULL; - new->context_id = EFX_EF10_RSS_CONTEXT_INVALID; - new->rx_hash_udp_4tuple = false; - - /* Insert the new entry into the gap */ - new->user_id = id; - list_add_tail(&new->list, &ctx->list); - return new; -} - -struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id) -{ - struct list_head *head = &efx->rss_context.list; - struct efx_rss_context *ctx; - - WARN_ON(!mutex_is_locked(&efx->rss_lock)); - - list_for_each_entry(ctx, head, list) - if (ctx->user_id == id) - return ctx; - return NULL; -} - -void efx_free_rss_context_entry(struct efx_rss_context *ctx) -{ - list_del(&ctx->list); - kfree(ctx); -} - /************************************************************************** * * PCI interface @@ -3519,7 +1059,7 @@ static void efx_pci_remove_main(struct efx_nic *efx) * are not READY. */ BUG_ON(efx->state == STATE_READY); - cancel_work_sync(&efx->reset_work); + efx_flush_reset_workqueue(efx); efx_disable_interrupts(efx); efx_clear_interrupt_affinity(efx); @@ -3559,7 +1099,7 @@ static void efx_pci_remove(struct pci_dev *pci_dev) efx_pci_remove_main(efx); - efx_fini_io(efx); + efx_fini_io(efx, efx->type->mem_bar(efx)); netif_dbg(efx, drv, efx->net_dev, "shutdown successful\n"); efx_fini_struct(efx); @@ -3782,7 +1322,8 @@ static int efx_pci_probe(struct pci_dev *pci_dev, efx_probe_vpd_strings(efx); /* Set up basic I/O (BAR mappings etc) */ - rc = efx_init_io(efx); + rc = efx_init_io(efx, efx->type->mem_bar(efx), efx->type->max_dma_mask, + efx->type->mem_map_size(efx)); if (rc) goto fail2; @@ -3826,7 +1367,7 @@ static int efx_pci_probe(struct pci_dev *pci_dev, return 0; fail3: - efx_fini_io(efx); + efx_fini_io(efx, efx->type->mem_bar(efx)); fail2: efx_fini_struct(efx); fail1: @@ -3904,7 +1445,7 @@ static int efx_pm_thaw(struct device *dev) rtnl_unlock(); /* Reschedule any quenched resets scheduled during efx_pm_freeze() */ - queue_work(reset_workqueue, &efx->reset_work); + efx_queue_reset_work(efx); return 0; @@ -4083,10 +1624,6 @@ static struct pci_driver efx_pci_driver = { * *************************************************************************/ -module_param(interrupt_mode, uint, 0444); -MODULE_PARM_DESC(interrupt_mode, - "Interrupt mode (0=>MSIX 1=>MSI 2=>legacy)"); - static int __init efx_init_module(void) { int rc; @@ -4103,11 +1640,9 @@ static int __init efx_init_module(void) goto err_sriov; #endif - reset_workqueue = create_singlethread_workqueue("sfc_reset"); - if (!reset_workqueue) { - rc = -ENOMEM; + rc = efx_create_reset_workqueue(); + if (rc) goto err_reset; - } rc = pci_register_driver(&efx_pci_driver); if (rc < 0) @@ -4116,7 +1651,7 @@ static int __init efx_init_module(void) return 0; err_pci: - destroy_workqueue(reset_workqueue); + efx_destroy_reset_workqueue(); err_reset: #ifdef CONFIG_SFC_SRIOV efx_fini_sriov(); @@ -4132,7 +1667,7 @@ static void __exit efx_exit_module(void) printk(KERN_INFO "Solarflare NET driver unloading\n"); pci_unregister_driver(&efx_pci_driver); - destroy_workqueue(reset_workqueue); + efx_destroy_reset_workqueue(); #ifdef CONFIG_SFC_SRIOV efx_fini_sriov(); #endif diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 2dd8d5002315..f1bdb04efbe4 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -15,31 +15,17 @@ int efx_net_open(struct net_device *net_dev); int efx_net_stop(struct net_device *net_dev); /* TX */ -int efx_probe_tx_queue(struct efx_tx_queue *tx_queue); -void efx_remove_tx_queue(struct efx_tx_queue *tx_queue); -void efx_init_tx_queue(struct efx_tx_queue *tx_queue); void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue); -void efx_fini_tx_queue(struct efx_tx_queue *tx_queue); netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, struct net_device *net_dev); netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, void *type_data); -unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); extern unsigned int efx_piobuf_size; extern bool efx_separate_tx_channels; /* RX */ -void efx_set_default_rx_indir_table(struct efx_nic *efx, - struct efx_rss_context *ctx); -void efx_rx_config_page_split(struct efx_nic *efx); -int efx_probe_rx_queue(struct efx_rx_queue *rx_queue); -void efx_remove_rx_queue(struct efx_rx_queue *rx_queue); -void efx_init_rx_queue(struct efx_rx_queue *rx_queue); -void efx_fini_rx_queue(struct efx_rx_queue *rx_queue); -void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic); -void efx_rx_slow_fill(struct timer_list *t); void __efx_rx_packet(struct efx_channel *channel); void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, unsigned int n_frags, unsigned int len, u16 flags); @@ -48,7 +34,6 @@ static inline void efx_rx_flush_packet(struct efx_channel *channel) if (channel->rx_pkt_n_frags) __efx_rx_packet(channel); } -void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue); #define EFX_MAX_DMAQ_SIZE 4096UL #define EFX_DEFAULT_DMAQ_SIZE 1024UL @@ -80,8 +65,6 @@ static inline bool efx_rss_enabled(struct efx_nic *efx) /* Filters */ -void efx_mac_reconfigure(struct efx_nic *efx); - /** * efx_filter_insert_filter - add or replace a filter * @efx: NIC in which to insert the filter @@ -186,58 +169,17 @@ static inline void efx_filter_rfs_expire(struct work_struct *data) static inline void efx_filter_rfs_expire(struct work_struct *data) {} #define efx_filter_rfs_enabled() 0 #endif -bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec); - -bool efx_filter_spec_equal(const struct efx_filter_spec *left, - const struct efx_filter_spec *right); -u32 efx_filter_spec_hash(const struct efx_filter_spec *spec); - -#ifdef CONFIG_RFS_ACCEL -bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, - bool *force); - -struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, - const struct efx_filter_spec *spec); - -/* @new is written to indicate if entry was newly added (true) or if an old - * entry was found and returned (false). - */ -struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, - const struct efx_filter_spec *spec, - bool *new); - -void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec); -#endif /* RSS contexts */ -struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx); -struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id); -void efx_free_rss_context_entry(struct efx_rss_context *ctx); static inline bool efx_rss_active(struct efx_rss_context *ctx) { - return ctx->context_id != EFX_EF10_RSS_CONTEXT_INVALID; + return ctx->context_id != EFX_MCDI_RSS_CONTEXT_INVALID; } -/* Channels */ -int efx_channel_dummy_op_int(struct efx_channel *channel); -void efx_channel_dummy_op_void(struct efx_channel *channel); -int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries); - -/* Ports */ -int efx_reconfigure_port(struct efx_nic *efx); -int __efx_reconfigure_port(struct efx_nic *efx); - /* Ethtool support */ extern const struct ethtool_ops efx_ethtool_ops; -/* Reset handling */ -int efx_reset(struct efx_nic *efx, enum reset_type method); -void efx_reset_down(struct efx_nic *efx, enum reset_type method); -int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok); -int efx_try_recovery(struct efx_nic *efx); - /* Global */ -void efx_schedule_reset(struct efx_nic *efx, enum reset_type type); unsigned int efx_usecs_to_ticks(struct efx_nic *efx, unsigned int usecs); unsigned int efx_ticks_to_usecs(struct efx_nic *efx, unsigned int ticks); int efx_init_irq_moderation(struct efx_nic *efx, unsigned int tx_usecs, @@ -245,8 +187,6 @@ int efx_init_irq_moderation(struct efx_nic *efx, unsigned int tx_usecs, bool rx_may_override_tx); void efx_get_irq_moderation(struct efx_nic *efx, unsigned int *tx_usecs, unsigned int *rx_usecs, bool *rx_adaptive); -void efx_stop_eventq(struct efx_channel *channel); -void efx_start_eventq(struct efx_channel *channel); /* Dummy PHY ops for PHY drivers */ int efx_port_dummy_op_int(struct efx_nic *efx); @@ -293,9 +233,6 @@ static inline void efx_schedule_channel_irq(struct efx_channel *channel) efx_schedule_channel(channel); } -void efx_link_status_changed(struct efx_nic *efx); -void efx_link_set_advertising(struct efx_nic *efx, - const unsigned long *advertising); void efx_link_clear_advertising(struct efx_nic *efx); void efx_link_set_wanted_fc(struct efx_nic *efx, u8); diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c new file mode 100644 index 000000000000..aeb5e8aa2f2a --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -0,0 +1,1234 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "net_driver.h" +#include <linux/module.h> +#include "efx_channels.h" +#include "efx.h" +#include "efx_common.h" +#include "tx_common.h" +#include "rx_common.h" +#include "nic.h" +#include "sriov.h" + +/* This is the first interrupt mode to try out of: + * 0 => MSI-X + * 1 => MSI + * 2 => legacy + */ +static unsigned int interrupt_mode; +module_param(interrupt_mode, uint, 0444); +MODULE_PARM_DESC(interrupt_mode, + "Interrupt mode (0=>MSIX 1=>MSI 2=>legacy)"); + +/* This is the requested number of CPUs to use for Receive-Side Scaling (RSS), + * i.e. the number of CPUs among which we may distribute simultaneous + * interrupt handling. + * + * Cards without MSI-X will only target one CPU via legacy or MSI interrupt. + * The default (0) means to assign an interrupt to each core. + */ +static unsigned int rss_cpus; +module_param(rss_cpus, uint, 0444); +MODULE_PARM_DESC(rss_cpus, "Number of CPUs to use for Receive-Side Scaling"); + +static unsigned int irq_adapt_low_thresh = 8000; +module_param(irq_adapt_low_thresh, uint, 0644); +MODULE_PARM_DESC(irq_adapt_low_thresh, + "Threshold score for reducing IRQ moderation"); + +static unsigned int irq_adapt_high_thresh = 16000; +module_param(irq_adapt_high_thresh, uint, 0644); +MODULE_PARM_DESC(irq_adapt_high_thresh, + "Threshold score for increasing IRQ moderation"); + +/* This is the weight assigned to each of the (per-channel) virtual + * NAPI devices. + */ +static int napi_weight = 64; + +/*************** + * Housekeeping + ***************/ + +int efx_channel_dummy_op_int(struct efx_channel *channel) +{ + return 0; +} + +void efx_channel_dummy_op_void(struct efx_channel *channel) +{ +} + +static const struct efx_channel_type efx_default_channel_type = { + .pre_probe = efx_channel_dummy_op_int, + .post_remove = efx_channel_dummy_op_void, + .get_name = efx_get_channel_name, + .copy = efx_copy_channel, + .want_txqs = efx_default_channel_want_txqs, + .keep_eventq = false, + .want_pio = true, +}; + +/************* + * INTERRUPTS + *************/ + +static unsigned int efx_wanted_parallelism(struct efx_nic *efx) +{ + cpumask_var_t thread_mask; + unsigned int count; + int cpu; + + if (rss_cpus) { + count = rss_cpus; + } else { + if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) { + netif_warn(efx, probe, efx->net_dev, + "RSS disabled due to allocation failure\n"); + return 1; + } + + count = 0; + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, thread_mask)) { + ++count; + cpumask_or(thread_mask, thread_mask, + topology_sibling_cpumask(cpu)); + } + } + + free_cpumask_var(thread_mask); + } + + if (count > EFX_MAX_RX_QUEUES) { + netif_cond_dbg(efx, probe, efx->net_dev, !rss_cpus, warn, + "Reducing number of rx queues from %u to %u.\n", + count, EFX_MAX_RX_QUEUES); + count = EFX_MAX_RX_QUEUES; + } + + /* If RSS is requested for the PF *and* VFs then we can't write RSS + * table entries that are inaccessible to VFs + */ +#ifdef CONFIG_SFC_SRIOV + if (efx->type->sriov_wanted) { + if (efx->type->sriov_wanted(efx) && efx_vf_size(efx) > 1 && + count > efx_vf_size(efx)) { + netif_warn(efx, probe, efx->net_dev, + "Reducing number of RSS channels from %u to %u for " + "VF support. Increase vf-msix-limit to use more " + "channels on the PF.\n", + count, efx_vf_size(efx)); + count = efx_vf_size(efx); + } + } +#endif + + return count; +} + +static int efx_allocate_msix_channels(struct efx_nic *efx, + unsigned int max_channels, + unsigned int extra_channels, + unsigned int parallelism) +{ + unsigned int n_channels = parallelism; + int vec_count; + int n_xdp_tx; + int n_xdp_ev; + + if (efx_separate_tx_channels) + n_channels *= 2; + n_channels += extra_channels; + + /* To allow XDP transmit to happen from arbitrary NAPI contexts + * we allocate a TX queue per CPU. We share event queues across + * multiple tx queues, assuming tx and ev queues are both + * maximum size. + */ + + n_xdp_tx = num_possible_cpus(); + n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, EFX_TXQ_TYPES); + + vec_count = pci_msix_vec_count(efx->pci_dev); + if (vec_count < 0) + return vec_count; + + max_channels = min_t(unsigned int, vec_count, max_channels); + + /* Check resources. + * We need a channel per event queue, plus a VI per tx queue. + * This may be more pessimistic than it needs to be. + */ + if (n_channels + n_xdp_ev > max_channels) { + netif_err(efx, drv, efx->net_dev, + "Insufficient resources for %d XDP event queues (%d other channels, max %d)\n", + n_xdp_ev, n_channels, max_channels); + efx->n_xdp_channels = 0; + efx->xdp_tx_per_channel = 0; + efx->xdp_tx_queue_count = 0; + } else { + efx->n_xdp_channels = n_xdp_ev; + efx->xdp_tx_per_channel = EFX_TXQ_TYPES; + efx->xdp_tx_queue_count = n_xdp_tx; + n_channels += n_xdp_ev; + netif_dbg(efx, drv, efx->net_dev, + "Allocating %d TX and %d event queues for XDP\n", + n_xdp_tx, n_xdp_ev); + } + + if (vec_count < n_channels) { + netif_err(efx, drv, efx->net_dev, + "WARNING: Insufficient MSI-X vectors available (%d < %u).\n", + vec_count, n_channels); + netif_err(efx, drv, efx->net_dev, + "WARNING: Performance may be reduced.\n"); + n_channels = vec_count; + } + + n_channels = min(n_channels, max_channels); + + efx->n_channels = n_channels; + + /* Ignore XDP tx channels when creating rx channels. */ + n_channels -= efx->n_xdp_channels; + + if (efx_separate_tx_channels) { + efx->n_tx_channels = + min(max(n_channels / 2, 1U), + efx->max_tx_channels); + efx->tx_channel_offset = + n_channels - efx->n_tx_channels; + efx->n_rx_channels = + max(n_channels - + efx->n_tx_channels, 1U); + } else { + efx->n_tx_channels = min(n_channels, efx->max_tx_channels); + efx->tx_channel_offset = 0; + efx->n_rx_channels = n_channels; + } + + efx->n_rx_channels = min(efx->n_rx_channels, parallelism); + efx->n_tx_channels = min(efx->n_tx_channels, parallelism); + + efx->xdp_channel_offset = n_channels; + + netif_dbg(efx, drv, efx->net_dev, + "Allocating %u RX channels\n", + efx->n_rx_channels); + + return efx->n_channels; +} + +/* Probe the number and type of interrupts we are able to obtain, and + * the resulting numbers of channels and RX queues. + */ +int efx_probe_interrupts(struct efx_nic *efx) +{ + unsigned int extra_channels = 0; + unsigned int rss_spread; + unsigned int i, j; + int rc; + + for (i = 0; i < EFX_MAX_EXTRA_CHANNELS; i++) + if (efx->extra_channel_type[i]) + ++extra_channels; + + if (efx->interrupt_mode == EFX_INT_MODE_MSIX) { + unsigned int parallelism = efx_wanted_parallelism(efx); + struct msix_entry xentries[EFX_MAX_CHANNELS]; + unsigned int n_channels; + + rc = efx_allocate_msix_channels(efx, efx->max_channels, + extra_channels, parallelism); + if (rc >= 0) { + n_channels = rc; + for (i = 0; i < n_channels; i++) + xentries[i].entry = i; + rc = pci_enable_msix_range(efx->pci_dev, xentries, 1, + n_channels); + } + if (rc < 0) { + /* Fall back to single channel MSI */ + netif_err(efx, drv, efx->net_dev, + "could not enable MSI-X\n"); + if (efx->type->min_interrupt_mode >= EFX_INT_MODE_MSI) + efx->interrupt_mode = EFX_INT_MODE_MSI; + else + return rc; + } else if (rc < n_channels) { + netif_err(efx, drv, efx->net_dev, + "WARNING: Insufficient MSI-X vectors" + " available (%d < %u).\n", rc, n_channels); + netif_err(efx, drv, efx->net_dev, + "WARNING: Performance may be reduced.\n"); + n_channels = rc; + } + + if (rc > 0) { + for (i = 0; i < efx->n_channels; i++) + efx_get_channel(efx, i)->irq = + xentries[i].vector; + } + } + + /* Try single interrupt MSI */ + if (efx->interrupt_mode == EFX_INT_MODE_MSI) { + efx->n_channels = 1; + efx->n_rx_channels = 1; + efx->n_tx_channels = 1; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; + rc = pci_enable_msi(efx->pci_dev); + if (rc == 0) { + efx_get_channel(efx, 0)->irq = efx->pci_dev->irq; + } else { + netif_err(efx, drv, efx->net_dev, + "could not enable MSI\n"); + if (efx->type->min_interrupt_mode >= EFX_INT_MODE_LEGACY) + efx->interrupt_mode = EFX_INT_MODE_LEGACY; + else + return rc; + } + } + + /* Assume legacy interrupts */ + if (efx->interrupt_mode == EFX_INT_MODE_LEGACY) { + efx->n_channels = 1 + (efx_separate_tx_channels ? 1 : 0); + efx->n_rx_channels = 1; + efx->n_tx_channels = 1; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; + efx->legacy_irq = efx->pci_dev->irq; + } + + /* Assign extra channels if possible, before XDP channels */ + efx->n_extra_tx_channels = 0; + j = efx->xdp_channel_offset; + for (i = 0; i < EFX_MAX_EXTRA_CHANNELS; i++) { + if (!efx->extra_channel_type[i]) + continue; + if (j <= efx->tx_channel_offset + efx->n_tx_channels) { + efx->extra_channel_type[i]->handle_no_channel(efx); + } else { + --j; + efx_get_channel(efx, j)->type = + efx->extra_channel_type[i]; + if (efx_channel_has_tx_queues(efx_get_channel(efx, j))) + efx->n_extra_tx_channels++; + } + } + + rss_spread = efx->n_rx_channels; + /* RSS might be usable on VFs even if it is disabled on the PF */ +#ifdef CONFIG_SFC_SRIOV + if (efx->type->sriov_wanted) { + efx->rss_spread = ((rss_spread > 1 || + !efx->type->sriov_wanted(efx)) ? + rss_spread : efx_vf_size(efx)); + return 0; + } +#endif + efx->rss_spread = rss_spread; + + return 0; +} + +#if defined(CONFIG_SMP) +void efx_set_interrupt_affinity(struct efx_nic *efx) +{ + struct efx_channel *channel; + unsigned int cpu; + + efx_for_each_channel(channel, efx) { + cpu = cpumask_local_spread(channel->channel, + pcibus_to_node(efx->pci_dev->bus)); + irq_set_affinity_hint(channel->irq, cpumask_of(cpu)); + } +} + +void efx_clear_interrupt_affinity(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) + irq_set_affinity_hint(channel->irq, NULL); +} +#else +void +efx_set_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused))) +{ +} + +void +efx_clear_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused))) +{ +} +#endif /* CONFIG_SMP */ + +void efx_remove_interrupts(struct efx_nic *efx) +{ + struct efx_channel *channel; + + /* Remove MSI/MSI-X interrupts */ + efx_for_each_channel(channel, efx) + channel->irq = 0; + pci_disable_msi(efx->pci_dev); + pci_disable_msix(efx->pci_dev); + + /* Remove legacy interrupt */ + efx->legacy_irq = 0; +} + +/*************** + * EVENT QUEUES + ***************/ + +/* Create event queue + * Event queue memory allocations are done only once. If the channel + * is reset, the memory buffer will be reused; this guards against + * errors during channel reset and also simplifies interrupt handling. + */ +int efx_probe_eventq(struct efx_channel *channel) +{ + struct efx_nic *efx = channel->efx; + unsigned long entries; + + netif_dbg(efx, probe, efx->net_dev, + "chan %d create event queue\n", channel->channel); + + /* Build an event queue with room for one event per tx and rx buffer, + * plus some extra for link state events and MCDI completions. + */ + entries = roundup_pow_of_two(efx->rxq_entries + efx->txq_entries + 128); + EFX_WARN_ON_PARANOID(entries > EFX_MAX_EVQ_SIZE); + channel->eventq_mask = max(entries, EFX_MIN_EVQ_SIZE) - 1; + + return efx_nic_probe_eventq(channel); +} + +/* Prepare channel's event queue */ +int efx_init_eventq(struct efx_channel *channel) +{ + struct efx_nic *efx = channel->efx; + int rc; + + EFX_WARN_ON_PARANOID(channel->eventq_init); + + netif_dbg(efx, drv, efx->net_dev, + "chan %d init event queue\n", channel->channel); + + rc = efx_nic_init_eventq(channel); + if (rc == 0) { + efx->type->push_irq_moderation(channel); + channel->eventq_read_ptr = 0; + channel->eventq_init = true; + } + return rc; +} + +/* Enable event queue processing and NAPI */ +void efx_start_eventq(struct efx_channel *channel) +{ + netif_dbg(channel->efx, ifup, channel->efx->net_dev, + "chan %d start event queue\n", channel->channel); + + /* Make sure the NAPI handler sees the enabled flag set */ + channel->enabled = true; + smp_wmb(); + + napi_enable(&channel->napi_str); + efx_nic_eventq_read_ack(channel); +} + +/* Disable event queue processing and NAPI */ +void efx_stop_eventq(struct efx_channel *channel) +{ + if (!channel->enabled) + return; + + napi_disable(&channel->napi_str); + channel->enabled = false; +} + +void efx_fini_eventq(struct efx_channel *channel) +{ + if (!channel->eventq_init) + return; + + netif_dbg(channel->efx, drv, channel->efx->net_dev, + "chan %d fini event queue\n", channel->channel); + + efx_nic_fini_eventq(channel); + channel->eventq_init = false; +} + +void efx_remove_eventq(struct efx_channel *channel) +{ + netif_dbg(channel->efx, drv, channel->efx->net_dev, + "chan %d remove event queue\n", channel->channel); + + efx_nic_remove_eventq(channel); +} + +/************************************************************************** + * + * Channel handling + * + *************************************************************************/ + +/* Allocate and initialise a channel structure. */ +struct efx_channel * +efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel) +{ + struct efx_rx_queue *rx_queue; + struct efx_tx_queue *tx_queue; + struct efx_channel *channel; + int j; + + channel = kzalloc(sizeof(*channel), GFP_KERNEL); + if (!channel) + return NULL; + + channel->efx = efx; + channel->channel = i; + channel->type = &efx_default_channel_type; + + for (j = 0; j < EFX_TXQ_TYPES; j++) { + tx_queue = &channel->tx_queue[j]; + tx_queue->efx = efx; + tx_queue->queue = i * EFX_TXQ_TYPES + j; + tx_queue->channel = channel; + } + +#ifdef CONFIG_RFS_ACCEL + INIT_DELAYED_WORK(&channel->filter_work, efx_filter_rfs_expire); +#endif + + rx_queue = &channel->rx_queue; + rx_queue->efx = efx; + timer_setup(&rx_queue->slow_fill, efx_rx_slow_fill, 0); + + return channel; +} + +int efx_init_channels(struct efx_nic *efx) +{ + unsigned int i; + + for (i = 0; i < EFX_MAX_CHANNELS; i++) { + efx->channel[i] = efx_alloc_channel(efx, i, NULL); + if (!efx->channel[i]) + return -ENOMEM; + efx->msi_context[i].efx = efx; + efx->msi_context[i].index = i; + } + + /* Higher numbered interrupt modes are less capable! */ + if (WARN_ON_ONCE(efx->type->max_interrupt_mode > + efx->type->min_interrupt_mode)) { + return -EIO; + } + efx->interrupt_mode = max(efx->type->max_interrupt_mode, + interrupt_mode); + efx->interrupt_mode = min(efx->type->min_interrupt_mode, + interrupt_mode); + + return 0; +} + +void efx_fini_channels(struct efx_nic *efx) +{ + unsigned int i; + + for (i = 0; i < EFX_MAX_CHANNELS; i++) + if (efx->channel[i]) { + kfree(efx->channel[i]); + efx->channel[i] = NULL; + } +} + +/* Allocate and initialise a channel structure, copying parameters + * (but not resources) from an old channel structure. + */ +struct efx_channel *efx_copy_channel(const struct efx_channel *old_channel) +{ + struct efx_rx_queue *rx_queue; + struct efx_tx_queue *tx_queue; + struct efx_channel *channel; + int j; + + channel = kmalloc(sizeof(*channel), GFP_KERNEL); + if (!channel) + return NULL; + + *channel = *old_channel; + + channel->napi_dev = NULL; + INIT_HLIST_NODE(&channel->napi_str.napi_hash_node); + channel->napi_str.napi_id = 0; + channel->napi_str.state = 0; + memset(&channel->eventq, 0, sizeof(channel->eventq)); + + for (j = 0; j < EFX_TXQ_TYPES; j++) { + tx_queue = &channel->tx_queue[j]; + if (tx_queue->channel) + tx_queue->channel = channel; + tx_queue->buffer = NULL; + memset(&tx_queue->txd, 0, sizeof(tx_queue->txd)); + } + + rx_queue = &channel->rx_queue; + rx_queue->buffer = NULL; + memset(&rx_queue->rxd, 0, sizeof(rx_queue->rxd)); + timer_setup(&rx_queue->slow_fill, efx_rx_slow_fill, 0); +#ifdef CONFIG_RFS_ACCEL + INIT_DELAYED_WORK(&channel->filter_work, efx_filter_rfs_expire); +#endif + + return channel; +} + +static int efx_probe_channel(struct efx_channel *channel) +{ + struct efx_tx_queue *tx_queue; + struct efx_rx_queue *rx_queue; + int rc; + + netif_dbg(channel->efx, probe, channel->efx->net_dev, + "creating channel %d\n", channel->channel); + + rc = channel->type->pre_probe(channel); + if (rc) + goto fail; + + rc = efx_probe_eventq(channel); + if (rc) + goto fail; + + efx_for_each_channel_tx_queue(tx_queue, channel) { + rc = efx_probe_tx_queue(tx_queue); + if (rc) + goto fail; + } + + efx_for_each_channel_rx_queue(rx_queue, channel) { + rc = efx_probe_rx_queue(rx_queue); + if (rc) + goto fail; + } + + channel->rx_list = NULL; + + return 0; + +fail: + efx_remove_channel(channel); + return rc; +} + +void efx_get_channel_name(struct efx_channel *channel, char *buf, size_t len) +{ + struct efx_nic *efx = channel->efx; + const char *type; + int number; + + number = channel->channel; + + if (number >= efx->xdp_channel_offset && + !WARN_ON_ONCE(!efx->n_xdp_channels)) { + type = "-xdp"; + number -= efx->xdp_channel_offset; + } else if (efx->tx_channel_offset == 0) { + type = ""; + } else if (number < efx->tx_channel_offset) { + type = "-rx"; + } else { + type = "-tx"; + number -= efx->tx_channel_offset; + } + snprintf(buf, len, "%s%s-%d", efx->name, type, number); +} + +void efx_set_channel_names(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) + channel->type->get_name(channel, + efx->msi_context[channel->channel].name, + sizeof(efx->msi_context[0].name)); +} + +int efx_probe_channels(struct efx_nic *efx) +{ + struct efx_channel *channel; + int rc; + + /* Restart special buffer allocation */ + efx->next_buffer_table = 0; + + /* Probe channels in reverse, so that any 'extra' channels + * use the start of the buffer table. This allows the traffic + * channels to be resized without moving them or wasting the + * entries before them. + */ + efx_for_each_channel_rev(channel, efx) { + rc = efx_probe_channel(channel); + if (rc) { + netif_err(efx, probe, efx->net_dev, + "failed to create channel %d\n", + channel->channel); + goto fail; + } + } + efx_set_channel_names(efx); + + return 0; + +fail: + efx_remove_channels(efx); + return rc; +} + +void efx_remove_channel(struct efx_channel *channel) +{ + struct efx_tx_queue *tx_queue; + struct efx_rx_queue *rx_queue; + + netif_dbg(channel->efx, drv, channel->efx->net_dev, + "destroy chan %d\n", channel->channel); + + efx_for_each_channel_rx_queue(rx_queue, channel) + efx_remove_rx_queue(rx_queue); + efx_for_each_possible_channel_tx_queue(tx_queue, channel) + efx_remove_tx_queue(tx_queue); + efx_remove_eventq(channel); + channel->type->post_remove(channel); +} + +void efx_remove_channels(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) + efx_remove_channel(channel); + + kfree(efx->xdp_tx_queues); +} + +int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) +{ + struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel; + unsigned int i, next_buffer_table = 0; + u32 old_rxq_entries, old_txq_entries; + int rc, rc2; + + rc = efx_check_disabled(efx); + if (rc) + return rc; + + /* Not all channels should be reallocated. We must avoid + * reallocating their buffer table entries. + */ + efx_for_each_channel(channel, efx) { + struct efx_rx_queue *rx_queue; + struct efx_tx_queue *tx_queue; + + if (channel->type->copy) + continue; + next_buffer_table = max(next_buffer_table, + channel->eventq.index + + channel->eventq.entries); + efx_for_each_channel_rx_queue(rx_queue, channel) + next_buffer_table = max(next_buffer_table, + rx_queue->rxd.index + + rx_queue->rxd.entries); + efx_for_each_channel_tx_queue(tx_queue, channel) + next_buffer_table = max(next_buffer_table, + tx_queue->txd.index + + tx_queue->txd.entries); + } + + efx_device_detach_sync(efx); + efx_stop_all(efx); + efx_soft_disable_interrupts(efx); + + /* Clone channels (where possible) */ + memset(other_channel, 0, sizeof(other_channel)); + for (i = 0; i < efx->n_channels; i++) { + channel = efx->channel[i]; + if (channel->type->copy) + channel = channel->type->copy(channel); + if (!channel) { + rc = -ENOMEM; + goto out; + } + other_channel[i] = channel; + } + + /* Swap entry counts and channel pointers */ + old_rxq_entries = efx->rxq_entries; + old_txq_entries = efx->txq_entries; + efx->rxq_entries = rxq_entries; + efx->txq_entries = txq_entries; + for (i = 0; i < efx->n_channels; i++) { + channel = efx->channel[i]; + efx->channel[i] = other_channel[i]; + other_channel[i] = channel; + } + + /* Restart buffer table allocation */ + efx->next_buffer_table = next_buffer_table; + + for (i = 0; i < efx->n_channels; i++) { + channel = efx->channel[i]; + if (!channel->type->copy) + continue; + rc = efx_probe_channel(channel); + if (rc) + goto rollback; + efx_init_napi_channel(efx->channel[i]); + } + +out: + /* Destroy unused channel structures */ + for (i = 0; i < efx->n_channels; i++) { + channel = other_channel[i]; + if (channel && channel->type->copy) { + efx_fini_napi_channel(channel); + efx_remove_channel(channel); + kfree(channel); + } + } + + rc2 = efx_soft_enable_interrupts(efx); + if (rc2) { + rc = rc ? rc : rc2; + netif_err(efx, drv, efx->net_dev, + "unable to restart interrupts on channel reallocation\n"); + efx_schedule_reset(efx, RESET_TYPE_DISABLE); + } else { + efx_start_all(efx); + efx_device_attach_if_not_resetting(efx); + } + return rc; + +rollback: + /* Swap back */ + efx->rxq_entries = old_rxq_entries; + efx->txq_entries = old_txq_entries; + for (i = 0; i < efx->n_channels; i++) { + channel = efx->channel[i]; + efx->channel[i] = other_channel[i]; + other_channel[i] = channel; + } + goto out; +} + +int efx_set_channels(struct efx_nic *efx) +{ + struct efx_channel *channel; + struct efx_tx_queue *tx_queue; + int xdp_queue_number; + + efx->tx_channel_offset = + efx_separate_tx_channels ? + efx->n_channels - efx->n_tx_channels : 0; + + if (efx->xdp_tx_queue_count) { + EFX_WARN_ON_PARANOID(efx->xdp_tx_queues); + + /* Allocate array for XDP TX queue lookup. */ + efx->xdp_tx_queues = kcalloc(efx->xdp_tx_queue_count, + sizeof(*efx->xdp_tx_queues), + GFP_KERNEL); + if (!efx->xdp_tx_queues) + return -ENOMEM; + } + + /* We need to mark which channels really have RX and TX + * queues, and adjust the TX queue numbers if we have separate + * RX-only and TX-only channels. + */ + xdp_queue_number = 0; + efx_for_each_channel(channel, efx) { + if (channel->channel < efx->n_rx_channels) + channel->rx_queue.core_index = channel->channel; + else + channel->rx_queue.core_index = -1; + + efx_for_each_channel_tx_queue(tx_queue, channel) { + tx_queue->queue -= (efx->tx_channel_offset * + EFX_TXQ_TYPES); + + if (efx_channel_is_xdp_tx(channel) && + xdp_queue_number < efx->xdp_tx_queue_count) { + efx->xdp_tx_queues[xdp_queue_number] = tx_queue; + xdp_queue_number++; + } + } + } + return 0; +} + +bool efx_default_channel_want_txqs(struct efx_channel *channel) +{ + return channel->channel - channel->efx->tx_channel_offset < + channel->efx->n_tx_channels; +} + +/************* + * START/STOP + *************/ + +int efx_soft_enable_interrupts(struct efx_nic *efx) +{ + struct efx_channel *channel, *end_channel; + int rc; + + BUG_ON(efx->state == STATE_DISABLED); + + efx->irq_soft_enabled = true; + smp_wmb(); + + efx_for_each_channel(channel, efx) { + if (!channel->type->keep_eventq) { + rc = efx_init_eventq(channel); + if (rc) + goto fail; + } + efx_start_eventq(channel); + } + + efx_mcdi_mode_event(efx); + + return 0; +fail: + end_channel = channel; + efx_for_each_channel(channel, efx) { + if (channel == end_channel) + break; + efx_stop_eventq(channel); + if (!channel->type->keep_eventq) + efx_fini_eventq(channel); + } + + return rc; +} + +void efx_soft_disable_interrupts(struct efx_nic *efx) +{ + struct efx_channel *channel; + + if (efx->state == STATE_DISABLED) + return; + + efx_mcdi_mode_poll(efx); + + efx->irq_soft_enabled = false; + smp_wmb(); + + if (efx->legacy_irq) + synchronize_irq(efx->legacy_irq); + + efx_for_each_channel(channel, efx) { + if (channel->irq) + synchronize_irq(channel->irq); + + efx_stop_eventq(channel); + if (!channel->type->keep_eventq) + efx_fini_eventq(channel); + } + + /* Flush the asynchronous MCDI request queue */ + efx_mcdi_flush_async(efx); +} + +int efx_enable_interrupts(struct efx_nic *efx) +{ + struct efx_channel *channel, *end_channel; + int rc; + + /* TODO: Is this really a bug? */ + BUG_ON(efx->state == STATE_DISABLED); + + if (efx->eeh_disabled_legacy_irq) { + enable_irq(efx->legacy_irq); + efx->eeh_disabled_legacy_irq = false; + } + + efx->type->irq_enable_master(efx); + + efx_for_each_channel(channel, efx) { + if (channel->type->keep_eventq) { + rc = efx_init_eventq(channel); + if (rc) + goto fail; + } + } + + rc = efx_soft_enable_interrupts(efx); + if (rc) + goto fail; + + return 0; + +fail: + end_channel = channel; + efx_for_each_channel(channel, efx) { + if (channel == end_channel) + break; + if (channel->type->keep_eventq) + efx_fini_eventq(channel); + } + + efx->type->irq_disable_non_ev(efx); + + return rc; +} + +void efx_disable_interrupts(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_soft_disable_interrupts(efx); + + efx_for_each_channel(channel, efx) { + if (channel->type->keep_eventq) + efx_fini_eventq(channel); + } + + efx->type->irq_disable_non_ev(efx); +} + +void efx_start_channels(struct efx_nic *efx) +{ + struct efx_tx_queue *tx_queue; + struct efx_rx_queue *rx_queue; + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) { + efx_for_each_channel_tx_queue(tx_queue, channel) { + efx_init_tx_queue(tx_queue); + atomic_inc(&efx->active_queues); + } + + efx_for_each_channel_rx_queue(rx_queue, channel) { + efx_init_rx_queue(rx_queue); + atomic_inc(&efx->active_queues); + efx_stop_eventq(channel); + efx_fast_push_rx_descriptors(rx_queue, false); + efx_start_eventq(channel); + } + + WARN_ON(channel->rx_pkt_n_frags); + } +} + +void efx_stop_channels(struct efx_nic *efx) +{ + struct efx_tx_queue *tx_queue; + struct efx_rx_queue *rx_queue; + struct efx_channel *channel; + int rc = 0; + + /* Stop RX refill */ + efx_for_each_channel(channel, efx) { + efx_for_each_channel_rx_queue(rx_queue, channel) + rx_queue->refill_enabled = false; + } + + efx_for_each_channel(channel, efx) { + /* RX packet processing is pipelined, so wait for the + * NAPI handler to complete. At least event queue 0 + * might be kept active by non-data events, so don't + * use napi_synchronize() but actually disable NAPI + * temporarily. + */ + if (efx_channel_has_rx_queue(channel)) { + efx_stop_eventq(channel); + efx_start_eventq(channel); + } + } + + if (efx->type->fini_dmaq) + rc = efx->type->fini_dmaq(efx); + + if (rc) { + netif_err(efx, drv, efx->net_dev, "failed to flush queues\n"); + } else { + netif_dbg(efx, drv, efx->net_dev, + "successfully flushed all queues\n"); + } + + efx_for_each_channel(channel, efx) { + efx_for_each_channel_rx_queue(rx_queue, channel) + efx_fini_rx_queue(rx_queue); + efx_for_each_possible_channel_tx_queue(tx_queue, channel) + efx_fini_tx_queue(tx_queue); + } +} + +/************************************************************************** + * + * NAPI interface + * + *************************************************************************/ + +/* Process channel's event queue + * + * This function is responsible for processing the event queue of a + * single channel. The caller must guarantee that this function will + * never be concurrently called more than once on the same channel, + * though different channels may be being processed concurrently. + */ +static int efx_process_channel(struct efx_channel *channel, int budget) +{ + struct efx_tx_queue *tx_queue; + struct list_head rx_list; + int spent; + + if (unlikely(!channel->enabled)) + return 0; + + /* Prepare the batch receive list */ + EFX_WARN_ON_PARANOID(channel->rx_list != NULL); + INIT_LIST_HEAD(&rx_list); + channel->rx_list = &rx_list; + + efx_for_each_channel_tx_queue(tx_queue, channel) { + tx_queue->pkts_compl = 0; + tx_queue->bytes_compl = 0; + } + + spent = efx_nic_process_eventq(channel, budget); + if (spent && efx_channel_has_rx_queue(channel)) { + struct efx_rx_queue *rx_queue = + efx_channel_get_rx_queue(channel); + + efx_rx_flush_packet(channel); + efx_fast_push_rx_descriptors(rx_queue, true); + } + + /* Update BQL */ + efx_for_each_channel_tx_queue(tx_queue, channel) { + if (tx_queue->bytes_compl) { + netdev_tx_completed_queue(tx_queue->core_txq, + tx_queue->pkts_compl, + tx_queue->bytes_compl); + } + } + + /* Receive any packets we queued up */ + netif_receive_skb_list(channel->rx_list); + channel->rx_list = NULL; + + return spent; +} + +static void efx_update_irq_mod(struct efx_nic *efx, struct efx_channel *channel) +{ + int step = efx->irq_mod_step_us; + + if (channel->irq_mod_score < irq_adapt_low_thresh) { + if (channel->irq_moderation_us > step) { + channel->irq_moderation_us -= step; + efx->type->push_irq_moderation(channel); + } + } else if (channel->irq_mod_score > irq_adapt_high_thresh) { + if (channel->irq_moderation_us < + efx->irq_rx_moderation_us) { + channel->irq_moderation_us += step; + efx->type->push_irq_moderation(channel); + } + } + + channel->irq_count = 0; + channel->irq_mod_score = 0; +} + +/* NAPI poll handler + * + * NAPI guarantees serialisation of polls of the same device, which + * provides the guarantee required by efx_process_channel(). + */ +static int efx_poll(struct napi_struct *napi, int budget) +{ + struct efx_channel *channel = + container_of(napi, struct efx_channel, napi_str); + struct efx_nic *efx = channel->efx; + int spent; + + netif_vdbg(efx, intr, efx->net_dev, + "channel %d NAPI poll executing on CPU %d\n", + channel->channel, raw_smp_processor_id()); + + spent = efx_process_channel(channel, budget); + + xdp_do_flush_map(); + + if (spent < budget) { + if (efx_channel_has_rx_queue(channel) && + efx->irq_rx_adaptive && + unlikely(++channel->irq_count == 1000)) { + efx_update_irq_mod(efx, channel); + } + +#ifdef CONFIG_RFS_ACCEL + /* Perhaps expire some ARFS filters */ + mod_delayed_work(system_wq, &channel->filter_work, 0); +#endif + + /* There is no race here; although napi_disable() will + * only wait for napi_complete(), this isn't a problem + * since efx_nic_eventq_read_ack() will have no effect if + * interrupts have already been disabled. + */ + if (napi_complete_done(napi, spent)) + efx_nic_eventq_read_ack(channel); + } + + return spent; +} + +void efx_init_napi_channel(struct efx_channel *channel) +{ + struct efx_nic *efx = channel->efx; + + channel->napi_dev = efx->net_dev; + netif_napi_add(channel->napi_dev, &channel->napi_str, + efx_poll, napi_weight); +} + +void efx_init_napi(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) + efx_init_napi_channel(channel); +} + +void efx_fini_napi_channel(struct efx_channel *channel) +{ + if (channel->napi_dev) + netif_napi_del(&channel->napi_str); + + channel->napi_dev = NULL; +} + +void efx_fini_napi(struct efx_nic *efx) +{ + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) + efx_fini_napi_channel(channel); +} diff --git a/drivers/net/ethernet/sfc/efx_channels.h b/drivers/net/ethernet/sfc/efx_channels.h new file mode 100644 index 000000000000..8d7b8c4142d7 --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_channels.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_CHANNELS_H +#define EFX_CHANNELS_H + +int efx_probe_interrupts(struct efx_nic *efx); +void efx_remove_interrupts(struct efx_nic *efx); +int efx_soft_enable_interrupts(struct efx_nic *efx); +void efx_soft_disable_interrupts(struct efx_nic *efx); +int efx_enable_interrupts(struct efx_nic *efx); +void efx_disable_interrupts(struct efx_nic *efx); + +void efx_set_interrupt_affinity(struct efx_nic *efx); +void efx_clear_interrupt_affinity(struct efx_nic *efx); + +int efx_probe_eventq(struct efx_channel *channel); +int efx_init_eventq(struct efx_channel *channel); +void efx_start_eventq(struct efx_channel *channel); +void efx_stop_eventq(struct efx_channel *channel); +void efx_fini_eventq(struct efx_channel *channel); +void efx_remove_eventq(struct efx_channel *channel); + +struct efx_channel * +efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel); +int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries); +void efx_get_channel_name(struct efx_channel *channel, char *buf, size_t len); +void efx_set_channel_names(struct efx_nic *efx); +int efx_init_channels(struct efx_nic *efx); +int efx_probe_channels(struct efx_nic *efx); +int efx_set_channels(struct efx_nic *efx); +bool efx_default_channel_want_txqs(struct efx_channel *channel); +void efx_remove_channel(struct efx_channel *channel); +void efx_remove_channels(struct efx_nic *efx); +void efx_fini_channels(struct efx_nic *efx); +struct efx_channel *efx_copy_channel(const struct efx_channel *old_channel); +void efx_start_channels(struct efx_nic *efx); +void efx_stop_channels(struct efx_nic *efx); + +void efx_init_napi_channel(struct efx_channel *channel); +void efx_init_napi(struct efx_nic *efx); +void efx_fini_napi_channel(struct efx_channel *channel); +void efx_fini_napi(struct efx_nic *efx); + +int efx_channel_dummy_op_int(struct efx_channel *channel); +void efx_channel_dummy_op_void(struct efx_channel *channel); + +#endif diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c new file mode 100644 index 000000000000..ab0ce62f81c1 --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -0,0 +1,1102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "net_driver.h" +#include <linux/module.h> +#include <linux/netdevice.h> +#include "efx_common.h" +#include "efx_channels.h" +#include "efx.h" +#include "mcdi.h" +#include "selftest.h" +#include "rx_common.h" +#include "tx_common.h" +#include "nic.h" +#include "io.h" +#include "mcdi_pcol.h" + +static unsigned int debug = (NETIF_MSG_DRV | NETIF_MSG_PROBE | + NETIF_MSG_LINK | NETIF_MSG_IFDOWN | + NETIF_MSG_IFUP | NETIF_MSG_RX_ERR | + NETIF_MSG_TX_ERR | NETIF_MSG_HW); +module_param(debug, uint, 0); +MODULE_PARM_DESC(debug, "Bitmapped debugging message enable value"); + +/* This is the time (in jiffies) between invocations of the hardware + * monitor. + * On Falcon-based NICs, this will: + * - Check the on-board hardware monitor; + * - Poll the link state and reconfigure the hardware as necessary. + * On Siena-based NICs for power systems with EEH support, this will give EEH a + * chance to start. + */ +static unsigned int efx_monitor_interval = 1 * HZ; + +/* How often and how many times to poll for a reset while waiting for a + * BIST that another function started to complete. + */ +#define BIST_WAIT_DELAY_MS 100 +#define BIST_WAIT_DELAY_COUNT 100 + +/* Default stats update time */ +#define STATS_PERIOD_MS_DEFAULT 1000 + +const unsigned int efx_reset_type_max = RESET_TYPE_MAX; +const char *const efx_reset_type_names[] = { + [RESET_TYPE_INVISIBLE] = "INVISIBLE", + [RESET_TYPE_ALL] = "ALL", + [RESET_TYPE_RECOVER_OR_ALL] = "RECOVER_OR_ALL", + [RESET_TYPE_WORLD] = "WORLD", + [RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE", + [RESET_TYPE_DATAPATH] = "DATAPATH", + [RESET_TYPE_MC_BIST] = "MC_BIST", + [RESET_TYPE_DISABLE] = "DISABLE", + [RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG", + [RESET_TYPE_INT_ERROR] = "INT_ERROR", + [RESET_TYPE_DMA_ERROR] = "DMA_ERROR", + [RESET_TYPE_TX_SKIP] = "TX_SKIP", + [RESET_TYPE_MC_FAILURE] = "MC_FAILURE", + [RESET_TYPE_MCDI_TIMEOUT] = "MCDI_TIMEOUT (FLR)", +}; + +#define RESET_TYPE(type) \ + STRING_TABLE_LOOKUP(type, efx_reset_type) + +/* Loopback mode names (see LOOPBACK_MODE()) */ +const unsigned int efx_loopback_mode_max = LOOPBACK_MAX; +const char *const efx_loopback_mode_names[] = { + [LOOPBACK_NONE] = "NONE", + [LOOPBACK_DATA] = "DATAPATH", + [LOOPBACK_GMAC] = "GMAC", + [LOOPBACK_XGMII] = "XGMII", + [LOOPBACK_XGXS] = "XGXS", + [LOOPBACK_XAUI] = "XAUI", + [LOOPBACK_GMII] = "GMII", + [LOOPBACK_SGMII] = "SGMII", + [LOOPBACK_XGBR] = "XGBR", + [LOOPBACK_XFI] = "XFI", + [LOOPBACK_XAUI_FAR] = "XAUI_FAR", + [LOOPBACK_GMII_FAR] = "GMII_FAR", + [LOOPBACK_SGMII_FAR] = "SGMII_FAR", + [LOOPBACK_XFI_FAR] = "XFI_FAR", + [LOOPBACK_GPHY] = "GPHY", + [LOOPBACK_PHYXS] = "PHYXS", + [LOOPBACK_PCS] = "PCS", + [LOOPBACK_PMAPMD] = "PMA/PMD", + [LOOPBACK_XPORT] = "XPORT", + [LOOPBACK_XGMII_WS] = "XGMII_WS", + [LOOPBACK_XAUI_WS] = "XAUI_WS", + [LOOPBACK_XAUI_WS_FAR] = "XAUI_WS_FAR", + [LOOPBACK_XAUI_WS_NEAR] = "XAUI_WS_NEAR", + [LOOPBACK_GMII_WS] = "GMII_WS", + [LOOPBACK_XFI_WS] = "XFI_WS", + [LOOPBACK_XFI_WS_FAR] = "XFI_WS_FAR", + [LOOPBACK_PHYXS_WS] = "PHYXS_WS", +}; + +/* Reset workqueue. If any NIC has a hardware failure then a reset will be + * queued onto this work queue. This is not a per-nic work queue, because + * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised. + */ +static struct workqueue_struct *reset_workqueue; + +int efx_create_reset_workqueue(void) +{ + reset_workqueue = create_singlethread_workqueue("sfc_reset"); + if (!reset_workqueue) { + printk(KERN_ERR "Failed to create reset workqueue\n"); + return -ENOMEM; + } + + return 0; +} + +void efx_queue_reset_work(struct efx_nic *efx) +{ + queue_work(reset_workqueue, &efx->reset_work); +} + +void efx_flush_reset_workqueue(struct efx_nic *efx) +{ + cancel_work_sync(&efx->reset_work); +} + +void efx_destroy_reset_workqueue(void) +{ + if (reset_workqueue) { + destroy_workqueue(reset_workqueue); + reset_workqueue = NULL; + } +} + +/* We assume that efx->type->reconfigure_mac will always try to sync RX + * filters and therefore needs to read-lock the filter table against freeing + */ +void efx_mac_reconfigure(struct efx_nic *efx) +{ + if (efx->type->reconfigure_mac) { + down_read(&efx->filter_sem); + efx->type->reconfigure_mac(efx); + up_read(&efx->filter_sem); + } +} + +/* Asynchronous work item for changing MAC promiscuity and multicast + * hash. Avoid a drain/rx_ingress enable by reconfiguring the current + * MAC directly. + */ +static void efx_mac_work(struct work_struct *data) +{ + struct efx_nic *efx = container_of(data, struct efx_nic, mac_work); + + mutex_lock(&efx->mac_lock); + if (efx->port_enabled) + efx_mac_reconfigure(efx); + mutex_unlock(&efx->mac_lock); +} + +/* This ensures that the kernel is kept informed (via + * netif_carrier_on/off) of the link status, and also maintains the + * link status's stop on the port's TX queue. + */ +void efx_link_status_changed(struct efx_nic *efx) +{ + struct efx_link_state *link_state = &efx->link_state; + + /* SFC Bug 5356: A net_dev notifier is registered, so we must ensure + * that no events are triggered between unregister_netdev() and the + * driver unloading. A more general condition is that NETDEV_CHANGE + * can only be generated between NETDEV_UP and NETDEV_DOWN + */ + if (!netif_running(efx->net_dev)) + return; + + if (link_state->up != netif_carrier_ok(efx->net_dev)) { + efx->n_link_state_changes++; + + if (link_state->up) + netif_carrier_on(efx->net_dev); + else + netif_carrier_off(efx->net_dev); + } + + /* Status message for kernel log */ + if (link_state->up) + netif_info(efx, link, efx->net_dev, + "link up at %uMbps %s-duplex (MTU %d)\n", + link_state->speed, link_state->fd ? "full" : "half", + efx->net_dev->mtu); + else + netif_info(efx, link, efx->net_dev, "link down\n"); +} + +unsigned int efx_xdp_max_mtu(struct efx_nic *efx) +{ + /* The maximum MTU that we can fit in a single page, allowing for + * framing, overhead and XDP headroom. + */ + int overhead = EFX_MAX_FRAME_LEN(0) + sizeof(struct efx_rx_page_state) + + efx->rx_prefix_size + efx->type->rx_buffer_padding + + efx->rx_ip_align + XDP_PACKET_HEADROOM; + + return PAGE_SIZE - overhead; +} + +/* Context: process, rtnl_lock() held. */ +int efx_change_mtu(struct net_device *net_dev, int new_mtu) +{ + struct efx_nic *efx = netdev_priv(net_dev); + int rc; + + rc = efx_check_disabled(efx); + if (rc) + return rc; + + if (rtnl_dereference(efx->xdp_prog) && + new_mtu > efx_xdp_max_mtu(efx)) { + netif_err(efx, drv, efx->net_dev, + "Requested MTU of %d too big for XDP (max: %d)\n", + new_mtu, efx_xdp_max_mtu(efx)); + return -EINVAL; + } + + netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu); + + efx_device_detach_sync(efx); + efx_stop_all(efx); + + mutex_lock(&efx->mac_lock); + net_dev->mtu = new_mtu; + efx_mac_reconfigure(efx); + mutex_unlock(&efx->mac_lock); + + efx_start_all(efx); + efx_device_attach_if_not_resetting(efx); + return 0; +} + +/************************************************************************** + * + * Hardware monitor + * + **************************************************************************/ + +/* Run periodically off the general workqueue */ +static void efx_monitor(struct work_struct *data) +{ + struct efx_nic *efx = container_of(data, struct efx_nic, + monitor_work.work); + + netif_vdbg(efx, timer, efx->net_dev, + "hardware monitor executing on CPU %d\n", + raw_smp_processor_id()); + BUG_ON(efx->type->monitor == NULL); + + /* If the mac_lock is already held then it is likely a port + * reconfiguration is already in place, which will likely do + * most of the work of monitor() anyway. + */ + if (mutex_trylock(&efx->mac_lock)) { + if (efx->port_enabled && efx->type->monitor) + efx->type->monitor(efx); + mutex_unlock(&efx->mac_lock); + } + + efx_start_monitor(efx); +} + +void efx_start_monitor(struct efx_nic *efx) +{ + if (efx->type->monitor) + queue_delayed_work(efx->workqueue, &efx->monitor_work, + efx_monitor_interval); +} + +/************************************************************************** + * + * Event queue processing + * + *************************************************************************/ + +/* Channels are shutdown and reinitialised whilst the NIC is running + * to propagate configuration changes (mtu, checksum offload), or + * to clear hardware error conditions + */ +static void efx_start_datapath(struct efx_nic *efx) +{ + netdev_features_t old_features = efx->net_dev->features; + bool old_rx_scatter = efx->rx_scatter; + size_t rx_buf_len; + + /* Calculate the rx buffer allocation parameters required to + * support the current MTU, including padding for header + * alignment and overruns. + */ + efx->rx_dma_len = (efx->rx_prefix_size + + EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + + efx->type->rx_buffer_padding); + rx_buf_len = (sizeof(struct efx_rx_page_state) + XDP_PACKET_HEADROOM + + efx->rx_ip_align + efx->rx_dma_len); + if (rx_buf_len <= PAGE_SIZE) { + efx->rx_scatter = efx->type->always_rx_scatter; + efx->rx_buffer_order = 0; + } else if (efx->type->can_rx_scatter) { + BUILD_BUG_ON(EFX_RX_USR_BUF_SIZE % L1_CACHE_BYTES); + BUILD_BUG_ON(sizeof(struct efx_rx_page_state) + + 2 * ALIGN(NET_IP_ALIGN + EFX_RX_USR_BUF_SIZE, + EFX_RX_BUF_ALIGNMENT) > + PAGE_SIZE); + efx->rx_scatter = true; + efx->rx_dma_len = EFX_RX_USR_BUF_SIZE; + efx->rx_buffer_order = 0; + } else { + efx->rx_scatter = false; + efx->rx_buffer_order = get_order(rx_buf_len); + } + + efx_rx_config_page_split(efx); + if (efx->rx_buffer_order) + netif_dbg(efx, drv, efx->net_dev, + "RX buf len=%u; page order=%u batch=%u\n", + efx->rx_dma_len, efx->rx_buffer_order, + efx->rx_pages_per_batch); + else + netif_dbg(efx, drv, efx->net_dev, + "RX buf len=%u step=%u bpp=%u; page batch=%u\n", + efx->rx_dma_len, efx->rx_page_buf_step, + efx->rx_bufs_per_page, efx->rx_pages_per_batch); + + /* Restore previously fixed features in hw_features and remove + * features which are fixed now + */ + efx->net_dev->hw_features |= efx->net_dev->features; + efx->net_dev->hw_features &= ~efx->fixed_features; + efx->net_dev->features |= efx->fixed_features; + if (efx->net_dev->features != old_features) + netdev_features_change(efx->net_dev); + + /* RX filters may also have scatter-enabled flags */ + if ((efx->rx_scatter != old_rx_scatter) && + efx->type->filter_update_rx_scatter) + efx->type->filter_update_rx_scatter(efx); + + /* We must keep at least one descriptor in a TX ring empty. + * We could avoid this when the queue size does not exactly + * match the hardware ring size, but it's not that important. + * Therefore we stop the queue when one more skb might fill + * the ring completely. We wake it when half way back to + * empty. + */ + efx->txq_stop_thresh = efx->txq_entries - efx_tx_max_skb_descs(efx); + efx->txq_wake_thresh = efx->txq_stop_thresh / 2; + + /* Initialise the channels */ + efx_start_channels(efx); + + efx_ptp_start_datapath(efx); + + if (netif_device_present(efx->net_dev)) + netif_tx_wake_all_queues(efx->net_dev); +} + +static void efx_stop_datapath(struct efx_nic *efx) +{ + EFX_ASSERT_RESET_SERIALISED(efx); + BUG_ON(efx->port_enabled); + + efx_ptp_stop_datapath(efx); + + efx_stop_channels(efx); +} + +/************************************************************************** + * + * Port handling + * + **************************************************************************/ + +static void efx_start_port(struct efx_nic *efx) +{ + netif_dbg(efx, ifup, efx->net_dev, "start port\n"); + BUG_ON(efx->port_enabled); + + mutex_lock(&efx->mac_lock); + efx->port_enabled = true; + + /* Ensure MAC ingress/egress is enabled */ + efx_mac_reconfigure(efx); + + mutex_unlock(&efx->mac_lock); +} + +/* Cancel work for MAC reconfiguration, periodic hardware monitoring + * and the async self-test, wait for them to finish and prevent them + * being scheduled again. This doesn't cover online resets, which + * should only be cancelled when removing the device. + */ +static void efx_stop_port(struct efx_nic *efx) +{ + netif_dbg(efx, ifdown, efx->net_dev, "stop port\n"); + + EFX_ASSERT_RESET_SERIALISED(efx); + + mutex_lock(&efx->mac_lock); + efx->port_enabled = false; + mutex_unlock(&efx->mac_lock); + + /* Serialise against efx_set_multicast_list() */ + netif_addr_lock_bh(efx->net_dev); + netif_addr_unlock_bh(efx->net_dev); + + cancel_delayed_work_sync(&efx->monitor_work); + efx_selftest_async_cancel(efx); + cancel_work_sync(&efx->mac_work); +} + +/* If the interface is supposed to be running but is not, start + * the hardware and software data path, regular activity for the port + * (MAC statistics, link polling, etc.) and schedule the port to be + * reconfigured. Interrupts must already be enabled. This function + * is safe to call multiple times, so long as the NIC is not disabled. + * Requires the RTNL lock. + */ +void efx_start_all(struct efx_nic *efx) +{ + EFX_ASSERT_RESET_SERIALISED(efx); + BUG_ON(efx->state == STATE_DISABLED); + + /* Check that it is appropriate to restart the interface. All + * of these flags are safe to read under just the rtnl lock + */ + if (efx->port_enabled || !netif_running(efx->net_dev) || + efx->reset_pending) + return; + + efx_start_port(efx); + efx_start_datapath(efx); + + /* Start the hardware monitor if there is one */ + efx_start_monitor(efx); + + /* Link state detection is normally event-driven; we have + * to poll now because we could have missed a change + */ + mutex_lock(&efx->mac_lock); + if (efx->phy_op->poll(efx)) + efx_link_status_changed(efx); + mutex_unlock(&efx->mac_lock); + + if (efx->type->start_stats) { + efx->type->start_stats(efx); + efx->type->pull_stats(efx); + spin_lock_bh(&efx->stats_lock); + efx->type->update_stats(efx, NULL, NULL); + spin_unlock_bh(&efx->stats_lock); + } +} + +/* Quiesce the hardware and software data path, and regular activity + * for the port without bringing the link down. Safe to call multiple + * times with the NIC in almost any state, but interrupts should be + * enabled. Requires the RTNL lock. + */ +void efx_stop_all(struct efx_nic *efx) +{ + EFX_ASSERT_RESET_SERIALISED(efx); + + /* port_enabled can be read safely under the rtnl lock */ + if (!efx->port_enabled) + return; + + if (efx->type->update_stats) { + /* update stats before we go down so we can accurately count + * rx_nodesc_drops + */ + efx->type->pull_stats(efx); + spin_lock_bh(&efx->stats_lock); + efx->type->update_stats(efx, NULL, NULL); + spin_unlock_bh(&efx->stats_lock); + efx->type->stop_stats(efx); + } + + efx_stop_port(efx); + + /* Stop the kernel transmit interface. This is only valid if + * the device is stopped or detached; otherwise the watchdog + * may fire immediately. + */ + WARN_ON(netif_running(efx->net_dev) && + netif_device_present(efx->net_dev)); + netif_tx_disable(efx->net_dev); + + efx_stop_datapath(efx); +} + +/* Context: process, dev_base_lock or RTNL held, non-blocking. */ +void efx_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + spin_lock_bh(&efx->stats_lock); + efx->type->update_stats(efx, NULL, stats); + spin_unlock_bh(&efx->stats_lock); +} + +/* Push loopback/power/transmit disable settings to the PHY, and reconfigure + * the MAC appropriately. All other PHY configuration changes are pushed + * through phy_op->set_settings(), and pushed asynchronously to the MAC + * through efx_monitor(). + * + * Callers must hold the mac_lock + */ +int __efx_reconfigure_port(struct efx_nic *efx) +{ + enum efx_phy_mode phy_mode; + int rc = 0; + + WARN_ON(!mutex_is_locked(&efx->mac_lock)); + + /* Disable PHY transmit in mac level loopbacks */ + phy_mode = efx->phy_mode; + if (LOOPBACK_INTERNAL(efx)) + efx->phy_mode |= PHY_MODE_TX_DISABLED; + else + efx->phy_mode &= ~PHY_MODE_TX_DISABLED; + + if (efx->type->reconfigure_port) + rc = efx->type->reconfigure_port(efx); + + if (rc) + efx->phy_mode = phy_mode; + + return rc; +} + +/* Reinitialise the MAC to pick up new PHY settings, even if the port is + * disabled. + */ +int efx_reconfigure_port(struct efx_nic *efx) +{ + int rc; + + EFX_ASSERT_RESET_SERIALISED(efx); + + mutex_lock(&efx->mac_lock); + rc = __efx_reconfigure_port(efx); + mutex_unlock(&efx->mac_lock); + + return rc; +} + +/************************************************************************** + * + * Device reset and suspend + * + **************************************************************************/ + +static void efx_wait_for_bist_end(struct efx_nic *efx) +{ + int i; + + for (i = 0; i < BIST_WAIT_DELAY_COUNT; ++i) { + if (efx_mcdi_poll_reboot(efx)) + goto out; + msleep(BIST_WAIT_DELAY_MS); + } + + netif_err(efx, drv, efx->net_dev, "Warning: No MC reboot after BIST mode\n"); +out: + /* Either way unset the BIST flag. If we found no reboot we probably + * won't recover, but we should try. + */ + efx->mc_bist_for_other_fn = false; +} + +/* Try recovery mechanisms. + * For now only EEH is supported. + * Returns 0 if the recovery mechanisms are unsuccessful. + * Returns a non-zero value otherwise. + */ +int efx_try_recovery(struct efx_nic *efx) +{ +#ifdef CONFIG_EEH + /* A PCI error can occur and not be seen by EEH because nothing + * happens on the PCI bus. In this case the driver may fail and + * schedule a 'recover or reset', leading to this recovery handler. + * Manually call the eeh failure check function. + */ + struct eeh_dev *eehdev = pci_dev_to_eeh_dev(efx->pci_dev); + if (eeh_dev_check_failure(eehdev)) { + /* The EEH mechanisms will handle the error and reset the + * device if necessary. + */ + return 1; + } +#endif + return 0; +} + +/* Tears down the entire software state and most of the hardware state + * before reset. + */ +void efx_reset_down(struct efx_nic *efx, enum reset_type method) +{ + EFX_ASSERT_RESET_SERIALISED(efx); + + if (method == RESET_TYPE_MCDI_TIMEOUT) + efx->type->prepare_flr(efx); + + efx_stop_all(efx); + efx_disable_interrupts(efx); + + mutex_lock(&efx->mac_lock); + down_write(&efx->filter_sem); + mutex_lock(&efx->rss_lock); + if (efx->port_initialized && method != RESET_TYPE_INVISIBLE && + method != RESET_TYPE_DATAPATH) + efx->phy_op->fini(efx); + efx->type->fini(efx); +} + +/* This function will always ensure that the locks acquired in + * efx_reset_down() are released. A failure return code indicates + * that we were unable to reinitialise the hardware, and the + * driver should be disabled. If ok is false, then the rx and tx + * engines are not restarted, pending a RESET_DISABLE. + */ +int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok) +{ + int rc; + + EFX_ASSERT_RESET_SERIALISED(efx); + + if (method == RESET_TYPE_MCDI_TIMEOUT) + efx->type->finish_flr(efx); + + /* Ensure that SRAM is initialised even if we're disabling the device */ + rc = efx->type->init(efx); + if (rc) { + netif_err(efx, drv, efx->net_dev, "failed to initialise NIC\n"); + goto fail; + } + + if (!ok) + goto fail; + + if (efx->port_initialized && method != RESET_TYPE_INVISIBLE && + method != RESET_TYPE_DATAPATH) { + rc = efx->phy_op->init(efx); + if (rc) + goto fail; + rc = efx->phy_op->reconfigure(efx); + if (rc && rc != -EPERM) + netif_err(efx, drv, efx->net_dev, + "could not restore PHY settings\n"); + } + + rc = efx_enable_interrupts(efx); + if (rc) + goto fail; + +#ifdef CONFIG_SFC_SRIOV + rc = efx->type->vswitching_restore(efx); + if (rc) /* not fatal; the PF will still work fine */ + netif_warn(efx, probe, efx->net_dev, + "failed to restore vswitching rc=%d;" + " VFs may not function\n", rc); +#endif + + if (efx->type->rx_restore_rss_contexts) + efx->type->rx_restore_rss_contexts(efx); + mutex_unlock(&efx->rss_lock); + efx->type->filter_table_restore(efx); + up_write(&efx->filter_sem); + if (efx->type->sriov_reset) + efx->type->sriov_reset(efx); + + mutex_unlock(&efx->mac_lock); + + efx_start_all(efx); + + if (efx->type->udp_tnl_push_ports) + efx->type->udp_tnl_push_ports(efx); + + return 0; + +fail: + efx->port_initialized = false; + + mutex_unlock(&efx->rss_lock); + up_write(&efx->filter_sem); + mutex_unlock(&efx->mac_lock); + + return rc; +} + +/* Reset the NIC using the specified method. Note that the reset may + * fail, in which case the card will be left in an unusable state. + * + * Caller must hold the rtnl_lock. + */ +int efx_reset(struct efx_nic *efx, enum reset_type method) +{ + bool disabled; + int rc, rc2; + + netif_info(efx, drv, efx->net_dev, "resetting (%s)\n", + RESET_TYPE(method)); + + efx_device_detach_sync(efx); + efx_reset_down(efx, method); + + rc = efx->type->reset(efx, method); + if (rc) { + netif_err(efx, drv, efx->net_dev, "failed to reset hardware\n"); + goto out; + } + + /* Clear flags for the scopes we covered. We assume the NIC and + * driver are now quiescent so that there is no race here. + */ + if (method < RESET_TYPE_MAX_METHOD) + efx->reset_pending &= -(1 << (method + 1)); + else /* it doesn't fit into the well-ordered scope hierarchy */ + __clear_bit(method, &efx->reset_pending); + + /* Reinitialise bus-mastering, which may have been turned off before + * the reset was scheduled. This is still appropriate, even in the + * RESET_TYPE_DISABLE since this driver generally assumes the hardware + * can respond to requests. + */ + pci_set_master(efx->pci_dev); + +out: + /* Leave device stopped if necessary */ + disabled = rc || + method == RESET_TYPE_DISABLE || + method == RESET_TYPE_RECOVER_OR_DISABLE; + rc2 = efx_reset_up(efx, method, !disabled); + if (rc2) { + disabled = true; + if (!rc) + rc = rc2; + } + + if (disabled) { + dev_close(efx->net_dev); + netif_err(efx, drv, efx->net_dev, "has been disabled\n"); + efx->state = STATE_DISABLED; + } else { + netif_dbg(efx, drv, efx->net_dev, "reset complete\n"); + efx_device_attach_if_not_resetting(efx); + } + return rc; +} + +/* The worker thread exists so that code that cannot sleep can + * schedule a reset for later. + */ +static void efx_reset_work(struct work_struct *data) +{ + struct efx_nic *efx = container_of(data, struct efx_nic, reset_work); + unsigned long pending; + enum reset_type method; + + pending = READ_ONCE(efx->reset_pending); + method = fls(pending) - 1; + + if (method == RESET_TYPE_MC_BIST) + efx_wait_for_bist_end(efx); + + if ((method == RESET_TYPE_RECOVER_OR_DISABLE || + method == RESET_TYPE_RECOVER_OR_ALL) && + efx_try_recovery(efx)) + return; + + if (!pending) + return; + + rtnl_lock(); + + /* We checked the state in efx_schedule_reset() but it may + * have changed by now. Now that we have the RTNL lock, + * it cannot change again. + */ + if (efx->state == STATE_READY) + (void)efx_reset(efx, method); + + rtnl_unlock(); +} + +void efx_schedule_reset(struct efx_nic *efx, enum reset_type type) +{ + enum reset_type method; + + if (efx->state == STATE_RECOVERY) { + netif_dbg(efx, drv, efx->net_dev, + "recovering: skip scheduling %s reset\n", + RESET_TYPE(type)); + return; + } + + switch (type) { + case RESET_TYPE_INVISIBLE: + case RESET_TYPE_ALL: + case RESET_TYPE_RECOVER_OR_ALL: + case RESET_TYPE_WORLD: + case RESET_TYPE_DISABLE: + case RESET_TYPE_RECOVER_OR_DISABLE: + case RESET_TYPE_DATAPATH: + case RESET_TYPE_MC_BIST: + case RESET_TYPE_MCDI_TIMEOUT: + method = type; + netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n", + RESET_TYPE(method)); + break; + default: + method = efx->type->map_reset_reason(type); + netif_dbg(efx, drv, efx->net_dev, + "scheduling %s reset for %s\n", + RESET_TYPE(method), RESET_TYPE(type)); + break; + } + + set_bit(method, &efx->reset_pending); + smp_mb(); /* ensure we change reset_pending before checking state */ + + /* If we're not READY then just leave the flags set as the cue + * to abort probing or reschedule the reset later. + */ + if (READ_ONCE(efx->state) != STATE_READY) + return; + + /* efx_process_channel() will no longer read events once a + * reset is scheduled. So switch back to poll'd MCDI completions. + */ + efx_mcdi_mode_poll(efx); + + efx_queue_reset_work(efx); +} + +/************************************************************************** + * + * Dummy PHY/MAC operations + * + * Can be used for some unimplemented operations + * Needed so all function pointers are valid and do not have to be tested + * before use + * + **************************************************************************/ +int efx_port_dummy_op_int(struct efx_nic *efx) +{ + return 0; +} +void efx_port_dummy_op_void(struct efx_nic *efx) {} + +static bool efx_port_dummy_op_poll(struct efx_nic *efx) +{ + return false; +} + +static const struct efx_phy_operations efx_dummy_phy_operations = { + .init = efx_port_dummy_op_int, + .reconfigure = efx_port_dummy_op_int, + .poll = efx_port_dummy_op_poll, + .fini = efx_port_dummy_op_void, +}; + +/************************************************************************** + * + * Data housekeeping + * + **************************************************************************/ + +/* This zeroes out and then fills in the invariants in a struct + * efx_nic (including all sub-structures). + */ +int efx_init_struct(struct efx_nic *efx, + struct pci_dev *pci_dev, struct net_device *net_dev) +{ + int rc = -ENOMEM; + + /* Initialise common structures */ + INIT_LIST_HEAD(&efx->node); + INIT_LIST_HEAD(&efx->secondary_list); + spin_lock_init(&efx->biu_lock); +#ifdef CONFIG_SFC_MTD + INIT_LIST_HEAD(&efx->mtd_list); +#endif + INIT_WORK(&efx->reset_work, efx_reset_work); + INIT_DELAYED_WORK(&efx->monitor_work, efx_monitor); + efx_selftest_async_init(efx); + efx->pci_dev = pci_dev; + efx->msg_enable = debug; + efx->state = STATE_UNINIT; + strlcpy(efx->name, pci_name(pci_dev), sizeof(efx->name)); + + efx->net_dev = net_dev; + efx->rx_prefix_size = efx->type->rx_prefix_size; + efx->rx_ip_align = + NET_IP_ALIGN ? (efx->rx_prefix_size + NET_IP_ALIGN) % 4 : 0; + efx->rx_packet_hash_offset = + efx->type->rx_hash_offset - efx->type->rx_prefix_size; + efx->rx_packet_ts_offset = + efx->type->rx_ts_offset - efx->type->rx_prefix_size; + INIT_LIST_HEAD(&efx->rss_context.list); + mutex_init(&efx->rss_lock); + spin_lock_init(&efx->stats_lock); + efx->vi_stride = EFX_DEFAULT_VI_STRIDE; + efx->num_mac_stats = MC_CMD_MAC_NSTATS; + BUILD_BUG_ON(MC_CMD_MAC_NSTATS - 1 != MC_CMD_MAC_GENERATION_END); + mutex_init(&efx->mac_lock); +#ifdef CONFIG_RFS_ACCEL + mutex_init(&efx->rps_mutex); + spin_lock_init(&efx->rps_hash_lock); + /* Failure to allocate is not fatal, but may degrade ARFS performance */ + efx->rps_hash_table = kcalloc(EFX_ARFS_HASH_TABLE_SIZE, + sizeof(*efx->rps_hash_table), GFP_KERNEL); +#endif + efx->phy_op = &efx_dummy_phy_operations; + efx->mdio.dev = net_dev; + INIT_WORK(&efx->mac_work, efx_mac_work); + init_waitqueue_head(&efx->flush_wq); + + rc = efx_init_channels(efx); + if (rc) + goto fail; + + /* Would be good to use the net_dev name, but we're too early */ + snprintf(efx->workqueue_name, sizeof(efx->workqueue_name), "sfc%s", + pci_name(pci_dev)); + efx->workqueue = create_singlethread_workqueue(efx->workqueue_name); + if (!efx->workqueue) { + rc = -ENOMEM; + goto fail; + } + + return 0; + +fail: + efx_fini_struct(efx); + return rc; +} + +void efx_fini_struct(struct efx_nic *efx) +{ +#ifdef CONFIG_RFS_ACCEL + kfree(efx->rps_hash_table); +#endif + + efx_fini_channels(efx); + + kfree(efx->vpd_sn); + + if (efx->workqueue) { + destroy_workqueue(efx->workqueue); + efx->workqueue = NULL; + } +} + +/* This configures the PCI device to enable I/O and DMA. */ +int efx_init_io(struct efx_nic *efx, int bar, dma_addr_t dma_mask, + unsigned int mem_map_size) +{ + struct pci_dev *pci_dev = efx->pci_dev; + int rc; + + netif_dbg(efx, probe, efx->net_dev, "initialising I/O\n"); + + rc = pci_enable_device(pci_dev); + if (rc) { + netif_err(efx, probe, efx->net_dev, + "failed to enable PCI device\n"); + goto fail1; + } + + pci_set_master(pci_dev); + + /* Set the PCI DMA mask. Try all possibilities from our + * genuine mask down to 32 bits, because some architectures + * (e.g. x86_64 with iommu_sac_force set) will allow 40 bit + * masks event though they reject 46 bit masks. + */ + while (dma_mask > 0x7fffffffUL) { + rc = dma_set_mask_and_coherent(&pci_dev->dev, dma_mask); + if (rc == 0) + break; + dma_mask >>= 1; + } + if (rc) { + netif_err(efx, probe, efx->net_dev, + "could not find a suitable DMA mask\n"); + goto fail2; + } + netif_dbg(efx, probe, efx->net_dev, + "using DMA mask %llx\n", (unsigned long long)dma_mask); + + efx->membase_phys = pci_resource_start(efx->pci_dev, bar); + if (!efx->membase_phys) { + netif_err(efx, probe, efx->net_dev, + "ERROR: No BAR%d mapping from the BIOS. " + "Try pci=realloc on the kernel command line\n", bar); + rc = -ENODEV; + goto fail3; + } + + rc = pci_request_region(pci_dev, bar, "sfc"); + if (rc) { + netif_err(efx, probe, efx->net_dev, + "request for memory BAR failed\n"); + rc = -EIO; + goto fail3; + } + + efx->membase = ioremap_nocache(efx->membase_phys, mem_map_size); + if (!efx->membase) { + netif_err(efx, probe, efx->net_dev, + "could not map memory BAR at %llx+%x\n", + (unsigned long long)efx->membase_phys, mem_map_size); + rc = -ENOMEM; + goto fail4; + } + netif_dbg(efx, probe, efx->net_dev, + "memory BAR at %llx+%x (virtual %p)\n", + (unsigned long long)efx->membase_phys, mem_map_size, + efx->membase); + + return 0; + +fail4: + pci_release_region(efx->pci_dev, bar); +fail3: + efx->membase_phys = 0; +fail2: + pci_disable_device(efx->pci_dev); +fail1: + return rc; +} + +void efx_fini_io(struct efx_nic *efx, int bar) +{ + netif_dbg(efx, drv, efx->net_dev, "shutting down I/O\n"); + + if (efx->membase) { + iounmap(efx->membase); + efx->membase = NULL; + } + + if (efx->membase_phys) { + pci_release_region(efx->pci_dev, bar); + efx->membase_phys = 0; + } + + /* Don't disable bus-mastering if VFs are assigned */ + if (!pci_vfs_assigned(efx->pci_dev)) + pci_disable_device(efx->pci_dev); +} + +#ifdef CONFIG_SFC_MCDI_LOGGING +static ssize_t show_mcdi_log(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct efx_nic *efx = dev_get_drvdata(dev); + struct efx_mcdi_iface *mcdi = efx_mcdi(efx); + + return scnprintf(buf, PAGE_SIZE, "%d\n", mcdi->logging_enabled); +} + +static ssize_t set_mcdi_log(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct efx_nic *efx = dev_get_drvdata(dev); + struct efx_mcdi_iface *mcdi = efx_mcdi(efx); + bool enable = count > 0 && *buf != '0'; + + mcdi->logging_enabled = enable; + return count; +} + +static DEVICE_ATTR(mcdi_logging, 0644, show_mcdi_log, set_mcdi_log); + +void efx_init_mcdi_logging(struct efx_nic *efx) +{ + int rc = device_create_file(&efx->pci_dev->dev, &dev_attr_mcdi_logging); + + if (rc) { + netif_warn(efx, drv, efx->net_dev, + "failed to init net dev attributes\n"); + } +} + +void efx_fini_mcdi_logging(struct efx_nic *efx) +{ + device_remove_file(&efx->pci_dev->dev, &dev_attr_mcdi_logging); +} +#endif diff --git a/drivers/net/ethernet/sfc/efx_common.h b/drivers/net/ethernet/sfc/efx_common.h new file mode 100644 index 000000000000..fa2fc681e7f9 --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_common.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_COMMON_H +#define EFX_COMMON_H + +int efx_init_io(struct efx_nic *efx, int bar, dma_addr_t dma_mask, + unsigned int mem_map_size); +void efx_fini_io(struct efx_nic *efx, int bar); +int efx_init_struct(struct efx_nic *efx, struct pci_dev *pci_dev, + struct net_device *net_dev); +void efx_fini_struct(struct efx_nic *efx); + +void efx_start_all(struct efx_nic *efx); +void efx_stop_all(struct efx_nic *efx); + +void efx_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats); + +int efx_create_reset_workqueue(void); +void efx_queue_reset_work(struct efx_nic *efx); +void efx_flush_reset_workqueue(struct efx_nic *efx); +void efx_destroy_reset_workqueue(void); + +void efx_start_monitor(struct efx_nic *efx); + +int __efx_reconfigure_port(struct efx_nic *efx); +int efx_reconfigure_port(struct efx_nic *efx); + +#define EFX_ASSERT_RESET_SERIALISED(efx) \ + do { \ + if ((efx->state == STATE_READY) || \ + (efx->state == STATE_RECOVERY) || \ + (efx->state == STATE_DISABLED)) \ + ASSERT_RTNL(); \ + } while (0) + +int efx_try_recovery(struct efx_nic *efx); +void efx_reset_down(struct efx_nic *efx, enum reset_type method); +int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok); +int efx_reset(struct efx_nic *efx, enum reset_type method); +void efx_schedule_reset(struct efx_nic *efx, enum reset_type type); + +static inline int efx_check_disabled(struct efx_nic *efx) +{ + if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) { + netif_err(efx, drv, efx->net_dev, + "device is disabled due to earlier errors\n"); + return -EIO; + } + return 0; +} + +#ifdef CONFIG_SFC_MCDI_LOGGING +void efx_init_mcdi_logging(struct efx_nic *efx); +void efx_fini_mcdi_logging(struct efx_nic *efx); +#else +static inline void efx_init_mcdi_logging(struct efx_nic *efx) {} +static inline void efx_fini_mcdi_logging(struct efx_nic *efx) {} +#endif + +void efx_mac_reconfigure(struct efx_nic *efx); +void efx_link_status_changed(struct efx_nic *efx); +unsigned int efx_xdp_max_mtu(struct efx_nic *efx); +int efx_change_mtu(struct net_device *net_dev, int new_mtu); + +#endif diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index b31032da4bcb..993b5769525b 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -13,92 +13,13 @@ #include "workarounds.h" #include "selftest.h" #include "efx.h" +#include "efx_channels.h" +#include "rx_common.h" +#include "tx_common.h" +#include "ethtool_common.h" #include "filter.h" #include "nic.h" -struct efx_sw_stat_desc { - const char *name; - enum { - EFX_ETHTOOL_STAT_SOURCE_nic, - EFX_ETHTOOL_STAT_SOURCE_channel, - EFX_ETHTOOL_STAT_SOURCE_tx_queue - } source; - unsigned offset; - u64(*get_stat) (void *field); /* Reader function */ -}; - -/* Initialiser for a struct efx_sw_stat_desc with type-checking */ -#define EFX_ETHTOOL_STAT(stat_name, source_name, field, field_type, \ - get_stat_function) { \ - .name = #stat_name, \ - .source = EFX_ETHTOOL_STAT_SOURCE_##source_name, \ - .offset = ((((field_type *) 0) == \ - &((struct efx_##source_name *)0)->field) ? \ - offsetof(struct efx_##source_name, field) : \ - offsetof(struct efx_##source_name, field)), \ - .get_stat = get_stat_function, \ -} - -static u64 efx_get_uint_stat(void *field) -{ - return *(unsigned int *)field; -} - -static u64 efx_get_atomic_stat(void *field) -{ - return atomic_read((atomic_t *) field); -} - -#define EFX_ETHTOOL_ATOMIC_NIC_ERROR_STAT(field) \ - EFX_ETHTOOL_STAT(field, nic, field, \ - atomic_t, efx_get_atomic_stat) - -#define EFX_ETHTOOL_UINT_CHANNEL_STAT(field) \ - EFX_ETHTOOL_STAT(field, channel, n_##field, \ - unsigned int, efx_get_uint_stat) -#define EFX_ETHTOOL_UINT_CHANNEL_STAT_NO_N(field) \ - EFX_ETHTOOL_STAT(field, channel, field, \ - unsigned int, efx_get_uint_stat) - -#define EFX_ETHTOOL_UINT_TXQ_STAT(field) \ - EFX_ETHTOOL_STAT(tx_##field, tx_queue, field, \ - unsigned int, efx_get_uint_stat) - -static const struct efx_sw_stat_desc efx_sw_stat_desc[] = { - EFX_ETHTOOL_UINT_TXQ_STAT(merge_events), - EFX_ETHTOOL_UINT_TXQ_STAT(tso_bursts), - EFX_ETHTOOL_UINT_TXQ_STAT(tso_long_headers), - EFX_ETHTOOL_UINT_TXQ_STAT(tso_packets), - EFX_ETHTOOL_UINT_TXQ_STAT(tso_fallbacks), - EFX_ETHTOOL_UINT_TXQ_STAT(pushes), - EFX_ETHTOOL_UINT_TXQ_STAT(pio_packets), - EFX_ETHTOOL_UINT_TXQ_STAT(cb_packets), - EFX_ETHTOOL_ATOMIC_NIC_ERROR_STAT(rx_reset), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tobe_disc), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_ip_hdr_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_inner_ip_hdr_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_inner_tcp_udp_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_outer_ip_hdr_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_outer_tcp_udp_chksum_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_eth_crc_err), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_events), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_packets), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_drops), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_bad_drops), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_tx), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_redirect), -#ifdef CONFIG_RFS_ACCEL - EFX_ETHTOOL_UINT_CHANNEL_STAT_NO_N(rfs_filter_count), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rfs_succeeded), - EFX_ETHTOOL_UINT_CHANNEL_STAT(rfs_failed), -#endif -}; - -#define EFX_ETHTOOL_SW_STAT_COUNT ARRAY_SIZE(efx_sw_stat_desc) - #define EFX_ETHTOOL_EEPROM_MAGIC 0xEFAB /************************************************************************** @@ -185,18 +106,6 @@ efx_ethtool_set_link_ksettings(struct net_device *net_dev, return rc; } -static void efx_ethtool_get_drvinfo(struct net_device *net_dev, - struct ethtool_drvinfo *info) -{ - struct efx_nic *efx = netdev_priv(net_dev); - - strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); - strlcpy(info->version, EFX_DRIVER_VERSION, sizeof(info->version)); - efx_mcdi_print_fwver(efx, info->fw_version, - sizeof(info->fw_version)); - strlcpy(info->bus_info, pci_name(efx->pci_dev), sizeof(info->bus_info)); -} - static int efx_ethtool_get_regs_len(struct net_device *net_dev) { return efx_nic_get_regs_len(netdev_priv(net_dev)); @@ -211,341 +120,6 @@ static void efx_ethtool_get_regs(struct net_device *net_dev, efx_nic_get_regs(efx, buf); } -static u32 efx_ethtool_get_msglevel(struct net_device *net_dev) -{ - struct efx_nic *efx = netdev_priv(net_dev); - return efx->msg_enable; -} - -static void efx_ethtool_set_msglevel(struct net_device *net_dev, u32 msg_enable) -{ - struct efx_nic *efx = netdev_priv(net_dev); - efx->msg_enable = msg_enable; -} - -/** - * efx_fill_test - fill in an individual self-test entry - * @test_index: Index of the test - * @strings: Ethtool strings, or %NULL - * @data: Ethtool test results, or %NULL - * @test: Pointer to test result (used only if data != %NULL) - * @unit_format: Unit name format (e.g. "chan\%d") - * @unit_id: Unit id (e.g. 0 for "chan0") - * @test_format: Test name format (e.g. "loopback.\%s.tx.sent") - * @test_id: Test id (e.g. "PHYXS" for "loopback.PHYXS.tx_sent") - * - * Fill in an individual self-test entry. - */ -static void efx_fill_test(unsigned int test_index, u8 *strings, u64 *data, - int *test, const char *unit_format, int unit_id, - const char *test_format, const char *test_id) -{ - char unit_str[ETH_GSTRING_LEN], test_str[ETH_GSTRING_LEN]; - - /* Fill data value, if applicable */ - if (data) - data[test_index] = *test; - - /* Fill string, if applicable */ - if (strings) { - if (strchr(unit_format, '%')) - snprintf(unit_str, sizeof(unit_str), - unit_format, unit_id); - else - strcpy(unit_str, unit_format); - snprintf(test_str, sizeof(test_str), test_format, test_id); - snprintf(strings + test_index * ETH_GSTRING_LEN, - ETH_GSTRING_LEN, - "%-6s %-24s", unit_str, test_str); - } -} - -#define EFX_CHANNEL_NAME(_channel) "chan%d", _channel->channel -#define EFX_TX_QUEUE_NAME(_tx_queue) "txq%d", _tx_queue->queue -#define EFX_RX_QUEUE_NAME(_rx_queue) "rxq%d", _rx_queue->queue -#define EFX_LOOPBACK_NAME(_mode, _counter) \ - "loopback.%s." _counter, STRING_TABLE_LOOKUP(_mode, efx_loopback_mode) - -/** - * efx_fill_loopback_test - fill in a block of loopback self-test entries - * @efx: Efx NIC - * @lb_tests: Efx loopback self-test results structure - * @mode: Loopback test mode - * @test_index: Starting index of the test - * @strings: Ethtool strings, or %NULL - * @data: Ethtool test results, or %NULL - * - * Fill in a block of loopback self-test entries. Return new test - * index. - */ -static int efx_fill_loopback_test(struct efx_nic *efx, - struct efx_loopback_self_tests *lb_tests, - enum efx_loopback_mode mode, - unsigned int test_index, - u8 *strings, u64 *data) -{ - struct efx_channel *channel = - efx_get_channel(efx, efx->tx_channel_offset); - struct efx_tx_queue *tx_queue; - - efx_for_each_channel_tx_queue(tx_queue, channel) { - efx_fill_test(test_index++, strings, data, - &lb_tests->tx_sent[tx_queue->queue], - EFX_TX_QUEUE_NAME(tx_queue), - EFX_LOOPBACK_NAME(mode, "tx_sent")); - efx_fill_test(test_index++, strings, data, - &lb_tests->tx_done[tx_queue->queue], - EFX_TX_QUEUE_NAME(tx_queue), - EFX_LOOPBACK_NAME(mode, "tx_done")); - } - efx_fill_test(test_index++, strings, data, - &lb_tests->rx_good, - "rx", 0, - EFX_LOOPBACK_NAME(mode, "rx_good")); - efx_fill_test(test_index++, strings, data, - &lb_tests->rx_bad, - "rx", 0, - EFX_LOOPBACK_NAME(mode, "rx_bad")); - - return test_index; -} - -/** - * efx_ethtool_fill_self_tests - get self-test details - * @efx: Efx NIC - * @tests: Efx self-test results structure, or %NULL - * @strings: Ethtool strings, or %NULL - * @data: Ethtool test results, or %NULL - * - * Get self-test number of strings, strings, and/or test results. - * Return number of strings (== number of test results). - * - * The reason for merging these three functions is to make sure that - * they can never be inconsistent. - */ -static int efx_ethtool_fill_self_tests(struct efx_nic *efx, - struct efx_self_tests *tests, - u8 *strings, u64 *data) -{ - struct efx_channel *channel; - unsigned int n = 0, i; - enum efx_loopback_mode mode; - - efx_fill_test(n++, strings, data, &tests->phy_alive, - "phy", 0, "alive", NULL); - efx_fill_test(n++, strings, data, &tests->nvram, - "core", 0, "nvram", NULL); - efx_fill_test(n++, strings, data, &tests->interrupt, - "core", 0, "interrupt", NULL); - - /* Event queues */ - efx_for_each_channel(channel, efx) { - efx_fill_test(n++, strings, data, - &tests->eventq_dma[channel->channel], - EFX_CHANNEL_NAME(channel), - "eventq.dma", NULL); - efx_fill_test(n++, strings, data, - &tests->eventq_int[channel->channel], - EFX_CHANNEL_NAME(channel), - "eventq.int", NULL); - } - - efx_fill_test(n++, strings, data, &tests->memory, - "core", 0, "memory", NULL); - efx_fill_test(n++, strings, data, &tests->registers, - "core", 0, "registers", NULL); - - if (efx->phy_op->run_tests != NULL) { - EFX_WARN_ON_PARANOID(efx->phy_op->test_name == NULL); - - for (i = 0; true; ++i) { - const char *name; - - EFX_WARN_ON_PARANOID(i >= EFX_MAX_PHY_TESTS); - name = efx->phy_op->test_name(efx, i); - if (name == NULL) - break; - - efx_fill_test(n++, strings, data, &tests->phy_ext[i], - "phy", 0, name, NULL); - } - } - - /* Loopback tests */ - for (mode = LOOPBACK_NONE; mode <= LOOPBACK_TEST_MAX; mode++) { - if (!(efx->loopback_modes & (1 << mode))) - continue; - n = efx_fill_loopback_test(efx, - &tests->loopback[mode], mode, n, - strings, data); - } - - return n; -} - -static size_t efx_describe_per_queue_stats(struct efx_nic *efx, u8 *strings) -{ - size_t n_stats = 0; - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) { - if (efx_channel_has_tx_queues(channel)) { - n_stats++; - if (strings != NULL) { - snprintf(strings, ETH_GSTRING_LEN, - "tx-%u.tx_packets", - channel->tx_queue[0].queue / - EFX_TXQ_TYPES); - - strings += ETH_GSTRING_LEN; - } - } - } - efx_for_each_channel(channel, efx) { - if (efx_channel_has_rx_queue(channel)) { - n_stats++; - if (strings != NULL) { - snprintf(strings, ETH_GSTRING_LEN, - "rx-%d.rx_packets", channel->channel); - strings += ETH_GSTRING_LEN; - } - } - } - if (efx->xdp_tx_queue_count && efx->xdp_tx_queues) { - unsigned short xdp; - - for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { - n_stats++; - if (strings) { - snprintf(strings, ETH_GSTRING_LEN, - "tx-xdp-cpu-%hu.tx_packets", xdp); - strings += ETH_GSTRING_LEN; - } - } - } - - return n_stats; -} - -static int efx_ethtool_get_sset_count(struct net_device *net_dev, - int string_set) -{ - struct efx_nic *efx = netdev_priv(net_dev); - - switch (string_set) { - case ETH_SS_STATS: - return efx->type->describe_stats(efx, NULL) + - EFX_ETHTOOL_SW_STAT_COUNT + - efx_describe_per_queue_stats(efx, NULL) + - efx_ptp_describe_stats(efx, NULL); - case ETH_SS_TEST: - return efx_ethtool_fill_self_tests(efx, NULL, NULL, NULL); - default: - return -EINVAL; - } -} - -static void efx_ethtool_get_strings(struct net_device *net_dev, - u32 string_set, u8 *strings) -{ - struct efx_nic *efx = netdev_priv(net_dev); - int i; - - switch (string_set) { - case ETH_SS_STATS: - strings += (efx->type->describe_stats(efx, strings) * - ETH_GSTRING_LEN); - for (i = 0; i < EFX_ETHTOOL_SW_STAT_COUNT; i++) - strlcpy(strings + i * ETH_GSTRING_LEN, - efx_sw_stat_desc[i].name, ETH_GSTRING_LEN); - strings += EFX_ETHTOOL_SW_STAT_COUNT * ETH_GSTRING_LEN; - strings += (efx_describe_per_queue_stats(efx, strings) * - ETH_GSTRING_LEN); - efx_ptp_describe_stats(efx, strings); - break; - case ETH_SS_TEST: - efx_ethtool_fill_self_tests(efx, NULL, strings, NULL); - break; - default: - /* No other string sets */ - break; - } -} - -static void efx_ethtool_get_stats(struct net_device *net_dev, - struct ethtool_stats *stats, - u64 *data) -{ - struct efx_nic *efx = netdev_priv(net_dev); - const struct efx_sw_stat_desc *stat; - struct efx_channel *channel; - struct efx_tx_queue *tx_queue; - struct efx_rx_queue *rx_queue; - int i; - - spin_lock_bh(&efx->stats_lock); - - /* Get NIC statistics */ - data += efx->type->update_stats(efx, data, NULL); - - /* Get software statistics */ - for (i = 0; i < EFX_ETHTOOL_SW_STAT_COUNT; i++) { - stat = &efx_sw_stat_desc[i]; - switch (stat->source) { - case EFX_ETHTOOL_STAT_SOURCE_nic: - data[i] = stat->get_stat((void *)efx + stat->offset); - break; - case EFX_ETHTOOL_STAT_SOURCE_channel: - data[i] = 0; - efx_for_each_channel(channel, efx) - data[i] += stat->get_stat((void *)channel + - stat->offset); - break; - case EFX_ETHTOOL_STAT_SOURCE_tx_queue: - data[i] = 0; - efx_for_each_channel(channel, efx) { - efx_for_each_channel_tx_queue(tx_queue, channel) - data[i] += - stat->get_stat((void *)tx_queue - + stat->offset); - } - break; - } - } - data += EFX_ETHTOOL_SW_STAT_COUNT; - - spin_unlock_bh(&efx->stats_lock); - - efx_for_each_channel(channel, efx) { - if (efx_channel_has_tx_queues(channel)) { - *data = 0; - efx_for_each_channel_tx_queue(tx_queue, channel) { - *data += tx_queue->tx_packets; - } - data++; - } - } - efx_for_each_channel(channel, efx) { - if (efx_channel_has_rx_queue(channel)) { - *data = 0; - efx_for_each_channel_rx_queue(rx_queue, channel) { - *data += rx_queue->rx_packets; - } - data++; - } - } - if (efx->xdp_tx_queue_count && efx->xdp_tx_queues) { - int xdp; - - for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { - data[0] = efx->xdp_tx_queues[xdp]->tx_packets; - data++; - } - } - - efx_ptp_update_stats(efx, data); -} - static void efx_ethtool_self_test(struct net_device *net_dev, struct ethtool_test *test, u64 *data) { @@ -787,16 +361,6 @@ out: return rc; } -static void efx_ethtool_get_pauseparam(struct net_device *net_dev, - struct ethtool_pauseparam *pause) -{ - struct efx_nic *efx = netdev_priv(net_dev); - - pause->rx_pause = !!(efx->wanted_fc & EFX_FC_RX); - pause->tx_pause = !!(efx->wanted_fc & EFX_FC_TX); - pause->autoneg = !!(efx->wanted_fc & EFX_FC_AUTO); -} - static void efx_ethtool_get_wol(struct net_device *net_dev, struct ethtool_wolinfo *wol) { @@ -1456,7 +1020,7 @@ static int efx_ethtool_set_rxfh_context(struct net_device *net_dev, rc = -ENOMEM; goto out_unlock; } - ctx->context_id = EFX_EF10_RSS_CONTEXT_INVALID; + ctx->context_id = EFX_MCDI_RSS_CONTEXT_INVALID; /* Initialise indir table and key to defaults */ efx_set_default_rx_indir_table(efx, ctx); netdev_rss_key_fill(ctx->rx_hash_key, sizeof(ctx->rx_hash_key)); diff --git a/drivers/net/ethernet/sfc/ethtool_common.c b/drivers/net/ethernet/sfc/ethtool_common.c new file mode 100644 index 000000000000..b8d281ab6c7a --- /dev/null +++ b/drivers/net/ethernet/sfc/ethtool_common.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2019 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ +#include <linux/module.h> +#include <linux/netdevice.h> +#include "net_driver.h" +#include "mcdi.h" +#include "nic.h" +#include "selftest.h" +#include "ethtool_common.h" + +struct efx_sw_stat_desc { + const char *name; + enum { + EFX_ETHTOOL_STAT_SOURCE_nic, + EFX_ETHTOOL_STAT_SOURCE_channel, + EFX_ETHTOOL_STAT_SOURCE_tx_queue + } source; + unsigned int offset; + u64 (*get_stat)(void *field); /* Reader function */ +}; + +/* Initialiser for a struct efx_sw_stat_desc with type-checking */ +#define EFX_ETHTOOL_STAT(stat_name, source_name, field, field_type, \ + get_stat_function) { \ + .name = #stat_name, \ + .source = EFX_ETHTOOL_STAT_SOURCE_##source_name, \ + .offset = ((((field_type *) 0) == \ + &((struct efx_##source_name *)0)->field) ? \ + offsetof(struct efx_##source_name, field) : \ + offsetof(struct efx_##source_name, field)), \ + .get_stat = get_stat_function, \ +} + +static u64 efx_get_uint_stat(void *field) +{ + return *(unsigned int *)field; +} + +static u64 efx_get_atomic_stat(void *field) +{ + return atomic_read((atomic_t *) field); +} + +#define EFX_ETHTOOL_ATOMIC_NIC_ERROR_STAT(field) \ + EFX_ETHTOOL_STAT(field, nic, field, \ + atomic_t, efx_get_atomic_stat) + +#define EFX_ETHTOOL_UINT_CHANNEL_STAT(field) \ + EFX_ETHTOOL_STAT(field, channel, n_##field, \ + unsigned int, efx_get_uint_stat) +#define EFX_ETHTOOL_UINT_CHANNEL_STAT_NO_N(field) \ + EFX_ETHTOOL_STAT(field, channel, field, \ + unsigned int, efx_get_uint_stat) + +#define EFX_ETHTOOL_UINT_TXQ_STAT(field) \ + EFX_ETHTOOL_STAT(tx_##field, tx_queue, field, \ + unsigned int, efx_get_uint_stat) + +static const struct efx_sw_stat_desc efx_sw_stat_desc[] = { + EFX_ETHTOOL_UINT_TXQ_STAT(merge_events), + EFX_ETHTOOL_UINT_TXQ_STAT(tso_bursts), + EFX_ETHTOOL_UINT_TXQ_STAT(tso_long_headers), + EFX_ETHTOOL_UINT_TXQ_STAT(tso_packets), + EFX_ETHTOOL_UINT_TXQ_STAT(tso_fallbacks), + EFX_ETHTOOL_UINT_TXQ_STAT(pushes), + EFX_ETHTOOL_UINT_TXQ_STAT(pio_packets), + EFX_ETHTOOL_UINT_TXQ_STAT(cb_packets), + EFX_ETHTOOL_ATOMIC_NIC_ERROR_STAT(rx_reset), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tobe_disc), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_ip_hdr_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_inner_ip_hdr_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_inner_tcp_udp_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_outer_ip_hdr_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_outer_tcp_udp_chksum_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_eth_crc_err), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_events), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_packets), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_drops), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_bad_drops), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_tx), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_redirect), +#ifdef CONFIG_RFS_ACCEL + EFX_ETHTOOL_UINT_CHANNEL_STAT_NO_N(rfs_filter_count), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rfs_succeeded), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rfs_failed), +#endif +}; + +#define EFX_ETHTOOL_SW_STAT_COUNT ARRAY_SIZE(efx_sw_stat_desc) + +void efx_ethtool_get_drvinfo(struct net_device *net_dev, + struct ethtool_drvinfo *info) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); + strlcpy(info->version, EFX_DRIVER_VERSION, sizeof(info->version)); + efx_mcdi_print_fwver(efx, info->fw_version, + sizeof(info->fw_version)); + strlcpy(info->bus_info, pci_name(efx->pci_dev), sizeof(info->bus_info)); +} + +u32 efx_ethtool_get_msglevel(struct net_device *net_dev) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + return efx->msg_enable; +} + +void efx_ethtool_set_msglevel(struct net_device *net_dev, u32 msg_enable) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + efx->msg_enable = msg_enable; +} + +void efx_ethtool_get_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *pause) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + pause->rx_pause = !!(efx->wanted_fc & EFX_FC_RX); + pause->tx_pause = !!(efx->wanted_fc & EFX_FC_TX); + pause->autoneg = !!(efx->wanted_fc & EFX_FC_AUTO); +} + +/** + * efx_fill_test - fill in an individual self-test entry + * @test_index: Index of the test + * @strings: Ethtool strings, or %NULL + * @data: Ethtool test results, or %NULL + * @test: Pointer to test result (used only if data != %NULL) + * @unit_format: Unit name format (e.g. "chan\%d") + * @unit_id: Unit id (e.g. 0 for "chan0") + * @test_format: Test name format (e.g. "loopback.\%s.tx.sent") + * @test_id: Test id (e.g. "PHYXS" for "loopback.PHYXS.tx_sent") + * + * Fill in an individual self-test entry. + */ +static void efx_fill_test(unsigned int test_index, u8 *strings, u64 *data, + int *test, const char *unit_format, int unit_id, + const char *test_format, const char *test_id) +{ + char unit_str[ETH_GSTRING_LEN], test_str[ETH_GSTRING_LEN]; + + /* Fill data value, if applicable */ + if (data) + data[test_index] = *test; + + /* Fill string, if applicable */ + if (strings) { + if (strchr(unit_format, '%')) + snprintf(unit_str, sizeof(unit_str), + unit_format, unit_id); + else + strcpy(unit_str, unit_format); + snprintf(test_str, sizeof(test_str), test_format, test_id); + snprintf(strings + test_index * ETH_GSTRING_LEN, + ETH_GSTRING_LEN, + "%-6s %-24s", unit_str, test_str); + } +} + +#define EFX_CHANNEL_NAME(_channel) "chan%d", _channel->channel +#define EFX_TX_QUEUE_NAME(_tx_queue) "txq%d", _tx_queue->queue +#define EFX_RX_QUEUE_NAME(_rx_queue) "rxq%d", _rx_queue->queue +#define EFX_LOOPBACK_NAME(_mode, _counter) \ + "loopback.%s." _counter, STRING_TABLE_LOOKUP(_mode, efx_loopback_mode) + +/** + * efx_fill_loopback_test - fill in a block of loopback self-test entries + * @efx: Efx NIC + * @lb_tests: Efx loopback self-test results structure + * @mode: Loopback test mode + * @test_index: Starting index of the test + * @strings: Ethtool strings, or %NULL + * @data: Ethtool test results, or %NULL + * + * Fill in a block of loopback self-test entries. Return new test + * index. + */ +static int efx_fill_loopback_test(struct efx_nic *efx, + struct efx_loopback_self_tests *lb_tests, + enum efx_loopback_mode mode, + unsigned int test_index, + u8 *strings, u64 *data) +{ + struct efx_channel *channel = + efx_get_channel(efx, efx->tx_channel_offset); + struct efx_tx_queue *tx_queue; + + efx_for_each_channel_tx_queue(tx_queue, channel) { + efx_fill_test(test_index++, strings, data, + &lb_tests->tx_sent[tx_queue->queue], + EFX_TX_QUEUE_NAME(tx_queue), + EFX_LOOPBACK_NAME(mode, "tx_sent")); + efx_fill_test(test_index++, strings, data, + &lb_tests->tx_done[tx_queue->queue], + EFX_TX_QUEUE_NAME(tx_queue), + EFX_LOOPBACK_NAME(mode, "tx_done")); + } + efx_fill_test(test_index++, strings, data, + &lb_tests->rx_good, + "rx", 0, + EFX_LOOPBACK_NAME(mode, "rx_good")); + efx_fill_test(test_index++, strings, data, + &lb_tests->rx_bad, + "rx", 0, + EFX_LOOPBACK_NAME(mode, "rx_bad")); + + return test_index; +} + +/** + * efx_ethtool_fill_self_tests - get self-test details + * @efx: Efx NIC + * @tests: Efx self-test results structure, or %NULL + * @strings: Ethtool strings, or %NULL + * @data: Ethtool test results, or %NULL + * + * Get self-test number of strings, strings, and/or test results. + * Return number of strings (== number of test results). + * + * The reason for merging these three functions is to make sure that + * they can never be inconsistent. + */ +int efx_ethtool_fill_self_tests(struct efx_nic *efx, + struct efx_self_tests *tests, + u8 *strings, u64 *data) +{ + struct efx_channel *channel; + unsigned int n = 0, i; + enum efx_loopback_mode mode; + + efx_fill_test(n++, strings, data, &tests->phy_alive, + "phy", 0, "alive", NULL); + efx_fill_test(n++, strings, data, &tests->nvram, + "core", 0, "nvram", NULL); + efx_fill_test(n++, strings, data, &tests->interrupt, + "core", 0, "interrupt", NULL); + + /* Event queues */ + efx_for_each_channel(channel, efx) { + efx_fill_test(n++, strings, data, + &tests->eventq_dma[channel->channel], + EFX_CHANNEL_NAME(channel), + "eventq.dma", NULL); + efx_fill_test(n++, strings, data, + &tests->eventq_int[channel->channel], + EFX_CHANNEL_NAME(channel), + "eventq.int", NULL); + } + + efx_fill_test(n++, strings, data, &tests->memory, + "core", 0, "memory", NULL); + efx_fill_test(n++, strings, data, &tests->registers, + "core", 0, "registers", NULL); + + if (efx->phy_op->run_tests != NULL) { + EFX_WARN_ON_PARANOID(efx->phy_op->test_name == NULL); + + for (i = 0; true; ++i) { + const char *name; + + EFX_WARN_ON_PARANOID(i >= EFX_MAX_PHY_TESTS); + name = efx->phy_op->test_name(efx, i); + if (name == NULL) + break; + + efx_fill_test(n++, strings, data, &tests->phy_ext[i], + "phy", 0, name, NULL); + } + } + + /* Loopback tests */ + for (mode = LOOPBACK_NONE; mode <= LOOPBACK_TEST_MAX; mode++) { + if (!(efx->loopback_modes & (1 << mode))) + continue; + n = efx_fill_loopback_test(efx, + &tests->loopback[mode], mode, n, + strings, data); + } + + return n; +} + +static size_t efx_describe_per_queue_stats(struct efx_nic *efx, u8 *strings) +{ + size_t n_stats = 0; + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) { + if (efx_channel_has_tx_queues(channel)) { + n_stats++; + if (strings != NULL) { + snprintf(strings, ETH_GSTRING_LEN, + "tx-%u.tx_packets", + channel->tx_queue[0].queue / + EFX_TXQ_TYPES); + + strings += ETH_GSTRING_LEN; + } + } + } + efx_for_each_channel(channel, efx) { + if (efx_channel_has_rx_queue(channel)) { + n_stats++; + if (strings != NULL) { + snprintf(strings, ETH_GSTRING_LEN, + "rx-%d.rx_packets", channel->channel); + strings += ETH_GSTRING_LEN; + } + } + } + if (efx->xdp_tx_queue_count && efx->xdp_tx_queues) { + unsigned short xdp; + + for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { + n_stats++; + if (strings) { + snprintf(strings, ETH_GSTRING_LEN, + "tx-xdp-cpu-%hu.tx_packets", xdp); + strings += ETH_GSTRING_LEN; + } + } + } + + return n_stats; +} + +int efx_ethtool_get_sset_count(struct net_device *net_dev, int string_set) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + switch (string_set) { + case ETH_SS_STATS: + return efx->type->describe_stats(efx, NULL) + + EFX_ETHTOOL_SW_STAT_COUNT + + efx_describe_per_queue_stats(efx, NULL) + + efx_ptp_describe_stats(efx, NULL); + case ETH_SS_TEST: + return efx_ethtool_fill_self_tests(efx, NULL, NULL, NULL); + default: + return -EINVAL; + } +} + +void efx_ethtool_get_strings(struct net_device *net_dev, + u32 string_set, u8 *strings) +{ + struct efx_nic *efx = netdev_priv(net_dev); + int i; + + switch (string_set) { + case ETH_SS_STATS: + strings += (efx->type->describe_stats(efx, strings) * + ETH_GSTRING_LEN); + for (i = 0; i < EFX_ETHTOOL_SW_STAT_COUNT; i++) + strlcpy(strings + i * ETH_GSTRING_LEN, + efx_sw_stat_desc[i].name, ETH_GSTRING_LEN); + strings += EFX_ETHTOOL_SW_STAT_COUNT * ETH_GSTRING_LEN; + strings += (efx_describe_per_queue_stats(efx, strings) * + ETH_GSTRING_LEN); + efx_ptp_describe_stats(efx, strings); + break; + case ETH_SS_TEST: + efx_ethtool_fill_self_tests(efx, NULL, strings, NULL); + break; + default: + /* No other string sets */ + break; + } +} + +void efx_ethtool_get_stats(struct net_device *net_dev, + struct ethtool_stats *stats, + u64 *data) +{ + struct efx_nic *efx = netdev_priv(net_dev); + const struct efx_sw_stat_desc *stat; + struct efx_channel *channel; + struct efx_tx_queue *tx_queue; + struct efx_rx_queue *rx_queue; + int i; + + spin_lock_bh(&efx->stats_lock); + + /* Get NIC statistics */ + data += efx->type->update_stats(efx, data, NULL); + + /* Get software statistics */ + for (i = 0; i < EFX_ETHTOOL_SW_STAT_COUNT; i++) { + stat = &efx_sw_stat_desc[i]; + switch (stat->source) { + case EFX_ETHTOOL_STAT_SOURCE_nic: + data[i] = stat->get_stat((void *)efx + stat->offset); + break; + case EFX_ETHTOOL_STAT_SOURCE_channel: + data[i] = 0; + efx_for_each_channel(channel, efx) + data[i] += stat->get_stat((void *)channel + + stat->offset); + break; + case EFX_ETHTOOL_STAT_SOURCE_tx_queue: + data[i] = 0; + efx_for_each_channel(channel, efx) { + efx_for_each_channel_tx_queue(tx_queue, channel) + data[i] += + stat->get_stat((void *)tx_queue + + stat->offset); + } + break; + } + } + data += EFX_ETHTOOL_SW_STAT_COUNT; + + spin_unlock_bh(&efx->stats_lock); + + efx_for_each_channel(channel, efx) { + if (efx_channel_has_tx_queues(channel)) { + *data = 0; + efx_for_each_channel_tx_queue(tx_queue, channel) { + *data += tx_queue->tx_packets; + } + data++; + } + } + efx_for_each_channel(channel, efx) { + if (efx_channel_has_rx_queue(channel)) { + *data = 0; + efx_for_each_channel_rx_queue(rx_queue, channel) { + *data += rx_queue->rx_packets; + } + data++; + } + } + if (efx->xdp_tx_queue_count && efx->xdp_tx_queues) { + int xdp; + + for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { + data[0] = efx->xdp_tx_queues[xdp]->tx_packets; + data++; + } + } + + efx_ptp_update_stats(efx, data); +} diff --git a/drivers/net/ethernet/sfc/ethtool_common.h b/drivers/net/ethernet/sfc/ethtool_common.h new file mode 100644 index 000000000000..fa624313f330 --- /dev/null +++ b/drivers/net/ethernet/sfc/ethtool_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2019 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_ETHTOOL_COMMON_H +#define EFX_ETHTOOL_COMMON_H + +void efx_ethtool_get_drvinfo(struct net_device *net_dev, + struct ethtool_drvinfo *info); +u32 efx_ethtool_get_msglevel(struct net_device *net_dev); +void efx_ethtool_set_msglevel(struct net_device *net_dev, u32 msg_enable); +void efx_ethtool_get_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *pause); +int efx_ethtool_fill_self_tests(struct efx_nic *efx, + struct efx_self_tests *tests, + u8 *strings, u64 *data); +int efx_ethtool_get_sset_count(struct net_device *net_dev, int string_set); +void efx_ethtool_get_strings(struct net_device *net_dev, u32 string_set, + u8 *strings); +void efx_ethtool_get_stats(struct net_device *net_dev, + struct ethtool_stats *stats __attribute__ ((unused)), + u64 *data); + +#endif diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index eedd32e2bfcb..dbbb898adddb 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -15,6 +15,7 @@ #include "net_driver.h" #include "bitfield.h" #include "efx.h" +#include "rx_common.h" #include "nic.h" #include "farch_regs.h" #include "sriov.h" diff --git a/drivers/net/ethernet/sfc/mcdi.h b/drivers/net/ethernet/sfc/mcdi.h index 9081f84a2604..54a45010b576 100644 --- a/drivers/net/ethernet/sfc/mcdi.h +++ b/drivers/net/ethernet/sfc/mcdi.h @@ -346,11 +346,8 @@ int efx_mcdi_flush_rxqs(struct efx_nic *efx); int efx_mcdi_port_probe(struct efx_nic *efx); void efx_mcdi_port_remove(struct efx_nic *efx); int efx_mcdi_port_reconfigure(struct efx_nic *efx); -int efx_mcdi_port_get_number(struct efx_nic *efx); u32 efx_mcdi_phy_get_caps(struct efx_nic *efx); void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev); -int efx_mcdi_set_mac(struct efx_nic *efx); -#define EFX_MC_STATS_GENERATION_INVALID ((__force __le64)(-1)) void efx_mcdi_mac_start_stats(struct efx_nic *efx); void efx_mcdi_mac_stop_stats(struct efx_nic *efx); void efx_mcdi_mac_pull_stats(struct efx_nic *efx); diff --git a/drivers/net/ethernet/sfc/mcdi_functions.c b/drivers/net/ethernet/sfc/mcdi_functions.c new file mode 100644 index 000000000000..dcfe78b0fa5a --- /dev/null +++ b/drivers/net/ethernet/sfc/mcdi_functions.c @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2019 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "net_driver.h" +#include "efx.h" +#include "nic.h" +#include "mcdi_functions.h" +#include "mcdi.h" +#include "mcdi_pcol.h" + +int efx_mcdi_free_vis(struct efx_nic *efx) +{ + MCDI_DECLARE_BUF_ERR(outbuf); + size_t outlen; + int rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FREE_VIS, NULL, 0, + outbuf, sizeof(outbuf), &outlen); + + /* -EALREADY means nothing to free, so ignore */ + if (rc == -EALREADY) + rc = 0; + if (rc) + efx_mcdi_display_error(efx, MC_CMD_FREE_VIS, 0, outbuf, outlen, + rc); + return rc; +} + +int efx_mcdi_alloc_vis(struct efx_nic *efx, unsigned int min_vis, + unsigned int max_vis, unsigned int *vi_base, + unsigned int *allocated_vis) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_ALLOC_VIS_OUT_LEN); + MCDI_DECLARE_BUF(inbuf, MC_CMD_ALLOC_VIS_IN_LEN); + size_t outlen; + int rc; + + MCDI_SET_DWORD(inbuf, ALLOC_VIS_IN_MIN_VI_COUNT, min_vis); + MCDI_SET_DWORD(inbuf, ALLOC_VIS_IN_MAX_VI_COUNT, max_vis); + rc = efx_mcdi_rpc(efx, MC_CMD_ALLOC_VIS, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (rc != 0) + return rc; + + if (outlen < MC_CMD_ALLOC_VIS_OUT_LEN) + return -EIO; + + netif_dbg(efx, drv, efx->net_dev, "base VI is A0x%03x\n", + MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_BASE)); + + if (vi_base) + *vi_base = MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_BASE); + if (allocated_vis) + *allocated_vis = MCDI_DWORD(outbuf, ALLOC_VIS_OUT_VI_COUNT); + return 0; +} + +int efx_mcdi_ev_probe(struct efx_channel *channel) +{ + return efx_nic_alloc_buffer(channel->efx, &channel->eventq.buf, + (channel->eventq_mask + 1) * + sizeof(efx_qword_t), + GFP_KERNEL); +} + +int efx_mcdi_ev_init(struct efx_channel *channel, bool v1_cut_thru, bool v2) +{ + MCDI_DECLARE_BUF(inbuf, + MC_CMD_INIT_EVQ_V2_IN_LEN(EFX_MAX_EVQ_SIZE * 8 / + EFX_BUF_SIZE)); + MCDI_DECLARE_BUF(outbuf, MC_CMD_INIT_EVQ_V2_OUT_LEN); + size_t entries = channel->eventq.buf.len / EFX_BUF_SIZE; + struct efx_nic *efx = channel->efx; + size_t inlen, outlen; + dma_addr_t dma_addr; + int rc, i; + + /* Fill event queue with all ones (i.e. empty events) */ + memset(channel->eventq.buf.addr, 0xff, channel->eventq.buf.len); + + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_SIZE, channel->eventq_mask + 1); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_INSTANCE, channel->channel); + /* INIT_EVQ expects index in vector table, not absolute */ + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_IRQ_NUM, channel->channel); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_MODE, + MC_CMD_INIT_EVQ_IN_TMR_MODE_DIS); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_LOAD, 0); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_TMR_RELOAD, 0); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_COUNT_MODE, + MC_CMD_INIT_EVQ_IN_COUNT_MODE_DIS); + MCDI_SET_DWORD(inbuf, INIT_EVQ_IN_COUNT_THRSHLD, 0); + + if (v2) { + /* Use the new generic approach to specifying event queue + * configuration, requesting lower latency or higher throughput. + * The options that actually get used appear in the output. + */ + MCDI_POPULATE_DWORD_2(inbuf, INIT_EVQ_V2_IN_FLAGS, + INIT_EVQ_V2_IN_FLAG_INTERRUPTING, 1, + INIT_EVQ_V2_IN_FLAG_TYPE, + MC_CMD_INIT_EVQ_V2_IN_FLAG_TYPE_AUTO); + } else { + MCDI_POPULATE_DWORD_4(inbuf, INIT_EVQ_IN_FLAGS, + INIT_EVQ_IN_FLAG_INTERRUPTING, 1, + INIT_EVQ_IN_FLAG_RX_MERGE, 1, + INIT_EVQ_IN_FLAG_TX_MERGE, 1, + INIT_EVQ_IN_FLAG_CUT_THRU, v1_cut_thru); + } + + dma_addr = channel->eventq.buf.dma_addr; + for (i = 0; i < entries; ++i) { + MCDI_SET_ARRAY_QWORD(inbuf, INIT_EVQ_IN_DMA_ADDR, i, dma_addr); + dma_addr += EFX_BUF_SIZE; + } + + inlen = MC_CMD_INIT_EVQ_IN_LEN(entries); + + rc = efx_mcdi_rpc(efx, MC_CMD_INIT_EVQ, inbuf, inlen, + outbuf, sizeof(outbuf), &outlen); + + if (outlen >= MC_CMD_INIT_EVQ_V2_OUT_LEN) + netif_dbg(efx, drv, efx->net_dev, + "Channel %d using event queue flags %08x\n", + channel->channel, + MCDI_DWORD(outbuf, INIT_EVQ_V2_OUT_FLAGS)); + + return rc; +} + +void efx_mcdi_ev_remove(struct efx_channel *channel) +{ + efx_nic_free_buffer(channel->efx, &channel->eventq.buf); +} + +void efx_mcdi_ev_fini(struct efx_channel *channel) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_EVQ_IN_LEN); + MCDI_DECLARE_BUF_ERR(outbuf); + struct efx_nic *efx = channel->efx; + size_t outlen; + int rc; + + MCDI_SET_DWORD(inbuf, FINI_EVQ_IN_INSTANCE, channel->channel); + + rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_EVQ, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + + if (rc && rc != -EALREADY) + goto fail; + + return; + +fail: + efx_mcdi_display_error(efx, MC_CMD_FINI_EVQ, MC_CMD_FINI_EVQ_IN_LEN, + outbuf, outlen, rc); +} + +int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue, bool tso_v2) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_INIT_TXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / + EFX_BUF_SIZE)); + bool csum_offload = tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD; + size_t entries = tx_queue->txd.buf.len / EFX_BUF_SIZE; + struct efx_channel *channel = tx_queue->channel; + struct efx_nic *efx = tx_queue->efx; + struct efx_ef10_nic_data *nic_data; + dma_addr_t dma_addr; + size_t inlen; + int rc, i; + + BUILD_BUG_ON(MC_CMD_INIT_TXQ_OUT_LEN != 0); + + nic_data = efx->nic_data; + + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_SIZE, tx_queue->ptr_mask + 1); + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_TARGET_EVQ, channel->channel); + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_LABEL, tx_queue->queue); + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_INSTANCE, tx_queue->queue); + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_OWNER_ID, 0); + MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_PORT_ID, nic_data->vport_id); + + dma_addr = tx_queue->txd.buf.dma_addr; + + netif_dbg(efx, hw, efx->net_dev, "pushing TXQ %d. %zu entries (%llx)\n", + tx_queue->queue, entries, (u64)dma_addr); + + for (i = 0; i < entries; ++i) { + MCDI_SET_ARRAY_QWORD(inbuf, INIT_TXQ_IN_DMA_ADDR, i, dma_addr); + dma_addr += EFX_BUF_SIZE; + } + + inlen = MC_CMD_INIT_TXQ_IN_LEN(entries); + + do { + MCDI_POPULATE_DWORD_4(inbuf, INIT_TXQ_IN_FLAGS, + /* This flag was removed from mcdi_pcol.h for + * the non-_EXT version of INIT_TXQ. However, + * firmware still honours it. + */ + INIT_TXQ_EXT_IN_FLAG_TSOV2_EN, tso_v2, + INIT_TXQ_IN_FLAG_IP_CSUM_DIS, !csum_offload, + INIT_TXQ_IN_FLAG_TCP_CSUM_DIS, !csum_offload, + INIT_TXQ_EXT_IN_FLAG_TIMESTAMP, + tx_queue->timestamping); + + rc = efx_mcdi_rpc_quiet(efx, MC_CMD_INIT_TXQ, inbuf, inlen, + NULL, 0, NULL); + if (rc == -ENOSPC && tso_v2) { + /* Retry without TSOv2 if we're short on contexts. */ + tso_v2 = false; + netif_warn(efx, probe, efx->net_dev, + "TSOv2 context not available to segment in " + "hardware. TCP performance may be reduced.\n" + ); + } else if (rc) { + efx_mcdi_display_error(efx, MC_CMD_INIT_TXQ, + MC_CMD_INIT_TXQ_EXT_IN_LEN, + NULL, 0, rc); + goto fail; + } + } while (rc); + + return 0; + +fail: + return rc; +} + +void efx_mcdi_tx_remove(struct efx_tx_queue *tx_queue) +{ + efx_nic_free_buffer(tx_queue->efx, &tx_queue->txd.buf); +} + +void efx_mcdi_tx_fini(struct efx_tx_queue *tx_queue) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_TXQ_IN_LEN); + MCDI_DECLARE_BUF_ERR(outbuf); + struct efx_nic *efx = tx_queue->efx; + size_t outlen; + int rc; + + MCDI_SET_DWORD(inbuf, FINI_TXQ_IN_INSTANCE, + tx_queue->queue); + + rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_TXQ, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + + if (rc && rc != -EALREADY) + goto fail; + + return; + +fail: + efx_mcdi_display_error(efx, MC_CMD_FINI_TXQ, MC_CMD_FINI_TXQ_IN_LEN, + outbuf, outlen, rc); +} + +int efx_mcdi_rx_probe(struct efx_rx_queue *rx_queue) +{ + return efx_nic_alloc_buffer(rx_queue->efx, &rx_queue->rxd.buf, + (rx_queue->ptr_mask + 1) * + sizeof(efx_qword_t), + GFP_KERNEL); +} + +void efx_mcdi_rx_init(struct efx_rx_queue *rx_queue) +{ + MCDI_DECLARE_BUF(inbuf, + MC_CMD_INIT_RXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / + EFX_BUF_SIZE)); + struct efx_channel *channel = efx_rx_queue_channel(rx_queue); + size_t entries = rx_queue->rxd.buf.len / EFX_BUF_SIZE; + struct efx_nic *efx = rx_queue->efx; + struct efx_ef10_nic_data *nic_data = efx->nic_data; + dma_addr_t dma_addr; + size_t inlen; + int rc; + int i; + BUILD_BUG_ON(MC_CMD_INIT_RXQ_OUT_LEN != 0); + + rx_queue->scatter_n = 0; + rx_queue->scatter_len = 0; + + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_SIZE, rx_queue->ptr_mask + 1); + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_TARGET_EVQ, channel->channel); + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_LABEL, efx_rx_queue_index(rx_queue)); + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_INSTANCE, + efx_rx_queue_index(rx_queue)); + MCDI_POPULATE_DWORD_2(inbuf, INIT_RXQ_IN_FLAGS, + INIT_RXQ_IN_FLAG_PREFIX, 1, + INIT_RXQ_IN_FLAG_TIMESTAMP, 1); + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_OWNER_ID, 0); + MCDI_SET_DWORD(inbuf, INIT_RXQ_IN_PORT_ID, nic_data->vport_id); + + dma_addr = rx_queue->rxd.buf.dma_addr; + + netif_dbg(efx, hw, efx->net_dev, "pushing RXQ %d. %zu entries (%llx)\n", + efx_rx_queue_index(rx_queue), entries, (u64)dma_addr); + + for (i = 0; i < entries; ++i) { + MCDI_SET_ARRAY_QWORD(inbuf, INIT_RXQ_IN_DMA_ADDR, i, dma_addr); + dma_addr += EFX_BUF_SIZE; + } + + inlen = MC_CMD_INIT_RXQ_IN_LEN(entries); + + rc = efx_mcdi_rpc(efx, MC_CMD_INIT_RXQ, inbuf, inlen, + NULL, 0, NULL); + if (rc) + netdev_WARN(efx->net_dev, "failed to initialise RXQ %d\n", + efx_rx_queue_index(rx_queue)); +} + +void efx_mcdi_rx_remove(struct efx_rx_queue *rx_queue) +{ + efx_nic_free_buffer(rx_queue->efx, &rx_queue->rxd.buf); +} + +void efx_mcdi_rx_fini(struct efx_rx_queue *rx_queue) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_RXQ_IN_LEN); + MCDI_DECLARE_BUF_ERR(outbuf); + struct efx_nic *efx = rx_queue->efx; + size_t outlen; + int rc; + + MCDI_SET_DWORD(inbuf, FINI_RXQ_IN_INSTANCE, + efx_rx_queue_index(rx_queue)); + + rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FINI_RXQ, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + + if (rc && rc != -EALREADY) + goto fail; + + return; + +fail: + efx_mcdi_display_error(efx, MC_CMD_FINI_RXQ, MC_CMD_FINI_RXQ_IN_LEN, + outbuf, outlen, rc); +} + +int efx_mcdi_window_mode_to_stride(struct efx_nic *efx, u8 vi_window_mode) +{ + switch (vi_window_mode) { + case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_8K: + efx->vi_stride = 8192; + break; + case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_16K: + efx->vi_stride = 16384; + break; + case MC_CMD_GET_CAPABILITIES_V3_OUT_VI_WINDOW_MODE_64K: + efx->vi_stride = 65536; + break; + default: + netif_err(efx, probe, efx->net_dev, + "Unrecognised VI window mode %d\n", + vi_window_mode); + return -EIO; + } + netif_dbg(efx, probe, efx->net_dev, "vi_stride = %u\n", + efx->vi_stride); + return 0; +} + +int efx_get_pf_index(struct efx_nic *efx, unsigned int *pf_index) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_FUNCTION_INFO_OUT_LEN); + size_t outlen; + int rc; + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_FUNCTION_INFO, NULL, 0, outbuf, + sizeof(outbuf), &outlen); + if (rc) + return rc; + if (outlen < sizeof(outbuf)) + return -EIO; + + *pf_index = MCDI_DWORD(outbuf, GET_FUNCTION_INFO_OUT_PF); + return 0; +} diff --git a/drivers/net/ethernet/sfc/mcdi_functions.h b/drivers/net/ethernet/sfc/mcdi_functions.h new file mode 100644 index 000000000000..ca4a5ac1a66b --- /dev/null +++ b/drivers/net/ethernet/sfc/mcdi_functions.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ +#ifndef EFX_MCDI_FUNCTIONS_H +#define EFX_MCDI_FUNCTIONS_H + +int efx_mcdi_alloc_vis(struct efx_nic *efx, unsigned int min_vis, + unsigned int max_vis, unsigned int *vi_base, + unsigned int *allocated_vis); +int efx_mcdi_free_vis(struct efx_nic *efx); + +int efx_mcdi_ev_probe(struct efx_channel *channel); +int efx_mcdi_ev_init(struct efx_channel *channel, bool v1_cut_thru, bool v2); +void efx_mcdi_ev_remove(struct efx_channel *channel); +void efx_mcdi_ev_fini(struct efx_channel *channel); +int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue, bool tso_v2); +void efx_mcdi_tx_remove(struct efx_tx_queue *tx_queue); +void efx_mcdi_tx_fini(struct efx_tx_queue *tx_queue); +int efx_mcdi_rx_probe(struct efx_rx_queue *rx_queue); +void efx_mcdi_rx_init(struct efx_rx_queue *rx_queue); +void efx_mcdi_rx_remove(struct efx_rx_queue *rx_queue); +void efx_mcdi_rx_fini(struct efx_rx_queue *rx_queue); +int efx_mcdi_window_mode_to_stride(struct efx_nic *efx, u8 vi_window_mode); +int efx_get_pf_index(struct efx_nic *efx, unsigned int *pf_index); + +#endif diff --git a/drivers/net/ethernet/sfc/mcdi_port.c b/drivers/net/ethernet/sfc/mcdi_port.c index fb7cde4980ed..ab5227b13ae6 100644 --- a/drivers/net/ethernet/sfc/mcdi_port.c +++ b/drivers/net/ethernet/sfc/mcdi_port.c @@ -14,106 +14,7 @@ #include "mcdi_pcol.h" #include "nic.h" #include "selftest.h" - -struct efx_mcdi_phy_data { - u32 flags; - u32 type; - u32 supported_cap; - u32 channel; - u32 port; - u32 stats_mask; - u8 name[20]; - u32 media; - u32 mmd_mask; - u8 revision[20]; - u32 forced_cap; -}; - -static int -efx_mcdi_get_phy_cfg(struct efx_nic *efx, struct efx_mcdi_phy_data *cfg) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_CFG_OUT_LEN); - size_t outlen; - int rc; - - BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_IN_LEN != 0); - BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_OUT_NAME_LEN != sizeof(cfg->name)); - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_CFG, NULL, 0, - outbuf, sizeof(outbuf), &outlen); - if (rc) - goto fail; - - if (outlen < MC_CMD_GET_PHY_CFG_OUT_LEN) { - rc = -EIO; - goto fail; - } - - cfg->flags = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_FLAGS); - cfg->type = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_TYPE); - cfg->supported_cap = - MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_SUPPORTED_CAP); - cfg->channel = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_CHANNEL); - cfg->port = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_PRT); - cfg->stats_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_STATS_MASK); - memcpy(cfg->name, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_NAME), - sizeof(cfg->name)); - cfg->media = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MEDIA_TYPE); - cfg->mmd_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MMD_MASK); - memcpy(cfg->revision, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_REVISION), - sizeof(cfg->revision)); - - return 0; - -fail: - netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc); - return rc; -} - -static int efx_mcdi_set_link(struct efx_nic *efx, u32 capabilities, - u32 flags, u32 loopback_mode, - u32 loopback_speed) -{ - MCDI_DECLARE_BUF(inbuf, MC_CMD_SET_LINK_IN_LEN); - int rc; - - BUILD_BUG_ON(MC_CMD_SET_LINK_OUT_LEN != 0); - - MCDI_SET_DWORD(inbuf, SET_LINK_IN_CAP, capabilities); - MCDI_SET_DWORD(inbuf, SET_LINK_IN_FLAGS, flags); - MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_MODE, loopback_mode); - MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_SPEED, loopback_speed); - - rc = efx_mcdi_rpc(efx, MC_CMD_SET_LINK, inbuf, sizeof(inbuf), - NULL, 0, NULL); - return rc; -} - -static int efx_mcdi_loopback_modes(struct efx_nic *efx, u64 *loopback_modes) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LOOPBACK_MODES_OUT_LEN); - size_t outlen; - int rc; - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_LOOPBACK_MODES, NULL, 0, - outbuf, sizeof(outbuf), &outlen); - if (rc) - goto fail; - - if (outlen < (MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_OFST + - MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_LEN)) { - rc = -EIO; - goto fail; - } - - *loopback_modes = MCDI_QWORD(outbuf, GET_LOOPBACK_MODES_OUT_SUGGESTED); - - return 0; - -fail: - netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc); - return rc; -} +#include "mcdi_port_common.h" static int efx_mcdi_mdio_read(struct net_device *net_dev, int prtad, int devad, u16 addr) @@ -168,246 +69,6 @@ static int efx_mcdi_mdio_write(struct net_device *net_dev, return 0; } -static void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset) -{ - #define SET_BIT(name) __set_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \ - linkset) - - bitmap_zero(linkset, __ETHTOOL_LINK_MODE_MASK_NBITS); - switch (media) { - case MC_CMD_MEDIA_KX4: - SET_BIT(Backplane); - if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) - SET_BIT(1000baseKX_Full); - if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) - SET_BIT(10000baseKX4_Full); - if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) - SET_BIT(40000baseKR4_Full); - break; - - case MC_CMD_MEDIA_XFP: - case MC_CMD_MEDIA_SFP_PLUS: - case MC_CMD_MEDIA_QSFP_PLUS: - SET_BIT(FIBRE); - if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) - SET_BIT(1000baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) - SET_BIT(10000baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) - SET_BIT(40000baseCR4_Full); - if (cap & (1 << MC_CMD_PHY_CAP_100000FDX_LBN)) - SET_BIT(100000baseCR4_Full); - if (cap & (1 << MC_CMD_PHY_CAP_25000FDX_LBN)) - SET_BIT(25000baseCR_Full); - if (cap & (1 << MC_CMD_PHY_CAP_50000FDX_LBN)) - SET_BIT(50000baseCR2_Full); - break; - - case MC_CMD_MEDIA_BASE_T: - SET_BIT(TP); - if (cap & (1 << MC_CMD_PHY_CAP_10HDX_LBN)) - SET_BIT(10baseT_Half); - if (cap & (1 << MC_CMD_PHY_CAP_10FDX_LBN)) - SET_BIT(10baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_100HDX_LBN)) - SET_BIT(100baseT_Half); - if (cap & (1 << MC_CMD_PHY_CAP_100FDX_LBN)) - SET_BIT(100baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_1000HDX_LBN)) - SET_BIT(1000baseT_Half); - if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) - SET_BIT(1000baseT_Full); - if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) - SET_BIT(10000baseT_Full); - break; - } - - if (cap & (1 << MC_CMD_PHY_CAP_PAUSE_LBN)) - SET_BIT(Pause); - if (cap & (1 << MC_CMD_PHY_CAP_ASYM_LBN)) - SET_BIT(Asym_Pause); - if (cap & (1 << MC_CMD_PHY_CAP_AN_LBN)) - SET_BIT(Autoneg); - - #undef SET_BIT -} - -static u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset) -{ - u32 result = 0; - - #define TEST_BIT(name) test_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \ - linkset) - - if (TEST_BIT(10baseT_Half)) - result |= (1 << MC_CMD_PHY_CAP_10HDX_LBN); - if (TEST_BIT(10baseT_Full)) - result |= (1 << MC_CMD_PHY_CAP_10FDX_LBN); - if (TEST_BIT(100baseT_Half)) - result |= (1 << MC_CMD_PHY_CAP_100HDX_LBN); - if (TEST_BIT(100baseT_Full)) - result |= (1 << MC_CMD_PHY_CAP_100FDX_LBN); - if (TEST_BIT(1000baseT_Half)) - result |= (1 << MC_CMD_PHY_CAP_1000HDX_LBN); - if (TEST_BIT(1000baseT_Full) || TEST_BIT(1000baseKX_Full)) - result |= (1 << MC_CMD_PHY_CAP_1000FDX_LBN); - if (TEST_BIT(10000baseT_Full) || TEST_BIT(10000baseKX4_Full)) - result |= (1 << MC_CMD_PHY_CAP_10000FDX_LBN); - if (TEST_BIT(40000baseCR4_Full) || TEST_BIT(40000baseKR4_Full)) - result |= (1 << MC_CMD_PHY_CAP_40000FDX_LBN); - if (TEST_BIT(100000baseCR4_Full)) - result |= (1 << MC_CMD_PHY_CAP_100000FDX_LBN); - if (TEST_BIT(25000baseCR_Full)) - result |= (1 << MC_CMD_PHY_CAP_25000FDX_LBN); - if (TEST_BIT(50000baseCR2_Full)) - result |= (1 << MC_CMD_PHY_CAP_50000FDX_LBN); - if (TEST_BIT(Pause)) - result |= (1 << MC_CMD_PHY_CAP_PAUSE_LBN); - if (TEST_BIT(Asym_Pause)) - result |= (1 << MC_CMD_PHY_CAP_ASYM_LBN); - if (TEST_BIT(Autoneg)) - result |= (1 << MC_CMD_PHY_CAP_AN_LBN); - - #undef TEST_BIT - - return result; -} - -static u32 efx_get_mcdi_phy_flags(struct efx_nic *efx) -{ - struct efx_mcdi_phy_data *phy_cfg = efx->phy_data; - enum efx_phy_mode mode, supported; - u32 flags; - - /* TODO: Advertise the capabilities supported by this PHY */ - supported = 0; - if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_TXDIS_LBN)) - supported |= PHY_MODE_TX_DISABLED; - if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_LOWPOWER_LBN)) - supported |= PHY_MODE_LOW_POWER; - if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_POWEROFF_LBN)) - supported |= PHY_MODE_OFF; - - mode = efx->phy_mode & supported; - - flags = 0; - if (mode & PHY_MODE_TX_DISABLED) - flags |= (1 << MC_CMD_SET_LINK_IN_TXDIS_LBN); - if (mode & PHY_MODE_LOW_POWER) - flags |= (1 << MC_CMD_SET_LINK_IN_LOWPOWER_LBN); - if (mode & PHY_MODE_OFF) - flags |= (1 << MC_CMD_SET_LINK_IN_POWEROFF_LBN); - - return flags; -} - -static u8 mcdi_to_ethtool_media(u32 media) -{ - switch (media) { - case MC_CMD_MEDIA_XAUI: - case MC_CMD_MEDIA_CX4: - case MC_CMD_MEDIA_KX4: - return PORT_OTHER; - - case MC_CMD_MEDIA_XFP: - case MC_CMD_MEDIA_SFP_PLUS: - case MC_CMD_MEDIA_QSFP_PLUS: - return PORT_FIBRE; - - case MC_CMD_MEDIA_BASE_T: - return PORT_TP; - - default: - return PORT_OTHER; - } -} - -static void efx_mcdi_phy_decode_link(struct efx_nic *efx, - struct efx_link_state *link_state, - u32 speed, u32 flags, u32 fcntl) -{ - switch (fcntl) { - case MC_CMD_FCNTL_AUTO: - WARN_ON(1); /* This is not a link mode */ - link_state->fc = EFX_FC_AUTO | EFX_FC_TX | EFX_FC_RX; - break; - case MC_CMD_FCNTL_BIDIR: - link_state->fc = EFX_FC_TX | EFX_FC_RX; - break; - case MC_CMD_FCNTL_RESPOND: - link_state->fc = EFX_FC_RX; - break; - default: - WARN_ON(1); - /* Fall through */ - case MC_CMD_FCNTL_OFF: - link_state->fc = 0; - break; - } - - link_state->up = !!(flags & (1 << MC_CMD_GET_LINK_OUT_LINK_UP_LBN)); - link_state->fd = !!(flags & (1 << MC_CMD_GET_LINK_OUT_FULL_DUPLEX_LBN)); - link_state->speed = speed; -} - -/* The semantics of the ethtool FEC mode bitmask are not well defined, - * particularly the meaning of combinations of bits. Which means we get to - * define our own semantics, as follows: - * OFF overrides any other bits, and means "disable all FEC" (with the - * exception of 25G KR4/CR4, where it is not possible to reject it if AN - * partner requests it). - * AUTO on its own means use cable requirements and link partner autoneg with - * fw-default preferences for the cable type. - * AUTO and either RS or BASER means use the specified FEC type if cable and - * link partner support it, otherwise autoneg/fw-default. - * RS or BASER alone means use the specified FEC type if cable and link partner - * support it and either requests it, otherwise no FEC. - * Both RS and BASER (whether AUTO or not) means use FEC if cable and link - * partner support it, preferring RS to BASER. - */ -static u32 ethtool_fec_caps_to_mcdi(u32 ethtool_cap) -{ - u32 ret = 0; - - if (ethtool_cap & ETHTOOL_FEC_OFF) - return 0; - - if (ethtool_cap & ETHTOOL_FEC_AUTO) - ret |= (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) | - (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) | - (1 << MC_CMD_PHY_CAP_RS_FEC_LBN); - if (ethtool_cap & ETHTOOL_FEC_RS) - ret |= (1 << MC_CMD_PHY_CAP_RS_FEC_LBN) | - (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN); - if (ethtool_cap & ETHTOOL_FEC_BASER) - ret |= (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) | - (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) | - (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN) | - (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN); - return ret; -} - -/* Invert ethtool_fec_caps_to_mcdi. There are two combinations that function - * can never produce, (baser xor rs) and neither req; the implementation below - * maps both of those to AUTO. This should never matter, and it's not clear - * what a better mapping would be anyway. - */ -static u32 mcdi_fec_caps_to_ethtool(u32 caps, bool is_25g) -{ - bool rs = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_LBN), - rs_req = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN), - baser = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) - : caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN), - baser_req = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN) - : caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN); - - if (!baser && !rs) - return ETHTOOL_FEC_OFF; - return (rs_req ? ETHTOOL_FEC_RS : 0) | - (baser_req ? ETHTOOL_FEC_BASER : 0) | - (baser == baser_req && rs == rs_req ? 0 : ETHTOOL_FEC_AUTO); -} - static int efx_mcdi_phy_probe(struct efx_nic *efx) { struct efx_mcdi_phy_data *phy_data; @@ -527,58 +188,6 @@ int efx_mcdi_port_reconfigure(struct efx_nic *efx) efx->loopback_mode, 0); } -/* Verify that the forced flow control settings (!EFX_FC_AUTO) are - * supported by the link partner. Warn the user if this isn't the case - */ -static void efx_mcdi_phy_check_fcntl(struct efx_nic *efx, u32 lpa) -{ - struct efx_mcdi_phy_data *phy_cfg = efx->phy_data; - u32 rmtadv; - - /* The link partner capabilities are only relevant if the - * link supports flow control autonegotiation */ - if (~phy_cfg->supported_cap & (1 << MC_CMD_PHY_CAP_AN_LBN)) - return; - - /* If flow control autoneg is supported and enabled, then fine */ - if (efx->wanted_fc & EFX_FC_AUTO) - return; - - rmtadv = 0; - if (lpa & (1 << MC_CMD_PHY_CAP_PAUSE_LBN)) - rmtadv |= ADVERTISED_Pause; - if (lpa & (1 << MC_CMD_PHY_CAP_ASYM_LBN)) - rmtadv |= ADVERTISED_Asym_Pause; - - if ((efx->wanted_fc & EFX_FC_TX) && rmtadv == ADVERTISED_Asym_Pause) - netif_err(efx, link, efx->net_dev, - "warning: link partner doesn't support pause frames"); -} - -static bool efx_mcdi_phy_poll(struct efx_nic *efx) -{ - struct efx_link_state old_state = efx->link_state; - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN); - int rc; - - WARN_ON(!mutex_is_locked(&efx->mac_lock)); - - BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0); - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0, - outbuf, sizeof(outbuf), NULL); - if (rc) - efx->link_state.up = false; - else - efx_mcdi_phy_decode_link( - efx, &efx->link_state, - MCDI_DWORD(outbuf, GET_LINK_OUT_LINK_SPEED), - MCDI_DWORD(outbuf, GET_LINK_OUT_FLAGS), - MCDI_DWORD(outbuf, GET_LINK_OUT_FCNTL)); - - return !efx_link_state_equal(&efx->link_state, &old_state); -} - static void efx_mcdi_phy_remove(struct efx_nic *efx) { struct efx_mcdi_phy_data *phy_data = efx->phy_data; @@ -666,58 +275,6 @@ efx_mcdi_phy_set_link_ksettings(struct efx_nic *efx, return 0; } -static int efx_mcdi_phy_get_fecparam(struct efx_nic *efx, - struct ethtool_fecparam *fec) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_V2_LEN); - u32 caps, active, speed; /* MCDI format */ - bool is_25g = false; - size_t outlen; - int rc; - - BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0); - rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0, - outbuf, sizeof(outbuf), &outlen); - if (rc) - return rc; - if (outlen < MC_CMD_GET_LINK_OUT_V2_LEN) - return -EOPNOTSUPP; - - /* behaviour for 25G/50G links depends on 25G BASER bit */ - speed = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_LINK_SPEED); - is_25g = speed == 25000 || speed == 50000; - - caps = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_CAP); - fec->fec = mcdi_fec_caps_to_ethtool(caps, is_25g); - /* BASER is never supported on 100G */ - if (speed == 100000) - fec->fec &= ~ETHTOOL_FEC_BASER; - - active = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_FEC_TYPE); - switch (active) { - case MC_CMD_FEC_NONE: - fec->active_fec = ETHTOOL_FEC_OFF; - break; - case MC_CMD_FEC_BASER: - fec->active_fec = ETHTOOL_FEC_BASER; - break; - case MC_CMD_FEC_RS: - fec->active_fec = ETHTOOL_FEC_RS; - break; - default: - netif_warn(efx, hw, efx->net_dev, - "Firmware reports unrecognised FEC_TYPE %u\n", - active); - /* We don't know what firmware has picked. AUTO is as good a - * "can't happen" value as any other. - */ - fec->active_fec = ETHTOOL_FEC_AUTO; - break; - } - - return 0; -} - static int efx_mcdi_phy_set_fecparam(struct efx_nic *efx, const struct ethtool_fecparam *fec) { @@ -745,27 +302,6 @@ static int efx_mcdi_phy_set_fecparam(struct efx_nic *efx, return 0; } -static int efx_mcdi_phy_test_alive(struct efx_nic *efx) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_STATE_OUT_LEN); - size_t outlen; - int rc; - - BUILD_BUG_ON(MC_CMD_GET_PHY_STATE_IN_LEN != 0); - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_STATE, NULL, 0, - outbuf, sizeof(outbuf), &outlen); - if (rc) - return rc; - - if (outlen < MC_CMD_GET_PHY_STATE_OUT_LEN) - return -EIO; - if (MCDI_DWORD(outbuf, GET_PHY_STATE_OUT_STATE) != MC_CMD_PHY_STATE_OK) - return -EINVAL; - - return 0; -} - static const char *const mcdi_sft9001_cable_diag_names[] = { "cable.pairA.length", "cable.pairB.length", @@ -1139,84 +675,6 @@ u32 efx_mcdi_phy_get_caps(struct efx_nic *efx) return phy_data->supported_cap; } -static unsigned int efx_mcdi_event_link_speed[] = { - [MCDI_EVENT_LINKCHANGE_SPEED_100M] = 100, - [MCDI_EVENT_LINKCHANGE_SPEED_1G] = 1000, - [MCDI_EVENT_LINKCHANGE_SPEED_10G] = 10000, - [MCDI_EVENT_LINKCHANGE_SPEED_40G] = 40000, - [MCDI_EVENT_LINKCHANGE_SPEED_25G] = 25000, - [MCDI_EVENT_LINKCHANGE_SPEED_50G] = 50000, - [MCDI_EVENT_LINKCHANGE_SPEED_100G] = 100000, -}; - -void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev) -{ - u32 flags, fcntl, speed, lpa; - - speed = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_SPEED); - EFX_WARN_ON_PARANOID(speed >= ARRAY_SIZE(efx_mcdi_event_link_speed)); - speed = efx_mcdi_event_link_speed[speed]; - - flags = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LINK_FLAGS); - fcntl = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_FCNTL); - lpa = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LP_CAP); - - /* efx->link_state is only modified by efx_mcdi_phy_get_link(), - * which is only run after flushing the event queues. Therefore, it - * is safe to modify the link state outside of the mac_lock here. - */ - efx_mcdi_phy_decode_link(efx, &efx->link_state, speed, flags, fcntl); - - efx_mcdi_phy_check_fcntl(efx, lpa); - - efx_link_status_changed(efx); -} - -int efx_mcdi_set_mac(struct efx_nic *efx) -{ - u32 fcntl; - MCDI_DECLARE_BUF(cmdbytes, MC_CMD_SET_MAC_IN_LEN); - - BUILD_BUG_ON(MC_CMD_SET_MAC_OUT_LEN != 0); - - /* This has no effect on EF10 */ - ether_addr_copy(MCDI_PTR(cmdbytes, SET_MAC_IN_ADDR), - efx->net_dev->dev_addr); - - MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_MTU, - EFX_MAX_FRAME_LEN(efx->net_dev->mtu)); - MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_DRAIN, 0); - - /* Set simple MAC filter for Siena */ - MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_REJECT, - SET_MAC_IN_REJECT_UNCST, efx->unicast_filter); - - MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_FLAGS, - SET_MAC_IN_FLAG_INCLUDE_FCS, - !!(efx->net_dev->features & NETIF_F_RXFCS)); - - switch (efx->wanted_fc) { - case EFX_FC_RX | EFX_FC_TX: - fcntl = MC_CMD_FCNTL_BIDIR; - break; - case EFX_FC_RX: - fcntl = MC_CMD_FCNTL_RESPOND; - break; - default: - fcntl = MC_CMD_FCNTL_OFF; - break; - } - if (efx->wanted_fc & EFX_FC_AUTO) - fcntl = MC_CMD_FCNTL_AUTO; - if (efx->fc_disable) - fcntl = MC_CMD_FCNTL_OFF; - - MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_FCNTL, fcntl); - - return efx_mcdi_rpc(efx, MC_CMD_SET_MAC, cmdbytes, sizeof(cmdbytes), - NULL, 0, NULL); -} - bool efx_mcdi_mac_check_fault(struct efx_nic *efx) { MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN); @@ -1348,17 +806,3 @@ void efx_mcdi_port_remove(struct efx_nic *efx) efx->phy_op->remove(efx); efx_nic_free_buffer(efx, &efx->stats_buffer); } - -/* Get physical port number (EF10 only; on Siena it is same as PF number) */ -int efx_mcdi_port_get_number(struct efx_nic *efx) -{ - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PORT_ASSIGNMENT_OUT_LEN); - int rc; - - rc = efx_mcdi_rpc(efx, MC_CMD_GET_PORT_ASSIGNMENT, NULL, 0, - outbuf, sizeof(outbuf), NULL); - if (rc) - return rc; - - return MCDI_DWORD(outbuf, GET_PORT_ASSIGNMENT_OUT_PORT); -} diff --git a/drivers/net/ethernet/sfc/mcdi_port_common.c b/drivers/net/ethernet/sfc/mcdi_port_common.c new file mode 100644 index 000000000000..a6a072ba46d3 --- /dev/null +++ b/drivers/net/ethernet/sfc/mcdi_port_common.c @@ -0,0 +1,568 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "mcdi_port_common.h" +#include "efx_common.h" + +int efx_mcdi_get_phy_cfg(struct efx_nic *efx, struct efx_mcdi_phy_data *cfg) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_CFG_OUT_LEN); + size_t outlen; + int rc; + + BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_IN_LEN != 0); + BUILD_BUG_ON(MC_CMD_GET_PHY_CFG_OUT_NAME_LEN != sizeof(cfg->name)); + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_CFG, NULL, 0, + outbuf, sizeof(outbuf), &outlen); + if (rc) + goto fail; + + if (outlen < MC_CMD_GET_PHY_CFG_OUT_LEN) { + rc = -EIO; + goto fail; + } + + cfg->flags = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_FLAGS); + cfg->type = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_TYPE); + cfg->supported_cap = + MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_SUPPORTED_CAP); + cfg->channel = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_CHANNEL); + cfg->port = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_PRT); + cfg->stats_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_STATS_MASK); + memcpy(cfg->name, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_NAME), + sizeof(cfg->name)); + cfg->media = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MEDIA_TYPE); + cfg->mmd_mask = MCDI_DWORD(outbuf, GET_PHY_CFG_OUT_MMD_MASK); + memcpy(cfg->revision, MCDI_PTR(outbuf, GET_PHY_CFG_OUT_REVISION), + sizeof(cfg->revision)); + + return 0; + +fail: + netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc); + return rc; +} + +void efx_link_set_advertising(struct efx_nic *efx, + const unsigned long *advertising) +{ + memcpy(efx->link_advertising, advertising, + sizeof(__ETHTOOL_DECLARE_LINK_MODE_MASK())); + + efx->link_advertising[0] |= ADVERTISED_Autoneg; + if (advertising[0] & ADVERTISED_Pause) + efx->wanted_fc |= (EFX_FC_TX | EFX_FC_RX); + else + efx->wanted_fc &= ~(EFX_FC_TX | EFX_FC_RX); + if (advertising[0] & ADVERTISED_Asym_Pause) + efx->wanted_fc ^= EFX_FC_TX; +} + +int efx_mcdi_set_link(struct efx_nic *efx, u32 capabilities, + u32 flags, u32 loopback_mode, u32 loopback_speed) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_SET_LINK_IN_LEN); + int rc; + + BUILD_BUG_ON(MC_CMD_SET_LINK_OUT_LEN != 0); + + MCDI_SET_DWORD(inbuf, SET_LINK_IN_CAP, capabilities); + MCDI_SET_DWORD(inbuf, SET_LINK_IN_FLAGS, flags); + MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_MODE, loopback_mode); + MCDI_SET_DWORD(inbuf, SET_LINK_IN_LOOPBACK_SPEED, loopback_speed); + + rc = efx_mcdi_rpc(efx, MC_CMD_SET_LINK, inbuf, sizeof(inbuf), + NULL, 0, NULL); + return rc; +} + +int efx_mcdi_loopback_modes(struct efx_nic *efx, u64 *loopback_modes) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LOOPBACK_MODES_OUT_LEN); + size_t outlen; + int rc; + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_LOOPBACK_MODES, NULL, 0, + outbuf, sizeof(outbuf), &outlen); + if (rc) + goto fail; + + if (outlen < (MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_OFST + + MC_CMD_GET_LOOPBACK_MODES_OUT_SUGGESTED_LEN)) { + rc = -EIO; + goto fail; + } + + *loopback_modes = MCDI_QWORD(outbuf, GET_LOOPBACK_MODES_OUT_SUGGESTED); + + return 0; + +fail: + netif_err(efx, hw, efx->net_dev, "%s: failed rc=%d\n", __func__, rc); + return rc; +} + +void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset) +{ + #define SET_BIT(name) __set_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \ + linkset) + + bitmap_zero(linkset, __ETHTOOL_LINK_MODE_MASK_NBITS); + switch (media) { + case MC_CMD_MEDIA_KX4: + SET_BIT(Backplane); + if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) + SET_BIT(1000baseKX_Full); + if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) + SET_BIT(10000baseKX4_Full); + if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) + SET_BIT(40000baseKR4_Full); + break; + + case MC_CMD_MEDIA_XFP: + case MC_CMD_MEDIA_SFP_PLUS: + case MC_CMD_MEDIA_QSFP_PLUS: + SET_BIT(FIBRE); + if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) + SET_BIT(1000baseT_Full); + if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) + SET_BIT(10000baseT_Full); + if (cap & (1 << MC_CMD_PHY_CAP_40000FDX_LBN)) + SET_BIT(40000baseCR4_Full); + if (cap & (1 << MC_CMD_PHY_CAP_100000FDX_LBN)) + SET_BIT(100000baseCR4_Full); + if (cap & (1 << MC_CMD_PHY_CAP_25000FDX_LBN)) + SET_BIT(25000baseCR_Full); + if (cap & (1 << MC_CMD_PHY_CAP_50000FDX_LBN)) + SET_BIT(50000baseCR2_Full); + break; + + case MC_CMD_MEDIA_BASE_T: + SET_BIT(TP); + if (cap & (1 << MC_CMD_PHY_CAP_10HDX_LBN)) + SET_BIT(10baseT_Half); + if (cap & (1 << MC_CMD_PHY_CAP_10FDX_LBN)) + SET_BIT(10baseT_Full); + if (cap & (1 << MC_CMD_PHY_CAP_100HDX_LBN)) + SET_BIT(100baseT_Half); + if (cap & (1 << MC_CMD_PHY_CAP_100FDX_LBN)) + SET_BIT(100baseT_Full); + if (cap & (1 << MC_CMD_PHY_CAP_1000HDX_LBN)) + SET_BIT(1000baseT_Half); + if (cap & (1 << MC_CMD_PHY_CAP_1000FDX_LBN)) + SET_BIT(1000baseT_Full); + if (cap & (1 << MC_CMD_PHY_CAP_10000FDX_LBN)) + SET_BIT(10000baseT_Full); + break; + } + + if (cap & (1 << MC_CMD_PHY_CAP_PAUSE_LBN)) + SET_BIT(Pause); + if (cap & (1 << MC_CMD_PHY_CAP_ASYM_LBN)) + SET_BIT(Asym_Pause); + if (cap & (1 << MC_CMD_PHY_CAP_AN_LBN)) + SET_BIT(Autoneg); + + #undef SET_BIT +} + +u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset) +{ + u32 result = 0; + + #define TEST_BIT(name) test_bit(ETHTOOL_LINK_MODE_ ## name ## _BIT, \ + linkset) + + if (TEST_BIT(10baseT_Half)) + result |= (1 << MC_CMD_PHY_CAP_10HDX_LBN); + if (TEST_BIT(10baseT_Full)) + result |= (1 << MC_CMD_PHY_CAP_10FDX_LBN); + if (TEST_BIT(100baseT_Half)) + result |= (1 << MC_CMD_PHY_CAP_100HDX_LBN); + if (TEST_BIT(100baseT_Full)) + result |= (1 << MC_CMD_PHY_CAP_100FDX_LBN); + if (TEST_BIT(1000baseT_Half)) + result |= (1 << MC_CMD_PHY_CAP_1000HDX_LBN); + if (TEST_BIT(1000baseT_Full) || TEST_BIT(1000baseKX_Full)) + result |= (1 << MC_CMD_PHY_CAP_1000FDX_LBN); + if (TEST_BIT(10000baseT_Full) || TEST_BIT(10000baseKX4_Full)) + result |= (1 << MC_CMD_PHY_CAP_10000FDX_LBN); + if (TEST_BIT(40000baseCR4_Full) || TEST_BIT(40000baseKR4_Full)) + result |= (1 << MC_CMD_PHY_CAP_40000FDX_LBN); + if (TEST_BIT(100000baseCR4_Full)) + result |= (1 << MC_CMD_PHY_CAP_100000FDX_LBN); + if (TEST_BIT(25000baseCR_Full)) + result |= (1 << MC_CMD_PHY_CAP_25000FDX_LBN); + if (TEST_BIT(50000baseCR2_Full)) + result |= (1 << MC_CMD_PHY_CAP_50000FDX_LBN); + if (TEST_BIT(Pause)) + result |= (1 << MC_CMD_PHY_CAP_PAUSE_LBN); + if (TEST_BIT(Asym_Pause)) + result |= (1 << MC_CMD_PHY_CAP_ASYM_LBN); + if (TEST_BIT(Autoneg)) + result |= (1 << MC_CMD_PHY_CAP_AN_LBN); + + #undef TEST_BIT + + return result; +} + +u32 efx_get_mcdi_phy_flags(struct efx_nic *efx) +{ + struct efx_mcdi_phy_data *phy_cfg = efx->phy_data; + enum efx_phy_mode mode, supported; + u32 flags; + + /* TODO: Advertise the capabilities supported by this PHY */ + supported = 0; + if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_TXDIS_LBN)) + supported |= PHY_MODE_TX_DISABLED; + if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_LOWPOWER_LBN)) + supported |= PHY_MODE_LOW_POWER; + if (phy_cfg->flags & (1 << MC_CMD_GET_PHY_CFG_OUT_POWEROFF_LBN)) + supported |= PHY_MODE_OFF; + + mode = efx->phy_mode & supported; + + flags = 0; + if (mode & PHY_MODE_TX_DISABLED) + flags |= (1 << MC_CMD_SET_LINK_IN_TXDIS_LBN); + if (mode & PHY_MODE_LOW_POWER) + flags |= (1 << MC_CMD_SET_LINK_IN_LOWPOWER_LBN); + if (mode & PHY_MODE_OFF) + flags |= (1 << MC_CMD_SET_LINK_IN_POWEROFF_LBN); + + return flags; +} + +u8 mcdi_to_ethtool_media(u32 media) +{ + switch (media) { + case MC_CMD_MEDIA_XAUI: + case MC_CMD_MEDIA_CX4: + case MC_CMD_MEDIA_KX4: + return PORT_OTHER; + + case MC_CMD_MEDIA_XFP: + case MC_CMD_MEDIA_SFP_PLUS: + case MC_CMD_MEDIA_QSFP_PLUS: + return PORT_FIBRE; + + case MC_CMD_MEDIA_BASE_T: + return PORT_TP; + + default: + return PORT_OTHER; + } +} + +void efx_mcdi_phy_decode_link(struct efx_nic *efx, + struct efx_link_state *link_state, + u32 speed, u32 flags, u32 fcntl) +{ + switch (fcntl) { + case MC_CMD_FCNTL_AUTO: + WARN_ON(1); /* This is not a link mode */ + link_state->fc = EFX_FC_AUTO | EFX_FC_TX | EFX_FC_RX; + break; + case MC_CMD_FCNTL_BIDIR: + link_state->fc = EFX_FC_TX | EFX_FC_RX; + break; + case MC_CMD_FCNTL_RESPOND: + link_state->fc = EFX_FC_RX; + break; + default: + WARN_ON(1); + /* Fall through */ + case MC_CMD_FCNTL_OFF: + link_state->fc = 0; + break; + } + + link_state->up = !!(flags & (1 << MC_CMD_GET_LINK_OUT_LINK_UP_LBN)); + link_state->fd = !!(flags & (1 << MC_CMD_GET_LINK_OUT_FULL_DUPLEX_LBN)); + link_state->speed = speed; +} + +/* The semantics of the ethtool FEC mode bitmask are not well defined, + * particularly the meaning of combinations of bits. Which means we get to + * define our own semantics, as follows: + * OFF overrides any other bits, and means "disable all FEC" (with the + * exception of 25G KR4/CR4, where it is not possible to reject it if AN + * partner requests it). + * AUTO on its own means use cable requirements and link partner autoneg with + * fw-default preferences for the cable type. + * AUTO and either RS or BASER means use the specified FEC type if cable and + * link partner support it, otherwise autoneg/fw-default. + * RS or BASER alone means use the specified FEC type if cable and link partner + * support it and either requests it, otherwise no FEC. + * Both RS and BASER (whether AUTO or not) means use FEC if cable and link + * partner support it, preferring RS to BASER. + */ +u32 ethtool_fec_caps_to_mcdi(u32 ethtool_cap) +{ + u32 ret = 0; + + if (ethtool_cap & ETHTOOL_FEC_OFF) + return 0; + + if (ethtool_cap & ETHTOOL_FEC_AUTO) + ret |= (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) | + (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) | + (1 << MC_CMD_PHY_CAP_RS_FEC_LBN); + if (ethtool_cap & ETHTOOL_FEC_RS) + ret |= (1 << MC_CMD_PHY_CAP_RS_FEC_LBN) | + (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN); + if (ethtool_cap & ETHTOOL_FEC_BASER) + ret |= (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN) | + (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) | + (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN) | + (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN); + return ret; +} + +/* Invert ethtool_fec_caps_to_mcdi. There are two combinations that function + * can never produce, (baser xor rs) and neither req; the implementation below + * maps both of those to AUTO. This should never matter, and it's not clear + * what a better mapping would be anyway. + */ +u32 mcdi_fec_caps_to_ethtool(u32 caps, bool is_25g) +{ + bool rs = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_LBN), + rs_req = caps & (1 << MC_CMD_PHY_CAP_RS_FEC_REQUESTED_LBN), + baser = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_LBN) + : caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_LBN), + baser_req = is_25g ? caps & (1 << MC_CMD_PHY_CAP_25G_BASER_FEC_REQUESTED_LBN) + : caps & (1 << MC_CMD_PHY_CAP_BASER_FEC_REQUESTED_LBN); + + if (!baser && !rs) + return ETHTOOL_FEC_OFF; + return (rs_req ? ETHTOOL_FEC_RS : 0) | + (baser_req ? ETHTOOL_FEC_BASER : 0) | + (baser == baser_req && rs == rs_req ? 0 : ETHTOOL_FEC_AUTO); +} + +/* Verify that the forced flow control settings (!EFX_FC_AUTO) are + * supported by the link partner. Warn the user if this isn't the case + */ +void efx_mcdi_phy_check_fcntl(struct efx_nic *efx, u32 lpa) +{ + struct efx_mcdi_phy_data *phy_cfg = efx->phy_data; + u32 rmtadv; + + /* The link partner capabilities are only relevant if the + * link supports flow control autonegotiation + */ + if (~phy_cfg->supported_cap & (1 << MC_CMD_PHY_CAP_AN_LBN)) + return; + + /* If flow control autoneg is supported and enabled, then fine */ + if (efx->wanted_fc & EFX_FC_AUTO) + return; + + rmtadv = 0; + if (lpa & (1 << MC_CMD_PHY_CAP_PAUSE_LBN)) + rmtadv |= ADVERTISED_Pause; + if (lpa & (1 << MC_CMD_PHY_CAP_ASYM_LBN)) + rmtadv |= ADVERTISED_Asym_Pause; + + if ((efx->wanted_fc & EFX_FC_TX) && rmtadv == ADVERTISED_Asym_Pause) + netif_err(efx, link, efx->net_dev, + "warning: link partner doesn't support pause frames"); +} + +bool efx_mcdi_phy_poll(struct efx_nic *efx) +{ + struct efx_link_state old_state = efx->link_state; + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_LEN); + int rc; + + WARN_ON(!mutex_is_locked(&efx->mac_lock)); + + BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0); + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0, + outbuf, sizeof(outbuf), NULL); + if (rc) + efx->link_state.up = false; + else + efx_mcdi_phy_decode_link( + efx, &efx->link_state, + MCDI_DWORD(outbuf, GET_LINK_OUT_LINK_SPEED), + MCDI_DWORD(outbuf, GET_LINK_OUT_FLAGS), + MCDI_DWORD(outbuf, GET_LINK_OUT_FCNTL)); + + return !efx_link_state_equal(&efx->link_state, &old_state); +} + +int efx_mcdi_phy_get_fecparam(struct efx_nic *efx, struct ethtool_fecparam *fec) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_LINK_OUT_V2_LEN); + u32 caps, active, speed; /* MCDI format */ + bool is_25g = false; + size_t outlen; + int rc; + + BUILD_BUG_ON(MC_CMD_GET_LINK_IN_LEN != 0); + rc = efx_mcdi_rpc(efx, MC_CMD_GET_LINK, NULL, 0, + outbuf, sizeof(outbuf), &outlen); + if (rc) + return rc; + if (outlen < MC_CMD_GET_LINK_OUT_V2_LEN) + return -EOPNOTSUPP; + + /* behaviour for 25G/50G links depends on 25G BASER bit */ + speed = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_LINK_SPEED); + is_25g = speed == 25000 || speed == 50000; + + caps = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_CAP); + fec->fec = mcdi_fec_caps_to_ethtool(caps, is_25g); + /* BASER is never supported on 100G */ + if (speed == 100000) + fec->fec &= ~ETHTOOL_FEC_BASER; + + active = MCDI_DWORD(outbuf, GET_LINK_OUT_V2_FEC_TYPE); + switch (active) { + case MC_CMD_FEC_NONE: + fec->active_fec = ETHTOOL_FEC_OFF; + break; + case MC_CMD_FEC_BASER: + fec->active_fec = ETHTOOL_FEC_BASER; + break; + case MC_CMD_FEC_RS: + fec->active_fec = ETHTOOL_FEC_RS; + break; + default: + netif_warn(efx, hw, efx->net_dev, + "Firmware reports unrecognised FEC_TYPE %u\n", + active); + /* We don't know what firmware has picked. AUTO is as good a + * "can't happen" value as any other. + */ + fec->active_fec = ETHTOOL_FEC_AUTO; + break; + } + + return 0; +} + +int efx_mcdi_phy_test_alive(struct efx_nic *efx) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PHY_STATE_OUT_LEN); + size_t outlen; + int rc; + + BUILD_BUG_ON(MC_CMD_GET_PHY_STATE_IN_LEN != 0); + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_PHY_STATE, NULL, 0, + outbuf, sizeof(outbuf), &outlen); + if (rc) + return rc; + + if (outlen < MC_CMD_GET_PHY_STATE_OUT_LEN) + return -EIO; + if (MCDI_DWORD(outbuf, GET_PHY_STATE_OUT_STATE) != MC_CMD_PHY_STATE_OK) + return -EINVAL; + + return 0; +} + +int efx_mcdi_set_mac(struct efx_nic *efx) +{ + u32 fcntl; + MCDI_DECLARE_BUF(cmdbytes, MC_CMD_SET_MAC_IN_LEN); + + BUILD_BUG_ON(MC_CMD_SET_MAC_OUT_LEN != 0); + + /* This has no effect on EF10 */ + ether_addr_copy(MCDI_PTR(cmdbytes, SET_MAC_IN_ADDR), + efx->net_dev->dev_addr); + + MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_MTU, + EFX_MAX_FRAME_LEN(efx->net_dev->mtu)); + MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_DRAIN, 0); + + /* Set simple MAC filter for Siena */ + MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_REJECT, + SET_MAC_IN_REJECT_UNCST, efx->unicast_filter); + + MCDI_POPULATE_DWORD_1(cmdbytes, SET_MAC_IN_FLAGS, + SET_MAC_IN_FLAG_INCLUDE_FCS, + !!(efx->net_dev->features & NETIF_F_RXFCS)); + + switch (efx->wanted_fc) { + case EFX_FC_RX | EFX_FC_TX: + fcntl = MC_CMD_FCNTL_BIDIR; + break; + case EFX_FC_RX: + fcntl = MC_CMD_FCNTL_RESPOND; + break; + default: + fcntl = MC_CMD_FCNTL_OFF; + break; + } + if (efx->wanted_fc & EFX_FC_AUTO) + fcntl = MC_CMD_FCNTL_AUTO; + if (efx->fc_disable) + fcntl = MC_CMD_FCNTL_OFF; + + MCDI_SET_DWORD(cmdbytes, SET_MAC_IN_FCNTL, fcntl); + + return efx_mcdi_rpc(efx, MC_CMD_SET_MAC, cmdbytes, sizeof(cmdbytes), + NULL, 0, NULL); +} + +/* Get physical port number (EF10 only; on Siena it is same as PF number) */ +int efx_mcdi_port_get_number(struct efx_nic *efx) +{ + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_PORT_ASSIGNMENT_OUT_LEN); + int rc; + + rc = efx_mcdi_rpc(efx, MC_CMD_GET_PORT_ASSIGNMENT, NULL, 0, + outbuf, sizeof(outbuf), NULL); + if (rc) + return rc; + + return MCDI_DWORD(outbuf, GET_PORT_ASSIGNMENT_OUT_PORT); +} + +static unsigned int efx_mcdi_event_link_speed[] = { + [MCDI_EVENT_LINKCHANGE_SPEED_100M] = 100, + [MCDI_EVENT_LINKCHANGE_SPEED_1G] = 1000, + [MCDI_EVENT_LINKCHANGE_SPEED_10G] = 10000, + [MCDI_EVENT_LINKCHANGE_SPEED_40G] = 40000, + [MCDI_EVENT_LINKCHANGE_SPEED_25G] = 25000, + [MCDI_EVENT_LINKCHANGE_SPEED_50G] = 50000, + [MCDI_EVENT_LINKCHANGE_SPEED_100G] = 100000, +}; + +void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev) +{ + u32 flags, fcntl, speed, lpa; + + speed = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_SPEED); + EFX_WARN_ON_PARANOID(speed >= ARRAY_SIZE(efx_mcdi_event_link_speed)); + speed = efx_mcdi_event_link_speed[speed]; + + flags = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LINK_FLAGS); + fcntl = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_FCNTL); + lpa = EFX_QWORD_FIELD(*ev, MCDI_EVENT_LINKCHANGE_LP_CAP); + + /* efx->link_state is only modified by efx_mcdi_phy_get_link(), + * which is only run after flushing the event queues. Therefore, it + * is safe to modify the link state outside of the mac_lock here. + */ + efx_mcdi_phy_decode_link(efx, &efx->link_state, speed, flags, fcntl); + + efx_mcdi_phy_check_fcntl(efx, lpa); + + efx_link_status_changed(efx); +} diff --git a/drivers/net/ethernet/sfc/mcdi_port_common.h b/drivers/net/ethernet/sfc/mcdi_port_common.h new file mode 100644 index 000000000000..b16f11265269 --- /dev/null +++ b/drivers/net/ethernet/sfc/mcdi_port_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ +#ifndef EFX_MCDI_PORT_COMMON_H +#define EFX_MCDI_PORT_COMMON_H + +#include "net_driver.h" +#include "mcdi.h" +#include "mcdi_pcol.h" + +struct efx_mcdi_phy_data { + u32 flags; + u32 type; + u32 supported_cap; + u32 channel; + u32 port; + u32 stats_mask; + u8 name[20]; + u32 media; + u32 mmd_mask; + u8 revision[20]; + u32 forced_cap; +}; + +#define EFX_MC_STATS_GENERATION_INVALID ((__force __le64)(-1)) + +int efx_mcdi_get_phy_cfg(struct efx_nic *efx, struct efx_mcdi_phy_data *cfg); +void efx_link_set_advertising(struct efx_nic *efx, + const unsigned long *advertising); +int efx_mcdi_set_link(struct efx_nic *efx, u32 capabilities, + u32 flags, u32 loopback_mode, u32 loopback_speed); +int efx_mcdi_loopback_modes(struct efx_nic *efx, u64 *loopback_modes); +void mcdi_to_ethtool_linkset(u32 media, u32 cap, unsigned long *linkset); +u32 ethtool_linkset_to_mcdi_cap(const unsigned long *linkset); +u32 efx_get_mcdi_phy_flags(struct efx_nic *efx); +u8 mcdi_to_ethtool_media(u32 media); +void efx_mcdi_phy_decode_link(struct efx_nic *efx, + struct efx_link_state *link_state, + u32 speed, u32 flags, u32 fcntl); +u32 ethtool_fec_caps_to_mcdi(u32 ethtool_cap); +u32 mcdi_fec_caps_to_ethtool(u32 caps, bool is_25g); +void efx_mcdi_phy_check_fcntl(struct efx_nic *efx, u32 lpa); +bool efx_mcdi_phy_poll(struct efx_nic *efx); +int efx_mcdi_phy_get_fecparam(struct efx_nic *efx, + struct ethtool_fecparam *fec); +int efx_mcdi_phy_test_alive(struct efx_nic *efx); +int efx_mcdi_set_mac(struct efx_nic *efx); +int efx_mcdi_port_get_number(struct efx_nic *efx); +void efx_mcdi_process_link_change(struct efx_nic *efx, efx_qword_t *ev); + +#endif diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 709172a6995e..9f9886f222c8 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -138,6 +138,8 @@ struct efx_special_buffer { * freed when descriptor completes * @xdpf: When @flags & %EFX_TX_BUF_XDP, the XDP frame information; its @data * member is the associated buffer to drop a page reference on. + * @option: When @flags & %EFX_TX_BUF_OPTION, an EF10-specific option + * descriptor. * @dma_addr: DMA address of the fragment. * @flags: Flags for allocation and DMA mapping type * @len: Length of this fragment. @@ -152,7 +154,7 @@ struct efx_tx_buffer { struct xdp_frame *xdpf; }; union { - efx_qword_t option; + efx_qword_t option; /* EF10 */ dma_addr_t dma_addr; }; unsigned short flags; @@ -742,13 +744,13 @@ union efx_multicast_hash { struct vfdi_status; /* The reserved RSS context value */ -#define EFX_EF10_RSS_CONTEXT_INVALID 0xffffffff +#define EFX_MCDI_RSS_CONTEXT_INVALID 0xffffffff /** * struct efx_rss_context - A user-defined RSS context for filtering * @list: node of linked list on which this struct is stored * @context_id: the RSS_CONTEXT_ID returned by MC firmware, or - * %EFX_EF10_RSS_CONTEXT_INVALID if this context is not present on the NIC. - * For Siena, 0 if RSS is active, else %EFX_EF10_RSS_CONTEXT_INVALID. + * %EFX_MCDI_RSS_CONTEXT_INVALID if this context is not present on the NIC. + * For Siena, 0 if RSS is active, else %EFX_MCDI_RSS_CONTEXT_INVALID. * @user_id: the rss_context ID exposed to userspace over ethtool. * @rx_hash_udp_4tuple: UDP 4-tuple hashing enabled * @rx_hash_key: Toeplitz hash key for this RSS context @@ -1610,6 +1612,15 @@ static inline struct efx_rx_buffer *efx_rx_buffer(struct efx_rx_queue *rx_queue, return &rx_queue->buffer[index]; } +static inline struct efx_rx_buffer * +efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf) +{ + if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask))) + return efx_rx_buffer(rx_queue, 0); + else + return rx_buf + 1; +} + /** * EFX_MAX_FRAME_LEN - calculate maximum frame length * diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index bf0bdb22cc64..6670fda8f35a 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -11,6 +11,7 @@ #include <linux/net_tstamp.h> #include "net_driver.h" #include "efx.h" +#include "efx_common.h" #include "mcdi.h" enum { @@ -505,6 +506,9 @@ static inline void efx_nic_push_buffers(struct efx_tx_queue *tx_queue) tx_queue->efx->type->tx_write(tx_queue); } +int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, struct sk_buff *skb, + bool *data_mapped); + /* RX data path */ static inline int efx_nic_probe_rx(struct efx_rx_queue *rx_queue) { @@ -553,6 +557,7 @@ static inline void efx_nic_eventq_read_ack(struct efx_channel *channel) { channel->efx->type->ev_read_ack(channel); } + void efx_nic_event_test_start(struct efx_channel *channel); /* Falcon/Siena queue operations */ @@ -670,6 +675,7 @@ struct efx_farch_register_test { unsigned address; efx_oword_t mask; }; + int efx_farch_test_registers(struct efx_nic *efx, const struct efx_farch_register_test *regs, size_t n_regs); diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index c29bf862a94c..a2042f16babc 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -21,6 +21,7 @@ #include <linux/bpf_trace.h> #include "net_driver.h" #include "efx.h" +#include "rx_common.h" #include "filter.h" #include "nic.h" #include "selftest.h" @@ -32,60 +33,13 @@ /* Maximum rx prefix used by any architecture. */ #define EFX_MAX_RX_PREFIX_SIZE 16 -/* Number of RX buffers to recycle pages for. When creating the RX page recycle - * ring, this number is divided by the number of buffers per page to calculate - * the number of pages to store in the RX page recycle ring. - */ -#define EFX_RECYCLE_RING_SIZE_IOMMU 4096 -#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH) - /* Size of buffer allocated for skb header area. */ #define EFX_SKB_HEADERS 128u -/* This is the percentage fill level below which new RX descriptors - * will be added to the RX descriptor ring. - */ -static unsigned int rx_refill_threshold; - /* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */ #define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \ EFX_RX_USR_BUF_SIZE) -/* - * RX maximum head room required. - * - * This must be at least 1 to prevent overflow, plus one packet-worth - * to allow pipelined receives. - */ -#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS) - -static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf) -{ - return page_address(buf->page) + buf->page_offset; -} - -static inline u32 efx_rx_buf_hash(struct efx_nic *efx, const u8 *eh) -{ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - return __le32_to_cpup((const __le32 *)(eh + efx->rx_packet_hash_offset)); -#else - const u8 *data = eh + efx->rx_packet_hash_offset; - return (u32)data[0] | - (u32)data[1] << 8 | - (u32)data[2] << 16 | - (u32)data[3] << 24; -#endif -} - -static inline struct efx_rx_buffer * -efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf) -{ - if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask))) - return efx_rx_buffer(rx_queue, 0); - else - return rx_buf + 1; -} - static inline void efx_sync_rx_buffer(struct efx_nic *efx, struct efx_rx_buffer *rx_buf, unsigned int len) @@ -94,301 +48,6 @@ static inline void efx_sync_rx_buffer(struct efx_nic *efx, DMA_FROM_DEVICE); } -void efx_rx_config_page_split(struct efx_nic *efx) -{ - efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + efx->rx_ip_align + - XDP_PACKET_HEADROOM, - EFX_RX_BUF_ALIGNMENT); - efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 : - ((PAGE_SIZE - sizeof(struct efx_rx_page_state)) / - efx->rx_page_buf_step); - efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) / - efx->rx_bufs_per_page; - efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, - efx->rx_bufs_per_page); -} - -/* Check the RX page recycle ring for a page that can be reused. */ -static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue) -{ - struct efx_nic *efx = rx_queue->efx; - struct page *page; - struct efx_rx_page_state *state; - unsigned index; - - index = rx_queue->page_remove & rx_queue->page_ptr_mask; - page = rx_queue->page_ring[index]; - if (page == NULL) - return NULL; - - rx_queue->page_ring[index] = NULL; - /* page_remove cannot exceed page_add. */ - if (rx_queue->page_remove != rx_queue->page_add) - ++rx_queue->page_remove; - - /* If page_count is 1 then we hold the only reference to this page. */ - if (page_count(page) == 1) { - ++rx_queue->page_recycle_count; - return page; - } else { - state = page_address(page); - dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - put_page(page); - ++rx_queue->page_recycle_failed; - } - - return NULL; -} - -/** - * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers - * - * @rx_queue: Efx RX queue - * - * This allocates a batch of pages, maps them for DMA, and populates - * struct efx_rx_buffers for each one. Return a negative error code or - * 0 on success. If a single page can be used for multiple buffers, - * then the page will either be inserted fully, or not at all. - */ -static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) -{ - struct efx_nic *efx = rx_queue->efx; - struct efx_rx_buffer *rx_buf; - struct page *page; - unsigned int page_offset; - struct efx_rx_page_state *state; - dma_addr_t dma_addr; - unsigned index, count; - - count = 0; - do { - page = efx_reuse_page(rx_queue); - if (page == NULL) { - page = alloc_pages(__GFP_COMP | - (atomic ? GFP_ATOMIC : GFP_KERNEL), - efx->rx_buffer_order); - if (unlikely(page == NULL)) - return -ENOMEM; - dma_addr = - dma_map_page(&efx->pci_dev->dev, page, 0, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(&efx->pci_dev->dev, - dma_addr))) { - __free_pages(page, efx->rx_buffer_order); - return -EIO; - } - state = page_address(page); - state->dma_addr = dma_addr; - } else { - state = page_address(page); - dma_addr = state->dma_addr; - } - - dma_addr += sizeof(struct efx_rx_page_state); - page_offset = sizeof(struct efx_rx_page_state); - - do { - index = rx_queue->added_count & rx_queue->ptr_mask; - rx_buf = efx_rx_buffer(rx_queue, index); - rx_buf->dma_addr = dma_addr + efx->rx_ip_align + - XDP_PACKET_HEADROOM; - rx_buf->page = page; - rx_buf->page_offset = page_offset + efx->rx_ip_align + - XDP_PACKET_HEADROOM; - rx_buf->len = efx->rx_dma_len; - rx_buf->flags = 0; - ++rx_queue->added_count; - get_page(page); - dma_addr += efx->rx_page_buf_step; - page_offset += efx->rx_page_buf_step; - } while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE); - - rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE; - } while (++count < efx->rx_pages_per_batch); - - return 0; -} - -/* Unmap a DMA-mapped page. This function is only called for the final RX - * buffer in a page. - */ -static void efx_unmap_rx_buffer(struct efx_nic *efx, - struct efx_rx_buffer *rx_buf) -{ - struct page *page = rx_buf->page; - - if (page) { - struct efx_rx_page_state *state = page_address(page); - dma_unmap_page(&efx->pci_dev->dev, - state->dma_addr, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - } -} - -static void efx_free_rx_buffers(struct efx_rx_queue *rx_queue, - struct efx_rx_buffer *rx_buf, - unsigned int num_bufs) -{ - do { - if (rx_buf->page) { - put_page(rx_buf->page); - rx_buf->page = NULL; - } - rx_buf = efx_rx_buf_next(rx_queue, rx_buf); - } while (--num_bufs); -} - -/* Attempt to recycle the page if there is an RX recycle ring; the page can - * only be added if this is the final RX buffer, to prevent pages being used in - * the descriptor ring and appearing in the recycle ring simultaneously. - */ -static void efx_recycle_rx_page(struct efx_channel *channel, - struct efx_rx_buffer *rx_buf) -{ - struct page *page = rx_buf->page; - struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); - struct efx_nic *efx = rx_queue->efx; - unsigned index; - - /* Only recycle the page after processing the final buffer. */ - if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE)) - return; - - index = rx_queue->page_add & rx_queue->page_ptr_mask; - if (rx_queue->page_ring[index] == NULL) { - unsigned read_index = rx_queue->page_remove & - rx_queue->page_ptr_mask; - - /* The next slot in the recycle ring is available, but - * increment page_remove if the read pointer currently - * points here. - */ - if (read_index == index) - ++rx_queue->page_remove; - rx_queue->page_ring[index] = page; - ++rx_queue->page_add; - return; - } - ++rx_queue->page_recycle_full; - efx_unmap_rx_buffer(efx, rx_buf); - put_page(rx_buf->page); -} - -static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, - struct efx_rx_buffer *rx_buf) -{ - /* Release the page reference we hold for the buffer. */ - if (rx_buf->page) - put_page(rx_buf->page); - - /* If this is the last buffer in a page, unmap and free it. */ - if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) { - efx_unmap_rx_buffer(rx_queue->efx, rx_buf); - efx_free_rx_buffers(rx_queue, rx_buf, 1); - } - rx_buf->page = NULL; -} - -/* Recycle the pages that are used by buffers that have just been received. */ -static void efx_recycle_rx_pages(struct efx_channel *channel, - struct efx_rx_buffer *rx_buf, - unsigned int n_frags) -{ - struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); - - do { - efx_recycle_rx_page(channel, rx_buf); - rx_buf = efx_rx_buf_next(rx_queue, rx_buf); - } while (--n_frags); -} - -static void efx_discard_rx_packet(struct efx_channel *channel, - struct efx_rx_buffer *rx_buf, - unsigned int n_frags) -{ - struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); - - efx_recycle_rx_pages(channel, rx_buf, n_frags); - - efx_free_rx_buffers(rx_queue, rx_buf, n_frags); -} - -/** - * efx_fast_push_rx_descriptors - push new RX descriptors quickly - * @rx_queue: RX descriptor queue - * - * This will aim to fill the RX descriptor queue up to - * @rx_queue->@max_fill. If there is insufficient atomic - * memory to do so, a slow fill will be scheduled. - * - * The caller must provide serialisation (none is used here). In practise, - * this means this function must run from the NAPI handler, or be called - * when NAPI is disabled. - */ -void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic) -{ - struct efx_nic *efx = rx_queue->efx; - unsigned int fill_level, batch_size; - int space, rc = 0; - - if (!rx_queue->refill_enabled) - return; - - /* Calculate current fill level, and exit if we don't need to fill */ - fill_level = (rx_queue->added_count - rx_queue->removed_count); - EFX_WARN_ON_ONCE_PARANOID(fill_level > rx_queue->efx->rxq_entries); - if (fill_level >= rx_queue->fast_fill_trigger) - goto out; - - /* Record minimum fill level */ - if (unlikely(fill_level < rx_queue->min_fill)) { - if (fill_level) - rx_queue->min_fill = fill_level; - } - - batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page; - space = rx_queue->max_fill - fill_level; - EFX_WARN_ON_ONCE_PARANOID(space < batch_size); - - netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, - "RX queue %d fast-filling descriptor ring from" - " level %d to level %d\n", - efx_rx_queue_index(rx_queue), fill_level, - rx_queue->max_fill); - - - do { - rc = efx_init_rx_buffers(rx_queue, atomic); - if (unlikely(rc)) { - /* Ensure that we don't leave the rx queue empty */ - efx_schedule_slow_fill(rx_queue); - goto out; - } - } while ((space -= batch_size) >= batch_size); - - netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, - "RX queue %d fast-filled descriptor ring " - "to level %d\n", efx_rx_queue_index(rx_queue), - rx_queue->added_count - rx_queue->removed_count); - - out: - if (rx_queue->notified_count != rx_queue->added_count) - efx_nic_notify_rx_desc(rx_queue); -} - -void efx_rx_slow_fill(struct timer_list *t) -{ - struct efx_rx_queue *rx_queue = from_timer(rx_queue, t, slow_fill); - - /* Post an event to cause NAPI to run and refill the queue */ - efx_nic_generate_fill_event(rx_queue); - ++rx_queue->slow_fill_count; -} - static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf, int len) @@ -412,53 +71,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, efx_rx_queue_channel(rx_queue)->n_rx_overlength++; } -/* Pass a received packet up through GRO. GRO can handle pages - * regardless of checksum state and skbs with a good checksum. - */ -static void -efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, - unsigned int n_frags, u8 *eh) -{ - struct napi_struct *napi = &channel->napi_str; - struct efx_nic *efx = channel->efx; - struct sk_buff *skb; - - skb = napi_get_frags(napi); - if (unlikely(!skb)) { - struct efx_rx_queue *rx_queue; - - rx_queue = efx_channel_get_rx_queue(channel); - efx_free_rx_buffers(rx_queue, rx_buf, n_frags); - return; - } - - if (efx->net_dev->features & NETIF_F_RXHASH) - skb_set_hash(skb, efx_rx_buf_hash(efx, eh), - PKT_HASH_TYPE_L3); - skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ? - CHECKSUM_UNNECESSARY : CHECKSUM_NONE); - skb->csum_level = !!(rx_buf->flags & EFX_RX_PKT_CSUM_LEVEL); - - for (;;) { - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, - rx_buf->page, rx_buf->page_offset, - rx_buf->len); - rx_buf->page = NULL; - skb->len += rx_buf->len; - if (skb_shinfo(skb)->nr_frags == n_frags) - break; - - rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf); - } - - skb->data_len = skb->len; - skb->truesize += n_frags * efx->rx_buffer_truesize; - - skb_record_rx_queue(skb, channel->rx_queue.core_index); - - napi_gro_frags(napi); -} - /* Allocate and construct an SKB around page fragments */ static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, @@ -805,174 +417,6 @@ out: channel->rx_pkt_n_frags = 0; } -int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) -{ - struct efx_nic *efx = rx_queue->efx; - unsigned int entries; - int rc; - - /* Create the smallest power-of-two aligned ring */ - entries = max(roundup_pow_of_two(efx->rxq_entries), EFX_MIN_DMAQ_SIZE); - EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); - rx_queue->ptr_mask = entries - 1; - - netif_dbg(efx, probe, efx->net_dev, - "creating RX queue %d size %#x mask %#x\n", - efx_rx_queue_index(rx_queue), efx->rxq_entries, - rx_queue->ptr_mask); - - /* Allocate RX buffers */ - rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer), - GFP_KERNEL); - if (!rx_queue->buffer) - return -ENOMEM; - - rc = efx_nic_probe_rx(rx_queue); - if (rc) { - kfree(rx_queue->buffer); - rx_queue->buffer = NULL; - } - - return rc; -} - -static void efx_init_rx_recycle_ring(struct efx_nic *efx, - struct efx_rx_queue *rx_queue) -{ - unsigned int bufs_in_recycle_ring, page_ring_size; - - /* Set the RX recycle ring size */ -#ifdef CONFIG_PPC64 - bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; -#else - if (iommu_present(&pci_bus_type)) - bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; - else - bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU; -#endif /* CONFIG_PPC64 */ - - page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring / - efx->rx_bufs_per_page); - rx_queue->page_ring = kcalloc(page_ring_size, - sizeof(*rx_queue->page_ring), GFP_KERNEL); - rx_queue->page_ptr_mask = page_ring_size - 1; -} - -void efx_init_rx_queue(struct efx_rx_queue *rx_queue) -{ - struct efx_nic *efx = rx_queue->efx; - unsigned int max_fill, trigger, max_trigger; - int rc = 0; - - netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, - "initialising RX queue %d\n", efx_rx_queue_index(rx_queue)); - - /* Initialise ptr fields */ - rx_queue->added_count = 0; - rx_queue->notified_count = 0; - rx_queue->removed_count = 0; - rx_queue->min_fill = -1U; - efx_init_rx_recycle_ring(efx, rx_queue); - - rx_queue->page_remove = 0; - rx_queue->page_add = rx_queue->page_ptr_mask + 1; - rx_queue->page_recycle_count = 0; - rx_queue->page_recycle_failed = 0; - rx_queue->page_recycle_full = 0; - - /* Initialise limit fields */ - max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; - max_trigger = - max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page; - if (rx_refill_threshold != 0) { - trigger = max_fill * min(rx_refill_threshold, 100U) / 100U; - if (trigger > max_trigger) - trigger = max_trigger; - } else { - trigger = max_trigger; - } - - rx_queue->max_fill = max_fill; - rx_queue->fast_fill_trigger = trigger; - rx_queue->refill_enabled = true; - - /* Initialise XDP queue information */ - rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, - rx_queue->core_index); - - if (rc) { - netif_err(efx, rx_err, efx->net_dev, - "Failure to initialise XDP queue information rc=%d\n", - rc); - efx->xdp_rxq_info_failed = true; - } else { - rx_queue->xdp_rxq_info_valid = true; - } - - /* Set up RX descriptor ring */ - efx_nic_init_rx(rx_queue); -} - -void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) -{ - int i; - struct efx_nic *efx = rx_queue->efx; - struct efx_rx_buffer *rx_buf; - - netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, - "shutting down RX queue %d\n", efx_rx_queue_index(rx_queue)); - - del_timer_sync(&rx_queue->slow_fill); - - /* Release RX buffers from the current read ptr to the write ptr */ - if (rx_queue->buffer) { - for (i = rx_queue->removed_count; i < rx_queue->added_count; - i++) { - unsigned index = i & rx_queue->ptr_mask; - rx_buf = efx_rx_buffer(rx_queue, index); - efx_fini_rx_buffer(rx_queue, rx_buf); - } - } - - /* Unmap and release the pages in the recycle ring. Remove the ring. */ - for (i = 0; i <= rx_queue->page_ptr_mask; i++) { - struct page *page = rx_queue->page_ring[i]; - struct efx_rx_page_state *state; - - if (page == NULL) - continue; - - state = page_address(page); - dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - put_page(page); - } - kfree(rx_queue->page_ring); - rx_queue->page_ring = NULL; - - if (rx_queue->xdp_rxq_info_valid) - xdp_rxq_info_unreg(&rx_queue->xdp_rxq_info); - - rx_queue->xdp_rxq_info_valid = false; -} - -void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) -{ - netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, - "destroying RX queue %d\n", efx_rx_queue_index(rx_queue)); - - efx_nic_remove_rx(rx_queue); - - kfree(rx_queue->buffer); - rx_queue->buffer = NULL; -} - - -module_param(rx_refill_threshold, uint, 0444); -MODULE_PARM_DESC(rx_refill_threshold, - "RX descriptor ring refill threshold (%)"); - #ifdef CONFIG_RFS_ACCEL static void efx_filter_rfs_work(struct work_struct *data) @@ -1206,37 +650,3 @@ bool __efx_filter_rfs_expire(struct efx_channel *channel, unsigned int quota) } #endif /* CONFIG_RFS_ACCEL */ - -/** - * efx_filter_is_mc_recipient - test whether spec is a multicast recipient - * @spec: Specification to test - * - * Return: %true if the specification is a non-drop RX filter that - * matches a local MAC address I/G bit value of 1 or matches a local - * IPv4 or IPv6 address value in the respective multicast address - * range. Otherwise %false. - */ -bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec) -{ - if (!(spec->flags & EFX_FILTER_FLAG_RX) || - spec->dmaq_id == EFX_FILTER_RX_DMAQ_ID_DROP) - return false; - - if (spec->match_flags & - (EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG) && - is_multicast_ether_addr(spec->loc_mac)) - return true; - - if ((spec->match_flags & - (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) == - (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) { - if (spec->ether_type == htons(ETH_P_IP) && - ipv4_is_multicast(spec->loc_host[0])) - return true; - if (spec->ether_type == htons(ETH_P_IPV6) && - ((const u8 *)spec->loc_host)[0] == 0xff) - return true; - } - - return false; -} diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c new file mode 100644 index 000000000000..ee8beb87bdc1 --- /dev/null +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "net_driver.h" +#include <linux/module.h> +#include <linux/iommu.h> +#include "efx.h" +#include "nic.h" +#include "rx_common.h" + +/* This is the percentage fill level below which new RX descriptors + * will be added to the RX descriptor ring. + */ +static unsigned int rx_refill_threshold; +module_param(rx_refill_threshold, uint, 0444); +MODULE_PARM_DESC(rx_refill_threshold, + "RX descriptor ring refill threshold (%)"); + +/* Number of RX buffers to recycle pages for. When creating the RX page recycle + * ring, this number is divided by the number of buffers per page to calculate + * the number of pages to store in the RX page recycle ring. + */ +#define EFX_RECYCLE_RING_SIZE_IOMMU 4096 +#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH) + +/* RX maximum head room required. + * + * This must be at least 1 to prevent overflow, plus one packet-worth + * to allow pipelined receives. + */ +#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS) + +/* Check the RX page recycle ring for a page that can be reused. */ +static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue) +{ + struct efx_nic *efx = rx_queue->efx; + struct efx_rx_page_state *state; + unsigned int index; + struct page *page; + + index = rx_queue->page_remove & rx_queue->page_ptr_mask; + page = rx_queue->page_ring[index]; + if (page == NULL) + return NULL; + + rx_queue->page_ring[index] = NULL; + /* page_remove cannot exceed page_add. */ + if (rx_queue->page_remove != rx_queue->page_add) + ++rx_queue->page_remove; + + /* If page_count is 1 then we hold the only reference to this page. */ + if (page_count(page) == 1) { + ++rx_queue->page_recycle_count; + return page; + } else { + state = page_address(page); + dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + put_page(page); + ++rx_queue->page_recycle_failed; + } + + return NULL; +} + +/* Attempt to recycle the page if there is an RX recycle ring; the page can + * only be added if this is the final RX buffer, to prevent pages being used in + * the descriptor ring and appearing in the recycle ring simultaneously. + */ +static void efx_recycle_rx_page(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf) +{ + struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); + struct efx_nic *efx = rx_queue->efx; + struct page *page = rx_buf->page; + unsigned int index; + + /* Only recycle the page after processing the final buffer. */ + if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE)) + return; + + index = rx_queue->page_add & rx_queue->page_ptr_mask; + if (rx_queue->page_ring[index] == NULL) { + unsigned int read_index = rx_queue->page_remove & + rx_queue->page_ptr_mask; + + /* The next slot in the recycle ring is available, but + * increment page_remove if the read pointer currently + * points here. + */ + if (read_index == index) + ++rx_queue->page_remove; + rx_queue->page_ring[index] = page; + ++rx_queue->page_add; + return; + } + ++rx_queue->page_recycle_full; + efx_unmap_rx_buffer(efx, rx_buf); + put_page(rx_buf->page); +} + +/* Recycle the pages that are used by buffers that have just been received. */ +void efx_recycle_rx_pages(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf, + unsigned int n_frags) +{ + struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); + + do { + efx_recycle_rx_page(channel, rx_buf); + rx_buf = efx_rx_buf_next(rx_queue, rx_buf); + } while (--n_frags); +} + +void efx_discard_rx_packet(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf, + unsigned int n_frags) +{ + struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); + + efx_recycle_rx_pages(channel, rx_buf, n_frags); + + efx_free_rx_buffers(rx_queue, rx_buf, n_frags); +} + +static void efx_init_rx_recycle_ring(struct efx_rx_queue *rx_queue) +{ + unsigned int bufs_in_recycle_ring, page_ring_size; + struct efx_nic *efx = rx_queue->efx; + + /* Set the RX recycle ring size */ +#ifdef CONFIG_PPC64 + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; +#else + if (iommu_present(&pci_bus_type)) + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; + else + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU; +#endif /* CONFIG_PPC64 */ + + page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring / + efx->rx_bufs_per_page); + rx_queue->page_ring = kcalloc(page_ring_size, + sizeof(*rx_queue->page_ring), GFP_KERNEL); + rx_queue->page_ptr_mask = page_ring_size - 1; +} + +static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue) +{ + struct efx_nic *efx = rx_queue->efx; + int i; + + /* Unmap and release the pages in the recycle ring. Remove the ring. */ + for (i = 0; i <= rx_queue->page_ptr_mask; i++) { + struct page *page = rx_queue->page_ring[i]; + struct efx_rx_page_state *state; + + if (page == NULL) + continue; + + state = page_address(page); + dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + put_page(page); + } + kfree(rx_queue->page_ring); + rx_queue->page_ring = NULL; +} + +static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, + struct efx_rx_buffer *rx_buf) +{ + /* Release the page reference we hold for the buffer. */ + if (rx_buf->page) + put_page(rx_buf->page); + + /* If this is the last buffer in a page, unmap and free it. */ + if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) { + efx_unmap_rx_buffer(rx_queue->efx, rx_buf); + efx_free_rx_buffers(rx_queue, rx_buf, 1); + } + rx_buf->page = NULL; +} + +int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) +{ + struct efx_nic *efx = rx_queue->efx; + unsigned int entries; + int rc; + + /* Create the smallest power-of-two aligned ring */ + entries = max(roundup_pow_of_two(efx->rxq_entries), EFX_MIN_DMAQ_SIZE); + EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); + rx_queue->ptr_mask = entries - 1; + + netif_dbg(efx, probe, efx->net_dev, + "creating RX queue %d size %#x mask %#x\n", + efx_rx_queue_index(rx_queue), efx->rxq_entries, + rx_queue->ptr_mask); + + /* Allocate RX buffers */ + rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer), + GFP_KERNEL); + if (!rx_queue->buffer) + return -ENOMEM; + + rc = efx_nic_probe_rx(rx_queue); + if (rc) { + kfree(rx_queue->buffer); + rx_queue->buffer = NULL; + } + + return rc; +} + +void efx_init_rx_queue(struct efx_rx_queue *rx_queue) +{ + unsigned int max_fill, trigger, max_trigger; + struct efx_nic *efx = rx_queue->efx; + int rc = 0; + + netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, + "initialising RX queue %d\n", efx_rx_queue_index(rx_queue)); + + /* Initialise ptr fields */ + rx_queue->added_count = 0; + rx_queue->notified_count = 0; + rx_queue->removed_count = 0; + rx_queue->min_fill = -1U; + efx_init_rx_recycle_ring(rx_queue); + + rx_queue->page_remove = 0; + rx_queue->page_add = rx_queue->page_ptr_mask + 1; + rx_queue->page_recycle_count = 0; + rx_queue->page_recycle_failed = 0; + rx_queue->page_recycle_full = 0; + + /* Initialise limit fields */ + max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; + max_trigger = + max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page; + if (rx_refill_threshold != 0) { + trigger = max_fill * min(rx_refill_threshold, 100U) / 100U; + if (trigger > max_trigger) + trigger = max_trigger; + } else { + trigger = max_trigger; + } + + rx_queue->max_fill = max_fill; + rx_queue->fast_fill_trigger = trigger; + rx_queue->refill_enabled = true; + + /* Initialise XDP queue information */ + rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, + rx_queue->core_index); + + if (rc) { + netif_err(efx, rx_err, efx->net_dev, + "Failure to initialise XDP queue information rc=%d\n", + rc); + efx->xdp_rxq_info_failed = true; + } else { + rx_queue->xdp_rxq_info_valid = true; + } + + /* Set up RX descriptor ring */ + efx_nic_init_rx(rx_queue); +} + +void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) +{ + struct efx_rx_buffer *rx_buf; + int i; + + netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, + "shutting down RX queue %d\n", efx_rx_queue_index(rx_queue)); + + del_timer_sync(&rx_queue->slow_fill); + + /* Release RX buffers from the current read ptr to the write ptr */ + if (rx_queue->buffer) { + for (i = rx_queue->removed_count; i < rx_queue->added_count; + i++) { + unsigned int index = i & rx_queue->ptr_mask; + + rx_buf = efx_rx_buffer(rx_queue, index); + efx_fini_rx_buffer(rx_queue, rx_buf); + } + } + + efx_fini_rx_recycle_ring(rx_queue); + + if (rx_queue->xdp_rxq_info_valid) + xdp_rxq_info_unreg(&rx_queue->xdp_rxq_info); + + rx_queue->xdp_rxq_info_valid = false; +} + +void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) +{ + netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, + "destroying RX queue %d\n", efx_rx_queue_index(rx_queue)); + + efx_nic_remove_rx(rx_queue); + + kfree(rx_queue->buffer); + rx_queue->buffer = NULL; +} + +/* Unmap a DMA-mapped page. This function is only called for the final RX + * buffer in a page. + */ +void efx_unmap_rx_buffer(struct efx_nic *efx, + struct efx_rx_buffer *rx_buf) +{ + struct page *page = rx_buf->page; + + if (page) { + struct efx_rx_page_state *state = page_address(page); + + dma_unmap_page(&efx->pci_dev->dev, + state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + } +} + +void efx_free_rx_buffers(struct efx_rx_queue *rx_queue, + struct efx_rx_buffer *rx_buf, + unsigned int num_bufs) +{ + do { + if (rx_buf->page) { + put_page(rx_buf->page); + rx_buf->page = NULL; + } + rx_buf = efx_rx_buf_next(rx_queue, rx_buf); + } while (--num_bufs); +} + +void efx_rx_slow_fill(struct timer_list *t) +{ + struct efx_rx_queue *rx_queue = from_timer(rx_queue, t, slow_fill); + + /* Post an event to cause NAPI to run and refill the queue */ + efx_nic_generate_fill_event(rx_queue); + ++rx_queue->slow_fill_count; +} + +void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) +{ + mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(10)); +} + +/* efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers + * + * @rx_queue: Efx RX queue + * + * This allocates a batch of pages, maps them for DMA, and populates + * struct efx_rx_buffers for each one. Return a negative error code or + * 0 on success. If a single page can be used for multiple buffers, + * then the page will either be inserted fully, or not at all. + */ +static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) +{ + unsigned int page_offset, index, count; + struct efx_nic *efx = rx_queue->efx; + struct efx_rx_page_state *state; + struct efx_rx_buffer *rx_buf; + dma_addr_t dma_addr; + struct page *page; + + count = 0; + do { + page = efx_reuse_page(rx_queue); + if (page == NULL) { + page = alloc_pages(__GFP_COMP | + (atomic ? GFP_ATOMIC : GFP_KERNEL), + efx->rx_buffer_order); + if (unlikely(page == NULL)) + return -ENOMEM; + dma_addr = + dma_map_page(&efx->pci_dev->dev, page, 0, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(&efx->pci_dev->dev, + dma_addr))) { + __free_pages(page, efx->rx_buffer_order); + return -EIO; + } + state = page_address(page); + state->dma_addr = dma_addr; + } else { + state = page_address(page); + dma_addr = state->dma_addr; + } + + dma_addr += sizeof(struct efx_rx_page_state); + page_offset = sizeof(struct efx_rx_page_state); + + do { + index = rx_queue->added_count & rx_queue->ptr_mask; + rx_buf = efx_rx_buffer(rx_queue, index); + rx_buf->dma_addr = dma_addr + efx->rx_ip_align + + XDP_PACKET_HEADROOM; + rx_buf->page = page; + rx_buf->page_offset = page_offset + efx->rx_ip_align + + XDP_PACKET_HEADROOM; + rx_buf->len = efx->rx_dma_len; + rx_buf->flags = 0; + ++rx_queue->added_count; + get_page(page); + dma_addr += efx->rx_page_buf_step; + page_offset += efx->rx_page_buf_step; + } while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE); + + rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE; + } while (++count < efx->rx_pages_per_batch); + + return 0; +} + +void efx_rx_config_page_split(struct efx_nic *efx) +{ + efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + efx->rx_ip_align + + XDP_PACKET_HEADROOM, + EFX_RX_BUF_ALIGNMENT); + efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 : + ((PAGE_SIZE - sizeof(struct efx_rx_page_state)) / + efx->rx_page_buf_step); + efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) / + efx->rx_bufs_per_page; + efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, + efx->rx_bufs_per_page); +} + +/* efx_fast_push_rx_descriptors - push new RX descriptors quickly + * @rx_queue: RX descriptor queue + * + * This will aim to fill the RX descriptor queue up to + * @rx_queue->@max_fill. If there is insufficient atomic + * memory to do so, a slow fill will be scheduled. + * + * The caller must provide serialisation (none is used here). In practise, + * this means this function must run from the NAPI handler, or be called + * when NAPI is disabled. + */ +void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic) +{ + struct efx_nic *efx = rx_queue->efx; + unsigned int fill_level, batch_size; + int space, rc = 0; + + if (!rx_queue->refill_enabled) + return; + + /* Calculate current fill level, and exit if we don't need to fill */ + fill_level = (rx_queue->added_count - rx_queue->removed_count); + EFX_WARN_ON_ONCE_PARANOID(fill_level > rx_queue->efx->rxq_entries); + if (fill_level >= rx_queue->fast_fill_trigger) + goto out; + + /* Record minimum fill level */ + if (unlikely(fill_level < rx_queue->min_fill)) { + if (fill_level) + rx_queue->min_fill = fill_level; + } + + batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page; + space = rx_queue->max_fill - fill_level; + EFX_WARN_ON_ONCE_PARANOID(space < batch_size); + + netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, + "RX queue %d fast-filling descriptor ring from" + " level %d to level %d\n", + efx_rx_queue_index(rx_queue), fill_level, + rx_queue->max_fill); + + do { + rc = efx_init_rx_buffers(rx_queue, atomic); + if (unlikely(rc)) { + /* Ensure that we don't leave the rx queue empty */ + efx_schedule_slow_fill(rx_queue); + goto out; + } + } while ((space -= batch_size) >= batch_size); + + netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, + "RX queue %d fast-filled descriptor ring " + "to level %d\n", efx_rx_queue_index(rx_queue), + rx_queue->added_count - rx_queue->removed_count); + + out: + if (rx_queue->notified_count != rx_queue->added_count) + efx_nic_notify_rx_desc(rx_queue); +} + +/* Pass a received packet up through GRO. GRO can handle pages + * regardless of checksum state and skbs with a good checksum. + */ +void +efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, + unsigned int n_frags, u8 *eh) +{ + struct napi_struct *napi = &channel->napi_str; + struct efx_nic *efx = channel->efx; + struct sk_buff *skb; + + skb = napi_get_frags(napi); + if (unlikely(!skb)) { + struct efx_rx_queue *rx_queue; + + rx_queue = efx_channel_get_rx_queue(channel); + efx_free_rx_buffers(rx_queue, rx_buf, n_frags); + return; + } + + if (efx->net_dev->features & NETIF_F_RXHASH) + skb_set_hash(skb, efx_rx_buf_hash(efx, eh), + PKT_HASH_TYPE_L3); + skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE); + skb->csum_level = !!(rx_buf->flags & EFX_RX_PKT_CSUM_LEVEL); + + for (;;) { + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + rx_buf->page, rx_buf->page_offset, + rx_buf->len); + rx_buf->page = NULL; + skb->len += rx_buf->len; + if (skb_shinfo(skb)->nr_frags == n_frags) + break; + + rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf); + } + + skb->data_len = skb->len; + skb->truesize += n_frags * efx->rx_buffer_truesize; + + skb_record_rx_queue(skb, channel->rx_queue.core_index); + + napi_gro_frags(napi); +} + +/* RSS contexts. We're using linked lists and crappy O(n) algorithms, because + * (a) this is an infrequent control-plane operation and (b) n is small (max 64) + */ +struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx) +{ + struct list_head *head = &efx->rss_context.list; + struct efx_rss_context *ctx, *new; + u32 id = 1; /* Don't use zero, that refers to the master RSS context */ + + WARN_ON(!mutex_is_locked(&efx->rss_lock)); + + /* Search for first gap in the numbering */ + list_for_each_entry(ctx, head, list) { + if (ctx->user_id != id) + break; + id++; + /* Check for wrap. If this happens, we have nearly 2^32 + * allocated RSS contexts, which seems unlikely. + */ + if (WARN_ON_ONCE(!id)) + return NULL; + } + + /* Create the new entry */ + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + new->context_id = EFX_MCDI_RSS_CONTEXT_INVALID; + new->rx_hash_udp_4tuple = false; + + /* Insert the new entry into the gap */ + new->user_id = id; + list_add_tail(&new->list, &ctx->list); + return new; +} + +struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id) +{ + struct list_head *head = &efx->rss_context.list; + struct efx_rss_context *ctx; + + WARN_ON(!mutex_is_locked(&efx->rss_lock)); + + list_for_each_entry(ctx, head, list) + if (ctx->user_id == id) + return ctx; + return NULL; +} + +void efx_free_rss_context_entry(struct efx_rss_context *ctx) +{ + list_del(&ctx->list); + kfree(ctx); +} + +void efx_set_default_rx_indir_table(struct efx_nic *efx, + struct efx_rss_context *ctx) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(ctx->rx_indir_table); i++) + ctx->rx_indir_table[i] = + ethtool_rxfh_indir_default(i, efx->rss_spread); +} + +/** + * efx_filter_is_mc_recipient - test whether spec is a multicast recipient + * @spec: Specification to test + * + * Return: %true if the specification is a non-drop RX filter that + * matches a local MAC address I/G bit value of 1 or matches a local + * IPv4 or IPv6 address value in the respective multicast address + * range. Otherwise %false. + */ +bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec) +{ + if (!(spec->flags & EFX_FILTER_FLAG_RX) || + spec->dmaq_id == EFX_FILTER_RX_DMAQ_ID_DROP) + return false; + + if (spec->match_flags & + (EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG) && + is_multicast_ether_addr(spec->loc_mac)) + return true; + + if ((spec->match_flags & + (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) == + (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) { + if (spec->ether_type == htons(ETH_P_IP) && + ipv4_is_multicast(spec->loc_host[0])) + return true; + if (spec->ether_type == htons(ETH_P_IPV6) && + ((const u8 *)spec->loc_host)[0] == 0xff) + return true; + } + + return false; +} + +bool efx_filter_spec_equal(const struct efx_filter_spec *left, + const struct efx_filter_spec *right) +{ + if ((left->match_flags ^ right->match_flags) | + ((left->flags ^ right->flags) & + (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX))) + return false; + + return memcmp(&left->outer_vid, &right->outer_vid, + sizeof(struct efx_filter_spec) - + offsetof(struct efx_filter_spec, outer_vid)) == 0; +} + +u32 efx_filter_spec_hash(const struct efx_filter_spec *spec) +{ + BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3); + return jhash2((const u32 *)&spec->outer_vid, + (sizeof(struct efx_filter_spec) - + offsetof(struct efx_filter_spec, outer_vid)) / 4, + 0); +} + +#ifdef CONFIG_RFS_ACCEL +bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, + bool *force) +{ + if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) { + /* ARFS is currently updating this entry, leave it */ + return false; + } + if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) { + /* ARFS tried and failed to update this, so it's probably out + * of date. Remove the filter and the ARFS rule entry. + */ + rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING; + *force = true; + return true; + } else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */ + /* ARFS has moved on, so old filter is not needed. Since we did + * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will + * not be removed by efx_rps_hash_del() subsequently. + */ + *force = true; + return true; + } + /* Remove it iff ARFS wants to. */ + return true; +} + +static +struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx, + const struct efx_filter_spec *spec) +{ + u32 hash = efx_filter_spec_hash(spec); + + lockdep_assert_held(&efx->rps_hash_lock); + if (!efx->rps_hash_table) + return NULL; + return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE]; +} + +struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, + const struct efx_filter_spec *spec) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (!head) + return NULL; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) + return rule; + } + return NULL; +} + +struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, + const struct efx_filter_spec *spec, + bool *new) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (!head) + return NULL; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) { + *new = false; + return rule; + } + } + rule = kmalloc(sizeof(*rule), GFP_ATOMIC); + *new = true; + if (rule) { + memcpy(&rule->spec, spec, sizeof(rule->spec)); + hlist_add_head(&rule->node, head); + } + return rule; +} + +void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec) +{ + struct efx_arfs_rule *rule; + struct hlist_head *head; + struct hlist_node *node; + + head = efx_rps_hash_bucket(efx, spec); + if (WARN_ON(!head)) + return; + hlist_for_each(node, head) { + rule = container_of(node, struct efx_arfs_rule, node); + if (efx_filter_spec_equal(spec, &rule->spec)) { + /* Someone already reused the entry. We know that if + * this check doesn't fire (i.e. filter_id == REMOVING) + * then the REMOVING mark was put there by our caller, + * because caller is holding a lock on filter table and + * only holders of that lock set REMOVING. + */ + if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING) + return; + hlist_del(node); + kfree(rule); + return; + } + } + /* We didn't find it. */ + WARN_ON(1); +} +#endif + +int efx_probe_filters(struct efx_nic *efx) +{ + int rc; + + init_rwsem(&efx->filter_sem); + mutex_lock(&efx->mac_lock); + down_write(&efx->filter_sem); + rc = efx->type->filter_table_probe(efx); + if (rc) + goto out_unlock; + +#ifdef CONFIG_RFS_ACCEL + if (efx->type->offload_features & NETIF_F_NTUPLE) { + struct efx_channel *channel; + int i, success = 1; + + efx_for_each_channel(channel, efx) { + channel->rps_flow_id = + kcalloc(efx->type->max_rx_ip_filters, + sizeof(*channel->rps_flow_id), + GFP_KERNEL); + if (!channel->rps_flow_id) + success = 0; + else + for (i = 0; + i < efx->type->max_rx_ip_filters; + ++i) + channel->rps_flow_id[i] = + RPS_FLOW_ID_INVALID; + channel->rfs_expire_index = 0; + channel->rfs_filter_count = 0; + } + + if (!success) { + efx_for_each_channel(channel, efx) + kfree(channel->rps_flow_id); + efx->type->filter_table_remove(efx); + rc = -ENOMEM; + goto out_unlock; + } + } +#endif +out_unlock: + up_write(&efx->filter_sem); + mutex_unlock(&efx->mac_lock); + return rc; +} + +void efx_remove_filters(struct efx_nic *efx) +{ +#ifdef CONFIG_RFS_ACCEL + struct efx_channel *channel; + + efx_for_each_channel(channel, efx) { + cancel_delayed_work_sync(&channel->filter_work); + kfree(channel->rps_flow_id); + } +#endif + down_write(&efx->filter_sem); + efx->type->filter_table_remove(efx); + up_write(&efx->filter_sem); +} diff --git a/drivers/net/ethernet/sfc/rx_common.h b/drivers/net/ethernet/sfc/rx_common.h new file mode 100644 index 000000000000..c41f12a89477 --- /dev/null +++ b/drivers/net/ethernet/sfc/rx_common.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_RX_COMMON_H +#define EFX_RX_COMMON_H + +/* Preferred number of descriptors to fill at once */ +#define EFX_RX_PREFERRED_BATCH 8U + +/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */ +#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \ + EFX_RX_USR_BUF_SIZE) + +static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf) +{ + return page_address(buf->page) + buf->page_offset; +} + +static inline u32 efx_rx_buf_hash(struct efx_nic *efx, const u8 *eh) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + return __le32_to_cpup((const __le32 *)(eh + efx->rx_packet_hash_offset)); +#else + const u8 *data = eh + efx->rx_packet_hash_offset; + + return (u32)data[0] | + (u32)data[1] << 8 | + (u32)data[2] << 16 | + (u32)data[3] << 24; +#endif +} + +void efx_rx_slow_fill(struct timer_list *t); + +void efx_recycle_rx_pages(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf, + unsigned int n_frags); +void efx_discard_rx_packet(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf, + unsigned int n_frags); + +int efx_probe_rx_queue(struct efx_rx_queue *rx_queue); +void efx_init_rx_queue(struct efx_rx_queue *rx_queue); +void efx_fini_rx_queue(struct efx_rx_queue *rx_queue); +void efx_remove_rx_queue(struct efx_rx_queue *rx_queue); +void efx_destroy_rx_queue(struct efx_rx_queue *rx_queue); + +void efx_init_rx_buffer(struct efx_rx_queue *rx_queue, + struct page *page, + unsigned int page_offset, + u16 flags); +void efx_unmap_rx_buffer(struct efx_nic *efx, struct efx_rx_buffer *rx_buf); +void efx_free_rx_buffers(struct efx_rx_queue *rx_queue, + struct efx_rx_buffer *rx_buf, + unsigned int num_bufs); + +void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue); +void efx_rx_config_page_split(struct efx_nic *efx); +void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic); + +void +efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, + unsigned int n_frags, u8 *eh); + +struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx); +struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id); +void efx_free_rss_context_entry(struct efx_rss_context *ctx); +void efx_set_default_rx_indir_table(struct efx_nic *efx, + struct efx_rss_context *ctx); + +bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec); +bool efx_filter_spec_equal(const struct efx_filter_spec *left, + const struct efx_filter_spec *right); +u32 efx_filter_spec_hash(const struct efx_filter_spec *spec); + +#ifdef CONFIG_RFS_ACCEL +bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx, + bool *force); +struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx, + const struct efx_filter_spec *spec); +struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx, + const struct efx_filter_spec *spec, + bool *new); +void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec); +#endif + +int efx_probe_filters(struct efx_nic *efx); +void efx_remove_filters(struct efx_nic *efx); + +#endif diff --git a/drivers/net/ethernet/sfc/selftest.c b/drivers/net/ethernet/sfc/selftest.c index 8474cf8ea7d3..1ae369022d7d 100644 --- a/drivers/net/ethernet/sfc/selftest.c +++ b/drivers/net/ethernet/sfc/selftest.c @@ -18,6 +18,8 @@ #include <linux/slab.h> #include "net_driver.h" #include "efx.h" +#include "efx_common.h" +#include "efx_channels.h" #include "nic.h" #include "selftest.h" #include "workarounds.h" @@ -783,7 +785,7 @@ void efx_selftest_async_cancel(struct efx_nic *efx) cancel_delayed_work_sync(&efx->selftest_work); } -void efx_selftest_async_work(struct work_struct *data) +static void efx_selftest_async_work(struct work_struct *data) { struct efx_nic *efx = container_of(data, struct efx_nic, selftest_work.work); @@ -802,3 +804,8 @@ void efx_selftest_async_work(struct work_struct *data) channel->channel, cpu); } } + +void efx_selftest_async_init(struct efx_nic *efx) +{ + INIT_DELAYED_WORK(&efx->selftest_work, efx_selftest_async_work); +} diff --git a/drivers/net/ethernet/sfc/selftest.h b/drivers/net/ethernet/sfc/selftest.h index a3553816d92c..ca88ebb4f6b1 100644 --- a/drivers/net/ethernet/sfc/selftest.h +++ b/drivers/net/ethernet/sfc/selftest.h @@ -45,8 +45,8 @@ void efx_loopback_rx_packet(struct efx_nic *efx, const char *buf_ptr, int pkt_len); int efx_selftest(struct efx_nic *efx, struct efx_self_tests *tests, unsigned flags); +void efx_selftest_async_init(struct efx_nic *efx); void efx_selftest_async_start(struct efx_nic *efx); void efx_selftest_async_cancel(struct efx_nic *efx); -void efx_selftest_async_work(struct work_struct *data); #endif /* EFX_SELFTEST_H */ diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c index 81499244a4b4..baa464161626 100644 --- a/drivers/net/ethernet/sfc/siena.c +++ b/drivers/net/ethernet/sfc/siena.c @@ -14,12 +14,14 @@ #include "net_driver.h" #include "bitfield.h" #include "efx.h" +#include "efx_common.h" #include "nic.h" #include "farch_regs.h" #include "io.h" #include "workarounds.h" #include "mcdi.h" #include "mcdi_pcol.h" +#include "mcdi_port_common.h" #include "selftest.h" #include "siena_sriov.h" diff --git a/drivers/net/ethernet/sfc/siena_sriov.c b/drivers/net/ethernet/sfc/siena_sriov.c index dfbdf05dcf79..83dcfcae3d4b 100644 --- a/drivers/net/ethernet/sfc/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena_sriov.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include "net_driver.h" #include "efx.h" +#include "efx_channels.h" #include "nic.h" #include "io.h" #include "mcdi.h" diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 00c1c4402451..04d7f41d7ed9 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -20,6 +20,7 @@ #include "io.h" #include "nic.h" #include "tx.h" +#include "tx_common.h" #include "workarounds.h" #include "ef10_regs.h" @@ -56,72 +57,6 @@ u8 *efx_tx_get_copy_buffer_limited(struct efx_tx_queue *tx_queue, return efx_tx_get_copy_buffer(tx_queue, buffer); } -static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, - struct efx_tx_buffer *buffer, - unsigned int *pkts_compl, - unsigned int *bytes_compl) -{ - if (buffer->unmap_len) { - struct device *dma_dev = &tx_queue->efx->pci_dev->dev; - dma_addr_t unmap_addr = buffer->dma_addr - buffer->dma_offset; - if (buffer->flags & EFX_TX_BUF_MAP_SINGLE) - dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len, - DMA_TO_DEVICE); - else - dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len, - DMA_TO_DEVICE); - buffer->unmap_len = 0; - } - - if (buffer->flags & EFX_TX_BUF_SKB) { - struct sk_buff *skb = (struct sk_buff *)buffer->skb; - - EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl); - (*pkts_compl)++; - (*bytes_compl) += skb->len; - if (tx_queue->timestamping && - (tx_queue->completed_timestamp_major || - tx_queue->completed_timestamp_minor)) { - struct skb_shared_hwtstamps hwtstamp; - - hwtstamp.hwtstamp = - efx_ptp_nic_to_kernel_time(tx_queue); - skb_tstamp_tx(skb, &hwtstamp); - - tx_queue->completed_timestamp_major = 0; - tx_queue->completed_timestamp_minor = 0; - } - dev_consume_skb_any((struct sk_buff *)buffer->skb); - netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, - "TX queue %d transmission id %x complete\n", - tx_queue->queue, tx_queue->read_count); - } else if (buffer->flags & EFX_TX_BUF_XDP) { - xdp_return_frame_rx_napi(buffer->xdpf); - } - - buffer->len = 0; - buffer->flags = 0; -} - -unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) -{ - /* Header and payload descriptor for each output segment, plus - * one for every input fragment boundary within a segment - */ - unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS; - - /* Possibly one more per segment for option descriptors */ - if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) - max_descs += EFX_TSO_MAX_SEGS; - - /* Possibly more for PCIe page boundaries within input fragments */ - if (PAGE_SIZE > EFX_PAGE_SIZE) - max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); - - return max_descs; -} - static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) { /* We need to consider both queues that the net core sees as one */ @@ -333,125 +268,6 @@ static int efx_enqueue_skb_pio(struct efx_tx_queue *tx_queue, } #endif /* EFX_USE_PIO */ -static struct efx_tx_buffer *efx_tx_map_chunk(struct efx_tx_queue *tx_queue, - dma_addr_t dma_addr, - size_t len) -{ - const struct efx_nic_type *nic_type = tx_queue->efx->type; - struct efx_tx_buffer *buffer; - unsigned int dma_len; - - /* Map the fragment taking account of NIC-dependent DMA limits. */ - do { - buffer = efx_tx_queue_get_insert_buffer(tx_queue); - dma_len = nic_type->tx_limit_len(tx_queue, dma_addr, len); - - buffer->len = dma_len; - buffer->dma_addr = dma_addr; - buffer->flags = EFX_TX_BUF_CONT; - len -= dma_len; - dma_addr += dma_len; - ++tx_queue->insert_count; - } while (len); - - return buffer; -} - -/* Map all data from an SKB for DMA and create descriptors on the queue. - */ -static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb, - unsigned int segment_count) -{ - struct efx_nic *efx = tx_queue->efx; - struct device *dma_dev = &efx->pci_dev->dev; - unsigned int frag_index, nr_frags; - dma_addr_t dma_addr, unmap_addr; - unsigned short dma_flags; - size_t len, unmap_len; - - nr_frags = skb_shinfo(skb)->nr_frags; - frag_index = 0; - - /* Map header data. */ - len = skb_headlen(skb); - dma_addr = dma_map_single(dma_dev, skb->data, len, DMA_TO_DEVICE); - dma_flags = EFX_TX_BUF_MAP_SINGLE; - unmap_len = len; - unmap_addr = dma_addr; - - if (unlikely(dma_mapping_error(dma_dev, dma_addr))) - return -EIO; - - if (segment_count) { - /* For TSO we need to put the header in to a separate - * descriptor. Map this separately if necessary. - */ - size_t header_len = skb_transport_header(skb) - skb->data + - (tcp_hdr(skb)->doff << 2u); - - if (header_len != len) { - tx_queue->tso_long_headers++; - efx_tx_map_chunk(tx_queue, dma_addr, header_len); - len -= header_len; - dma_addr += header_len; - } - } - - /* Add descriptors for each fragment. */ - do { - struct efx_tx_buffer *buffer; - skb_frag_t *fragment; - - buffer = efx_tx_map_chunk(tx_queue, dma_addr, len); - - /* The final descriptor for a fragment is responsible for - * unmapping the whole fragment. - */ - buffer->flags = EFX_TX_BUF_CONT | dma_flags; - buffer->unmap_len = unmap_len; - buffer->dma_offset = buffer->dma_addr - unmap_addr; - - if (frag_index >= nr_frags) { - /* Store SKB details with the final buffer for - * the completion. - */ - buffer->skb = skb; - buffer->flags = EFX_TX_BUF_SKB | dma_flags; - return 0; - } - - /* Move on to the next fragment. */ - fragment = &skb_shinfo(skb)->frags[frag_index++]; - len = skb_frag_size(fragment); - dma_addr = skb_frag_dma_map(dma_dev, fragment, - 0, len, DMA_TO_DEVICE); - dma_flags = 0; - unmap_len = len; - unmap_addr = dma_addr; - - if (unlikely(dma_mapping_error(dma_dev, dma_addr))) - return -EIO; - } while (1); -} - -/* Remove buffers put into a tx_queue for the current packet. - * None of the buffers must have an skb attached. - */ -static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, - unsigned int insert_count) -{ - struct efx_tx_buffer *buffer; - unsigned int bytes_compl = 0; - unsigned int pkts_compl = 0; - - /* Work backwards until we hit the original insert pointer value */ - while (tx_queue->insert_count != insert_count) { - --tx_queue->insert_count; - buffer = __efx_tx_queue_get_insert_buffer(tx_queue); - efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); - } -} - /* * Fallback to software TSO. * @@ -473,12 +289,9 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue, dev_consume_skb_any(skb); skb = segments; - while (skb) { - next = skb->next; - skb->next = NULL; - + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); efx_enqueue_skb(tx_queue, skb); - skb = next; } return 0; @@ -687,41 +500,6 @@ int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, return i; } -/* Remove packets from the TX queue - * - * This removes packets from the TX queue, up to and including the - * specified index. - */ -static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue, - unsigned int index, - unsigned int *pkts_compl, - unsigned int *bytes_compl) -{ - struct efx_nic *efx = tx_queue->efx; - unsigned int stop_index, read_ptr; - - stop_index = (index + 1) & tx_queue->ptr_mask; - read_ptr = tx_queue->read_count & tx_queue->ptr_mask; - - while (read_ptr != stop_index) { - struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr]; - - if (!(buffer->flags & EFX_TX_BUF_OPTION) && - unlikely(buffer->len == 0)) { - netif_err(efx, tx_err, efx->net_dev, - "TX queue %d spurious TX completion id %x\n", - tx_queue->queue, read_ptr); - efx_schedule_reset(efx, RESET_TYPE_TX_SKIP); - return; - } - - efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl); - - ++tx_queue->read_count; - read_ptr = tx_queue->read_count & tx_queue->ptr_mask; - } -} - /* Initiate a packet transmission. We use one channel per CPU * (sharing when we have more CPUs than channels). On Falcon, the TX * completion events will be directed back to the CPU that transmitted @@ -834,173 +612,3 @@ int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, net_dev->num_tc = num_tc; return 0; } - -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) -{ - unsigned fill_level; - struct efx_nic *efx = tx_queue->efx; - struct efx_tx_queue *txq2; - unsigned int pkts_compl = 0, bytes_compl = 0; - - EFX_WARN_ON_ONCE_PARANOID(index > tx_queue->ptr_mask); - - efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl); - tx_queue->pkts_compl += pkts_compl; - tx_queue->bytes_compl += bytes_compl; - - if (pkts_compl > 1) - ++tx_queue->merge_events; - - /* See if we need to restart the netif queue. This memory - * barrier ensures that we write read_count (inside - * efx_dequeue_buffers()) before reading the queue status. - */ - smp_mb(); - if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) && - likely(efx->port_enabled) && - likely(netif_device_present(efx->net_dev))) { - txq2 = efx_tx_queue_partner(tx_queue); - fill_level = max(tx_queue->insert_count - tx_queue->read_count, - txq2->insert_count - txq2->read_count); - if (fill_level <= efx->txq_wake_thresh) - netif_tx_wake_queue(tx_queue->core_txq); - } - - /* Check whether the hardware queue is now empty */ - if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { - tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); - if (tx_queue->read_count == tx_queue->old_write_count) { - smp_mb(); - tx_queue->empty_read_count = - tx_queue->read_count | EFX_EMPTY_COUNT_VALID; - } - } -} - -static unsigned int efx_tx_cb_page_count(struct efx_tx_queue *tx_queue) -{ - return DIV_ROUND_UP(tx_queue->ptr_mask + 1, PAGE_SIZE >> EFX_TX_CB_ORDER); -} - -int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) -{ - struct efx_nic *efx = tx_queue->efx; - unsigned int entries; - int rc; - - /* Create the smallest power-of-two aligned ring */ - entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE); - EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); - tx_queue->ptr_mask = entries - 1; - - netif_dbg(efx, probe, efx->net_dev, - "creating TX queue %d size %#x mask %#x\n", - tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask); - - /* Allocate software ring */ - tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer), - GFP_KERNEL); - if (!tx_queue->buffer) - return -ENOMEM; - - tx_queue->cb_page = kcalloc(efx_tx_cb_page_count(tx_queue), - sizeof(tx_queue->cb_page[0]), GFP_KERNEL); - if (!tx_queue->cb_page) { - rc = -ENOMEM; - goto fail1; - } - - /* Allocate hardware ring */ - rc = efx_nic_probe_tx(tx_queue); - if (rc) - goto fail2; - - return 0; - -fail2: - kfree(tx_queue->cb_page); - tx_queue->cb_page = NULL; -fail1: - kfree(tx_queue->buffer); - tx_queue->buffer = NULL; - return rc; -} - -void efx_init_tx_queue(struct efx_tx_queue *tx_queue) -{ - struct efx_nic *efx = tx_queue->efx; - - netif_dbg(efx, drv, efx->net_dev, - "initialising TX queue %d\n", tx_queue->queue); - - tx_queue->insert_count = 0; - tx_queue->write_count = 0; - tx_queue->packet_write_count = 0; - tx_queue->old_write_count = 0; - tx_queue->read_count = 0; - tx_queue->old_read_count = 0; - tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID; - tx_queue->xmit_more_available = false; - tx_queue->timestamping = (efx_ptp_use_mac_tx_timestamps(efx) && - tx_queue->channel == efx_ptp_channel(efx)); - tx_queue->completed_desc_ptr = tx_queue->ptr_mask; - tx_queue->completed_timestamp_major = 0; - tx_queue->completed_timestamp_minor = 0; - - tx_queue->xdp_tx = efx_channel_is_xdp_tx(tx_queue->channel); - - /* Set up default function pointers. These may get replaced by - * efx_nic_init_tx() based off NIC/queue capabilities. - */ - tx_queue->handle_tso = efx_enqueue_skb_tso; - - /* Set up TX descriptor ring */ - efx_nic_init_tx(tx_queue); - - tx_queue->initialised = true; -} - -void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) -{ - struct efx_tx_buffer *buffer; - - netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, - "shutting down TX queue %d\n", tx_queue->queue); - - if (!tx_queue->buffer) - return; - - /* Free any buffers left in the ring */ - while (tx_queue->read_count != tx_queue->write_count) { - unsigned int pkts_compl = 0, bytes_compl = 0; - buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask]; - efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); - - ++tx_queue->read_count; - } - tx_queue->xmit_more_available = false; - netdev_tx_reset_queue(tx_queue->core_txq); -} - -void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) -{ - int i; - - if (!tx_queue->buffer) - return; - - netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, - "destroying TX queue %d\n", tx_queue->queue); - efx_nic_remove_tx(tx_queue); - - if (tx_queue->cb_page) { - for (i = 0; i < efx_tx_cb_page_count(tx_queue); i++) - efx_nic_free_buffer(tx_queue->efx, - &tx_queue->cb_page[i]); - kfree(tx_queue->cb_page); - tx_queue->cb_page = NULL; - } - - kfree(tx_queue->buffer); - tx_queue->buffer = NULL; -} diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c new file mode 100644 index 000000000000..b1571e9789d0 --- /dev/null +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#include "net_driver.h" +#include "efx.h" +#include "nic.h" +#include "tx_common.h" + +static unsigned int efx_tx_cb_page_count(struct efx_tx_queue *tx_queue) +{ + return DIV_ROUND_UP(tx_queue->ptr_mask + 1, + PAGE_SIZE >> EFX_TX_CB_ORDER); +} + +int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) +{ + struct efx_nic *efx = tx_queue->efx; + unsigned int entries; + int rc; + + /* Create the smallest power-of-two aligned ring */ + entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE); + EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); + tx_queue->ptr_mask = entries - 1; + + netif_dbg(efx, probe, efx->net_dev, + "creating TX queue %d size %#x mask %#x\n", + tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask); + + /* Allocate software ring */ + tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer), + GFP_KERNEL); + if (!tx_queue->buffer) + return -ENOMEM; + + tx_queue->cb_page = kcalloc(efx_tx_cb_page_count(tx_queue), + sizeof(tx_queue->cb_page[0]), GFP_KERNEL); + if (!tx_queue->cb_page) { + rc = -ENOMEM; + goto fail1; + } + + /* Allocate hardware ring */ + rc = efx_nic_probe_tx(tx_queue); + if (rc) + goto fail2; + + return 0; + +fail2: + kfree(tx_queue->cb_page); + tx_queue->cb_page = NULL; +fail1: + kfree(tx_queue->buffer); + tx_queue->buffer = NULL; + return rc; +} + +void efx_init_tx_queue(struct efx_tx_queue *tx_queue) +{ + struct efx_nic *efx = tx_queue->efx; + + netif_dbg(efx, drv, efx->net_dev, + "initialising TX queue %d\n", tx_queue->queue); + + tx_queue->insert_count = 0; + tx_queue->write_count = 0; + tx_queue->packet_write_count = 0; + tx_queue->old_write_count = 0; + tx_queue->read_count = 0; + tx_queue->old_read_count = 0; + tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID; + tx_queue->xmit_more_available = false; + tx_queue->timestamping = (efx_ptp_use_mac_tx_timestamps(efx) && + tx_queue->channel == efx_ptp_channel(efx)); + tx_queue->completed_desc_ptr = tx_queue->ptr_mask; + tx_queue->completed_timestamp_major = 0; + tx_queue->completed_timestamp_minor = 0; + + tx_queue->xdp_tx = efx_channel_is_xdp_tx(tx_queue->channel); + + /* Set up default function pointers. These may get replaced by + * efx_nic_init_tx() based off NIC/queue capabilities. + */ + tx_queue->handle_tso = efx_enqueue_skb_tso; + + /* Set up TX descriptor ring */ + efx_nic_init_tx(tx_queue); + + tx_queue->initialised = true; +} + +void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) +{ + struct efx_tx_buffer *buffer; + + netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, + "shutting down TX queue %d\n", tx_queue->queue); + + if (!tx_queue->buffer) + return; + + /* Free any buffers left in the ring */ + while (tx_queue->read_count != tx_queue->write_count) { + unsigned int pkts_compl = 0, bytes_compl = 0; + + buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask]; + efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); + + ++tx_queue->read_count; + } + tx_queue->xmit_more_available = false; + netdev_tx_reset_queue(tx_queue->core_txq); +} + +void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) +{ + int i; + + if (!tx_queue->buffer) + return; + + netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, + "destroying TX queue %d\n", tx_queue->queue); + efx_nic_remove_tx(tx_queue); + + if (tx_queue->cb_page) { + for (i = 0; i < efx_tx_cb_page_count(tx_queue); i++) + efx_nic_free_buffer(tx_queue->efx, + &tx_queue->cb_page[i]); + kfree(tx_queue->cb_page); + tx_queue->cb_page = NULL; + } + + kfree(tx_queue->buffer); + tx_queue->buffer = NULL; +} + +void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, + struct efx_tx_buffer *buffer, + unsigned int *pkts_compl, + unsigned int *bytes_compl) +{ + if (buffer->unmap_len) { + struct device *dma_dev = &tx_queue->efx->pci_dev->dev; + dma_addr_t unmap_addr = buffer->dma_addr - buffer->dma_offset; + + if (buffer->flags & EFX_TX_BUF_MAP_SINGLE) + dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len, + DMA_TO_DEVICE); + else + dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len, + DMA_TO_DEVICE); + buffer->unmap_len = 0; + } + + if (buffer->flags & EFX_TX_BUF_SKB) { + struct sk_buff *skb = (struct sk_buff *)buffer->skb; + + EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl); + (*pkts_compl)++; + (*bytes_compl) += skb->len; + if (tx_queue->timestamping && + (tx_queue->completed_timestamp_major || + tx_queue->completed_timestamp_minor)) { + struct skb_shared_hwtstamps hwtstamp; + + hwtstamp.hwtstamp = + efx_ptp_nic_to_kernel_time(tx_queue); + skb_tstamp_tx(skb, &hwtstamp); + + tx_queue->completed_timestamp_major = 0; + tx_queue->completed_timestamp_minor = 0; + } + dev_consume_skb_any((struct sk_buff *)buffer->skb); + netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, + "TX queue %d transmission id %x complete\n", + tx_queue->queue, tx_queue->read_count); + } else if (buffer->flags & EFX_TX_BUF_XDP) { + xdp_return_frame_rx_napi(buffer->xdpf); + } + + buffer->len = 0; + buffer->flags = 0; +} + +/* Remove packets from the TX queue + * + * This removes packets from the TX queue, up to and including the + * specified index. + */ +static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue, + unsigned int index, + unsigned int *pkts_compl, + unsigned int *bytes_compl) +{ + struct efx_nic *efx = tx_queue->efx; + unsigned int stop_index, read_ptr; + + stop_index = (index + 1) & tx_queue->ptr_mask; + read_ptr = tx_queue->read_count & tx_queue->ptr_mask; + + while (read_ptr != stop_index) { + struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr]; + + if (!(buffer->flags & EFX_TX_BUF_OPTION) && + unlikely(buffer->len == 0)) { + netif_err(efx, tx_err, efx->net_dev, + "TX queue %d spurious TX completion id %x\n", + tx_queue->queue, read_ptr); + efx_schedule_reset(efx, RESET_TYPE_TX_SKIP); + return; + } + + efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl); + + ++tx_queue->read_count; + read_ptr = tx_queue->read_count & tx_queue->ptr_mask; + } +} + +void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) +{ + unsigned int fill_level, pkts_compl = 0, bytes_compl = 0; + struct efx_nic *efx = tx_queue->efx; + struct efx_tx_queue *txq2; + + EFX_WARN_ON_ONCE_PARANOID(index > tx_queue->ptr_mask); + + efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl); + tx_queue->pkts_compl += pkts_compl; + tx_queue->bytes_compl += bytes_compl; + + if (pkts_compl > 1) + ++tx_queue->merge_events; + + /* See if we need to restart the netif queue. This memory + * barrier ensures that we write read_count (inside + * efx_dequeue_buffers()) before reading the queue status. + */ + smp_mb(); + if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) && + likely(efx->port_enabled) && + likely(netif_device_present(efx->net_dev))) { + txq2 = efx_tx_queue_partner(tx_queue); + fill_level = max(tx_queue->insert_count - tx_queue->read_count, + txq2->insert_count - txq2->read_count); + if (fill_level <= efx->txq_wake_thresh) + netif_tx_wake_queue(tx_queue->core_txq); + } + + /* Check whether the hardware queue is now empty */ + if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { + tx_queue->old_write_count = READ_ONCE(tx_queue->write_count); + if (tx_queue->read_count == tx_queue->old_write_count) { + smp_mb(); + tx_queue->empty_read_count = + tx_queue->read_count | EFX_EMPTY_COUNT_VALID; + } + } +} + +/* Remove buffers put into a tx_queue for the current packet. + * None of the buffers must have an skb attached. + */ +void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, + unsigned int insert_count) +{ + struct efx_tx_buffer *buffer; + unsigned int bytes_compl = 0; + unsigned int pkts_compl = 0; + + /* Work backwards until we hit the original insert pointer value */ + while (tx_queue->insert_count != insert_count) { + --tx_queue->insert_count; + buffer = __efx_tx_queue_get_insert_buffer(tx_queue); + efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); + } +} + +struct efx_tx_buffer *efx_tx_map_chunk(struct efx_tx_queue *tx_queue, + dma_addr_t dma_addr, size_t len) +{ + const struct efx_nic_type *nic_type = tx_queue->efx->type; + struct efx_tx_buffer *buffer; + unsigned int dma_len; + + /* Map the fragment taking account of NIC-dependent DMA limits. */ + do { + buffer = efx_tx_queue_get_insert_buffer(tx_queue); + dma_len = nic_type->tx_limit_len(tx_queue, dma_addr, len); + + buffer->len = dma_len; + buffer->dma_addr = dma_addr; + buffer->flags = EFX_TX_BUF_CONT; + len -= dma_len; + dma_addr += dma_len; + ++tx_queue->insert_count; + } while (len); + + return buffer; +} + +/* Map all data from an SKB for DMA and create descriptors on the queue. */ +int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb, + unsigned int segment_count) +{ + struct efx_nic *efx = tx_queue->efx; + struct device *dma_dev = &efx->pci_dev->dev; + unsigned int frag_index, nr_frags; + dma_addr_t dma_addr, unmap_addr; + unsigned short dma_flags; + size_t len, unmap_len; + + nr_frags = skb_shinfo(skb)->nr_frags; + frag_index = 0; + + /* Map header data. */ + len = skb_headlen(skb); + dma_addr = dma_map_single(dma_dev, skb->data, len, DMA_TO_DEVICE); + dma_flags = EFX_TX_BUF_MAP_SINGLE; + unmap_len = len; + unmap_addr = dma_addr; + + if (unlikely(dma_mapping_error(dma_dev, dma_addr))) + return -EIO; + + if (segment_count) { + /* For TSO we need to put the header in to a separate + * descriptor. Map this separately if necessary. + */ + size_t header_len = skb_transport_header(skb) - skb->data + + (tcp_hdr(skb)->doff << 2u); + + if (header_len != len) { + tx_queue->tso_long_headers++; + efx_tx_map_chunk(tx_queue, dma_addr, header_len); + len -= header_len; + dma_addr += header_len; + } + } + + /* Add descriptors for each fragment. */ + do { + struct efx_tx_buffer *buffer; + skb_frag_t *fragment; + + buffer = efx_tx_map_chunk(tx_queue, dma_addr, len); + + /* The final descriptor for a fragment is responsible for + * unmapping the whole fragment. + */ + buffer->flags = EFX_TX_BUF_CONT | dma_flags; + buffer->unmap_len = unmap_len; + buffer->dma_offset = buffer->dma_addr - unmap_addr; + + if (frag_index >= nr_frags) { + /* Store SKB details with the final buffer for + * the completion. + */ + buffer->skb = skb; + buffer->flags = EFX_TX_BUF_SKB | dma_flags; + return 0; + } + + /* Move on to the next fragment. */ + fragment = &skb_shinfo(skb)->frags[frag_index++]; + len = skb_frag_size(fragment); + dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len, + DMA_TO_DEVICE); + dma_flags = 0; + unmap_len = len; + unmap_addr = dma_addr; + + if (unlikely(dma_mapping_error(dma_dev, dma_addr))) + return -EIO; + } while (1); +} + +unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) +{ + /* Header and payload descriptor for each output segment, plus + * one for every input fragment boundary within a segment + */ + unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS; + + /* Possibly one more per segment for option descriptors */ + if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) + max_descs += EFX_TSO_MAX_SEGS; + + /* Possibly more for PCIe page boundaries within input fragments */ + if (PAGE_SIZE > EFX_PAGE_SIZE) + max_descs += max_t(unsigned int, MAX_SKB_FRAGS, + DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); + + return max_descs; +} diff --git a/drivers/net/ethernet/sfc/tx_common.h b/drivers/net/ethernet/sfc/tx_common.h new file mode 100644 index 000000000000..f92f1fe3a87f --- /dev/null +++ b/drivers/net/ethernet/sfc/tx_common.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for Solarflare network controllers and boards + * Copyright 2018 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_TX_COMMON_H +#define EFX_TX_COMMON_H + +int efx_probe_tx_queue(struct efx_tx_queue *tx_queue); +void efx_init_tx_queue(struct efx_tx_queue *tx_queue); +void efx_fini_tx_queue(struct efx_tx_queue *tx_queue); +void efx_remove_tx_queue(struct efx_tx_queue *tx_queue); + +void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, + struct efx_tx_buffer *buffer, + unsigned int *pkts_compl, + unsigned int *bytes_compl); + +void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); + +void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, + unsigned int insert_count); + +struct efx_tx_buffer *efx_tx_map_chunk(struct efx_tx_queue *tx_queue, + dma_addr_t dma_addr, size_t len); +int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb, + unsigned int segment_count); + +unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); + +#endif diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index 912760e8514c..61ddee0c2a2e 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -280,6 +280,7 @@ struct epic_private { signed char phys[4]; /* MII device addresses. */ u16 advertising; /* NWay media advertisement */ int mii_phy_cnt; + u32 ethtool_ops_nesting; struct mii_if_info mii; unsigned int tx_full:1; /* The Tx queue is full. */ unsigned int default_port:4; /* Last dev->if_port value. */ @@ -1435,8 +1436,10 @@ static int ethtool_begin(struct net_device *dev) struct epic_private *ep = netdev_priv(dev); void __iomem *ioaddr = ep->ioaddr; + if (ep->ethtool_ops_nesting == U32_MAX) + return -EBUSY; /* power-up, if interface is down */ - if (!netif_running(dev)) { + if (!ep->ethtool_ops_nesting++ && !netif_running(dev)) { ew32(GENCTL, 0x0200); ew32(NVCTL, (er32(NVCTL) & ~0x003c) | 0x4800); } @@ -1449,7 +1452,7 @@ static void ethtool_complete(struct net_device *dev) void __iomem *ioaddr = ep->ioaddr; /* power-down, if interface is down */ - if (!netif_running(dev)) { + if (!--ep->ethtool_ops_nesting && !netif_running(dev)) { ew32(GENCTL, 0x0008); ew32(NVCTL, (er32(NVCTL) & ~0x483c) | 0x0000); } diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index 38068fc34141..7dc2154ccdad 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -1943,15 +1943,6 @@ static int smsc911x_set_mac_address(struct net_device *dev, void *p) return 0; } -/* Standard ioctls for mii-tool */ -static int smsc911x_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - if (!netif_running(dev) || !dev->phydev) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, ifr, cmd); -} - static void smsc911x_ethtool_getdrvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { @@ -2151,7 +2142,7 @@ static const struct net_device_ops smsc911x_netdev_ops = { .ndo_start_xmit = smsc911x_hard_start_xmit, .ndo_get_stats = smsc911x_get_stats, .ndo_set_rx_mode = smsc911x_set_multicast_list, - .ndo_do_ioctl = smsc911x_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = smsc911x_set_mac_address, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/smsc/smsc9420.c b/drivers/net/ethernet/smsc/smsc9420.c index a6962a41c3d2..7312e522c022 100644 --- a/drivers/net/ethernet/smsc/smsc9420.c +++ b/drivers/net/ethernet/smsc/smsc9420.c @@ -210,15 +210,6 @@ static int smsc9420_eeprom_reload(struct smsc9420_pdata *pd) return -EIO; } -/* Standard ioctls for mii-tool */ -static int smsc9420_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - if (!netif_running(dev) || !dev->phydev) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, ifr, cmd); -} - static void smsc9420_ethtool_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@ -1504,7 +1495,7 @@ static const struct net_device_ops smsc9420_netdev_ops = { .ndo_start_xmit = smsc9420_hard_start_xmit, .ndo_get_stats = smsc9420_get_stats, .ndo_set_rx_mode = smsc9420_set_multicast_list, - .ndo_do_ioctl = smsc9420_do_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index b5a9e947a4a8..e8224b543dfc 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -243,6 +243,7 @@ NET_IP_ALIGN) #define NETSEC_RX_BUF_NON_DATA (NETSEC_RXBUF_HEADROOM + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#define NETSEC_RX_BUF_SIZE (PAGE_SIZE - NETSEC_RX_BUF_NON_DATA) #define DESC_SZ sizeof(struct netsec_de) @@ -719,7 +720,6 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv, { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; - enum dma_data_direction dma_dir; struct page *page; page = page_pool_dev_alloc_pages(dring->page_pool); @@ -734,9 +734,7 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv, /* Make sure the incoming payload fits in the page for XDP and non-XDP * cases and reserve enough space for headroom + skb_shared_info */ - *desc_len = PAGE_SIZE - NETSEC_RX_BUF_NON_DATA; - dma_dir = page_pool_get_dma_dir(dring->page_pool); - dma_sync_single_for_device(priv->dev, *dma_handle, *desc_len, dma_dir); + *desc_len = NETSEC_RX_BUF_SIZE; return page_address(page); } @@ -883,6 +881,8 @@ static u32 netsec_xdp_xmit_back(struct netsec_priv *priv, struct xdp_buff *xdp) static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) { + struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; + unsigned int len = xdp->data_end - xdp->data; u32 ret = NETSEC_XDP_PASS; int err; u32 act; @@ -896,7 +896,9 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, case XDP_TX: ret = netsec_xdp_xmit_back(priv, xdp); if (ret != NETSEC_XDP_TX) - xdp_return_buff(xdp); + __page_pool_put_page(dring->page_pool, + virt_to_head_page(xdp->data), + len, true); break; case XDP_REDIRECT: err = xdp_do_redirect(priv->ndev, xdp, prog); @@ -904,7 +906,9 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, ret = NETSEC_XDP_REDIR; } else { ret = NETSEC_XDP_CONSUMED; - xdp_return_buff(xdp); + __page_pool_put_page(dring->page_pool, + virt_to_head_page(xdp->data), + len, true); } break; default: @@ -915,7 +919,9 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, /* fall through -- handle aborts by dropping packet */ case XDP_DROP: ret = NETSEC_XDP_CONSUMED; - xdp_return_buff(xdp); + __page_pool_put_page(dring->page_pool, + virt_to_head_page(xdp->data), + len, true); break; } @@ -929,7 +935,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_rx_pkt_info rx_info; enum dma_data_direction dma_dir; struct bpf_prog *xdp_prog; - struct sk_buff *skb = NULL; u16 xdp_xmit = 0; u32 xdp_act = 0; int done = 0; @@ -943,7 +948,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_de *de = dring->vaddr + (DESC_SZ * idx); struct netsec_desc *desc = &dring->desc[idx]; struct page *page = virt_to_page(desc->addr); - u32 xdp_result = XDP_PASS; + u32 xdp_result = NETSEC_XDP_PASS; + struct sk_buff *skb = NULL; u16 pkt_len, desc_len; dma_addr_t dma_handle; struct xdp_buff xdp; @@ -1014,7 +1020,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) * cache state. Since we paid the allocation cost if * building an skb fails try to put the page into cache */ - page_pool_recycle_direct(dring->page_pool, page); + __page_pool_put_page(dring->page_pool, page, + pkt_len, true); netif_err(priv, drv, priv->ndev, "rx failed to build skb\n"); break; @@ -1272,17 +1279,19 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv) { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog); - struct page_pool_params pp_params = { 0 }; + struct page_pool_params pp_params = { + .order = 0, + /* internal DMA mapping in page_pool */ + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + .pool_size = DESC_NUM, + .nid = NUMA_NO_NODE, + .dev = priv->dev, + .dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE, + .offset = NETSEC_RXBUF_HEADROOM, + .max_len = NETSEC_RX_BUF_SIZE, + }; int i, err; - pp_params.order = 0; - /* internal DMA mapping in page_pool */ - pp_params.flags = PP_FLAG_DMA_MAP; - pp_params.pool_size = DESC_NUM; - pp_params.nid = NUMA_NO_NODE; - pp_params.dev = priv->dev; - pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; - dring->page_pool = page_pool_create(&pp_params); if (IS_ERR(dring->page_pool)) { err = PTR_ERR(dring->page_pool); @@ -1731,12 +1740,6 @@ static int netsec_netdev_set_features(struct net_device *ndev, return 0; } -static int netsec_netdev_ioctl(struct net_device *ndev, struct ifreq *ifr, - int cmd) -{ - return phy_mii_ioctl(ndev->phydev, ifr, cmd); -} - static int netsec_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags) { @@ -1821,7 +1824,7 @@ static const struct net_device_ops netsec_netdev_ops = { .ndo_set_features = netsec_netdev_set_features, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = netsec_netdev_ioctl, + .ndo_do_ioctl = phy_do_ioctl, .ndo_xdp_xmit = netsec_xdp_xmit, .ndo_bpf = netsec_xdp, }; diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index f7e927ad67fa..b7032422393f 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -424,16 +424,22 @@ static void ave_ethtool_get_wol(struct net_device *ndev, phy_ethtool_get_wol(ndev->phydev, wol); } -static int ave_ethtool_set_wol(struct net_device *ndev, - struct ethtool_wolinfo *wol) +static int __ave_ethtool_set_wol(struct net_device *ndev, + struct ethtool_wolinfo *wol) { - int ret; - if (!ndev->phydev || (wol->wolopts & (WAKE_ARP | WAKE_MAGICSECURE))) return -EOPNOTSUPP; - ret = phy_ethtool_set_wol(ndev->phydev, wol); + return phy_ethtool_set_wol(ndev->phydev, wol); +} + +static int ave_ethtool_set_wol(struct net_device *ndev, + struct ethtool_wolinfo *wol) +{ + int ret; + + ret = __ave_ethtool_set_wol(ndev, wol); if (!ret) device_set_wakeup_enable(&ndev->dev, !!wol->wolopts); @@ -1216,7 +1222,7 @@ static int ave_init(struct net_device *ndev) /* set wol initial state disabled */ wol.wolopts = 0; - ave_ethtool_set_wol(ndev, &wol); + __ave_ethtool_set_wol(ndev, &wol); if (!phy_interface_is_rgmii(phydev)) phy_set_max_speed(phydev, SPEED_100); @@ -1768,7 +1774,7 @@ static int ave_resume(struct device *dev) ave_ethtool_get_wol(ndev, &wol); wol.wolopts = priv->wolopts; - ave_ethtool_set_wol(ndev, &wol); + __ave_ethtool_set_wol(ndev, &wol); if (ndev->phydev) { ret = phy_resume(ndev->phydev); diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 31003b67d24f..487099092693 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -368,6 +368,7 @@ struct dma_features { unsigned int estdep; unsigned int estsel; unsigned int fpesel; + unsigned int tbssel; }; /* RX Buffer size must be multiple of 4/8/16 bytes */ diff --git a/drivers/net/ethernet/stmicro/stmmac/descs.h b/drivers/net/ethernet/stmicro/stmmac/descs.h index 9f0b9a9e63b3..49d6a866244f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/descs.h +++ b/drivers/net/ethernet/stmicro/stmmac/descs.h @@ -171,6 +171,15 @@ struct dma_extended_desc { __le32 des7; /* Tx/Rx Timestamp High */ }; +/* Enhanced descriptor for TBS */ +struct dma_edesc { + __le32 des4; + __le32 des5; + __le32 des6; + __le32 des7; + struct dma_desc basic; +}; + /* Transmit checksum insertion control */ #define TX_CIC_FULL 3 /* Include IP header and pseudoheader */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 6f834302fda3..58e0511badba 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -973,6 +973,9 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) /* default */ break; case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: reg |= SYSCON_EPIT | SYSCON_ETCS_INT_GMII; break; case PHY_INTERFACE_MODE_RMII: diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 26353ef616b8..7d40760e9ba8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -44,7 +44,7 @@ static int sun7i_gmac_init(struct platform_device *pdev, void *priv) * rate, which then uses the auto-reparenting feature of the * clock driver, and enabling/disabling the clock. */ - if (gmac->interface == PHY_INTERFACE_MODE_RGMII) { + if (phy_interface_mode_is_rgmii(gmac->interface)) { clk_set_rate(gmac->tx_clk, SUN7I_GMAC_GMII_RGMII_RATE); clk_prepare_enable(gmac->tx_clk); gmac->clk_enabled = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 2e6b60a476c6..af50af27550b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -239,6 +239,7 @@ enum power_event { /* MAC HW features3 bitmap */ #define GMAC_HW_FEAT_ASP GENMASK(29, 28) +#define GMAC_HW_FEAT_TBSSEL BIT(27) #define GMAC_HW_FEAT_FPESEL BIT(26) #define GMAC_HW_FEAT_ESTWID GENMASK(21, 20) #define GMAC_HW_FEAT_ESTDEP GENMASK(19, 17) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c index 3e14da69f378..eff82065a501 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c @@ -10,6 +10,7 @@ #include <linux/stmmac.h> #include "common.h" +#include "dwmac4.h" #include "dwmac4_descs.h" static int dwmac4_wrback_get_tx_status(void *data, struct stmmac_extra_stats *x, @@ -505,6 +506,14 @@ static void dwmac4_set_sec_addr(struct dma_desc *p, dma_addr_t addr) p->des3 = cpu_to_le32(upper_32_bits(addr) | RDES3_BUFFER2_VALID_ADDR); } +static void dwmac4_set_tbs(struct dma_edesc *p, u32 sec, u32 nsec) +{ + p->des4 = cpu_to_le32((sec & TDES4_LT) | TDES4_LTV); + p->des5 = cpu_to_le32(nsec & TDES5_LT); + p->des6 = 0; + p->des7 = 0; +} + const struct stmmac_desc_ops dwmac4_desc_ops = { .tx_status = dwmac4_wrback_get_tx_status, .rx_status = dwmac4_wrback_get_rx_status, @@ -534,6 +543,7 @@ const struct stmmac_desc_ops dwmac4_desc_ops = { .set_vlan = dwmac4_set_vlan, .get_rx_header_len = dwmac4_get_rx_header_len, .set_sec_addr = dwmac4_set_sec_addr, + .set_tbs = dwmac4_set_tbs, }; const struct stmmac_mode_ops dwmac4_ring_mode_ops = { diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h index 6d92109dc9aa..6da070ccd737 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h @@ -73,6 +73,13 @@ #define TDES3_CONTEXT_TYPE BIT(30) #define TDES3_CONTEXT_TYPE_SHIFT 30 +/* TDES4 */ +#define TDES4_LTV BIT(31) +#define TDES4_LT GENMASK(7, 0) + +/* TDES5 */ +#define TDES5_LT GENMASK(31, 8) + /* TDS3 use for both format (read and write back) */ #define TDES3_OWN BIT(31) #define TDES3_OWN_SHIFT 31 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 213d44482ffa..bb29bfcd62c3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -404,6 +404,7 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr, /* 5.10 Features */ dma_cap->asp = (hw_cap & GMAC_HW_FEAT_ASP) >> 28; + dma_cap->tbssel = (hw_cap & GMAC_HW_FEAT_TBSSEL) >> 27; dma_cap->fpesel = (hw_cap & GMAC_HW_FEAT_FPESEL) >> 26; dma_cap->estwid = (hw_cap & GMAC_HW_FEAT_ESTWID) >> 20; dma_cap->estdep = (hw_cap & GMAC_HW_FEAT_ESTDEP) >> 17; @@ -471,6 +472,25 @@ static void dwmac4_enable_sph(void __iomem *ioaddr, bool en, u32 chan) writel(value, ioaddr + DMA_CHAN_CONTROL(chan)); } +static int dwmac4_enable_tbs(void __iomem *ioaddr, bool en, u32 chan) +{ + u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan)); + + if (en) + value |= DMA_CONTROL_EDSE; + else + value &= ~DMA_CONTROL_EDSE; + + writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan)); + + value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan)) & DMA_CONTROL_EDSE; + if (en && !value) + return -EIO; + + writel(DMA_TBS_DEF_FTOS, ioaddr + DMA_TBS_CTRL); + return 0; +} + const struct stmmac_dma_ops dwmac4_dma_ops = { .reset = dwmac4_dma_reset, .init = dwmac4_dma_init, @@ -527,4 +547,5 @@ const struct stmmac_dma_ops dwmac410_dma_ops = { .qmode = dwmac4_qmode, .set_bfsize = dwmac4_set_bfsize, .enable_sph = dwmac4_enable_sph, + .enable_tbs = dwmac4_enable_tbs, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h index bcb6d5190f3d..8391ca63d943 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h @@ -22,6 +22,7 @@ #define DMA_DEBUG_STATUS_1 0x00001010 #define DMA_DEBUG_STATUS_2 0x00001014 #define DMA_AXI_BUS_MODE 0x00001028 +#define DMA_TBS_CTRL 0x00001050 /* DMA Bus Mode bitmap */ #define DMA_BUS_MODE_SFT_RESET BIT(0) @@ -82,6 +83,11 @@ #define DMA_AXI_BURST_LEN_MASK 0x000000FE +/* DMA TBS Control */ +#define DMA_TBS_FTOS GENMASK(31, 8) +#define DMA_TBS_FTOV BIT(0) +#define DMA_TBS_DEF_FTOS (DMA_TBS_FTOS | DMA_TBS_FTOV) + /* Following DMA defines are chanels oriented */ #define DMA_CHAN_BASE_ADDR 0x00001100 #define DMA_CHAN_BASE_OFFSET 0x80 @@ -114,6 +120,7 @@ #define DMA_CONTROL_MSS_MASK GENMASK(13, 0) /* DMA Tx Channel X Control register defines */ +#define DMA_CONTROL_EDSE BIT(28) #define DMA_CONTROL_TSE BIT(12) #define DMA_CONTROL_OSP BIT(4) #define DMA_CONTROL_ST BIT(0) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 5d4a3c2458ea..494c859b4ade 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -657,6 +657,7 @@ void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq, value &= ~EFPE; writel(value, ioaddr + MAC_FPE_CTRL_STS); + return; } value = readl(ioaddr + GMAC_RXQ_CTRL1); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index 64d13e50e403..6c3b8a950f58 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -139,6 +139,7 @@ #define XGMAC_HWFEAT_TXQCNT GENMASK(9, 6) #define XGMAC_HWFEAT_RXQCNT GENMASK(3, 0) #define XGMAC_HW_FEATURE3 0x00000128 +#define XGMAC_HWFEAT_TBSSEL BIT(27) #define XGMAC_HWFEAT_FPESEL BIT(26) #define XGMAC_HWFEAT_ESTWID GENMASK(24, 23) #define XGMAC_HWFEAT_ESTDEP GENMASK(22, 20) @@ -346,6 +347,13 @@ #define XGMAC_TDPS GENMASK(29, 0) #define XGMAC_RX_EDMA_CTRL 0x00003044 #define XGMAC_RDPS GENMASK(29, 0) +#define XGMAC_DMA_TBS_CTRL0 0x00003054 +#define XGMAC_DMA_TBS_CTRL1 0x00003058 +#define XGMAC_DMA_TBS_CTRL2 0x0000305c +#define XGMAC_DMA_TBS_CTRL3 0x00003060 +#define XGMAC_FTOS GENMASK(31, 8) +#define XGMAC_FTOV BIT(0) +#define XGMAC_DEF_FTOS (XGMAC_FTOS | XGMAC_FTOV) #define XGMAC_DMA_SAFETY_INT_STATUS 0x00003064 #define XGMAC_MCSIS BIT(31) #define XGMAC_MSUIS BIT(29) @@ -360,6 +368,7 @@ #define XGMAC_SPH BIT(24) #define XGMAC_PBLx8 BIT(16) #define XGMAC_DMA_CH_TX_CONTROL(x) (0x00003104 + (0x80 * (x))) +#define XGMAC_EDSE BIT(28) #define XGMAC_TxPBL GENMASK(21, 16) #define XGMAC_TxPBL_SHIFT 16 #define XGMAC_TSE BIT(12) @@ -404,6 +413,9 @@ #define XGMAC_REGSIZE ((0x0000317c + (0x80 * 15)) / 4) /* Descriptors */ +#define XGMAC_TDES0_LTV BIT(31) +#define XGMAC_TDES0_LT GENMASK(7, 0) +#define XGMAC_TDES1_LT GENMASK(31, 8) #define XGMAC_TDES2_IVT GENMASK(31, 16) #define XGMAC_TDES2_IVT_SHIFT 16 #define XGMAC_TDES2_IOC BIT(31) @@ -422,6 +434,7 @@ #define XGMAC_TDES3_TCMSSV BIT(26) #define XGMAC_TDES3_SAIC GENMASK(25, 23) #define XGMAC_TDES3_SAIC_SHIFT 23 +#define XGMAC_TDES3_TBSV BIT(24) #define XGMAC_TDES3_THL GENMASK(22, 19) #define XGMAC_TDES3_THL_SHIFT 19 #define XGMAC_TDES3_IVTIR GENMASK(19, 18) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 307105e8dea0..2af3ac5409b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -1421,6 +1421,7 @@ static void dwxgmac3_fpe_configure(void __iomem *ioaddr, u32 num_txq, value &= ~XGMAC_EFPE; writel(value, ioaddr + XGMAC_FPE_CTRL_STS); + return; } value = readl(ioaddr + XGMAC_RXQ_CTRL1); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c index bd5838ce1e8a..c3d654cfa9ef 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c @@ -339,6 +339,14 @@ static void dwxgmac2_set_vlan(struct dma_desc *p, u32 type) p->des2 |= cpu_to_le32(type & XGMAC_TDES2_VTIR); } +static void dwxgmac2_set_tbs(struct dma_edesc *p, u32 sec, u32 nsec) +{ + p->des4 = cpu_to_le32((sec & XGMAC_TDES0_LT) | XGMAC_TDES0_LTV); + p->des5 = cpu_to_le32(nsec & XGMAC_TDES1_LT); + p->des6 = 0; + p->des7 = 0; +} + const struct stmmac_desc_ops dwxgmac210_desc_ops = { .tx_status = dwxgmac2_get_tx_status, .rx_status = dwxgmac2_get_rx_status, @@ -368,4 +376,5 @@ const struct stmmac_desc_ops dwxgmac210_desc_ops = { .set_sarc = dwxgmac2_set_sarc, .set_vlan_tag = dwxgmac2_set_vlan_tag, .set_vlan = dwxgmac2_set_vlan, + .set_tbs = dwxgmac2_set_tbs, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index bbbfa793a367..77308c5c5d29 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -429,6 +429,7 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, /* MAC HW feature 3 */ hw_cap = readl(ioaddr + XGMAC_HW_FEATURE3); + dma_cap->tbssel = (hw_cap & XGMAC_HWFEAT_TBSSEL) >> 27; dma_cap->fpesel = (hw_cap & XGMAC_HWFEAT_FPESEL) >> 26; dma_cap->estwid = (hw_cap & XGMAC_HWFEAT_ESTWID) >> 23; dma_cap->estdep = (hw_cap & XGMAC_HWFEAT_ESTDEP) >> 20; @@ -523,6 +524,28 @@ static void dwxgmac2_enable_sph(void __iomem *ioaddr, bool en, u32 chan) writel(value, ioaddr + XGMAC_DMA_CH_CONTROL(chan)); } +static int dwxgmac2_enable_tbs(void __iomem *ioaddr, bool en, u32 chan) +{ + u32 value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + + if (en) + value |= XGMAC_EDSE; + else + value &= ~XGMAC_EDSE; + + writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + + value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)) & XGMAC_EDSE; + if (en && !value) + return -EIO; + + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL0); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL1); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL2); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL3); + return 0; +} + const struct stmmac_dma_ops dwxgmac210_dma_ops = { .reset = dwxgmac2_dma_reset, .init = dwxgmac2_dma_init, @@ -550,4 +573,5 @@ const struct stmmac_dma_ops dwxgmac210_dma_ops = { .qmode = dwxgmac2_qmode, .set_bfsize = dwxgmac2_set_bfsize, .enable_sph = dwxgmac2_enable_sph, + .enable_tbs = dwxgmac2_enable_tbs, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 905a6f0edaca..df63b0367aff 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -29,6 +29,7 @@ struct stmmac_extra_stats; struct stmmac_safety_stats; struct dma_desc; struct dma_extended_desc; +struct dma_edesc; /* Descriptors helpers */ struct stmmac_desc_ops { @@ -95,6 +96,7 @@ struct stmmac_desc_ops { void (*set_vlan_tag)(struct dma_desc *p, u16 tag, u16 inner_tag, u32 inner_type); void (*set_vlan)(struct dma_desc *p, u32 type); + void (*set_tbs)(struct dma_edesc *p, u32 sec, u32 nsec); }; #define stmmac_init_rx_desc(__priv, __args...) \ @@ -157,6 +159,8 @@ struct stmmac_desc_ops { stmmac_do_void_callback(__priv, desc, set_vlan_tag, __args) #define stmmac_set_desc_vlan(__priv, __args...) \ stmmac_do_void_callback(__priv, desc, set_vlan, __args) +#define stmmac_set_desc_tbs(__priv, __args...) \ + stmmac_do_void_callback(__priv, desc, set_tbs, __args) struct stmmac_dma_cfg; struct dma_features; @@ -210,6 +214,7 @@ struct stmmac_dma_ops { void (*qmode)(void __iomem *ioaddr, u32 channel, u8 qmode); void (*set_bfsize)(void __iomem *ioaddr, int bfsize, u32 chan); void (*enable_sph)(void __iomem *ioaddr, bool en, u32 chan); + int (*enable_tbs)(void __iomem *ioaddr, bool en, u32 chan); }; #define stmmac_reset(__priv, __args...) \ @@ -268,6 +273,8 @@ struct stmmac_dma_ops { stmmac_do_void_callback(__priv, dma, set_bfsize, __args) #define stmmac_enable_sph(__priv, __args...) \ stmmac_do_void_callback(__priv, dma, enable_sph, __args) +#define stmmac_enable_tbs(__priv, __args...) \ + stmmac_do_callback(__priv, dma, enable_tbs, __args) struct mac_device_info; struct net_device; @@ -526,6 +533,7 @@ struct tc_cls_u32_offload; struct tc_cbs_qopt_offload; struct flow_cls_offload; struct tc_taprio_qopt_offload; +struct tc_etf_qopt_offload; struct stmmac_tc_ops { int (*init)(struct stmmac_priv *priv); @@ -537,6 +545,8 @@ struct stmmac_tc_ops { struct flow_cls_offload *cls); int (*setup_taprio)(struct stmmac_priv *priv, struct tc_taprio_qopt_offload *qopt); + int (*setup_etf)(struct stmmac_priv *priv, + struct tc_etf_qopt_offload *qopt); }; #define stmmac_tc_init(__priv, __args...) \ @@ -549,6 +559,8 @@ struct stmmac_tc_ops { stmmac_do_callback(__priv, tc, setup_cls, __args) #define stmmac_tc_setup_taprio(__priv, __args...) \ stmmac_do_callback(__priv, tc, setup_taprio, __args) +#define stmmac_tc_setup_etf(__priv, __args...) \ + stmmac_do_callback(__priv, tc, setup_etf, __args) struct stmmac_counters; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index f98c5eefb382..9c02fc754bf1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -39,13 +39,18 @@ struct stmmac_tx_info { bool is_jumbo; }; +#define STMMAC_TBS_AVAIL BIT(0) +#define STMMAC_TBS_EN BIT(1) + /* Frequently used values are kept adjacent for cache effect */ struct stmmac_tx_queue { u32 tx_count_frames; + int tbs; struct timer_list txtimer; u32 queue_index; struct stmmac_priv *priv_data; struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp; + struct dma_edesc *dma_entx; struct dma_desc *dma_tx; struct sk_buff **tx_skbuff; struct stmmac_tx_info *tx_skbuff_dma; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f19b8b15ed80..ff1cbfc834b0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -106,6 +106,7 @@ MODULE_PARM_DESC(chain_mode, "To use chain instead of ring mode"); static irqreturn_t stmmac_interrupt(int irq, void *dev_id); #ifdef CONFIG_DEBUG_FS +static const struct net_device_ops stmmac_netdev_ops; static void stmmac_init_fs(struct net_device *dev); static void stmmac_exit_fs(struct net_device *dev); #endif @@ -387,9 +388,8 @@ bool stmmac_eee_init(struct stmmac_priv *priv) /* Using PCS we cannot dial with the phy registers at this stage * so we do not support extra feature like EEE. */ - if ((priv->hw->pcs == STMMAC_PCS_RGMII) || - (priv->hw->pcs == STMMAC_PCS_TBI) || - (priv->hw->pcs == STMMAC_PCS_RTBI)) + if (priv->hw->pcs == STMMAC_PCS_TBI || + priv->hw->pcs == STMMAC_PCS_RTBI) return false; /* Check if MAC core supports the EEE feature. */ @@ -1089,6 +1089,8 @@ static void stmmac_display_tx_rings(struct stmmac_priv *priv) if (priv->extend_desc) head_tx = (void *)tx_q->dma_etx; + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + head_tx = (void *)tx_q->dma_entx; else head_tx = (void *)tx_q->dma_tx; @@ -1162,13 +1164,19 @@ static void stmmac_clear_tx_descriptors(struct stmmac_priv *priv, u32 queue) int i; /* Clear the TX descriptors */ - for (i = 0; i < DMA_TX_SIZE; i++) + for (i = 0; i < DMA_TX_SIZE; i++) { + int last = (i == (DMA_TX_SIZE - 1)); + struct dma_desc *p; + if (priv->extend_desc) - stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic, - priv->mode, (i == DMA_TX_SIZE - 1)); + p = &tx_q->dma_etx[i].basic; + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + p = &tx_q->dma_entx[i].basic; else - stmmac_init_tx_desc(priv, &tx_q->dma_tx[i], - priv->mode, (i == DMA_TX_SIZE - 1)); + p = &tx_q->dma_tx[i]; + + stmmac_init_tx_desc(priv, p, priv->mode, last); + } } /** @@ -1382,7 +1390,7 @@ static int init_dma_tx_desc_rings(struct net_device *dev) if (priv->extend_desc) stmmac_mode_init(priv, tx_q->dma_etx, tx_q->dma_tx_phy, DMA_TX_SIZE, 1); - else + else if (!(tx_q->tbs & STMMAC_TBS_AVAIL)) stmmac_mode_init(priv, tx_q->dma_tx, tx_q->dma_tx_phy, DMA_TX_SIZE, 0); } @@ -1391,6 +1399,8 @@ static int init_dma_tx_desc_rings(struct net_device *dev) struct dma_desc *p; if (priv->extend_desc) p = &((tx_q->dma_etx + i)->basic); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + p = &((tx_q->dma_entx + i)->basic); else p = tx_q->dma_tx + i; @@ -1510,19 +1520,26 @@ static void free_dma_tx_desc_resources(struct stmmac_priv *priv) /* Free TX queue resources */ for (queue = 0; queue < tx_count; queue++) { struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; + size_t size; + void *addr; /* Release the DMA TX socket buffers */ dma_free_tx_skbufs(priv, queue); - /* Free DMA regions of consistent memory previously allocated */ - if (!priv->extend_desc) - dma_free_coherent(priv->device, - DMA_TX_SIZE * sizeof(struct dma_desc), - tx_q->dma_tx, tx_q->dma_tx_phy); - else - dma_free_coherent(priv->device, DMA_TX_SIZE * - sizeof(struct dma_extended_desc), - tx_q->dma_etx, tx_q->dma_tx_phy); + if (priv->extend_desc) { + size = sizeof(struct dma_extended_desc); + addr = tx_q->dma_etx; + } else if (tx_q->tbs & STMMAC_TBS_AVAIL) { + size = sizeof(struct dma_edesc); + addr = tx_q->dma_entx; + } else { + size = sizeof(struct dma_desc); + addr = tx_q->dma_tx; + } + + size *= DMA_TX_SIZE; + + dma_free_coherent(priv->device, size, addr, tx_q->dma_tx_phy); kfree(tx_q->tx_skbuff_dma); kfree(tx_q->tx_skbuff); @@ -1615,6 +1632,8 @@ static int alloc_dma_tx_desc_resources(struct stmmac_priv *priv) /* TX queues buffers and DMA */ for (queue = 0; queue < tx_count; queue++) { struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; + size_t size; + void *addr; tx_q->queue_index = queue; tx_q->priv_data = priv; @@ -1631,28 +1650,32 @@ static int alloc_dma_tx_desc_resources(struct stmmac_priv *priv) if (!tx_q->tx_skbuff) goto err_dma; - if (priv->extend_desc) { - tx_q->dma_etx = dma_alloc_coherent(priv->device, - DMA_TX_SIZE * sizeof(struct dma_extended_desc), - &tx_q->dma_tx_phy, - GFP_KERNEL); - if (!tx_q->dma_etx) - goto err_dma; - } else { - tx_q->dma_tx = dma_alloc_coherent(priv->device, - DMA_TX_SIZE * sizeof(struct dma_desc), - &tx_q->dma_tx_phy, - GFP_KERNEL); - if (!tx_q->dma_tx) - goto err_dma; - } + if (priv->extend_desc) + size = sizeof(struct dma_extended_desc); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + size = sizeof(struct dma_edesc); + else + size = sizeof(struct dma_desc); + + size *= DMA_TX_SIZE; + + addr = dma_alloc_coherent(priv->device, size, + &tx_q->dma_tx_phy, GFP_KERNEL); + if (!addr) + goto err_dma; + + if (priv->extend_desc) + tx_q->dma_etx = addr; + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + tx_q->dma_entx = addr; + else + tx_q->dma_tx = addr; } return 0; err_dma: free_dma_tx_desc_resources(priv); - return ret; } @@ -1884,6 +1907,8 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) if (priv->extend_desc) p = (struct dma_desc *)(tx_q->dma_etx + entry); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + p = &tx_q->dma_entx[entry].basic; else p = tx_q->dma_tx + entry; @@ -1982,19 +2007,12 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) static void stmmac_tx_err(struct stmmac_priv *priv, u32 chan) { struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; - int i; netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, chan)); stmmac_stop_tx_dma(priv, chan); dma_free_tx_skbufs(priv, chan); - for (i = 0; i < DMA_TX_SIZE; i++) - if (priv->extend_desc) - stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic, - priv->mode, (i == DMA_TX_SIZE - 1)); - else - stmmac_init_tx_desc(priv, &tx_q->dma_tx[i], - priv->mode, (i == DMA_TX_SIZE - 1)); + stmmac_clear_tx_descriptors(priv, chan); tx_q->dirty_tx = 0; tx_q->cur_tx = 0; tx_q->mss = 0; @@ -2631,6 +2649,14 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) if (priv->dma_cap.vlins) stmmac_enable_vlan(priv, priv->hw, STMMAC_VLAN_INSERT); + /* TBS */ + for (chan = 0; chan < tx_cnt; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + int enable = tx_q->tbs & STMMAC_TBS_AVAIL; + + stmmac_enable_tbs(priv, priv->ioaddr, enable, chan); + } + /* Start the ball rolling... */ stmmac_start_all_dma(priv); @@ -2660,8 +2686,7 @@ static int stmmac_open(struct net_device *dev) u32 chan; int ret; - if (priv->hw->pcs != STMMAC_PCS_RGMII && - priv->hw->pcs != STMMAC_PCS_TBI && + if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) { ret = stmmac_init_phy(dev); if (ret) { @@ -2688,6 +2713,16 @@ static int stmmac_open(struct net_device *dev) priv->rx_copybreak = STMMAC_RX_COPYBREAK; + /* Earlier check for TBS */ + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + int tbs_en = priv->plat->tx_queues_cfg[chan].tbs_en; + + tx_q->tbs |= tbs_en ? STMMAC_TBS_AVAIL : 0; + if (stmmac_enable_tbs(priv, priv->ioaddr, tbs_en, chan)) + tx_q->tbs &= ~STMMAC_TBS_AVAIL; + } + ret = alloc_dma_desc_resources(priv); if (ret < 0) { netdev_err(priv->dev, "%s: DMA descriptors allocation failed\n", @@ -2836,7 +2871,11 @@ static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb, tag = skb_vlan_tag_get(skb); - p = tx_q->dma_tx + tx_q->cur_tx; + if (tx_q->tbs & STMMAC_TBS_AVAIL) + p = &tx_q->dma_entx[tx_q->cur_tx].basic; + else + p = &tx_q->dma_tx[tx_q->cur_tx]; + if (stmmac_set_desc_vlan_tag(priv, p, tag, inner_tag, inner_type)) return false; @@ -2871,7 +2910,11 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des, tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE); WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]); - desc = tx_q->dma_tx + tx_q->cur_tx; + + if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[tx_q->cur_tx].basic; + else + desc = &tx_q->dma_tx[tx_q->cur_tx]; curr_addr = des + (total_len - tmp_len); if (priv->dma_cap.addr64 <= 32) @@ -2922,13 +2965,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) { struct dma_desc *desc, *first, *mss_desc = NULL; struct stmmac_priv *priv = netdev_priv(dev); + int desc_size, tmp_pay_len = 0, first_tx; int nfrags = skb_shinfo(skb)->nr_frags; u32 queue = skb_get_queue_mapping(skb); unsigned int first_entry, tx_packets; - int tmp_pay_len = 0, first_tx; struct stmmac_tx_queue *tx_q; - u8 proto_hdr_len, hdr; bool has_vlan, set_ic; + u8 proto_hdr_len, hdr; u32 pay_len, mss; dma_addr_t des; int i; @@ -2965,7 +3008,11 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) /* set new MSS value if needed */ if (mss != tx_q->mss) { - mss_desc = tx_q->dma_tx + tx_q->cur_tx; + if (tx_q->tbs & STMMAC_TBS_AVAIL) + mss_desc = &tx_q->dma_entx[tx_q->cur_tx].basic; + else + mss_desc = &tx_q->dma_tx[tx_q->cur_tx]; + stmmac_set_mss(priv, mss_desc, mss); tx_q->mss = mss; tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE); @@ -2985,7 +3032,10 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) first_entry = tx_q->cur_tx; WARN_ON(tx_q->tx_skbuff[first_entry]); - desc = tx_q->dma_tx + first_entry; + if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[first_entry].basic; + else + desc = &tx_q->dma_tx[first_entry]; first = desc; if (has_vlan) @@ -3057,7 +3107,11 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) set_ic = false; if (set_ic) { - desc = &tx_q->dma_tx[tx_q->cur_tx]; + if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[tx_q->cur_tx].basic; + else + desc = &tx_q->dma_tx[tx_q->cur_tx]; + tx_q->tx_count_frames = 0; stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; @@ -3120,16 +3174,18 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) pr_info("%s: curr=%d dirty=%d f=%d, e=%d, f_p=%p, nfrags %d\n", __func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry, tx_q->cur_tx, first, nfrags); - - stmmac_display_ring(priv, (void *)tx_q->dma_tx, DMA_TX_SIZE, 0); - pr_info(">>> frame to be transmitted: "); print_pkt(skb->data, skb_headlen(skb)); } netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); - tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); + if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc_size = sizeof(struct dma_edesc); + else + desc_size = sizeof(struct dma_desc); + + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); stmmac_tx_timer_arm(priv, queue); @@ -3159,10 +3215,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) u32 queue = skb_get_queue_mapping(skb); int nfrags = skb_shinfo(skb)->nr_frags; int gso = skb_shinfo(skb)->gso_type; + struct dma_edesc *tbs_desc = NULL; + int entry, desc_size, first_tx; struct dma_desc *desc, *first; struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; - int entry, first_tx; dma_addr_t des; tx_q = &priv->tx_queue[queue]; @@ -3202,6 +3259,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) if (likely(priv->extend_desc)) desc = (struct dma_desc *)(tx_q->dma_etx + entry); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[entry].basic; else desc = tx_q->dma_tx + entry; @@ -3231,6 +3290,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) if (likely(priv->extend_desc)) desc = (struct dma_desc *)(tx_q->dma_etx + entry); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[entry].basic; else desc = tx_q->dma_tx + entry; @@ -3277,6 +3338,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) if (set_ic) { if (likely(priv->extend_desc)) desc = &tx_q->dma_etx[entry].basic; + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc = &tx_q->dma_entx[entry].basic; else desc = &tx_q->dma_tx[entry]; @@ -3294,20 +3357,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) tx_q->cur_tx = entry; if (netif_msg_pktdata(priv)) { - void *tx_head; - netdev_dbg(priv->dev, "%s: curr=%d dirty=%d f=%d, e=%d, first=%p, nfrags=%d", __func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry, entry, first, nfrags); - if (priv->extend_desc) - tx_head = (void *)tx_q->dma_etx; - else - tx_head = (void *)tx_q->dma_tx; - - stmmac_display_ring(priv, tx_head, DMA_TX_SIZE, false); - netdev_dbg(priv->dev, ">>> frame to be transmitted: "); print_pkt(skb->data, skb->len); } @@ -3353,12 +3407,19 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) /* Prepare the first descriptor setting the OWN bit too */ stmmac_prepare_tx_desc(priv, first, 1, nopaged_len, - csum_insertion, priv->mode, 1, last_segment, + csum_insertion, priv->mode, 0, last_segment, skb->len); - } else { - stmmac_set_tx_owner(priv, first); } + if (tx_q->tbs & STMMAC_TBS_EN) { + struct timespec64 ts = ns_to_timespec64(skb->tstamp); + + tbs_desc = &tx_q->dma_entx[first_entry]; + stmmac_set_desc_tbs(priv, tbs_desc, ts.tv_sec, ts.tv_nsec); + } + + stmmac_set_tx_owner(priv, first); + /* The own bit must be the latest setting done when prepare the * descriptor and then barrier is needed to make sure that * all is coherent before granting the DMA engine. @@ -3369,7 +3430,14 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) stmmac_enable_dma_transmission(priv, priv->ioaddr); - tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); + if (likely(priv->extend_desc)) + desc_size = sizeof(struct dma_extended_desc); + else if (tx_q->tbs & STMMAC_TBS_AVAIL) + desc_size = sizeof(struct dma_edesc); + else + desc_size = sizeof(struct dma_desc); + + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); stmmac_tx_timer_arm(priv, queue); @@ -4089,6 +4157,8 @@ static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type, return stmmac_tc_setup_cbs(priv, priv, type_data); case TC_SETUP_QDISC_TAPRIO: return stmmac_tc_setup_taprio(priv, priv, type_data); + case TC_SETUP_QDISC_ETF: + return stmmac_tc_setup_etf(priv, priv, type_data); default: return -EOPNOTSUPP; } @@ -4192,7 +4262,7 @@ static int stmmac_rings_status_show(struct seq_file *seq, void *v) seq_printf(seq, "Extended descriptor ring:\n"); sysfs_display_ring((void *)tx_q->dma_etx, DMA_TX_SIZE, 1, seq); - } else { + } else if (!(tx_q->tbs & STMMAC_TBS_AVAIL)) { seq_printf(seq, "Descriptor ring:\n"); sysfs_display_ring((void *)tx_q->dma_tx, DMA_TX_SIZE, 0, seq); @@ -4293,10 +4363,44 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) priv->dma_cap.l3l4fnum); seq_printf(seq, "\tARP Offloading: %s\n", priv->dma_cap.arpoffsel ? "Y" : "N"); + seq_printf(seq, "\tEnhancements to Scheduled Traffic (EST): %s\n", + priv->dma_cap.estsel ? "Y" : "N"); + seq_printf(seq, "\tFrame Preemption (FPE): %s\n", + priv->dma_cap.fpesel ? "Y" : "N"); + seq_printf(seq, "\tTime-Based Scheduling (TBS): %s\n", + priv->dma_cap.tbssel ? "Y" : "N"); return 0; } DEFINE_SHOW_ATTRIBUTE(stmmac_dma_cap); +/* Use network device events to rename debugfs file entries. + */ +static int stmmac_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct stmmac_priv *priv = netdev_priv(dev); + + if (dev->netdev_ops != &stmmac_netdev_ops) + goto done; + + switch (event) { + case NETDEV_CHANGENAME: + if (priv->dbgfs_dir) + priv->dbgfs_dir = debugfs_rename(stmmac_fs_dir, + priv->dbgfs_dir, + stmmac_fs_dir, + dev->name); + break; + } +done: + return NOTIFY_DONE; +} + +static struct notifier_block stmmac_notifier = { + .notifier_call = stmmac_device_event, +}; + static void stmmac_init_fs(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); @@ -4311,12 +4415,15 @@ static void stmmac_init_fs(struct net_device *dev) /* Entry to report the DMA HW features */ debugfs_create_file("dma_cap", 0444, priv->dbgfs_dir, dev, &stmmac_dma_cap_fops); + + register_netdevice_notifier(&stmmac_notifier); } static void stmmac_exit_fs(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + unregister_netdevice_notifier(&stmmac_notifier); debugfs_remove_recursive(priv->dbgfs_dir); } #endif /* CONFIG_DEBUG_FS */ @@ -4767,8 +4874,7 @@ int stmmac_dvr_probe(struct device *device, stmmac_check_pcs_mode(priv); - if (priv->hw->pcs != STMMAC_PCS_RGMII && - priv->hw->pcs != STMMAC_PCS_TBI && + if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) { /* MDIO bus Registration */ ret = stmmac_mdio_register(ndev); @@ -4802,8 +4908,7 @@ int stmmac_dvr_probe(struct device *device, error_netdev_register: phylink_destroy(priv->phylink); error_phy_setup: - if (priv->hw->pcs != STMMAC_PCS_RGMII && - priv->hw->pcs != STMMAC_PCS_TBI && + if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); error_mdio_register: @@ -4848,8 +4953,7 @@ int stmmac_dvr_remove(struct device *dev) reset_control_assert(priv->plat->stmmac_rst); clk_disable_unprepare(priv->plat->pclk); clk_disable_unprepare(priv->plat->stmmac_clk); - if (priv->hw->pcs != STMMAC_PCS_RGMII && - priv->hw->pcs != STMMAC_PCS_TBI && + if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); destroy_workqueue(priv->wq); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 8237dbc3e991..623521052152 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -65,7 +65,6 @@ static void common_default_data(struct plat_stmmacenet_data *plat) plat->force_sf_dma_mode = 1; plat->mdio_bus_data->needs_reset = true; - plat->mdio_bus_data->phy_mask = 0; /* Set default value for multicast hash bins */ plat->multicast_filter_bins = HASH_TABLE_SIZE; @@ -154,8 +153,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->tx_queues_cfg[6].weight = 0x0F; plat->tx_queues_cfg[7].weight = 0x10; - plat->mdio_bus_data->phy_mask = 0; - plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; plat->dma_cfg->fixed_burst = 0; @@ -386,8 +383,6 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, plat->tso_en = 1; plat->pmt = 1; - plat->mdio_bus_data->phy_mask = 0; - /* Set default value for multicast hash bins */ plat->multicast_filter_bins = HASH_TABLE_SIZE; @@ -406,6 +401,8 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, plat->tx_queues_cfg[i].use_prio = false; plat->tx_queues_cfg[i].mode_to_use = MTL_QUEUE_DCB; plat->tx_queues_cfg[i].weight = 25; + if (i > 0) + plat->tx_queues_cfg[i].tbs_en = 1; } plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index cc8d7e7bf9ac..d10ac54bf385 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -320,7 +320,7 @@ out: static int stmmac_dt_phy(struct plat_stmmacenet_data *plat, struct device_node *np, struct device *dev) { - bool mdio = false; + bool mdio = !of_phy_is_fixed_link(np); static const struct of_device_id need_mdio_ids[] = { { .compatible = "snps,dwc-qos-ethernet-4.10" }, {}, @@ -412,9 +412,9 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) *mac = NULL; } - rc = of_get_phy_mode(np, &plat->phy_interface); - if (rc) - return ERR_PTR(rc); + plat->phy_interface = device_get_phy_mode(&pdev->dev); + if (plat->phy_interface < 0) + return ERR_PTR(plat->phy_interface); plat->interface = stmmac_of_get_mac_mode(np); if (plat->interface < 0) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 13227909287c..2aba2673d6c3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -14,6 +14,7 @@ #include <linux/phy.h> #include <linux/udp.h> #include <net/pkt_cls.h> +#include <net/pkt_sched.h> #include <net/tcp.h> #include <net/udp.h> #include <net/tc_act/tc_gact.h> @@ -50,6 +51,7 @@ struct stmmac_packet_attrs { u8 id; int sarc; u16 queue_mapping; + u64 timestamp; }; static u8 stmmac_test_next_id; @@ -80,7 +82,7 @@ static struct sk_buff *stmmac_test_get_udp_skb(struct stmmac_priv *priv, if (attr->max_size && (attr->max_size > size)) size = attr->max_size; - skb = netdev_alloc_skb_ip_align(priv->dev, size); + skb = netdev_alloc_skb(priv->dev, size); if (!skb) return NULL; @@ -208,6 +210,9 @@ static struct sk_buff *stmmac_test_get_udp_skb(struct stmmac_priv *priv, skb->pkt_type = PACKET_HOST; skb->dev = priv->dev; + if (attr->timestamp) + skb->tstamp = ns_to_ktime(attr->timestamp); + return skb; } @@ -244,6 +249,8 @@ static int stmmac_test_loopback_validate(struct sk_buff *skb, struct net_device *orig_ndev) { struct stmmac_test_priv *tpriv = pt->af_packet_priv; + unsigned char *src = tpriv->packet->src; + unsigned char *dst = tpriv->packet->dst; struct stmmachdr *shdr; struct ethhdr *ehdr; struct udphdr *uhdr; @@ -260,15 +267,15 @@ static int stmmac_test_loopback_validate(struct sk_buff *skb, goto out; ehdr = (struct ethhdr *)skb_mac_header(skb); - if (tpriv->packet->dst) { - if (!ether_addr_equal(ehdr->h_dest, tpriv->packet->dst)) + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) goto out; } if (tpriv->packet->sarc) { - if (!ether_addr_equal(ehdr->h_source, ehdr->h_dest)) + if (!ether_addr_equal_unaligned(ehdr->h_source, ehdr->h_dest)) goto out; - } else if (tpriv->packet->src) { - if (!ether_addr_equal(ehdr->h_source, tpriv->packet->src)) + } else if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) goto out; } @@ -339,8 +346,7 @@ static int __stmmac_test_loopback(struct stmmac_priv *priv, goto cleanup; } - skb_set_queue_mapping(skb, attr->queue_mapping); - ret = dev_queue_xmit(skb); + ret = dev_direct_xmit(skb, attr->queue_mapping); if (ret) goto cleanup; @@ -714,7 +720,7 @@ static int stmmac_test_flowctrl_validate(struct sk_buff *skb, struct ethhdr *ehdr; ehdr = (struct ethhdr *)skb_mac_header(skb); - if (!ether_addr_equal(ehdr->h_source, orig_ndev->dev_addr)) + if (!ether_addr_equal_unaligned(ehdr->h_source, orig_ndev->dev_addr)) goto out; if (ehdr->h_proto != htons(ETH_P_PAUSE)) goto out; @@ -851,12 +857,16 @@ static int stmmac_test_vlan_validate(struct sk_buff *skb, if (tpriv->vlan_id) { if (skb->vlan_proto != htons(proto)) goto out; - if (skb->vlan_tci != tpriv->vlan_id) + if (skb->vlan_tci != tpriv->vlan_id) { + /* Means filter did not work. */ + tpriv->ok = false; + complete(&tpriv->comp); goto out; + } } ehdr = (struct ethhdr *)skb_mac_header(skb); - if (!ether_addr_equal(ehdr->h_dest, tpriv->packet->dst)) + if (!ether_addr_equal_unaligned(ehdr->h_dest, tpriv->packet->dst)) goto out; ihdr = ip_hdr(skb); @@ -926,8 +936,7 @@ static int __stmmac_test_vlanfilt(struct stmmac_priv *priv) goto vlan_del; } - skb_set_queue_mapping(skb, 0); - ret = dev_queue_xmit(skb); + ret = dev_direct_xmit(skb, 0); if (ret) goto vlan_del; @@ -965,6 +974,9 @@ static int stmmac_test_vlanfilt_perfect(struct stmmac_priv *priv) { int ret, prev_cap = priv->dma_cap.vlhash; + if (!(priv->dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + return -EOPNOTSUPP; + priv->dma_cap.vlhash = 0; ret = __stmmac_test_vlanfilt(priv); priv->dma_cap.vlhash = prev_cap; @@ -1018,8 +1030,7 @@ static int __stmmac_test_dvlanfilt(struct stmmac_priv *priv) goto vlan_del; } - skb_set_queue_mapping(skb, 0); - ret = dev_queue_xmit(skb); + ret = dev_direct_xmit(skb, 0); if (ret) goto vlan_del; @@ -1057,6 +1068,9 @@ static int stmmac_test_dvlanfilt_perfect(struct stmmac_priv *priv) { int ret, prev_cap = priv->dma_cap.vlhash; + if (!(priv->dev->features & NETIF_F_HW_VLAN_STAG_FILTER)) + return -EOPNOTSUPP; + priv->dma_cap.vlhash = 0; ret = __stmmac_test_dvlanfilt(priv); priv->dma_cap.vlhash = prev_cap; @@ -1286,8 +1300,7 @@ static int stmmac_test_vlanoff_common(struct stmmac_priv *priv, bool svlan) __vlan_hwaccel_put_tag(skb, htons(proto), tpriv->vlan_id); skb->protocol = htons(proto); - skb_set_queue_mapping(skb, 0); - ret = dev_queue_xmit(skb); + ret = dev_direct_xmit(skb, 0); if (ret) goto vlan_del; @@ -1323,16 +1336,19 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src, struct stmmac_packet_attrs attr = { }; struct flow_dissector *dissector; struct flow_cls_offload *cls; + int ret, old_enable = 0; struct flow_rule *rule; - int ret; if (!tc_can_offload(priv->dev)) return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) + if (priv->rss.enable) { + old_enable = priv->rss.enable; + priv->rss.enable = false; stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); + } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { @@ -1399,7 +1415,8 @@ cleanup_cls: cleanup_dissector: kfree(dissector); cleanup_rss: - if (priv->rss.enable) { + if (old_enable) { + priv->rss.enable = old_enable; stmmac_rss_configure(priv, priv->hw, &priv->rss, priv->plat->rx_queues_to_use); } @@ -1444,16 +1461,19 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src, struct stmmac_packet_attrs attr = { }; struct flow_dissector *dissector; struct flow_cls_offload *cls; + int ret, old_enable = 0; struct flow_rule *rule; - int ret; if (!tc_can_offload(priv->dev)) return -EOPNOTSUPP; if (!priv->dma_cap.l3l4fnum) return -EOPNOTSUPP; - if (priv->rss.enable) + if (priv->rss.enable) { + old_enable = priv->rss.enable; + priv->rss.enable = false; stmmac_rss_configure(priv, priv->hw, NULL, priv->plat->rx_queues_to_use); + } dissector = kzalloc(sizeof(*dissector), GFP_KERNEL); if (!dissector) { @@ -1525,7 +1545,8 @@ cleanup_cls: cleanup_dissector: kfree(dissector); cleanup_rss: - if (priv->rss.enable) { + if (old_enable) { + priv->rss.enable = old_enable; stmmac_rss_configure(priv, priv->hw, &priv->rss, priv->plat->rx_queues_to_use); } @@ -1578,7 +1599,7 @@ static int stmmac_test_arp_validate(struct sk_buff *skb, struct arphdr *ahdr; ehdr = (struct ethhdr *)skb_mac_header(skb); - if (!ether_addr_equal(ehdr->h_dest, tpriv->packet->src)) + if (!ether_addr_equal_unaligned(ehdr->h_dest, tpriv->packet->src)) goto out; ahdr = arp_hdr(skb); @@ -1639,8 +1660,7 @@ static int stmmac_test_arpoffload(struct stmmac_priv *priv) if (ret) goto cleanup; - skb_set_queue_mapping(skb, 0); - ret = dev_queue_xmit(skb); + ret = dev_direct_xmit(skb, 0); if (ret) goto cleanup_promisc; @@ -1728,6 +1748,68 @@ static int stmmac_test_sph(struct stmmac_priv *priv) return 0; } +static int stmmac_test_tbs(struct stmmac_priv *priv) +{ +#define STMMAC_TBS_LT_OFFSET (500 * 1000 * 1000) /* 500 ms*/ + struct stmmac_packet_attrs attr = { }; + struct tc_etf_qopt_offload qopt; + u64 start_time, curr_time = 0; + unsigned long flags; + int ret, i; + + if (!priv->hwts_tx_en) + return -EOPNOTSUPP; + + /* Find first TBS enabled Queue, if any */ + for (i = 0; i < priv->plat->tx_queues_to_use; i++) + if (priv->tx_queue[i].tbs & STMMAC_TBS_AVAIL) + break; + + if (i >= priv->plat->tx_queues_to_use) + return -EOPNOTSUPP; + + qopt.enable = true; + qopt.queue = i; + + ret = stmmac_tc_setup_etf(priv, priv, &qopt); + if (ret) + return ret; + + spin_lock_irqsave(&priv->ptp_lock, flags); + stmmac_get_systime(priv, priv->ptpaddr, &curr_time); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + if (!curr_time) { + ret = -EOPNOTSUPP; + goto fail_disable; + } + + start_time = curr_time; + curr_time += STMMAC_TBS_LT_OFFSET; + + attr.dst = priv->dev->dev_addr; + attr.timestamp = curr_time; + attr.timeout = nsecs_to_jiffies(2 * STMMAC_TBS_LT_OFFSET); + attr.queue_mapping = i; + + ret = __stmmac_test_loopback(priv, &attr); + if (ret) + goto fail_disable; + + /* Check if expected time has elapsed */ + spin_lock_irqsave(&priv->ptp_lock, flags); + stmmac_get_systime(priv, priv->ptpaddr, &curr_time); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + if ((curr_time - start_time) < STMMAC_TBS_LT_OFFSET) + ret = -EINVAL; + +fail_disable: + qopt.enable = false; + stmmac_tc_setup_etf(priv, priv, &qopt); + return ret; +} + #define STMMAC_LOOPBACK_NONE 0 #define STMMAC_LOOPBACK_MAC 1 #define STMMAC_LOOPBACK_PHY 2 @@ -1861,6 +1943,10 @@ static const struct stmmac_test { .name = "Split Header ", .lb = STMMAC_LOOPBACK_PHY, .fn = stmmac_test_sph, + }, { + .name = "TBS (ETF Scheduler) ", + .lb = STMMAC_LOOPBACK_PHY, + .fn = stmmac_test_tbs, }, }; @@ -1869,7 +1955,6 @@ void stmmac_selftest_run(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); int count = stmmac_selftest_get_count(priv); - int carrier = netif_carrier_ok(dev); int i, ret; memset(buf, 0, sizeof(*buf) * count); @@ -1879,15 +1964,12 @@ void stmmac_selftest_run(struct net_device *dev, netdev_err(priv->dev, "Only offline tests are supported\n"); etest->flags |= ETH_TEST_FL_FAILED; return; - } else if (!carrier) { + } else if (!netif_carrier_ok(dev)) { netdev_err(priv->dev, "You need valid Link to execute tests\n"); etest->flags |= ETH_TEST_FL_FAILED; return; } - /* We don't want extra traffic */ - netif_carrier_off(dev); - /* Wait for queues drain */ msleep(200); @@ -1942,10 +2024,6 @@ void stmmac_selftest_run(struct net_device *dev, break; } } - - /* Restart everything */ - if (carrier) - netif_carrier_on(dev); } void stmmac_selftest_get_strings(struct stmmac_priv *priv, u8 *data) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 6c4686b77516..7a01dee2f9a8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -577,6 +577,10 @@ static int tc_setup_cls(struct stmmac_priv *priv, { int ret = 0; + /* When RSS is enabled, the filtering will be bypassed */ + if (priv->rss.enable) + return -EBUSY; + switch (cls->command) { case FLOW_CLS_REPLACE: ret = tc_add_flow(priv, cls); @@ -727,10 +731,31 @@ disable: return ret; } +static int tc_setup_etf(struct stmmac_priv *priv, + struct tc_etf_qopt_offload *qopt) +{ + if (!priv->dma_cap.tbssel) + return -EOPNOTSUPP; + if (qopt->queue >= priv->plat->tx_queues_to_use) + return -EINVAL; + if (!(priv->tx_queue[qopt->queue].tbs & STMMAC_TBS_AVAIL)) + return -EINVAL; + + if (qopt->enable) + priv->tx_queue[qopt->queue].tbs |= STMMAC_TBS_EN; + else + priv->tx_queue[qopt->queue].tbs &= ~STMMAC_TBS_EN; + + netdev_info(priv->dev, "%s ETF for Queue %d\n", + qopt->enable ? "enabled" : "disabled", qopt->queue); + return 0; +} + const struct stmmac_tc_ops dwmac510_tc_ops = { .init = tc_init, .setup_cls_u32 = tc_setup_cls_u32, .setup_cbs = tc_setup_cbs, .setup_cls = tc_setup_cls, .setup_taprio = tc_setup_taprio, + .setup_etf = tc_setup_etf, }; diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index a601a306f9a5..c23ce838ff63 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -1223,7 +1223,7 @@ vnet_handle_offloads(struct vnet_port *port, struct sk_buff *skb, { struct net_device *dev = VNET_PORT_TO_NET_DEVICE(port); struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; - struct sk_buff *segs; + struct sk_buff *segs, *curr, *next; int maclen, datalen; int status; int gso_size, gso_type, gso_segs; @@ -1282,11 +1282,8 @@ vnet_handle_offloads(struct vnet_port *port, struct sk_buff *skb, skb_reset_mac_header(skb); status = 0; - while (segs) { - struct sk_buff *curr = segs; - - segs = segs->next; - curr->next = NULL; + skb_list_walk_safe(segs, curr, next) { + skb_mark_not_on_list(curr); if (port->tso && curr->len > dev->mtu) { skb_shinfo(curr)->gso_size = gso_size; skb_shinfo(curr)->gso_type = gso_type; diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index 5e1b8292cd3f..a530afe3ce12 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -816,16 +816,6 @@ static void cpmac_tx_timeout(struct net_device *dev, unsigned int txqueue) netif_tx_wake_all_queues(priv->dev); } -static int cpmac_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - if (!(netif_running(dev))) - return -EINVAL; - if (!dev->phydev) - return -EINVAL; - - return phy_mii_ioctl(dev->phydev, ifr, cmd); -} - static void cpmac_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) { @@ -1054,7 +1044,7 @@ static const struct net_device_ops cpmac_netdev_ops = { .ndo_start_xmit = cpmac_start_xmit, .ndo_tx_timeout = cpmac_tx_timeout, .ndo_set_rx_mode = cpmac_set_multicast_list, - .ndo_do_ioctl = cpmac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index 708de826200e..3fd43d30b20d 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -484,7 +484,6 @@ static int tc35815_close(struct net_device *dev); static struct net_device_stats *tc35815_get_stats(struct net_device *dev); static void tc35815_set_multicast_list(struct net_device *dev); static void tc35815_tx_timeout(struct net_device *dev, unsigned int txqueue); -static int tc35815_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); #ifdef CONFIG_NET_POLL_CONTROLLER static void tc35815_poll_controller(struct net_device *dev); #endif @@ -751,7 +750,7 @@ static const struct net_device_ops tc35815_netdev_ops = { .ndo_get_stats = tc35815_get_stats, .ndo_set_rx_mode = tc35815_set_multicast_list, .ndo_tx_timeout = tc35815_tx_timeout, - .ndo_do_ioctl = tc35815_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -2009,15 +2008,6 @@ static const struct ethtool_ops tc35815_ethtool_ops = { .set_link_ksettings = phy_ethtool_set_link_ksettings, }; -static int tc35815_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - if (!netif_running(dev)) - return -EINVAL; - if (!dev->phydev) - return -ENODEV; - return phy_mii_ioctl(dev->phydev, rq, cmd); -} - static void tc35815_chip_reset(struct net_device *dev) { struct tc35815_regs __iomem *tr = diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index 346e44115c4e..4b556b74541a 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -3257,12 +3257,16 @@ static struct platform_driver velocity_platform_driver = { * @dev: network device * * Called before an ethtool operation. We need to make sure the - * chip is out of D3 state before we poke at it. + * chip is out of D3 state before we poke at it. In case of ethtool + * ops nesting, only wake the device up in the outermost block. */ static int velocity_ethtool_up(struct net_device *dev) { struct velocity_info *vptr = netdev_priv(dev); - if (!netif_running(dev)) + + if (vptr->ethtool_ops_nesting == U32_MAX) + return -EBUSY; + if (!vptr->ethtool_ops_nesting++ && !netif_running(dev)) velocity_set_power_state(vptr, PCI_D0); return 0; } @@ -3272,12 +3276,14 @@ static int velocity_ethtool_up(struct net_device *dev) * @dev: network device * * Called after an ethtool operation. Restore the chip back to D3 - * state if it isn't running. + * state if it isn't running. In case of ethtool ops nesting, only + * put the device to sleep in the outermost block. */ static void velocity_ethtool_down(struct net_device *dev) { struct velocity_info *vptr = netdev_priv(dev); - if (!netif_running(dev)) + + if (!--vptr->ethtool_ops_nesting && !netif_running(dev)) velocity_set_power_state(vptr, PCI_D3hot); } diff --git a/drivers/net/ethernet/via/via-velocity.h b/drivers/net/ethernet/via/via-velocity.h index cdfe7809e3c1..f196e71d2c04 100644 --- a/drivers/net/ethernet/via/via-velocity.h +++ b/drivers/net/ethernet/via/via-velocity.h @@ -1483,6 +1483,7 @@ struct velocity_info { struct velocity_context context; u32 ticks; + u32 ethtool_ops_nesting; u8 rev_id; diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 21c1b4322ea7..f5bbb19d62d0 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1080,17 +1080,6 @@ temac_poll_controller(struct net_device *ndev) } #endif -static int temac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) -{ - if (!netif_running(ndev)) - return -EINVAL; - - if (!ndev->phydev) - return -EINVAL; - - return phy_mii_ioctl(ndev->phydev, rq, cmd); -} - static const struct net_device_ops temac_netdev_ops = { .ndo_open = temac_open, .ndo_stop = temac_stop, @@ -1098,7 +1087,7 @@ static const struct net_device_ops temac_netdev_ops = { .ndo_set_rx_mode = temac_set_multicast_list, .ndo_set_mac_address = temac_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = temac_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = temac_poll_controller, #endif diff --git a/drivers/net/ethernet/xscale/Kconfig b/drivers/net/ethernet/xscale/Kconfig index cd0a8f46e7c6..98aa7b8ddb06 100644 --- a/drivers/net/ethernet/xscale/Kconfig +++ b/drivers/net/ethernet/xscale/Kconfig @@ -27,4 +27,18 @@ config IXP4XX_ETH Say Y here if you want to use built-in Ethernet ports on IXP4xx processor. +config PTP_1588_CLOCK_IXP46X + tristate "Intel IXP46x as PTP clock" + depends on IXP4XX_ETH + depends on PTP_1588_CLOCK + default y + help + This driver adds support for using the IXP46X as a PTP + clock. This clock is only useful if your PTP programs are + getting hardware time stamps on the PTP Ethernet packets + using the SO_TIMESTAMPING API. + + To compile this driver as a module, choose M here: the module + will be called ptp_ixp46x. + endif # NET_VENDOR_XSCALE diff --git a/drivers/net/ethernet/xscale/Makefile b/drivers/net/ethernet/xscale/Makefile index 794a519d07b3..607f91b1e878 100644 --- a/drivers/net/ethernet/xscale/Makefile +++ b/drivers/net/ethernet/xscale/Makefile @@ -3,4 +3,5 @@ # Makefile for the Intel XScale IXP device drivers. # -obj-$(CONFIG_IXP4XX_ETH) += ixp4xx_eth.o +obj-$(CONFIG_IXP4XX_ETH) += ixp4xx_eth.o +obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o diff --git a/arch/arm/mach-ixp4xx/include/mach/ixp46x_ts.h b/drivers/net/ethernet/xscale/ixp46x_ts.h index d792130e27b0..d792130e27b0 100644 --- a/arch/arm/mach-ixp4xx/include/mach/ixp46x_ts.h +++ b/drivers/net/ethernet/xscale/ixp46x_ts.h diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 6fc04ffb22c2..269596c15133 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -29,14 +29,16 @@ #include <linux/net_tstamp.h> #include <linux/of.h> #include <linux/phy.h> +#include <linux/platform_data/eth_ixp4xx.h> #include <linux/platform_device.h> #include <linux/ptp_classify.h> #include <linux/slab.h> #include <linux/module.h> -#include <mach/ixp46x_ts.h> #include <linux/soc/ixp4xx/npe.h> #include <linux/soc/ixp4xx/qmgr.h> +#include "ixp46x_ts.h" + #define DEBUG_DESC 0 #define DEBUG_RX 0 #define DEBUG_TX 0 @@ -517,25 +519,14 @@ static int ixp4xx_mdio_write(struct mii_bus *bus, int phy_id, int location, return ret; } -static int ixp4xx_mdio_register(void) +static int ixp4xx_mdio_register(struct eth_regs __iomem *regs) { int err; if (!(mdio_bus = mdiobus_alloc())) return -ENOMEM; - if (cpu_is_ixp43x()) { - /* IXP43x lacks NPE-B and uses NPE-C for MII PHY access */ - if (!(ixp4xx_read_feature_bits() & IXP4XX_FEATURE_NPEC_ETH)) - return -ENODEV; - mdio_regs = (struct eth_regs __iomem *)IXP4XX_EthC_BASE_VIRT; - } else { - /* All MII PHY accesses use NPE-B Ethernet registers */ - if (!(ixp4xx_read_feature_bits() & IXP4XX_FEATURE_NPEB_ETH0)) - return -ENODEV; - mdio_regs = (struct eth_regs __iomem *)IXP4XX_EthB_BASE_VIRT; - } - + mdio_regs = regs; __raw_writel(DEFAULT_CORE_CNTRL, &mdio_regs->core_control); spin_lock_init(&mdio_lock); mdio_bus->name = "IXP4xx MII Bus"; @@ -581,8 +572,8 @@ static void ixp4xx_adjust_link(struct net_device *dev) __raw_writel(DEFAULT_TX_CNTRL0 | TX_CNTRL0_HALFDUPLEX, &port->regs->tx_control[0]); - printk(KERN_INFO "%s: link up, speed %u Mb/s, %s duplex\n", - dev->name, port->speed, port->duplex ? "full" : "half"); + netdev_info(dev, "%s: link up, speed %u Mb/s, %s duplex\n", + dev->name, port->speed, port->duplex ? "full" : "half"); } @@ -592,7 +583,7 @@ static inline void debug_pkt(struct net_device *dev, const char *func, #if DEBUG_PKT_BYTES int i; - printk(KERN_DEBUG "%s: %s(%i) ", dev->name, func, len); + netdev_debug(dev, "%s(%i) ", func, len); for (i = 0; i < len; i++) { if (i >= DEBUG_PKT_BYTES) break; @@ -683,7 +674,7 @@ static int eth_poll(struct napi_struct *napi, int budget) int received = 0; #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll\n", dev->name); + netdev_debug(dev, "eth_poll\n"); #endif while (received < budget) { @@ -697,23 +688,20 @@ static int eth_poll(struct napi_struct *napi, int budget) if ((n = queue_get_desc(rxq, port, 0)) < 0) { #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll napi_complete\n", - dev->name); + netdev_debug(dev, "eth_poll napi_complete\n"); #endif napi_complete(napi); qmgr_enable_irq(rxq); if (!qmgr_stat_below_low_watermark(rxq) && napi_reschedule(napi)) { /* not empty again */ #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll napi_reschedule succeeded\n", - dev->name); + netdev_debug(dev, "eth_poll napi_reschedule succeeded\n"); #endif qmgr_disable_irq(rxq); continue; } #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll all done\n", - dev->name); + netdev_debug(dev, "eth_poll all done\n"); #endif return received; /* all work done */ } @@ -778,7 +766,7 @@ static int eth_poll(struct napi_struct *napi, int budget) } #if DEBUG_RX - printk(KERN_DEBUG "eth_poll(): end, not all work done\n"); + netdev_debug(dev, "eth_poll(): end, not all work done\n"); #endif return received; /* not all work done */ } @@ -842,7 +830,7 @@ static int eth_xmit(struct sk_buff *skb, struct net_device *dev) struct desc *desc; #if DEBUG_TX - printk(KERN_DEBUG "%s: eth_xmit\n", dev->name); + netdev_debug(dev, "eth_xmit\n"); #endif if (unlikely(skb->len > MAX_MRU)) { @@ -897,22 +885,21 @@ static int eth_xmit(struct sk_buff *skb, struct net_device *dev) if (qmgr_stat_below_low_watermark(txreadyq)) { /* empty */ #if DEBUG_TX - printk(KERN_DEBUG "%s: eth_xmit queue full\n", dev->name); + netdev_debug(dev, "eth_xmit queue full\n"); #endif netif_stop_queue(dev); /* we could miss TX ready interrupt */ /* really empty in fact */ if (!qmgr_stat_below_low_watermark(txreadyq)) { #if DEBUG_TX - printk(KERN_DEBUG "%s: eth_xmit ready again\n", - dev->name); + netdev_debug(dev, "eth_xmit ready again\n"); #endif netif_wake_queue(dev); } } #if DEBUG_TX - printk(KERN_DEBUG "%s: eth_xmit end\n", dev->name); + netdev_debug(dev, "eth_xmit end\n"); #endif ixp_tx_timestamp(port, skb); @@ -1099,7 +1086,7 @@ static int init_queues(struct port *port) int i; if (!ports_open) { - dma_pool = dma_pool_create(DRV_NAME, &port->netdev->dev, + dma_pool = dma_pool_create(DRV_NAME, port->netdev->dev.parent, POOL_ALLOC_SIZE, 32, 0); if (!dma_pool) return -ENOMEM; @@ -1186,8 +1173,7 @@ static int eth_open(struct net_device *dev) return err; if (npe_recv_message(npe, &msg, "ETH_GET_STATUS")) { - printk(KERN_ERR "%s: %s not responding\n", dev->name, - npe_name(npe)); + netdev_err(dev, "%s not responding\n", npe_name(npe)); return -EIO; } port->firmware[0] = msg.byte4; @@ -1299,7 +1285,7 @@ static int eth_close(struct net_device *dev) msg.eth_id = port->id; msg.byte3 = 1; if (npe_send_recv_message(port->npe, &msg, "ETH_ENABLE_LOOPBACK")) - printk(KERN_CRIT "%s: unable to enable loopback\n", dev->name); + netdev_crit(dev, "unable to enable loopback\n"); i = 0; do { /* drain RX buffers */ @@ -1323,11 +1309,11 @@ static int eth_close(struct net_device *dev) } while (++i < MAX_CLOSE_WAIT); if (buffs) - printk(KERN_CRIT "%s: unable to drain RX queue, %i buffer(s)" - " left in NPE\n", dev->name, buffs); + netdev_crit(dev, "unable to drain RX queue, %i buffer(s)" + " left in NPE\n", buffs); #if DEBUG_CLOSE if (!buffs) - printk(KERN_DEBUG "Draining RX queue took %i cycles\n", i); + netdev_debug(dev, "draining RX queue took %i cycles\n", i); #endif buffs = TX_DESCS; @@ -1343,17 +1329,16 @@ static int eth_close(struct net_device *dev) } while (++i < MAX_CLOSE_WAIT); if (buffs) - printk(KERN_CRIT "%s: unable to drain TX queue, %i buffer(s) " - "left in NPE\n", dev->name, buffs); + netdev_crit(dev, "unable to drain TX queue, %i buffer(s) " + "left in NPE\n", buffs); #if DEBUG_CLOSE if (!buffs) - printk(KERN_DEBUG "Draining TX queues took %i cycles\n", i); + netdev_debug(dev, "draining TX queues took %i cycles\n", i); #endif msg.byte3 = 0; if (npe_send_recv_message(port->npe, &msg, "ETH_DISABLE_LOOPBACK")) - printk(KERN_CRIT "%s: unable to disable loopback\n", - dev->name); + netdev_crit(dev, "unable to disable loopback\n"); phy_stop(dev->phydev); @@ -1374,54 +1359,88 @@ static const struct net_device_ops ixp4xx_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; -static int eth_init_one(struct platform_device *pdev) +static int ixp4xx_eth_probe(struct platform_device *pdev) { - struct port *port; - struct net_device *dev; - struct eth_plat_info *plat = dev_get_platdata(&pdev->dev); - struct phy_device *phydev = NULL; - u32 regs_phys; char phy_id[MII_BUS_ID_SIZE + 3]; + struct phy_device *phydev = NULL; + struct device *dev = &pdev->dev; + struct eth_plat_info *plat; + resource_size_t regs_phys; + struct net_device *ndev; + struct resource *res; + struct port *port; int err; - if (!(dev = alloc_etherdev(sizeof(struct port)))) + plat = dev_get_platdata(dev); + + if (!(ndev = devm_alloc_etherdev(dev, sizeof(struct port)))) return -ENOMEM; - SET_NETDEV_DEV(dev, &pdev->dev); - port = netdev_priv(dev); - port->netdev = dev; + SET_NETDEV_DEV(ndev, dev); + port = netdev_priv(ndev); + port->netdev = ndev; port->id = pdev->id; + /* Get the port resource and remap */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + regs_phys = res->start; + port->regs = devm_ioremap_resource(dev, res); + switch (port->id) { case IXP4XX_ETH_NPEA: - port->regs = (struct eth_regs __iomem *)IXP4XX_EthA_BASE_VIRT; - regs_phys = IXP4XX_EthA_BASE_PHYS; + /* If the MDIO bus is not up yet, defer probe */ + if (!mdio_bus) + return -EPROBE_DEFER; break; case IXP4XX_ETH_NPEB: - port->regs = (struct eth_regs __iomem *)IXP4XX_EthB_BASE_VIRT; - regs_phys = IXP4XX_EthB_BASE_PHYS; + /* + * On all except IXP43x, NPE-B is used for the MDIO bus. + * If there is no NPE-B in the feature set, bail out, else + * register the MDIO bus. + */ + if (!cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEB_ETH0)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + if ((err = ixp4xx_mdio_register(port->regs))) + return err; + } + if (!mdio_bus) + return -EPROBE_DEFER; break; case IXP4XX_ETH_NPEC: - port->regs = (struct eth_regs __iomem *)IXP4XX_EthC_BASE_VIRT; - regs_phys = IXP4XX_EthC_BASE_PHYS; + /* + * IXP43x lacks NPE-B and uses NPE-C for the MDIO bus access, + * of there is no NPE-C, no bus, nothing works, so bail out. + */ + if (cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEC_ETH)) + return -ENODEV; + /* Else register the MDIO bus on NPE-C */ + if ((err = ixp4xx_mdio_register(port->regs))) + return err; + } + if (!mdio_bus) + return -EPROBE_DEFER; break; default: - err = -ENODEV; - goto err_free; + return -ENODEV; } - dev->netdev_ops = &ixp4xx_netdev_ops; - dev->ethtool_ops = &ixp4xx_ethtool_ops; - dev->tx_queue_len = 100; + ndev->netdev_ops = &ixp4xx_netdev_ops; + ndev->ethtool_ops = &ixp4xx_ethtool_ops; + ndev->tx_queue_len = 100; - netif_napi_add(dev, &port->napi, eth_poll, NAPI_WEIGHT); + netif_napi_add(ndev, &port->napi, eth_poll, NAPI_WEIGHT); - if (!(port->npe = npe_request(NPE_ID(port->id)))) { - err = -EIO; - goto err_free; - } + if (!(port->npe = npe_request(NPE_ID(port->id)))) + return -EIO; - port->mem_res = request_mem_region(regs_phys, REGS_SIZE, dev->name); + port->mem_res = request_mem_region(regs_phys, REGS_SIZE, ndev->name); if (!port->mem_res) { err = -EBUSY; goto err_npe_rel; @@ -1429,9 +1448,9 @@ static int eth_init_one(struct platform_device *pdev) port->plat = plat; npe_port_tab[NPE_ID(port->id)] = port; - memcpy(dev->dev_addr, plat->hwaddr, ETH_ALEN); + memcpy(ndev->dev_addr, plat->hwaddr, ETH_ALEN); - platform_set_drvdata(pdev, dev); + platform_set_drvdata(pdev, ndev); __raw_writel(DEFAULT_CORE_CNTRL | CORE_RESET, &port->regs->core_control); @@ -1441,7 +1460,7 @@ static int eth_init_one(struct platform_device *pdev) snprintf(phy_id, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, mdio_bus->id, plat->phy); - phydev = phy_connect(dev, phy_id, &ixp4xx_adjust_link, + phydev = phy_connect(ndev, phy_id, &ixp4xx_adjust_link, PHY_INTERFACE_MODE_MII); if (IS_ERR(phydev)) { err = PTR_ERR(phydev); @@ -1450,11 +1469,11 @@ static int eth_init_one(struct platform_device *pdev) phydev->irq = PHY_POLL; - if ((err = register_netdev(dev))) + if ((err = register_netdev(ndev))) goto err_phy_dis; - printk(KERN_INFO "%s: MII PHY %i on %s\n", dev->name, plat->phy, - npe_name(port->npe)); + netdev_info(ndev, "%s: MII PHY %i on %s\n", ndev->name, plat->phy, + npe_name(port->npe)); return 0; @@ -1465,58 +1484,32 @@ err_free_mem: release_resource(port->mem_res); err_npe_rel: npe_release(port->npe); -err_free: - free_netdev(dev); return err; } -static int eth_remove_one(struct platform_device *pdev) +static int ixp4xx_eth_remove(struct platform_device *pdev) { - struct net_device *dev = platform_get_drvdata(pdev); - struct phy_device *phydev = dev->phydev; - struct port *port = netdev_priv(dev); + struct net_device *ndev = platform_get_drvdata(pdev); + struct phy_device *phydev = ndev->phydev; + struct port *port = netdev_priv(ndev); - unregister_netdev(dev); + unregister_netdev(ndev); phy_disconnect(phydev); + ixp4xx_mdio_remove(); npe_port_tab[NPE_ID(port->id)] = NULL; npe_release(port->npe); release_resource(port->mem_res); - free_netdev(dev); return 0; } static struct platform_driver ixp4xx_eth_driver = { .driver.name = DRV_NAME, - .probe = eth_init_one, - .remove = eth_remove_one, + .probe = ixp4xx_eth_probe, + .remove = ixp4xx_eth_remove, }; - -static int __init eth_init_module(void) -{ - int err; - - /* - * FIXME: we bail out on device tree boot but this really needs - * to be fixed in a nicer way: this registers the MDIO bus before - * even matching the driver infrastructure, we should only probe - * detected hardware. - */ - if (of_have_populated_dt()) - return -ENODEV; - if ((err = ixp4xx_mdio_register())) - return err; - return platform_driver_register(&ixp4xx_eth_driver); -} - -static void __exit eth_cleanup_module(void) -{ - platform_driver_unregister(&ixp4xx_eth_driver); - ixp4xx_mdio_remove(); -} +module_platform_driver(ixp4xx_eth_driver); MODULE_AUTHOR("Krzysztof Halasa"); MODULE_DESCRIPTION("Intel IXP4xx Ethernet driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:ixp4xx_eth"); -module_init(eth_init_module); -module_exit(eth_cleanup_module); diff --git a/drivers/ptp/ptp_ixp46x.c b/drivers/net/ethernet/xscale/ptp_ixp46x.c index 67028484e9a0..9ecc395239e9 100644 --- a/drivers/ptp/ptp_ixp46x.c +++ b/drivers/net/ethernet/xscale/ptp_ixp46x.c @@ -15,7 +15,8 @@ #include <linux/module.h> #include <linux/ptp_clock_kernel.h> -#include <mach/ixp46x_ts.h> + +#include "ixp46x_ts.h" #define DRIVER "ptp_ixp46x" #define N_EXT_TS 2 diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c index 15754451cb87..69c29a2ef95d 100644 --- a/drivers/net/fddi/skfp/skfddi.c +++ b/drivers/net/fddi/skfp/skfddi.c @@ -1520,22 +1520,8 @@ void mac_drv_tx_complete(struct s_smc *smc, volatile struct s_smt_fp_txd *txd) #ifdef DUMPPACKETS void dump_data(unsigned char *Data, int length) { - int i, j; - unsigned char s[255], sh[10]; - if (length > 64) { - length = 64; - } printk(KERN_INFO "---Packet start---\n"); - for (i = 0, j = 0; i < length / 8; i++, j += 8) - printk(KERN_INFO "%02x %02x %02x %02x %02x %02x %02x %02x\n", - Data[j + 0], Data[j + 1], Data[j + 2], Data[j + 3], - Data[j + 4], Data[j + 5], Data[j + 6], Data[j + 7]); - strcpy(s, ""); - for (i = 0; i < length % 8; i++) { - sprintf(sh, "%02x ", Data[j + i]); - strcat(s, sh); - } - printk(KERN_INFO "%s\n", s); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, Data, min_t(size_t, length, 64), false); printk(KERN_INFO "------------------\n"); } // dump_data #else diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index fca471e27f39..7032a2405e1a 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -804,19 +804,21 @@ static struct sock *gtp_encap_enable_socket(int fd, int type, return NULL; } - if (sock->sk->sk_protocol != IPPROTO_UDP) { + sk = sock->sk; + if (sk->sk_protocol != IPPROTO_UDP || + sk->sk_type != SOCK_DGRAM || + (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { pr_debug("socket fd=%d not UDP\n", fd); sk = ERR_PTR(-EINVAL); goto out_sock; } - lock_sock(sock->sk); - if (sock->sk->sk_user_data) { + lock_sock(sk); + if (sk->sk_user_data) { sk = ERR_PTR(-EBUSY); - goto out_sock; + goto out_rel_sock; } - sk = sock->sk; sock_hold(sk); tuncfg.sk_user_data = gtp; @@ -826,8 +828,9 @@ static struct sock *gtp_encap_enable_socket(int fd, int type, setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg); -out_sock: +out_rel_sock: release_sock(sock->sk); +out_sock: sockfd_put(sock); return sk; } @@ -851,8 +854,7 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp); if (IS_ERR(sk1u)) { - if (sk0) - gtp_encap_disable_sock(sk0); + gtp_encap_disable_sock(sk0); return PTR_ERR(sk1u); } } @@ -860,10 +862,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) if (data[IFLA_GTP_ROLE]) { role = nla_get_u32(data[IFLA_GTP_ROLE]); if (role > GTP_ROLE_SGSN) { - if (sk0) - gtp_encap_disable_sock(sk0); - if (sk1u) - gtp_encap_disable_sock(sk1u); + gtp_encap_disable_sock(sk0); + gtp_encap_disable_sock(sk1u); return -EINVAL; } } diff --git a/drivers/net/hyperv/Makefile b/drivers/net/hyperv/Makefile index 3a2aa0708166..0db7ccaec4a4 100644 --- a/drivers/net/hyperv/Makefile +++ b/drivers/net/hyperv/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HYPERV_NET) += hv_netvsc.o -hv_netvsc-y := netvsc_drv.o netvsc.o rndis_filter.o netvsc_trace.o +hv_netvsc-y := netvsc_drv.o netvsc.o rndis_filter.o netvsc_trace.o netvsc_bpf.o diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index dc44819946e6..abda736e7c7d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -142,6 +142,8 @@ struct netvsc_device_info { u32 send_section_size; u32 recv_section_size; + struct bpf_prog *bprog; + u8 rss_key[NETVSC_HASH_KEYLEN]; }; @@ -189,7 +191,8 @@ int netvsc_send(struct net_device *net, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, struct hv_page_buffer *page_buffer, - struct sk_buff *skb); + struct sk_buff *skb, + bool xdp_tx); void netvsc_linkstatus_callback(struct net_device *net, struct rndis_message *resp); int netvsc_recv_callback(struct net_device *net, @@ -198,6 +201,16 @@ int netvsc_recv_callback(struct net_device *net, void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); +u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, + struct xdp_buff *xdp); +unsigned int netvsc_xdp_fraglen(unsigned int len); +struct bpf_prog *netvsc_xdp_get(struct netvsc_device *nvdev); +int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack, + struct netvsc_device *nvdev); +int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog); +int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf); + int rndis_set_subchannel(struct net_device *ndev, struct netvsc_device *nvdev, struct netvsc_device_info *dev_info); @@ -832,6 +845,8 @@ struct nvsp_message { #define RNDIS_MAX_PKT_DEFAULT 8 #define RNDIS_PKT_ALIGN_DEFAULT 8 +#define NETVSC_XDP_HDRM 256 + struct multi_send_data { struct sk_buff *skb; /* skb containing the pkt */ struct hv_netvsc_packet *pkt; /* netvsc pkt pending */ @@ -867,6 +882,7 @@ struct netvsc_stats { u64 bytes; u64 broadcast; u64 multicast; + u64 xdp_drop; struct u64_stats_sync syncp; }; @@ -972,6 +988,9 @@ struct netvsc_channel { atomic_t queue_sends; struct nvsc_rsc rsc; + struct bpf_prog __rcu *bpf_prog; + struct xdp_rxq_info xdp_rxq; + struct netvsc_stats tx_stats; struct netvsc_stats rx_stats; }; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index eab83e71567a..ae3f3084c2ed 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -122,8 +122,10 @@ static void free_netvsc_device(struct rcu_head *head) vfree(nvdev->send_buf); kfree(nvdev->send_section_map); - for (i = 0; i < VRSS_CHANNEL_MAX; i++) + for (i = 0; i < VRSS_CHANNEL_MAX; i++) { + xdp_rxq_info_unreg(&nvdev->chan_table[i].xdp_rxq); vfree(nvdev->chan_table[i].mrc.slots); + } kfree(nvdev); } @@ -900,7 +902,8 @@ int netvsc_send(struct net_device *ndev, struct hv_netvsc_packet *packet, struct rndis_message *rndis_msg, struct hv_page_buffer *pb, - struct sk_buff *skb) + struct sk_buff *skb, + bool xdp_tx) { struct net_device_context *ndev_ctx = netdev_priv(ndev); struct netvsc_device *net_device @@ -923,10 +926,11 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = NETVSC_INVALID_INDEX; packet->cp_partial = false; - /* Send control message directly without accessing msd (Multi-Send - * Data) field which may be changed during data packet processing. + /* Send a control message or XDP packet directly without accessing + * msd (Multi-Send Data) field which may be changed during data packet + * processing. */ - if (!skb) + if (!skb || xdp_tx) return netvsc_send_pkt(device, packet, net_device, pb, skb); /* batch packets in send buffer if possible */ @@ -1392,6 +1396,21 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, nvchan->net_device = net_device; u64_stats_init(&nvchan->tx_stats.syncp); u64_stats_init(&nvchan->rx_stats.syncp); + + ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i); + + if (ret) { + netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret); + goto cleanup2; + } + + ret = xdp_rxq_info_reg_mem_model(&nvchan->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + + if (ret) { + netdev_err(ndev, "xdp reg_mem_model fail: %d\n", ret); + goto cleanup2; + } } /* Enable NAPI handler before init callbacks */ @@ -1437,6 +1456,8 @@ close: cleanup: netif_napi_del(&net_device->chan_table[0].napi); + +cleanup2: free_netvsc_device(&net_device->rcu); return ERR_PTR(ret); diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c new file mode 100644 index 000000000000..20adfe544294 --- /dev/null +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019, Microsoft Corporation. + * + * Author: + * Haiyang Zhang <haiyangz@microsoft.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/bpf.h> +#include <linux/bpf_trace.h> +#include <linux/kernel.h> +#include <net/xdp.h> + +#include <linux/mutex.h> +#include <linux/rtnetlink.h> + +#include "hyperv_net.h" + +u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, + struct xdp_buff *xdp) +{ + void *data = nvchan->rsc.data[0]; + u32 len = nvchan->rsc.len[0]; + struct page *page = NULL; + struct bpf_prog *prog; + u32 act = XDP_PASS; + + xdp->data_hard_start = NULL; + + rcu_read_lock(); + prog = rcu_dereference(nvchan->bpf_prog); + + if (!prog) + goto out; + + /* allocate page buffer for data */ + page = alloc_page(GFP_ATOMIC); + if (!page) { + act = XDP_DROP; + goto out; + } + + xdp->data_hard_start = page_address(page); + xdp->data = xdp->data_hard_start + NETVSC_XDP_HDRM; + xdp_set_data_meta_invalid(xdp); + xdp->data_end = xdp->data + len; + xdp->rxq = &nvchan->xdp_rxq; + xdp->handle = 0; + + memcpy(xdp->data, data, len); + + act = bpf_prog_run_xdp(prog, xdp); + + switch (act) { + case XDP_PASS: + case XDP_TX: + case XDP_DROP: + break; + + case XDP_ABORTED: + trace_xdp_exception(ndev, prog, act); + break; + + default: + bpf_warn_invalid_xdp_action(act); + } + +out: + rcu_read_unlock(); + + if (page && act != XDP_PASS && act != XDP_TX) { + __free_page(page); + xdp->data_hard_start = NULL; + } + + return act; +} + +unsigned int netvsc_xdp_fraglen(unsigned int len) +{ + return SKB_DATA_ALIGN(len) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); +} + +struct bpf_prog *netvsc_xdp_get(struct netvsc_device *nvdev) +{ + return rtnl_dereference(nvdev->chan_table[0].bpf_prog); +} + +int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack, + struct netvsc_device *nvdev) +{ + struct bpf_prog *old_prog; + int buf_max, i; + + old_prog = netvsc_xdp_get(nvdev); + + if (!old_prog && !prog) + return 0; + + buf_max = NETVSC_XDP_HDRM + netvsc_xdp_fraglen(dev->mtu + ETH_HLEN); + if (prog && buf_max > PAGE_SIZE) { + netdev_err(dev, "XDP: mtu:%u too large, buf_max:%u\n", + dev->mtu, buf_max); + NL_SET_ERR_MSG_MOD(extack, "XDP: mtu too large"); + + return -EOPNOTSUPP; + } + + if (prog && (dev->features & NETIF_F_LRO)) { + netdev_err(dev, "XDP: not support LRO\n"); + NL_SET_ERR_MSG_MOD(extack, "XDP: not support LRO"); + + return -EOPNOTSUPP; + } + + if (prog) + bpf_prog_add(prog, nvdev->num_chn); + + for (i = 0; i < nvdev->num_chn; i++) + rcu_assign_pointer(nvdev->chan_table[i].bpf_prog, prog); + + if (old_prog) + for (i = 0; i < nvdev->num_chn; i++) + bpf_prog_put(old_prog); + + return 0; +} + +int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + ASSERT_RTNL(); + + if (!vf_netdev) + return 0; + + ndo_bpf = vf_netdev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return 0; + + memset(&xdp, 0, sizeof(xdp)); + + xdp.command = XDP_SETUP_PROG; + xdp.prog = prog; + + return ndo_bpf(vf_netdev, &xdp); +} + +static u32 netvsc_xdp_query(struct netvsc_device *nvdev) +{ + struct bpf_prog *prog = netvsc_xdp_get(nvdev); + + if (prog) + return prog->aux->id; + + return 0; +} + +int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) +{ + struct net_device_context *ndevctx = netdev_priv(dev); + struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); + struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev); + struct netlink_ext_ack *extack = bpf->extack; + int ret; + + if (!nvdev || nvdev->destroy) { + if (bpf->command == XDP_QUERY_PROG) { + bpf->prog_id = 0; + return 0; /* Query must always succeed */ + } else { + return -ENODEV; + } + } + + switch (bpf->command) { + case XDP_SETUP_PROG: + ret = netvsc_xdp_set(dev, bpf->prog, extack, nvdev); + + if (ret) + return ret; + + ret = netvsc_vf_setxdp(vf_netdev, bpf->prog); + + if (ret) { + netdev_err(dev, "vf_setxdp failed:%d\n", ret); + NL_SET_ERR_MSG_MOD(extack, "vf_setxdp failed"); + + netvsc_xdp_set(dev, NULL, extack, nvdev); + } + + return ret; + + case XDP_QUERY_PROG: + bpf->prog_id = netvsc_xdp_query(nvdev); + return 0; + + default: + return -EINVAL; + } +} diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f3f9eb8a402a..8fc71bd49894 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/netpoll.h> +#include <linux/bpf.h> #include <net/arp.h> #include <net/route.h> @@ -519,7 +520,7 @@ static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev, return rc; } -static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) +static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) { struct net_device_context *net_device_ctx = netdev_priv(net); struct hv_netvsc_packet *packet = NULL; @@ -686,7 +687,7 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) /* timestamp packet in software */ skb_tx_timestamp(skb); - ret = netvsc_send(net, packet, rndis_msg, pb, skb); + ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx); if (likely(ret == 0)) return NETDEV_TX_OK; @@ -709,6 +710,11 @@ no_memory: goto drop; } +static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *ndev) +{ + return netvsc_xmit(skb, ndev, false); +} + /* * netvsc_linkstatus_callback - Link up/down notification */ @@ -751,6 +757,22 @@ void netvsc_linkstatus_callback(struct net_device *net, schedule_delayed_work(&ndev_ctx->dwork, 0); } +static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev) +{ + int rc; + + skb->queue_mapping = skb_get_rx_queue(skb); + __skb_push(skb, ETH_HLEN); + + rc = netvsc_xmit(skb, ndev, true); + + if (dev_xmit_complete(rc)) + return; + + dev_kfree_skb_any(skb); + ndev->stats.tx_dropped++; +} + static void netvsc_comp_ipcsum(struct sk_buff *skb) { struct iphdr *iph = (struct iphdr *)skb->data; @@ -760,7 +782,8 @@ static void netvsc_comp_ipcsum(struct sk_buff *skb) } static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, - struct netvsc_channel *nvchan) + struct netvsc_channel *nvchan, + struct xdp_buff *xdp) { struct napi_struct *napi = &nvchan->napi; const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan; @@ -768,18 +791,37 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, nvchan->rsc.csum_info; const u32 *hash_info = nvchan->rsc.hash_info; struct sk_buff *skb; + void *xbuf = xdp->data_hard_start; int i; - skb = napi_alloc_skb(napi, nvchan->rsc.pktlen); - if (!skb) - return skb; + if (xbuf) { + unsigned int hdroom = xdp->data - xdp->data_hard_start; + unsigned int xlen = xdp->data_end - xdp->data; + unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen); - /* - * Copy to skb. This copy is needed here since the memory pointed by - * hv_netvsc_packet cannot be deallocated - */ - for (i = 0; i < nvchan->rsc.cnt; i++) - skb_put_data(skb, nvchan->rsc.data[i], nvchan->rsc.len[i]); + skb = build_skb(xbuf, frag_size); + + if (!skb) { + __free_page(virt_to_page(xbuf)); + return NULL; + } + + skb_reserve(skb, hdroom); + skb_put(skb, xlen); + skb->dev = napi->dev; + } else { + skb = napi_alloc_skb(napi, nvchan->rsc.pktlen); + + if (!skb) + return NULL; + + /* Copy to skb. This copy is needed here since the memory + * pointed by hv_netvsc_packet cannot be deallocated. + */ + for (i = 0; i < nvchan->rsc.cnt; i++) + skb_put_data(skb, nvchan->rsc.data[i], + nvchan->rsc.len[i]); + } skb->protocol = eth_type_trans(skb, net); @@ -829,13 +871,25 @@ int netvsc_recv_callback(struct net_device *net, struct vmbus_channel *channel = nvchan->channel; u16 q_idx = channel->offermsg.offer.sub_channel_index; struct sk_buff *skb; - struct netvsc_stats *rx_stats; + struct netvsc_stats *rx_stats = &nvchan->rx_stats; + struct xdp_buff xdp; + u32 act; if (net->reg_state != NETREG_REGISTERED) return NVSP_STAT_FAIL; + act = netvsc_run_xdp(net, nvchan, &xdp); + + if (act != XDP_PASS && act != XDP_TX) { + u64_stats_update_begin(&rx_stats->syncp); + rx_stats->xdp_drop++; + u64_stats_update_end(&rx_stats->syncp); + + return NVSP_STAT_SUCCESS; /* consumed by XDP */ + } + /* Allocate a skb - TODO direct I/O to pages? */ - skb = netvsc_alloc_recv_skb(net, nvchan); + skb = netvsc_alloc_recv_skb(net, nvchan, &xdp); if (unlikely(!skb)) { ++net_device_ctx->eth_stats.rx_no_memory; @@ -849,7 +903,6 @@ int netvsc_recv_callback(struct net_device *net, * on the synthetic device because modifying the VF device * statistics will not work correctly. */ - rx_stats = &nvchan->rx_stats; u64_stats_update_begin(&rx_stats->syncp); rx_stats->packets++; rx_stats->bytes += nvchan->rsc.pktlen; @@ -860,6 +913,11 @@ int netvsc_recv_callback(struct net_device *net, ++rx_stats->multicast; u64_stats_update_end(&rx_stats->syncp); + if (act == XDP_TX) { + netvsc_xdp_xmit(skb, net); + return NVSP_STAT_SUCCESS; + } + napi_gro_receive(&nvchan->napi, skb); return NVSP_STAT_SUCCESS; } @@ -886,10 +944,11 @@ static void netvsc_get_channels(struct net_device *net, /* Alloc struct netvsc_device_info, and initialize it from either existing * struct netvsc_device, or from default values. */ -static struct netvsc_device_info *netvsc_devinfo_get - (struct netvsc_device *nvdev) +static +struct netvsc_device_info *netvsc_devinfo_get(struct netvsc_device *nvdev) { struct netvsc_device_info *dev_info; + struct bpf_prog *prog; dev_info = kzalloc(sizeof(*dev_info), GFP_ATOMIC); @@ -897,6 +956,8 @@ static struct netvsc_device_info *netvsc_devinfo_get return NULL; if (nvdev) { + ASSERT_RTNL(); + dev_info->num_chn = nvdev->num_chn; dev_info->send_sections = nvdev->send_section_cnt; dev_info->send_section_size = nvdev->send_section_size; @@ -905,6 +966,12 @@ static struct netvsc_device_info *netvsc_devinfo_get memcpy(dev_info->rss_key, nvdev->extension->rss_key, NETVSC_HASH_KEYLEN); + + prog = netvsc_xdp_get(nvdev); + if (prog) { + bpf_prog_inc(prog); + dev_info->bprog = prog; + } } else { dev_info->num_chn = VRSS_CHANNEL_DEFAULT; dev_info->send_sections = NETVSC_DEFAULT_TX; @@ -916,6 +983,17 @@ static struct netvsc_device_info *netvsc_devinfo_get return dev_info; } +/* Free struct netvsc_device_info */ +static void netvsc_devinfo_put(struct netvsc_device_info *dev_info) +{ + if (dev_info->bprog) { + ASSERT_RTNL(); + bpf_prog_put(dev_info->bprog); + } + + kfree(dev_info); +} + static int netvsc_detach(struct net_device *ndev, struct netvsc_device *nvdev) { @@ -927,6 +1005,8 @@ static int netvsc_detach(struct net_device *ndev, if (cancel_work_sync(&nvdev->subchan_work)) nvdev->num_chn = 1; + netvsc_xdp_set(ndev, NULL, NULL, nvdev); + /* If device was up (receiving) then shutdown */ if (netif_running(ndev)) { netvsc_tx_disable(nvdev, ndev); @@ -960,7 +1040,8 @@ static int netvsc_attach(struct net_device *ndev, struct hv_device *hdev = ndev_ctx->device_ctx; struct netvsc_device *nvdev; struct rndis_device *rdev; - int ret; + struct bpf_prog *prog; + int ret = 0; nvdev = rndis_filter_device_add(hdev, dev_info); if (IS_ERR(nvdev)) @@ -976,6 +1057,13 @@ static int netvsc_attach(struct net_device *ndev, } } + prog = dev_info->bprog; + if (prog) { + ret = netvsc_xdp_set(ndev, prog, NULL, nvdev); + if (ret) + goto err1; + } + /* In any case device is now ready */ netif_device_attach(ndev); @@ -985,7 +1073,7 @@ static int netvsc_attach(struct net_device *ndev, if (netif_running(ndev)) { ret = rndis_filter_open(nvdev); if (ret) - goto err; + goto err2; rdev = nvdev->extension; if (!rdev->link_state) @@ -994,9 +1082,10 @@ static int netvsc_attach(struct net_device *ndev, return 0; -err: +err2: netif_device_detach(ndev); +err1: rndis_filter_device_remove(hdev, nvdev); return ret; @@ -1046,7 +1135,7 @@ static int netvsc_set_channels(struct net_device *net, } out: - kfree(device_info); + netvsc_devinfo_put(device_info); return ret; } @@ -1153,7 +1242,7 @@ rollback_vf: dev_set_mtu(vf_netdev, orig_mtu); out: - kfree(device_info); + netvsc_devinfo_put(device_info); return ret; } @@ -1378,8 +1467,8 @@ static const struct { /* statistics per queue (rx/tx packets/bytes) */ #define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats)) -/* 4 statistics per queue (rx/tx packets/bytes) */ -#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4) +/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */ +#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5) static int netvsc_get_sset_count(struct net_device *dev, int string_set) { @@ -1411,6 +1500,7 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, struct netvsc_ethtool_pcpu_stats *pcpu_sum; unsigned int start; u64 packets, bytes; + u64 xdp_drop; int i, j, cpu; if (!nvdev) @@ -1439,9 +1529,11 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, start = u64_stats_fetch_begin_irq(&qstats->syncp); packets = qstats->packets; bytes = qstats->bytes; + xdp_drop = qstats->xdp_drop; } while (u64_stats_fetch_retry_irq(&qstats->syncp, start)); data[i++] = packets; data[i++] = bytes; + data[i++] = xdp_drop; } pcpu_sum = kvmalloc_array(num_possible_cpus(), @@ -1489,6 +1581,8 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) p += ETH_GSTRING_LEN; sprintf(p, "rx_queue_%u_bytes", i); p += ETH_GSTRING_LEN; + sprintf(p, "rx_queue_%u_xdp_drop", i); + p += ETH_GSTRING_LEN; } for_each_present_cpu(cpu) { @@ -1785,10 +1879,27 @@ static int netvsc_set_ringparam(struct net_device *ndev, } out: - kfree(device_info); + netvsc_devinfo_put(device_info); return ret; } +static netdev_features_t netvsc_fix_features(struct net_device *ndev, + netdev_features_t features) +{ + struct net_device_context *ndevctx = netdev_priv(ndev); + struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); + + if (!nvdev || nvdev->destroy) + return features; + + if ((features & NETIF_F_LRO) && netvsc_xdp_get(nvdev)) { + features ^= NETIF_F_LRO; + netdev_info(ndev, "Skip LRO - unsupported with XDP\n"); + } + + return features; +} + static int netvsc_set_features(struct net_device *ndev, netdev_features_t features) { @@ -1875,12 +1986,14 @@ static const struct net_device_ops device_ops = { .ndo_start_xmit = netvsc_start_xmit, .ndo_change_rx_flags = netvsc_change_rx_flags, .ndo_set_rx_mode = netvsc_set_rx_mode, + .ndo_fix_features = netvsc_fix_features, .ndo_set_features = netvsc_set_features, .ndo_change_mtu = netvsc_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = netvsc_set_mac_addr, .ndo_select_queue = netvsc_select_queue, .ndo_get_stats64 = netvsc_get_stats64, + .ndo_bpf = netvsc_bpf, }; /* @@ -2167,6 +2280,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; + struct bpf_prog *prog; struct net_device *ndev; int ret; @@ -2211,6 +2325,9 @@ static int netvsc_register_vf(struct net_device *vf_netdev) vf_netdev->wanted_features = ndev->features; netdev_update_features(vf_netdev); + prog = netvsc_xdp_get(netvsc_dev); + netvsc_vf_setxdp(vf_netdev, prog); + return NOTIFY_OK; } @@ -2252,6 +2369,8 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); + netvsc_vf_setxdp(vf_netdev, NULL); + netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); @@ -2363,14 +2482,14 @@ static int netvsc_probe(struct hv_device *dev, list_add(&net_device_ctx->list, &netvsc_dev_list); rtnl_unlock(); - kfree(device_info); + netvsc_devinfo_put(device_info); return 0; register_failed: rtnl_unlock(); rndis_filter_device_remove(dev, nvdev); rndis_failed: - kfree(device_info); + netvsc_devinfo_put(device_info); devinfo_failed: free_percpu(net_device_ctx->vf_stats); no_stats: @@ -2398,8 +2517,10 @@ static int netvsc_remove(struct hv_device *dev) rtnl_lock(); nvdev = rtnl_dereference(ndev_ctx->nvdev); - if (nvdev) + if (nvdev) { cancel_work_sync(&nvdev->subchan_work); + netvsc_xdp_set(net, NULL, NULL, nvdev); + } /* * Call to the vsc driver to let it know that the device is being @@ -2472,11 +2593,11 @@ static int netvsc_resume(struct hv_device *dev) ret = netvsc_attach(net, device_info); - rtnl_unlock(); - - kfree(device_info); + netvsc_devinfo_put(device_info); net_device_ctx->saved_netvsc_dev_info = NULL; + rtnl_unlock(); + return ret; } static const struct hv_vmbus_device_id id_table[] = { diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 857c4bea451c..b81ceba38218 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -235,7 +235,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL); + ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); rcu_read_unlock_bh(); return ret; @@ -1443,8 +1443,6 @@ void rndis_filter_device_remove(struct hv_device *dev, /* Halt and release the rndis device */ rndis_filter_halt_device(net_dev, rndis_dev); - net_dev->extension = NULL; - netvsc_device_remove(dev); } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index afd8b2a08245..45bfd99f17fa 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -11,16 +11,17 @@ #include <linux/module.h> #include <crypto/aead.h> #include <linux/etherdevice.h> +#include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> #include <net/genetlink.h> #include <net/sock.h> #include <net/gro_cells.h> +#include <net/macsec.h> +#include <linux/phy.h> #include <uapi/linux/if_macsec.h> -typedef u64 __bitwise sci_t; - #define MACSEC_SCI_LEN 8 /* SecTAG length = macsec_eth_header without the optional SCI */ @@ -58,8 +59,6 @@ struct macsec_eth_header { #define GCM_AES_IV_LEN 12 #define DEFAULT_ICV_LEN 16 -#define MACSEC_NUM_AN 4 /* 2 bits for the association number */ - #define for_each_rxsc(secy, sc) \ for (sc = rcu_dereference_bh(secy->rx_sc); \ sc; \ @@ -77,49 +76,6 @@ struct gcm_iv { __be32 pn; }; -/** - * struct macsec_key - SA key - * @id: user-provided key identifier - * @tfm: crypto struct, key storage - */ -struct macsec_key { - u8 id[MACSEC_KEYID_LEN]; - struct crypto_aead *tfm; -}; - -struct macsec_rx_sc_stats { - __u64 InOctetsValidated; - __u64 InOctetsDecrypted; - __u64 InPktsUnchecked; - __u64 InPktsDelayed; - __u64 InPktsOK; - __u64 InPktsInvalid; - __u64 InPktsLate; - __u64 InPktsNotValid; - __u64 InPktsNotUsingSA; - __u64 InPktsUnusedSA; -}; - -struct macsec_rx_sa_stats { - __u32 InPktsOK; - __u32 InPktsInvalid; - __u32 InPktsNotValid; - __u32 InPktsNotUsingSA; - __u32 InPktsUnusedSA; -}; - -struct macsec_tx_sa_stats { - __u32 OutPktsProtected; - __u32 OutPktsEncrypted; -}; - -struct macsec_tx_sc_stats { - __u64 OutPktsProtected; - __u64 OutPktsEncrypted; - __u64 OutOctetsProtected; - __u64 OutOctetsEncrypted; -}; - struct macsec_dev_stats { __u64 OutPktsUntagged; __u64 InPktsUntagged; @@ -131,124 +87,8 @@ struct macsec_dev_stats { __u64 InPktsOverrun; }; -/** - * struct macsec_rx_sa - receive secure association - * @active: - * @next_pn: packet number expected for the next packet - * @lock: protects next_pn manipulations - * @key: key structure - * @stats: per-SA stats - */ -struct macsec_rx_sa { - struct macsec_key key; - spinlock_t lock; - u32 next_pn; - refcount_t refcnt; - bool active; - struct macsec_rx_sa_stats __percpu *stats; - struct macsec_rx_sc *sc; - struct rcu_head rcu; -}; - -struct pcpu_rx_sc_stats { - struct macsec_rx_sc_stats stats; - struct u64_stats_sync syncp; -}; - -/** - * struct macsec_rx_sc - receive secure channel - * @sci: secure channel identifier for this SC - * @active: channel is active - * @sa: array of secure associations - * @stats: per-SC stats - */ -struct macsec_rx_sc { - struct macsec_rx_sc __rcu *next; - sci_t sci; - bool active; - struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; - struct pcpu_rx_sc_stats __percpu *stats; - refcount_t refcnt; - struct rcu_head rcu_head; -}; - -/** - * struct macsec_tx_sa - transmit secure association - * @active: - * @next_pn: packet number to use for the next packet - * @lock: protects next_pn manipulations - * @key: key structure - * @stats: per-SA stats - */ -struct macsec_tx_sa { - struct macsec_key key; - spinlock_t lock; - u32 next_pn; - refcount_t refcnt; - bool active; - struct macsec_tx_sa_stats __percpu *stats; - struct rcu_head rcu; -}; - -struct pcpu_tx_sc_stats { - struct macsec_tx_sc_stats stats; - struct u64_stats_sync syncp; -}; - -/** - * struct macsec_tx_sc - transmit secure channel - * @active: - * @encoding_sa: association number of the SA currently in use - * @encrypt: encrypt packets on transmit, or authenticate only - * @send_sci: always include the SCI in the SecTAG - * @end_station: - * @scb: single copy broadcast flag - * @sa: array of secure associations - * @stats: stats for this TXSC - */ -struct macsec_tx_sc { - bool active; - u8 encoding_sa; - bool encrypt; - bool send_sci; - bool end_station; - bool scb; - struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; - struct pcpu_tx_sc_stats __percpu *stats; -}; - #define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT -/** - * struct macsec_secy - MACsec Security Entity - * @netdev: netdevice for this SecY - * @n_rx_sc: number of receive secure channels configured on this SecY - * @sci: secure channel identifier used for tx - * @key_len: length of keys used by the cipher suite - * @icv_len: length of ICV used by the cipher suite - * @validate_frames: validation mode - * @operational: MAC_Operational flag - * @protect_frames: enable protection for this SecY - * @replay_protect: enable packet number checks on receive - * @replay_window: size of the replay window - * @tx_sc: transmit secure channel - * @rx_sc: linked list of receive secure channels - */ -struct macsec_secy { - struct net_device *netdev; - unsigned int n_rx_sc; - sci_t sci; - u16 key_len; - u16 icv_len; - enum macsec_validation_type validate_frames; - bool operational; - bool protect_frames; - bool replay_protect; - u32 replay_window; - struct macsec_tx_sc tx_sc; - struct macsec_rx_sc __rcu *rx_sc; -}; - struct pcpu_secy_stats { struct macsec_dev_stats stats; struct u64_stats_sync syncp; @@ -260,6 +100,7 @@ struct pcpu_secy_stats { * @real_dev: pointer to underlying netdevice * @stats: MACsec device stats * @secys: linked list of SecY's on the underlying device + * @offload: status of offloading on the MACsec device */ struct macsec_dev { struct macsec_secy secy; @@ -267,6 +108,7 @@ struct macsec_dev { struct pcpu_secy_stats __percpu *stats; struct list_head secys; struct gro_cells gro_cells; + enum macsec_offload offload; }; /** @@ -480,6 +322,56 @@ static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) h->short_length = data_len; } +/* Checks if a MACsec interface is being offloaded to an hardware engine */ +static bool macsec_is_offloaded(struct macsec_dev *macsec) +{ + if (macsec->offload == MACSEC_OFFLOAD_PHY) + return true; + + return false; +} + +/* Checks if underlying layers implement MACsec offloading functions. */ +static bool macsec_check_offload(enum macsec_offload offload, + struct macsec_dev *macsec) +{ + if (!macsec || !macsec->real_dev) + return false; + + if (offload == MACSEC_OFFLOAD_PHY) + return macsec->real_dev->phydev && + macsec->real_dev->phydev->macsec_ops; + + return false; +} + +static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, + struct macsec_dev *macsec, + struct macsec_context *ctx) +{ + if (ctx) { + memset(ctx, 0, sizeof(*ctx)); + ctx->offload = offload; + + if (offload == MACSEC_OFFLOAD_PHY) + ctx->phydev = macsec->real_dev->phydev; + } + + return macsec->real_dev->phydev->macsec_ops; +} + +/* Returns a pointer to the MACsec ops struct if any and updates the MACsec + * context device reference if provided. + */ +static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, + struct macsec_context *ctx) +{ + if (!macsec_check_offload(macsec->offload, macsec)) + return NULL; + + return __macsec_get_ops(macsec->offload, macsec, ctx); +} + /* validate MACsec packet according to IEEE 802.1AE-2006 9.12 */ static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len) { @@ -532,6 +424,23 @@ static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) return (struct macsec_eth_header *)skb_mac_header(skb); } +static void __macsec_pn_wrapped(struct macsec_secy *secy, + struct macsec_tx_sa *tx_sa) +{ + pr_debug("PN wrapped, transitioning to !oper\n"); + tx_sa->active = false; + if (secy->protect_frames) + secy->operational = false; +} + +void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) +{ + spin_lock_bh(&tx_sa->lock); + __macsec_pn_wrapped(secy, tx_sa); + spin_unlock_bh(&tx_sa->lock); +} +EXPORT_SYMBOL_GPL(macsec_pn_wrapped); + static u32 tx_sa_update_pn(struct macsec_tx_sa *tx_sa, struct macsec_secy *secy) { u32 pn; @@ -540,12 +449,8 @@ static u32 tx_sa_update_pn(struct macsec_tx_sa *tx_sa, struct macsec_secy *secy) pn = tx_sa->next_pn; tx_sa->next_pn++; - if (tx_sa->next_pn == 0) { - pr_debug("PN wrapped, transitioning to !oper\n"); - tx_sa->active = false; - if (secy->protect_frames) - secy->operational = false; - } + if (tx_sa->next_pn == 0) + __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); return pn; @@ -1029,8 +934,10 @@ static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) return NULL; } -static void handle_not_macsec(struct sk_buff *skb) +static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) { + /* Deliver to the uncontrolled port by default */ + enum rx_handler_result ret = RX_HANDLER_PASS; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; @@ -1045,7 +952,8 @@ static void handle_not_macsec(struct sk_buff *skb) struct sk_buff *nskb; struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); - if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { + if (!macsec_is_offloaded(macsec) && + macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoTag++; u64_stats_update_end(&secy_stats->syncp); @@ -1064,9 +972,17 @@ static void handle_not_macsec(struct sk_buff *skb) secy_stats->stats.InPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); } + + if (netif_running(macsec->secy.netdev) && + macsec_is_offloaded(macsec)) { + ret = RX_HANDLER_EXACT; + goto out; + } } +out: rcu_read_unlock(); + return ret; } static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) @@ -1091,12 +1007,8 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) goto drop_direct; hdr = macsec_ethhdr(skb); - if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) { - handle_not_macsec(skb); - - /* and deliver to the uncontrolled port */ - return RX_HANDLER_PASS; - } + if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) + return handle_not_macsec(skb); skb = skb_unshare(skb, GFP_ATOMIC); *pskb = skb; @@ -1585,6 +1497,7 @@ static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, + [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, }; static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { @@ -1602,6 +1515,44 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { .len = MACSEC_MAX_KEY_LEN, }, }; +static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { + [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, +}; + +/* Offloads an operation to a device driver */ +static int macsec_offload(int (* const func)(struct macsec_context *), + struct macsec_context *ctx) +{ + int ret; + + if (unlikely(!func)) + return 0; + + if (ctx->offload == MACSEC_OFFLOAD_PHY) + mutex_lock(&ctx->phydev->lock); + + /* Phase I: prepare. The drive should fail here if there are going to be + * issues in the commit phase. + */ + ctx->prepare = true; + ret = (*func)(ctx); + if (ret) + goto phy_unlock; + + /* Phase II: commit. This step cannot fail. */ + ctx->prepare = false; + ret = (*func)(ctx); + /* This should never happen: commit is not allowed to fail */ + if (unlikely(ret)) + WARN(1, "MACsec offloading commit failed (%d)\n", ret); + +phy_unlock: + if (ctx->offload == MACSEC_OFFLOAD_PHY) + mutex_unlock(&ctx->phydev->lock); + + return ret; +} + static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) { if (!attrs[MACSEC_ATTR_SA_CONFIG]) @@ -1717,13 +1668,40 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); - nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rx_sa->sc = rx_sc; + + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + err = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.rx_sa = rx_sa; + memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), + MACSEC_KEYID_LEN); + + err = macsec_offload(ops->mdo_add_rxsa, &ctx); + if (err) + goto cleanup; + } + + nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); rtnl_unlock(); return 0; + +cleanup: + kfree(rx_sa); + rtnl_unlock(); + return err; } static bool validate_add_rxsc(struct nlattr **attrs) @@ -1746,6 +1724,8 @@ static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) struct nlattr **attrs = info->attrs; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; + bool was_active; + int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -1771,12 +1751,35 @@ static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(rx_sc); } + was_active = rx_sc->active; if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) rx_sc->active = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.rx_sc = rx_sc; + + ret = macsec_offload(ops->mdo_add_rxsc, &ctx); + if (ret) + goto cleanup; + } + rtnl_unlock(); return 0; + +cleanup: + rx_sc->active = was_active; + rtnl_unlock(); + return ret; } static bool validate_add_txsa(struct nlattr **attrs) @@ -1813,6 +1816,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) struct macsec_tx_sa *tx_sa; unsigned char assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; + bool was_operational; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) @@ -1863,8 +1867,6 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) return err; } - nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); - spin_lock_bh(&tx_sa->lock); tx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); @@ -1872,14 +1874,43 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); + was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa && tx_sa->active) secy->operational = true; + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + err = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.tx_sa = tx_sa; + memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), + MACSEC_KEYID_LEN); + + err = macsec_offload(ops->mdo_add_txsa, &ctx); + if (err) + goto cleanup; + } + + nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); rtnl_unlock(); return 0; + +cleanup: + secy->operational = was_operational; + kfree(tx_sa); + rtnl_unlock(); + return err; } static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) @@ -1892,6 +1923,7 @@ static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; + int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -1915,12 +1947,35 @@ static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) return -EBUSY; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.rx_sa = rx_sa; + + ret = macsec_offload(ops->mdo_del_rxsa, &ctx); + if (ret) + goto cleanup; + } + RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); clear_rx_sa(rx_sa); rtnl_unlock(); return 0; + +cleanup: + rtnl_unlock(); + return ret; } static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) @@ -1931,6 +1986,7 @@ static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) struct macsec_rx_sc *rx_sc; sci_t sci; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; + int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -1957,10 +2013,31 @@ static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) return -ENODEV; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.rx_sc = rx_sc; + ret = macsec_offload(ops->mdo_del_rxsc, &ctx); + if (ret) + goto cleanup; + } + free_rx_sc(rx_sc); rtnl_unlock(); return 0; + +cleanup: + rtnl_unlock(); + return ret; } static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) @@ -1972,6 +2049,7 @@ static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; + int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -1992,12 +2070,35 @@ static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) return -EBUSY; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.tx_sa = tx_sa; + + ret = macsec_offload(ops->mdo_del_txsa, &ctx); + if (ret) + goto cleanup; + } + RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); clear_tx_sa(tx_sa); rtnl_unlock(); return 0; + +cleanup: + rtnl_unlock(); + return ret; } static bool validate_upd_sa(struct nlattr **attrs) @@ -2030,6 +2131,9 @@ static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; + bool was_operational, was_active; + u32 prev_pn = 0; + int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -2050,19 +2154,52 @@ static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&tx_sa->lock); + prev_pn = tx_sa->next_pn; tx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); } + was_active = tx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); + was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa) secy->operational = tx_sa->active; + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.tx_sa = tx_sa; + + ret = macsec_offload(ops->mdo_upd_txsa, &ctx); + if (ret) + goto cleanup; + } + rtnl_unlock(); return 0; + +cleanup: + if (tb_sa[MACSEC_SA_ATTR_PN]) { + spin_lock_bh(&tx_sa->lock); + tx_sa->next_pn = prev_pn; + spin_unlock_bh(&tx_sa->lock); + } + tx_sa->active = was_active; + secy->operational = was_operational; + rtnl_unlock(); + return ret; } static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) @@ -2075,6 +2212,9 @@ static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; + bool was_active; + u32 prev_pn = 0; + int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -2098,15 +2238,46 @@ static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); + prev_pn = rx_sa->next_pn; rx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } + was_active = rx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.sa.assoc_num = assoc_num; + ctx.sa.rx_sa = rx_sa; + + ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); + if (ret) + goto cleanup; + } + rtnl_unlock(); return 0; + +cleanup: + if (tb_sa[MACSEC_SA_ATTR_PN]) { + spin_lock_bh(&rx_sa->lock); + rx_sa->next_pn = prev_pn; + spin_unlock_bh(&rx_sa->lock); + } + rx_sa->active = was_active; + rtnl_unlock(); + return ret; } static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) @@ -2116,6 +2287,9 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; + unsigned int prev_n_rx_sc; + bool was_active; + int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -2133,6 +2307,8 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(rx_sc); } + was_active = rx_sc->active; + prev_n_rx_sc = secy->n_rx_sc; if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); @@ -2142,9 +2318,153 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) rx_sc->active = new; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(netdev_priv(dev))) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.rx_sc = rx_sc; + + ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); + if (ret) + goto cleanup; + } + + rtnl_unlock(); + + return 0; + +cleanup: + secy->n_rx_sc = prev_n_rx_sc; + rx_sc->active = was_active; rtnl_unlock(); + return ret; +} + +static bool macsec_is_configured(struct macsec_dev *macsec) +{ + struct macsec_secy *secy = &macsec->secy; + struct macsec_tx_sc *tx_sc = &secy->tx_sc; + int i; + + if (secy->n_rx_sc > 0) + return true; + + for (i = 0; i < MACSEC_NUM_AN; i++) + if (tx_sc->sa[i]) + return true; + + return false; +} + +static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; + enum macsec_offload offload, prev_offload; + int (*func)(struct macsec_context *ctx); + struct nlattr **attrs = info->attrs; + struct net_device *dev, *loop_dev; + const struct macsec_ops *ops; + struct macsec_context ctx; + struct macsec_dev *macsec; + struct net *loop_net; + int ret; + + if (!attrs[MACSEC_ATTR_IFINDEX]) + return -EINVAL; + + if (!attrs[MACSEC_ATTR_OFFLOAD]) + return -EINVAL; + + if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, + attrs[MACSEC_ATTR_OFFLOAD], + macsec_genl_offload_policy, NULL)) + return -EINVAL; + + dev = get_dev_from_nl(genl_info_net(info), attrs); + if (IS_ERR(dev)) + return PTR_ERR(dev); + macsec = macsec_priv(dev); + + offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); + if (macsec->offload == offload) + return 0; + + /* Check if the offloading mode is supported by the underlying layers */ + if (offload != MACSEC_OFFLOAD_OFF && + !macsec_check_offload(offload, macsec)) + return -EOPNOTSUPP; + + if (offload == MACSEC_OFFLOAD_OFF) + goto skip_limitation; + /* Check the physical interface isn't offloading another interface + * first. + */ + for_each_net(loop_net) { + for_each_netdev(loop_net, loop_dev) { + struct macsec_dev *priv; + + if (!netif_is_macsec(loop_dev)) + continue; + + priv = macsec_priv(loop_dev); + + if (priv->real_dev == macsec->real_dev && + priv->offload != MACSEC_OFFLOAD_OFF) + return -EBUSY; + } + } + +skip_limitation: + /* Check if the net device is busy. */ + if (netif_running(dev)) + return -EBUSY; + + rtnl_lock(); + + prev_offload = macsec->offload; + macsec->offload = offload; + + /* Check if the device already has rules configured: we do not support + * rules migration. + */ + if (macsec_is_configured(macsec)) { + ret = -EBUSY; + goto rollback; + } + + ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? prev_offload : offload, + macsec, &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto rollback; + } + + if (prev_offload == MACSEC_OFFLOAD_OFF) + func = ops->mdo_add_secy; + else + func = ops->mdo_del_secy; + + ctx.secy = &macsec->secy; + ret = macsec_offload(func, &ctx); + if (ret) + goto rollback; + + rtnl_unlock(); return 0; + +rollback: + macsec->offload = prev_offload; + + rtnl_unlock(); + return ret; } static int copy_tx_sa_stats(struct sk_buff *skb, @@ -2408,12 +2728,13 @@ static noinline_for_stack int dump_secy(struct macsec_secy *secy, struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { - struct macsec_rx_sc *rx_sc; + struct macsec_dev *macsec = netdev_priv(dev); struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *txsa_list, *rxsc_list; - int i, j; - void *hdr; + struct macsec_rx_sc *rx_sc; struct nlattr *attr; + void *hdr; + int i, j; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); @@ -2425,6 +2746,13 @@ dump_secy(struct macsec_secy *secy, struct net_device *dev, if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; + attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); + if (!attr) + goto nla_put_failure; + if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) + goto nla_put_failure; + nla_nest_end(skb, attr); + if (nla_put_secy(secy, skb)) goto nla_put_failure; @@ -2690,6 +3018,12 @@ static const struct genl_ops macsec_genl_ops[] = { .doit = macsec_upd_rxsa, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MACSEC_CMD_UPD_OFFLOAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = macsec_upd_offload, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family macsec_fam __ro_after_init = { @@ -2712,6 +3046,11 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct pcpu_secy_stats *secy_stats; int ret, len; + if (macsec_is_offloaded(netdev_priv(dev))) { + skb->dev = macsec->real_dev; + return dev_queue_xmit(skb); + } + /* 10.5 */ if (!secy->protect_frames) { secy_stats = this_cpu_ptr(macsec->stats); @@ -2825,6 +3164,22 @@ static int macsec_dev_open(struct net_device *dev) goto clear_allmulti; } + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + err = -EOPNOTSUPP; + goto clear_allmulti; + } + + err = macsec_offload(ops->mdo_dev_open, &ctx); + if (err) + goto clear_allmulti; + } + if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); @@ -2845,6 +3200,16 @@ static int macsec_dev_stop(struct net_device *dev) netif_carrier_off(dev); + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(macsec, &ctx); + if (ops) + macsec_offload(ops->mdo_dev_stop, &ctx); + } + dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); @@ -3076,6 +3441,11 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + struct macsec_dev *macsec = macsec_priv(dev); + struct macsec_tx_sa tx_sc; + struct macsec_secy secy; + int ret; + if (!data) return 0; @@ -3085,7 +3455,41 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], data[IFLA_MACSEC_PORT]) return -EINVAL; - return macsec_changelink_common(dev, data); + /* Keep a copy of unmodified secy and tx_sc, in case the offload + * propagation fails, to revert macsec_changelink_common. + */ + memcpy(&secy, &macsec->secy, sizeof(secy)); + memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); + + ret = macsec_changelink_common(dev, data); + if (ret) + return ret; + + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + int ret; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (!ops) { + ret = -EOPNOTSUPP; + goto cleanup; + } + + ctx.secy = &macsec->secy; + ret = macsec_offload(ops->mdo_upd_secy, &ctx); + if (ret) + goto cleanup; + } + + return 0; + +cleanup: + memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); + memcpy(&macsec->secy, &secy, sizeof(secy)); + + return ret; } static void macsec_del_dev(struct macsec_dev *macsec) @@ -3128,6 +3532,18 @@ static void macsec_dellink(struct net_device *dev, struct list_head *head) struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); + /* If h/w offloading is available, propagate to the device */ + if (macsec_is_offloaded(macsec)) { + const struct macsec_ops *ops; + struct macsec_context ctx; + + ops = macsec_get_ops(netdev_priv(dev), &ctx); + if (ops) { + ctx.secy = &macsec->secy; + macsec_offload(ops->mdo_del_secy, &ctx); + } + } + macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { @@ -3239,6 +3655,9 @@ static int macsec_newlink(struct net *net, struct net_device *dev, macsec->real_dev = real_dev; + /* MACsec offloading is off by default */ + macsec->offload = MACSEC_OFFLOAD_OFF; + if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); dev->mtu = real_dev->mtu - icv_len - macsec_extra_len(true); diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d066cf58c926..81aa7adf4801 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -513,10 +513,11 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { - const struct ethhdr *eth = (void *)skb->data; + const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { + skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 059711edfc61..b53fbc06e104 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -53,7 +53,7 @@ static ssize_t nsim_dev_take_snapshot_write(struct file *file, get_random_bytes(dummy_data, NSIM_DEV_DUMMY_REGION_SIZE); - id = devlink_region_shapshot_id_get(priv_to_devlink(nsim_dev)); + id = devlink_region_snapshot_id_get(priv_to_devlink(nsim_dev)); err = devlink_region_snapshot_create(nsim_dev->dummy_region, dummy_data, id, kfree); if (err) { @@ -270,7 +270,7 @@ struct nsim_trap_data { }; /* All driver-specific traps must be documented in - * Documentation/networking/devlink-trap-netdevsim.rst + * Documentation/networking/devlink/netdevsim.rst */ enum { NSIM_TRAP_ID_BASE = DEVLINK_TRAP_GENERIC_ID_MAX, diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index b5df308b4e33..f32d56ac3e80 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -14,6 +14,12 @@ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ +#include <linux/in6.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/rhashtable.h> +#include <linux/spinlock_types.h> +#include <linux/types.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> @@ -36,6 +42,48 @@ struct nsim_fib_data { struct notifier_block fib_nb; struct nsim_per_fib_data ipv4; struct nsim_per_fib_data ipv6; + struct rhashtable fib_rt_ht; + struct list_head fib_rt_list; + spinlock_t fib_lock; /* Protects hashtable, list and accounting */ + struct devlink *devlink; +}; + +struct nsim_fib_rt_key { + unsigned char addr[sizeof(struct in6_addr)]; + unsigned char prefix_len; + int family; + u32 tb_id; +}; + +struct nsim_fib_rt { + struct nsim_fib_rt_key key; + struct rhash_head ht_node; + struct list_head list; /* Member of fib_rt_list */ +}; + +struct nsim_fib4_rt { + struct nsim_fib_rt common; + struct fib_info *fi; + u8 tos; + u8 type; +}; + +struct nsim_fib6_rt { + struct nsim_fib_rt common; + struct list_head nh_list; + unsigned int nhs; +}; + +struct nsim_fib6_rt_nh { + struct list_head list; /* Member of nh_list */ + struct fib6_info *rt; +}; + +static const struct rhashtable_params nsim_fib_rt_ht_params = { + .key_offset = offsetof(struct nsim_fib_rt, key), + .head_offset = offsetof(struct nsim_fib_rt, ht_node), + .key_len = sizeof(struct nsim_fib_rt_key), + .automatic_shrinking = true, }; u64 nsim_fib_get_val(struct nsim_fib_data *fib_data, @@ -144,18 +192,556 @@ static int nsim_fib_account(struct nsim_fib_entry *entry, bool add, return err; } +static void nsim_fib_rt_init(struct nsim_fib_data *data, + struct nsim_fib_rt *fib_rt, const void *addr, + size_t addr_len, unsigned int prefix_len, + int family, u32 tb_id) +{ + memcpy(fib_rt->key.addr, addr, addr_len); + fib_rt->key.prefix_len = prefix_len; + fib_rt->key.family = family; + fib_rt->key.tb_id = tb_id; + list_add(&fib_rt->list, &data->fib_rt_list); +} + +static void nsim_fib_rt_fini(struct nsim_fib_rt *fib_rt) +{ + list_del(&fib_rt->list); +} + +static struct nsim_fib_rt *nsim_fib_rt_lookup(struct rhashtable *fib_rt_ht, + const void *addr, size_t addr_len, + unsigned int prefix_len, + int family, u32 tb_id) +{ + struct nsim_fib_rt_key key; + + memset(&key, 0, sizeof(key)); + memcpy(key.addr, addr, addr_len); + key.prefix_len = prefix_len; + key.family = family; + key.tb_id = tb_id; + + return rhashtable_lookup_fast(fib_rt_ht, &key, nsim_fib_rt_ht_params); +} + +static struct nsim_fib4_rt * +nsim_fib4_rt_create(struct nsim_fib_data *data, + struct fib_entry_notifier_info *fen_info) +{ + struct nsim_fib4_rt *fib4_rt; + + fib4_rt = kzalloc(sizeof(*fib4_rt), GFP_ATOMIC); + if (!fib4_rt) + return NULL; + + nsim_fib_rt_init(data, &fib4_rt->common, &fen_info->dst, sizeof(u32), + fen_info->dst_len, AF_INET, fen_info->tb_id); + + fib4_rt->fi = fen_info->fi; + fib_info_hold(fib4_rt->fi); + fib4_rt->tos = fen_info->tos; + fib4_rt->type = fen_info->type; + + return fib4_rt; +} + +static void nsim_fib4_rt_destroy(struct nsim_fib4_rt *fib4_rt) +{ + fib_info_put(fib4_rt->fi); + nsim_fib_rt_fini(&fib4_rt->common); + kfree(fib4_rt); +} + +static struct nsim_fib4_rt * +nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht, + const struct fib_entry_notifier_info *fen_info) +{ + struct nsim_fib_rt *fib_rt; + + fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &fen_info->dst, sizeof(u32), + fen_info->dst_len, AF_INET, + fen_info->tb_id); + if (!fib_rt) + return NULL; + + return container_of(fib_rt, struct nsim_fib4_rt, common); +} + +static void nsim_fib4_rt_hw_flags_set(struct net *net, + const struct nsim_fib4_rt *fib4_rt, + bool trap) +{ + u32 *p_dst = (u32 *) fib4_rt->common.key.addr; + int dst_len = fib4_rt->common.key.prefix_len; + struct fib_rt_info fri; + + fri.fi = fib4_rt->fi; + fri.tb_id = fib4_rt->common.key.tb_id; + fri.dst = cpu_to_be32(*p_dst); + fri.dst_len = dst_len; + fri.tos = fib4_rt->tos; + fri.type = fib4_rt->type; + fri.offload = false; + fri.trap = trap; + fib_alias_hw_flags_set(net, &fri); +} + +static int nsim_fib4_rt_add(struct nsim_fib_data *data, + struct nsim_fib4_rt *fib4_rt, + struct netlink_ext_ack *extack) +{ + struct net *net = devlink_net(data->devlink); + int err; + + err = nsim_fib_account(&data->ipv4.fib, true, extack); + if (err) + return err; + + err = rhashtable_insert_fast(&data->fib_rt_ht, + &fib4_rt->common.ht_node, + nsim_fib_rt_ht_params); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to insert IPv4 route"); + goto err_fib_dismiss; + } + + nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); + + return 0; + +err_fib_dismiss: + nsim_fib_account(&data->ipv4.fib, false, extack); + return err; +} + +static int nsim_fib4_rt_replace(struct nsim_fib_data *data, + struct nsim_fib4_rt *fib4_rt, + struct nsim_fib4_rt *fib4_rt_old, + struct netlink_ext_ack *extack) +{ + struct net *net = devlink_net(data->devlink); + int err; + + /* We are replacing a route, so no need to change the accounting. */ + err = rhashtable_replace_fast(&data->fib_rt_ht, + &fib4_rt_old->common.ht_node, + &fib4_rt->common.ht_node, + nsim_fib_rt_ht_params); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to replace IPv4 route"); + return err; + } + + nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); + + nsim_fib4_rt_hw_flags_set(net, fib4_rt_old, false); + nsim_fib4_rt_destroy(fib4_rt_old); + + return 0; +} + +static int nsim_fib4_rt_insert(struct nsim_fib_data *data, + struct fib_entry_notifier_info *fen_info) +{ + struct netlink_ext_ack *extack = fen_info->info.extack; + struct nsim_fib4_rt *fib4_rt, *fib4_rt_old; + int err; + + fib4_rt = nsim_fib4_rt_create(data, fen_info); + if (!fib4_rt) + return -ENOMEM; + + fib4_rt_old = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); + if (!fib4_rt_old) + err = nsim_fib4_rt_add(data, fib4_rt, extack); + else + err = nsim_fib4_rt_replace(data, fib4_rt, fib4_rt_old, extack); + + if (err) + nsim_fib4_rt_destroy(fib4_rt); + + return err; +} + +static void nsim_fib4_rt_remove(struct nsim_fib_data *data, + const struct fib_entry_notifier_info *fen_info) +{ + struct netlink_ext_ack *extack = fen_info->info.extack; + struct nsim_fib4_rt *fib4_rt; + + fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); + if (WARN_ON_ONCE(!fib4_rt)) + return; + + rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, + nsim_fib_rt_ht_params); + nsim_fib_account(&data->ipv4.fib, false, extack); + nsim_fib4_rt_destroy(fib4_rt); +} + +static int nsim_fib4_event(struct nsim_fib_data *data, + struct fib_notifier_info *info, + unsigned long event) +{ + struct fib_entry_notifier_info *fen_info; + int err = 0; + + fen_info = container_of(info, struct fib_entry_notifier_info, info); + + if (fen_info->fi->nh) { + NL_SET_ERR_MSG_MOD(info->extack, "IPv4 route with nexthop objects is not supported"); + return 0; + } + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: + err = nsim_fib4_rt_insert(data, fen_info); + break; + case FIB_EVENT_ENTRY_DEL: + nsim_fib4_rt_remove(data, fen_info); + break; + default: + break; + } + + return err; +} + +static struct nsim_fib6_rt_nh * +nsim_fib6_rt_nh_find(const struct nsim_fib6_rt *fib6_rt, + const struct fib6_info *rt) +{ + struct nsim_fib6_rt_nh *fib6_rt_nh; + + list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) { + if (fib6_rt_nh->rt == rt) + return fib6_rt_nh; + } + + return NULL; +} + +static int nsim_fib6_rt_nh_add(struct nsim_fib6_rt *fib6_rt, + struct fib6_info *rt) +{ + struct nsim_fib6_rt_nh *fib6_rt_nh; + + fib6_rt_nh = kzalloc(sizeof(*fib6_rt_nh), GFP_ATOMIC); + if (!fib6_rt_nh) + return -ENOMEM; + + fib6_info_hold(rt); + fib6_rt_nh->rt = rt; + list_add_tail(&fib6_rt_nh->list, &fib6_rt->nh_list); + fib6_rt->nhs++; + + return 0; +} + +static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt, + const struct fib6_info *rt) +{ + struct nsim_fib6_rt_nh *fib6_rt_nh; + + fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt); + if (WARN_ON_ONCE(!fib6_rt_nh)) + return; + + fib6_rt->nhs--; + list_del(&fib6_rt_nh->list); +#if IS_ENABLED(CONFIG_IPV6) + fib6_info_release(fib6_rt_nh->rt); +#endif + kfree(fib6_rt_nh); +} + +static struct nsim_fib6_rt * +nsim_fib6_rt_create(struct nsim_fib_data *data, + struct fib6_entry_notifier_info *fen6_info) +{ + struct fib6_info *iter, *rt = fen6_info->rt; + struct nsim_fib6_rt *fib6_rt; + int i = 0; + int err; + + fib6_rt = kzalloc(sizeof(*fib6_rt), GFP_ATOMIC); + if (!fib6_rt) + return ERR_PTR(-ENOMEM); + + nsim_fib_rt_init(data, &fib6_rt->common, &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, + rt->fib6_table->tb6_id); + + /* We consider a multipath IPv6 route as one entry, but it can be made + * up from several fib6_info structs (one for each nexthop), so we + * add them all to the same list under the entry. + */ + INIT_LIST_HEAD(&fib6_rt->nh_list); + + err = nsim_fib6_rt_nh_add(fib6_rt, rt); + if (err) + goto err_fib_rt_fini; + + if (!fen6_info->nsiblings) + return fib6_rt; + + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + if (i == fen6_info->nsiblings) + break; + + err = nsim_fib6_rt_nh_add(fib6_rt, iter); + if (err) + goto err_fib6_rt_nh_del; + i++; + } + WARN_ON_ONCE(i != fen6_info->nsiblings); + + return fib6_rt; + +err_fib6_rt_nh_del: + list_for_each_entry_continue_reverse(iter, &rt->fib6_siblings, + fib6_siblings) + nsim_fib6_rt_nh_del(fib6_rt, iter); + nsim_fib6_rt_nh_del(fib6_rt, rt); +err_fib_rt_fini: + nsim_fib_rt_fini(&fib6_rt->common); + kfree(fib6_rt); + return ERR_PTR(err); +} + +static void nsim_fib6_rt_destroy(struct nsim_fib6_rt *fib6_rt) +{ + struct nsim_fib6_rt_nh *iter, *tmp; + + list_for_each_entry_safe(iter, tmp, &fib6_rt->nh_list, list) + nsim_fib6_rt_nh_del(fib6_rt, iter->rt); + WARN_ON_ONCE(!list_empty(&fib6_rt->nh_list)); + nsim_fib_rt_fini(&fib6_rt->common); + kfree(fib6_rt); +} + +static struct nsim_fib6_rt * +nsim_fib6_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib6_info *rt) +{ + struct nsim_fib_rt *fib_rt; + + fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &rt->fib6_dst.addr, + sizeof(rt->fib6_dst.addr), + rt->fib6_dst.plen, AF_INET6, + rt->fib6_table->tb6_id); + if (!fib_rt) + return NULL; + + return container_of(fib_rt, struct nsim_fib6_rt, common); +} + +static int nsim_fib6_rt_append(struct nsim_fib_data *data, + struct fib6_entry_notifier_info *fen6_info) +{ + struct fib6_info *iter, *rt = fen6_info->rt; + struct nsim_fib6_rt *fib6_rt; + int i = 0; + int err; + + fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); + if (WARN_ON_ONCE(!fib6_rt)) + return -EINVAL; + + err = nsim_fib6_rt_nh_add(fib6_rt, rt); + if (err) + return err; + rt->trap = true; + + if (!fen6_info->nsiblings) + return 0; + + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + if (i == fen6_info->nsiblings) + break; + + err = nsim_fib6_rt_nh_add(fib6_rt, iter); + if (err) + goto err_fib6_rt_nh_del; + iter->trap = true; + i++; + } + WARN_ON_ONCE(i != fen6_info->nsiblings); + + return 0; + +err_fib6_rt_nh_del: + list_for_each_entry_continue_reverse(iter, &rt->fib6_siblings, + fib6_siblings) { + iter->trap = false; + nsim_fib6_rt_nh_del(fib6_rt, iter); + } + rt->trap = false; + nsim_fib6_rt_nh_del(fib6_rt, rt); + return err; +} + +static void nsim_fib6_rt_hw_flags_set(const struct nsim_fib6_rt *fib6_rt, + bool trap) +{ + struct nsim_fib6_rt_nh *fib6_rt_nh; + + list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) + fib6_info_hw_flags_set(fib6_rt_nh->rt, false, trap); +} + +static int nsim_fib6_rt_add(struct nsim_fib_data *data, + struct nsim_fib6_rt *fib6_rt, + struct netlink_ext_ack *extack) +{ + int err; + + err = nsim_fib_account(&data->ipv6.fib, true, extack); + if (err) + return err; + + err = rhashtable_insert_fast(&data->fib_rt_ht, + &fib6_rt->common.ht_node, + nsim_fib_rt_ht_params); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to insert IPv6 route"); + goto err_fib_dismiss; + } + + nsim_fib6_rt_hw_flags_set(fib6_rt, true); + + return 0; + +err_fib_dismiss: + nsim_fib_account(&data->ipv6.fib, false, extack); + return err; +} + +static int nsim_fib6_rt_replace(struct nsim_fib_data *data, + struct nsim_fib6_rt *fib6_rt, + struct nsim_fib6_rt *fib6_rt_old, + struct netlink_ext_ack *extack) +{ + int err; + + /* We are replacing a route, so no need to change the accounting. */ + err = rhashtable_replace_fast(&data->fib_rt_ht, + &fib6_rt_old->common.ht_node, + &fib6_rt->common.ht_node, + nsim_fib_rt_ht_params); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to replace IPv6 route"); + return err; + } + + nsim_fib6_rt_hw_flags_set(fib6_rt, true); + + nsim_fib6_rt_hw_flags_set(fib6_rt_old, false); + nsim_fib6_rt_destroy(fib6_rt_old); + + return 0; +} + +static int nsim_fib6_rt_insert(struct nsim_fib_data *data, + struct fib6_entry_notifier_info *fen6_info) +{ + struct netlink_ext_ack *extack = fen6_info->info.extack; + struct nsim_fib6_rt *fib6_rt, *fib6_rt_old; + int err; + + fib6_rt = nsim_fib6_rt_create(data, fen6_info); + if (IS_ERR(fib6_rt)) + return PTR_ERR(fib6_rt); + + fib6_rt_old = nsim_fib6_rt_lookup(&data->fib_rt_ht, fen6_info->rt); + if (!fib6_rt_old) + err = nsim_fib6_rt_add(data, fib6_rt, extack); + else + err = nsim_fib6_rt_replace(data, fib6_rt, fib6_rt_old, extack); + + if (err) + nsim_fib6_rt_destroy(fib6_rt); + + return err; +} + +static void +nsim_fib6_rt_remove(struct nsim_fib_data *data, + const struct fib6_entry_notifier_info *fen6_info) +{ + struct netlink_ext_ack *extack = fen6_info->info.extack; + struct nsim_fib6_rt *fib6_rt; + + /* Multipath routes are first added to the FIB trie and only then + * notified. If we vetoed the addition, we will get a delete + * notification for a route we do not have. Therefore, do not warn if + * route was not found. + */ + fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, fen6_info->rt); + if (!fib6_rt) + return; + + /* If not all the nexthops are deleted, then only reduce the nexthop + * group. + */ + if (fen6_info->nsiblings + 1 != fib6_rt->nhs) { + nsim_fib6_rt_nh_del(fib6_rt, fen6_info->rt); + return; + } + + rhashtable_remove_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, + nsim_fib_rt_ht_params); + nsim_fib_account(&data->ipv6.fib, false, extack); + nsim_fib6_rt_destroy(fib6_rt); +} + +static int nsim_fib6_event(struct nsim_fib_data *data, + struct fib_notifier_info *info, + unsigned long event) +{ + struct fib6_entry_notifier_info *fen6_info; + int err = 0; + + fen6_info = container_of(info, struct fib6_entry_notifier_info, info); + + if (fen6_info->rt->nh) { + NL_SET_ERR_MSG_MOD(info->extack, "IPv6 route with nexthop objects is not supported"); + return 0; + } + + if (fen6_info->rt->fib6_src.plen) { + NL_SET_ERR_MSG_MOD(info->extack, "IPv6 source-specific route is not supported"); + return 0; + } + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: + err = nsim_fib6_rt_insert(data, fen6_info); + break; + case FIB_EVENT_ENTRY_APPEND: + err = nsim_fib6_rt_append(data, fen6_info); + break; + case FIB_EVENT_ENTRY_DEL: + nsim_fib6_rt_remove(data, fen6_info); + break; + default: + break; + } + + return err; +} + static int nsim_fib_event(struct nsim_fib_data *data, - struct fib_notifier_info *info, bool add) + struct fib_notifier_info *info, unsigned long event) { - struct netlink_ext_ack *extack = info->extack; int err = 0; switch (info->family) { case AF_INET: - err = nsim_fib_account(&data->ipv4.fib, add, extack); + err = nsim_fib4_event(data, info, event); break; case AF_INET6: - err = nsim_fib_account(&data->ipv6.fib, add, extack); + err = nsim_fib6_event(data, info, event); break; } @@ -170,6 +756,9 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, struct fib_notifier_info *info = ptr; int err = 0; + /* IPv6 routes can be added via RAs from softIRQ. */ + spin_lock_bh(&data->fib_lock); + switch (event) { case FIB_EVENT_RULE_ADD: /* fall through */ case FIB_EVENT_RULE_DEL: @@ -178,23 +767,74 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, break; case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_APPEND: /* fall through */ case FIB_EVENT_ENTRY_DEL: - err = nsim_fib_event(data, info, event != FIB_EVENT_ENTRY_DEL); + err = nsim_fib_event(data, info, event); break; } + spin_unlock_bh(&data->fib_lock); + return notifier_from_errno(err); } +static void nsim_fib4_rt_free(struct nsim_fib_rt *fib_rt, + struct nsim_fib_data *data) +{ + struct devlink *devlink = data->devlink; + struct nsim_fib4_rt *fib4_rt; + + fib4_rt = container_of(fib_rt, struct nsim_fib4_rt, common); + nsim_fib4_rt_hw_flags_set(devlink_net(devlink), fib4_rt, false); + nsim_fib_account(&data->ipv4.fib, false, NULL); + nsim_fib4_rt_destroy(fib4_rt); +} + +static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt, + struct nsim_fib_data *data) +{ + struct nsim_fib6_rt *fib6_rt; + + fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common); + nsim_fib6_rt_hw_flags_set(fib6_rt, false); + nsim_fib_account(&data->ipv6.fib, false, NULL); + nsim_fib6_rt_destroy(fib6_rt); +} + +static void nsim_fib_rt_free(void *ptr, void *arg) +{ + struct nsim_fib_rt *fib_rt = ptr; + struct nsim_fib_data *data = arg; + + switch (fib_rt->key.family) { + case AF_INET: + nsim_fib4_rt_free(fib_rt, data); + break; + case AF_INET6: + nsim_fib6_rt_free(fib_rt, data); + break; + default: + WARN_ON_ONCE(1); + } +} + /* inconsistent dump, trying again */ static void nsim_fib_dump_inconsistent(struct notifier_block *nb) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); + struct nsim_fib_rt *fib_rt, *fib_rt_tmp; + + /* The notifier block is still not registered, so we do not need to + * take any locks here. + */ + list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { + rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, + nsim_fib_rt_ht_params); + nsim_fib_rt_free(fib_rt, data); + } - data->ipv4.fib.num = 0ULL; data->ipv4.rules.num = 0ULL; - data->ipv6.fib.num = 0ULL; data->ipv6.rules.num = 0ULL; } @@ -255,6 +895,13 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); + data->devlink = devlink; + + spin_lock_init(&data->fib_lock); + INIT_LIST_HEAD(&data->fib_rt_list); + err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params); + if (err) + goto err_data_free; nsim_fib_set_max_all(data, devlink); @@ -263,7 +910,7 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, nsim_fib_dump_inconsistent, extack); if (err) { pr_err("Failed to register fib notifier\n"); - goto err_out; + goto err_rhashtable_destroy; } devlink_resource_occ_get_register(devlink, @@ -284,7 +931,10 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, data); return data; -err_out: +err_rhashtable_destroy: + rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, + data); +err_data_free: kfree(data); return ERR_PTR(err); } @@ -300,5 +950,8 @@ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data) devlink_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB); unregister_fib_notifier(devlink_net(devlink), &data->fib_nb); + rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, + data); + WARN_ON_ONCE(!list_empty(&data->fib_rt_list)); kfree(data); } diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 2e016271e126..71fc778ce398 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -346,14 +346,15 @@ config DAVICOM_PHY Currently supports dm9161e and dm9131 config DP83822_PHY - tristate "Texas Instruments DP83822 PHY" + tristate "Texas Instruments DP83822/825/826 PHYs" ---help--- - Supports the DP83822 PHY. + Supports the DP83822, DP83825I, DP83825CM, DP83825CS, DP83825S, + DP83826C and DP83826NC PHYs. config DP83TC811_PHY - tristate "Texas Instruments DP83TC822 PHY" + tristate "Texas Instruments DP83TC811 PHY" ---help--- - Supports the DP83TC822 PHY. + Supports the DP83TC811 PHY. config DP83848_PHY tristate "Texas Instruments DP83848 PHY" @@ -437,6 +438,9 @@ config MICROCHIP_T1_PHY config MICROSEMI_PHY tristate "Microsemi PHYs" + depends on MACSEC || MACSEC=n + select CRYPTO_AES + select CRYPTO_ECB ---help--- Currently supports VSC8514, VSC8530, VSC8531, VSC8540 and VSC8541 PHYs diff --git a/drivers/net/phy/adin.c b/drivers/net/phy/adin.c index cf5a391c93e6..c7eabe4382fb 100644 --- a/drivers/net/phy/adin.c +++ b/drivers/net/phy/adin.c @@ -145,7 +145,7 @@ struct adin_clause45_mmd_map { u16 adin_regnum; }; -static struct adin_clause45_mmd_map adin_clause45_mmd_map[] = { +static const struct adin_clause45_mmd_map adin_clause45_mmd_map[] = { { MDIO_MMD_PCS, MDIO_PCS_EEE_ABLE, ADIN1300_EEE_CAP_REG }, { MDIO_MMD_AN, MDIO_AN_EEE_LPABLE, ADIN1300_EEE_LPABLE_REG }, { MDIO_MMD_AN, MDIO_AN_EEE_ADV, ADIN1300_EEE_ADV_REG }, @@ -159,7 +159,7 @@ struct adin_hw_stat { u16 reg2; }; -static struct adin_hw_stat adin_hw_stats[] = { +static const struct adin_hw_stat adin_hw_stats[] = { { "total_frames_checked_count", 0x940A, 0x940B }, /* hi + lo */ { "length_error_frames_count", 0x940C }, { "alignment_error_frames_count", 0x940D }, @@ -456,7 +456,7 @@ static int adin_phy_config_intr(struct phy_device *phydev) static int adin_cl45_to_adin_reg(struct phy_device *phydev, int devad, u16 cl45_regnum) { - struct adin_clause45_mmd_map *m; + const struct adin_clause45_mmd_map *m; int i; if (devad == MDIO_MMD_VEND1) @@ -625,7 +625,7 @@ static int adin_soft_reset(struct phy_device *phydev) if (rc < 0) return rc; - msleep(10); + msleep(20); /* If we get a read error something may be wrong */ rc = phy_read_mmd(phydev, MDIO_MMD_VEND1, @@ -650,7 +650,7 @@ static void adin_get_strings(struct phy_device *phydev, u8 *data) } static int adin_read_mmd_stat_regs(struct phy_device *phydev, - struct adin_hw_stat *stat, + const struct adin_hw_stat *stat, u32 *val) { int ret; @@ -676,7 +676,7 @@ static int adin_read_mmd_stat_regs(struct phy_device *phydev, static u64 adin_get_stat(struct phy_device *phydev, int i) { - struct adin_hw_stat *stat = &adin_hw_stats[i]; + const struct adin_hw_stat *stat = &adin_hw_stats[i]; struct adin_priv *priv = phydev->priv; u32 val; int ret; diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index 975789d9349d..31927b2c7d5a 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -358,9 +358,11 @@ static int aqr107_read_status(struct phy_device *phydev) switch (FIELD_GET(MDIO_PHYXS_VEND_IF_STATUS_TYPE_MASK, val)) { case MDIO_PHYXS_VEND_IF_STATUS_TYPE_KR: - case MDIO_PHYXS_VEND_IF_STATUS_TYPE_XFI: phydev->interface = PHY_INTERFACE_MODE_10GKR; break; + case MDIO_PHYXS_VEND_IF_STATUS_TYPE_XFI: + phydev->interface = PHY_INTERFACE_MODE_10GBASER; + break; case MDIO_PHYXS_VEND_IF_STATUS_TYPE_USXGMII: phydev->interface = PHY_INTERFACE_MODE_USXGMII; break; @@ -493,7 +495,8 @@ static int aqr107_config_init(struct phy_device *phydev) phydev->interface != PHY_INTERFACE_MODE_2500BASEX && phydev->interface != PHY_INTERFACE_MODE_XGMII && phydev->interface != PHY_INTERFACE_MODE_USXGMII && - phydev->interface != PHY_INTERFACE_MODE_10GKR) + phydev->interface != PHY_INTERFACE_MODE_10GKR && + phydev->interface != PHY_INTERFACE_MODE_10GBASER) return -ENODEV; WARN(phydev->interface == PHY_INTERFACE_MODE_XGMII, diff --git a/drivers/net/phy/bcm84881.c b/drivers/net/phy/bcm84881.c index db59911b9b3c..14d55a77eb28 100644 --- a/drivers/net/phy/bcm84881.c +++ b/drivers/net/phy/bcm84881.c @@ -53,7 +53,7 @@ static int bcm84881_config_init(struct phy_device *phydev) switch (phydev->interface) { case PHY_INTERFACE_MODE_SGMII: case PHY_INTERFACE_MODE_2500BASEX: - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: break; default: return -ENODEV; @@ -218,7 +218,7 @@ static int bcm84881_read_status(struct phy_device *phydev) if (mode == 1 || mode == 2) phydev->interface = PHY_INTERFACE_MODE_SGMII; else if (mode == 3) - phydev->interface = PHY_INTERFACE_MODE_10GKR; + phydev->interface = PHY_INTERFACE_MODE_10GBASER; else if (mode == 4) phydev->interface = PHY_INTERFACE_MODE_2500BASEX; switch (mode & 7) { diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 8a4b1d167ce2..fe9aa3ad52a7 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Driver for the Texas Instruments DP83822 PHY +/* Driver for the Texas Instruments DP83822, DP83825 and DP83826 PHYs. * * Copyright (C) 2017 Texas Instruments Inc. */ @@ -15,7 +14,12 @@ #include <linux/netdevice.h> #define DP83822_PHY_ID 0x2000a240 +#define DP83825S_PHY_ID 0x2000a140 #define DP83825I_PHY_ID 0x2000a150 +#define DP83825CM_PHY_ID 0x2000a160 +#define DP83825CS_PHY_ID 0x2000a170 +#define DP83826C_PHY_ID 0x2000a130 +#define DP83826NC_PHY_ID 0x2000a110 #define DP83822_DEVADDR 0x1f @@ -319,12 +323,22 @@ static int dp83822_resume(struct phy_device *phydev) static struct phy_driver dp83822_driver[] = { DP83822_PHY_DRIVER(DP83822_PHY_ID, "TI DP83822"), DP83822_PHY_DRIVER(DP83825I_PHY_ID, "TI DP83825I"), + DP83822_PHY_DRIVER(DP83826C_PHY_ID, "TI DP83826C"), + DP83822_PHY_DRIVER(DP83826NC_PHY_ID, "TI DP83826NC"), + DP83822_PHY_DRIVER(DP83825S_PHY_ID, "TI DP83825S"), + DP83822_PHY_DRIVER(DP83825CM_PHY_ID, "TI DP83825M"), + DP83822_PHY_DRIVER(DP83825CS_PHY_ID, "TI DP83825CS"), }; module_phy_driver(dp83822_driver); static struct mdio_device_id __maybe_unused dp83822_tbl[] = { { DP83822_PHY_ID, 0xfffffff0 }, { DP83825I_PHY_ID, 0xfffffff0 }, + { DP83826C_PHY_ID, 0xfffffff0 }, + { DP83826NC_PHY_ID, 0xfffffff0 }, + { DP83825S_PHY_ID, 0xfffffff0 }, + { DP83825CM_PHY_ID, 0xfffffff0 }, + { DP83825CS_PHY_ID, 0xfffffff0 }, { }, }; MODULE_DEVICE_TABLE(mdio, dp83822_tbl); diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index adda0d0eab80..967f57ed0b65 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -99,6 +99,7 @@ #define DP83867_PHYCR_TX_FIFO_DEPTH_MASK GENMASK(15, 14) #define DP83867_PHYCR_RX_FIFO_DEPTH_MASK GENMASK(13, 12) #define DP83867_PHYCR_RESERVED_MASK BIT(11) +#define DP83867_PHYCR_FORCE_LINK_GOOD BIT(10) /* RGMIIDCTL bits */ #define DP83867_RGMII_TX_CLK_DELAY_MAX 0xf @@ -635,7 +636,12 @@ static int dp83867_phy_reset(struct phy_device *phydev) usleep_range(10, 20); - return 0; + /* After reset FORCE_LINK_GOOD bit is set. Although the + * default value should be unset. Disable FORCE_LINK_GOOD + * for the phy to work properly. + */ + return phy_modify(phydev, MII_DP83867_PHYCTRL, + DP83867_PHYCR_FORCE_LINK_GOOD, 0); } static struct phy_driver dp83867_driver[] = { diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 7c5265fd2b94..4a3d34f40cb9 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -210,18 +210,15 @@ static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) * Linux device associated with it, we simply have obtain * the GPIO descriptor from the device tree like this. */ - gpiod = gpiod_get_from_of_node(fixed_link_node, "link-gpios", 0, - GPIOD_IN, "mdio"); - of_node_put(fixed_link_node); - if (IS_ERR(gpiod)) { - if (PTR_ERR(gpiod) == -EPROBE_DEFER) - return gpiod; - + gpiod = fwnode_gpiod_get_index(of_fwnode_handle(fixed_link_node), + "link", 0, GPIOD_IN, "mdio"); + if (IS_ERR(gpiod) && PTR_ERR(gpiod) != -EPROBE_DEFER) { if (PTR_ERR(gpiod) != -ENOENT) pr_err("error getting GPIO for fixed link %pOF, proceed without\n", fixed_link_node); gpiod = NULL; } + of_node_put(fixed_link_node); return gpiod; } diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 512f27b0b5cd..64c9f3bba2cd 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -216,7 +216,7 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id) sfp_parse_support(phydev->sfp_bus, id, support); iface = sfp_select_interface(phydev->sfp_bus, support); - if (iface != PHY_INTERFACE_MODE_10GKR) { + if (iface != PHY_INTERFACE_MODE_10GBASER) { dev_err(&phydev->mdio.dev, "incompatible SFP module inserted\n"); return -EINVAL; } @@ -304,7 +304,7 @@ static int mv3310_config_init(struct phy_device *phydev) phydev->interface != PHY_INTERFACE_MODE_2500BASEX && phydev->interface != PHY_INTERFACE_MODE_XAUI && phydev->interface != PHY_INTERFACE_MODE_RXAUI && - phydev->interface != PHY_INTERFACE_MODE_10GKR) + phydev->interface != PHY_INTERFACE_MODE_10GBASER) return -ENODEV; return 0; @@ -386,16 +386,17 @@ static void mv3310_update_interface(struct phy_device *phydev) { if ((phydev->interface == PHY_INTERFACE_MODE_SGMII || phydev->interface == PHY_INTERFACE_MODE_2500BASEX || - phydev->interface == PHY_INTERFACE_MODE_10GKR) && phydev->link) { + phydev->interface == PHY_INTERFACE_MODE_10GBASER) && + phydev->link) { /* The PHY automatically switches its serdes interface (and - * active PHYXS instance) between Cisco SGMII, 10GBase-KR and + * active PHYXS instance) between Cisco SGMII, 10GBase-R and * 2500BaseX modes according to the speed. Florian suggests * setting phydev->interface to communicate this to the MAC. * Only do this if we are already in one of the above modes. */ switch (phydev->speed) { case SPEED_10000: - phydev->interface = PHY_INTERFACE_MODE_10GKR; + phydev->interface = PHY_INTERFACE_MODE_10GBASER; break; case SPEED_2500: phydev->interface = PHY_INTERFACE_MODE_2500BASEX; diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 229e480179ff..9bb9f37f21dc 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -59,17 +59,11 @@ static int mdiobus_register_gpiod(struct mdio_device *mdiodev) static int mdiobus_register_reset(struct mdio_device *mdiodev) { - struct reset_control *reset = NULL; - - if (mdiodev->dev.of_node) - reset = of_reset_control_get_exclusive(mdiodev->dev.of_node, - "phy"); - if (IS_ERR(reset)) { - if (PTR_ERR(reset) == -ENOENT || PTR_ERR(reset) == -ENOTSUPP) - reset = NULL; - else - return PTR_ERR(reset); - } + struct reset_control *reset; + + reset = reset_control_get_optional_exclusive(&mdiodev->dev, "phy"); + if (IS_ERR(reset)) + return PTR_ERR(reset); mdiodev->reset_ctrl = reset; @@ -164,9 +158,11 @@ struct mii_bus *mdiobus_alloc_size(size_t size) if (size) bus->priv = (void *)bus + aligned_size; - /* Initialise the interrupts to polling */ - for (i = 0; i < PHY_MAX_ADDR; i++) + /* Initialise the interrupts to polling and 64-bit seqcounts */ + for (i = 0; i < PHY_MAX_ADDR; i++) { bus->irq[i] = PHY_POLL; + u64_stats_init(&bus->stats[i].syncp); + } return bus; } @@ -255,9 +251,215 @@ static void mdiobus_release(struct device *d) kfree(bus); } +struct mdio_bus_stat_attr { + int addr; + unsigned int field_offset; +}; + +static u64 mdio_bus_get_stat(struct mdio_bus_stats *s, unsigned int offset) +{ + const char *p = (const char *)s + offset; + unsigned int start; + u64 val = 0; + + do { + start = u64_stats_fetch_begin(&s->syncp); + val = u64_stats_read((const u64_stats_t *)p); + } while (u64_stats_fetch_retry(&s->syncp, start)); + + return val; +} + +static u64 mdio_bus_get_global_stat(struct mii_bus *bus, unsigned int offset) +{ + unsigned int i; + u64 val = 0; + + for (i = 0; i < PHY_MAX_ADDR; i++) + val += mdio_bus_get_stat(&bus->stats[i], offset); + + return val; +} + +static ssize_t mdio_bus_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mii_bus *bus = to_mii_bus(dev); + struct mdio_bus_stat_attr *sattr; + struct dev_ext_attribute *eattr; + u64 val; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + sattr = eattr->var; + + if (sattr->addr < 0) + val = mdio_bus_get_global_stat(bus, sattr->field_offset); + else + val = mdio_bus_get_stat(&bus->stats[sattr->addr], + sattr->field_offset); + + return sprintf(buf, "%llu\n", val); +} + +static ssize_t mdio_bus_device_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdio_device *mdiodev = to_mdio_device(dev); + struct mii_bus *bus = mdiodev->bus; + struct mdio_bus_stat_attr *sattr; + struct dev_ext_attribute *eattr; + int addr = mdiodev->addr; + u64 val; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + sattr = eattr->var; + + val = mdio_bus_get_stat(&bus->stats[addr], sattr->field_offset); + + return sprintf(buf, "%llu\n", val); +} + +#define MDIO_BUS_STATS_ATTR_DECL(field, file) \ +static struct dev_ext_attribute dev_attr_mdio_bus_##field = { \ + .attr = { .attr = { .name = file, .mode = 0444 }, \ + .show = mdio_bus_stat_field_show, \ + }, \ + .var = &((struct mdio_bus_stat_attr) { \ + -1, offsetof(struct mdio_bus_stats, field) \ + }), \ +}; \ +static struct dev_ext_attribute dev_attr_mdio_bus_device_##field = { \ + .attr = { .attr = { .name = file, .mode = 0444 }, \ + .show = mdio_bus_device_stat_field_show, \ + }, \ + .var = &((struct mdio_bus_stat_attr) { \ + -1, offsetof(struct mdio_bus_stats, field) \ + }), \ +}; + +#define MDIO_BUS_STATS_ATTR(field) \ + MDIO_BUS_STATS_ATTR_DECL(field, __stringify(field)) + +MDIO_BUS_STATS_ATTR(transfers); +MDIO_BUS_STATS_ATTR(errors); +MDIO_BUS_STATS_ATTR(writes); +MDIO_BUS_STATS_ATTR(reads); + +#define MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, file) \ +static struct dev_ext_attribute dev_attr_mdio_bus_addr_##field##_##addr = { \ + .attr = { .attr = { .name = file, .mode = 0444 }, \ + .show = mdio_bus_stat_field_show, \ + }, \ + .var = &((struct mdio_bus_stat_attr) { \ + addr, offsetof(struct mdio_bus_stats, field) \ + }), \ +} + +#define MDIO_BUS_STATS_ADDR_ATTR(field, addr) \ + MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, \ + __stringify(field) "_" __stringify(addr)) + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(addr) \ + MDIO_BUS_STATS_ADDR_ATTR(transfers, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(errors, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(writes, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(reads, addr) \ + +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(0); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(1); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(2); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(3); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(4); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(5); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(6); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(7); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(8); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(9); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(10); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(11); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(12); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(13); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(14); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(15); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(16); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(17); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(18); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(19); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(20); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(21); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(22); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(23); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(24); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(25); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(26); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(27); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(28); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(29); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(30); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(31); + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP(addr) \ + &dev_attr_mdio_bus_addr_transfers_##addr.attr.attr, \ + &dev_attr_mdio_bus_addr_errors_##addr.attr.attr, \ + &dev_attr_mdio_bus_addr_writes_##addr.attr.attr, \ + &dev_attr_mdio_bus_addr_reads_##addr.attr.attr \ + +static struct attribute *mdio_bus_statistics_attrs[] = { + &dev_attr_mdio_bus_transfers.attr.attr, + &dev_attr_mdio_bus_errors.attr.attr, + &dev_attr_mdio_bus_writes.attr.attr, + &dev_attr_mdio_bus_reads.attr.attr, + MDIO_BUS_STATS_ADDR_ATTR_GROUP(0), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(1), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(2), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(3), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(4), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(5), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(6), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(7), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(8), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(9), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(10), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(11), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(12), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(13), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(14), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(15), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(16), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(17), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(18), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(19), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(20), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(21), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(22), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(23), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(24), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(25), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(26), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(27), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(28), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(29), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(30), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(31), + NULL, +}; + +static const struct attribute_group mdio_bus_statistics_group = { + .name = "statistics", + .attrs = mdio_bus_statistics_attrs, +}; + +static const struct attribute_group *mdio_bus_groups[] = { + &mdio_bus_statistics_group, + NULL, +}; + static struct class mdio_bus_class = { .name = "mdio_bus", .dev_release = mdiobus_release, + .dev_groups = mdio_bus_groups, }; #if IS_ENABLED(CONFIG_OF_MDIO) @@ -536,6 +738,24 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr) } EXPORT_SYMBOL(mdiobus_scan); +static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) +{ + u64_stats_update_begin(&stats->syncp); + + u64_stats_inc(&stats->transfers); + if (ret < 0) { + u64_stats_inc(&stats->errors); + goto out; + } + + if (op) + u64_stats_inc(&stats->reads); + else + u64_stats_inc(&stats->writes); +out: + u64_stats_update_end(&stats->syncp); +} + /** * __mdiobus_read - Unlocked version of the mdiobus_read function * @bus: the mii_bus struct @@ -555,6 +775,7 @@ int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum) retval = bus->read(bus, addr, regnum); trace_mdio_access(bus, 1, addr, regnum, retval, retval); + mdiobus_stats_acct(&bus->stats[addr], true, retval); return retval; } @@ -580,6 +801,7 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) err = bus->write(bus, addr, regnum, val); trace_mdio_access(bus, 0, addr, regnum, val, err); + mdiobus_stats_acct(&bus->stats[addr], false, err); return err; } @@ -725,8 +947,27 @@ static int mdio_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } +static struct attribute *mdio_bus_device_statistics_attrs[] = { + &dev_attr_mdio_bus_device_transfers.attr.attr, + &dev_attr_mdio_bus_device_errors.attr.attr, + &dev_attr_mdio_bus_device_writes.attr.attr, + &dev_attr_mdio_bus_device_reads.attr.attr, + NULL, +}; + +static const struct attribute_group mdio_bus_device_statistics_group = { + .name = "statistics", + .attrs = mdio_bus_device_statistics_attrs, +}; + +static const struct attribute_group *mdio_bus_dev_groups[] = { + &mdio_bus_device_statistics_group, + NULL, +}; + struct bus_type mdio_bus_type = { .name = "mdio_bus", + .dev_groups = mdio_bus_dev_groups, .match = mdio_bus_match, .uevent = mdio_uevent, }; diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c index 50214c081164..937ac7da2789 100644 --- a/drivers/net/phy/mscc.c +++ b/drivers/net/phy/mscc.c @@ -18,6 +18,17 @@ #include <linux/netdevice.h> #include <dt-bindings/net/mscc-phy-vsc8531.h> +#include <linux/scatterlist.h> +#include <crypto/skcipher.h> + +#if IS_ENABLED(CONFIG_MACSEC) +#include <net/macsec.h> +#endif + +#include "mscc_macsec.h" +#include "mscc_mac.h" +#include "mscc_fc_buffer.h" + enum rgmii_rx_clock_delay { RGMII_RX_CLK_DELAY_0_2_NS = 0, RGMII_RX_CLK_DELAY_0_8_NS = 1, @@ -69,7 +80,7 @@ enum rgmii_rx_clock_delay { #define MSCC_PHY_EXT_PHY_CNTL_2 24 #define MII_VSC85XX_INT_MASK 25 -#define MII_VSC85XX_INT_MASK_MASK 0xa000 +#define MII_VSC85XX_INT_MASK_MASK 0xa020 #define MII_VSC85XX_INT_MASK_WOL 0x0040 #define MII_VSC85XX_INT_STATUS 26 @@ -121,6 +132,26 @@ enum rgmii_rx_clock_delay { #define PHY_S6G_PLL_FSM_CTRL_DATA_POS 8 #define PHY_S6G_PLL_FSM_ENA_POS 7 +#define MSCC_EXT_PAGE_MACSEC_17 17 +#define MSCC_EXT_PAGE_MACSEC_18 18 + +#define MSCC_EXT_PAGE_MACSEC_19 19 +#define MSCC_PHY_MACSEC_19_REG_ADDR(x) (x) +#define MSCC_PHY_MACSEC_19_TARGET(x) ((x) << 12) +#define MSCC_PHY_MACSEC_19_READ BIT(14) +#define MSCC_PHY_MACSEC_19_CMD BIT(15) + +#define MSCC_EXT_PAGE_MACSEC_20 20 +#define MSCC_PHY_MACSEC_20_TARGET(x) (x) +enum macsec_bank { + FC_BUFFER = 0x04, + HOST_MAC = 0x05, + LINE_MAC = 0x06, + IP_1588 = 0x0e, + MACSEC_INGR = 0x38, + MACSEC_EGR = 0x3c, +}; + #define MSCC_EXT_PAGE_ACCESS 31 #define MSCC_PHY_PAGE_STANDARD 0x0000 /* Standard registers */ #define MSCC_PHY_PAGE_EXTENDED 0x0001 /* Extended registers */ @@ -128,6 +159,7 @@ enum rgmii_rx_clock_delay { #define MSCC_PHY_PAGE_EXTENDED_3 0x0003 /* Extended reg - page 3 */ #define MSCC_PHY_PAGE_EXTENDED_4 0x0004 /* Extended reg - page 4 */ #define MSCC_PHY_PAGE_CSR_CNTL MSCC_PHY_PAGE_EXTENDED_4 +#define MSCC_PHY_PAGE_MACSEC MSCC_PHY_PAGE_EXTENDED_4 /* Extended reg - GPIO; this is a bank of registers that are shared for all PHYs * in the same package. */ @@ -175,6 +207,9 @@ enum rgmii_rx_clock_delay { #define SECURE_ON_ENABLE 0x8000 #define SECURE_ON_PASSWD_LEN_4 0x4000 +#define MSCC_PHY_EXTENDED_INT 28 +#define MSCC_PHY_EXTENDED_INT_MS_EGR BIT(9) + /* Extended Page 3 Registers */ #define MSCC_PHY_SERDES_TX_VALID_CNT 21 #define MSCC_PHY_SERDES_TX_CRC_ERR_CNT 22 @@ -411,6 +446,44 @@ static const struct vsc85xx_hw_stat vsc8584_hw_stats[] = { }, }; +#if IS_ENABLED(CONFIG_MACSEC) +struct macsec_flow { + struct list_head list; + enum mscc_macsec_destination_ports port; + enum macsec_bank bank; + u32 index; + int assoc_num; + bool has_transformation; + + /* Highest takes precedence [0..15] */ + u8 priority; + + u8 key[MACSEC_KEYID_LEN]; + + union { + struct macsec_rx_sa *rx_sa; + struct macsec_tx_sa *tx_sa; + }; + + /* Matching */ + struct { + u8 sci:1; + u8 tagged:1; + u8 untagged:1; + u8 etype:1; + } match; + + u16 etype; + + /* Action */ + struct { + u8 bypass:1; + u8 drop:1; + } action; + +}; +#endif + struct vsc8531_private { int rate_magic; u16 supp_led_modes; @@ -424,6 +497,19 @@ struct vsc8531_private { * package. */ unsigned int base_addr; + +#if IS_ENABLED(CONFIG_MACSEC) + /* MACsec fields: + * - One SecY per device (enforced at the s/w implementation level) + * - macsec_flows: list of h/w flows + * - ingr_flows: bitmap of ingress flows + * - egr_flows: bitmap of egress flows + */ + struct macsec_secy *secy; + struct list_head macsec_flows; + unsigned long ingr_flows; + unsigned long egr_flows; +#endif }; #ifdef CONFIG_OF_MDIO @@ -1584,6 +1670,978 @@ out: return ret; } +#if IS_ENABLED(CONFIG_MACSEC) +static u32 vsc8584_macsec_phy_read(struct phy_device *phydev, + enum macsec_bank bank, u32 reg) +{ + u32 val, val_l = 0, val_h = 0; + unsigned long deadline; + int rc; + + rc = phy_select_page(phydev, MSCC_PHY_PAGE_MACSEC); + if (rc < 0) + goto failed; + + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_20, + MSCC_PHY_MACSEC_20_TARGET(bank >> 2)); + + if (bank >> 2 == 0x1) + /* non-MACsec access */ + bank &= 0x3; + else + bank = 0; + + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_19, + MSCC_PHY_MACSEC_19_CMD | MSCC_PHY_MACSEC_19_READ | + MSCC_PHY_MACSEC_19_REG_ADDR(reg) | + MSCC_PHY_MACSEC_19_TARGET(bank)); + + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + val = __phy_read(phydev, MSCC_EXT_PAGE_MACSEC_19); + } while (time_before(jiffies, deadline) && !(val & MSCC_PHY_MACSEC_19_CMD)); + + val_l = __phy_read(phydev, MSCC_EXT_PAGE_MACSEC_17); + val_h = __phy_read(phydev, MSCC_EXT_PAGE_MACSEC_18); + +failed: + phy_restore_page(phydev, rc, rc); + + return (val_h << 16) | val_l; +} + +static void vsc8584_macsec_phy_write(struct phy_device *phydev, + enum macsec_bank bank, u32 reg, u32 val) +{ + unsigned long deadline; + int rc; + + rc = phy_select_page(phydev, MSCC_PHY_PAGE_MACSEC); + if (rc < 0) + goto failed; + + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_20, + MSCC_PHY_MACSEC_20_TARGET(bank >> 2)); + + if ((bank >> 2 == 0x1) || (bank >> 2 == 0x3)) + bank &= 0x3; + else + /* MACsec access */ + bank = 0; + + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_17, (u16)val); + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_18, (u16)(val >> 16)); + + __phy_write(phydev, MSCC_EXT_PAGE_MACSEC_19, + MSCC_PHY_MACSEC_19_CMD | MSCC_PHY_MACSEC_19_REG_ADDR(reg) | + MSCC_PHY_MACSEC_19_TARGET(bank)); + + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + val = __phy_read(phydev, MSCC_EXT_PAGE_MACSEC_19); + } while (time_before(jiffies, deadline) && !(val & MSCC_PHY_MACSEC_19_CMD)); + +failed: + phy_restore_page(phydev, rc, rc); +} + +static void vsc8584_macsec_classification(struct phy_device *phydev, + enum macsec_bank bank) +{ + /* enable VLAN tag parsing */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_CP_TAG, + MSCC_MS_SAM_CP_TAG_PARSE_STAG | + MSCC_MS_SAM_CP_TAG_PARSE_QTAG | + MSCC_MS_SAM_CP_TAG_PARSE_QINQ); +} + +static void vsc8584_macsec_flow_default_action(struct phy_device *phydev, + enum macsec_bank bank, + bool block) +{ + u32 port = (bank == MACSEC_INGR) ? + MSCC_MS_PORT_UNCONTROLLED : MSCC_MS_PORT_COMMON; + u32 action = MSCC_MS_FLOW_BYPASS; + + if (block) + action = MSCC_MS_FLOW_DROP; + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_NM_FLOW_NCP, + /* MACsec untagged */ + MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_DEST_PORT(port) | + /* MACsec tagged */ + MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_DEST_PORT(port) | + /* Bad tag */ + MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_DEST_PORT(port) | + /* Kay tag */ + MSCC_MS_SAM_NM_FLOW_NCP_KAY_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_NCP_KAY_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_NCP_KAY_DEST_PORT(port)); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_NM_FLOW_CP, + /* MACsec untagged */ + MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_CP_UNTAGGED_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_CP_UNTAGGED_DEST_PORT(port) | + /* MACsec tagged */ + MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_CP_TAGGED_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_CP_TAGGED_DEST_PORT(port) | + /* Bad tag */ + MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_CP_BADTAG_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_CP_BADTAG_DEST_PORT(port) | + /* Kay tag */ + MSCC_MS_SAM_NM_FLOW_NCP_KAY_FLOW_TYPE(action) | + MSCC_MS_SAM_NM_FLOW_CP_KAY_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_NM_FLOW_CP_KAY_DEST_PORT(port)); +} + +static void vsc8584_macsec_integrity_checks(struct phy_device *phydev, + enum macsec_bank bank) +{ + u32 val; + + if (bank != MACSEC_INGR) + return; + + /* Set default rules to pass unmatched frames */ + val = vsc8584_macsec_phy_read(phydev, bank, + MSCC_MS_PARAMS2_IG_CC_CONTROL); + val |= MSCC_MS_PARAMS2_IG_CC_CONTROL_NON_MATCH_CTRL_ACT | + MSCC_MS_PARAMS2_IG_CC_CONTROL_NON_MATCH_ACT; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_PARAMS2_IG_CC_CONTROL, + val); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_PARAMS2_IG_CP_TAG, + MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_STAG | + MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_QTAG | + MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_QINQ); +} + +static void vsc8584_macsec_block_init(struct phy_device *phydev, + enum macsec_bank bank) +{ + u32 val; + int i; + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_ENA_CFG, + MSCC_MS_ENA_CFG_SW_RST | + MSCC_MS_ENA_CFG_MACSEC_BYPASS_ENA); + + /* Set the MACsec block out of s/w reset and enable clocks */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_ENA_CFG, + MSCC_MS_ENA_CFG_CLK_ENA); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_STATUS_CONTEXT_CTRL, + bank == MACSEC_INGR ? 0xe5880214 : 0xe5880218); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_MISC_CONTROL, + MSCC_MS_MISC_CONTROL_MC_LATENCY_FIX(bank == MACSEC_INGR ? 57 : 40) | + MSCC_MS_MISC_CONTROL_XFORM_REC_SIZE(bank == MACSEC_INGR ? 1 : 2)); + + /* Clear the counters */ + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MS_COUNT_CONTROL); + val |= MSCC_MS_COUNT_CONTROL_AUTO_CNTR_RESET; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_COUNT_CONTROL, val); + + /* Enable octet increment mode */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_PP_CTRL, + MSCC_MS_PP_CTRL_MACSEC_OCTET_INCR_MODE); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_BLOCK_CTX_UPDATE, 0x3); + + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MS_COUNT_CONTROL); + val |= MSCC_MS_COUNT_CONTROL_RESET_ALL; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_COUNT_CONTROL, val); + + /* Set the MTU */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_NON_VLAN_MTU_CHECK, + MSCC_MS_NON_VLAN_MTU_CHECK_NV_MTU_COMPARE(32761) | + MSCC_MS_NON_VLAN_MTU_CHECK_NV_MTU_COMP_DROP); + + for (i = 0; i < 8; i++) + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_VLAN_MTU_CHECK(i), + MSCC_MS_VLAN_MTU_CHECK_MTU_COMPARE(32761) | + MSCC_MS_VLAN_MTU_CHECK_MTU_COMP_DROP); + + if (bank == MACSEC_EGR) { + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MS_INTR_CTRL_STATUS); + val &= ~MSCC_MS_INTR_CTRL_STATUS_INTR_ENABLE_M; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_INTR_CTRL_STATUS, val); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_FC_CFG, + MSCC_MS_FC_CFG_FCBUF_ENA | + MSCC_MS_FC_CFG_LOW_THRESH(0x1) | + MSCC_MS_FC_CFG_HIGH_THRESH(0x4) | + MSCC_MS_FC_CFG_LOW_BYTES_VAL(0x4) | + MSCC_MS_FC_CFG_HIGH_BYTES_VAL(0x6)); + } + + vsc8584_macsec_classification(phydev, bank); + vsc8584_macsec_flow_default_action(phydev, bank, false); + vsc8584_macsec_integrity_checks(phydev, bank); + + /* Enable the MACsec block */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_ENA_CFG, + MSCC_MS_ENA_CFG_CLK_ENA | + MSCC_MS_ENA_CFG_MACSEC_ENA | + MSCC_MS_ENA_CFG_MACSEC_SPEED_MODE(0x5)); +} + +static void vsc8584_macsec_mac_init(struct phy_device *phydev, + enum macsec_bank bank) +{ + u32 val; + int i; + + /* Clear host & line stats */ + for (i = 0; i < 36; i++) + vsc8584_macsec_phy_write(phydev, bank, 0x1c + i, 0); + + val = vsc8584_macsec_phy_read(phydev, bank, + MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL); + val &= ~MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_MODE_M; + val |= MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_MODE(2) | + MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_VALUE(0xffff); + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL, val); + + val = vsc8584_macsec_phy_read(phydev, bank, + MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_2); + val |= 0xffff; + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_2, val); + + val = vsc8584_macsec_phy_read(phydev, bank, + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL); + if (bank == HOST_MAC) + val |= MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_TIMER_ENA | + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_FRAME_DROP_ENA; + else + val |= MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_REACT_ENA | + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_FRAME_DROP_ENA | + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_MODE | + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_EARLY_PAUSE_DETECT_ENA; + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL, val); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_PKTINF_CFG, + MSCC_MAC_CFG_PKTINF_CFG_STRIP_FCS_ENA | + MSCC_MAC_CFG_PKTINF_CFG_INSERT_FCS_ENA | + MSCC_MAC_CFG_PKTINF_CFG_LPI_RELAY_ENA | + MSCC_MAC_CFG_PKTINF_CFG_STRIP_PREAMBLE_ENA | + MSCC_MAC_CFG_PKTINF_CFG_INSERT_PREAMBLE_ENA | + (bank == HOST_MAC ? + MSCC_MAC_CFG_PKTINF_CFG_ENABLE_TX_PADDING : 0)); + + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MAC_CFG_MODE_CFG); + val &= ~MSCC_MAC_CFG_MODE_CFG_DISABLE_DIC; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_MODE_CFG, val); + + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MAC_CFG_MAXLEN_CFG); + val &= ~MSCC_MAC_CFG_MAXLEN_CFG_MAX_LEN_M; + val |= MSCC_MAC_CFG_MAXLEN_CFG_MAX_LEN(10240); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_MAXLEN_CFG, val); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_ADV_CHK_CFG, + MSCC_MAC_CFG_ADV_CHK_CFG_SFD_CHK_ENA | + MSCC_MAC_CFG_ADV_CHK_CFG_PRM_CHK_ENA | + MSCC_MAC_CFG_ADV_CHK_CFG_OOR_ERR_ENA | + MSCC_MAC_CFG_ADV_CHK_CFG_INR_ERR_ENA); + + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MAC_CFG_LFS_CFG); + val &= ~MSCC_MAC_CFG_LFS_CFG_LFS_MODE_ENA; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_LFS_CFG, val); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MAC_CFG_ENA_CFG, + MSCC_MAC_CFG_ENA_CFG_RX_CLK_ENA | + MSCC_MAC_CFG_ENA_CFG_TX_CLK_ENA | + MSCC_MAC_CFG_ENA_CFG_RX_ENA | + MSCC_MAC_CFG_ENA_CFG_TX_ENA); +} + +/* Must be called with mdio_lock taken */ +static int vsc8584_macsec_init(struct phy_device *phydev) +{ + u32 val; + + vsc8584_macsec_block_init(phydev, MACSEC_INGR); + vsc8584_macsec_block_init(phydev, MACSEC_EGR); + vsc8584_macsec_mac_init(phydev, HOST_MAC); + vsc8584_macsec_mac_init(phydev, LINE_MAC); + + vsc8584_macsec_phy_write(phydev, FC_BUFFER, + MSCC_FCBUF_FC_READ_THRESH_CFG, + MSCC_FCBUF_FC_READ_THRESH_CFG_TX_THRESH(4) | + MSCC_FCBUF_FC_READ_THRESH_CFG_RX_THRESH(5)); + + val = vsc8584_macsec_phy_read(phydev, FC_BUFFER, MSCC_FCBUF_MODE_CFG); + val |= MSCC_FCBUF_MODE_CFG_PAUSE_GEN_ENA | + MSCC_FCBUF_MODE_CFG_RX_PPM_RATE_ADAPT_ENA | + MSCC_FCBUF_MODE_CFG_TX_PPM_RATE_ADAPT_ENA; + vsc8584_macsec_phy_write(phydev, FC_BUFFER, MSCC_FCBUF_MODE_CFG, val); + + vsc8584_macsec_phy_write(phydev, FC_BUFFER, MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG, + MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_THRESH(8) | + MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_OFFSET(9)); + + val = vsc8584_macsec_phy_read(phydev, FC_BUFFER, + MSCC_FCBUF_TX_DATA_QUEUE_CFG); + val &= ~(MSCC_FCBUF_TX_DATA_QUEUE_CFG_START_M | + MSCC_FCBUF_TX_DATA_QUEUE_CFG_END_M); + val |= MSCC_FCBUF_TX_DATA_QUEUE_CFG_START(0) | + MSCC_FCBUF_TX_DATA_QUEUE_CFG_END(5119); + vsc8584_macsec_phy_write(phydev, FC_BUFFER, + MSCC_FCBUF_TX_DATA_QUEUE_CFG, val); + + val = vsc8584_macsec_phy_read(phydev, FC_BUFFER, MSCC_FCBUF_ENA_CFG); + val |= MSCC_FCBUF_ENA_CFG_TX_ENA | MSCC_FCBUF_ENA_CFG_RX_ENA; + vsc8584_macsec_phy_write(phydev, FC_BUFFER, MSCC_FCBUF_ENA_CFG, val); + + val = vsc8584_macsec_phy_read(phydev, IP_1588, + MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL); + val &= ~MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M; + val |= MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(4); + vsc8584_macsec_phy_write(phydev, IP_1588, + MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL, val); + + return 0; +} + +static void vsc8584_macsec_flow(struct phy_device *phydev, + struct macsec_flow *flow) +{ + struct vsc8531_private *priv = phydev->priv; + enum macsec_bank bank = flow->bank; + u32 val, match = 0, mask = 0, action = 0, idx = flow->index; + + if (flow->match.tagged) + match |= MSCC_MS_SAM_MISC_MATCH_TAGGED; + if (flow->match.untagged) + match |= MSCC_MS_SAM_MISC_MATCH_UNTAGGED; + + if (bank == MACSEC_INGR && flow->assoc_num >= 0) { + match |= MSCC_MS_SAM_MISC_MATCH_AN(flow->assoc_num); + mask |= MSCC_MS_SAM_MASK_AN_MASK(0x3); + } + + if (bank == MACSEC_INGR && flow->match.sci && flow->rx_sa->sc->sci) { + match |= MSCC_MS_SAM_MISC_MATCH_TCI(BIT(3)); + mask |= MSCC_MS_SAM_MASK_TCI_MASK(BIT(3)) | + MSCC_MS_SAM_MASK_SCI_MASK; + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_MATCH_SCI_LO(idx), + lower_32_bits(flow->rx_sa->sc->sci)); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_MATCH_SCI_HI(idx), + upper_32_bits(flow->rx_sa->sc->sci)); + } + + if (flow->match.etype) { + mask |= MSCC_MS_SAM_MASK_MAC_ETYPE_MASK; + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_MAC_SA_MATCH_HI(idx), + MSCC_MS_SAM_MAC_SA_MATCH_HI_ETYPE(htons(flow->etype))); + } + + match |= MSCC_MS_SAM_MISC_MATCH_PRIORITY(flow->priority); + + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_MISC_MATCH(idx), match); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_MASK(idx), mask); + + /* Action for matching packets */ + if (flow->action.drop) + action = MSCC_MS_FLOW_DROP; + else if (flow->action.bypass || flow->port == MSCC_MS_PORT_UNCONTROLLED) + action = MSCC_MS_FLOW_BYPASS; + else + action = (bank == MACSEC_INGR) ? + MSCC_MS_FLOW_INGRESS : MSCC_MS_FLOW_EGRESS; + + val = MSCC_MS_SAM_FLOW_CTRL_FLOW_TYPE(action) | + MSCC_MS_SAM_FLOW_CTRL_DROP_ACTION(MSCC_MS_ACTION_DROP) | + MSCC_MS_SAM_FLOW_CTRL_DEST_PORT(flow->port); + + if (action == MSCC_MS_FLOW_BYPASS) + goto write_ctrl; + + if (bank == MACSEC_INGR) { + if (priv->secy->replay_protect) + val |= MSCC_MS_SAM_FLOW_CTRL_REPLAY_PROTECT; + if (priv->secy->validate_frames == MACSEC_VALIDATE_STRICT) + val |= MSCC_MS_SAM_FLOW_CTRL_VALIDATE_FRAMES(MSCC_MS_VALIDATE_STRICT); + else if (priv->secy->validate_frames == MACSEC_VALIDATE_CHECK) + val |= MSCC_MS_SAM_FLOW_CTRL_VALIDATE_FRAMES(MSCC_MS_VALIDATE_CHECK); + } else if (bank == MACSEC_EGR) { + if (priv->secy->protect_frames) + val |= MSCC_MS_SAM_FLOW_CTRL_PROTECT_FRAME; + if (priv->secy->tx_sc.encrypt) + val |= MSCC_MS_SAM_FLOW_CTRL_CONF_PROTECT; + if (priv->secy->tx_sc.send_sci) + val |= MSCC_MS_SAM_FLOW_CTRL_INCLUDE_SCI; + } + +write_ctrl: + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_FLOW_CTRL(idx), val); +} + +static struct macsec_flow *vsc8584_macsec_find_flow(struct macsec_context *ctx, + enum macsec_bank bank) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_flow *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, &priv->macsec_flows, list) + if (pos->assoc_num == ctx->sa.assoc_num && pos->bank == bank) + return pos; + + return ERR_PTR(-ENOENT); +} + +static void vsc8584_macsec_flow_enable(struct phy_device *phydev, + struct macsec_flow *flow) +{ + enum macsec_bank bank = flow->bank; + u32 val, idx = flow->index; + + if ((flow->bank == MACSEC_INGR && flow->rx_sa && !flow->rx_sa->active) || + (flow->bank == MACSEC_EGR && flow->tx_sa && !flow->tx_sa->active)) + return; + + /* Enable */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_ENTRY_SET1, BIT(idx)); + + /* Set in-use */ + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MS_SAM_FLOW_CTRL(idx)); + val |= MSCC_MS_SAM_FLOW_CTRL_SA_IN_USE; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_FLOW_CTRL(idx), val); +} + +static void vsc8584_macsec_flow_disable(struct phy_device *phydev, + struct macsec_flow *flow) +{ + enum macsec_bank bank = flow->bank; + u32 val, idx = flow->index; + + /* Disable */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_ENTRY_CLEAR1, BIT(idx)); + + /* Clear in-use */ + val = vsc8584_macsec_phy_read(phydev, bank, MSCC_MS_SAM_FLOW_CTRL(idx)); + val &= ~MSCC_MS_SAM_FLOW_CTRL_SA_IN_USE; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_SAM_FLOW_CTRL(idx), val); +} + +static u32 vsc8584_macsec_flow_context_id(struct macsec_flow *flow) +{ + if (flow->bank == MACSEC_INGR) + return flow->index + MSCC_MS_MAX_FLOWS; + + return flow->index; +} + +/* Derive the AES key to get a key for the hash autentication */ +static int vsc8584_macsec_derive_key(const u8 key[MACSEC_KEYID_LEN], + u16 key_len, u8 hkey[16]) +{ + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + struct skcipher_request *req = NULL; + struct scatterlist src, dst; + DECLARE_CRYPTO_WAIT(wait); + u32 input[4] = {0}; + int ret; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + req = skcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto out; + } + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, + &wait); + ret = crypto_skcipher_setkey(tfm, key, key_len); + if (ret < 0) + goto out; + + sg_init_one(&src, input, 16); + sg_init_one(&dst, hkey, 16); + skcipher_request_set_crypt(req, &src, &dst, 16, NULL); + + ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return ret; +} + +static int vsc8584_macsec_transformation(struct phy_device *phydev, + struct macsec_flow *flow) +{ + struct vsc8531_private *priv = phydev->priv; + enum macsec_bank bank = flow->bank; + int i, ret, index = flow->index; + u32 rec = 0, control = 0; + u8 hkey[16]; + sci_t sci; + + ret = vsc8584_macsec_derive_key(flow->key, priv->secy->key_len, hkey); + if (ret) + return ret; + + switch (priv->secy->key_len) { + case 16: + control |= CONTROL_CRYPTO_ALG(CTRYPTO_ALG_AES_CTR_128); + break; + case 32: + control |= CONTROL_CRYPTO_ALG(CTRYPTO_ALG_AES_CTR_256); + break; + default: + return -EINVAL; + } + + control |= (bank == MACSEC_EGR) ? + (CONTROL_TYPE_EGRESS | CONTROL_AN(priv->secy->tx_sc.encoding_sa)) : + (CONTROL_TYPE_INGRESS | CONTROL_SEQ_MASK); + + control |= CONTROL_UPDATE_SEQ | CONTROL_ENCRYPT_AUTH | CONTROL_KEY_IN_CTX | + CONTROL_IV0 | CONTROL_IV1 | CONTROL_IV_IN_SEQ | + CONTROL_DIGEST_TYPE(0x2) | CONTROL_SEQ_TYPE(0x1) | + CONTROL_AUTH_ALG(AUTH_ALG_AES_GHAS) | CONTROL_CONTEXT_ID; + + /* Set the control word */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + control); + + /* Set the context ID. Must be unique. */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + vsc8584_macsec_flow_context_id(flow)); + + /* Set the encryption/decryption key */ + for (i = 0; i < priv->secy->key_len / sizeof(u32); i++) + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MS_XFORM_REC(index, rec++), + ((u32 *)flow->key)[i]); + + /* Set the authentication key */ + for (i = 0; i < 4; i++) + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MS_XFORM_REC(index, rec++), + ((u32 *)hkey)[i]); + + /* Initial sequence number */ + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + bank == MACSEC_INGR ? + flow->rx_sa->next_pn : flow->tx_sa->next_pn); + + if (bank == MACSEC_INGR) + /* Set the mask (replay window size) */ + vsc8584_macsec_phy_write(phydev, bank, + MSCC_MS_XFORM_REC(index, rec++), + priv->secy->replay_window); + + /* Set the input vectors */ + sci = bank == MACSEC_INGR ? flow->rx_sa->sc->sci : priv->secy->sci; + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + lower_32_bits(sci)); + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + upper_32_bits(sci)); + + while (rec < 20) + vsc8584_macsec_phy_write(phydev, bank, MSCC_MS_XFORM_REC(index, rec++), + 0); + + flow->has_transformation = true; + return 0; +} + +static struct macsec_flow *vsc8584_macsec_alloc_flow(struct vsc8531_private *priv, + enum macsec_bank bank) +{ + unsigned long *bitmap = bank == MACSEC_INGR ? + &priv->ingr_flows : &priv->egr_flows; + struct macsec_flow *flow; + int index; + + index = find_first_zero_bit(bitmap, MSCC_MS_MAX_FLOWS); + + if (index == MSCC_MS_MAX_FLOWS) + return ERR_PTR(-ENOMEM); + + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + set_bit(index, bitmap); + flow->index = index; + flow->bank = bank; + flow->priority = 8; + flow->assoc_num = -1; + + list_add_tail(&flow->list, &priv->macsec_flows); + return flow; +} + +static void vsc8584_macsec_free_flow(struct vsc8531_private *priv, + struct macsec_flow *flow) +{ + unsigned long *bitmap = flow->bank == MACSEC_INGR ? + &priv->ingr_flows : &priv->egr_flows; + + list_del(&flow->list); + clear_bit(flow->index, bitmap); + kfree(flow); +} + +static int vsc8584_macsec_add_flow(struct phy_device *phydev, + struct macsec_flow *flow, bool update) +{ + int ret; + + flow->port = MSCC_MS_PORT_CONTROLLED; + vsc8584_macsec_flow(phydev, flow); + + if (update) + return 0; + + ret = vsc8584_macsec_transformation(phydev, flow); + if (ret) { + vsc8584_macsec_free_flow(phydev->priv, flow); + return ret; + } + + return 0; +} + +static int vsc8584_macsec_default_flows(struct phy_device *phydev) +{ + struct macsec_flow *flow; + + /* Add a rule to let the MKA traffic go through, ingress */ + flow = vsc8584_macsec_alloc_flow(phydev->priv, MACSEC_INGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + flow->priority = 15; + flow->port = MSCC_MS_PORT_UNCONTROLLED; + flow->match.tagged = 1; + flow->match.untagged = 1; + flow->match.etype = 1; + flow->etype = ETH_P_PAE; + flow->action.bypass = 1; + + vsc8584_macsec_flow(phydev, flow); + vsc8584_macsec_flow_enable(phydev, flow); + + /* Add a rule to let the MKA traffic go through, egress */ + flow = vsc8584_macsec_alloc_flow(phydev->priv, MACSEC_EGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + flow->priority = 15; + flow->port = MSCC_MS_PORT_COMMON; + flow->match.untagged = 1; + flow->match.etype = 1; + flow->etype = ETH_P_PAE; + flow->action.bypass = 1; + + vsc8584_macsec_flow(phydev, flow); + vsc8584_macsec_flow_enable(phydev, flow); + + return 0; +} + +static void vsc8584_macsec_del_flow(struct phy_device *phydev, + struct macsec_flow *flow) +{ + vsc8584_macsec_flow_disable(phydev, flow); + vsc8584_macsec_free_flow(phydev->priv, flow); +} + +static int __vsc8584_macsec_add_rxsa(struct macsec_context *ctx, + struct macsec_flow *flow, bool update) +{ + struct phy_device *phydev = ctx->phydev; + struct vsc8531_private *priv = phydev->priv; + + if (!flow) { + flow = vsc8584_macsec_alloc_flow(priv, MACSEC_INGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + memcpy(flow->key, ctx->sa.key, priv->secy->key_len); + } + + flow->assoc_num = ctx->sa.assoc_num; + flow->rx_sa = ctx->sa.rx_sa; + + /* Always match tagged packets on ingress */ + flow->match.tagged = 1; + flow->match.sci = 1; + + if (priv->secy->validate_frames != MACSEC_VALIDATE_DISABLED) + flow->match.untagged = 1; + + return vsc8584_macsec_add_flow(phydev, flow, update); +} + +static int __vsc8584_macsec_add_txsa(struct macsec_context *ctx, + struct macsec_flow *flow, bool update) +{ + struct phy_device *phydev = ctx->phydev; + struct vsc8531_private *priv = phydev->priv; + + if (!flow) { + flow = vsc8584_macsec_alloc_flow(priv, MACSEC_EGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + memcpy(flow->key, ctx->sa.key, priv->secy->key_len); + } + + flow->assoc_num = ctx->sa.assoc_num; + flow->tx_sa = ctx->sa.tx_sa; + + /* Always match untagged packets on egress */ + flow->match.untagged = 1; + + return vsc8584_macsec_add_flow(phydev, flow, update); +} + +static int vsc8584_macsec_dev_open(struct macsec_context *ctx) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_flow *flow, *tmp; + + /* No operation to perform before the commit step */ + if (ctx->prepare) + return 0; + + list_for_each_entry_safe(flow, tmp, &priv->macsec_flows, list) + vsc8584_macsec_flow_enable(ctx->phydev, flow); + + return 0; +} + +static int vsc8584_macsec_dev_stop(struct macsec_context *ctx) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_flow *flow, *tmp; + + /* No operation to perform before the commit step */ + if (ctx->prepare) + return 0; + + list_for_each_entry_safe(flow, tmp, &priv->macsec_flows, list) + vsc8584_macsec_flow_disable(ctx->phydev, flow); + + return 0; +} + +static int vsc8584_macsec_add_secy(struct macsec_context *ctx) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_secy *secy = ctx->secy; + + if (ctx->prepare) { + if (priv->secy) + return -EEXIST; + + return 0; + } + + priv->secy = secy; + + vsc8584_macsec_flow_default_action(ctx->phydev, MACSEC_EGR, + secy->validate_frames != MACSEC_VALIDATE_DISABLED); + vsc8584_macsec_flow_default_action(ctx->phydev, MACSEC_INGR, + secy->validate_frames != MACSEC_VALIDATE_DISABLED); + + return vsc8584_macsec_default_flows(ctx->phydev); +} + +static int vsc8584_macsec_del_secy(struct macsec_context *ctx) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_flow *flow, *tmp; + + /* No operation to perform before the commit step */ + if (ctx->prepare) + return 0; + + list_for_each_entry_safe(flow, tmp, &priv->macsec_flows, list) + vsc8584_macsec_del_flow(ctx->phydev, flow); + + vsc8584_macsec_flow_default_action(ctx->phydev, MACSEC_EGR, false); + vsc8584_macsec_flow_default_action(ctx->phydev, MACSEC_INGR, false); + + priv->secy = NULL; + return 0; +} + +static int vsc8584_macsec_upd_secy(struct macsec_context *ctx) +{ + /* No operation to perform before the commit step */ + if (ctx->prepare) + return 0; + + vsc8584_macsec_del_secy(ctx); + return vsc8584_macsec_add_secy(ctx); +} + +static int vsc8584_macsec_add_rxsc(struct macsec_context *ctx) +{ + /* Nothing to do */ + return 0; +} + +static int vsc8584_macsec_upd_rxsc(struct macsec_context *ctx) +{ + return -EOPNOTSUPP; +} + +static int vsc8584_macsec_del_rxsc(struct macsec_context *ctx) +{ + struct vsc8531_private *priv = ctx->phydev->priv; + struct macsec_flow *flow, *tmp; + + /* No operation to perform before the commit step */ + if (ctx->prepare) + return 0; + + list_for_each_entry_safe(flow, tmp, &priv->macsec_flows, list) { + if (flow->bank == MACSEC_INGR && flow->rx_sa && + flow->rx_sa->sc->sci == ctx->rx_sc->sci) + vsc8584_macsec_del_flow(ctx->phydev, flow); + } + + return 0; +} + +static int vsc8584_macsec_add_rxsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow = NULL; + + if (ctx->prepare) + return __vsc8584_macsec_add_rxsa(ctx, flow, false); + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_INGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + vsc8584_macsec_flow_enable(ctx->phydev, flow); + return 0; +} + +static int vsc8584_macsec_upd_rxsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow; + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_INGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + if (ctx->prepare) { + /* Make sure the flow is disabled before updating it */ + vsc8584_macsec_flow_disable(ctx->phydev, flow); + + return __vsc8584_macsec_add_rxsa(ctx, flow, true); + } + + vsc8584_macsec_flow_enable(ctx->phydev, flow); + return 0; +} + +static int vsc8584_macsec_del_rxsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow; + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_INGR); + + if (IS_ERR(flow)) + return PTR_ERR(flow); + if (ctx->prepare) + return 0; + + vsc8584_macsec_del_flow(ctx->phydev, flow); + return 0; +} + +static int vsc8584_macsec_add_txsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow = NULL; + + if (ctx->prepare) + return __vsc8584_macsec_add_txsa(ctx, flow, false); + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_EGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + vsc8584_macsec_flow_enable(ctx->phydev, flow); + return 0; +} + +static int vsc8584_macsec_upd_txsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow; + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_EGR); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + if (ctx->prepare) { + /* Make sure the flow is disabled before updating it */ + vsc8584_macsec_flow_disable(ctx->phydev, flow); + + return __vsc8584_macsec_add_txsa(ctx, flow, true); + } + + vsc8584_macsec_flow_enable(ctx->phydev, flow); + return 0; +} + +static int vsc8584_macsec_del_txsa(struct macsec_context *ctx) +{ + struct macsec_flow *flow; + + flow = vsc8584_macsec_find_flow(ctx, MACSEC_EGR); + + if (IS_ERR(flow)) + return PTR_ERR(flow); + if (ctx->prepare) + return 0; + + vsc8584_macsec_del_flow(ctx->phydev, flow); + return 0; +} + +static struct macsec_ops vsc8584_macsec_ops = { + .mdo_dev_open = vsc8584_macsec_dev_open, + .mdo_dev_stop = vsc8584_macsec_dev_stop, + .mdo_add_secy = vsc8584_macsec_add_secy, + .mdo_upd_secy = vsc8584_macsec_upd_secy, + .mdo_del_secy = vsc8584_macsec_del_secy, + .mdo_add_rxsc = vsc8584_macsec_add_rxsc, + .mdo_upd_rxsc = vsc8584_macsec_upd_rxsc, + .mdo_del_rxsc = vsc8584_macsec_del_rxsc, + .mdo_add_rxsa = vsc8584_macsec_add_rxsa, + .mdo_upd_rxsa = vsc8584_macsec_upd_rxsa, + .mdo_del_rxsa = vsc8584_macsec_del_rxsa, + .mdo_add_txsa = vsc8584_macsec_add_txsa, + .mdo_upd_txsa = vsc8584_macsec_upd_txsa, + .mdo_del_txsa = vsc8584_macsec_del_txsa, +}; +#endif /* CONFIG_MACSEC */ + /* Check if one PHY has already done the init of the parts common to all PHYs * in the Quad PHY package. */ @@ -1733,6 +2791,24 @@ static int vsc8584_config_init(struct phy_device *phydev) mutex_unlock(&phydev->mdio.bus->mdio_lock); +#if IS_ENABLED(CONFIG_MACSEC) + /* MACsec */ + switch (phydev->phy_id & phydev->drv->phy_id_mask) { + case PHY_ID_VSC856X: + case PHY_ID_VSC8575: + case PHY_ID_VSC8582: + case PHY_ID_VSC8584: + INIT_LIST_HEAD(&vsc8531->macsec_flows); + vsc8531->secy = NULL; + + phydev->macsec_ops = &vsc8584_macsec_ops; + + ret = vsc8584_macsec_init(phydev); + if (ret) + goto err; + } +#endif + phy_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD); val = phy_read(phydev, MSCC_PHY_EXT_PHY_CNTL_1); @@ -1758,6 +2834,43 @@ err: return ret; } +static int vsc8584_handle_interrupt(struct phy_device *phydev) +{ +#if IS_ENABLED(CONFIG_MACSEC) + struct vsc8531_private *priv = phydev->priv; + struct macsec_flow *flow, *tmp; + u32 cause, rec; + + /* Check MACsec PN rollover */ + cause = vsc8584_macsec_phy_read(phydev, MACSEC_EGR, + MSCC_MS_INTR_CTRL_STATUS); + cause &= MSCC_MS_INTR_CTRL_STATUS_INTR_CLR_STATUS_M; + if (!(cause & MACSEC_INTR_CTRL_STATUS_ROLLOVER)) + goto skip_rollover; + + rec = 6 + priv->secy->key_len / sizeof(u32); + list_for_each_entry_safe(flow, tmp, &priv->macsec_flows, list) { + u32 val; + + if (flow->bank != MACSEC_EGR || !flow->has_transformation) + continue; + + val = vsc8584_macsec_phy_read(phydev, MACSEC_EGR, + MSCC_MS_XFORM_REC(flow->index, rec)); + if (val == 0xffffffff) { + vsc8584_macsec_flow_disable(phydev, flow); + macsec_pn_wrapped(priv->secy, flow->tx_sa); + break; + } + } + +skip_rollover: +#endif + + phy_mac_interrupt(phydev); + return 0; +} + static int vsc85xx_config_init(struct phy_device *phydev) { int rc, i, phy_id; @@ -2201,6 +3314,20 @@ static int vsc85xx_config_intr(struct phy_device *phydev) int rc; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { +#if IS_ENABLED(CONFIG_MACSEC) + phy_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_EXTENDED_2); + phy_write(phydev, MSCC_PHY_EXTENDED_INT, + MSCC_PHY_EXTENDED_INT_MS_EGR); + phy_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_STANDARD); + + vsc8584_macsec_phy_write(phydev, MACSEC_EGR, + MSCC_MS_AIC_CTRL, 0xf); + vsc8584_macsec_phy_write(phydev, MACSEC_EGR, + MSCC_MS_INTR_CTRL_STATUS, + MSCC_MS_INTR_CTRL_STATUS_INTR_ENABLE(MACSEC_INTR_CTRL_STATUS_ROLLOVER)); +#endif rc = phy_write(phydev, MII_VSC85XX_INT_MASK, MII_VSC85XX_INT_MASK_MASK); } else { @@ -2550,6 +3677,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_aneg = &vsc85xx_config_aneg, .aneg_done = &genphy_aneg_done, .read_status = &vsc85xx_read_status, + .handle_interrupt = &vsc8584_handle_interrupt, .ack_interrupt = &vsc85xx_ack_interrupt, .config_intr = &vsc85xx_config_intr, .did_interrupt = &vsc8584_did_interrupt, @@ -2602,6 +3730,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_aneg = &vsc85xx_config_aneg, .aneg_done = &genphy_aneg_done, .read_status = &vsc85xx_read_status, + .handle_interrupt = &vsc8584_handle_interrupt, .ack_interrupt = &vsc85xx_ack_interrupt, .config_intr = &vsc85xx_config_intr, .did_interrupt = &vsc8584_did_interrupt, @@ -2626,6 +3755,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_aneg = &vsc85xx_config_aneg, .aneg_done = &genphy_aneg_done, .read_status = &vsc85xx_read_status, + .handle_interrupt = &vsc8584_handle_interrupt, .ack_interrupt = &vsc85xx_ack_interrupt, .config_intr = &vsc85xx_config_intr, .did_interrupt = &vsc8584_did_interrupt, @@ -2650,6 +3780,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_aneg = &vsc85xx_config_aneg, .aneg_done = &genphy_aneg_done, .read_status = &vsc85xx_read_status, + .handle_interrupt = &vsc8584_handle_interrupt, .ack_interrupt = &vsc85xx_ack_interrupt, .config_intr = &vsc85xx_config_intr, .did_interrupt = &vsc8584_did_interrupt, diff --git a/drivers/net/phy/mscc_fc_buffer.h b/drivers/net/phy/mscc_fc_buffer.h new file mode 100644 index 000000000000..7e9c0e877895 --- /dev/null +++ b/drivers/net/phy/mscc_fc_buffer.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * Copyright (C) 2019 Microsemi Corporation + */ + +#ifndef _MSCC_OCELOT_FC_BUFFER_H_ +#define _MSCC_OCELOT_FC_BUFFER_H_ + +#define MSCC_FCBUF_ENA_CFG 0x00 +#define MSCC_FCBUF_MODE_CFG 0x01 +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG 0x02 +#define MSCC_FCBUF_TX_CTRL_QUEUE_CFG 0x03 +#define MSCC_FCBUF_TX_DATA_QUEUE_CFG 0x04 +#define MSCC_FCBUF_RX_DATA_QUEUE_CFG 0x05 +#define MSCC_FCBUF_TX_BUFF_XON_XOFF_THRESH_CFG 0x06 +#define MSCC_FCBUF_FC_READ_THRESH_CFG 0x07 +#define MSCC_FCBUF_TX_FRM_GAP_COMP 0x08 + +#define MSCC_FCBUF_ENA_CFG_TX_ENA BIT(0) +#define MSCC_FCBUF_ENA_CFG_RX_ENA BIT(4) + +#define MSCC_FCBUF_MODE_CFG_DROP_BEHAVIOUR BIT(4) +#define MSCC_FCBUF_MODE_CFG_PAUSE_REACT_ENA BIT(8) +#define MSCC_FCBUF_MODE_CFG_RX_PPM_RATE_ADAPT_ENA BIT(12) +#define MSCC_FCBUF_MODE_CFG_TX_PPM_RATE_ADAPT_ENA BIT(16) +#define MSCC_FCBUF_MODE_CFG_TX_CTRL_QUEUE_ENA BIT(20) +#define MSCC_FCBUF_MODE_CFG_PAUSE_GEN_ENA BIT(24) +#define MSCC_FCBUF_MODE_CFG_INCLUDE_PAUSE_RCVD_IN_PAUSE_GEN BIT(28) + +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_THRESH(x) (x) +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_THRESH_M GENMASK(15, 0) +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_OFFSET(x) ((x) << 16) +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_TX_OFFSET_M GENMASK(19, 16) +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_RX_THRESH(x) ((x) << 20) +#define MSCC_FCBUF_PPM_RATE_ADAPT_THRESH_CFG_RX_THRESH_M GENMASK(31, 20) + +#define MSCC_FCBUF_TX_CTRL_QUEUE_CFG_START(x) (x) +#define MSCC_FCBUF_TX_CTRL_QUEUE_CFG_START_M GENMASK(15, 0) +#define MSCC_FCBUF_TX_CTRL_QUEUE_CFG_END(x) ((x) << 16) +#define MSCC_FCBUF_TX_CTRL_QUEUE_CFG_END_M GENMASK(31, 16) + +#define MSCC_FCBUF_TX_DATA_QUEUE_CFG_START(x) (x) +#define MSCC_FCBUF_TX_DATA_QUEUE_CFG_START_M GENMASK(15, 0) +#define MSCC_FCBUF_TX_DATA_QUEUE_CFG_END(x) ((x) << 16) +#define MSCC_FCBUF_TX_DATA_QUEUE_CFG_END_M GENMASK(31, 16) + +#define MSCC_FCBUF_RX_DATA_QUEUE_CFG_START(x) (x) +#define MSCC_FCBUF_RX_DATA_QUEUE_CFG_START_M GENMASK(15, 0) +#define MSCC_FCBUF_RX_DATA_QUEUE_CFG_END(x) ((x) << 16) +#define MSCC_FCBUF_RX_DATA_QUEUE_CFG_END_M GENMASK(31, 16) + +#define MSCC_FCBUF_TX_BUFF_XON_XOFF_THRESH_CFG_XOFF_THRESH(x) (x) +#define MSCC_FCBUF_TX_BUFF_XON_XOFF_THRESH_CFG_XOFF_THRESH_M GENMASK(15, 0) +#define MSCC_FCBUF_TX_BUFF_XON_XOFF_THRESH_CFG_XON_THRESH(x) ((x) << 16) +#define MSCC_FCBUF_TX_BUFF_XON_XOFF_THRESH_CFG_XON_THRESH_M GENMASK(31, 16) + +#define MSCC_FCBUF_FC_READ_THRESH_CFG_TX_THRESH(x) (x) +#define MSCC_FCBUF_FC_READ_THRESH_CFG_TX_THRESH_M GENMASK(15, 0) +#define MSCC_FCBUF_FC_READ_THRESH_CFG_RX_THRESH(x) ((x) << 16) +#define MSCC_FCBUF_FC_READ_THRESH_CFG_RX_THRESH_M GENMASK(31, 16) + +#endif diff --git a/drivers/net/phy/mscc_mac.h b/drivers/net/phy/mscc_mac.h new file mode 100644 index 000000000000..9420ee5175a6 --- /dev/null +++ b/drivers/net/phy/mscc_mac.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * Copyright (c) 2017 Microsemi Corporation + */ + +#ifndef _MSCC_OCELOT_LINE_MAC_H_ +#define _MSCC_OCELOT_LINE_MAC_H_ + +#define MSCC_MAC_CFG_ENA_CFG 0x00 +#define MSCC_MAC_CFG_MODE_CFG 0x01 +#define MSCC_MAC_CFG_MAXLEN_CFG 0x02 +#define MSCC_MAC_CFG_NUM_TAGS_CFG 0x03 +#define MSCC_MAC_CFG_TAGS_CFG 0x04 +#define MSCC_MAC_CFG_ADV_CHK_CFG 0x07 +#define MSCC_MAC_CFG_LFS_CFG 0x08 +#define MSCC_MAC_CFG_LB_CFG 0x09 +#define MSCC_MAC_CFG_PKTINF_CFG 0x0a +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL 0x0b +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_2 0x0c +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL 0x0d +#define MSCC_MAC_PAUSE_CFG_STATE 0x0e +#define MSCC_MAC_PAUSE_CFG_MAC_ADDRESS_LSB 0x0f +#define MSCC_MAC_PAUSE_CFG_MAC_ADDRESS_MSB 0x10 +#define MSCC_MAC_STATUS_RX_LANE_STICKY_0 0x11 +#define MSCC_MAC_STATUS_RX_LANE_STICKY_1 0x12 +#define MSCC_MAC_STATUS_TX_MONITOR_STICKY 0x13 +#define MSCC_MAC_STATUS_TX_MONITOR_STICKY_MASK 0x14 +#define MSCC_MAC_STATUS_STICKY 0x15 +#define MSCC_MAC_STATUS_STICKY_MASK 0x16 +#define MSCC_MAC_STATS_32BIT_RX_HIH_CKSM_ERR_CNT 0x17 +#define MSCC_MAC_STATS_32BIT_RX_XGMII_PROT_ERR_CNT 0x18 +#define MSCC_MAC_STATS_32BIT_RX_SYMBOL_ERR_CNT 0x19 +#define MSCC_MAC_STATS_32BIT_RX_PAUSE_CNT 0x1a +#define MSCC_MAC_STATS_32BIT_RX_UNSUP_OPCODE_CNT 0x1b +#define MSCC_MAC_STATS_32BIT_RX_UC_CNT 0x1c +#define MSCC_MAC_STATS_32BIT_RX_MC_CNT 0x1d +#define MSCC_MAC_STATS_32BIT_RX_BC_CNT 0x1e +#define MSCC_MAC_STATS_32BIT_RX_CRC_ERR_CNT 0x1f +#define MSCC_MAC_STATS_32BIT_RX_UNDERSIZE_CNT 0x20 +#define MSCC_MAC_STATS_32BIT_RX_FRAGMENTS_CNT 0x21 +#define MSCC_MAC_STATS_32BIT_RX_IN_RANGE_LEN_ERR_CNT 0x22 +#define MSCC_MAC_STATS_32BIT_RX_OUT_OF_RANGE_LEN_ERR_CNT 0x23 +#define MSCC_MAC_STATS_32BIT_RX_OVERSIZE_CNT 0x24 +#define MSCC_MAC_STATS_32BIT_RX_JABBERS_CNT 0x25 +#define MSCC_MAC_STATS_32BIT_RX_SIZE64_CNT 0x26 +#define MSCC_MAC_STATS_32BIT_RX_SIZE65TO127_CNT 0x27 +#define MSCC_MAC_STATS_32BIT_RX_SIZE128TO255_CNT 0x28 +#define MSCC_MAC_STATS_32BIT_RX_SIZE256TO511_CNT 0x29 +#define MSCC_MAC_STATS_32BIT_RX_SIZE512TO1023_CNT 0x2a +#define MSCC_MAC_STATS_32BIT_RX_SIZE1024TO1518_CNT 0x2b +#define MSCC_MAC_STATS_32BIT_RX_SIZE1519TOMAX_CNT 0x2c +#define MSCC_MAC_STATS_32BIT_RX_IPG_SHRINK_CNT 0x2d +#define MSCC_MAC_STATS_32BIT_TX_PAUSE_CNT 0x2e +#define MSCC_MAC_STATS_32BIT_TX_UC_CNT 0x2f +#define MSCC_MAC_STATS_32BIT_TX_MC_CNT 0x30 +#define MSCC_MAC_STATS_32BIT_TX_BC_CNT 0x31 +#define MSCC_MAC_STATS_32BIT_TX_SIZE64_CNT 0x32 +#define MSCC_MAC_STATS_32BIT_TX_SIZE65TO127_CNT 0x33 +#define MSCC_MAC_STATS_32BIT_TX_SIZE128TO255_CNT 0x34 +#define MSCC_MAC_STATS_32BIT_TX_SIZE256TO511_CNT 0x35 +#define MSCC_MAC_STATS_32BIT_TX_SIZE512TO1023_CNT 0x36 +#define MSCC_MAC_STATS_32BIT_TX_SIZE1024TO1518_CNT 0x37 +#define MSCC_MAC_STATS_32BIT_TX_SIZE1519TOMAX_CNT 0x38 +#define MSCC_MAC_STATS_40BIT_RX_BAD_BYTES_CNT 0x39 +#define MSCC_MAC_STATS_40BIT_RX_BAD_BYTES_MSB_CNT 0x3a +#define MSCC_MAC_STATS_40BIT_RX_OK_BYTES_CNT 0x3b +#define MSCC_MAC_STATS_40BIT_RX_OK_BYTES_MSB_CNT 0x3c +#define MSCC_MAC_STATS_40BIT_RX_IN_BYTES_CNT 0x3d +#define MSCC_MAC_STATS_40BIT_RX_IN_BYTES_MSB_CNT 0x3e +#define MSCC_MAC_STATS_40BIT_TX_OK_BYTES_CNT 0x3f +#define MSCC_MAC_STATS_40BIT_TX_OK_BYTES_MSB_CNT 0x40 +#define MSCC_MAC_STATS_40BIT_TX_OUT_BYTES_CNT 0x41 +#define MSCC_MAC_STATS_40BIT_TX_OUT_BYTES_MSB_CNT 0x42 + +#define MSCC_MAC_CFG_ENA_CFG_RX_CLK_ENA BIT(0) +#define MSCC_MAC_CFG_ENA_CFG_TX_CLK_ENA BIT(4) +#define MSCC_MAC_CFG_ENA_CFG_RX_SW_RST BIT(8) +#define MSCC_MAC_CFG_ENA_CFG_TX_SW_RST BIT(12) +#define MSCC_MAC_CFG_ENA_CFG_RX_ENA BIT(16) +#define MSCC_MAC_CFG_ENA_CFG_TX_ENA BIT(20) + +#define MSCC_MAC_CFG_MODE_CFG_FORCE_CW_UPDATE_INTERVAL(x) ((x) << 20) +#define MSCC_MAC_CFG_MODE_CFG_FORCE_CW_UPDATE_INTERVAL_M GENMASK(29, 20) +#define MSCC_MAC_CFG_MODE_CFG_FORCE_CW_UPDATE BIT(16) +#define MSCC_MAC_CFG_MODE_CFG_TUNNEL_PAUSE_FRAMES BIT(14) +#define MSCC_MAC_CFG_MODE_CFG_MAC_PREAMBLE_CFG(x) ((x) << 10) +#define MSCC_MAC_CFG_MODE_CFG_MAC_PREAMBLE_CFG_M GENMASK(12, 10) +#define MSCC_MAC_CFG_MODE_CFG_MAC_IPG_CFG BIT(6) +#define MSCC_MAC_CFG_MODE_CFG_XGMII_GEN_MODE_ENA BIT(4) +#define MSCC_MAC_CFG_MODE_CFG_HIH_CRC_CHECK BIT(2) +#define MSCC_MAC_CFG_MODE_CFG_UNDERSIZED_FRAME_DROP_DIS BIT(1) +#define MSCC_MAC_CFG_MODE_CFG_DISABLE_DIC BIT(0) + +#define MSCC_MAC_CFG_MAXLEN_CFG_MAX_LEN_TAG_CHK BIT(16) +#define MSCC_MAC_CFG_MAXLEN_CFG_MAX_LEN(x) (x) +#define MSCC_MAC_CFG_MAXLEN_CFG_MAX_LEN_M GENMASK(15, 0) + +#define MSCC_MAC_CFG_TAGS_CFG_RSZ 0x4 +#define MSCC_MAC_CFG_TAGS_CFG_TAG_ID(x) ((x) << 16) +#define MSCC_MAC_CFG_TAGS_CFG_TAG_ID_M GENMASK(31, 16) +#define MSCC_MAC_CFG_TAGS_CFG_TAG_ENA BIT(4) + +#define MSCC_MAC_CFG_ADV_CHK_CFG_EXT_EOP_CHK_ENA BIT(24) +#define MSCC_MAC_CFG_ADV_CHK_CFG_EXT_SOP_CHK_ENA BIT(20) +#define MSCC_MAC_CFG_ADV_CHK_CFG_SFD_CHK_ENA BIT(16) +#define MSCC_MAC_CFG_ADV_CHK_CFG_PRM_SHK_CHK_DIS BIT(12) +#define MSCC_MAC_CFG_ADV_CHK_CFG_PRM_CHK_ENA BIT(8) +#define MSCC_MAC_CFG_ADV_CHK_CFG_OOR_ERR_ENA BIT(4) +#define MSCC_MAC_CFG_ADV_CHK_CFG_INR_ERR_ENA BIT(0) + +#define MSCC_MAC_CFG_LFS_CFG_LFS_INH_TX BIT(8) +#define MSCC_MAC_CFG_LFS_CFG_LFS_DIS_TX BIT(4) +#define MSCC_MAC_CFG_LFS_CFG_LFS_UNIDIR_ENA BIT(3) +#define MSCC_MAC_CFG_LFS_CFG_USE_LEADING_EDGE_DETECT BIT(2) +#define MSCC_MAC_CFG_LFS_CFG_SPURIOUS_Q_DIS BIT(1) +#define MSCC_MAC_CFG_LFS_CFG_LFS_MODE_ENA BIT(0) + +#define MSCC_MAC_CFG_LB_CFG_XGMII_HOST_LB_ENA BIT(4) +#define MSCC_MAC_CFG_LB_CFG_XGMII_PHY_LB_ENA BIT(0) + +#define MSCC_MAC_CFG_PKTINF_CFG_STRIP_FCS_ENA BIT(0) +#define MSCC_MAC_CFG_PKTINF_CFG_INSERT_FCS_ENA BIT(4) +#define MSCC_MAC_CFG_PKTINF_CFG_STRIP_PREAMBLE_ENA BIT(8) +#define MSCC_MAC_CFG_PKTINF_CFG_INSERT_PREAMBLE_ENA BIT(12) +#define MSCC_MAC_CFG_PKTINF_CFG_LPI_RELAY_ENA BIT(16) +#define MSCC_MAC_CFG_PKTINF_CFG_LF_RELAY_ENA BIT(20) +#define MSCC_MAC_CFG_PKTINF_CFG_RF_RELAY_ENA BIT(24) +#define MSCC_MAC_CFG_PKTINF_CFG_ENABLE_TX_PADDING BIT(25) +#define MSCC_MAC_CFG_PKTINF_CFG_ENABLE_RX_PADDING BIT(26) +#define MSCC_MAC_CFG_PKTINF_CFG_ENABLE_4BYTE_PREAMBLE BIT(27) +#define MSCC_MAC_CFG_PKTINF_CFG_MACSEC_BYPASS_NUM_PTP_STALL_CLKS(x) ((x) << 28) +#define MSCC_MAC_CFG_PKTINF_CFG_MACSEC_BYPASS_NUM_PTP_STALL_CLKS_M GENMASK(30, 28) + +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_VALUE(x) ((x) << 16) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_VALUE_M GENMASK(31, 16) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_WAIT_FOR_LPI_LOW BIT(12) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_USE_PAUSE_STALL_ENA BIT(8) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_REPL_MODE BIT(4) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_FRC_FRAME BIT(2) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_MODE(x) (x) +#define MSCC_MAC_PAUSE_CFG_TX_FRAME_CTRL_PAUSE_MODE_M GENMASK(1, 0) + +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_EARLY_PAUSE_DETECT_ENA BIT(16) +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PRE_CRC_MODE BIT(20) +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_TIMER_ENA BIT(12) +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_REACT_ENA BIT(8) +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_FRAME_DROP_ENA BIT(4) +#define MSCC_MAC_PAUSE_CFG_RX_FRAME_CTRL_PAUSE_MODE BIT(0) + +#define MSCC_MAC_PAUSE_CFG_STATE_PAUSE_STATE BIT(0) +#define MSCC_MAC_PAUSE_CFG_STATE_MAC_TX_PAUSE_GEN BIT(4) + +#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL 0x2 +#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE(x) (x) +#define MSCC_PROC_0_IP_1588_TOP_CFG_STAT_MODE_CTL_PROTOCOL_MODE_M GENMASK(2, 0) + +#endif /* _MSCC_OCELOT_LINE_MAC_H_ */ diff --git a/drivers/net/phy/mscc_macsec.h b/drivers/net/phy/mscc_macsec.h new file mode 100644 index 000000000000..d9ab6aba7482 --- /dev/null +++ b/drivers/net/phy/mscc_macsec.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * Copyright (c) 2018 Microsemi Corporation + */ + +#ifndef _MSCC_OCELOT_MACSEC_H_ +#define _MSCC_OCELOT_MACSEC_H_ + +#define MSCC_MS_MAX_FLOWS 16 + +#define CONTROL_TYPE_EGRESS 0x6 +#define CONTROL_TYPE_INGRESS 0xf +#define CONTROL_IV0 BIT(5) +#define CONTROL_IV1 BIT(6) +#define CONTROL_IV2 BIT(7) +#define CONTROL_UPDATE_SEQ BIT(13) +#define CONTROL_IV_IN_SEQ BIT(14) +#define CONTROL_ENCRYPT_AUTH BIT(15) +#define CONTROL_KEY_IN_CTX BIT(16) +#define CONTROL_CRYPTO_ALG(x) ((x) << 17) +#define CTRYPTO_ALG_AES_CTR_128 0x5 +#define CTRYPTO_ALG_AES_CTR_192 0x6 +#define CTRYPTO_ALG_AES_CTR_256 0x7 +#define CONTROL_DIGEST_TYPE(x) ((x) << 21) +#define CONTROL_AUTH_ALG(x) ((x) << 23) +#define AUTH_ALG_AES_GHAS 0x4 +#define CONTROL_AN(x) ((x) << 26) +#define CONTROL_SEQ_TYPE(x) ((x) << 28) +#define CONTROL_SEQ_MASK BIT(30) +#define CONTROL_CONTEXT_ID BIT(31) + +enum mscc_macsec_destination_ports { + MSCC_MS_PORT_COMMON = 0, + MSCC_MS_PORT_RSVD = 1, + MSCC_MS_PORT_CONTROLLED = 2, + MSCC_MS_PORT_UNCONTROLLED = 3, +}; + +enum mscc_macsec_drop_actions { + MSCC_MS_ACTION_BYPASS_CRC = 0, + MSCC_MS_ACTION_BYPASS_BAD = 1, + MSCC_MS_ACTION_DROP = 2, + MSCC_MS_ACTION_BYPASS = 3, +}; + +enum mscc_macsec_flow_types { + MSCC_MS_FLOW_BYPASS = 0, + MSCC_MS_FLOW_DROP = 1, + MSCC_MS_FLOW_INGRESS = 2, + MSCC_MS_FLOW_EGRESS = 3, +}; + +enum mscc_macsec_validate_levels { + MSCC_MS_VALIDATE_DISABLED = 0, + MSCC_MS_VALIDATE_CHECK = 1, + MSCC_MS_VALIDATE_STRICT = 2, +}; + +#define MSCC_MS_XFORM_REC(x, y) (((x) << 5) + (y)) +#define MSCC_MS_ENA_CFG 0x800 +#define MSCC_MS_FC_CFG 0x804 +#define MSCC_MS_SAM_MAC_SA_MATCH_LO(x) (0x1000 + ((x) << 4)) +#define MSCC_MS_SAM_MAC_SA_MATCH_HI(x) (0x1001 + ((x) << 4)) +#define MSCC_MS_SAM_MISC_MATCH(x) (0x1004 + ((x) << 4)) +#define MSCC_MS_SAM_MATCH_SCI_LO(x) (0x1005 + ((x) << 4)) +#define MSCC_MS_SAM_MATCH_SCI_HI(x) (0x1006 + ((x) << 4)) +#define MSCC_MS_SAM_MASK(x) (0x1007 + ((x) << 4)) +#define MSCC_MS_SAM_ENTRY_SET1 0x1808 +#define MSCC_MS_SAM_ENTRY_CLEAR1 0x180c +#define MSCC_MS_SAM_FLOW_CTRL(x) (0x1c00 + (x)) +#define MSCC_MS_SAM_CP_TAG 0x1e40 +#define MSCC_MS_SAM_NM_FLOW_NCP 0x1e51 +#define MSCC_MS_SAM_NM_FLOW_CP 0x1e52 +#define MSCC_MS_MISC_CONTROL 0x1e5f +#define MSCC_MS_COUNT_CONTROL 0x3204 +#define MSCC_MS_PARAMS2_IG_CC_CONTROL 0x3a10 +#define MSCC_MS_PARAMS2_IG_CP_TAG 0x3a14 +#define MSCC_MS_VLAN_MTU_CHECK(x) (0x3c40 + (x)) +#define MSCC_MS_NON_VLAN_MTU_CHECK 0x3c48 +#define MSCC_MS_PP_CTRL 0x3c4b +#define MSCC_MS_STATUS_CONTEXT_CTRL 0x3d02 +#define MSCC_MS_INTR_CTRL_STATUS 0x3d04 +#define MSCC_MS_BLOCK_CTX_UPDATE 0x3d0c +#define MSCC_MS_AIC_CTRL 0x3e02 + +/* MACSEC_ENA_CFG */ +#define MSCC_MS_ENA_CFG_CLK_ENA BIT(0) +#define MSCC_MS_ENA_CFG_SW_RST BIT(1) +#define MSCC_MS_ENA_CFG_MACSEC_BYPASS_ENA BIT(8) +#define MSCC_MS_ENA_CFG_MACSEC_ENA BIT(9) +#define MSCC_MS_ENA_CFG_MACSEC_SPEED_MODE(x) ((x) << 10) +#define MSCC_MS_ENA_CFG_MACSEC_SPEED_MODE_M GENMASK(12, 10) + +/* MACSEC_FC_CFG */ +#define MSCC_MS_FC_CFG_FCBUF_ENA BIT(0) +#define MSCC_MS_FC_CFG_USE_PKT_EXPANSION_INDICATION BIT(1) +#define MSCC_MS_FC_CFG_LOW_THRESH(x) ((x) << 4) +#define MSCC_MS_FC_CFG_LOW_THRESH_M GENMASK(7, 4) +#define MSCC_MS_FC_CFG_HIGH_THRESH(x) ((x) << 8) +#define MSCC_MS_FC_CFG_HIGH_THRESH_M GENMASK(11, 8) +#define MSCC_MS_FC_CFG_LOW_BYTES_VAL(x) ((x) << 12) +#define MSCC_MS_FC_CFG_LOW_BYTES_VAL_M GENMASK(14, 12) +#define MSCC_MS_FC_CFG_HIGH_BYTES_VAL(x) ((x) << 16) +#define MSCC_MS_FC_CFG_HIGH_BYTES_VAL_M GENMASK(18, 16) + +/* MSCC_MS_SAM_MAC_SA_MATCH_HI */ +#define MSCC_MS_SAM_MAC_SA_MATCH_HI_ETYPE(x) ((x) << 16) +#define MSCC_MS_SAM_MAC_SA_MATCH_HI_ETYPE_M GENMASK(31, 16) + +/* MACSEC_SAM_MISC_MATCH */ +#define MSCC_MS_SAM_MISC_MATCH_VLAN_VALID BIT(0) +#define MSCC_MS_SAM_MISC_MATCH_QINQ_FOUND BIT(1) +#define MSCC_MS_SAM_MISC_MATCH_STAG_VALID BIT(2) +#define MSCC_MS_SAM_MISC_MATCH_QTAG_VALID BIT(3) +#define MSCC_MS_SAM_MISC_MATCH_VLAN_UP(x) ((x) << 4) +#define MSCC_MS_SAM_MISC_MATCH_VLAN_UP_M GENMASK(6, 4) +#define MSCC_MS_SAM_MISC_MATCH_CONTROL_PACKET BIT(7) +#define MSCC_MS_SAM_MISC_MATCH_UNTAGGED BIT(8) +#define MSCC_MS_SAM_MISC_MATCH_TAGGED BIT(9) +#define MSCC_MS_SAM_MISC_MATCH_BAD_TAG BIT(10) +#define MSCC_MS_SAM_MISC_MATCH_KAY_TAG BIT(11) +#define MSCC_MS_SAM_MISC_MATCH_SOURCE_PORT(x) ((x) << 12) +#define MSCC_MS_SAM_MISC_MATCH_SOURCE_PORT_M GENMASK(13, 12) +#define MSCC_MS_SAM_MISC_MATCH_PRIORITY(x) ((x) << 16) +#define MSCC_MS_SAM_MISC_MATCH_PRIORITY_M GENMASK(19, 16) +#define MSCC_MS_SAM_MISC_MATCH_AN(x) ((x) << 24) +#define MSCC_MS_SAM_MISC_MATCH_TCI(x) ((x) << 26) + +/* MACSEC_SAM_MASK */ +#define MSCC_MS_SAM_MASK_MAC_SA_MASK(x) (x) +#define MSCC_MS_SAM_MASK_MAC_SA_MASK_M GENMASK(5, 0) +#define MSCC_MS_SAM_MASK_MAC_DA_MASK(x) ((x) << 6) +#define MSCC_MS_SAM_MASK_MAC_DA_MASK_M GENMASK(11, 6) +#define MSCC_MS_SAM_MASK_MAC_ETYPE_MASK BIT(12) +#define MSCC_MS_SAM_MASK_VLAN_VLD_MASK BIT(13) +#define MSCC_MS_SAM_MASK_QINQ_FOUND_MASK BIT(14) +#define MSCC_MS_SAM_MASK_STAG_VLD_MASK BIT(15) +#define MSCC_MS_SAM_MASK_QTAG_VLD_MASK BIT(16) +#define MSCC_MS_SAM_MASK_VLAN_UP_MASK BIT(17) +#define MSCC_MS_SAM_MASK_VLAN_ID_MASK BIT(18) +#define MSCC_MS_SAM_MASK_SOURCE_PORT_MASK BIT(19) +#define MSCC_MS_SAM_MASK_CTL_PACKET_MASK BIT(20) +#define MSCC_MS_SAM_MASK_VLAN_UP_INNER_MASK BIT(21) +#define MSCC_MS_SAM_MASK_VLAN_ID_INNER_MASK BIT(22) +#define MSCC_MS_SAM_MASK_SCI_MASK BIT(23) +#define MSCC_MS_SAM_MASK_AN_MASK(x) ((x) << 24) +#define MSCC_MS_SAM_MASK_TCI_MASK(x) ((x) << 26) + +/* MACSEC_SAM_FLOW_CTRL_EGR */ +#define MSCC_MS_SAM_FLOW_CTRL_FLOW_TYPE(x) (x) +#define MSCC_MS_SAM_FLOW_CTRL_FLOW_TYPE_M GENMASK(1, 0) +#define MSCC_MS_SAM_FLOW_CTRL_DEST_PORT(x) ((x) << 2) +#define MSCC_MS_SAM_FLOW_CTRL_DEST_PORT_M GENMASK(3, 2) +#define MSCC_MS_SAM_FLOW_CTRL_RESV_4 BIT(4) +#define MSCC_MS_SAM_FLOW_CTRL_FLOW_CRYPT_AUTH BIT(5) +#define MSCC_MS_SAM_FLOW_CTRL_DROP_ACTION(x) ((x) << 6) +#define MSCC_MS_SAM_FLOW_CTRL_DROP_ACTION_M GENMASK(7, 6) +#define MSCC_MS_SAM_FLOW_CTRL_RESV_15_TO_8(x) ((x) << 8) +#define MSCC_MS_SAM_FLOW_CTRL_RESV_15_TO_8_M GENMASK(15, 8) +#define MSCC_MS_SAM_FLOW_CTRL_PROTECT_FRAME BIT(16) +#define MSCC_MS_SAM_FLOW_CTRL_REPLAY_PROTECT BIT(16) +#define MSCC_MS_SAM_FLOW_CTRL_SA_IN_USE BIT(17) +#define MSCC_MS_SAM_FLOW_CTRL_INCLUDE_SCI BIT(18) +#define MSCC_MS_SAM_FLOW_CTRL_USE_ES BIT(19) +#define MSCC_MS_SAM_FLOW_CTRL_USE_SCB BIT(20) +#define MSCC_MS_SAM_FLOW_CTRL_VALIDATE_FRAMES(x) ((x) << 19) +#define MSCC_MS_SAM_FLOW_CTRL_TAG_BYPASS_SIZE(x) ((x) << 21) +#define MSCC_MS_SAM_FLOW_CTRL_TAG_BYPASS_SIZE_M GENMASK(22, 21) +#define MSCC_MS_SAM_FLOW_CTRL_RESV_23 BIT(23) +#define MSCC_MS_SAM_FLOW_CTRL_CONFIDENTIALITY_OFFSET(x) ((x) << 24) +#define MSCC_MS_SAM_FLOW_CTRL_CONFIDENTIALITY_OFFSET_M GENMASK(30, 24) +#define MSCC_MS_SAM_FLOW_CTRL_CONF_PROTECT BIT(31) + +/* MACSEC_SAM_CP_TAG */ +#define MSCC_MS_SAM_CP_TAG_MAP_TBL(x) (x) +#define MSCC_MS_SAM_CP_TAG_MAP_TBL_M GENMASK(23, 0) +#define MSCC_MS_SAM_CP_TAG_DEF_UP(x) ((x) << 24) +#define MSCC_MS_SAM_CP_TAG_DEF_UP_M GENMASK(26, 24) +#define MSCC_MS_SAM_CP_TAG_STAG_UP_EN BIT(27) +#define MSCC_MS_SAM_CP_TAG_QTAG_UP_EN BIT(28) +#define MSCC_MS_SAM_CP_TAG_PARSE_QINQ BIT(29) +#define MSCC_MS_SAM_CP_TAG_PARSE_STAG BIT(30) +#define MSCC_MS_SAM_CP_TAG_PARSE_QTAG BIT(31) + +/* MACSEC_SAM_NM_FLOW_NCP */ +#define MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_FLOW_TYPE(x) (x) +#define MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_DEST_PORT(x) ((x) << 2) +#define MSCC_MS_SAM_NM_FLOW_NCP_UNTAGGED_DROP_ACTION(x) ((x) << 6) +#define MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_FLOW_TYPE(x) ((x) << 8) +#define MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_DEST_PORT(x) ((x) << 10) +#define MSCC_MS_SAM_NM_FLOW_NCP_TAGGED_DROP_ACTION(x) ((x) << 14) +#define MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_FLOW_TYPE(x) ((x) << 16) +#define MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_DEST_PORT(x) ((x) << 18) +#define MSCC_MS_SAM_NM_FLOW_NCP_BADTAG_DROP_ACTION(x) ((x) << 22) +#define MSCC_MS_SAM_NM_FLOW_NCP_KAY_FLOW_TYPE(x) ((x) << 24) +#define MSCC_MS_SAM_NM_FLOW_NCP_KAY_DEST_PORT(x) ((x) << 26) +#define MSCC_MS_SAM_NM_FLOW_NCP_KAY_DROP_ACTION(x) ((x) << 30) + +/* MACSEC_SAM_NM_FLOW_CP */ +#define MSCC_MS_SAM_NM_FLOW_CP_UNTAGGED_FLOW_TYPE(x) (x) +#define MSCC_MS_SAM_NM_FLOW_CP_UNTAGGED_DEST_PORT(x) ((x) << 2) +#define MSCC_MS_SAM_NM_FLOW_CP_UNTAGGED_DROP_ACTION(x) ((x) << 6) +#define MSCC_MS_SAM_NM_FLOW_CP_TAGGED_FLOW_TYPE(x) ((x) << 8) +#define MSCC_MS_SAM_NM_FLOW_CP_TAGGED_DEST_PORT(x) ((x) << 10) +#define MSCC_MS_SAM_NM_FLOW_CP_TAGGED_DROP_ACTION(x) ((x) << 14) +#define MSCC_MS_SAM_NM_FLOW_CP_BADTAG_FLOW_TYPE(x) ((x) << 16) +#define MSCC_MS_SAM_NM_FLOW_CP_BADTAG_DEST_PORT(x) ((x) << 18) +#define MSCC_MS_SAM_NM_FLOW_CP_BADTAG_DROP_ACTION(x) ((x) << 22) +#define MSCC_MS_SAM_NM_FLOW_CP_KAY_FLOW_TYPE(x) ((x) << 24) +#define MSCC_MS_SAM_NM_FLOW_CP_KAY_DEST_PORT(x) ((x) << 26) +#define MSCC_MS_SAM_NM_FLOW_CP_KAY_DROP_ACTION(x) ((x) << 30) + +/* MACSEC_MISC_CONTROL */ +#define MSCC_MS_MISC_CONTROL_MC_LATENCY_FIX(x) (x) +#define MSCC_MS_MISC_CONTROL_MC_LATENCY_FIX_M GENMASK(5, 0) +#define MSCC_MS_MISC_CONTROL_STATIC_BYPASS BIT(8) +#define MSCC_MS_MISC_CONTROL_NM_MACSEC_EN BIT(9) +#define MSCC_MS_MISC_CONTROL_VALIDATE_FRAMES(x) ((x) << 10) +#define MSCC_MS_MISC_CONTROL_VALIDATE_FRAMES_M GENMASK(11, 10) +#define MSCC_MS_MISC_CONTROL_XFORM_REC_SIZE(x) ((x) << 24) +#define MSCC_MS_MISC_CONTROL_XFORM_REC_SIZE_M GENMASK(25, 24) + +/* MACSEC_COUNT_CONTROL */ +#define MSCC_MS_COUNT_CONTROL_RESET_ALL BIT(0) +#define MSCC_MS_COUNT_CONTROL_DEBUG_ACCESS BIT(1) +#define MSCC_MS_COUNT_CONTROL_SATURATE_CNTRS BIT(2) +#define MSCC_MS_COUNT_CONTROL_AUTO_CNTR_RESET BIT(3) + +/* MACSEC_PARAMS2_IG_CC_CONTROL */ +#define MSCC_MS_PARAMS2_IG_CC_CONTROL_NON_MATCH_CTRL_ACT BIT(14) +#define MSCC_MS_PARAMS2_IG_CC_CONTROL_NON_MATCH_ACT BIT(15) + +/* MACSEC_PARAMS2_IG_CP_TAG */ +#define MSCC_MS_PARAMS2_IG_CP_TAG_MAP_TBL(x) (x) +#define MSCC_MS_PARAMS2_IG_CP_TAG_MAP_TBL_M GENMASK(23, 0) +#define MSCC_MS_PARAMS2_IG_CP_TAG_DEF_UP(x) ((x) << 24) +#define MSCC_MS_PARAMS2_IG_CP_TAG_DEF_UP_M GENMASK(26, 24) +#define MSCC_MS_PARAMS2_IG_CP_TAG_STAG_UP_EN BIT(27) +#define MSCC_MS_PARAMS2_IG_CP_TAG_QTAG_UP_EN BIT(28) +#define MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_QINQ BIT(29) +#define MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_STAG BIT(30) +#define MSCC_MS_PARAMS2_IG_CP_TAG_PARSE_QTAG BIT(31) + +/* MACSEC_VLAN_MTU_CHECK */ +#define MSCC_MS_VLAN_MTU_CHECK_MTU_COMPARE(x) (x) +#define MSCC_MS_VLAN_MTU_CHECK_MTU_COMPARE_M GENMASK(14, 0) +#define MSCC_MS_VLAN_MTU_CHECK_MTU_COMP_DROP BIT(15) + +/* MACSEC_NON_VLAN_MTU_CHECK */ +#define MSCC_MS_NON_VLAN_MTU_CHECK_NV_MTU_COMPARE(x) (x) +#define MSCC_MS_NON_VLAN_MTU_CHECK_NV_MTU_COMPARE_M GENMASK(14, 0) +#define MSCC_MS_NON_VLAN_MTU_CHECK_NV_MTU_COMP_DROP BIT(15) + +/* MACSEC_PP_CTRL */ +#define MSCC_MS_PP_CTRL_MACSEC_OCTET_INCR_MODE BIT(0) + +/* MACSEC_INTR_CTRL_STATUS */ +#define MSCC_MS_INTR_CTRL_STATUS_INTR_CLR_STATUS(x) (x) +#define MSCC_MS_INTR_CTRL_STATUS_INTR_CLR_STATUS_M GENMASK(15, 0) +#define MSCC_MS_INTR_CTRL_STATUS_INTR_ENABLE(x) ((x) << 16) +#define MSCC_MS_INTR_CTRL_STATUS_INTR_ENABLE_M GENMASK(31, 16) +#define MACSEC_INTR_CTRL_STATUS_ROLLOVER BIT(5) + +#endif diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 769a076514b0..a4d2d59fceca 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -387,7 +387,7 @@ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) if (regnum > (u16)~0 || devad > 32) return -EINVAL; - if (phydev->drv->read_mmd) { + if (phydev->drv && phydev->drv->read_mmd) { val = phydev->drv->read_mmd(phydev, devad, regnum); } else if (phydev->is_c45) { u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); @@ -444,7 +444,7 @@ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) if (regnum > (u16)~0 || devad > 32) return -EINVAL; - if (phydev->drv->write_mmd) { + if (phydev->drv && phydev->drv->write_mmd) { ret = phydev->drv->write_mmd(phydev, devad, regnum, val); } else if (phydev->is_c45) { u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 541ed01496bf..d76e038cf2cb 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -432,6 +432,31 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) } EXPORT_SYMBOL(phy_mii_ioctl); +/** + * phy_do_ioctl - generic ndo_do_ioctl implementation + * @dev: the net_device struct + * @ifr: &struct ifreq for socket ioctl's + * @cmd: ioctl cmd to execute + */ +int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + if (!dev->phydev) + return -ENODEV; + + return phy_mii_ioctl(dev->phydev, ifr, cmd); +} +EXPORT_SYMBOL(phy_do_ioctl); + +/* same as phy_do_ioctl, but ensures that net_device is running */ +int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + if (!netif_running(dev)) + return -ENODEV; + + return phy_do_ioctl(dev, ifr, cmd); +} +EXPORT_SYMBOL(phy_do_ioctl_running); + void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies) { mod_delayed_work(system_power_efficient_wq, &phydev->state_queue, diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index e5dc9f87f495..6a5056e0ae77 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1107,9 +1107,8 @@ void phy_attached_info(struct phy_device *phydev) EXPORT_SYMBOL(phy_attached_info); #define ATTACHED_FMT "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%s)" -void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) +char *phy_attached_info_irq(struct phy_device *phydev) { - const char *drv_name = phydev->drv ? phydev->drv->name : "unbound"; char *irq_str; char irq_num[8]; @@ -1126,6 +1125,14 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) break; } + return kasprintf(GFP_KERNEL, "%s", irq_str); +} +EXPORT_SYMBOL(phy_attached_info_irq); + +void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) +{ + const char *drv_name = phydev->drv ? phydev->drv->name : "unbound"; + char *irq_str = phy_attached_info_irq(phydev); if (!fmt) { phydev_info(phydev, ATTACHED_FMT "\n", @@ -1142,6 +1149,7 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) vprintk(fmt, ap); va_end(ap); } + kfree(irq_str); } EXPORT_SYMBOL(phy_attached_print); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index ba9468cc8e13..70b9a143db84 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -186,8 +186,8 @@ static int phylink_parse_fixedlink(struct phylink *pl, pl->link_config.pause |= MLO_PAUSE_ASYM; if (ret == 0) { - desc = fwnode_get_named_gpiod(fixed_node, "link-gpios", - 0, GPIOD_IN, "?"); + desc = fwnode_gpiod_get_index(fixed_node, "link", 0, + GPIOD_IN, "?"); if (!IS_ERR(desc)) pl->link_gpio = desc; @@ -281,6 +281,7 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode) switch (pl->link_config.interface) { case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: phylink_set(pl->supported, 10baseT_Half); phylink_set(pl->supported, 10baseT_Full); phylink_set(pl->supported, 100baseT_Half); @@ -297,7 +298,9 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode) phylink_set(pl->supported, 2500baseX_Full); break; + case PHY_INTERFACE_MODE_USXGMII: case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: phylink_set(pl->supported, 10baseT_Half); phylink_set(pl->supported, 10baseT_Full); phylink_set(pl->supported, 100baseT_Half); @@ -305,6 +308,10 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode) phylink_set(pl->supported, 1000baseT_Half); phylink_set(pl->supported, 1000baseT_Full); phylink_set(pl->supported, 1000baseX_Full); + phylink_set(pl->supported, 2500baseT_Full); + phylink_set(pl->supported, 2500baseX_Full); + phylink_set(pl->supported, 5000baseT_Full); + phylink_set(pl->supported, 10000baseT_Full); phylink_set(pl->supported, 10000baseKR_Full); phylink_set(pl->supported, 10000baseCR_Full); phylink_set(pl->supported, 10000baseSR_Full); @@ -570,6 +577,9 @@ static int phylink_register_sfp(struct phylink *pl, struct sfp_bus *bus; int ret; + if (!fwnode) + return 0; + bus = sfp_bus_find_fwnode(fwnode); if (IS_ERR(bus)) { ret = PTR_ERR(bus); @@ -721,6 +731,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, { struct phylink_link_state config; __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + char *irq_str; int ret; /* @@ -756,9 +767,11 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, phy->phylink = pl; phy->phy_link_change = phylink_phy_change; + irq_str = phy_attached_info_irq(phy); phylink_info(pl, - "PHY [%s] driver [%s]\n", dev_name(&phy->mdio.dev), - phy->drv->name); + "PHY [%s] driver [%s] (irq=%s)\n", + dev_name(&phy->mdio.dev), phy->drv->name, irq_str); + kfree(irq_str); mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); @@ -1020,7 +1033,8 @@ void phylink_start(struct phylink *pl) if (irq <= 0) mod_timer(&pl->link_poll, jiffies + HZ); } - if (pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) + if ((pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) || + pl->config->pcs_poll) mod_timer(&pl->link_poll, jiffies + HZ); if (pl->phydev) phy_start(pl->phydev); diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 06e6429b8b71..d949ea7b4f8c 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -373,7 +373,7 @@ phy_interface_t sfp_select_interface(struct sfp_bus *bus, phylink_test(link_modes, 10000baseLRM_Full) || phylink_test(link_modes, 10000baseER_Full) || phylink_test(link_modes, 10000baseT_Full)) - return PHY_INTERFACE_MODE_10GKR; + return PHY_INTERFACE_MODE_10GBASER; if (phylink_test(link_modes, 2500baseX_Full)) return PHY_INTERFACE_MODE_2500BASEX; diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index e1fabb3e3246..acccb747aeda 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -155,7 +155,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) opt->dst_addr.sin_addr.s_addr, opt->src_addr.sin_addr.s_addr, 0, 0, IPPROTO_GRE, - RT_TOS(0), 0); + RT_TOS(0), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto tx_error; @@ -444,7 +444,8 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, opt->dst_addr.sin_addr.s_addr, opt->src_addr.sin_addr.s_addr, 0, 0, - IPPROTO_GRE, RT_CONN_FLAGS(sk), 0); + IPPROTO_GRE, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); if (IS_ERR(rt)) { error = -EHOSTUNREACH; goto end; diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 317d3a8df316..6f4d7ba8b109 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -452,9 +452,16 @@ static void slip_transmit(struct work_struct *work) */ static void slip_write_wakeup(struct tty_struct *tty) { - struct slip *sl = tty->disc_data; + struct slip *sl; + + rcu_read_lock(); + sl = rcu_dereference(tty->disc_data); + if (!sl) + goto out; schedule_work(&sl->tx_work); +out: + rcu_read_unlock(); } static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue) @@ -882,10 +889,11 @@ static void slip_close(struct tty_struct *tty) return; spin_lock_bh(&sl->lock); - tty->disc_data = NULL; + rcu_assign_pointer(tty->disc_data, NULL); sl->tty = NULL; spin_unlock_bh(&sl->lock); + synchronize_rcu(); flush_work(&sl->tx_work); /* VSV = very important to remove timers */ diff --git a/drivers/net/tap.c b/drivers/net/tap.c index a6d63665ad03..1f4bdd94407a 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -341,6 +341,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); + struct sk_buff *next; if (IS_ERR(segs)) goto drop; @@ -352,16 +353,13 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) } consume_skb(skb); - while (segs) { - struct sk_buff *nskb = segs->next; - - segs->next = NULL; - if (ptr_ring_produce(&q->ring, segs)) { - kfree_skb(segs); - kfree_skb_list(nskb); + skb_list_walk_safe(segs, skb, next) { + skb_mark_not_on_list(skb); + if (ptr_ring_produce(&q->ring, skb)) { + kfree_skb(skb); + kfree_skb_list(next); break; } - segs = nskb; } } else { /* If we receive a partial checksum and the tap side diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 683d371e6e82..650c937ed56b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1718,7 +1718,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, if (err < 0) goto err_xdp; if (err == XDP_REDIRECT) - xdp_do_flush_map(); + xdp_do_flush(); if (err != XDP_PASS) goto out; @@ -1936,6 +1936,10 @@ drop: if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } return total_len; } } @@ -2549,7 +2553,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) } if (flush) - xdp_do_flush_map(); + xdp_do_flush(); rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index af3994e0853b..4e514f5d7c6c 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -39,17 +39,6 @@ static int asix_mdio_bus_write(struct mii_bus *bus, int phy_id, int regnum, return 0; } -static int ax88172a_ioctl(struct net_device *net, struct ifreq *rq, int cmd) -{ - if (!netif_running(net)) - return -EINVAL; - - if (!net->phydev) - return -ENODEV; - - return phy_mii_ioctl(net->phydev, rq, cmd); -} - /* set MAC link settings according to information from phylib */ static void ax88172a_adjust_link(struct net_device *netdev) { @@ -134,7 +123,7 @@ static const struct net_device_ops ax88172a_netdev_ops = { .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = ax88172a_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_rx_mode = asix_set_multicast, }; diff --git a/drivers/net/usb/ch9200.c b/drivers/net/usb/ch9200.c index 9df3c1ffff35..d7f3b70d5477 100644 --- a/drivers/net/usb/ch9200.c +++ b/drivers/net/usb/ch9200.c @@ -111,8 +111,8 @@ static int control_read(struct usbnet *dev, request_type = (USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE); - netdev_dbg(dev->net, "Control_read() index=0x%02x size=%d\n", - index, size); + netdev_dbg(dev->net, "%s() index=0x%02x size=%d\n", + __func__, index, size); buf = kmalloc(size, GFP_KERNEL); if (!buf) { @@ -130,8 +130,6 @@ static int control_read(struct usbnet *dev, err = -EINVAL; kfree(buf); - return err; - err_out: return err; } @@ -151,8 +149,8 @@ static int control_write(struct usbnet *dev, unsigned char request, request_type = (USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE); - netdev_dbg(dev->net, "Control_write() index=0x%02x size=%d\n", - index, size); + netdev_dbg(dev->net, "%s() index=0x%02x size=%d\n", + __func__, index, size); if (data) { buf = kmemdup(data, size, GFP_KERNEL); @@ -181,8 +179,8 @@ static int ch9200_mdio_read(struct net_device *netdev, int phy_id, int loc) struct usbnet *dev = netdev_priv(netdev); unsigned char buff[2]; - netdev_dbg(netdev, "ch9200_mdio_read phy_id:%02x loc:%02x\n", - phy_id, loc); + netdev_dbg(netdev, "%s phy_id:%02x loc:%02x\n", + __func__, phy_id, loc); if (phy_id != 0) return -ENODEV; @@ -199,8 +197,8 @@ static void ch9200_mdio_write(struct net_device *netdev, struct usbnet *dev = netdev_priv(netdev); unsigned char buff[2]; - netdev_dbg(netdev, "ch9200_mdio_write() phy_id=%02x loc:%02x\n", - phy_id, loc); + netdev_dbg(netdev, "%s() phy_id=%02x loc:%02x\n", + __func__, phy_id, loc); if (phy_id != 0) return; @@ -219,8 +217,8 @@ static int ch9200_link_reset(struct usbnet *dev) mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); - netdev_dbg(dev->net, "link_reset() speed:%d duplex:%d\n", - ecmd.speed, ecmd.duplex); + netdev_dbg(dev->net, "%s() speed:%d duplex:%d\n", + __func__, ecmd.speed, ecmd.duplex); return 0; } @@ -309,7 +307,7 @@ static int get_mac_address(struct usbnet *dev, unsigned char *data) unsigned char mac_addr[0x06]; int rd_mac_len = 0; - netdev_dbg(dev->net, "get_mac_address:\n\tusbnet VID:%0x PID:%0x\n", + netdev_dbg(dev->net, "%s:\n\tusbnet VID:%0x PID:%0x\n", __func__, le16_to_cpu(dev->udev->descriptor.idVendor), le16_to_cpu(dev->udev->descriptor.idProduct)); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f1a63497235e..eccbf4cd7149 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -20,6 +20,7 @@ #include <linux/mdio.h> #include <linux/phy.h> #include <net/ip6_checksum.h> +#include <net/vxlan.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/irq.h> @@ -1663,14 +1664,6 @@ static const struct ethtool_ops lan78xx_ethtool_ops = { .get_regs = lan78xx_get_regs, }; -static int lan78xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd) -{ - if (!netif_running(netdev)) - return -EINVAL; - - return phy_mii_ioctl(netdev->phydev, rq, cmd); -} - static void lan78xx_init_mac_address(struct lan78xx_net *dev) { u32 addr_lo, addr_hi; @@ -2724,11 +2717,6 @@ static int lan78xx_stop(struct net_device *net) return 0; } -static int lan78xx_linearize(struct sk_buff *skb) -{ - return skb_linearize(skb); -} - static struct sk_buff *lan78xx_tx_prep(struct lan78xx_net *dev, struct sk_buff *skb, gfp_t flags) { @@ -2740,8 +2728,10 @@ static struct sk_buff *lan78xx_tx_prep(struct lan78xx_net *dev, return NULL; } - if (lan78xx_linearize(skb) < 0) + if (skb_linearize(skb)) { + dev_kfree_skb_any(skb); return NULL; + } tx_cmd_a = (u32)(skb->len & TX_CMD_A_LEN_MASK_) | TX_CMD_A_FCS_; @@ -3671,6 +3661,19 @@ static void lan78xx_tx_timeout(struct net_device *net, unsigned int txqueue) tasklet_schedule(&dev->bh); } +static netdev_features_t lan78xx_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features) +{ + if (skb->len + TX_OVERHEAD > MAX_SINGLE_PACKET_SIZE) + features &= ~NETIF_F_GSO_MASK; + + features = vlan_features_check(skb, features); + features = vxlan_features_check(skb, features); + + return features; +} + static const struct net_device_ops lan78xx_netdev_ops = { .ndo_open = lan78xx_open, .ndo_stop = lan78xx_stop, @@ -3679,11 +3682,12 @@ static const struct net_device_ops lan78xx_netdev_ops = { .ndo_change_mtu = lan78xx_change_mtu, .ndo_set_mac_address = lan78xx_set_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_do_ioctl = lan78xx_ioctl, + .ndo_do_ioctl = phy_do_ioctl_running, .ndo_set_rx_mode = lan78xx_set_multicast, .ndo_set_features = lan78xx_set_features, .ndo_vlan_rx_add_vid = lan78xx_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = lan78xx_vlan_rx_kill_vid, + .ndo_features_check = lan78xx_features_check, }; static void lan78xx_stat_monitor(struct timer_list *t) @@ -3753,6 +3757,7 @@ static int lan78xx_probe(struct usb_interface *intf, /* MTU range: 68 - 9000 */ netdev->max_mtu = MAX_SINGLE_PACKET_SIZE; + netif_set_gso_max_size(netdev, MAX_SINGLE_PACKET_SIZE - MAX_HEADER); dev->ep_blkin = (intf->cur_altsetting)->endpoint + 0; dev->ep_blkout = (intf->cur_altsetting)->endpoint + 1; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 4196c0e32740..9485c8d1de8a 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1062,6 +1062,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ + {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ /* 3. Combined interface devices matching on interface number */ {QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */ diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 9ec1da429514..e8cd8c05b156 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -31,7 +31,7 @@ #define NETNEXT_VERSION "11" /* Information for net */ -#define NET_VERSION "10" +#define NET_VERSION "11" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers <nic_swsd@realtek.com>" @@ -68,6 +68,7 @@ #define PLA_LED_FEATURE 0xdd92 #define PLA_PHYAR 0xde00 #define PLA_BOOT_CTRL 0xe004 +#define PLA_LWAKE_CTRL_REG 0xe007 #define PLA_GPHY_INTR_IMR 0xe022 #define PLA_EEE_CR 0xe040 #define PLA_EEEP_CR 0xe080 @@ -95,6 +96,7 @@ #define PLA_TALLYCNT 0xe890 #define PLA_SFF_STS_7 0xe8de #define PLA_PHYSTATUS 0xe908 +#define PLA_CONFIG6 0xe90a /* CONFIG6 */ #define PLA_BP_BA 0xfc26 #define PLA_BP_0 0xfc28 #define PLA_BP_1 0xfc2a @@ -107,6 +109,7 @@ #define PLA_BP_EN 0xfc38 #define USB_USB2PHY 0xb41e +#define USB_SSPHYLINK1 0xb426 #define USB_SSPHYLINK2 0xb428 #define USB_U2P3_CTRL 0xb460 #define USB_CSR_DUMMY1 0xb464 @@ -300,6 +303,9 @@ #define LINK_ON_WAKE_EN 0x0010 #define LINK_OFF_WAKE_EN 0x0008 +/* PLA_CONFIG6 */ +#define LANWAKE_CLR_EN BIT(0) + /* PLA_CONFIG5 */ #define BWF_EN 0x0040 #define MWF_EN 0x0020 @@ -312,6 +318,7 @@ /* PLA_PHY_PWR */ #define TX_10M_IDLE_EN 0x0080 #define PFM_PWM_SWITCH 0x0040 +#define TEST_IO_OFF BIT(4) /* PLA_MAC_PWR_CTRL */ #define D3_CLK_GATED_EN 0x00004000 @@ -324,6 +331,7 @@ #define MAC_CLK_SPDWN_EN BIT(15) /* PLA_MAC_PWR_CTRL3 */ +#define PLA_MCU_SPDWN_EN BIT(14) #define PKT_AVAIL_SPDWN_EN 0x0100 #define SUSPEND_SPDWN_EN 0x0004 #define U1U2_SPDWN_EN 0x0002 @@ -354,6 +362,9 @@ /* PLA_BOOT_CTRL */ #define AUTOLOAD_DONE 0x0002 +/* PLA_LWAKE_CTRL_REG */ +#define LANWAKE_PIN BIT(7) + /* PLA_SUSPEND_FLAG */ #define LINK_CHG_EVENT BIT(0) @@ -365,13 +376,18 @@ #define DEBUG_LTSSM 0x0082 /* PLA_EXTRA_STATUS */ +#define CUR_LINK_OK BIT(15) #define U3P3_CHECK_EN BIT(7) /* RTL_VER_05 only */ #define LINK_CHANGE_FLAG BIT(8) +#define POLL_LINK_CHG BIT(0) /* USB_USB2PHY */ #define USB2PHY_SUSPEND 0x0001 #define USB2PHY_L1 0x0002 +/* USB_SSPHYLINK1 */ +#define DELAY_PHY_PWR_CHG BIT(1) + /* USB_SSPHYLINK2 */ #define pwd_dn_scale_mask 0x3ffe #define pwd_dn_scale(x) ((x) << 1) @@ -1897,8 +1913,8 @@ static void r8152_csum_workaround(struct r8152 *tp, struct sk_buff *skb, { if (skb_shinfo(skb)->gso_size) { netdev_features_t features = tp->netdev->features; + struct sk_buff *segs, *seg, *next; struct sk_buff_head seg_list; - struct sk_buff *segs, *nskb; features &= ~(NETIF_F_SG | NETIF_F_IPV6_CSUM | NETIF_F_TSO6); segs = skb_gso_segment(skb, features); @@ -1907,12 +1923,10 @@ static void r8152_csum_workaround(struct r8152 *tp, struct sk_buff *skb, __skb_queue_head_init(&seg_list); - do { - nskb = segs; - segs = segs->next; - nskb->next = NULL; - __skb_queue_tail(&seg_list, nskb); - } while (segs); + skb_list_walk_safe(segs, seg, next) { + skb_mark_not_on_list(seg); + __skb_queue_tail(&seg_list, seg); + } skb_queue_splice(&seg_list, list); dev_kfree_skb(skb); @@ -2863,6 +2877,17 @@ static int rtl8153_enable(struct r8152 *tp) r8153_set_rx_early_timeout(tp); r8153_set_rx_early_size(tp); + if (tp->version == RTL_VER_09) { + u32 ocp_data; + + ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_FW_TASK); + ocp_data &= ~FC_PATCH_TASK; + ocp_write_word(tp, MCU_TYPE_USB, USB_FW_TASK, ocp_data); + usleep_range(1000, 2000); + ocp_data |= FC_PATCH_TASK; + ocp_write_word(tp, MCU_TYPE_USB, USB_FW_TASK, ocp_data); + } + return rtl_enable(tp); } @@ -3376,8 +3401,8 @@ static void rtl8153b_runtime_enable(struct r8152 *tp, bool enable) r8153b_ups_en(tp, false); r8153_queue_wake(tp, false); rtl_runtime_suspend_enable(tp, false); - r8153_u2p3en(tp, true); - r8153b_u1u2en(tp, true); + if (tp->udev->speed != USB_SPEED_HIGH) + r8153b_u1u2en(tp, true); } } @@ -4675,7 +4700,6 @@ static void r8153b_hw_phy_cfg(struct r8152 *tp) r8153_aldps_en(tp, true); r8152b_enable_fc(tp); - r8153_u2p3en(tp, true); set_bit(PHY_RESET, &tp->flags); } @@ -4954,6 +4978,8 @@ static void rtl8152_down(struct r8152 *tp) static void rtl8153_up(struct r8152 *tp) { + u32 ocp_data; + if (test_bit(RTL8152_UNPLUG, &tp->flags)) return; @@ -4961,6 +4987,19 @@ static void rtl8153_up(struct r8152 *tp) r8153_u2p3en(tp, false); r8153_aldps_en(tp, false); r8153_first_init(tp); + + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6); + ocp_data |= LANWAKE_CLR_EN; + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6, ocp_data); + + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_LWAKE_CTRL_REG); + ocp_data &= ~LANWAKE_PIN; + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_LWAKE_CTRL_REG, ocp_data); + + ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_SSPHYLINK1); + ocp_data &= ~DELAY_PHY_PWR_CHG; + ocp_write_word(tp, MCU_TYPE_USB, USB_SSPHYLINK1, ocp_data); + r8153_aldps_en(tp, true); switch (tp->version) { @@ -4979,11 +5018,17 @@ static void rtl8153_up(struct r8152 *tp) static void rtl8153_down(struct r8152 *tp) { + u32 ocp_data; + if (test_bit(RTL8152_UNPLUG, &tp->flags)) { rtl_drop_queued_tx(tp); return; } + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6); + ocp_data &= ~LANWAKE_CLR_EN; + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6, ocp_data); + r8153_u1u2en(tp, false); r8153_u2p3en(tp, false); r8153_power_cut_en(tp, false); @@ -4994,6 +5039,8 @@ static void rtl8153_down(struct r8152 *tp) static void rtl8153b_up(struct r8152 *tp) { + u32 ocp_data; + if (test_bit(RTL8152_UNPLUG, &tp->flags)) return; @@ -5004,18 +5051,29 @@ static void rtl8153b_up(struct r8152 *tp) r8153_first_init(tp); ocp_write_dword(tp, MCU_TYPE_USB, USB_RX_BUF_TH, RX_THR_B); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3); + ocp_data &= ~PLA_MCU_SPDWN_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, ocp_data); + r8153_aldps_en(tp, true); - r8153_u2p3en(tp, true); - r8153b_u1u2en(tp, true); + + if (tp->udev->speed != USB_SPEED_HIGH) + r8153b_u1u2en(tp, true); } static void rtl8153b_down(struct r8152 *tp) { + u32 ocp_data; + if (test_bit(RTL8152_UNPLUG, &tp->flags)) { rtl_drop_queued_tx(tp); return; } + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3); + ocp_data |= PLA_MCU_SPDWN_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, ocp_data); + r8153b_u1u2en(tp, false); r8153_u2p3en(tp, false); r8153b_power_cut_en(tp, false); @@ -5387,6 +5445,16 @@ static void r8153_init(struct r8152 *tp) else ocp_data |= DYNAMIC_BURST; ocp_write_byte(tp, MCU_TYPE_USB, USB_CSR_DUMMY1, ocp_data); + + r8153_queue_wake(tp, false); + + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS); + if (rtl8152_get_speed(tp) & LINK_STATUS) + ocp_data |= CUR_LINK_OK; + else + ocp_data &= ~CUR_LINK_OK; + ocp_data |= POLL_LINK_CHG; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS, ocp_data); } ocp_data = ocp_read_byte(tp, MCU_TYPE_USB, USB_CSR_DUMMY2); @@ -5416,10 +5484,19 @@ static void r8153_init(struct r8152 *tp) ocp_write_word(tp, MCU_TYPE_USB, USB_CONNECT_TIMER, 0x0001); r8153_power_cut_en(tp, false); + rtl_runtime_suspend_enable(tp, false); r8153_u1u2en(tp, true); r8153_mac_clk_spd(tp, false); usb_enable_lpm(tp->udev); + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6); + ocp_data |= LANWAKE_CLR_EN; + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_CONFIG6, ocp_data); + + ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_LWAKE_CTRL_REG); + ocp_data &= ~LANWAKE_PIN; + ocp_write_byte(tp, MCU_TYPE_PLA, PLA_LWAKE_CTRL_REG, ocp_data); + /* rx aggregation */ ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); @@ -5484,7 +5561,17 @@ static void r8153b_init(struct r8152 *tp) r8153b_ups_en(tp, false); r8153_queue_wake(tp, false); rtl_runtime_suspend_enable(tp, false); - r8153b_u1u2en(tp, true); + + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS); + if (rtl8152_get_speed(tp) & LINK_STATUS) + ocp_data |= CUR_LINK_OK; + else + ocp_data &= ~CUR_LINK_OK; + ocp_data |= POLL_LINK_CHG; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_EXTRA_STATUS, ocp_data); + + if (tp->udev->speed != USB_SPEED_HIGH) + r8153b_u1u2en(tp, true); usb_enable_lpm(tp->udev); /* MAC clock speed down */ @@ -5492,6 +5579,19 @@ static void r8153b_init(struct r8152 *tp) ocp_data |= MAC_CLK_SPDWN_EN; ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL2, ocp_data); + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3); + ocp_data &= ~PLA_MCU_SPDWN_EN; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, ocp_data); + + if (tp->version == RTL_VER_09) { + /* Disable Test IO for 32QFN */ + if (ocp_read_byte(tp, MCU_TYPE_PLA, 0xdc00) & BIT(5)) { + ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_PHY_PWR); + ocp_data |= TEST_IO_OFF; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_PHY_PWR, ocp_data); + } + } + set_bit(GREEN_ETHERNET, &tp->flags); /* rx aggregation */ @@ -6597,6 +6697,9 @@ static int rtl8152_probe(struct usb_interface *intf, return -ENODEV; } + if (intf->cur_altsetting->desc.bNumEndpoints < 3) + return -ENODEV; + usb_reset_device(udev); netdev = alloc_etherdev(sizeof(struct r8152)); if (!netdev) { @@ -6704,6 +6807,11 @@ static int rtl8152_probe(struct usb_interface *intf, intf->needs_remote_wakeup = 1; + if (!rtl_can_wakeup(tp)) + __rtl_set_wol(tp, 0); + else + tp->saved_wolopts = __rtl_get_wol(tp); + tp->rtl_ops.init(tp); #if IS_BUILTIN(CONFIG_USB_RTL8152) /* Retry in case request_firmware() is not ready yet. */ @@ -6721,10 +6829,6 @@ static int rtl8152_probe(struct usb_interface *intf, goto out1; } - if (!rtl_can_wakeup(tp)) - __rtl_set_wol(tp, 0); - - tp->saved_wolopts = __rtl_get_wol(tp); if (tp->saved_wolopts) device_set_wakeup_enable(&udev->dev, true); else diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a552df37a347..1c89017beebb 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -769,7 +769,7 @@ static int veth_poll(struct napi_struct *napi, int budget) if (xdp_xmit & VETH_XDP_TX) veth_xdp_flush(rq->dev, &bq); if (xdp_xmit & VETH_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); xdp_clear_return_frame_no_direct(); return done; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4d7d5434cc5d..c458cd313281 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1432,7 +1432,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget) virtqueue_napi_complete(napi, rq->vq, received); if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_sq(vi); diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 0a38c76688ab..1e4b9ba70983 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -555,10 +555,8 @@ vmxnet3_set_ringparam(struct net_device *netdev, } if (VMXNET3_VERSION_GE_3(adapter)) { - if (param->rx_mini_pending < 0 || - param->rx_mini_pending > VMXNET3_RXDATA_DESC_MAX_SIZE) { + if (param->rx_mini_pending > VMXNET3_RXDATA_DESC_MAX_SIZE) return -EINVAL; - } } else if (param->rx_mini_pending != 0) { return -EINVAL; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e95e6585ab82..d3b08b76b1ec 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2542,7 +2542,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = &rt->dst; skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM); - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); @@ -2582,7 +2582,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM); - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index dd1a147f2971..4530840e15ef 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -315,7 +315,8 @@ config DSCC4_PCI_RST config IXP4XX_HSS tristate "Intel IXP4xx HSS (synchronous serial port) support" - depends on HDLC && ARM && ARCH_IXP4XX && IXP4XX_NPE && IXP4XX_QMGR + depends on HDLC && IXP4XX_NPE && IXP4XX_QMGR + depends on ARCH_IXP4XX help Say Y here if you want to use built-in HSS ports on IXP4xx processor. diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index c28f8409067e..3998cac49d7f 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -73,7 +73,7 @@ static struct ucc_tdm_info utdm_primary_info = { }, }; -static struct ucc_tdm_info utdm_info[MAX_HDLC_NUM]; +static struct ucc_tdm_info utdm_info[UCC_MAX_NUM]; static int uhdlc_init(struct ucc_hdlc_private *priv) { diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index a030f5aa6b95..d8cba3625c18 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -75,7 +75,7 @@ static int cisco_hard_header(struct sk_buff *skb, struct net_device *dev, { struct hdlc_header *data; #ifdef DEBUG_HARD_HEADER - printk(KERN_DEBUG "%s: cisco_hard_header called\n", dev->name); + netdev_dbg(dev, "%s called\n", __func__); #endif skb_push(skb, sizeof(struct hdlc_header)); @@ -101,7 +101,7 @@ static void cisco_keepalive_send(struct net_device *dev, u32 type, skb = dev_alloc_skb(sizeof(struct hdlc_header) + sizeof(struct cisco_packet)); if (!skb) { - netdev_warn(dev, "Memory squeeze on cisco_keepalive_send()\n"); + netdev_warn(dev, "Memory squeeze on %s()\n", __func__); return; } skb_reserve(skb, 4); diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index 5643675ff724..c84536b03aa8 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -21,8 +21,17 @@ #include <linux/skbuff.h> #include <net/x25device.h> +struct x25_state { + x25_hdlc_proto settings; +}; + static int x25_ioctl(struct net_device *dev, struct ifreq *ifr); +static struct x25_state *state(hdlc_device *hdlc) +{ + return hdlc->state; +} + /* These functions are callbacks called by LAPB layer */ static void x25_connect_disconnect(struct net_device *dev, int reason, int code) @@ -62,11 +71,12 @@ static int x25_data_indication(struct net_device *dev, struct sk_buff *skb) { unsigned char *ptr; - skb_push(skb, 1); - if (skb_cow(skb, 1)) return NET_RX_DROP; + skb_push(skb, 1); + skb_reset_network_header(skb); + ptr = skb->data; *ptr = X25_IFACE_DATA; @@ -79,6 +89,13 @@ static int x25_data_indication(struct net_device *dev, struct sk_buff *skb) static void x25_data_transmit(struct net_device *dev, struct sk_buff *skb) { hdlc_device *hdlc = dev_to_hdlc(dev); + + skb_reset_network_header(skb); + skb->protocol = hdlc_type_trans(skb, dev); + + if (dev_nit_active(dev)) + dev_queue_xmit_nit(skb, dev); + hdlc->xmit(skb, dev); /* Ignore return value :-( */ } @@ -93,6 +110,7 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->data[0]) { case X25_IFACE_DATA: /* Data to be transmitted */ skb_pull(skb, 1); + skb_reset_network_header(skb); if ((result = lapb_data_request(dev, skb)) != LAPB_OK) dev_kfree_skb(skb); return NETDEV_TX_OK; @@ -131,7 +149,6 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev) static int x25_open(struct net_device *dev) { - int result; static const struct lapb_register_struct cb = { .connect_confirmation = x25_connected, .connect_indication = x25_connected, @@ -140,10 +157,33 @@ static int x25_open(struct net_device *dev) .data_indication = x25_data_indication, .data_transmit = x25_data_transmit, }; + hdlc_device *hdlc = dev_to_hdlc(dev); + struct lapb_parms_struct params; + int result; result = lapb_register(dev, &cb); if (result != LAPB_OK) return result; + + result = lapb_getparms(dev, ¶ms); + if (result != LAPB_OK) + return result; + + if (state(hdlc)->settings.dce) + params.mode = params.mode | LAPB_DCE; + + if (state(hdlc)->settings.modulo == 128) + params.mode = params.mode | LAPB_EXTENDED; + + params.window = state(hdlc)->settings.window; + params.t1 = state(hdlc)->settings.t1; + params.t2 = state(hdlc)->settings.t2; + params.n2 = state(hdlc)->settings.n2; + + result = lapb_setparms(dev, ¶ms); + if (result != LAPB_OK) + return result; + return 0; } @@ -186,7 +226,10 @@ static struct hdlc_proto proto = { static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) { + x25_hdlc_proto __user *x25_s = ifr->ifr_settings.ifs_ifsu.x25; + const size_t size = sizeof(x25_hdlc_proto); hdlc_device *hdlc = dev_to_hdlc(dev); + x25_hdlc_proto new_settings; int result; switch (ifr->ifr_settings.type) { @@ -194,7 +237,13 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) if (dev_to_hdlc(dev)->proto != &proto) return -EINVAL; ifr->ifr_settings.type = IF_PROTO_X25; - return 0; /* return protocol only, no settable parameters */ + if (ifr->ifr_settings.size < size) { + ifr->ifr_settings.size = size; /* data size wanted */ + return -ENOBUFS; + } + if (copy_to_user(x25_s, &state(hdlc)->settings, size)) + return -EFAULT; + return 0; case IF_PROTO_X25: if (!capable(CAP_NET_ADMIN)) @@ -203,12 +252,46 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) if (dev->flags & IFF_UP) return -EBUSY; + /* backward compatibility */ + if (ifr->ifr_settings.size == 0) { + new_settings.dce = 0; + new_settings.modulo = 8; + new_settings.window = 7; + new_settings.t1 = 3; + new_settings.t2 = 1; + new_settings.n2 = 10; + } + else { + if (copy_from_user(&new_settings, x25_s, size)) + return -EFAULT; + + if ((new_settings.dce != 0 && + new_settings.dce != 1) || + (new_settings.modulo != 8 && + new_settings.modulo != 128) || + new_settings.window < 1 || + (new_settings.modulo == 8 && + new_settings.window > 7) || + (new_settings.modulo == 128 && + new_settings.window > 127) || + new_settings.t1 < 1 || + new_settings.t1 > 255 || + new_settings.t2 < 1 || + new_settings.t2 > 255 || + new_settings.n2 < 1 || + new_settings.n2 > 255) + return -EINVAL; + } + result=hdlc->attach(dev, ENCODING_NRZ,PARITY_CRC16_PR1_CCITT); if (result) return result; - if ((result = attach_hdlc_protocol(dev, &proto, 0))) + if ((result = attach_hdlc_protocol(dev, &proto, + sizeof(struct x25_state)))) return result; + + memcpy(&state(hdlc)->settings, &new_settings, size); dev->type = ARPHRD_X25; call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c index ea6ee6a608ce..7c5cf77e9ef1 100644 --- a/drivers/net/wan/ixp4xx_hss.c +++ b/drivers/net/wan/ixp4xx_hss.c @@ -17,6 +17,7 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/platform_device.h> +#include <linux/platform_data/wan_ixp4xx_hss.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/soc/ixp4xx/npe.h> @@ -258,7 +259,7 @@ struct port { struct hss_plat_info *plat; buffer_t *rx_buff_tab[RX_DESCS], *tx_buff_tab[TX_DESCS]; struct desc *desc_tab; /* coherent */ - u32 desc_tab_phys; + dma_addr_t desc_tab_phys; unsigned int id; unsigned int clock_type, clock_rate, loopback; unsigned int initialized, carrier; @@ -858,7 +859,7 @@ static int hss_hdlc_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_dropped++; return NETDEV_TX_OK; } - memcpy_swab32(mem, (u32 *)((int)skb->data & ~3), bytes / 4); + memcpy_swab32(mem, (u32 *)((uintptr_t)skb->data & ~3), bytes / 4); dev_kfree_skb(skb); #endif @@ -1182,14 +1183,14 @@ static int hss_hdlc_attach(struct net_device *dev, unsigned short encoding, } } -static u32 check_clock(u32 rate, u32 a, u32 b, u32 c, +static u32 check_clock(u32 timer_freq, u32 rate, u32 a, u32 b, u32 c, u32 *best, u32 *best_diff, u32 *reg) { /* a is 10-bit, b is 10-bit, c is 12-bit */ u64 new_rate; u32 new_diff; - new_rate = ixp4xx_timer_freq * (u64)(c + 1); + new_rate = timer_freq * (u64)(c + 1); do_div(new_rate, a * (c + 1) + b + 1); new_diff = abs((u32)new_rate - rate); @@ -1201,40 +1202,43 @@ static u32 check_clock(u32 rate, u32 a, u32 b, u32 c, return new_diff; } -static void find_best_clock(u32 rate, u32 *best, u32 *reg) +static void find_best_clock(u32 timer_freq, u32 rate, u32 *best, u32 *reg) { u32 a, b, diff = 0xFFFFFFFF; - a = ixp4xx_timer_freq / rate; + a = timer_freq / rate; if (a > 0x3FF) { /* 10-bit value - we can go as slow as ca. 65 kb/s */ - check_clock(rate, 0x3FF, 1, 1, best, &diff, reg); + check_clock(timer_freq, rate, 0x3FF, 1, 1, best, &diff, reg); return; } if (a == 0) { /* > 66.666 MHz */ a = 1; /* minimum divider is 1 (a = 0, b = 1, c = 1) */ - rate = ixp4xx_timer_freq; + rate = timer_freq; } - if (rate * a == ixp4xx_timer_freq) { /* don't divide by 0 later */ - check_clock(rate, a - 1, 1, 1, best, &diff, reg); + if (rate * a == timer_freq) { /* don't divide by 0 later */ + check_clock(timer_freq, rate, a - 1, 1, 1, best, &diff, reg); return; } for (b = 0; b < 0x400; b++) { u64 c = (b + 1) * (u64)rate; - do_div(c, ixp4xx_timer_freq - rate * a); + do_div(c, timer_freq - rate * a); c--; if (c >= 0xFFF) { /* 12-bit - no need to check more 'b's */ if (b == 0 && /* also try a bit higher rate */ - !check_clock(rate, a - 1, 1, 1, best, &diff, reg)) + !check_clock(timer_freq, rate, a - 1, 1, 1, best, + &diff, reg)) return; - check_clock(rate, a, b, 0xFFF, best, &diff, reg); + check_clock(timer_freq, rate, a, b, 0xFFF, best, + &diff, reg); return; } - if (!check_clock(rate, a, b, c, best, &diff, reg)) + if (!check_clock(timer_freq, rate, a, b, c, best, &diff, reg)) return; - if (!check_clock(rate, a, b, c + 1, best, &diff, reg)) + if (!check_clock(timer_freq, rate, a, b, c + 1, best, &diff, + reg)) return; } } @@ -1285,8 +1289,9 @@ static int hss_hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) port->clock_type = clk; /* Update settings */ if (clk == CLOCK_INT) - find_best_clock(new_line.clock_rate, &port->clock_rate, - &port->clock_reg); + find_best_clock(port->plat->timer_freq, + new_line.clock_rate, + &port->clock_rate, &port->clock_reg); else { port->clock_rate = 0; port->clock_reg = CLK42X_SPEED_2048KHZ; diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 0f1217b506ad..e30d91a38cfb 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -64,7 +64,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; - list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node) { + list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index e2e679a01b65..77ccf3672ede 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -708,7 +708,7 @@ static netdev_tx_t sdla_transmit(struct sk_buff *skb, spin_lock_irqsave(&sdla_lock, flags); SDLA_WINDOW(dev, addr); - pbuf = (void *)(((int) dev->mem_start) + (addr & SDLA_ADDR_MASK)); + pbuf = (void *)(dev->mem_start + (addr & SDLA_ADDR_MASK)); __sdla_write(dev, pbuf->buf_addr, skb->data, skb->len); SDLA_WINDOW(dev, addr); pbuf->opp_flag = 1; diff --git a/drivers/net/wireguard/device.h b/drivers/net/wireguard/device.h index c91f3051c5c7..b15a8be9d816 100644 --- a/drivers/net/wireguard/device.h +++ b/drivers/net/wireguard/device.h @@ -62,12 +62,4 @@ struct wg_device { int wg_device_init(void); void wg_device_uninit(void); -/* Later after the dust settles, this can be moved into include/linux/skbuff.h, - * where virtually all code that deals with GSO segs can benefit, around ~30 - * drivers as of writing. - */ -#define skb_list_walk_safe(first, skb, next) \ - for (skb = first, next = skb->next; skb; \ - skb = next, next = skb ? skb->next : NULL) - #endif /* _WG_DEVICE_H */ diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index e49a464238fd..fecb559cbdb6 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -83,13 +83,10 @@ static inline __be16 wg_skb_examine_untrusted_ip_hdr(struct sk_buff *skb) static inline void wg_reset_packet(struct sk_buff *skb) { - const int pfmemalloc = skb->pfmemalloc; - skb_scrub_packet(skb, true); memset(&skb->headers_start, 0, offsetof(struct sk_buff, headers_end) - offsetof(struct sk_buff, headers_start)); - skb->pfmemalloc = pfmemalloc; skb->queue_mapping = 0; skb->nohdr = 0; skb->peeked = 0; diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index c46256d0d81c..262f3b5c819d 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -333,6 +333,7 @@ static int wg_receive(struct sock *sk, struct sk_buff *skb) wg = sk->sk_user_data; if (unlikely(!wg)) goto err; + skb_mark_not_on_list(skb); wg_packet_receive(wg, skb); return 0; diff --git a/drivers/net/wireless/ath/wil6210/ethtool.c b/drivers/net/wireless/ath/wil6210/ethtool.c index 912c4eaf017b..fef10886ca4a 100644 --- a/drivers/net/wireless/ath/wil6210/ethtool.c +++ b/drivers/net/wireless/ath/wil6210/ethtool.c @@ -11,26 +11,6 @@ #include "wil6210.h" -static int wil_ethtoolops_begin(struct net_device *ndev) -{ - struct wil6210_priv *wil = ndev_to_wil(ndev); - - mutex_lock(&wil->mutex); - - wil_dbg_misc(wil, "ethtoolops_begin\n"); - - return 0; -} - -static void wil_ethtoolops_complete(struct net_device *ndev) -{ - struct wil6210_priv *wil = ndev_to_wil(ndev); - - wil_dbg_misc(wil, "ethtoolops_complete\n"); - - mutex_unlock(&wil->mutex); -} - static int wil_ethtoolops_get_coalesce(struct net_device *ndev, struct ethtool_coalesce *cp) { @@ -39,11 +19,12 @@ static int wil_ethtoolops_get_coalesce(struct net_device *ndev, u32 rx_itr_en, rx_itr_val = 0; int ret; + mutex_lock(&wil->mutex); wil_dbg_misc(wil, "ethtoolops_get_coalesce\n"); ret = wil_pm_runtime_get(wil); if (ret < 0) - return ret; + goto out; tx_itr_en = wil_r(wil, RGF_DMA_ITR_TX_CNT_CTL); if (tx_itr_en & BIT_DMA_ITR_TX_CNT_CTL_EN) @@ -57,7 +38,11 @@ static int wil_ethtoolops_get_coalesce(struct net_device *ndev, cp->tx_coalesce_usecs = tx_itr_val; cp->rx_coalesce_usecs = rx_itr_val; - return 0; + ret = 0; + +out: + mutex_unlock(&wil->mutex); + return ret; } static int wil_ethtoolops_set_coalesce(struct net_device *ndev, @@ -67,12 +52,14 @@ static int wil_ethtoolops_set_coalesce(struct net_device *ndev, struct wireless_dev *wdev = ndev->ieee80211_ptr; int ret; + mutex_lock(&wil->mutex); wil_dbg_misc(wil, "ethtoolops_set_coalesce: rx %d usec, tx %d usec\n", cp->rx_coalesce_usecs, cp->tx_coalesce_usecs); if (wdev->iftype == NL80211_IFTYPE_MONITOR) { wil_dbg_misc(wil, "No IRQ coalescing in monitor mode\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } /* only @rx_coalesce_usecs and @tx_coalesce_usecs supported, @@ -88,24 +75,26 @@ static int wil_ethtoolops_set_coalesce(struct net_device *ndev, ret = wil_pm_runtime_get(wil); if (ret < 0) - return ret; + goto out; wil->txrx_ops.configure_interrupt_moderation(wil); wil_pm_runtime_put(wil); + ret = 0; - return 0; +out: + mutex_unlock(&wil->mutex); + return ret; out_bad: wil_dbg_misc(wil, "Unsupported coalescing params. Raw command:\n"); print_hex_dump_debug("DBG[MISC] coal ", DUMP_PREFIX_OFFSET, 16, 4, cp, sizeof(*cp), false); + mutex_unlock(&wil->mutex); return -EINVAL; } static const struct ethtool_ops wil_ethtool_ops = { - .begin = wil_ethtoolops_begin, - .complete = wil_ethtoolops_complete, .get_drvinfo = cfg80211_get_drvinfo, .get_coalesce = wil_ethtoolops_get_coalesce, .set_coalesce = wil_ethtoolops_set_coalesce, diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index f43c06569ea1..c4c8f1b62e1e 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -7790,16 +7790,8 @@ static int readrids(struct net_device *dev, aironet_ioctl *comp) { case AIROGVLIST: ridcode = RID_APLIST; break; case AIROGDRVNAM: ridcode = RID_DRVNAME; break; case AIROGEHTENC: ridcode = RID_ETHERENCAP; break; - case AIROGWEPKTMP: ridcode = RID_WEP_TEMP; - /* Only super-user can read WEP keys */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - break; - case AIROGWEPKNV: ridcode = RID_WEP_PERM; - /* Only super-user can read WEP keys */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - break; + case AIROGWEPKTMP: ridcode = RID_WEP_TEMP; break; + case AIROGWEPKNV: ridcode = RID_WEP_PERM; break; case AIROGSTAT: ridcode = RID_STATUS; break; case AIROGSTATSD32: ridcode = RID_STATSDELTA; break; case AIROGSTATSC32: ridcode = RID_STATS; break; @@ -7813,7 +7805,13 @@ static int readrids(struct net_device *dev, aironet_ioctl *comp) { return -EINVAL; } - if ((iobuf = kmalloc(RIDSIZE, GFP_KERNEL)) == NULL) + if (ridcode == RID_WEP_TEMP || ridcode == RID_WEP_PERM) { + /* Only super-user can read WEP keys */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + } + + if ((iobuf = kzalloc(RIDSIZE, GFP_KERNEL)) == NULL) return -ENOMEM; PC4500_readrid(ai,ridcode,iobuf,RIDSIZE, 1); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c index cd73fc5cfcbb..fd454836adbe 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c @@ -267,7 +267,7 @@ int iwlagn_tx_skb(struct iwl_priv *priv, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct iwl_station_priv *sta_priv = NULL; struct iwl_rxon_context *ctx = &priv->contexts[IWL_RXON_CTX_BSS]; - struct iwl_device_cmd *dev_cmd; + struct iwl_device_tx_cmd *dev_cmd; struct iwl_tx_cmd *tx_cmd; __le16 fc; u8 hdr_len; @@ -348,7 +348,6 @@ int iwlagn_tx_skb(struct iwl_priv *priv, if (unlikely(!dev_cmd)) goto drop_unlock_priv; - memset(dev_cmd, 0, sizeof(*dev_cmd)); dev_cmd->hdr.cmd = REPLY_TX; tx_cmd = (struct iwl_tx_cmd *) dev_cmd->payload; diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c index 40fe2d667622..48d375a86d86 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c @@ -357,8 +357,8 @@ int iwl_sar_get_ewrd_table(struct iwl_fw_runtime *fwrt) { union acpi_object *wifi_pkg, *data; bool enabled; - int i, n_profiles, tbl_rev; - int ret = 0; + int i, n_profiles, tbl_rev, pos; + int ret = 0; data = iwl_acpi_get_object(fwrt->dev, ACPI_EWRD_METHOD); if (IS_ERR(data)) @@ -390,10 +390,10 @@ int iwl_sar_get_ewrd_table(struct iwl_fw_runtime *fwrt) goto out_free; } - for (i = 0; i < n_profiles; i++) { - /* the tables start at element 3 */ - int pos = 3; + /* the tables start at element 3 */ + pos = 3; + for (i = 0; i < n_profiles; i++) { /* The EWRD profiles officially go from 2 to 4, but we * save them in sar_profiles[1-3] (because we don't * have profile 0). So in the array we start from 1. diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index ed90dd104366..4c60f9959f7b 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -2669,12 +2669,7 @@ int iwl_fw_dbg_stop_restart_recording(struct iwl_fw_runtime *fwrt, { int ret = 0; - /* if the FW crashed or not debug monitor cfg was given, there is - * no point in changing the recording state - */ - if (test_bit(STATUS_FW_ERROR, &fwrt->trans->status) || - (!fwrt->trans->dbg.dest_tlv && - fwrt->trans->dbg.ini_dest == IWL_FW_INI_LOCATION_INVALID)) + if (test_bit(STATUS_FW_ERROR, &fwrt->trans->status)) return 0; if (fw_has_capa(&fwrt->fw->ucode_capa, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h index 92d9898ab7c2..c2f7252ae4e7 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h @@ -379,7 +379,7 @@ enum { /* CSR GIO */ -#define CSR_GIO_REG_VAL_L0S_ENABLED (0x00000002) +#define CSR_GIO_REG_VAL_L0S_DISABLED (0x00000002) /* * UCODE-DRIVER GP (general purpose) mailbox register 1 diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index f266647dc08c..ce8f248c33ea 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -480,7 +480,14 @@ static int iwl_dbg_tlv_alloc_fragment(struct iwl_fw_runtime *fwrt, if (!frag || frag->size || !pages) return -EIO; - while (pages) { + /* + * We try to allocate as many pages as we can, starting with + * the requested amount and going down until we can allocate + * something. Because of DIV_ROUND_UP(), pages will never go + * down to 0 and stop the loop, so stop when pages reaches 1, + * which is too small anyway. + */ + while (pages > 1) { block = dma_alloc_coherent(fwrt->dev, pages * PAGE_SIZE, &physical, GFP_KERNEL | __GFP_NOWARN); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index 4096ccf58b07..bc8c959588ca 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1817,9 +1817,6 @@ MODULE_PARM_DESC(antenna_coupling, module_param_named(nvm_file, iwlwifi_mod_params.nvm_file, charp, 0444); MODULE_PARM_DESC(nvm_file, "NVM file name"); -module_param_named(lar_disable, iwlwifi_mod_params.lar_disable, bool, 0444); -MODULE_PARM_DESC(lar_disable, "disable LAR functionality (default: N)"); - module_param_named(uapsd_disable, iwlwifi_mod_params.uapsd_disable, uint, 0644); MODULE_PARM_DESC(uapsd_disable, "disable U-APSD functionality bitmap 1: BSS 2: P2P Client (default: 3)"); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h index ebea3f308b5d..82e5cac23d8d 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h @@ -115,7 +115,6 @@ enum iwl_uapsd_disable { * @nvm_file: specifies a external NVM file * @uapsd_disable: disable U-APSD, see &enum iwl_uapsd_disable, default = * IWL_DISABLE_UAPSD_BSS | IWL_DISABLE_UAPSD_P2P_CLIENT - * @lar_disable: disable LAR (regulatory), default = 0 * @fw_monitor: allow to use firmware monitor * @disable_11ac: disable VHT capabilities, default = false. * @remove_when_gone: remove an inaccessible device from the PCIe bus. @@ -136,7 +135,6 @@ struct iwl_mod_params { int antenna_coupling; char *nvm_file; u32 uapsd_disable; - bool lar_disable; bool fw_monitor; bool disable_11ac; /** diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 1e240a2a8329..d4f834b52f50 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -224,6 +224,34 @@ enum iwl_nvm_channel_flags { NVM_CHANNEL_DC_HIGH = BIT(12), }; +/** + * enum iwl_reg_capa_flags - global flags applied for the whole regulatory + * domain. + * @REG_CAPA_BF_CCD_LOW_BAND: Beam-forming or Cyclic Delay Diversity in the + * 2.4Ghz band is allowed. + * @REG_CAPA_BF_CCD_HIGH_BAND: Beam-forming or Cyclic Delay Diversity in the + * 5Ghz band is allowed. + * @REG_CAPA_160MHZ_ALLOWED: 11ac channel with a width of 160Mhz is allowed + * for this regulatory domain (valid only in 5Ghz). + * @REG_CAPA_80MHZ_ALLOWED: 11ac channel with a width of 80Mhz is allowed + * for this regulatory domain (valid only in 5Ghz). + * @REG_CAPA_MCS_8_ALLOWED: 11ac with MCS 8 is allowed. + * @REG_CAPA_MCS_9_ALLOWED: 11ac with MCS 9 is allowed. + * @REG_CAPA_40MHZ_FORBIDDEN: 11n channel with a width of 40Mhz is forbidden + * for this regulatory domain (valid only in 5Ghz). + * @REG_CAPA_DC_HIGH_ENABLED: DC HIGH allowed. + */ +enum iwl_reg_capa_flags { + REG_CAPA_BF_CCD_LOW_BAND = BIT(0), + REG_CAPA_BF_CCD_HIGH_BAND = BIT(1), + REG_CAPA_160MHZ_ALLOWED = BIT(2), + REG_CAPA_80MHZ_ALLOWED = BIT(3), + REG_CAPA_MCS_8_ALLOWED = BIT(4), + REG_CAPA_MCS_9_ALLOWED = BIT(5), + REG_CAPA_40MHZ_FORBIDDEN = BIT(7), + REG_CAPA_DC_HIGH_ENABLED = BIT(9), +}; + static inline void iwl_nvm_print_channel_flags(struct device *dev, u32 level, int chan, u32 flags) { @@ -939,10 +967,11 @@ iwl_nvm_no_wide_in_5ghz(struct iwl_trans *trans, const struct iwl_cfg *cfg, struct iwl_nvm_data * iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, + const struct iwl_fw *fw, const __be16 *nvm_hw, const __le16 *nvm_sw, const __le16 *nvm_calib, const __le16 *regulatory, const __le16 *mac_override, const __le16 *phy_sku, - u8 tx_chains, u8 rx_chains, bool lar_fw_supported) + u8 tx_chains, u8 rx_chains) { struct iwl_nvm_data *data; bool lar_enabled; @@ -1022,7 +1051,8 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, return NULL; } - if (lar_fw_supported && lar_enabled) + if (lar_enabled && + fw_has_capa(&fw->ucode_capa, IWL_UCODE_TLV_CAPA_LAR_SUPPORT)) sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR; if (iwl_nvm_no_wide_in_5ghz(trans, cfg, nvm_hw)) @@ -1038,6 +1068,7 @@ IWL_EXPORT_SYMBOL(iwl_parse_nvm_data); static u32 iwl_nvm_get_regdom_bw_flags(const u16 *nvm_chan, int ch_idx, u16 nvm_flags, + u16 cap_flags, const struct iwl_cfg *cfg) { u32 flags = NL80211_RRF_NO_HT40; @@ -1076,13 +1107,27 @@ static u32 iwl_nvm_get_regdom_bw_flags(const u16 *nvm_chan, (flags & NL80211_RRF_NO_IR)) flags |= NL80211_RRF_GO_CONCURRENT; + /* + * cap_flags is per regulatory domain so apply it for every channel + */ + if (ch_idx >= NUM_2GHZ_CHANNELS) { + if (cap_flags & REG_CAPA_40MHZ_FORBIDDEN) + flags |= NL80211_RRF_NO_HT40; + + if (!(cap_flags & REG_CAPA_80MHZ_ALLOWED)) + flags |= NL80211_RRF_NO_80MHZ; + + if (!(cap_flags & REG_CAPA_160MHZ_ALLOWED)) + flags |= NL80211_RRF_NO_160MHZ; + } + return flags; } struct ieee80211_regdomain * iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, int num_of_ch, __le32 *channels, u16 fw_mcc, - u16 geo_info) + u16 geo_info, u16 cap) { int ch_idx; u16 ch_flags; @@ -1140,7 +1185,8 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, } reg_rule_flags = iwl_nvm_get_regdom_bw_flags(nvm_chan, ch_idx, - ch_flags, cfg); + ch_flags, cap, + cfg); /* we can't continue the same rule */ if (ch_idx == 0 || prev_reg_rule_flags != reg_rule_flags || @@ -1405,9 +1451,6 @@ struct iwl_nvm_data *iwl_get_nvm(struct iwl_trans *trans, .id = WIDE_ID(REGULATORY_AND_NVM_GROUP, NVM_GET_INFO) }; int ret; - bool lar_fw_supported = !iwlwifi_mod_params.lar_disable && - fw_has_capa(&fw->ucode_capa, - IWL_UCODE_TLV_CAPA_LAR_SUPPORT); bool empty_otp; u32 mac_flags; u32 sbands_flags = 0; @@ -1485,7 +1528,9 @@ struct iwl_nvm_data *iwl_get_nvm(struct iwl_trans *trans, nvm->valid_tx_ant = (u8)le32_to_cpu(rsp->phy_sku.tx_chains); nvm->valid_rx_ant = (u8)le32_to_cpu(rsp->phy_sku.rx_chains); - if (le32_to_cpu(rsp->regulatory.lar_enabled) && lar_fw_supported) { + if (le32_to_cpu(rsp->regulatory.lar_enabled) && + fw_has_capa(&fw->ucode_capa, + IWL_UCODE_TLV_CAPA_LAR_SUPPORT)) { nvm->lar_enabled = true; sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR; } diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h index b7e1ddf8f177..fb0b385d10fd 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h @@ -7,7 +7,7 @@ * * Copyright(c) 2008 - 2015 Intel Corporation. All rights reserved. * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 Intel Corporation + * Copyright(c) 2018 - 2019 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -29,7 +29,7 @@ * * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 Intel Corporation + * Copyright(c) 2018 - 2019 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -85,10 +85,11 @@ enum iwl_nvm_sbands_flags { */ struct iwl_nvm_data * iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, + const struct iwl_fw *fw, const __be16 *nvm_hw, const __le16 *nvm_sw, const __le16 *nvm_calib, const __le16 *regulatory, const __le16 *mac_override, const __le16 *phy_sku, - u8 tx_chains, u8 rx_chains, bool lar_fw_supported); + u8 tx_chains, u8 rx_chains); /** * iwl_parse_mcc_info - parse MCC (mobile country code) info coming from FW @@ -103,7 +104,7 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, struct ieee80211_regdomain * iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, int num_of_ch, __le32 *channels, u16 fw_mcc, - u16 geo_info); + u16 geo_info, u16 cap); /** * struct iwl_nvm_section - describes an NVM section in memory. diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index 28bdc9a9617e..f91197e4ae40 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -66,7 +66,9 @@ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size, struct device *dev, - const struct iwl_trans_ops *ops) + const struct iwl_trans_ops *ops, + unsigned int cmd_pool_size, + unsigned int cmd_pool_align) { struct iwl_trans *trans; #ifdef CONFIG_LOCKDEP @@ -90,10 +92,8 @@ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size, "iwl_cmd_pool:%s", dev_name(trans->dev)); trans->dev_cmd_pool = kmem_cache_create(trans->dev_cmd_pool_name, - sizeof(struct iwl_device_cmd), - sizeof(void *), - SLAB_HWCACHE_ALIGN, - NULL); + cmd_pool_size, cmd_pool_align, + SLAB_HWCACHE_ALIGN, NULL); if (!trans->dev_cmd_pool) return NULL; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 8cadad7364ac..e33df5ad00e0 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -193,6 +193,18 @@ struct iwl_device_cmd { }; } __packed; +/** + * struct iwl_device_tx_cmd - buffer for TX command + * @hdr: the header + * @payload: the payload placeholder + * + * The actual structure is sized dynamically according to need. + */ +struct iwl_device_tx_cmd { + struct iwl_cmd_header hdr; + u8 payload[]; +} __packed; + #define TFD_MAX_PAYLOAD_SIZE (sizeof(struct iwl_device_cmd)) /* @@ -544,7 +556,7 @@ struct iwl_trans_ops { int (*send_cmd)(struct iwl_trans *trans, struct iwl_host_cmd *cmd); int (*tx)(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int queue); + struct iwl_device_tx_cmd *dev_cmd, int queue); void (*reclaim)(struct iwl_trans *trans, int queue, int ssn, struct sk_buff_head *skbs); @@ -948,22 +960,22 @@ iwl_trans_dump_data(struct iwl_trans *trans, u32 dump_mask) return trans->ops->dump_data(trans, dump_mask); } -static inline struct iwl_device_cmd * +static inline struct iwl_device_tx_cmd * iwl_trans_alloc_tx_cmd(struct iwl_trans *trans) { - return kmem_cache_alloc(trans->dev_cmd_pool, GFP_ATOMIC); + return kmem_cache_zalloc(trans->dev_cmd_pool, GFP_ATOMIC); } int iwl_trans_send_cmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd); static inline void iwl_trans_free_tx_cmd(struct iwl_trans *trans, - struct iwl_device_cmd *dev_cmd) + struct iwl_device_tx_cmd *dev_cmd) { kmem_cache_free(trans->dev_cmd_pool, dev_cmd); } static inline int iwl_trans_tx(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int queue) + struct iwl_device_tx_cmd *dev_cmd, int queue) { if (unlikely(test_bit(STATUS_FW_ERROR, &trans->status))) return -EIO; @@ -1271,7 +1283,9 @@ static inline bool iwl_trans_dbg_ini_valid(struct iwl_trans *trans) *****************************************************/ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size, struct device *dev, - const struct iwl_trans_ops *ops); + const struct iwl_trans_ops *ops, + unsigned int cmd_pool_size, + unsigned int cmd_pool_align); void iwl_trans_free(struct iwl_trans *trans); /***************************************************** diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h index 60aff2ecec12..58df25e2fb32 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h @@ -154,5 +154,6 @@ #define IWL_MVM_D3_DEBUG false #define IWL_MVM_USE_TWT false #define IWL_MVM_AMPDU_CONSEC_DROPS_DELBA 10 +#define IWL_MVM_USE_NSSN_SYNC 0 #endif /* __MVM_CONSTANTS_H */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index dd685f7eb410..c09624d8d7ee 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -841,9 +841,13 @@ int iwl_mvm_ppag_send_cmd(struct iwl_mvm *mvm) return 0; } + if (!mvm->fwrt.ppag_table.enabled) { + IWL_DEBUG_RADIO(mvm, + "PPAG not enabled, command not sent.\n"); + return 0; + } + IWL_DEBUG_RADIO(mvm, "Sending PER_PLATFORM_ANT_GAIN_CMD\n"); - IWL_DEBUG_RADIO(mvm, "PPAG is %s\n", - mvm->fwrt.ppag_table.enabled ? "enabled" : "disabled"); for (i = 0; i < ACPI_PPAG_NUM_CHAINS; i++) { for (j = 0; j < ACPI_PPAG_NUM_SUB_BANDS; j++) { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 32dc9d6f0fb6..6717f25c46b1 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -256,7 +256,8 @@ struct ieee80211_regdomain *iwl_mvm_get_regdomain(struct wiphy *wiphy, __le32_to_cpu(resp->n_channels), resp->channels, __le16_to_cpu(resp->mcc), - __le16_to_cpu(resp->geo_info)); + __le16_to_cpu(resp->geo_info), + __le16_to_cpu(resp->cap)); /* Store the return source id */ src_id = resp->source_id; kfree(resp); @@ -754,6 +755,20 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) return ret; } +static void iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_sta *sta) +{ + if (likely(sta)) { + if (likely(iwl_mvm_tx_skb_sta(mvm, skb, sta) == 0)) + return; + } else { + if (likely(iwl_mvm_tx_skb_non_sta(mvm, skb) == 0)) + return; + } + + ieee80211_free_txskb(mvm->hw, skb); +} + static void iwl_mvm_mac_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) @@ -797,14 +812,7 @@ static void iwl_mvm_mac_tx(struct ieee80211_hw *hw, } } - if (sta) { - if (iwl_mvm_tx_skb(mvm, skb, sta)) - goto drop; - return; - } - - if (iwl_mvm_tx_skb_non_sta(mvm, skb)) - goto drop; + iwl_mvm_tx_skb(mvm, skb, sta); return; drop: ieee80211_free_txskb(hw, skb); @@ -854,10 +862,7 @@ void iwl_mvm_mac_itxq_xmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) break; } - if (!txq->sta) - iwl_mvm_tx_skb_non_sta(mvm, skb); - else - iwl_mvm_tx_skb(mvm, skb, txq->sta); + iwl_mvm_tx_skb(mvm, skb, txq->sta); } } while (atomic_dec_return(&mvmtxq->tx_request)); rcu_read_unlock(); @@ -4771,6 +4776,125 @@ static int iwl_mvm_mac_get_survey(struct ieee80211_hw *hw, int idx, return ret; } +static void iwl_mvm_set_sta_rate(u32 rate_n_flags, struct rate_info *rinfo) +{ + switch (rate_n_flags & RATE_MCS_CHAN_WIDTH_MSK) { + case RATE_MCS_CHAN_WIDTH_20: + rinfo->bw = RATE_INFO_BW_20; + break; + case RATE_MCS_CHAN_WIDTH_40: + rinfo->bw = RATE_INFO_BW_40; + break; + case RATE_MCS_CHAN_WIDTH_80: + rinfo->bw = RATE_INFO_BW_80; + break; + case RATE_MCS_CHAN_WIDTH_160: + rinfo->bw = RATE_INFO_BW_160; + break; + } + + if (rate_n_flags & RATE_MCS_HT_MSK) { + rinfo->flags |= RATE_INFO_FLAGS_MCS; + rinfo->mcs = u32_get_bits(rate_n_flags, RATE_HT_MCS_INDEX_MSK); + rinfo->nss = u32_get_bits(rate_n_flags, + RATE_HT_MCS_NSS_MSK) + 1; + if (rate_n_flags & RATE_MCS_SGI_MSK) + rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; + } else if (rate_n_flags & RATE_MCS_VHT_MSK) { + rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; + rinfo->mcs = u32_get_bits(rate_n_flags, + RATE_VHT_MCS_RATE_CODE_MSK); + rinfo->nss = u32_get_bits(rate_n_flags, + RATE_VHT_MCS_NSS_MSK) + 1; + if (rate_n_flags & RATE_MCS_SGI_MSK) + rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; + } else if (rate_n_flags & RATE_MCS_HE_MSK) { + u32 gi_ltf = u32_get_bits(rate_n_flags, + RATE_MCS_HE_GI_LTF_MSK); + + rinfo->flags |= RATE_INFO_FLAGS_HE_MCS; + rinfo->mcs = u32_get_bits(rate_n_flags, + RATE_VHT_MCS_RATE_CODE_MSK); + rinfo->nss = u32_get_bits(rate_n_flags, + RATE_VHT_MCS_NSS_MSK) + 1; + + if (rate_n_flags & RATE_MCS_HE_106T_MSK) { + rinfo->bw = RATE_INFO_BW_HE_RU; + rinfo->he_ru_alloc = NL80211_RATE_INFO_HE_RU_ALLOC_106; + } + + switch (rate_n_flags & RATE_MCS_HE_TYPE_MSK) { + case RATE_MCS_HE_TYPE_SU: + case RATE_MCS_HE_TYPE_EXT_SU: + if (gi_ltf == 0 || gi_ltf == 1) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_0_8; + else if (gi_ltf == 2) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_1_6; + else if (rate_n_flags & RATE_MCS_SGI_MSK) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_0_8; + else + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_3_2; + break; + case RATE_MCS_HE_TYPE_MU: + if (gi_ltf == 0 || gi_ltf == 1) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_0_8; + else if (gi_ltf == 2) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_1_6; + else + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_3_2; + break; + case RATE_MCS_HE_TYPE_TRIG: + if (gi_ltf == 0 || gi_ltf == 1) + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_1_6; + else + rinfo->he_gi = NL80211_RATE_INFO_HE_GI_3_2; + break; + } + + if (rate_n_flags & RATE_HE_DUAL_CARRIER_MODE_MSK) + rinfo->he_dcm = 1; + } else { + switch (u32_get_bits(rate_n_flags, RATE_LEGACY_RATE_MSK)) { + case IWL_RATE_1M_PLCP: + rinfo->legacy = 10; + break; + case IWL_RATE_2M_PLCP: + rinfo->legacy = 20; + break; + case IWL_RATE_5M_PLCP: + rinfo->legacy = 55; + break; + case IWL_RATE_11M_PLCP: + rinfo->legacy = 110; + break; + case IWL_RATE_6M_PLCP: + rinfo->legacy = 60; + break; + case IWL_RATE_9M_PLCP: + rinfo->legacy = 90; + break; + case IWL_RATE_12M_PLCP: + rinfo->legacy = 120; + break; + case IWL_RATE_18M_PLCP: + rinfo->legacy = 180; + break; + case IWL_RATE_24M_PLCP: + rinfo->legacy = 240; + break; + case IWL_RATE_36M_PLCP: + rinfo->legacy = 360; + break; + case IWL_RATE_48M_PLCP: + rinfo->legacy = 480; + break; + case IWL_RATE_54M_PLCP: + rinfo->legacy = 540; + break; + } + } +} + static void iwl_mvm_mac_sta_statistics(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -4785,6 +4909,13 @@ static void iwl_mvm_mac_sta_statistics(struct ieee80211_hw *hw, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } + if (iwl_mvm_has_tlc_offload(mvm)) { + struct iwl_lq_sta_rs_fw *lq_sta = &mvmsta->lq_sta.rs_fw; + + iwl_mvm_set_sta_rate(lq_sta->last_rate_n_flags, &sinfo->txrate); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + } + /* if beacon filtering isn't on mac80211 does it anyway */ if (!(vif->driver_flags & IEEE80211_VIF_BEACON_FILTER)) return; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index 3ec8de00f3aa..67ab7e7e9c9d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1298,9 +1298,6 @@ static inline bool iwl_mvm_is_lar_supported(struct iwl_mvm *mvm) bool tlv_lar = fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_LAR_SUPPORT); - if (iwlwifi_mod_params.lar_disable) - return false; - /* * Enable LAR only if it is supported by the FW (TLV) && * enabled in the NVM @@ -1508,8 +1505,8 @@ int __must_check iwl_mvm_send_cmd_status(struct iwl_mvm *mvm, int __must_check iwl_mvm_send_cmd_pdu_status(struct iwl_mvm *mvm, u32 id, u16 len, const void *data, u32 *status); -int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, - struct ieee80211_sta *sta); +int iwl_mvm_tx_skb_sta(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_sta *sta); int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb); void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb, struct iwl_tx_cmd *tx_cmd, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c index 945c1ea5cda8..46128a2a9c6e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c @@ -277,11 +277,10 @@ iwl_parse_nvm_sections(struct iwl_mvm *mvm) struct iwl_nvm_section *sections = mvm->nvm_sections; const __be16 *hw; const __le16 *sw, *calib, *regulatory, *mac_override, *phy_sku; - bool lar_enabled; int regulatory_type; /* Checking for required sections */ - if (mvm->trans->cfg->nvm_type != IWL_NVM_EXT) { + if (mvm->trans->cfg->nvm_type == IWL_NVM) { if (!mvm->nvm_sections[NVM_SECTION_TYPE_SW].data || !mvm->nvm_sections[mvm->cfg->nvm_hw_section_num].data) { IWL_ERR(mvm, "Can't parse empty OTP/NVM sections\n"); @@ -327,14 +326,9 @@ iwl_parse_nvm_sections(struct iwl_mvm *mvm) (const __le16 *)sections[NVM_SECTION_TYPE_REGULATORY_SDP].data : (const __le16 *)sections[NVM_SECTION_TYPE_REGULATORY].data; - lar_enabled = !iwlwifi_mod_params.lar_disable && - fw_has_capa(&mvm->fw->ucode_capa, - IWL_UCODE_TLV_CAPA_LAR_SUPPORT); - - return iwl_parse_nvm_data(mvm->trans, mvm->cfg, hw, sw, calib, + return iwl_parse_nvm_data(mvm->trans, mvm->cfg, mvm->fw, hw, sw, calib, regulatory, mac_override, phy_sku, - mvm->fw->valid_tx_ant, mvm->fw->valid_rx_ant, - lar_enabled); + mvm->fw->valid_tx_ant, mvm->fw->valid_rx_ant); } /* Loads the NVM data stored in mvm->nvm_sections into the NIC */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index ef99c49247b7..c15f7dbc9516 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -514,14 +514,17 @@ static bool iwl_mvm_is_sn_less(u16 sn1, u16 sn2, u16 buffer_size) static void iwl_mvm_sync_nssn(struct iwl_mvm *mvm, u8 baid, u16 nssn) { - struct iwl_mvm_rss_sync_notif notif = { - .metadata.type = IWL_MVM_RXQ_NSSN_SYNC, - .metadata.sync = 0, - .nssn_sync.baid = baid, - .nssn_sync.nssn = nssn, - }; - - iwl_mvm_sync_rx_queues_internal(mvm, (void *)¬if, sizeof(notif)); + if (IWL_MVM_USE_NSSN_SYNC) { + struct iwl_mvm_rss_sync_notif notif = { + .metadata.type = IWL_MVM_RXQ_NSSN_SYNC, + .metadata.sync = 0, + .nssn_sync.baid = baid, + .nssn_sync.nssn = nssn, + }; + + iwl_mvm_sync_rx_queues_internal(mvm, (void *)¬if, + sizeof(notif)); + } } #define RX_REORDER_BUF_TIMEOUT_MQ (HZ / 10) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index a046ac9fa852..a5af8f4128b1 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1213,7 +1213,7 @@ static int iwl_mvm_legacy_config_scan(struct iwl_mvm *mvm) cmd_size = sizeof(struct iwl_scan_config_v2); else cmd_size = sizeof(struct iwl_scan_config_v1); - cmd_size += num_channels; + cmd_size += mvm->fw->ucode_capa.n_scan_channels; cfg = kzalloc(cmd_size, GFP_KERNEL); if (!cfg) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index dc5c02fbc65a..a8d0d17f79fd 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -490,13 +490,13 @@ static void iwl_mvm_set_tx_cmd_crypto(struct iwl_mvm *mvm, /* * Allocates and sets the Tx cmd the driver data pointers in the skb */ -static struct iwl_device_cmd * +static struct iwl_device_tx_cmd * iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb, struct ieee80211_tx_info *info, int hdrlen, struct ieee80211_sta *sta, u8 sta_id) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - struct iwl_device_cmd *dev_cmd; + struct iwl_device_tx_cmd *dev_cmd; struct iwl_tx_cmd *tx_cmd; dev_cmd = iwl_trans_alloc_tx_cmd(mvm->trans); @@ -504,11 +504,6 @@ iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb, if (unlikely(!dev_cmd)) return NULL; - /* Make sure we zero enough of dev_cmd */ - BUILD_BUG_ON(sizeof(struct iwl_tx_cmd_gen2) > sizeof(*tx_cmd)); - BUILD_BUG_ON(sizeof(struct iwl_tx_cmd_gen3) > sizeof(*tx_cmd)); - - memset(dev_cmd, 0, sizeof(dev_cmd->hdr) + sizeof(*tx_cmd)); dev_cmd->hdr.cmd = TX_CMD; if (iwl_mvm_has_new_tx_api(mvm)) { @@ -597,7 +592,7 @@ out: } static void iwl_mvm_skb_prepare_status(struct sk_buff *skb, - struct iwl_device_cmd *cmd) + struct iwl_device_tx_cmd *cmd) { struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); @@ -716,7 +711,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_tx_info info; - struct iwl_device_cmd *dev_cmd; + struct iwl_device_tx_cmd *dev_cmd; u8 sta_id; int hdrlen = ieee80211_hdrlen(hdr->frame_control); __le16 fc = hdr->frame_control; @@ -847,10 +842,7 @@ iwl_mvm_tx_tso_segment(struct sk_buff *skb, unsigned int num_subframes, else if (next) consume_skb(skb); - while (next) { - tmp = next; - next = tmp->next; - + skb_list_walk_safe(next, tmp, next) { memcpy(tmp->cb, cb, sizeof(tmp->cb)); /* * Compute the length of all the data added for the A-MSDU. @@ -880,9 +872,7 @@ iwl_mvm_tx_tso_segment(struct sk_buff *skb, unsigned int num_subframes, skb_shinfo(tmp)->gso_size = 0; } - tmp->prev = NULL; - tmp->next = NULL; - + skb_mark_not_on_list(tmp); __skb_queue_tail(mpdus_skb, tmp); i++; } @@ -1078,7 +1068,7 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct iwl_mvm_sta *mvmsta; - struct iwl_device_cmd *dev_cmd; + struct iwl_device_tx_cmd *dev_cmd; __le16 fc; u16 seq_number = 0; u8 tid = IWL_MAX_TID_COUNT; @@ -1154,7 +1144,7 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, if (WARN_ONCE(txq_id == IWL_MVM_INVALID_QUEUE, "Invalid TXQ id")) { iwl_trans_free_tx_cmd(mvm->trans, dev_cmd); spin_unlock(&mvmsta->lock); - return 0; + return -1; } if (!iwl_mvm_has_new_tx_api(mvm)) { @@ -1206,8 +1196,8 @@ drop: return -1; } -int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, - struct ieee80211_sta *sta) +int iwl_mvm_tx_skb_sta(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_sta *sta) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); struct ieee80211_tx_info info; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c index d38cefbb779e..e249e3fd14c6 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c @@ -57,6 +57,42 @@ #include "internal.h" #include "iwl-prph.h" +static void *_iwl_pcie_ctxt_info_dma_alloc_coherent(struct iwl_trans *trans, + size_t size, + dma_addr_t *phys, + int depth) +{ + void *result; + + if (WARN(depth > 2, + "failed to allocate DMA memory not crossing 2^32 boundary")) + return NULL; + + result = dma_alloc_coherent(trans->dev, size, phys, GFP_KERNEL); + + if (!result) + return NULL; + + if (unlikely(iwl_pcie_crosses_4g_boundary(*phys, size))) { + void *old = result; + dma_addr_t oldphys = *phys; + + result = _iwl_pcie_ctxt_info_dma_alloc_coherent(trans, size, + phys, + depth + 1); + dma_free_coherent(trans->dev, size, old, oldphys); + } + + return result; +} + +static void *iwl_pcie_ctxt_info_dma_alloc_coherent(struct iwl_trans *trans, + size_t size, + dma_addr_t *phys) +{ + return _iwl_pcie_ctxt_info_dma_alloc_coherent(trans, size, phys, 0); +} + void iwl_pcie_ctxt_info_free_paging(struct iwl_trans *trans) { struct iwl_self_init_dram *dram = &trans->init_dram; @@ -161,14 +197,17 @@ int iwl_pcie_ctxt_info_init(struct iwl_trans *trans, struct iwl_context_info *ctxt_info; struct iwl_context_info_rbd_cfg *rx_cfg; u32 control_flags = 0, rb_size; + dma_addr_t phys; int ret; - ctxt_info = dma_alloc_coherent(trans->dev, sizeof(*ctxt_info), - &trans_pcie->ctxt_info_dma_addr, - GFP_KERNEL); + ctxt_info = iwl_pcie_ctxt_info_dma_alloc_coherent(trans, + sizeof(*ctxt_info), + &phys); if (!ctxt_info) return -ENOMEM; + trans_pcie->ctxt_info_dma_addr = phys; + ctxt_info->version.version = 0; ctxt_info->version.mac_id = cpu_to_le16((u16)iwl_read32(trans, CSR_HW_REV)); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index a091690f6c79..f14bcef3495e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -305,7 +305,7 @@ struct iwl_cmd_meta { #define IWL_FIRST_TB_SIZE_ALIGN ALIGN(IWL_FIRST_TB_SIZE, 64) struct iwl_pcie_txq_entry { - struct iwl_device_cmd *cmd; + void *cmd; struct sk_buff *skb; /* buffer to free after command completes */ const void *free_buf; @@ -672,6 +672,16 @@ void iwl_pcie_disable_ict(struct iwl_trans *trans); /***************************************************** * TX / HCMD ******************************************************/ +/* + * We need this inline in case dma_addr_t is only 32-bits - since the + * hardware is always 64-bit, the issue can still occur in that case, + * so use u64 for 'phys' here to force the addition in 64-bit. + */ +static inline bool iwl_pcie_crosses_4g_boundary(u64 phys, u16 len) +{ + return upper_32_bits(phys) != upper_32_bits(phys + len); +} + int iwl_pcie_tx_init(struct iwl_trans *trans); int iwl_pcie_gen2_tx_init(struct iwl_trans *trans, int txq_id, int queue_size); @@ -688,7 +698,7 @@ void iwl_trans_pcie_txq_set_shared_mode(struct iwl_trans *trans, u32 txq_id, void iwl_trans_pcie_log_scd_error(struct iwl_trans *trans, struct iwl_txq *txq); int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int txq_id); + struct iwl_device_tx_cmd *dev_cmd, int txq_id); void iwl_pcie_txq_check_wrptrs(struct iwl_trans *trans); int iwl_trans_pcie_send_hcmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd); void iwl_pcie_cmdq_reclaim(struct iwl_trans *trans, int txq_id, int idx); @@ -1082,7 +1092,8 @@ void iwl_pcie_apply_destination(struct iwl_trans *trans); void iwl_pcie_free_tso_page(struct iwl_trans_pcie *trans_pcie, struct sk_buff *skb); #ifdef CONFIG_INET -struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len); +struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len, + struct sk_buff *skb); #endif /* common functions that are used by gen3 transport */ @@ -1106,7 +1117,7 @@ int iwl_trans_pcie_dyn_txq_alloc(struct iwl_trans *trans, unsigned int timeout); void iwl_trans_pcie_dyn_txq_free(struct iwl_trans *trans, int queue); int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int txq_id); + struct iwl_device_tx_cmd *dev_cmd, int txq_id); int iwl_trans_pcie_gen2_send_hcmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd); void iwl_trans_pcie_gen2_stop_device(struct iwl_trans *trans); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 452da44a21e0..f0b8ff67a1bc 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1529,13 +1529,13 @@ out: napi = &rxq->napi; if (napi->poll) { + napi_gro_flush(napi, false); + if (napi->rx_count) { netif_receive_skb_list(&napi->rx_list); INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; } - - napi_gro_flush(napi, false); } iwl_pcie_rxq_restock(trans, rxq); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index a0677131634d..f60d66f1e55b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -79,6 +79,7 @@ #include "iwl-agn-hw.h" #include "fw/error-dump.h" #include "fw/dbg.h" +#include "fw/api/tx.h" #include "internal.h" #include "iwl-fh.h" @@ -301,18 +302,13 @@ void iwl_pcie_apm_config(struct iwl_trans *trans) u16 cap; /* - * HW bug W/A for instability in PCIe bus L0S->L1 transition. - * Check if BIOS (or OS) enabled L1-ASPM on this device. - * If so (likely), disable L0S, so device moves directly L0->L1; - * costs negligible amount of power savings. - * If not (unlikely), enable L0S, so there is at least some - * power savings, even without L1. + * L0S states have been found to be unstable with our devices + * and in newer hardware they are not officially supported at + * all, so we must always set the L0S_DISABLED bit. */ + iwl_set_bit(trans, CSR_GIO_REG, CSR_GIO_REG_VAL_L0S_DISABLED); + pcie_capability_read_word(trans_pcie->pci_dev, PCI_EXP_LNKCTL, &lctl); - if (lctl & PCI_EXP_LNKCTL_ASPM_L1) - iwl_set_bit(trans, CSR_GIO_REG, CSR_GIO_REG_VAL_L0S_ENABLED); - else - iwl_clear_bit(trans, CSR_GIO_REG, CSR_GIO_REG_VAL_L0S_ENABLED); trans->pm_support = !(lctl & PCI_EXP_LNKCTL_ASPM_L0S); pcie_capability_read_word(trans_pcie->pci_dev, PCI_EXP_DEVCTL2, &cap); @@ -3460,19 +3456,34 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev, { struct iwl_trans_pcie *trans_pcie; struct iwl_trans *trans; - int ret, addr_size; + int ret, addr_size, txcmd_size, txcmd_align; + const struct iwl_trans_ops *ops = &trans_ops_pcie_gen2; + + if (!cfg_trans->gen2) { + ops = &trans_ops_pcie; + txcmd_size = sizeof(struct iwl_tx_cmd); + txcmd_align = sizeof(void *); + } else if (cfg_trans->device_family < IWL_DEVICE_FAMILY_AX210) { + txcmd_size = sizeof(struct iwl_tx_cmd_gen2); + txcmd_align = 64; + } else { + txcmd_size = sizeof(struct iwl_tx_cmd_gen3); + txcmd_align = 128; + } + + txcmd_size += sizeof(struct iwl_cmd_header); + txcmd_size += 36; /* biggest possible 802.11 header */ + + /* Ensure device TX cmd cannot reach/cross a page boundary in gen2 */ + if (WARN_ON(cfg_trans->gen2 && txcmd_size >= txcmd_align)) + return ERR_PTR(-EINVAL); ret = pcim_enable_device(pdev); if (ret) return ERR_PTR(ret); - if (cfg_trans->gen2) - trans = iwl_trans_alloc(sizeof(struct iwl_trans_pcie), - &pdev->dev, &trans_ops_pcie_gen2); - else - trans = iwl_trans_alloc(sizeof(struct iwl_trans_pcie), - &pdev->dev, &trans_ops_pcie); - + trans = iwl_trans_alloc(sizeof(struct iwl_trans_pcie), &pdev->dev, ops, + txcmd_size, txcmd_align); if (!trans) return ERR_PTR(-ENOMEM); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 8ca0250de99e..bfb984b2e00c 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -221,6 +221,17 @@ static int iwl_pcie_gen2_set_tb(struct iwl_trans *trans, int idx = iwl_pcie_gen2_get_num_tbs(trans, tfd); struct iwl_tfh_tb *tb; + /* + * Only WARN here so we know about the issue, but we mess up our + * unmap path because not every place currently checks for errors + * returned from this function - it can only return an error if + * there's no more space, and so when we know there is enough we + * don't always check ... + */ + WARN(iwl_pcie_crosses_4g_boundary(addr, len), + "possible DMA problem with iova:0x%llx, len:%d\n", + (unsigned long long)addr, len); + if (WARN_ON(idx >= IWL_TFH_NUM_TBS)) return -EINVAL; tb = &tfd->tbs[idx]; @@ -240,13 +251,114 @@ static int iwl_pcie_gen2_set_tb(struct iwl_trans *trans, return idx; } +static struct page *get_workaround_page(struct iwl_trans *trans, + struct sk_buff *skb) +{ + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + struct page **page_ptr; + struct page *ret; + + page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); + + ret = alloc_page(GFP_ATOMIC); + if (!ret) + return NULL; + + /* set the chaining pointer to the previous page if there */ + *(void **)(page_address(ret) + PAGE_SIZE - sizeof(void *)) = *page_ptr; + *page_ptr = ret; + + return ret; +} + +/* + * Add a TB and if needed apply the FH HW bug workaround; + * meta != NULL indicates that it's a page mapping and we + * need to dma_unmap_page() and set the meta->tbs bit in + * this case. + */ +static int iwl_pcie_gen2_set_tb_with_wa(struct iwl_trans *trans, + struct sk_buff *skb, + struct iwl_tfh_tfd *tfd, + dma_addr_t phys, void *virt, + u16 len, struct iwl_cmd_meta *meta) +{ + dma_addr_t oldphys = phys; + struct page *page; + int ret; + + if (unlikely(dma_mapping_error(trans->dev, phys))) + return -ENOMEM; + + if (likely(!iwl_pcie_crosses_4g_boundary(phys, len))) { + ret = iwl_pcie_gen2_set_tb(trans, tfd, phys, len); + + if (ret < 0) + goto unmap; + + if (meta) + meta->tbs |= BIT(ret); + + ret = 0; + goto trace; + } + + /* + * Work around a hardware bug. If (as expressed in the + * condition above) the TB ends on a 32-bit boundary, + * then the next TB may be accessed with the wrong + * address. + * To work around it, copy the data elsewhere and make + * a new mapping for it so the device will not fail. + */ + + if (WARN_ON(len > PAGE_SIZE - sizeof(void *))) { + ret = -ENOBUFS; + goto unmap; + } + + page = get_workaround_page(trans, skb); + if (!page) { + ret = -ENOMEM; + goto unmap; + } + + memcpy(page_address(page), virt, len); + + phys = dma_map_single(trans->dev, page_address(page), len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(trans->dev, phys))) + return -ENOMEM; + ret = iwl_pcie_gen2_set_tb(trans, tfd, phys, len); + if (ret < 0) { + /* unmap the new allocation as single */ + oldphys = phys; + meta = NULL; + goto unmap; + } + IWL_WARN(trans, + "TB bug workaround: copied %d bytes from 0x%llx to 0x%llx\n", + len, (unsigned long long)oldphys, (unsigned long long)phys); + + ret = 0; +unmap: + if (meta) + dma_unmap_page(trans->dev, oldphys, len, DMA_TO_DEVICE); + else + dma_unmap_single(trans->dev, oldphys, len, DMA_TO_DEVICE); +trace: + trace_iwlwifi_dev_tx_tb(trans->dev, skb, virt, phys, len); + + return ret; +} + static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_tfh_tfd *tfd, int start_len, - u8 hdr_len, struct iwl_device_cmd *dev_cmd) + u8 hdr_len, + struct iwl_device_tx_cmd *dev_cmd) { #ifdef CONFIG_INET - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_tx_cmd_gen2 *tx_cmd = (void *)dev_cmd->payload; struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; @@ -254,7 +366,6 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, u16 length, amsdu_pad; u8 *start_hdr; struct iwl_tso_hdr_page *hdr_page; - struct page **page_ptr; struct tso_t tso; trace_iwlwifi_dev_tx(trans->dev, skb, tfd, sizeof(*tfd), @@ -270,14 +381,11 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)); /* Our device supports 9 segments at most, it will fit in 1 page */ - hdr_page = get_page_hdr(trans, hdr_room); + hdr_page = get_page_hdr(trans, hdr_room, skb); if (!hdr_page) return -ENOMEM; - get_page(hdr_page->page); start_hdr = hdr_page->pos; - page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); - *page_ptr = hdr_page->page; /* * Pull the ieee80211 header to be able to use TSO core, @@ -332,6 +440,11 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, dev_kfree_skb(csum_skb); goto out_err; } + /* + * No need for _with_wa, this is from the TSO page and + * we leave some space at the end of it so can't hit + * the buggy scenario. + */ iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, tb_len); trace_iwlwifi_dev_tx_tb(trans->dev, skb, start_hdr, tb_phys, tb_len); @@ -343,16 +456,18 @@ static int iwl_pcie_gen2_build_amsdu(struct iwl_trans *trans, /* put the payload */ while (data_left) { + int ret; + tb_len = min_t(unsigned int, tso.size, data_left); tb_phys = dma_map_single(trans->dev, tso.data, tb_len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(trans->dev, tb_phys))) { + ret = iwl_pcie_gen2_set_tb_with_wa(trans, skb, tfd, + tb_phys, tso.data, + tb_len, NULL); + if (ret) { dev_kfree_skb(csum_skb); goto out_err; } - iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, tb_len); - trace_iwlwifi_dev_tx_tb(trans->dev, skb, tso.data, - tb_phys, tb_len); data_left -= tb_len; tso_build_data(skb, &tso, tb_len); @@ -372,7 +487,7 @@ out_err: static struct iwl_tfh_tfd *iwl_pcie_gen2_build_tx_amsdu(struct iwl_trans *trans, struct iwl_txq *txq, - struct iwl_device_cmd *dev_cmd, + struct iwl_device_tx_cmd *dev_cmd, struct sk_buff *skb, struct iwl_cmd_meta *out_meta, int hdr_len, @@ -386,6 +501,11 @@ iwl_tfh_tfd *iwl_pcie_gen2_build_tx_amsdu(struct iwl_trans *trans, tb_phys = iwl_pcie_get_first_tb_dma(txq, idx); + /* + * No need for _with_wa, the first TB allocation is aligned up + * to a 64-byte boundary and thus can't be at the end or cross + * a page boundary (much less a 2^32 boundary). + */ iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, IWL_FIRST_TB_SIZE); /* @@ -404,6 +524,10 @@ iwl_tfh_tfd *iwl_pcie_gen2_build_tx_amsdu(struct iwl_trans *trans, tb_phys = dma_map_single(trans->dev, tb1_addr, len, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(trans->dev, tb_phys))) goto out_err; + /* + * No need for _with_wa(), we ensure (via alignment) that the data + * here can never cross or end at a page boundary. + */ iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, len); if (iwl_pcie_gen2_build_amsdu(trans, skb, tfd, @@ -430,24 +554,19 @@ static int iwl_pcie_gen2_tx_add_frags(struct iwl_trans *trans, for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; dma_addr_t tb_phys; - int tb_idx; + unsigned int fragsz = skb_frag_size(frag); + int ret; - if (!skb_frag_size(frag)) + if (!fragsz) continue; tb_phys = skb_frag_dma_map(trans->dev, frag, 0, - skb_frag_size(frag), DMA_TO_DEVICE); - - if (unlikely(dma_mapping_error(trans->dev, tb_phys))) - return -ENOMEM; - tb_idx = iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, - skb_frag_size(frag)); - trace_iwlwifi_dev_tx_tb(trans->dev, skb, skb_frag_address(frag), - tb_phys, skb_frag_size(frag)); - if (tb_idx < 0) - return tb_idx; - - out_meta->tbs |= BIT(tb_idx); + fragsz, DMA_TO_DEVICE); + ret = iwl_pcie_gen2_set_tb_with_wa(trans, skb, tfd, tb_phys, + skb_frag_address(frag), + fragsz, out_meta); + if (ret) + return ret; } return 0; @@ -456,7 +575,7 @@ static int iwl_pcie_gen2_tx_add_frags(struct iwl_trans *trans, static struct iwl_tfh_tfd *iwl_pcie_gen2_build_tx(struct iwl_trans *trans, struct iwl_txq *txq, - struct iwl_device_cmd *dev_cmd, + struct iwl_device_tx_cmd *dev_cmd, struct sk_buff *skb, struct iwl_cmd_meta *out_meta, int hdr_len, @@ -475,6 +594,11 @@ iwl_tfh_tfd *iwl_pcie_gen2_build_tx(struct iwl_trans *trans, /* The first TB points to bi-directional DMA data */ memcpy(&txq->first_tb_bufs[idx], dev_cmd, IWL_FIRST_TB_SIZE); + /* + * No need for _with_wa, the first TB allocation is aligned up + * to a 64-byte boundary and thus can't be at the end or cross + * a page boundary (much less a 2^32 boundary). + */ iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, IWL_FIRST_TB_SIZE); /* @@ -496,6 +620,10 @@ iwl_tfh_tfd *iwl_pcie_gen2_build_tx(struct iwl_trans *trans, tb_phys = dma_map_single(trans->dev, tb1_addr, tb1_len, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(trans->dev, tb_phys))) goto out_err; + /* + * No need for _with_wa(), we ensure (via alignment) that the data + * here can never cross or end at a page boundary. + */ iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, tb1_len); trace_iwlwifi_dev_tx(trans->dev, skb, tfd, sizeof(*tfd), &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, hdr_len); @@ -504,26 +632,30 @@ iwl_tfh_tfd *iwl_pcie_gen2_build_tx(struct iwl_trans *trans, tb2_len = skb_headlen(skb) - hdr_len; if (tb2_len > 0) { + int ret; + tb_phys = dma_map_single(trans->dev, skb->data + hdr_len, tb2_len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(trans->dev, tb_phys))) + ret = iwl_pcie_gen2_set_tb_with_wa(trans, skb, tfd, tb_phys, + skb->data + hdr_len, tb2_len, + NULL); + if (ret) goto out_err; - iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, tb2_len); - trace_iwlwifi_dev_tx_tb(trans->dev, skb, skb->data + hdr_len, - tb_phys, tb2_len); } if (iwl_pcie_gen2_tx_add_frags(trans, skb, tfd, out_meta)) goto out_err; skb_walk_frags(skb, frag) { + int ret; + tb_phys = dma_map_single(trans->dev, frag->data, skb_headlen(frag), DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(trans->dev, tb_phys))) + ret = iwl_pcie_gen2_set_tb_with_wa(trans, skb, tfd, tb_phys, + frag->data, + skb_headlen(frag), NULL); + if (ret) goto out_err; - iwl_pcie_gen2_set_tb(trans, tfd, tb_phys, skb_headlen(frag)); - trace_iwlwifi_dev_tx_tb(trans->dev, skb, frag->data, - tb_phys, skb_headlen(frag)); if (iwl_pcie_gen2_tx_add_frags(trans, frag, tfd, out_meta)) goto out_err; } @@ -538,7 +670,7 @@ out_err: static struct iwl_tfh_tfd *iwl_pcie_gen2_build_tfd(struct iwl_trans *trans, struct iwl_txq *txq, - struct iwl_device_cmd *dev_cmd, + struct iwl_device_tx_cmd *dev_cmd, struct sk_buff *skb, struct iwl_cmd_meta *out_meta) { @@ -578,7 +710,7 @@ struct iwl_tfh_tfd *iwl_pcie_gen2_build_tfd(struct iwl_trans *trans, } int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int txq_id) + struct iwl_device_tx_cmd *dev_cmd, int txq_id) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_cmd_meta *out_meta; @@ -603,7 +735,7 @@ int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb, /* don't put the packet on the ring, if there is no room */ if (unlikely(iwl_queue_space(trans, txq) < 3)) { - struct iwl_device_cmd **dev_cmd_ptr; + struct iwl_device_tx_cmd **dev_cmd_ptr; dev_cmd_ptr = (void *)((u8 *)skb->cb + trans_pcie->dev_cmd_offs); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index f21f16ab2a97..b0eb52b4951b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -213,8 +213,8 @@ static void iwl_pcie_txq_update_byte_cnt_tbl(struct iwl_trans *trans, u8 sec_ctl = 0; u16 len = byte_cnt + IWL_TX_CRC_SIZE + IWL_TX_DELIMITER_SIZE; __le16 bc_ent; - struct iwl_tx_cmd *tx_cmd = - (void *)txq->entries[txq->write_ptr].cmd->payload; + struct iwl_device_tx_cmd *dev_cmd = txq->entries[txq->write_ptr].cmd; + struct iwl_tx_cmd *tx_cmd = (void *)dev_cmd->payload; u8 sta_id = tx_cmd->sta_id; scd_bc_tbl = trans_pcie->scd_bc_tbls.addr; @@ -257,8 +257,8 @@ static void iwl_pcie_txq_inval_byte_cnt_tbl(struct iwl_trans *trans, int read_ptr = txq->read_ptr; u8 sta_id = 0; __le16 bc_ent; - struct iwl_tx_cmd *tx_cmd = - (void *)txq->entries[read_ptr].cmd->payload; + struct iwl_device_tx_cmd *dev_cmd = txq->entries[read_ptr].cmd; + struct iwl_tx_cmd *tx_cmd = (void *)dev_cmd->payload; WARN_ON(read_ptr >= TFD_QUEUE_SIZE_MAX); @@ -624,12 +624,18 @@ void iwl_pcie_free_tso_page(struct iwl_trans_pcie *trans_pcie, struct sk_buff *skb) { struct page **page_ptr; + struct page *next; page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); + next = *page_ptr; + *page_ptr = NULL; - if (*page_ptr) { - __free_page(*page_ptr); - *page_ptr = NULL; + while (next) { + struct page *tmp = next; + + next = *(void **)(page_address(next) + PAGE_SIZE - + sizeof(void *)); + __free_page(tmp); } } @@ -1196,7 +1202,7 @@ void iwl_trans_pcie_reclaim(struct iwl_trans *trans, int txq_id, int ssn, while (!skb_queue_empty(&overflow_skbs)) { struct sk_buff *skb = __skb_dequeue(&overflow_skbs); - struct iwl_device_cmd *dev_cmd_ptr; + struct iwl_device_tx_cmd *dev_cmd_ptr; dev_cmd_ptr = *(void **)((u8 *)skb->cb + trans_pcie->dev_cmd_offs); @@ -2052,17 +2058,34 @@ static int iwl_fill_data_tbs(struct iwl_trans *trans, struct sk_buff *skb, } #ifdef CONFIG_INET -struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len) +struct iwl_tso_hdr_page *get_page_hdr(struct iwl_trans *trans, size_t len, + struct sk_buff *skb) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_tso_hdr_page *p = this_cpu_ptr(trans_pcie->tso_hdr_page); + struct page **page_ptr; + + page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); + + if (WARN_ON(*page_ptr)) + return NULL; if (!p->page) goto alloc; - /* enough room on this page */ - if (p->pos + len < (u8 *)page_address(p->page) + PAGE_SIZE) - return p; + /* + * Check if there's enough room on this page + * + * Note that we put a page chaining pointer *last* in the + * page - we need it somewhere, and if it's there then we + * avoid DMA mapping the last bits of the page which may + * trigger the 32-bit boundary hardware bug. + * + * (see also get_workaround_page() in tx-gen2.c) + */ + if (p->pos + len < (u8 *)page_address(p->page) + PAGE_SIZE - + sizeof(void *)) + goto out; /* We don't have enough room on this page, get a new one. */ __free_page(p->page); @@ -2072,6 +2095,11 @@ alloc: if (!p->page) return NULL; p->pos = page_address(p->page); + /* set the chaining pointer to NULL */ + *(void **)(page_address(p->page) + PAGE_SIZE - sizeof(void *)) = NULL; +out: + *page_ptr = p->page; + get_page(p->page); return p; } @@ -2097,7 +2125,8 @@ static void iwl_compute_pseudo_hdr_csum(void *iph, struct tcphdr *tcph, static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_txq *txq, u8 hdr_len, struct iwl_cmd_meta *out_meta, - struct iwl_device_cmd *dev_cmd, u16 tb1_len) + struct iwl_device_tx_cmd *dev_cmd, + u16 tb1_len) { struct iwl_tx_cmd *tx_cmd = (void *)dev_cmd->payload; struct iwl_trans_pcie *trans_pcie = txq->trans_pcie; @@ -2107,7 +2136,6 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, u16 length, iv_len, amsdu_pad; u8 *start_hdr; struct iwl_tso_hdr_page *hdr_page; - struct page **page_ptr; struct tso_t tso; /* if the packet is protected, then it must be CCMP or GCMP */ @@ -2130,14 +2158,11 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)) + iv_len; /* Our device supports 9 segments at most, it will fit in 1 page */ - hdr_page = get_page_hdr(trans, hdr_room); + hdr_page = get_page_hdr(trans, hdr_room, skb); if (!hdr_page) return -ENOMEM; - get_page(hdr_page->page); start_hdr = hdr_page->pos; - page_ptr = (void *)((u8 *)skb->cb + trans_pcie->page_offs); - *page_ptr = hdr_page->page; memcpy(hdr_page->pos, skb->data + hdr_len, iv_len); hdr_page->pos += iv_len; @@ -2279,7 +2304,8 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_txq *txq, u8 hdr_len, struct iwl_cmd_meta *out_meta, - struct iwl_device_cmd *dev_cmd, u16 tb1_len) + struct iwl_device_tx_cmd *dev_cmd, + u16 tb1_len) { /* No A-MSDU without CONFIG_INET */ WARN_ON(1); @@ -2289,7 +2315,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, #endif /* CONFIG_INET */ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, - struct iwl_device_cmd *dev_cmd, int txq_id) + struct iwl_device_tx_cmd *dev_cmd, int txq_id) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct ieee80211_hdr *hdr; @@ -2346,7 +2372,7 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, /* don't put the packet on the ring, if there is no room */ if (unlikely(iwl_queue_space(trans, txq) < 3)) { - struct iwl_device_cmd **dev_cmd_ptr; + struct iwl_device_tx_cmd **dev_cmd_ptr; dev_cmd_ptr = (void *)((u8 *)skb->cb + trans_pcie->dev_cmd_offs); diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 57edfada0665..c9401c121a14 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -273,6 +273,10 @@ add_ie_rates(u8 *tlv, const u8 *ie, int *nrates) int hw, ap, ap_max = ie[1]; u8 hw_rate; + if (ap_max > MAX_RATES) { + lbs_deb_assoc("invalid rates\n"); + return tlv; + } /* Advance past IE header */ ie += 2; @@ -1717,6 +1721,9 @@ static int lbs_ibss_join_existing(struct lbs_private *priv, struct cmd_ds_802_11_ad_hoc_join cmd; u8 preamble = RADIO_PREAMBLE_SHORT; int ret = 0; + int hw, i; + u8 rates_max; + u8 *rates; /* TODO: set preamble based on scan result */ ret = lbs_set_radio(priv, preamble, 1); @@ -1775,9 +1782,12 @@ static int lbs_ibss_join_existing(struct lbs_private *priv, if (!rates_eid) { lbs_add_rates(cmd.bss.rates); } else { - int hw, i; - u8 rates_max = rates_eid[1]; - u8 *rates = cmd.bss.rates; + rates_max = rates_eid[1]; + if (rates_max > MAX_RATES) { + lbs_deb_join("invalid rates"); + goto out; + } + rates = cmd.bss.rates; for (hw = 0; hw < ARRAY_SIZE(lbs_rates); hw++) { u8 hw_rate = lbs_rates[hw].bitrate / 5; for (i = 0; i < rates_max; i++) { diff --git a/drivers/net/wireless/mediatek/mt76/airtime.c b/drivers/net/wireless/mediatek/mt76/airtime.c index 55116f395f9a..a4a785467748 100644 --- a/drivers/net/wireless/mediatek/mt76/airtime.c +++ b/drivers/net/wireless/mediatek/mt76/airtime.c @@ -242,7 +242,7 @@ u32 mt76_calc_rx_airtime(struct mt76_dev *dev, struct mt76_rx_status *status, return 0; sband = dev->hw->wiphy->bands[status->band]; - if (!sband || status->rate_idx > sband->n_bitrates) + if (!sband || status->rate_idx >= sband->n_bitrates) return 0; rate = &sband->bitrates[status->rate_idx]; diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index b9f2a401041a..96018fd65779 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -378,7 +378,8 @@ void mt76_unregister_device(struct mt76_dev *dev) { struct ieee80211_hw *hw = dev->hw; - mt76_led_cleanup(dev); + if (IS_ENABLED(CONFIG_MT76_LEDS)) + mt76_led_cleanup(dev); mt76_tx_status_check(dev, NULL, true); ieee80211_unregister_hw(hw); } diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index 10d580c3dea3..6b7532f7c936 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -51,7 +51,8 @@ static void xenvif_add_hash(struct xenvif *vif, const u8 *tag, found = false; oldest = NULL; - list_for_each_entry_rcu(entry, &vif->hash.cache.list, link) { + list_for_each_entry_rcu(entry, &vif->hash.cache.list, link, + lockdep_is_held(&vif->hash.cache.lock)) { /* Make sure we don't add duplicate entries */ if (entry->len == len && memcmp(entry->tag, tag, len) == 0) @@ -102,7 +103,8 @@ static void xenvif_flush_hash(struct xenvif *vif) spin_lock_irqsave(&vif->hash.cache.lock, flags); - list_for_each_entry_rcu(entry, &vif->hash.cache.list, link) { + list_for_each_entry_rcu(entry, &vif->hash.cache.list, link, + lockdep_is_held(&vif->hash.cache.lock)) { list_del_rcu(&entry->link); vif->hash.cache.count--; kfree_rcu(entry, rcu); diff --git a/drivers/nfc/pn533/i2c.c b/drivers/nfc/pn533/i2c.c index 7507176cca0a..0207e66cee21 100644 --- a/drivers/nfc/pn533/i2c.c +++ b/drivers/nfc/pn533/i2c.c @@ -274,7 +274,6 @@ MODULE_DEVICE_TABLE(i2c, pn533_i2c_id_table); static struct i2c_driver pn533_i2c_driver = { .driver = { .name = PN533_I2C_DRIVER_NAME, - .owner = THIS_MODULE, .of_match_table = of_match_ptr(of_pn533_i2c_match), }, .probe = pn533_i2c_probe, diff --git a/drivers/nfc/pn533/usb.c b/drivers/nfc/pn533/usb.c index 4590fbf82dc2..f5bb7ace2ff5 100644 --- a/drivers/nfc/pn533/usb.c +++ b/drivers/nfc/pn533/usb.c @@ -391,7 +391,7 @@ static int pn533_acr122_poweron_rdr(struct pn533_usb_phy *phy) cmd, sizeof(cmd), false); rc = usb_bulk_msg(phy->udev, phy->out_urb->pipe, buffer, sizeof(cmd), - &transferred, 0); + &transferred, 5000); kfree(buffer); if (rc || (transferred != sizeof(cmd))) { nfc_err(&phy->udev->dev, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 667f18f465be..5dc32b72e7fa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -222,6 +222,8 @@ static blk_status_t nvme_error_status(u16 status) case NVME_SC_CAP_EXCEEDED: return BLK_STS_NOSPC; case NVME_SC_LBA_RANGE: + case NVME_SC_CMD_INTERRUPTED: + case NVME_SC_NS_NOT_READY: return BLK_STS_TARGET; case NVME_SC_BAD_ATTRIBUTES: case NVME_SC_ONCS_NOT_SUPPORTED: diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 56c21b501185..72a7e41f3018 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -24,6 +24,16 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +static u32 nvmet_feat_data_len(struct nvmet_req *req, u32 cdw10) +{ + switch (cdw10 & 0xff) { + case NVME_FEAT_HOST_ID: + return sizeof(req->sq->ctrl->hostid); + default: + return 0; + } +} + u64 nvmet_get_log_page_offset(struct nvme_command *cmd) { return le64_to_cpu(cmd->get_log_page.lpo); @@ -778,7 +788,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; - if (!nvmet_check_data_len(req, 0)) + if (!nvmet_check_data_len(req, nvmet_feat_data_len(req, cdw10))) return; switch (cdw10 & 0xff) { diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4937a088d7d8..fbeb9f73ef28 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5074,18 +5074,25 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0422, quirk_no_ext_tags); #ifdef CONFIG_PCI_ATS /* - * Some devices have a broken ATS implementation causing IOMMU stalls. - * Don't use ATS for those devices. + * Some devices require additional driver setup to enable ATS. Don't use + * ATS for those devices as ATS will be enabled before the driver has had a + * chance to load and configure the device. */ -static void quirk_no_ats(struct pci_dev *pdev) +static void quirk_amd_harvest_no_ats(struct pci_dev *pdev) { - pci_info(pdev, "disabling ATS (broken on this device)\n"); + if (pdev->device == 0x7340 && pdev->revision != 0xc5) + return; + + pci_info(pdev, "disabling ATS\n"); pdev->ats_cap = 0; } /* AMD Stoney platform GPU */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x98e4, quirk_no_ats); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x6900, quirk_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x98e4, quirk_amd_harvest_no_ats); +/* AMD Iceland dGPU */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x6900, quirk_amd_harvest_no_ats); +/* AMD Navi14 dGPU */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7340, quirk_amd_harvest_no_ats); #endif /* CONFIG_PCI_ATS */ /* Freescale PCIe doesn't support MSI in RC mode */ diff --git a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c index e3b87c94aaf6..e41367f36ee1 100644 --- a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c +++ b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c @@ -221,7 +221,7 @@ static const struct mvebu_comphy_conf mvebu_comphy_cp110_modes[] = { ETH_CONF(2, 0, PHY_INTERFACE_MODE_SGMII, 0x1, COMPHY_FW_MODE_SGMII), ETH_CONF(2, 0, PHY_INTERFACE_MODE_2500BASEX, 0x1, COMPHY_FW_MODE_HS_SGMII), ETH_CONF(2, 0, PHY_INTERFACE_MODE_RXAUI, 0x1, COMPHY_FW_MODE_RXAUI), - ETH_CONF(2, 0, PHY_INTERFACE_MODE_10GKR, 0x1, COMPHY_FW_MODE_XFI), + ETH_CONF(2, 0, PHY_INTERFACE_MODE_10GBASER, 0x1, COMPHY_FW_MODE_XFI), GEN_CONF(2, 0, PHY_MODE_USB_HOST_SS, COMPHY_FW_MODE_USB3H), GEN_CONF(2, 0, PHY_MODE_SATA, COMPHY_FW_MODE_SATA), GEN_CONF(2, 0, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE), @@ -235,14 +235,14 @@ static const struct mvebu_comphy_conf mvebu_comphy_cp110_modes[] = { /* lane 4 */ ETH_CONF(4, 0, PHY_INTERFACE_MODE_SGMII, 0x2, COMPHY_FW_MODE_SGMII), ETH_CONF(4, 0, PHY_INTERFACE_MODE_2500BASEX, 0x2, COMPHY_FW_MODE_HS_SGMII), - ETH_CONF(4, 0, PHY_INTERFACE_MODE_10GKR, 0x2, COMPHY_FW_MODE_XFI), + ETH_CONF(4, 0, PHY_INTERFACE_MODE_10GBASER, 0x2, COMPHY_FW_MODE_XFI), ETH_CONF(4, 0, PHY_INTERFACE_MODE_RXAUI, 0x2, COMPHY_FW_MODE_RXAUI), GEN_CONF(4, 0, PHY_MODE_USB_DEVICE_SS, COMPHY_FW_MODE_USB3D), GEN_CONF(4, 1, PHY_MODE_USB_HOST_SS, COMPHY_FW_MODE_USB3H), GEN_CONF(4, 1, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE), ETH_CONF(4, 1, PHY_INTERFACE_MODE_SGMII, 0x1, COMPHY_FW_MODE_SGMII), ETH_CONF(4, 1, PHY_INTERFACE_MODE_2500BASEX, -1, COMPHY_FW_MODE_HS_SGMII), - ETH_CONF(4, 1, PHY_INTERFACE_MODE_10GKR, -1, COMPHY_FW_MODE_XFI), + ETH_CONF(4, 1, PHY_INTERFACE_MODE_10GBASER, -1, COMPHY_FW_MODE_XFI), /* lane 5 */ ETH_CONF(5, 1, PHY_INTERFACE_MODE_RXAUI, 0x2, COMPHY_FW_MODE_RXAUI), GEN_CONF(5, 1, PHY_MODE_SATA, COMPHY_FW_MODE_SATA), @@ -342,7 +342,7 @@ static int mvebu_comphy_ethernet_init_reset(struct mvebu_comphy_lane *lane) MVEBU_COMPHY_SERDES_CFG0_RXAUI_MODE); switch (lane->submode) { - case PHY_INTERFACE_MODE_10GKR: + case PHY_INTERFACE_MODE_10GBASER: val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0xe) | MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0xe); break; @@ -417,7 +417,7 @@ static int mvebu_comphy_ethernet_init_reset(struct mvebu_comphy_lane *lane) /* refclk selection */ val = readl(priv->base + MVEBU_COMPHY_MISC_CTRL0(lane->id)); val &= ~MVEBU_COMPHY_MISC_CTRL0_REFCLK_SEL; - if (lane->submode == PHY_INTERFACE_MODE_10GKR) + if (lane->submode == PHY_INTERFACE_MODE_10GBASER) val |= MVEBU_COMPHY_MISC_CTRL0_ICP_FORCE; writel(val, priv->base + MVEBU_COMPHY_MISC_CTRL0(lane->id)); @@ -564,7 +564,7 @@ static int mvebu_comphy_set_mode_rxaui(struct phy *phy) return mvebu_comphy_init_plls(lane); } -static int mvebu_comphy_set_mode_10gkr(struct phy *phy) +static int mvebu_comphy_set_mode_10gbaser(struct phy *phy) { struct mvebu_comphy_lane *lane = phy_get_drvdata(phy); struct mvebu_comphy_priv *priv = lane->priv; @@ -735,8 +735,8 @@ static int mvebu_comphy_power_on_legacy(struct phy *phy) case PHY_INTERFACE_MODE_RXAUI: ret = mvebu_comphy_set_mode_rxaui(phy); break; - case PHY_INTERFACE_MODE_10GKR: - ret = mvebu_comphy_set_mode_10gkr(phy); + case PHY_INTERFACE_MODE_10GBASER: + ret = mvebu_comphy_set_mode_10gbaser(phy); break; default: return -ENOTSUPP; @@ -782,8 +782,8 @@ static int mvebu_comphy_power_on(struct phy *phy) lane->id); fw_speed = COMPHY_FW_SPEED_3125; break; - case PHY_INTERFACE_MODE_10GKR: - dev_dbg(priv->dev, "set lane %d to 10G-KR mode\n", + case PHY_INTERFACE_MODE_10GBASER: + dev_dbg(priv->dev, "set lane %d to 10GBASE-R mode\n", lane->id); fw_speed = COMPHY_FW_SPEED_103125; break; diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c index ead06c6c2601..12e71a315a2c 100644 --- a/drivers/phy/motorola/phy-cpcap-usb.c +++ b/drivers/phy/motorola/phy-cpcap-usb.c @@ -115,7 +115,7 @@ struct cpcap_usb_ints_state { enum cpcap_gpio_mode { CPCAP_DM_DP, CPCAP_MDM_RX_TX, - CPCAP_UNKNOWN, + CPCAP_UNKNOWN_DISABLED, /* Seems to disable USB lines */ CPCAP_OTG_DM_DP, }; @@ -134,6 +134,8 @@ struct cpcap_phy_ddata { struct iio_channel *id; struct regulator *vusb; atomic_t active; + unsigned int vbus_provider:1; + unsigned int docked:1; }; static bool cpcap_usb_vbus_valid(struct cpcap_phy_ddata *ddata) @@ -207,6 +209,19 @@ static int cpcap_phy_get_ints_state(struct cpcap_phy_ddata *ddata, static int cpcap_usb_set_uart_mode(struct cpcap_phy_ddata *ddata); static int cpcap_usb_set_usb_mode(struct cpcap_phy_ddata *ddata); +static void cpcap_usb_try_musb_mailbox(struct cpcap_phy_ddata *ddata, + enum musb_vbus_id_status status) +{ + int error; + + error = musb_mailbox(status); + if (!error) + return; + + dev_dbg(ddata->dev, "%s: musb_mailbox failed: %i\n", + __func__, error); +} + static void cpcap_usb_detect(struct work_struct *work) { struct cpcap_phy_ddata *ddata; @@ -220,16 +235,66 @@ static void cpcap_usb_detect(struct work_struct *work) if (error) return; - if (s.id_ground) { - dev_dbg(ddata->dev, "id ground, USB host mode\n"); + vbus = cpcap_usb_vbus_valid(ddata); + + /* We need to kick the VBUS as USB A-host */ + if (s.id_ground && ddata->vbus_provider) { + dev_dbg(ddata->dev, "still in USB A-host mode, kicking VBUS\n"); + + cpcap_usb_try_musb_mailbox(ddata, MUSB_ID_GROUND); + + error = regmap_update_bits(ddata->reg, CPCAP_REG_USBC3, + CPCAP_BIT_VBUSSTBY_EN | + CPCAP_BIT_VBUSEN_SPI, + CPCAP_BIT_VBUSEN_SPI); + if (error) + goto out_err; + + return; + } + + if (vbus && s.id_ground && ddata->docked) { + dev_dbg(ddata->dev, "still docked as A-host, signal ID down\n"); + + cpcap_usb_try_musb_mailbox(ddata, MUSB_ID_GROUND); + + return; + } + + /* No VBUS needed with docks */ + if (vbus && s.id_ground && !ddata->vbus_provider) { + dev_dbg(ddata->dev, "connected to a dock\n"); + + ddata->docked = true; + error = cpcap_usb_set_usb_mode(ddata); if (error) goto out_err; - error = musb_mailbox(MUSB_ID_GROUND); + cpcap_usb_try_musb_mailbox(ddata, MUSB_ID_GROUND); + + /* + * Force check state again after musb has reoriented, + * otherwise devices won't enumerate after loading PHY + * driver. + */ + schedule_delayed_work(&ddata->detect_work, + msecs_to_jiffies(1000)); + + return; + } + + if (s.id_ground && !ddata->docked) { + dev_dbg(ddata->dev, "id ground, USB host mode\n"); + + ddata->vbus_provider = true; + + error = cpcap_usb_set_usb_mode(ddata); if (error) goto out_err; + cpcap_usb_try_musb_mailbox(ddata, MUSB_ID_GROUND); + error = regmap_update_bits(ddata->reg, CPCAP_REG_USBC3, CPCAP_BIT_VBUSSTBY_EN | CPCAP_BIT_VBUSEN_SPI, @@ -248,43 +313,26 @@ static void cpcap_usb_detect(struct work_struct *work) vbus = cpcap_usb_vbus_valid(ddata); + /* Otherwise assume we're connected to a USB host */ if (vbus) { - /* Are we connected to a docking station with vbus? */ - if (s.id_ground) { - dev_dbg(ddata->dev, "connected to a dock\n"); - - /* No VBUS needed with docks */ - error = cpcap_usb_set_usb_mode(ddata); - if (error) - goto out_err; - error = musb_mailbox(MUSB_ID_GROUND); - if (error) - goto out_err; - - return; - } - - /* Otherwise assume we're connected to a USB host */ dev_dbg(ddata->dev, "connected to USB host\n"); error = cpcap_usb_set_usb_mode(ddata); if (error) goto out_err; - error = musb_mailbox(MUSB_VBUS_VALID); - if (error) - goto out_err; + cpcap_usb_try_musb_mailbox(ddata, MUSB_VBUS_VALID); return; } + ddata->vbus_provider = false; + ddata->docked = false; + cpcap_usb_try_musb_mailbox(ddata, MUSB_VBUS_OFF); + /* Default to debug UART mode */ error = cpcap_usb_set_uart_mode(ddata); if (error) goto out_err; - error = musb_mailbox(MUSB_VBUS_OFF); - if (error) - goto out_err; - dev_dbg(ddata->dev, "set UART mode\n"); return; @@ -376,7 +424,8 @@ static int cpcap_usb_set_uart_mode(struct cpcap_phy_ddata *ddata) { int error; - error = cpcap_usb_gpio_set_mode(ddata, CPCAP_DM_DP); + /* Disable lines to prevent glitches from waking up mdm6600 */ + error = cpcap_usb_gpio_set_mode(ddata, CPCAP_UNKNOWN_DISABLED); if (error) goto out_err; @@ -403,6 +452,11 @@ static int cpcap_usb_set_uart_mode(struct cpcap_phy_ddata *ddata) if (error) goto out_err; + /* Enable UART mode */ + error = cpcap_usb_gpio_set_mode(ddata, CPCAP_DM_DP); + if (error) + goto out_err; + return 0; out_err: @@ -415,7 +469,8 @@ static int cpcap_usb_set_usb_mode(struct cpcap_phy_ddata *ddata) { int error; - error = cpcap_usb_gpio_set_mode(ddata, CPCAP_OTG_DM_DP); + /* Disable lines to prevent glitches from waking up mdm6600 */ + error = cpcap_usb_gpio_set_mode(ddata, CPCAP_UNKNOWN_DISABLED); if (error) return error; @@ -434,12 +489,6 @@ static int cpcap_usb_set_usb_mode(struct cpcap_phy_ddata *ddata) if (error) goto out_err; - error = regmap_update_bits(ddata->reg, CPCAP_REG_USBC2, - CPCAP_BIT_USBXCVREN, - CPCAP_BIT_USBXCVREN); - if (error) - goto out_err; - error = regmap_update_bits(ddata->reg, CPCAP_REG_USBC3, CPCAP_BIT_PU_SPI | CPCAP_BIT_DMPD_SPI | @@ -455,6 +504,11 @@ static int cpcap_usb_set_usb_mode(struct cpcap_phy_ddata *ddata) if (error) goto out_err; + /* Enable USB mode */ + error = cpcap_usb_gpio_set_mode(ddata, CPCAP_OTG_DM_DP); + if (error) + goto out_err; + return 0; out_err: @@ -649,9 +703,7 @@ static int cpcap_usb_phy_remove(struct platform_device *pdev) if (error) dev_err(ddata->dev, "could not set UART mode\n"); - error = musb_mailbox(MUSB_VBUS_OFF); - if (error) - dev_err(ddata->dev, "could not set mailbox\n"); + cpcap_usb_try_musb_mailbox(ddata, MUSB_VBUS_OFF); usb_remove_phy(&ddata->phy); cancel_delayed_work_sync(&ddata->detect_work); diff --git a/drivers/phy/motorola/phy-mapphone-mdm6600.c b/drivers/phy/motorola/phy-mapphone-mdm6600.c index ee184d5607bd..f20524f0c21d 100644 --- a/drivers/phy/motorola/phy-mapphone-mdm6600.c +++ b/drivers/phy/motorola/phy-mapphone-mdm6600.c @@ -200,7 +200,7 @@ static void phy_mdm6600_status(struct work_struct *work) struct phy_mdm6600 *ddata; struct device *dev; DECLARE_BITMAP(values, PHY_MDM6600_NR_STATUS_LINES); - int error, i, val = 0; + int error; ddata = container_of(work, struct phy_mdm6600, status_work.work); dev = ddata->dev; @@ -212,16 +212,11 @@ static void phy_mdm6600_status(struct work_struct *work) if (error) return; - for (i = 0; i < PHY_MDM6600_NR_STATUS_LINES; i++) { - val |= test_bit(i, values) << i; - dev_dbg(ddata->dev, "XXX %s: i: %i values[i]: %i val: %i\n", - __func__, i, test_bit(i, values), val); - } - ddata->status = values[0]; + ddata->status = values[0] & ((1 << PHY_MDM6600_NR_STATUS_LINES) - 1); dev_info(dev, "modem status: %i %s\n", ddata->status, - phy_mdm6600_status_name[ddata->status & 7]); + phy_mdm6600_status_name[ddata->status]); complete(&ddata->ack); } diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.c b/drivers/phy/qualcomm/phy-qcom-qmp.c index 091e20303a14..66f91726b8b2 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp.c @@ -66,7 +66,7 @@ /* QPHY_V3_PCS_MISC_CLAMP_ENABLE register bits */ #define CLAMP_EN BIT(0) /* enables i/o clamp_n */ -#define PHY_INIT_COMPLETE_TIMEOUT 1000 +#define PHY_INIT_COMPLETE_TIMEOUT 10000 #define POWER_DOWN_DELAY_US_MIN 10 #define POWER_DOWN_DELAY_US_MAX 11 diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c index 2b97fb1185a0..9ca20c947283 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c @@ -603,6 +603,8 @@ static long inno_hdmi_phy_rk3228_clk_round_rate(struct clk_hw *hw, { const struct pre_pll_config *cfg = pre_pll_cfg_table; + rate = (rate / 1000) * 1000; + for (; cfg->pixclock != 0; cfg++) if (cfg->pixclock == rate && !cfg->fracdiv) break; @@ -755,6 +757,8 @@ static long inno_hdmi_phy_rk3328_clk_round_rate(struct clk_hw *hw, { const struct pre_pll_config *cfg = pre_pll_cfg_table; + rate = (rate / 1000) * 1000; + for (; cfg->pixclock != 0; cfg++) if (cfg->pixclock == rate) break; diff --git a/drivers/pinctrl/cirrus/Kconfig b/drivers/pinctrl/cirrus/Kconfig index f1806fd781a0..530426a74f75 100644 --- a/drivers/pinctrl/cirrus/Kconfig +++ b/drivers/pinctrl/cirrus/Kconfig @@ -2,6 +2,7 @@ config PINCTRL_LOCHNAGAR tristate "Cirrus Logic Lochnagar pinctrl driver" depends on MFD_LOCHNAGAR + select GPIOLIB select PINMUX select PINCONF select GENERIC_PINCONF diff --git a/drivers/pinctrl/intel/pinctrl-sunrisepoint.c b/drivers/pinctrl/intel/pinctrl-sunrisepoint.c index 44d7f50bbc82..d936e7aa74c4 100644 --- a/drivers/pinctrl/intel/pinctrl-sunrisepoint.c +++ b/drivers/pinctrl/intel/pinctrl-sunrisepoint.c @@ -49,6 +49,7 @@ .padown_offset = SPT_PAD_OWN, \ .padcfglock_offset = SPT_PADCFGLOCK, \ .hostown_offset = SPT_HOSTSW_OWN, \ + .is_offset = SPT_GPI_IS, \ .ie_offset = SPT_GPI_IE, \ .pin_base = (s), \ .npins = ((e) - (s) + 1), \ diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 3c80828a5e50..bbc919bef2bf 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -441,6 +441,7 @@ static int meson_pinconf_get_drive_strength(struct meson_pinctrl *pc, return ret; meson_calc_reg_and_bit(bank, pin, REG_DS, ®, &bit); + bit = bit << 1; ret = regmap_read(pc->reg_ds, reg, &val); if (ret) diff --git a/drivers/platform/chrome/wilco_ec/keyboard_leds.c b/drivers/platform/chrome/wilco_ec/keyboard_leds.c index bb0edf51dfda..5731d1b60e28 100644 --- a/drivers/platform/chrome/wilco_ec/keyboard_leds.c +++ b/drivers/platform/chrome/wilco_ec/keyboard_leds.c @@ -73,13 +73,6 @@ static int send_kbbl_msg(struct wilco_ec_device *ec, return ret; } - if (response->status) { - dev_err(ec->dev, - "EC reported failure sending keyboard LEDs command: %d", - response->status); - return -EIO; - } - return 0; } @@ -87,6 +80,7 @@ static int set_kbbl(struct wilco_ec_device *ec, enum led_brightness brightness) { struct wilco_keyboard_leds_msg request; struct wilco_keyboard_leds_msg response; + int ret; memset(&request, 0, sizeof(request)); request.command = WILCO_EC_COMMAND_KBBL; @@ -94,7 +88,18 @@ static int set_kbbl(struct wilco_ec_device *ec, enum led_brightness brightness) request.mode = WILCO_KBBL_MODE_FLAG_PWM; request.percent = brightness; - return send_kbbl_msg(ec, &request, &response); + ret = send_kbbl_msg(ec, &request, &response); + if (ret < 0) + return ret; + + if (response.status) { + dev_err(ec->dev, + "EC reported failure sending keyboard LEDs command: %d", + response.status); + return -EIO; + } + + return 0; } static int kbbl_exist(struct wilco_ec_device *ec, bool *exists) @@ -140,6 +145,13 @@ static int kbbl_init(struct wilco_ec_device *ec) if (ret < 0) return ret; + if (response.status) { + dev_err(ec->dev, + "EC reported failure sending keyboard LEDs command: %d", + response.status); + return -EIO; + } + if (response.mode & WILCO_KBBL_MODE_FLAG_PWM) return response.percent; diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c index 9a5c9fd2dbc6..5739a9669b29 100644 --- a/drivers/platform/mellanox/mlxbf-tmfifo.c +++ b/drivers/platform/mellanox/mlxbf-tmfifo.c @@ -149,7 +149,7 @@ struct mlxbf_tmfifo_irq_info { * @work: work struct for deferred process * @timer: background timer * @vring: Tx/Rx ring - * @spin_lock: spin lock + * @spin_lock: Tx/Rx spin lock * @is_ready: ready flag */ struct mlxbf_tmfifo { @@ -164,7 +164,7 @@ struct mlxbf_tmfifo { struct work_struct work; struct timer_list timer; struct mlxbf_tmfifo_vring *vring[2]; - spinlock_t spin_lock; /* spin lock */ + spinlock_t spin_lock[2]; /* spin lock */ bool is_ready; }; @@ -525,7 +525,7 @@ static void mlxbf_tmfifo_console_tx(struct mlxbf_tmfifo *fifo, int avail) writeq(*(u64 *)&hdr, fifo->tx_base + MLXBF_TMFIFO_TX_DATA); /* Use spin-lock to protect the 'cons->tx_buf'. */ - spin_lock_irqsave(&fifo->spin_lock, flags); + spin_lock_irqsave(&fifo->spin_lock[0], flags); while (size > 0) { addr = cons->tx_buf.buf + cons->tx_buf.tail; @@ -552,7 +552,7 @@ static void mlxbf_tmfifo_console_tx(struct mlxbf_tmfifo *fifo, int avail) } } - spin_unlock_irqrestore(&fifo->spin_lock, flags); + spin_unlock_irqrestore(&fifo->spin_lock[0], flags); } /* Rx/Tx one word in the descriptor buffer. */ @@ -731,9 +731,9 @@ static bool mlxbf_tmfifo_rxtx_one_desc(struct mlxbf_tmfifo_vring *vring, fifo->vring[is_rx] = NULL; /* Notify upper layer that packet is done. */ - spin_lock_irqsave(&fifo->spin_lock, flags); + spin_lock_irqsave(&fifo->spin_lock[is_rx], flags); vring_interrupt(0, vring->vq); - spin_unlock_irqrestore(&fifo->spin_lock, flags); + spin_unlock_irqrestore(&fifo->spin_lock[is_rx], flags); } mlxbf_tmfifo_desc_done: @@ -852,10 +852,10 @@ static bool mlxbf_tmfifo_virtio_notify(struct virtqueue *vq) * worker handler. */ if (vring->vdev_id == VIRTIO_ID_CONSOLE) { - spin_lock_irqsave(&fifo->spin_lock, flags); + spin_lock_irqsave(&fifo->spin_lock[0], flags); tm_vdev = fifo->vdev[VIRTIO_ID_CONSOLE]; mlxbf_tmfifo_console_output(tm_vdev, vring); - spin_unlock_irqrestore(&fifo->spin_lock, flags); + spin_unlock_irqrestore(&fifo->spin_lock[0], flags); } else if (test_and_set_bit(MLXBF_TM_TX_LWM_IRQ, &fifo->pend_events)) { return true; @@ -1189,7 +1189,8 @@ static int mlxbf_tmfifo_probe(struct platform_device *pdev) if (!fifo) return -ENOMEM; - spin_lock_init(&fifo->spin_lock); + spin_lock_init(&fifo->spin_lock[0]); + spin_lock_init(&fifo->spin_lock[1]); INIT_WORK(&fifo->work, mlxbf_tmfifo_work_handler); mutex_init(&fifo->lock); diff --git a/drivers/platform/mips/Kconfig b/drivers/platform/mips/Kconfig index f4d0a86c00d0..5e77b0dc5fd6 100644 --- a/drivers/platform/mips/Kconfig +++ b/drivers/platform/mips/Kconfig @@ -18,7 +18,7 @@ if MIPS_PLATFORM_DEVICES config CPU_HWMON tristate "Loongson-3 CPU HWMon Driver" - depends on CONFIG_MACH_LOONGSON64 + depends on MACH_LOONGSON64 select HWMON default y help diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 821b08e01635..982f0cc8270c 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -512,13 +512,7 @@ static void kbd_led_update(struct asus_wmi *asus) { int ctrl_param = 0; - /* - * bits 0-2: level - * bit 7: light on/off - */ - if (asus->kbd_led_wk > 0) - ctrl_param = 0x80 | (asus->kbd_led_wk & 0x7F); - + ctrl_param = 0x80 | (asus->kbd_led_wk & 0x7F); asus_wmi_set_devstate(ASUS_WMI_DEVID_KBD_BACKLIGHT, ctrl_param, NULL); } diff --git a/drivers/platform/x86/gpd-pocket-fan.c b/drivers/platform/x86/gpd-pocket-fan.c index be85ed966bf3..b471b86c28fe 100644 --- a/drivers/platform/x86/gpd-pocket-fan.c +++ b/drivers/platform/x86/gpd-pocket-fan.c @@ -16,17 +16,27 @@ #define MAX_SPEED 3 -static int temp_limits[3] = { 55000, 60000, 65000 }; +#define TEMP_LIMIT0_DEFAULT 55000 +#define TEMP_LIMIT1_DEFAULT 60000 +#define TEMP_LIMIT2_DEFAULT 65000 + +#define HYSTERESIS_DEFAULT 3000 + +#define SPEED_ON_AC_DEFAULT 2 + +static int temp_limits[3] = { + TEMP_LIMIT0_DEFAULT, TEMP_LIMIT1_DEFAULT, TEMP_LIMIT2_DEFAULT, +}; module_param_array(temp_limits, int, NULL, 0444); MODULE_PARM_DESC(temp_limits, "Millicelsius values above which the fan speed increases"); -static int hysteresis = 3000; +static int hysteresis = HYSTERESIS_DEFAULT; module_param(hysteresis, int, 0444); MODULE_PARM_DESC(hysteresis, "Hysteresis in millicelsius before lowering the fan speed"); -static int speed_on_ac = 2; +static int speed_on_ac = SPEED_ON_AC_DEFAULT; module_param(speed_on_ac, int, 0444); MODULE_PARM_DESC(speed_on_ac, "minimum fan speed to allow when system is powered by AC"); @@ -117,21 +127,24 @@ static int gpd_pocket_fan_probe(struct platform_device *pdev) int i; for (i = 0; i < ARRAY_SIZE(temp_limits); i++) { - if (temp_limits[i] < 40000 || temp_limits[i] > 70000) { + if (temp_limits[i] < 20000 || temp_limits[i] > 90000) { dev_err(&pdev->dev, "Invalid temp-limit %d (must be between 40000 and 70000)\n", temp_limits[i]); - return -EINVAL; + temp_limits[0] = TEMP_LIMIT0_DEFAULT; + temp_limits[1] = TEMP_LIMIT1_DEFAULT; + temp_limits[2] = TEMP_LIMIT2_DEFAULT; + break; } } if (hysteresis < 1000 || hysteresis > 10000) { dev_err(&pdev->dev, "Invalid hysteresis %d (must be between 1000 and 10000)\n", hysteresis); - return -EINVAL; + hysteresis = HYSTERESIS_DEFAULT; } if (speed_on_ac < 0 || speed_on_ac > MAX_SPEED) { dev_err(&pdev->dev, "Invalid speed_on_ac %d (must be between 0 and 3)\n", speed_on_ac); - return -EINVAL; + speed_on_ac = SPEED_ON_AC_DEFAULT; } fan = devm_kzalloc(&pdev->dev, sizeof(*fan), GFP_KERNEL); diff --git a/drivers/platform/x86/intel_ips.h b/drivers/platform/x86/intel_ips.h index 512ad234ad0d..35ed9711c7b9 100644 --- a/drivers/platform/x86/intel_ips.h +++ b/drivers/platform/x86/intel_ips.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2010 Intel Corporation */ diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h index fdee5772e532..8203ae38dc46 100644 --- a/drivers/platform/x86/intel_pmc_core.h +++ b/drivers/platform/x86/intel_pmc_core.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Intel Core SoC Power Management Controller Header File * diff --git a/drivers/platform/x86/intel_pmc_core_pltdrv.c b/drivers/platform/x86/intel_pmc_core_pltdrv.c index 6fe829f30997..e1266f5c6359 100644 --- a/drivers/platform/x86/intel_pmc_core_pltdrv.c +++ b/drivers/platform/x86/intel_pmc_core_pltdrv.c @@ -44,6 +44,8 @@ static const struct x86_cpu_id intel_pmc_core_platform_ids[] = { INTEL_CPU_FAM6(KABYLAKE, pmc_core_device), INTEL_CPU_FAM6(CANNONLAKE_L, pmc_core_device), INTEL_CPU_FAM6(ICELAKE_L, pmc_core_device), + INTEL_CPU_FAM6(COMETLAKE, pmc_core_device), + INTEL_CPU_FAM6(COMETLAKE_L, pmc_core_device), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_pmc_core_platform_ids); diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index a67701ed93e8..2e5b6a6834da 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1295,6 +1295,9 @@ struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) struct cpuinfo_x86 *c = &cpu_data(cpu); int ret; + if (!rapl_defaults) + return ERR_PTR(-ENODEV); + rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); if (!rp) return ERR_PTR(-ENOMEM); diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index c382158f587d..475c60dccaa4 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -56,20 +56,6 @@ config PTP_1588_CLOCK_QORIQ To compile this driver as a module, choose M here: the module will be called ptp-qoriq. -config PTP_1588_CLOCK_IXP46X - tristate "Intel IXP46x as PTP clock" - depends on IXP4XX_ETH - depends on PTP_1588_CLOCK - default y - help - This driver adds support for using the IXP46X as a PTP - clock. This clock is only useful if your PTP programs are - getting hardware time stamps on the PTP Ethernet packets - using the SO_TIMESTAMPING API. - - To compile this driver as a module, choose M here: the module - will be called ptp_ixp46x. - comment "Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks." depends on PHYLIB=n || NETWORK_PHY_TIMESTAMPING=n diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 3fb91bebbaf7..8c830336f178 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -7,10 +7,9 @@ ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o -obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o ptp-qoriq-y += ptp_qoriq.o ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o -obj-$(CONFIG_PTP_1588_CLOCK_IDTCM) += ptp_clockmatrix.o
\ No newline at end of file +obj-$(CONFIG_PTP_1588_CLOCK_IDTCM) += ptp_clockmatrix.o diff --git a/drivers/ptp/idt8a340_reg.h b/drivers/ptp/idt8a340_reg.h index 9263bc33b8f4..69eedda9f731 100644 --- a/drivers/ptp/idt8a340_reg.h +++ b/drivers/ptp/idt8a340_reg.h @@ -77,6 +77,8 @@ #define JTAG_DEVICE_ID 0x001c #define PRODUCT_ID 0x001e +#define OTP_SCSR_CONFIG_SELECT 0x0022 + #define STATUS 0xc03c #define USER_GPIO0_TO_7_STATUS 0x008a #define USER_GPIO8_TO_15_STATUS 0x008b diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index da97a5bab26e..ac1f2bf9e888 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -170,6 +170,7 @@ static void ptp_clock_release(struct device *dev) { struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); + ptp_cleanup_pin_groups(ptp); mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); ida_simple_remove(&ptp_clocks_map, ptp->index); @@ -302,9 +303,8 @@ int ptp_clock_unregister(struct ptp_clock *ptp) if (ptp->pps_source) pps_unregister_source(ptp->pps_source); - ptp_cleanup_pin_groups(ptp); - posix_clock_unregister(&ptp->clock); + return 0; } EXPORT_SYMBOL(ptp_clock_unregister); diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index e85836715f0b..032e112c3dd9 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -405,6 +405,7 @@ static int _idtcm_set_dpll_tod(struct idtcm_channel *channel, if (wr_trig == HW_TOD_WR_TRIG_SEL_MSB) { if (idtcm->calculate_overhead_flag) { + /* Assumption: I2C @ 400KHz */ total_overhead_ns = ktime_to_ns(ktime_get_raw() - idtcm->start_time) + idtcm->tod_write_overhead_ns @@ -596,44 +597,7 @@ static int idtcm_state_machine_reset(struct idtcm *idtcm) static int idtcm_read_hw_rev_id(struct idtcm *idtcm, u8 *hw_rev_id) { - return idtcm_read(idtcm, - GENERAL_STATUS, - HW_REV_ID, - hw_rev_id, - sizeof(u8)); -} - -static int idtcm_read_bond_id(struct idtcm *idtcm, u8 *bond_id) -{ - return idtcm_read(idtcm, - GENERAL_STATUS, - BOND_ID, - bond_id, - sizeof(u8)); -} - -static int idtcm_read_hw_csr_id(struct idtcm *idtcm, u16 *hw_csr_id) -{ - int err; - u8 buf[2] = {0}; - - err = idtcm_read(idtcm, GENERAL_STATUS, HW_CSR_ID, buf, sizeof(buf)); - - *hw_csr_id = (buf[1] << 8) | buf[0]; - - return err; -} - -static int idtcm_read_hw_irq_id(struct idtcm *idtcm, u16 *hw_irq_id) -{ - int err; - u8 buf[2] = {0}; - - err = idtcm_read(idtcm, GENERAL_STATUS, HW_IRQ_ID, buf, sizeof(buf)); - - *hw_irq_id = (buf[1] << 8) | buf[0]; - - return err; + return idtcm_read(idtcm, HW_REVISION, REV_ID, hw_rev_id, sizeof(u8)); } static int idtcm_read_product_id(struct idtcm *idtcm, u16 *product_id) @@ -674,20 +638,11 @@ static int idtcm_read_hotfix_release(struct idtcm *idtcm, u8 *hotfix) sizeof(u8)); } -static int idtcm_read_pipeline(struct idtcm *idtcm, u32 *pipeline) +static int idtcm_read_otp_scsr_config_select(struct idtcm *idtcm, + u8 *config_select) { - int err; - u8 buf[4] = {0}; - - err = idtcm_read(idtcm, - GENERAL_STATUS, - PIPELINE_ID, - &buf[0], - sizeof(buf)); - - *pipeline = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; - - return err; + return idtcm_read(idtcm, GENERAL_STATUS, OTP_SCSR_CONFIG_SELECT, + config_select, sizeof(u8)); } static int process_pll_mask(struct idtcm *idtcm, u32 addr, u8 val, u8 *mask) @@ -1078,28 +1033,22 @@ static void idtcm_display_version_info(struct idtcm *idtcm) u8 major; u8 minor; u8 hotfix; - u32 pipeline; u16 product_id; - u16 csr_id; - u16 irq_id; u8 hw_rev_id; - u8 bond_id; + u8 config_select; + char *fmt = "%d.%d.%d, Id: 0x%04x HW Rev: %d OTP Config Select: %d\n"; idtcm_read_major_release(idtcm, &major); idtcm_read_minor_release(idtcm, &minor); idtcm_read_hotfix_release(idtcm, &hotfix); - idtcm_read_pipeline(idtcm, &pipeline); idtcm_read_product_id(idtcm, &product_id); idtcm_read_hw_rev_id(idtcm, &hw_rev_id); - idtcm_read_bond_id(idtcm, &bond_id); - idtcm_read_hw_csr_id(idtcm, &csr_id); - idtcm_read_hw_irq_id(idtcm, &irq_id); - - dev_info(&idtcm->client->dev, "Version: %d.%d.%d, Pipeline %u\t" - "0x%04x, Rev %d, Bond %d, CSR %d, IRQ %d\n", - major, minor, hotfix, pipeline, - product_id, hw_rev_id, bond_id, csr_id, irq_id); + + idtcm_read_otp_scsr_config_select(idtcm, &config_select); + + dev_info(&idtcm->client->dev, fmt, major, minor, hotfix, + product_id, hw_rev_id, config_select); } static const struct ptp_clock_info idtcm_caps = { diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 989506bd90b1..16f0c8570036 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -413,10 +413,13 @@ static int axp20x_set_ramp_delay(struct regulator_dev *rdev, int ramp) int i; for (i = 0; i < rate_count; i++) { - if (ramp <= slew_rates[i]) - cfg = AXP20X_DCDC2_LDO3_V_RAMP_LDO3_RATE(i); - else + if (ramp > slew_rates[i]) break; + + if (id == AXP20X_DCDC2) + cfg = AXP20X_DCDC2_LDO3_V_RAMP_DCDC2_RATE(i); + else + cfg = AXP20X_DCDC2_LDO3_V_RAMP_LDO3_RATE(i); } if (cfg == 0xff) { @@ -605,7 +608,7 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_PWR_OUT_CTRL2, AXP22X_PWR_OUT_ELDO1_MASK), AXP_DESC(AXP22X, ELDO2, "eldo2", "eldoin", 700, 3300, 100, AXP22X_ELDO2_V_OUT, AXP22X_ELDO2_V_OUT_MASK, - AXP22X_PWR_OUT_CTRL2, AXP22X_PWR_OUT_ELDO1_MASK), + AXP22X_PWR_OUT_CTRL2, AXP22X_PWR_OUT_ELDO2_MASK), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, AXP22X_ELDO3_V_OUT_MASK, AXP22X_PWR_OUT_CTRL2, AXP22X_PWR_OUT_ELDO3_MASK), diff --git a/drivers/regulator/bd70528-regulator.c b/drivers/regulator/bd70528-regulator.c index ec764022621f..5bf8a2dc5fe7 100644 --- a/drivers/regulator/bd70528-regulator.c +++ b/drivers/regulator/bd70528-regulator.c @@ -101,7 +101,6 @@ static const struct regulator_ops bd70528_ldo_ops = { .set_voltage_sel = regulator_set_voltage_sel_regmap, .get_voltage_sel = regulator_get_voltage_sel_regmap, .set_voltage_time_sel = regulator_set_voltage_time_sel, - .set_ramp_delay = bd70528_set_ramp_delay, }; static const struct regulator_ops bd70528_led_ops = { diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index df2829dd55ad..2ecd8752b088 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -172,20 +172,7 @@ int mc146818_set_time(struct rtc_time *time) save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - -#ifdef CONFIG_X86 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 0x17) || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - CMOS_WRITE((save_freq_select & (~RTC_DIV_RESET2)), - RTC_FREQ_SELECT); - save_freq_select &= ~RTC_DIV_RESET2; - } else - CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), - RTC_FREQ_SELECT); -#else - CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), RTC_FREQ_SELECT); -#endif + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index 5249fc99fd5f..9135e2101752 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -47,7 +47,7 @@ static irqreturn_t mtk_rtc_irq_handler_thread(int irq, void *data) irqen = irqsta & ~RTC_IRQ_EN_AL; mutex_lock(&rtc->lock); if (regmap_write(rtc->regmap, rtc->addr_base + RTC_IRQ_EN, - irqen) < 0) + irqen) == 0) mtk_rtc_write_trigger(rtc); mutex_unlock(&rtc->lock); @@ -169,12 +169,12 @@ static int mtk_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) alm->pending = !!(pdn2 & RTC_PDN2_PWRON_ALARM); mutex_unlock(&rtc->lock); - tm->tm_sec = data[RTC_OFFSET_SEC]; - tm->tm_min = data[RTC_OFFSET_MIN]; - tm->tm_hour = data[RTC_OFFSET_HOUR]; - tm->tm_mday = data[RTC_OFFSET_DOM]; - tm->tm_mon = data[RTC_OFFSET_MTH]; - tm->tm_year = data[RTC_OFFSET_YEAR]; + tm->tm_sec = data[RTC_OFFSET_SEC] & RTC_AL_SEC_MASK; + tm->tm_min = data[RTC_OFFSET_MIN] & RTC_AL_MIN_MASK; + tm->tm_hour = data[RTC_OFFSET_HOUR] & RTC_AL_HOU_MASK; + tm->tm_mday = data[RTC_OFFSET_DOM] & RTC_AL_DOM_MASK; + tm->tm_mon = data[RTC_OFFSET_MTH] & RTC_AL_MTH_MASK; + tm->tm_year = data[RTC_OFFSET_YEAR] & RTC_AL_YEA_MASK; tm->tm_year += RTC_MIN_YEAR_OFFSET; tm->tm_mon--; @@ -195,14 +195,25 @@ static int mtk_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) tm->tm_year -= RTC_MIN_YEAR_OFFSET; tm->tm_mon++; - data[RTC_OFFSET_SEC] = tm->tm_sec; - data[RTC_OFFSET_MIN] = tm->tm_min; - data[RTC_OFFSET_HOUR] = tm->tm_hour; - data[RTC_OFFSET_DOM] = tm->tm_mday; - data[RTC_OFFSET_MTH] = tm->tm_mon; - data[RTC_OFFSET_YEAR] = tm->tm_year; - mutex_lock(&rtc->lock); + ret = regmap_bulk_read(rtc->regmap, rtc->addr_base + RTC_AL_SEC, + data, RTC_OFFSET_COUNT); + if (ret < 0) + goto exit; + + data[RTC_OFFSET_SEC] = ((data[RTC_OFFSET_SEC] & ~(RTC_AL_SEC_MASK)) | + (tm->tm_sec & RTC_AL_SEC_MASK)); + data[RTC_OFFSET_MIN] = ((data[RTC_OFFSET_MIN] & ~(RTC_AL_MIN_MASK)) | + (tm->tm_min & RTC_AL_MIN_MASK)); + data[RTC_OFFSET_HOUR] = ((data[RTC_OFFSET_HOUR] & ~(RTC_AL_HOU_MASK)) | + (tm->tm_hour & RTC_AL_HOU_MASK)); + data[RTC_OFFSET_DOM] = ((data[RTC_OFFSET_DOM] & ~(RTC_AL_DOM_MASK)) | + (tm->tm_mday & RTC_AL_DOM_MASK)); + data[RTC_OFFSET_MTH] = ((data[RTC_OFFSET_MTH] & ~(RTC_AL_MTH_MASK)) | + (tm->tm_mon & RTC_AL_MTH_MASK)); + data[RTC_OFFSET_YEAR] = ((data[RTC_OFFSET_YEAR] & ~(RTC_AL_YEA_MASK)) | + (tm->tm_year & RTC_AL_YEA_MASK)); + if (alm->enabled) { ret = regmap_bulk_write(rtc->regmap, rtc->addr_base + RTC_AL_SEC, diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index 8dcd20b34dde..852f5f3b3592 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -379,6 +379,22 @@ static void __init sun50i_h6_rtc_clk_init(struct device_node *node) CLK_OF_DECLARE_DRIVER(sun50i_h6_rtc_clk, "allwinner,sun50i-h6-rtc", sun50i_h6_rtc_clk_init); +/* + * The R40 user manual is self-conflicting on whether the prescaler is + * fixed or configurable. The clock diagram shows it as fixed, but there + * is also a configurable divider in the RTC block. + */ +static const struct sun6i_rtc_clk_data sun8i_r40_rtc_data = { + .rc_osc_rate = 16000000, + .fixed_prescaler = 512, +}; +static void __init sun8i_r40_rtc_clk_init(struct device_node *node) +{ + sun6i_rtc_clk_init(node, &sun8i_r40_rtc_data); +} +CLK_OF_DECLARE_DRIVER(sun8i_r40_rtc_clk, "allwinner,sun8i-r40-rtc", + sun8i_r40_rtc_clk_init); + static const struct sun6i_rtc_clk_data sun8i_v3_rtc_data = { .rc_osc_rate = 32000, .has_out_clk = 1, diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index a1915061932e..5256e3ce84e5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -793,8 +793,6 @@ static int ap_device_probe(struct device *dev) drvres = ap_drv->flags & AP_DRIVER_FLAG_DEFAULT; if (!!devres != !!drvres) return -ENODEV; - /* (re-)init queue's state machine */ - ap_queue_reinit_state(to_ap_queue(dev)); } /* Add queue/card to list of active queues/cards */ diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 433b7b64368d..bb35ba4a8d24 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -261,7 +261,7 @@ void ap_queue_prepare_remove(struct ap_queue *aq); void ap_queue_remove(struct ap_queue *aq); void ap_queue_suspend(struct ap_device *ap_dev); void ap_queue_resume(struct ap_device *ap_dev); -void ap_queue_reinit_state(struct ap_queue *aq); +void ap_queue_init_state(struct ap_queue *aq); struct ap_card *ap_card_create(int id, int queue_depth, int raw_device_type, int comp_device_type, unsigned int functions); diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index dad2be333d82..37c3bdc3642d 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -638,7 +638,7 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) aq->ap_dev.device.type = &ap_queue_type; aq->ap_dev.device_type = device_type; aq->qid = qid; - aq->state = AP_STATE_RESET_START; + aq->state = AP_STATE_UNBOUND; aq->interrupt = AP_INTR_DISABLED; spin_lock_init(&aq->lock); INIT_LIST_HEAD(&aq->list); @@ -771,10 +771,11 @@ void ap_queue_remove(struct ap_queue *aq) spin_unlock_bh(&aq->lock); } -void ap_queue_reinit_state(struct ap_queue *aq) +void ap_queue_init_state(struct ap_queue *aq) { spin_lock_bh(&aq->lock); aq->state = AP_STATE_RESET_START; ap_wait(ap_sm_event(aq, AP_EVENT_POLL)); spin_unlock_bh(&aq->lock); } +EXPORT_SYMBOL(ap_queue_init_state); diff --git a/drivers/s390/crypto/zcrypt_ccamisc.c b/drivers/s390/crypto/zcrypt_ccamisc.c index c1db64a2db21..110fe9d0cb91 100644 --- a/drivers/s390/crypto/zcrypt_ccamisc.c +++ b/drivers/s390/crypto/zcrypt_ccamisc.c @@ -1037,8 +1037,8 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain, prepparm = (struct iprepparm *) prepcblk->rpl_parmb; /* do some plausibility checks on the key block */ - if (prepparm->kb.len < 120 + 5 * sizeof(uint16_t) || - prepparm->kb.len > 136 + 5 * sizeof(uint16_t)) { + if (prepparm->kb.len < 120 + 3 * sizeof(uint16_t) || + prepparm->kb.len > 136 + 3 * sizeof(uint16_t)) { DEBUG_ERR("%s reply with invalid or unknown key block\n", __func__); rc = -EIO; diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index c50f3e86cc74..7cbb384ec535 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -175,6 +175,7 @@ static int zcrypt_cex2a_queue_probe(struct ap_device *ap_dev) zq->queue = aq; zq->online = 1; atomic_set(&zq->load, 0); + ap_queue_init_state(aq); ap_queue_init_reply(aq, &zq->reply); aq->request_timeout = CEX2A_CLEANUP_TIME, aq->private = zq; diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c index 35c7c6672713..c78c0d119806 100644 --- a/drivers/s390/crypto/zcrypt_cex2c.c +++ b/drivers/s390/crypto/zcrypt_cex2c.c @@ -220,6 +220,7 @@ static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev) zq->queue = aq; zq->online = 1; atomic_set(&zq->load, 0); + ap_rapq(aq->qid); rc = zcrypt_cex2c_rng_supported(aq); if (rc < 0) { zcrypt_queue_free(zq); @@ -231,6 +232,7 @@ static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev) else zq->ops = zcrypt_msgtype(MSGTYPE06_NAME, MSGTYPE06_VARIANT_NORNG); + ap_queue_init_state(aq); ap_queue_init_reply(aq, &zq->reply); aq->request_timeout = CEX2C_CLEANUP_TIME; aq->private = zq; diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index 442e3d6162f7..6fabc906114c 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -381,6 +381,7 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev) zq->queue = aq; zq->online = 1; atomic_set(&zq->load, 0); + ap_queue_init_state(aq); ap_queue_init_reply(aq, &zq->reply); aq->request_timeout = CEX4_CLEANUP_TIME, aq->private = zq; diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 7c37e94fb28b..9575a627a1e1 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -558,7 +558,6 @@ enum qeth_channel_states { */ enum qeth_card_states { CARD_STATE_DOWN, - CARD_STATE_HARDSETUP, CARD_STATE_SOFTSETUP, }; @@ -608,6 +607,8 @@ struct qeth_cmd_buffer { long timeout; unsigned char *data; void (*finalize)(struct qeth_card *card, struct qeth_cmd_buffer *iob); + bool (*match)(struct qeth_cmd_buffer *iob, + struct qeth_cmd_buffer *reply); void (*callback)(struct qeth_card *card, struct qeth_cmd_buffer *iob, unsigned int data_length); int rc; @@ -618,6 +619,14 @@ static inline void qeth_get_cmd(struct qeth_cmd_buffer *iob) refcount_inc(&iob->ref_count); } +static inline struct qeth_ipa_cmd *__ipa_reply(struct qeth_cmd_buffer *iob) +{ + if (!IS_IPA(iob->data)) + return NULL; + + return (struct qeth_ipa_cmd *) PDU_ENCAPSULATION(iob->data); +} + static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob) { return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE); @@ -727,11 +736,10 @@ struct qeth_osn_info { struct qeth_discipline { const struct device_type *devtype; - int (*recover)(void *ptr); int (*setup) (struct ccwgroup_device *); void (*remove) (struct ccwgroup_device *); - int (*set_online) (struct ccwgroup_device *); - int (*set_offline) (struct ccwgroup_device *); + int (*set_online)(struct qeth_card *card); + void (*set_offline)(struct qeth_card *card); int (*do_ioctl)(struct net_device *dev, struct ifreq *rq, int cmd); int (*control_event_handler)(struct qeth_card *card, struct qeth_ipa_cmd *cmd); @@ -987,14 +995,11 @@ struct net_device *qeth_clone_netdev(struct net_device *orig); struct qeth_card *qeth_get_card_by_busid(char *bus_id); void qeth_set_allowed_threads(struct qeth_card *, unsigned long , int); int qeth_threads_running(struct qeth_card *, unsigned long); -int qeth_do_run_thread(struct qeth_card *, unsigned long); -void qeth_clear_thread_start_bit(struct qeth_card *, unsigned long); -void qeth_clear_thread_running_bit(struct qeth_card *, unsigned long); int qeth_core_hardsetup_card(struct qeth_card *card, bool *carrier_ok); int qeth_stop_channel(struct qeth_channel *channel); +int qeth_set_offline(struct qeth_card *card, bool resetting); void qeth_print_status_message(struct qeth_card *); -int qeth_init_qdio_queues(struct qeth_card *); int qeth_send_ipa_cmd(struct qeth_card *, struct qeth_cmd_buffer *, int (*reply_cb) (struct qeth_card *, struct qeth_reply *, unsigned long), @@ -1027,7 +1032,9 @@ void qeth_setadp_promisc_mode(struct qeth_card *card, bool enable); int qeth_setadpparms_change_macaddr(struct qeth_card *); void qeth_tx_timeout(struct net_device *, unsigned int txqueue); void qeth_prepare_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob, - u16 cmd_length); + u16 cmd_length, + bool (*match)(struct qeth_cmd_buffer *iob, + struct qeth_cmd_buffer *reply)); int qeth_query_switch_attributes(struct qeth_card *card, struct qeth_switch_info *sw_info); int qeth_query_card_info(struct qeth_card *card, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index c19cc3978e1f..9639938581f5 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -247,9 +247,6 @@ int qeth_realloc_buffer_pool(struct qeth_card *card, int bufcnt) { QETH_CARD_TEXT(card, 2, "realcbp"); - if (card->state != CARD_STATE_DOWN) - return -EPERM; - /* TODO: steel/add buffers from/to a running card's buffer pool (?) */ qeth_clear_working_pool_list(card); qeth_free_buffer_pool(card); @@ -748,8 +745,8 @@ static void qeth_issue_next_read_cb(struct qeth_card *card, goto out; } - if (IS_IPA(iob->data)) { - cmd = (struct qeth_ipa_cmd *) PDU_ENCAPSULATION(iob->data); + cmd = __ipa_reply(iob); + if (cmd) { cmd = qeth_check_ipa_data(card, cmd); if (!cmd) goto out; @@ -758,17 +755,12 @@ static void qeth_issue_next_read_cb(struct qeth_card *card, card->osn_info.assist_cb(card->dev, cmd); goto out; } - } else { - /* non-IPA commands should only flow during initialization */ - if (card->state != CARD_STATE_DOWN) - goto out; } /* match against pending cmd requests */ spin_lock_irqsave(&card->lock, flags); list_for_each_entry(tmp, &card->cmd_waiter_list, list) { - if (!IS_IPA(tmp->data) || - __ipa_cmd(tmp)->hdr.seqno == cmd->hdr.seqno) { + if (tmp->match && tmp->match(tmp, iob)) { request = tmp; /* take the object outside the lock */ qeth_get_cmd(request); @@ -823,7 +815,8 @@ static int qeth_set_thread_start_bit(struct qeth_card *card, return 0; } -void qeth_clear_thread_start_bit(struct qeth_card *card, unsigned long thread) +static void qeth_clear_thread_start_bit(struct qeth_card *card, + unsigned long thread) { unsigned long flags; @@ -832,9 +825,9 @@ void qeth_clear_thread_start_bit(struct qeth_card *card, unsigned long thread) spin_unlock_irqrestore(&card->thread_mask_lock, flags); wake_up(&card->wait_q); } -EXPORT_SYMBOL_GPL(qeth_clear_thread_start_bit); -void qeth_clear_thread_running_bit(struct qeth_card *card, unsigned long thread) +static void qeth_clear_thread_running_bit(struct qeth_card *card, + unsigned long thread) { unsigned long flags; @@ -843,7 +836,6 @@ void qeth_clear_thread_running_bit(struct qeth_card *card, unsigned long thread) spin_unlock_irqrestore(&card->thread_mask_lock, flags); wake_up_all(&card->wait_q); } -EXPORT_SYMBOL_GPL(qeth_clear_thread_running_bit); static int __qeth_do_run_thread(struct qeth_card *card, unsigned long thread) { @@ -864,7 +856,7 @@ static int __qeth_do_run_thread(struct qeth_card *card, unsigned long thread) return rc; } -int qeth_do_run_thread(struct qeth_card *card, unsigned long thread) +static int qeth_do_run_thread(struct qeth_card *card, unsigned long thread) { int rc = 0; @@ -872,7 +864,6 @@ int qeth_do_run_thread(struct qeth_card *card, unsigned long thread) (rc = __qeth_do_run_thread(card, thread)) >= 0); return rc; } -EXPORT_SYMBOL_GPL(qeth_do_run_thread); void qeth_schedule_recovery(struct qeth_card *card) { @@ -880,7 +871,6 @@ void qeth_schedule_recovery(struct qeth_card *card) if (qeth_set_thread_start_bit(card, QETH_RECOVER_THREAD) == 0) schedule_work(&card->kernel_thread_starter); } -EXPORT_SYMBOL_GPL(qeth_schedule_recovery); static int qeth_get_problem(struct qeth_card *card, struct ccw_device *cdev, struct irb *irb) @@ -1287,6 +1277,7 @@ static int qeth_do_start_thread(struct qeth_card *card, unsigned long thread) return rc; } +static int qeth_do_reset(void *data); static void qeth_start_kernel_thread(struct work_struct *work) { struct task_struct *ts; @@ -1298,8 +1289,7 @@ static void qeth_start_kernel_thread(struct work_struct *work) card->write.state != CH_STATE_UP) return; if (qeth_do_start_thread(card, QETH_RECOVER_THREAD)) { - ts = kthread_run(card->discipline->recover, (void *)card, - "qeth_recover"); + ts = kthread_run(qeth_do_reset, card, "qeth_recover"); if (IS_ERR(ts)) { qeth_clear_thread_start_bit(card, QETH_RECOVER_THREAD); qeth_clear_thread_running_bit(card, @@ -1690,6 +1680,13 @@ static void qeth_mpc_finalize_cmd(struct qeth_card *card, iob->callback = qeth_release_buffer_cb; } +static bool qeth_mpc_match_reply(struct qeth_cmd_buffer *iob, + struct qeth_cmd_buffer *reply) +{ + /* MPC cmds are issued strictly in sequence. */ + return !IS_IPA(reply->data); +} + static struct qeth_cmd_buffer *qeth_mpc_alloc_cmd(struct qeth_card *card, void *data, unsigned int data_length) @@ -1704,6 +1701,7 @@ static struct qeth_cmd_buffer *qeth_mpc_alloc_cmd(struct qeth_card *card, qeth_setup_ccw(__ccw_from_cmd(iob), CCW_CMD_WRITE, 0, data_length, iob->data); iob->finalize = qeth_mpc_finalize_cmd; + iob->match = qeth_mpc_match_reply; return iob; } @@ -2665,7 +2663,7 @@ static unsigned int qeth_tx_select_bulk_max(struct qeth_card *card, return card->ssqd.mmwc ? card->ssqd.mmwc : 1; } -int qeth_init_qdio_queues(struct qeth_card *card) +static int qeth_init_qdio_queues(struct qeth_card *card) { unsigned int i; int rc; @@ -2713,7 +2711,6 @@ int qeth_init_qdio_queues(struct qeth_card *card) } return 0; } -EXPORT_SYMBOL_GPL(qeth_init_qdio_queues); static void qeth_ipa_finalize_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob) @@ -2725,7 +2722,9 @@ static void qeth_ipa_finalize_cmd(struct qeth_card *card, } void qeth_prepare_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob, - u16 cmd_length) + u16 cmd_length, + bool (*match)(struct qeth_cmd_buffer *iob, + struct qeth_cmd_buffer *reply)) { u8 prot_type = qeth_mpc_select_prot_type(card); u16 total_length = iob->length; @@ -2733,6 +2732,7 @@ void qeth_prepare_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob, qeth_setup_ccw(__ccw_from_cmd(iob), CCW_CMD_WRITE, 0, total_length, iob->data); iob->finalize = qeth_ipa_finalize_cmd; + iob->match = match; memcpy(iob->data, IPA_PDU_HEADER, IPA_PDU_HEADER_SIZE); memcpy(QETH_IPA_PDU_LEN_TOTAL(iob->data), &total_length, 2); @@ -2745,6 +2745,14 @@ void qeth_prepare_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob, } EXPORT_SYMBOL_GPL(qeth_prepare_ipa_cmd); +static bool qeth_ipa_match_reply(struct qeth_cmd_buffer *iob, + struct qeth_cmd_buffer *reply) +{ + struct qeth_ipa_cmd *ipa_reply = __ipa_reply(reply); + + return ipa_reply && (__ipa_cmd(iob)->hdr.seqno == ipa_reply->hdr.seqno); +} + struct qeth_cmd_buffer *qeth_ipa_alloc_cmd(struct qeth_card *card, enum qeth_ipa_cmds cmd_code, enum qeth_prot_versions prot, @@ -2760,7 +2768,7 @@ struct qeth_cmd_buffer *qeth_ipa_alloc_cmd(struct qeth_card *card, if (!iob) return NULL; - qeth_prepare_ipa_cmd(card, iob, data_length); + qeth_prepare_ipa_cmd(card, iob, data_length, qeth_ipa_match_reply); hdr = &__ipa_cmd(iob)->hdr; hdr->command = cmd_code; @@ -3096,7 +3104,6 @@ int qeth_hw_trap(struct qeth_card *card, enum qeth_diags_trap_action action) } return qeth_send_ipa_cmd(card, iob, qeth_hw_trap_cb, NULL); } -EXPORT_SYMBOL_GPL(qeth_hw_trap); static int qeth_check_qdio_errors(struct qeth_card *card, struct qdio_buffer *buf, @@ -5026,6 +5033,12 @@ retriable: if (rc) goto out; + rc = qeth_init_qdio_queues(card); + if (rc) { + QETH_CARD_TEXT_(card, 2, "9err%d", rc); + goto out; + } + return 0; out: dev_warn(&card->gdev->dev, "The qeth device driver failed to recover " @@ -5036,6 +5049,89 @@ out: } EXPORT_SYMBOL_GPL(qeth_core_hardsetup_card); +static int qeth_set_online(struct qeth_card *card) +{ + int rc; + + mutex_lock(&card->discipline_mutex); + mutex_lock(&card->conf_mutex); + QETH_CARD_TEXT(card, 2, "setonlin"); + + rc = card->discipline->set_online(card); + + mutex_unlock(&card->conf_mutex); + mutex_unlock(&card->discipline_mutex); + + return rc; +} + +int qeth_set_offline(struct qeth_card *card, bool resetting) +{ + int rc, rc2, rc3; + + mutex_lock(&card->discipline_mutex); + mutex_lock(&card->conf_mutex); + QETH_CARD_TEXT(card, 3, "setoffl"); + + if ((!resetting && card->info.hwtrap) || card->info.hwtrap == 2) { + qeth_hw_trap(card, QETH_DIAGS_TRAP_DISARM); + card->info.hwtrap = 1; + } + + rtnl_lock(); + card->info.open_when_online = card->dev->flags & IFF_UP; + dev_close(card->dev); + netif_device_detach(card->dev); + netif_carrier_off(card->dev); + rtnl_unlock(); + + card->discipline->set_offline(card); + + rc = qeth_stop_channel(&card->data); + rc2 = qeth_stop_channel(&card->write); + rc3 = qeth_stop_channel(&card->read); + if (!rc) + rc = (rc2) ? rc2 : rc3; + if (rc) + QETH_CARD_TEXT_(card, 2, "1err%d", rc); + qdio_free(CARD_DDEV(card)); + + /* let user_space know that device is offline */ + kobject_uevent(&card->gdev->dev.kobj, KOBJ_CHANGE); + + mutex_unlock(&card->conf_mutex); + mutex_unlock(&card->discipline_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(qeth_set_offline); + +static int qeth_do_reset(void *data) +{ + struct qeth_card *card = data; + int rc; + + QETH_CARD_TEXT(card, 2, "recover1"); + if (!qeth_do_run_thread(card, QETH_RECOVER_THREAD)) + return 0; + QETH_CARD_TEXT(card, 2, "recover2"); + dev_warn(&card->gdev->dev, + "A recovery process has been started for the device\n"); + + qeth_set_offline(card, true); + rc = qeth_set_online(card); + if (!rc) { + dev_info(&card->gdev->dev, + "Device successfully recovered!\n"); + } else { + ccwgroup_set_offline(card->gdev); + dev_warn(&card->gdev->dev, + "The qeth device driver failed to recover an error on the device\n"); + } + qeth_clear_thread_start_bit(card, QETH_RECOVER_THREAD); + qeth_clear_thread_running_bit(card, QETH_RECOVER_THREAD); + return 0; +} + #if IS_ENABLED(CONFIG_QETH_L3) static void qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb, struct qeth_hdr *hdr) @@ -5972,7 +6068,8 @@ static int qeth_core_set_online(struct ccwgroup_device *gdev) goto err; } } - rc = card->discipline->set_online(gdev); + + rc = qeth_set_online(card); err: return rc; } @@ -5980,7 +6077,8 @@ err: static int qeth_core_set_offline(struct ccwgroup_device *gdev) { struct qeth_card *card = dev_get_drvdata(&gdev->dev); - return card->discipline->set_offline(gdev); + + return qeth_set_offline(card, false); } static void qeth_core_shutdown(struct ccwgroup_device *gdev) @@ -6003,7 +6101,7 @@ static int qeth_suspend(struct ccwgroup_device *gdev) if (gdev->state == CCWGROUP_OFFLINE) return 0; - card->discipline->set_offline(gdev); + qeth_set_offline(card, false); return 0; } @@ -6012,7 +6110,7 @@ static int qeth_resume(struct ccwgroup_device *gdev) struct qeth_card *card = dev_get_drvdata(&gdev->dev); int rc; - rc = card->discipline->set_online(gdev); + rc = qeth_set_online(card); qeth_set_allowed_threads(card, 0xffffffff, 0); if (rc) diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 7bd86027f559..2bd9993aa60b 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -24,8 +24,6 @@ static ssize_t qeth_dev_state_show(struct device *dev, switch (card->state) { case CARD_STATE_DOWN: return sprintf(buf, "DOWN\n"); - case CARD_STATE_HARDSETUP: - return sprintf(buf, "HARDSETUP\n"); case CARD_STATE_SOFTSETUP: if (card->dev->flags & IFF_UP) return sprintf(buf, "UP (LAN %s)\n", diff --git a/drivers/s390/net/qeth_l2.h b/drivers/s390/net/qeth_l2.h index ddc615b431a8..adf25c9fd2b3 100644 --- a/drivers/s390/net/qeth_l2.h +++ b/drivers/s390/net/qeth_l2.h @@ -13,7 +13,6 @@ extern const struct attribute_group *qeth_l2_attr_groups[]; int qeth_l2_create_device_attributes(struct device *); void qeth_l2_remove_device_attributes(struct device *); -void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card); int qeth_bridgeport_query_ports(struct qeth_card *card, enum qeth_sbp_roles *role, enum qeth_sbp_states *state); diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 7175b5d8a23c..692bd2623401 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -24,7 +24,6 @@ #include "qeth_core.h" #include "qeth_l2.h" -static int qeth_l2_set_offline(struct ccwgroup_device *); static void qeth_bridgeport_query_support(struct qeth_card *card); static void qeth_bridge_state_change(struct qeth_card *card, struct qeth_ipa_cmd *cmd); @@ -284,15 +283,12 @@ static void qeth_l2_stop_card(struct qeth_card *card) if (card->state == CARD_STATE_SOFTSETUP) { qeth_clear_ipacmd_list(card); - card->state = CARD_STATE_HARDSETUP; - } - if (card->state == CARD_STATE_HARDSETUP) { qeth_drain_output_queues(card); - qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } qeth_qdio_clear_card(card, 0); + qeth_clear_working_pool_list(card); flush_workqueue(card->event_wq); card->info.mac_bits &= ~QETH_LAYER2_MAC_REGISTERED; card->info.promisc_mode = 0; @@ -610,7 +606,7 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); if (cgdev->state == CCWGROUP_ONLINE) - qeth_l2_set_offline(cgdev); + qeth_set_offline(card, false); cancel_work_sync(&card->close_dev_work); if (qeth_netdev_is_registered(card->dev)) @@ -728,17 +724,31 @@ static void qeth_l2_trace_features(struct qeth_card *card) sizeof(card->options.vnicc.sup_chars)); } -static int qeth_l2_set_online(struct ccwgroup_device *gdev) +static void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card) { - struct qeth_card *card = dev_get_drvdata(&gdev->dev); + if (!card->options.sbp.reflect_promisc && + card->options.sbp.role != QETH_SBP_ROLE_NONE) { + /* Conditional to avoid spurious error messages */ + qeth_bridgeport_setrole(card, card->options.sbp.role); + /* Let the callback function refresh the stored role value. */ + qeth_bridgeport_query_ports(card, &card->options.sbp.role, + NULL); + } + if (card->options.sbp.hostnotification) { + if (qeth_bridgeport_an_set(card, 1)) + card->options.sbp.hostnotification = 0; + } else { + qeth_bridgeport_an_set(card, 0); + } +} + +static int qeth_l2_set_online(struct qeth_card *card) +{ + struct ccwgroup_device *gdev = card->gdev; struct net_device *dev = card->dev; int rc = 0; bool carrier_ok; - mutex_lock(&card->discipline_mutex); - mutex_lock(&card->conf_mutex); - QETH_CARD_TEXT(card, 2, "setonlin"); - rc = qeth_core_hardsetup_card(card, &carrier_ok); if (rc) { QETH_CARD_TEXT_(card, 2, "2err%04x", rc); @@ -748,9 +758,11 @@ static int qeth_l2_set_online(struct ccwgroup_device *gdev) mutex_lock(&card->sbp_lock); qeth_bridgeport_query_support(card); - if (card->options.sbp.supported_funcs) + if (card->options.sbp.supported_funcs) { + qeth_l2_setup_bridgeport_attrs(card); dev_info(&card->gdev->dev, - "The device represents a Bridge Capable Port\n"); + "The device represents a Bridge Capable Port\n"); + } mutex_unlock(&card->sbp_lock); qeth_l2_register_dev_addr(card); @@ -761,20 +773,11 @@ static int qeth_l2_set_online(struct ccwgroup_device *gdev) qeth_trace_features(card); qeth_l2_trace_features(card); - qeth_l2_setup_bridgeport_attrs(card); - - card->state = CARD_STATE_HARDSETUP; qeth_print_status_message(card); /* softsetup */ QETH_CARD_TEXT(card, 2, "softsetp"); - rc = qeth_init_qdio_queues(card); - if (rc) { - QETH_CARD_TEXT_(card, 2, "6err%d", rc); - rc = -ENODEV; - goto out_remove; - } card->state = CARD_STATE_SOFTSETUP; qeth_set_allowed_threads(card, 0xffffffff, 0); @@ -801,8 +804,6 @@ static int qeth_l2_set_online(struct ccwgroup_device *gdev) } /* let user_space know that device is online */ kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return 0; out_remove: @@ -811,81 +812,12 @@ out_remove: qeth_stop_channel(&card->write); qeth_stop_channel(&card->read); qdio_free(CARD_DDEV(card)); - - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return rc; } -static int __qeth_l2_set_offline(struct ccwgroup_device *cgdev, - int recovery_mode) +static void qeth_l2_set_offline(struct qeth_card *card) { - struct qeth_card *card = dev_get_drvdata(&cgdev->dev); - int rc = 0, rc2 = 0, rc3 = 0; - - mutex_lock(&card->discipline_mutex); - mutex_lock(&card->conf_mutex); - QETH_CARD_TEXT(card, 3, "setoffl"); - - if ((!recovery_mode && card->info.hwtrap) || card->info.hwtrap == 2) { - qeth_hw_trap(card, QETH_DIAGS_TRAP_DISARM); - card->info.hwtrap = 1; - } - - rtnl_lock(); - card->info.open_when_online = card->dev->flags & IFF_UP; - dev_close(card->dev); - netif_device_detach(card->dev); - netif_carrier_off(card->dev); - rtnl_unlock(); - qeth_l2_stop_card(card); - rc = qeth_stop_channel(&card->data); - rc2 = qeth_stop_channel(&card->write); - rc3 = qeth_stop_channel(&card->read); - if (!rc) - rc = (rc2) ? rc2 : rc3; - if (rc) - QETH_CARD_TEXT_(card, 2, "1err%d", rc); - qdio_free(CARD_DDEV(card)); - - /* let user_space know that device is offline */ - kobject_uevent(&cgdev->dev.kobj, KOBJ_CHANGE); - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); - return 0; -} - -static int qeth_l2_set_offline(struct ccwgroup_device *cgdev) -{ - return __qeth_l2_set_offline(cgdev, 0); -} - -static int qeth_l2_recover(void *ptr) -{ - struct qeth_card *card; - int rc = 0; - - card = (struct qeth_card *) ptr; - QETH_CARD_TEXT(card, 2, "recover1"); - if (!qeth_do_run_thread(card, QETH_RECOVER_THREAD)) - return 0; - QETH_CARD_TEXT(card, 2, "recover2"); - dev_warn(&card->gdev->dev, - "A recovery process has been started for the device\n"); - __qeth_l2_set_offline(card->gdev, 1); - rc = qeth_l2_set_online(card->gdev); - if (!rc) - dev_info(&card->gdev->dev, - "Device successfully recovered!\n"); - else { - ccwgroup_set_offline(card->gdev); - dev_warn(&card->gdev->dev, "The qeth device driver " - "failed to recover an error on the device\n"); - } - qeth_clear_thread_start_bit(card, QETH_RECOVER_THREAD); - qeth_clear_thread_running_bit(card, QETH_RECOVER_THREAD); - return 0; } static int __init qeth_l2_init(void) @@ -922,7 +854,6 @@ static int qeth_l2_control_event(struct qeth_card *card, struct qeth_discipline qeth_l2_discipline = { .devtype = &qeth_l2_devtype, - .recover = qeth_l2_recover, .setup = qeth_l2_probe_device, .remove = qeth_l2_remove_device, .set_online = qeth_l2_set_online, @@ -961,7 +892,8 @@ int qeth_osn_assist(struct net_device *dev, void *data, int data_len) if (!iob) return -ENOMEM; - qeth_prepare_ipa_cmd(card, iob, (u16) data_len); + qeth_prepare_ipa_cmd(card, iob, (u16) data_len, NULL); + memcpy(__ipa_cmd(iob), data, data_len); iob->callback = qeth_osn_assist_cb; return qeth_send_ipa_cmd(card, iob, NULL, NULL); diff --git a/drivers/s390/net/qeth_l2_sys.c b/drivers/s390/net/qeth_l2_sys.c index 7fa325cf6f8d..86bcae992f72 100644 --- a/drivers/s390/net/qeth_l2_sys.c +++ b/drivers/s390/net/qeth_l2_sys.c @@ -246,40 +246,6 @@ static struct attribute_group qeth_l2_bridgeport_attr_group = { .attrs = qeth_l2_bridgeport_attrs, }; -/** - * qeth_l2_setup_bridgeport_attrs() - set/restore attrs when turning online. - * @card: qeth_card structure pointer - * - * Note: this function is called with conf_mutex held by the caller - */ -void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card) -{ - int rc; - - if (!card) - return; - if (!card->options.sbp.supported_funcs) - return; - - mutex_lock(&card->sbp_lock); - if (!card->options.sbp.reflect_promisc && - card->options.sbp.role != QETH_SBP_ROLE_NONE) { - /* Conditional to avoid spurious error messages */ - qeth_bridgeport_setrole(card, card->options.sbp.role); - /* Let the callback function refresh the stored role value. */ - qeth_bridgeport_query_ports(card, - &card->options.sbp.role, NULL); - } - if (card->options.sbp.hostnotification) { - rc = qeth_bridgeport_an_set(card, 1); - if (rc) - card->options.sbp.hostnotification = 0; - } else { - qeth_bridgeport_an_set(card, 0); - } - mutex_unlock(&card->sbp_lock); -} - /* VNIC CHARS support */ /* convert sysfs attr name to VNIC characteristic */ diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 6c0bffd11138..317d56647a4a 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -37,8 +37,6 @@ #include "qeth_l3.h" - -static int qeth_l3_set_offline(struct ccwgroup_device *); static int qeth_l3_register_addr_entry(struct qeth_card *, struct qeth_ipaddr *); static int qeth_l3_deregister_addr_entry(struct qeth_card *, @@ -903,7 +901,7 @@ out: return rc; } -static int qeth_l3_start_ipassists(struct qeth_card *card) +static void qeth_l3_start_ipassists(struct qeth_card *card) { QETH_CARD_TEXT(card, 3, "strtipas"); @@ -913,7 +911,6 @@ static int qeth_l3_start_ipassists(struct qeth_card *card) qeth_l3_start_ipa_multicast(card); /* go on*/ qeth_l3_start_ipa_ipv6(card); /* go on*/ qeth_l3_start_ipa_broadcast(card); /* go on*/ - return 0; } static int qeth_l3_iqd_read_initial_mac_cb(struct qeth_card *card, @@ -1180,15 +1177,12 @@ static void qeth_l3_stop_card(struct qeth_card *card) if (card->state == CARD_STATE_SOFTSETUP) { qeth_l3_clear_ip_htable(card, 1); qeth_clear_ipacmd_list(card); - card->state = CARD_STATE_HARDSETUP; - } - if (card->state == CARD_STATE_HARDSETUP) { qeth_drain_output_queues(card); - qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } qeth_qdio_clear_card(card, 0); + qeth_clear_working_pool_list(card); flush_workqueue(card->event_wq); card->info.promisc_mode = 0; } @@ -2044,7 +2038,7 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); if (cgdev->state == CCWGROUP_ONLINE) - qeth_l3_set_offline(cgdev); + qeth_set_offline(card, false); cancel_work_sync(&card->close_dev_work); if (qeth_netdev_is_registered(card->dev)) @@ -2056,17 +2050,13 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_l3_clear_ipato_list(card); } -static int qeth_l3_set_online(struct ccwgroup_device *gdev) +static int qeth_l3_set_online(struct qeth_card *card) { - struct qeth_card *card = dev_get_drvdata(&gdev->dev); + struct ccwgroup_device *gdev = card->gdev; struct net_device *dev = card->dev; int rc = 0; bool carrier_ok; - mutex_lock(&card->discipline_mutex); - mutex_lock(&card->conf_mutex); - QETH_CARD_TEXT(card, 2, "setonlin"); - rc = qeth_core_hardsetup_card(card, &carrier_ok); if (rc) { QETH_CARD_TEXT_(card, 2, "2err%04x", rc); @@ -2074,7 +2064,6 @@ static int qeth_l3_set_online(struct ccwgroup_device *gdev) goto out_remove; } - card->state = CARD_STATE_HARDSETUP; qeth_print_status_message(card); /* softsetup */ @@ -2084,11 +2073,8 @@ static int qeth_l3_set_online(struct ccwgroup_device *gdev) if (rc) QETH_CARD_TEXT_(card, 2, "2err%04x", rc); if (!card->options.sniffer) { - rc = qeth_l3_start_ipassists(card); - if (rc) { - QETH_CARD_TEXT_(card, 2, "3err%d", rc); - goto out_remove; - } + qeth_l3_start_ipassists(card); + rc = qeth_l3_setrouting_v4(card); if (rc) QETH_CARD_TEXT_(card, 2, "4err%04x", rc); @@ -2097,12 +2083,6 @@ static int qeth_l3_set_online(struct ccwgroup_device *gdev) QETH_CARD_TEXT_(card, 2, "5err%04x", rc); } - rc = qeth_init_qdio_queues(card); - if (rc) { - QETH_CARD_TEXT_(card, 2, "6err%d", rc); - rc = -ENODEV; - goto out_remove; - } card->state = CARD_STATE_SOFTSETUP; qeth_set_allowed_threads(card, 0xffffffff, 0); @@ -2131,8 +2111,6 @@ static int qeth_l3_set_online(struct ccwgroup_device *gdev) qeth_trace_features(card); /* let user_space know that device is online */ kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return 0; out_remove: qeth_l3_stop_card(card); @@ -2140,82 +2118,12 @@ out_remove: qeth_stop_channel(&card->write); qeth_stop_channel(&card->read); qdio_free(CARD_DDEV(card)); - - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return rc; } -static int __qeth_l3_set_offline(struct ccwgroup_device *cgdev, - int recovery_mode) +static void qeth_l3_set_offline(struct qeth_card *card) { - struct qeth_card *card = dev_get_drvdata(&cgdev->dev); - int rc = 0, rc2 = 0, rc3 = 0; - - mutex_lock(&card->discipline_mutex); - mutex_lock(&card->conf_mutex); - QETH_CARD_TEXT(card, 3, "setoffl"); - - if ((!recovery_mode && card->info.hwtrap) || card->info.hwtrap == 2) { - qeth_hw_trap(card, QETH_DIAGS_TRAP_DISARM); - card->info.hwtrap = 1; - } - - rtnl_lock(); - card->info.open_when_online = card->dev->flags & IFF_UP; - dev_close(card->dev); - netif_device_detach(card->dev); - netif_carrier_off(card->dev); - rtnl_unlock(); - qeth_l3_stop_card(card); - rc = qeth_stop_channel(&card->data); - rc2 = qeth_stop_channel(&card->write); - rc3 = qeth_stop_channel(&card->read); - if (!rc) - rc = (rc2) ? rc2 : rc3; - if (rc) - QETH_CARD_TEXT_(card, 2, "1err%d", rc); - qdio_free(CARD_DDEV(card)); - - /* let user_space know that device is offline */ - kobject_uevent(&cgdev->dev.kobj, KOBJ_CHANGE); - mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); - return 0; -} - -static int qeth_l3_set_offline(struct ccwgroup_device *cgdev) -{ - return __qeth_l3_set_offline(cgdev, 0); -} - -static int qeth_l3_recover(void *ptr) -{ - struct qeth_card *card; - int rc = 0; - - card = (struct qeth_card *) ptr; - QETH_CARD_TEXT(card, 2, "recover1"); - QETH_CARD_HEX(card, 2, &card, sizeof(void *)); - if (!qeth_do_run_thread(card, QETH_RECOVER_THREAD)) - return 0; - QETH_CARD_TEXT(card, 2, "recover2"); - dev_warn(&card->gdev->dev, - "A recovery process has been started for the device\n"); - __qeth_l3_set_offline(card->gdev, 1); - rc = qeth_l3_set_online(card->gdev); - if (!rc) - dev_info(&card->gdev->dev, - "Device successfully recovered!\n"); - else { - ccwgroup_set_offline(card->gdev); - dev_warn(&card->gdev->dev, "The qeth device driver " - "failed to recover an error on the device\n"); - } - qeth_clear_thread_start_bit(card, QETH_RECOVER_THREAD); - qeth_clear_thread_running_bit(card, QETH_RECOVER_THREAD); - return 0; } /* Returns zero if the command is successfully "consumed" */ @@ -2227,7 +2135,6 @@ static int qeth_l3_control_event(struct qeth_card *card, struct qeth_discipline qeth_l3_discipline = { .devtype = &qeth_l3_devtype, - .recover = qeth_l3_recover, .setup = qeth_l3_probe_device, .remove = qeth_l3_remove_device, .set_online = qeth_l3_set_online, diff --git a/drivers/scsi/fnic/vnic_dev.c b/drivers/scsi/fnic/vnic_dev.c index 1f55b9e4e74a..1b88a3b53eee 100644 --- a/drivers/scsi/fnic/vnic_dev.c +++ b/drivers/scsi/fnic/vnic_dev.c @@ -688,26 +688,26 @@ int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done) int vnic_dev_hang_notify(struct vnic_dev *vdev) { - u64 a0, a1; + u64 a0 = 0, a1 = 0; int wait = 1000; return vnic_dev_cmd(vdev, CMD_HANG_NOTIFY, &a0, &a1, wait); } int vnic_dev_mac_addr(struct vnic_dev *vdev, u8 *mac_addr) { - u64 a0, a1; + u64 a[2] = {}; int wait = 1000; int err, i; for (i = 0; i < ETH_ALEN; i++) mac_addr[i] = 0; - err = vnic_dev_cmd(vdev, CMD_MAC_ADDR, &a0, &a1, wait); + err = vnic_dev_cmd(vdev, CMD_MAC_ADDR, &a[0], &a[1], wait); if (err) return err; for (i = 0; i < ETH_ALEN; i++) - mac_addr[i] = ((u8 *)&a0)[i]; + mac_addr[i] = ((u8 *)&a)[i]; return 0; } @@ -732,30 +732,30 @@ void vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast, void vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr) { - u64 a0 = 0, a1 = 0; + u64 a[2] = {}; int wait = 1000; int err; int i; for (i = 0; i < ETH_ALEN; i++) - ((u8 *)&a0)[i] = addr[i]; + ((u8 *)&a)[i] = addr[i]; - err = vnic_dev_cmd(vdev, CMD_ADDR_ADD, &a0, &a1, wait); + err = vnic_dev_cmd(vdev, CMD_ADDR_ADD, &a[0], &a[1], wait); if (err) pr_err("Can't add addr [%pM], %d\n", addr, err); } void vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr) { - u64 a0 = 0, a1 = 0; + u64 a[2] = {}; int wait = 1000; int err; int i; for (i = 0; i < ETH_ALEN; i++) - ((u8 *)&a0)[i] = addr[i]; + ((u8 *)&a)[i] = addr[i]; - err = vnic_dev_cmd(vdev, CMD_ADDR_DEL, &a0, &a1, wait); + err = vnic_dev_cmd(vdev, CMD_ADDR_DEL, &a[0], &a[1], wait); if (err) pr_err("Can't del addr [%pM], %d\n", addr, err); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cea625906440..65ce10c7989c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2211,8 +2211,10 @@ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer u8 type; int ret = 0; - if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) + if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { + sdkp->protection_type = 0; return ret; + } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index f8faf8b3d965..fb41636519ee 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1842,9 +1842,11 @@ static int storvsc_probe(struct hv_device *device, */ host->sg_tablesize = (stor_device->max_transfer_bytes >> PAGE_SHIFT); /* + * For non-IDE disks, the host supports multiple channels. * Set the number of HW queues we are supporting. */ - host->nr_hw_queues = num_present_cpus(); + if (!dev_is_ide) + host->nr_hw_queues = num_present_cpus(); /* * Set the error handler work queue. diff --git a/drivers/soc/amlogic/meson-ee-pwrc.c b/drivers/soc/amlogic/meson-ee-pwrc.c index 5823f5b67d16..3f0261d53ad9 100644 --- a/drivers/soc/amlogic/meson-ee-pwrc.c +++ b/drivers/soc/amlogic/meson-ee-pwrc.c @@ -323,6 +323,8 @@ static int meson_ee_pwrc_init_domain(struct platform_device *pdev, struct meson_ee_pwrc *pwrc, struct meson_ee_pwrc_domain *dom) { + int ret; + dom->pwrc = pwrc; dom->num_rstc = dom->desc.reset_names_count; dom->num_clks = dom->desc.clk_names_count; @@ -368,15 +370,21 @@ static int meson_ee_pwrc_init_domain(struct platform_device *pdev, * prepare/enable counters won't be in sync. */ if (dom->num_clks && dom->desc.get_power && !dom->desc.get_power(dom)) { - int ret = clk_bulk_prepare_enable(dom->num_clks, dom->clks); + ret = clk_bulk_prepare_enable(dom->num_clks, dom->clks); if (ret) return ret; - pm_genpd_init(&dom->base, &pm_domain_always_on_gov, false); - } else - pm_genpd_init(&dom->base, NULL, - (dom->desc.get_power ? - dom->desc.get_power(dom) : true)); + ret = pm_genpd_init(&dom->base, &pm_domain_always_on_gov, + false); + if (ret) + return ret; + } else { + ret = pm_genpd_init(&dom->base, NULL, + (dom->desc.get_power ? + dom->desc.get_power(dom) : true)); + if (ret) + return ret; + } return 0; } @@ -441,9 +449,7 @@ static int meson_ee_pwrc_probe(struct platform_device *pdev) pwrc->xlate.domains[i] = &dom->base; } - of_genpd_add_provider_onecell(pdev->dev.of_node, &pwrc->xlate); - - return 0; + return of_genpd_add_provider_onecell(pdev->dev.of_node, &pwrc->xlate); } static void meson_ee_pwrc_shutdown(struct platform_device *pdev) diff --git a/drivers/soc/sifive/sifive_l2_cache.c b/drivers/soc/sifive/sifive_l2_cache.c index a9ffff3277c7..a5069394cd61 100644 --- a/drivers/soc/sifive/sifive_l2_cache.c +++ b/drivers/soc/sifive/sifive_l2_cache.c @@ -9,7 +9,7 @@ #include <linux/interrupt.h> #include <linux/of_irq.h> #include <linux/of_address.h> -#include <asm/sifive_l2_cache.h> +#include <soc/sifive/sifive_l2_cache.h> #define SIFIVE_L2_DIRECCFIX_LOW 0x100 #define SIFIVE_L2_DIRECCFIX_HIGH 0x104 diff --git a/drivers/soc/ti/wkup_m3_ipc.c b/drivers/soc/ti/wkup_m3_ipc.c index 378369d9364a..e9ece45d7a33 100644 --- a/drivers/soc/ti/wkup_m3_ipc.c +++ b/drivers/soc/ti/wkup_m3_ipc.c @@ -419,6 +419,8 @@ static void wkup_m3_rproc_boot_thread(struct wkup_m3_ipc *m3_ipc) ret = rproc_boot(m3_ipc->rproc); if (ret) dev_err(dev, "rproc_boot failed\n"); + else + m3_ipc_state = m3_ipc; do_exit(0); } @@ -505,8 +507,6 @@ static int wkup_m3_ipc_probe(struct platform_device *pdev) goto err_put_rproc; } - m3_ipc_state = m3_ipc; - return 0; err_put_rproc: diff --git a/drivers/spi/spi-dw.c b/drivers/spi/spi-dw.c index 76d6b94a7597..5a25da377119 100644 --- a/drivers/spi/spi-dw.c +++ b/drivers/spi/spi-dw.c @@ -172,9 +172,11 @@ static inline u32 rx_max(struct dw_spi *dws) static void dw_writer(struct dw_spi *dws) { - u32 max = tx_max(dws); + u32 max; u16 txw = 0; + spin_lock(&dws->buf_lock); + max = tx_max(dws); while (max--) { /* Set the tx word if the transfer's original "tx" is not null */ if (dws->tx_end - dws->len) { @@ -186,13 +188,16 @@ static void dw_writer(struct dw_spi *dws) dw_write_io_reg(dws, DW_SPI_DR, txw); dws->tx += dws->n_bytes; } + spin_unlock(&dws->buf_lock); } static void dw_reader(struct dw_spi *dws) { - u32 max = rx_max(dws); + u32 max; u16 rxw; + spin_lock(&dws->buf_lock); + max = rx_max(dws); while (max--) { rxw = dw_read_io_reg(dws, DW_SPI_DR); /* Care rx only if the transfer's original "rx" is not null */ @@ -204,6 +209,7 @@ static void dw_reader(struct dw_spi *dws) } dws->rx += dws->n_bytes; } + spin_unlock(&dws->buf_lock); } static void int_error_stop(struct dw_spi *dws, const char *msg) @@ -276,18 +282,20 @@ static int dw_spi_transfer_one(struct spi_controller *master, { struct dw_spi *dws = spi_controller_get_devdata(master); struct chip_data *chip = spi_get_ctldata(spi); + unsigned long flags; u8 imask = 0; u16 txlevel = 0; u32 cr0; int ret; dws->dma_mapped = 0; - + spin_lock_irqsave(&dws->buf_lock, flags); dws->tx = (void *)transfer->tx_buf; dws->tx_end = dws->tx + transfer->len; dws->rx = transfer->rx_buf; dws->rx_end = dws->rx + transfer->len; dws->len = transfer->len; + spin_unlock_irqrestore(&dws->buf_lock, flags); spi_enable_chip(dws, 0); @@ -471,6 +479,7 @@ int dw_spi_add_host(struct device *dev, struct dw_spi *dws) dws->type = SSI_MOTO_SPI; dws->dma_inited = 0; dws->dma_addr = (dma_addr_t)(dws->paddr + DW_SPI_DR); + spin_lock_init(&dws->buf_lock); spi_controller_set_devdata(master, dws); diff --git a/drivers/spi/spi-dw.h b/drivers/spi/spi-dw.h index 38c7de1f0aa9..1bf5713e047d 100644 --- a/drivers/spi/spi-dw.h +++ b/drivers/spi/spi-dw.h @@ -119,6 +119,7 @@ struct dw_spi { size_t len; void *tx; void *tx_end; + spinlock_t buf_lock; void *rx; void *rx_end; int dma_mapped; diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 442cff71a0d2..8428b69c858b 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -185,6 +185,7 @@ struct fsl_dspi { struct spi_transfer *cur_transfer; struct spi_message *cur_msg; struct chip_data *cur_chip; + size_t progress; size_t len; const void *tx; void *rx; @@ -586,21 +587,14 @@ static void dspi_tcfq_write(struct fsl_dspi *dspi) dspi->tx_cmd |= SPI_PUSHR_CMD_CTCNT; if (dspi->devtype_data->xspi_mode && dspi->bits_per_word > 16) { - /* Write two TX FIFO entries first, and then the corresponding - * CMD FIFO entry. + /* Write the CMD FIFO entry first, and then the two + * corresponding TX FIFO entries. */ u32 data = dspi_pop_tx(dspi); - if (dspi->cur_chip->ctar_val & SPI_CTAR_LSBFE) { - /* LSB */ - tx_fifo_write(dspi, data & 0xFFFF); - tx_fifo_write(dspi, data >> 16); - } else { - /* MSB */ - tx_fifo_write(dspi, data >> 16); - tx_fifo_write(dspi, data & 0xFFFF); - } cmd_fifo_write(dspi); + tx_fifo_write(dspi, data & 0xFFFF); + tx_fifo_write(dspi, data >> 16); } else { /* Write one entry to both TX FIFO and CMD FIFO * simultaneously. @@ -658,7 +652,7 @@ static int dspi_rxtx(struct fsl_dspi *dspi) u32 spi_tcr; spi_take_timestamp_post(dspi->ctlr, dspi->cur_transfer, - dspi->tx - dspi->bytes_per_word, !dspi->irq); + dspi->progress, !dspi->irq); /* Get transfer counter (in number of SPI transfers). It was * reset to 0 when transfer(s) were started. @@ -667,6 +661,7 @@ static int dspi_rxtx(struct fsl_dspi *dspi) spi_tcnt = SPI_TCR_GET_TCNT(spi_tcr); /* Update total number of bytes that were transferred */ msg->actual_length += spi_tcnt * dspi->bytes_per_word; + dspi->progress += spi_tcnt; trans_mode = dspi->devtype_data->trans_mode; if (trans_mode == DSPI_EOQ_MODE) @@ -679,7 +674,7 @@ static int dspi_rxtx(struct fsl_dspi *dspi) return 0; spi_take_timestamp_pre(dspi->ctlr, dspi->cur_transfer, - dspi->tx, !dspi->irq); + dspi->progress, !dspi->irq); if (trans_mode == DSPI_EOQ_MODE) dspi_eoq_write(dspi); @@ -768,6 +763,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi->rx = transfer->rx_buf; dspi->rx_end = dspi->rx + transfer->len; dspi->len = transfer->len; + dspi->progress = 0; /* Validated transfer specific frame size (defaults applied) */ dspi->bits_per_word = transfer->bits_per_word; if (transfer->bits_per_word <= 8) @@ -789,7 +785,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, SPI_CTARE_DTCP(1)); spi_take_timestamp_pre(dspi->ctlr, dspi->cur_transfer, - dspi->tx, !dspi->irq); + dspi->progress, !dspi->irq); trans_mode = dspi->devtype_data->trans_mode; switch (trans_mode) { diff --git a/drivers/spi/spi-uniphier.c b/drivers/spi/spi-uniphier.c index 47cde1864630..ce9b30112e26 100644 --- a/drivers/spi/spi-uniphier.c +++ b/drivers/spi/spi-uniphier.c @@ -290,25 +290,32 @@ static void uniphier_spi_recv(struct uniphier_spi_priv *priv) } } -static void uniphier_spi_fill_tx_fifo(struct uniphier_spi_priv *priv) +static void uniphier_spi_set_fifo_threshold(struct uniphier_spi_priv *priv, + unsigned int threshold) { - unsigned int fifo_threshold, fill_bytes; u32 val; - fifo_threshold = DIV_ROUND_UP(priv->rx_bytes, - bytes_per_word(priv->bits_per_word)); - fifo_threshold = min(fifo_threshold, SSI_FIFO_DEPTH); - - fill_bytes = fifo_threshold - (priv->rx_bytes - priv->tx_bytes); - - /* set fifo threshold */ val = readl(priv->base + SSI_FC); val &= ~(SSI_FC_TXFTH_MASK | SSI_FC_RXFTH_MASK); - val |= FIELD_PREP(SSI_FC_TXFTH_MASK, fifo_threshold); - val |= FIELD_PREP(SSI_FC_RXFTH_MASK, fifo_threshold); + val |= FIELD_PREP(SSI_FC_TXFTH_MASK, SSI_FIFO_DEPTH - threshold); + val |= FIELD_PREP(SSI_FC_RXFTH_MASK, threshold); writel(val, priv->base + SSI_FC); +} + +static void uniphier_spi_fill_tx_fifo(struct uniphier_spi_priv *priv) +{ + unsigned int fifo_threshold, fill_words; + unsigned int bpw = bytes_per_word(priv->bits_per_word); + + fifo_threshold = DIV_ROUND_UP(priv->rx_bytes, bpw); + fifo_threshold = min(fifo_threshold, SSI_FIFO_DEPTH); + + uniphier_spi_set_fifo_threshold(priv, fifo_threshold); + + fill_words = fifo_threshold - + DIV_ROUND_UP(priv->rx_bytes - priv->tx_bytes, bpw); - while (fill_bytes--) + while (fill_words--) uniphier_spi_send(priv); } diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 5e4c4532f7f3..8994545367a2 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1499,8 +1499,7 @@ static void spi_pump_messages(struct kthread_work *work) * advances its @tx buffer pointer monotonically. * @ctlr: Pointer to the spi_controller structure of the driver * @xfer: Pointer to the transfer being timestamped - * @tx: Pointer to the current word within the xfer->tx_buf that the driver is - * preparing to transmit right now. + * @progress: How many words (not bytes) have been transferred so far * @irqs_off: If true, will disable IRQs and preemption for the duration of the * transfer, for less jitter in time measurement. Only compatible * with PIO drivers. If true, must follow up with @@ -1510,21 +1509,19 @@ static void spi_pump_messages(struct kthread_work *work) */ void spi_take_timestamp_pre(struct spi_controller *ctlr, struct spi_transfer *xfer, - const void *tx, bool irqs_off) + size_t progress, bool irqs_off) { - u8 bytes_per_word = DIV_ROUND_UP(xfer->bits_per_word, 8); - if (!xfer->ptp_sts) return; if (xfer->timestamped_pre) return; - if (tx < (xfer->tx_buf + xfer->ptp_sts_word_pre * bytes_per_word)) + if (progress < xfer->ptp_sts_word_pre) return; /* Capture the resolution of the timestamp */ - xfer->ptp_sts_word_pre = (tx - xfer->tx_buf) / bytes_per_word; + xfer->ptp_sts_word_pre = progress; xfer->timestamped_pre = true; @@ -1546,23 +1543,20 @@ EXPORT_SYMBOL_GPL(spi_take_timestamp_pre); * timestamped. * @ctlr: Pointer to the spi_controller structure of the driver * @xfer: Pointer to the transfer being timestamped - * @tx: Pointer to the current word within the xfer->tx_buf that the driver has - * just transmitted. + * @progress: How many words (not bytes) have been transferred so far * @irqs_off: If true, will re-enable IRQs and preemption for the local CPU. */ void spi_take_timestamp_post(struct spi_controller *ctlr, struct spi_transfer *xfer, - const void *tx, bool irqs_off) + size_t progress, bool irqs_off) { - u8 bytes_per_word = DIV_ROUND_UP(xfer->bits_per_word, 8); - if (!xfer->ptp_sts) return; if (xfer->timestamped_post) return; - if (tx < (xfer->tx_buf + xfer->ptp_sts_word_post * bytes_per_word)) + if (progress < xfer->ptp_sts_word_post) return; ptp_read_system_postts(xfer->ptp_sts); @@ -1573,7 +1567,7 @@ void spi_take_timestamp_post(struct spi_controller *ctlr, } /* Capture the resolution of the timestamp */ - xfer->ptp_sts_word_post = (tx - xfer->tx_buf) / bytes_per_word; + xfer->ptp_sts_word_post = progress; xfer->timestamped_post = true; } diff --git a/drivers/staging/comedi/drivers/adv_pci1710.c b/drivers/staging/comedi/drivers/adv_pci1710.c index dbff0f7e7cf5..ddc0dc93d08b 100644 --- a/drivers/staging/comedi/drivers/adv_pci1710.c +++ b/drivers/staging/comedi/drivers/adv_pci1710.c @@ -46,8 +46,8 @@ #define PCI171X_RANGE_UNI BIT(4) #define PCI171X_RANGE_GAIN(x) (((x) & 0x7) << 0) #define PCI171X_MUX_REG 0x04 /* W: A/D multiplexor control */ -#define PCI171X_MUX_CHANH(x) (((x) & 0xf) << 8) -#define PCI171X_MUX_CHANL(x) (((x) & 0xf) << 0) +#define PCI171X_MUX_CHANH(x) (((x) & 0xff) << 8) +#define PCI171X_MUX_CHANL(x) (((x) & 0xff) << 0) #define PCI171X_MUX_CHAN(x) (PCI171X_MUX_CHANH(x) | PCI171X_MUX_CHANL(x)) #define PCI171X_STATUS_REG 0x06 /* R: status register */ #define PCI171X_STATUS_IRQ BIT(11) /* 1=IRQ occurred */ diff --git a/drivers/staging/comedi/drivers/ni_routes.c b/drivers/staging/comedi/drivers/ni_routes.c index 673d732dcb8f..8f398b30f5bf 100644 --- a/drivers/staging/comedi/drivers/ni_routes.c +++ b/drivers/staging/comedi/drivers/ni_routes.c @@ -72,9 +72,6 @@ static int ni_find_device_routes(const char *device_family, } } - if (!rv) - return -ENODATA; - /* Second, find the set of routes valid for this device. */ for (i = 0; ni_device_routes_list[i]; ++i) { if (memcmp(ni_device_routes_list[i]->device, board_name, @@ -84,12 +81,12 @@ static int ni_find_device_routes(const char *device_family, } } - if (!dr) - return -ENODATA; - tables->route_values = rv; tables->valid_routes = dr; + if (!rv || !dr) + return -ENODATA; + return 0; } @@ -487,6 +484,9 @@ int ni_find_route_source(const u8 src_sel_reg_value, int dest, { int src; + if (!tables->route_values) + return -EINVAL; + dest = B(dest); /* subtract NI names offset */ /* ensure we are not going to under/over run the route value table */ if (dest < 0 || dest >= NI_NUM_NAMES) diff --git a/drivers/staging/media/ipu3/include/intel-ipu3.h b/drivers/staging/media/ipu3/include/intel-ipu3.h index 08eaa0bad0de..1c9c3ba4d518 100644 --- a/drivers/staging/media/ipu3/include/intel-ipu3.h +++ b/drivers/staging/media/ipu3/include/intel-ipu3.h @@ -449,7 +449,7 @@ struct ipu3_uapi_awb_fr_config_s { __u16 reserved1; __u32 bayer_sign; __u8 bayer_nf; - __u8 reserved2[3]; + __u8 reserved2[7]; } __attribute__((aligned(32))) __packed; /** diff --git a/drivers/staging/rtl8188eu/os_dep/usb_intf.c b/drivers/staging/rtl8188eu/os_dep/usb_intf.c index a7cac0719b8b..b5d42f411dd8 100644 --- a/drivers/staging/rtl8188eu/os_dep/usb_intf.c +++ b/drivers/staging/rtl8188eu/os_dep/usb_intf.c @@ -37,6 +37,7 @@ static const struct usb_device_id rtw_usb_id_tbl[] = { {USB_DEVICE(0x2001, 0x3311)}, /* DLink GO-USB-N150 REV B1 */ {USB_DEVICE(0x2001, 0x331B)}, /* D-Link DWA-121 rev B1 */ {USB_DEVICE(0x2357, 0x010c)}, /* TP-Link TL-WN722N v2 */ + {USB_DEVICE(0x2357, 0x0111)}, /* TP-Link TL-WN727N v5.21 */ {USB_DEVICE(0x0df6, 0x0076)}, /* Sitecom N150 v2 */ {USB_DEVICE(USB_VENDER_ID_REALTEK, 0xffef)}, /* Rosewill RNX-N150NUB */ {} /* Terminating entry */ diff --git a/drivers/staging/vt6656/baseband.c b/drivers/staging/vt6656/baseband.c index 8d19ae71e7cc..4e651b698617 100644 --- a/drivers/staging/vt6656/baseband.c +++ b/drivers/staging/vt6656/baseband.c @@ -449,8 +449,8 @@ int vnt_vt3184_init(struct vnt_private *priv) memcpy(array, addr, length); - ret = vnt_control_out(priv, MESSAGE_TYPE_WRITE, 0, - MESSAGE_REQUEST_BBREG, length, array); + ret = vnt_control_out_blocks(priv, VNT_REG_BLOCK_SIZE, + MESSAGE_REQUEST_BBREG, length, array); if (ret) goto end; diff --git a/drivers/staging/vt6656/card.c b/drivers/staging/vt6656/card.c index 56cd77fd9ea0..7958fc165462 100644 --- a/drivers/staging/vt6656/card.c +++ b/drivers/staging/vt6656/card.c @@ -719,7 +719,7 @@ end: */ int vnt_radio_power_on(struct vnt_private *priv) { - int ret = true; + int ret = 0; vnt_exit_deep_sleep(priv); diff --git a/drivers/staging/vt6656/device.h b/drivers/staging/vt6656/device.h index 6074ceda78bf..50e1c8918040 100644 --- a/drivers/staging/vt6656/device.h +++ b/drivers/staging/vt6656/device.h @@ -259,6 +259,7 @@ struct vnt_private { u8 mac_hw; /* netdev */ struct usb_device *usb; + struct usb_interface *intf; u64 tsf_time; u8 rx_rate; diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c index 4ac85ecb0921..9cb924c54571 100644 --- a/drivers/staging/vt6656/main_usb.c +++ b/drivers/staging/vt6656/main_usb.c @@ -949,7 +949,7 @@ static const struct ieee80211_ops vnt_mac_ops = { int vnt_init(struct vnt_private *priv) { - if (!(vnt_init_registers(priv))) + if (vnt_init_registers(priv)) return -EAGAIN; SET_IEEE80211_PERM_ADDR(priv->hw, priv->permanent_net_addr); @@ -992,6 +992,7 @@ vt6656_probe(struct usb_interface *intf, const struct usb_device_id *id) priv = hw->priv; priv->hw = hw; priv->usb = udev; + priv->intf = intf; vnt_set_options(priv); diff --git a/drivers/staging/vt6656/usbpipe.c b/drivers/staging/vt6656/usbpipe.c index d3304df6bd53..d977d4777e4f 100644 --- a/drivers/staging/vt6656/usbpipe.c +++ b/drivers/staging/vt6656/usbpipe.c @@ -59,7 +59,9 @@ int vnt_control_out(struct vnt_private *priv, u8 request, u16 value, kfree(usb_buffer); - if (ret >= 0 && ret < (int)length) + if (ret == (int)length) + ret = 0; + else ret = -EIO; end_unlock: @@ -74,6 +76,23 @@ int vnt_control_out_u8(struct vnt_private *priv, u8 reg, u8 reg_off, u8 data) reg_off, reg, sizeof(u8), &data); } +int vnt_control_out_blocks(struct vnt_private *priv, + u16 block, u8 reg, u16 length, u8 *data) +{ + int ret = 0, i; + + for (i = 0; i < length; i += block) { + u16 len = min_t(int, length - i, block); + + ret = vnt_control_out(priv, MESSAGE_TYPE_WRITE, + i, reg, len, data + i); + if (ret) + goto end; + } +end: + return ret; +} + int vnt_control_in(struct vnt_private *priv, u8 request, u16 value, u16 index, u16 length, u8 *buffer) { @@ -103,7 +122,9 @@ int vnt_control_in(struct vnt_private *priv, u8 request, u16 value, kfree(usb_buffer); - if (ret >= 0 && ret < (int)length) + if (ret == (int)length) + ret = 0; + else ret = -EIO; end_unlock: diff --git a/drivers/staging/vt6656/usbpipe.h b/drivers/staging/vt6656/usbpipe.h index 95147ec7b96a..b65d9c01a211 100644 --- a/drivers/staging/vt6656/usbpipe.h +++ b/drivers/staging/vt6656/usbpipe.h @@ -18,6 +18,8 @@ #include "device.h" +#define VNT_REG_BLOCK_SIZE 64 + int vnt_control_out(struct vnt_private *priv, u8 request, u16 value, u16 index, u16 length, u8 *buffer); int vnt_control_in(struct vnt_private *priv, u8 request, u16 value, @@ -26,6 +28,9 @@ int vnt_control_in(struct vnt_private *priv, u8 request, u16 value, int vnt_control_out_u8(struct vnt_private *priv, u8 reg, u8 ref_off, u8 data); int vnt_control_in_u8(struct vnt_private *priv, u8 reg, u8 reg_off, u8 *data); +int vnt_control_out_blocks(struct vnt_private *priv, + u16 block, u8 reg, u16 len, u8 *data); + int vnt_start_interrupt_urb(struct vnt_private *priv); int vnt_submit_rx_urb(struct vnt_private *priv, struct vnt_rcb *rcb); int vnt_tx_context(struct vnt_private *priv, diff --git a/drivers/staging/vt6656/wcmd.c b/drivers/staging/vt6656/wcmd.c index 3eb2f11a5de1..2c5250ca2801 100644 --- a/drivers/staging/vt6656/wcmd.c +++ b/drivers/staging/vt6656/wcmd.c @@ -99,6 +99,7 @@ void vnt_run_command(struct work_struct *work) if (vnt_init(priv)) { /* If fail all ends TODO retry */ dev_err(&priv->usb->dev, "failed to start\n"); + usb_set_intfdata(priv->intf, NULL); ieee80211_free_hw(priv->hw); return; } diff --git a/drivers/tee/optee/Kconfig b/drivers/tee/optee/Kconfig index d1ad512e1708..3ca71e3812ed 100644 --- a/drivers/tee/optee/Kconfig +++ b/drivers/tee/optee/Kconfig @@ -3,6 +3,7 @@ config OPTEE tristate "OP-TEE" depends on HAVE_ARM_SMCCC + depends on MMU help This implements the OP-TEE Trusted Execution Environment (TEE) driver. diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c index 0332a5301d61..d767eebf30bd 100644 --- a/drivers/tee/optee/shm_pool.c +++ b/drivers/tee/optee/shm_pool.c @@ -28,9 +28,22 @@ static int pool_op_alloc(struct tee_shm_pool_mgr *poolm, shm->size = PAGE_SIZE << order; if (shm->flags & TEE_SHM_DMA_BUF) { + unsigned int nr_pages = 1 << order, i; + struct page **pages; + + pages = kcalloc(nr_pages, sizeof(pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (i = 0; i < nr_pages; i++) { + pages[i] = page; + page++; + } + shm->flags |= TEE_SHM_REGISTER; - rc = optee_shm_register(shm->ctx, shm, &page, 1 << order, + rc = optee_shm_register(shm->ctx, shm, pages, nr_pages, (unsigned long)shm->kaddr); + kfree(pages); } return rc; diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 015e7d201598..0e7cf5236932 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -110,6 +110,9 @@ static int tsens_register(struct tsens_priv *priv) irq = platform_get_irq_byname(pdev, "uplow"); if (irq < 0) { ret = irq; + /* For old DTs with no IRQ defined */ + if (irq == -ENXIO) + ret = 0; goto err_put_device; } diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index 226adeec2aed..ce5309d00280 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -663,6 +663,12 @@ static acpi_status acpi_serdev_register_device(struct serdev_controller *ctrl, return AE_OK; } +static const struct acpi_device_id serdev_acpi_devices_blacklist[] = { + { "INT3511", 0 }, + { "INT3512", 0 }, + { }, +}; + static acpi_status acpi_serdev_add_device(acpi_handle handle, u32 level, void *data, void **return_value) { @@ -675,6 +681,10 @@ static acpi_status acpi_serdev_add_device(acpi_handle handle, u32 level, if (acpi_device_enumerated(adev)) return AE_OK; + /* Skip if black listed */ + if (!acpi_match_device_ids(adev, serdev_acpi_devices_blacklist)) + return AE_OK; + if (acpi_serdev_check_resources(ctrl, adev)) return AE_OK; diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 5023c85ebc6e..044c3cbdcfa4 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -89,8 +89,7 @@ void tty_port_link_device(struct tty_port *port, { if (WARN_ON(index >= driver->num)) return; - if (!driver->ports[index]) - driver->ports[index] = port; + driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); diff --git a/drivers/usb/cdns3/gadget.c b/drivers/usb/cdns3/gadget.c index 4c1e75509303..02f6ca2cb1ba 100644 --- a/drivers/usb/cdns3/gadget.c +++ b/drivers/usb/cdns3/gadget.c @@ -1375,13 +1375,10 @@ static void cdns3_check_usb_interrupt_proceed(struct cdns3_device *priv_dev, */ static irqreturn_t cdns3_device_irq_handler(int irq, void *data) { - struct cdns3_device *priv_dev; - struct cdns3 *cdns = data; + struct cdns3_device *priv_dev = data; irqreturn_t ret = IRQ_NONE; u32 reg; - priv_dev = cdns->gadget_dev; - /* check USB device interrupt */ reg = readl(&priv_dev->regs->usb_ists); if (reg) { @@ -1419,14 +1416,12 @@ static irqreturn_t cdns3_device_irq_handler(int irq, void *data) */ static irqreturn_t cdns3_device_thread_irq_handler(int irq, void *data) { - struct cdns3_device *priv_dev; - struct cdns3 *cdns = data; + struct cdns3_device *priv_dev = data; irqreturn_t ret = IRQ_NONE; unsigned long flags; int bit; u32 reg; - priv_dev = cdns->gadget_dev; spin_lock_irqsave(&priv_dev->lock, flags); reg = readl(&priv_dev->regs->usb_ists); @@ -2539,7 +2534,7 @@ void cdns3_gadget_exit(struct cdns3 *cdns) priv_dev = cdns->gadget_dev; - devm_free_irq(cdns->dev, cdns->dev_irq, cdns); + devm_free_irq(cdns->dev, cdns->dev_irq, priv_dev); pm_runtime_mark_last_busy(cdns->dev); pm_runtime_put_autosuspend(cdns->dev); @@ -2710,7 +2705,8 @@ static int __cdns3_gadget_init(struct cdns3 *cdns) ret = devm_request_threaded_irq(cdns->dev, cdns->dev_irq, cdns3_device_irq_handler, cdns3_device_thread_irq_handler, - IRQF_SHARED, dev_name(cdns->dev), cdns); + IRQF_SHARED, dev_name(cdns->dev), + cdns->gadget_dev); if (ret) goto err0; diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c index b45ceb91c735..48e4a5ca1835 100644 --- a/drivers/usb/chipidea/host.c +++ b/drivers/usb/chipidea/host.c @@ -26,6 +26,7 @@ static int (*orig_bus_suspend)(struct usb_hcd *hcd); struct ehci_ci_priv { struct regulator *reg_vbus; + bool enabled; }; static int ehci_ci_portpower(struct usb_hcd *hcd, int portnum, bool enable) @@ -37,7 +38,7 @@ static int ehci_ci_portpower(struct usb_hcd *hcd, int portnum, bool enable) int ret = 0; int port = HCS_N_PORTS(ehci->hcs_params); - if (priv->reg_vbus) { + if (priv->reg_vbus && enable != priv->enabled) { if (port > 1) { dev_warn(dev, "Not support multi-port regulator control\n"); @@ -53,6 +54,7 @@ static int ehci_ci_portpower(struct usb_hcd *hcd, int portnum, bool enable) enable ? "enable" : "disable", ret); return ret; } + priv->enabled = enable; } if (enable && (ci->platdata->phy_mode == USBPHY_INTERFACE_MODE_HSIC)) { diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 5f40117e68e7..26bc05e48d8a 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -203,9 +203,58 @@ static const unsigned short super_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_INT] = 1024, }; -static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, - int asnum, struct usb_host_interface *ifp, int num_ep, - unsigned char *buffer, int size) +static bool endpoint_is_duplicate(struct usb_endpoint_descriptor *e1, + struct usb_endpoint_descriptor *e2) +{ + if (e1->bEndpointAddress == e2->bEndpointAddress) + return true; + + if (usb_endpoint_xfer_control(e1) || usb_endpoint_xfer_control(e2)) { + if (usb_endpoint_num(e1) == usb_endpoint_num(e2)) + return true; + } + + return false; +} + +/* + * Check for duplicate endpoint addresses in other interfaces and in the + * altsetting currently being parsed. + */ +static bool config_endpoint_is_duplicate(struct usb_host_config *config, + int inum, int asnum, struct usb_endpoint_descriptor *d) +{ + struct usb_endpoint_descriptor *epd; + struct usb_interface_cache *intfc; + struct usb_host_interface *alt; + int i, j, k; + + for (i = 0; i < config->desc.bNumInterfaces; ++i) { + intfc = config->intf_cache[i]; + + for (j = 0; j < intfc->num_altsetting; ++j) { + alt = &intfc->altsetting[j]; + + if (alt->desc.bInterfaceNumber == inum && + alt->desc.bAlternateSetting != asnum) + continue; + + for (k = 0; k < alt->desc.bNumEndpoints; ++k) { + epd = &alt->endpoint[k].desc; + + if (endpoint_is_duplicate(epd, d)) + return true; + } + } + } + + return false; +} + +static int usb_parse_endpoint(struct device *ddev, int cfgno, + struct usb_host_config *config, int inum, int asnum, + struct usb_host_interface *ifp, int num_ep, + unsigned char *buffer, int size) { unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; @@ -242,13 +291,10 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, goto skip_to_next_endpoint_or_interface_descriptor; /* Check for duplicate endpoint addresses */ - for (i = 0; i < ifp->desc.bNumEndpoints; ++i) { - if (ifp->endpoint[i].desc.bEndpointAddress == - d->bEndpointAddress) { - dev_warn(ddev, "config %d interface %d altsetting %d has a duplicate endpoint with address 0x%X, skipping\n", - cfgno, inum, asnum, d->bEndpointAddress); - goto skip_to_next_endpoint_or_interface_descriptor; - } + if (config_endpoint_is_duplicate(config, inum, asnum, d)) { + dev_warn(ddev, "config %d interface %d altsetting %d has a duplicate endpoint with address 0x%X, skipping\n", + cfgno, inum, asnum, d->bEndpointAddress); + goto skip_to_next_endpoint_or_interface_descriptor; } endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; @@ -346,12 +392,16 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, endpoint->desc.wMaxPacketSize = cpu_to_le16(8); } - /* Validate the wMaxPacketSize field */ + /* + * Validate the wMaxPacketSize field. + * Some devices have isochronous endpoints in altsetting 0; + * the USB-2 spec requires such endpoints to have wMaxPacketSize = 0 + * (see the end of section 5.6.3), so don't warn about them. + */ maxp = usb_endpoint_maxp(&endpoint->desc); - if (maxp == 0) { - dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has wMaxPacketSize 0, skipping\n", + if (maxp == 0 && !(usb_endpoint_xfer_isoc(d) && asnum == 0)) { + dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid wMaxPacketSize 0\n", cfgno, inum, asnum, d->bEndpointAddress); - goto skip_to_next_endpoint_or_interface_descriptor; } /* Find the highest legal maxpacket size for this endpoint */ @@ -522,8 +572,8 @@ static int usb_parse_interface(struct device *ddev, int cfgno, if (((struct usb_descriptor_header *) buffer)->bDescriptorType == USB_DT_INTERFACE) break; - retval = usb_parse_endpoint(ddev, cfgno, inum, asnum, alt, - num_ep, buffer, size); + retval = usb_parse_endpoint(ddev, cfgno, config, inum, asnum, + alt, num_ep, buffer, size); if (retval < 0) return retval; ++n; diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index f229ad6952c0..3405b146edc9 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1192,6 +1192,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) * PORT_OVER_CURRENT is not. So check for any of them. */ if (udev || (portstatus & USB_PORT_STAT_CONNECTION) || + (portchange & USB_PORT_STAT_C_CONNECTION) || (portstatus & USB_PORT_STAT_OVERCURRENT) || (portchange & USB_PORT_STAT_C_OVERCURRENT)) set_bit(port1, hub->change_bits); @@ -2692,7 +2693,7 @@ static unsigned hub_is_wusb(struct usb_hub *hub) #define SET_ADDRESS_TRIES 2 #define GET_DESCRIPTOR_TRIES 2 #define SET_CONFIG_TRIES (2 * (use_both_schemes + 1)) -#define USE_NEW_SCHEME(i, scheme) ((i) / 2 == (int)scheme) +#define USE_NEW_SCHEME(i, scheme) ((i) / 2 == (int)(scheme)) #define HUB_ROOT_RESET_TIME 60 /* times are in msec */ #define HUB_SHORT_RESET_TIME 10 diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 0c960a97ea02..154f3f3e8cff 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2467,6 +2467,13 @@ static int dwc3_gadget_ep_reclaim_trb_linear(struct dwc3_ep *dep, static bool dwc3_gadget_ep_request_completed(struct dwc3_request *req) { + /* + * For OUT direction, host may send less than the setup + * length. Return true for all OUT requests. + */ + if (!req->direction) + return true; + return req->request.actual == req->request.length; } diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index ae70ce29d5e4..797d6ace8994 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -445,6 +445,7 @@ config USB_TEGRA_XUDC tristate "NVIDIA Tegra Superspeed USB 3.0 Device Controller" depends on ARCH_TEGRA || COMPILE_TEST depends on PHY_TEGRA_XUSB + select USB_ROLE_SWITCH help Enables NVIDIA Tegra USB 3.0 device mode controller driver. diff --git a/drivers/usb/host/ohci-da8xx.c b/drivers/usb/host/ohci-da8xx.c index 38183ac438c6..1371b0c249ec 100644 --- a/drivers/usb/host/ohci-da8xx.c +++ b/drivers/usb/host/ohci-da8xx.c @@ -415,13 +415,17 @@ static int ohci_da8xx_probe(struct platform_device *pdev) } da8xx_ohci->oc_gpio = devm_gpiod_get_optional(dev, "oc", GPIOD_IN); - if (IS_ERR(da8xx_ohci->oc_gpio)) + if (IS_ERR(da8xx_ohci->oc_gpio)) { + error = PTR_ERR(da8xx_ohci->oc_gpio); goto err; + } if (da8xx_ohci->oc_gpio) { oc_irq = gpiod_to_irq(da8xx_ohci->oc_gpio); - if (oc_irq < 0) + if (oc_irq < 0) { + error = oc_irq; goto err; + } error = devm_request_threaded_irq(dev, oc_irq, NULL, ohci_da8xx_oc_thread, IRQF_TRIGGER_RISING | diff --git a/drivers/usb/musb/jz4740.c b/drivers/usb/musb/jz4740.c index 5261f8dfedec..e3b8c84ccdb8 100644 --- a/drivers/usb/musb/jz4740.c +++ b/drivers/usb/musb/jz4740.c @@ -75,14 +75,17 @@ static struct musb_hdrc_platform_data jz4740_musb_platform_data = { static int jz4740_musb_init(struct musb *musb) { struct device *dev = musb->controller->parent; + int err; if (dev->of_node) musb->xceiv = devm_usb_get_phy_by_phandle(dev, "phys", 0); else musb->xceiv = devm_usb_get_phy(dev, USB_PHY_TYPE_USB2); if (IS_ERR(musb->xceiv)) { - dev_err(dev, "No transceiver configured\n"); - return PTR_ERR(musb->xceiv); + err = PTR_ERR(musb->xceiv); + if (err != -EPROBE_DEFER) + dev_err(dev, "No transceiver configured: %d", err); + return err; } /* Silicon does not implement ConfigData register. diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 15cca912c53e..5ebf30bd61bd 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -1840,6 +1840,9 @@ ATTRIBUTE_GROUPS(musb); #define MUSB_QUIRK_B_INVALID_VBUS_91 (MUSB_DEVCTL_BDEVICE | \ (2 << MUSB_DEVCTL_VBUS_SHIFT) | \ MUSB_DEVCTL_SESSION) +#define MUSB_QUIRK_B_DISCONNECT_99 (MUSB_DEVCTL_BDEVICE | \ + (3 << MUSB_DEVCTL_VBUS_SHIFT) | \ + MUSB_DEVCTL_SESSION) #define MUSB_QUIRK_A_DISCONNECT_19 ((3 << MUSB_DEVCTL_VBUS_SHIFT) | \ MUSB_DEVCTL_SESSION) @@ -1862,6 +1865,11 @@ static void musb_pm_runtime_check_session(struct musb *musb) s = MUSB_DEVCTL_FSDEV | MUSB_DEVCTL_LSDEV | MUSB_DEVCTL_HR; switch (devctl & ~s) { + case MUSB_QUIRK_B_DISCONNECT_99: + musb_dbg(musb, "Poll devctl in case of suspend after disconnect\n"); + schedule_delayed_work(&musb->irq_work, + msecs_to_jiffies(1000)); + break; case MUSB_QUIRK_B_INVALID_VBUS_91: if (musb->quirk_retries && !musb->flush_irq_work) { musb_dbg(musb, @@ -2310,6 +2318,9 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl) musb_disable_interrupts(musb); musb_writeb(musb->mregs, MUSB_DEVCTL, 0); + /* MUSB_POWER_SOFTCONN might be already set, JZ4740 does this. */ + musb_writeb(musb->mregs, MUSB_POWER, 0); + /* Init IRQ workqueue before request_irq */ INIT_DELAYED_WORK(&musb->irq_work, musb_irq_work); INIT_DELAYED_WORK(&musb->deassert_reset_work, musb_deassert_reset); diff --git a/drivers/usb/musb/musbhsdma.c b/drivers/usb/musb/musbhsdma.c index 5fc6825745f2..2d3751d885b4 100644 --- a/drivers/usb/musb/musbhsdma.c +++ b/drivers/usb/musb/musbhsdma.c @@ -425,7 +425,7 @@ struct dma_controller *musbhs_dma_controller_create(struct musb *musb, controller->controller.channel_abort = dma_channel_abort; if (request_irq(irq, dma_controller_irq, 0, - dev_name(musb->controller), &controller->controller)) { + dev_name(musb->controller), controller)) { dev_err(dev, "request_irq %d failed!\n", irq); musb_dma_controller_destroy(&controller->controller); diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index df582fe855f0..d3f420f3a083 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -642,9 +642,13 @@ static int ch341_tiocmget(struct tty_struct *tty) static int ch341_reset_resume(struct usb_serial *serial) { struct usb_serial_port *port = serial->port[0]; - struct ch341_private *priv = usb_get_serial_port_data(port); + struct ch341_private *priv; int ret; + priv = usb_get_serial_port_data(port); + if (!priv) + return 0; + /* reconfigure ch341 serial port after bus-reset */ ch341_configure(serial->dev, priv); diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index 9690a5f4b9d6..5737add6a2a4 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -716,7 +716,7 @@ static void edge_interrupt_callback(struct urb *urb) if (txCredits) { port = edge_serial->serial->port[portNumber]; edge_port = usb_get_serial_port_data(port); - if (edge_port->open) { + if (edge_port && edge_port->open) { spin_lock_irqsave(&edge_port->ep_lock, flags); edge_port->txCredits += txCredits; @@ -1725,7 +1725,8 @@ static void edge_break(struct tty_struct *tty, int break_state) static void process_rcvd_data(struct edgeport_serial *edge_serial, unsigned char *buffer, __u16 bufferLength) { - struct device *dev = &edge_serial->serial->dev->dev; + struct usb_serial *serial = edge_serial->serial; + struct device *dev = &serial->dev->dev; struct usb_serial_port *port; struct edgeport_port *edge_port; __u16 lastBufferLength; @@ -1821,11 +1822,10 @@ static void process_rcvd_data(struct edgeport_serial *edge_serial, /* spit this data back into the tty driver if this port is open */ - if (rxLen) { - port = edge_serial->serial->port[ - edge_serial->rxPort]; + if (rxLen && edge_serial->rxPort < serial->num_ports) { + port = serial->port[edge_serial->rxPort]; edge_port = usb_get_serial_port_data(port); - if (edge_port->open) { + if (edge_port && edge_port->open) { dev_dbg(dev, "%s - Sending %d bytes to TTY for port %d\n", __func__, rxLen, edge_serial->rxPort); @@ -1833,8 +1833,8 @@ static void process_rcvd_data(struct edgeport_serial *edge_serial, rxLen); edge_port->port->icount.rx += rxLen; } - buffer += rxLen; } + buffer += rxLen; break; case EXPECT_HDR3: /* Expect 3rd byte of status header */ @@ -1869,6 +1869,8 @@ static void process_rcvd_status(struct edgeport_serial *edge_serial, __u8 code = edge_serial->rxStatusCode; /* switch the port pointer to the one being currently talked about */ + if (edge_serial->rxPort >= edge_serial->serial->num_ports) + return; port = edge_serial->serial->port[edge_serial->rxPort]; edge_port = usb_get_serial_port_data(port); if (edge_port == NULL) { diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index e66a59ef43a1..aa3dbce22cfb 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -1058,6 +1058,8 @@ static void usa49_glocont_callback(struct urb *urb) for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; p_priv = usb_get_serial_port_data(port); + if (!p_priv) + continue; if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); @@ -1459,6 +1461,8 @@ static void usa67_glocont_callback(struct urb *urb) for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; p_priv = usb_get_serial_port_data(port); + if (!p_priv) + continue; if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c index cb7aac9cd9e7..ed2b4e6dca38 100644 --- a/drivers/usb/serial/opticon.c +++ b/drivers/usb/serial/opticon.c @@ -113,7 +113,7 @@ static int send_control_msg(struct usb_serial_port *port, u8 requesttype, retval = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), requesttype, USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, - 0, 0, buffer, 1, 0); + 0, 0, buffer, 1, USB_CTRL_SET_TIMEOUT); kfree(buffer); if (retval < 0) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e9491d400a24..084cc2fff3ae 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -248,6 +248,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_BG96 0x0296 #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 +#define QUECTEL_PRODUCT_RM500Q 0x0800 #define CMOTECH_VENDOR_ID 0x16d8 #define CMOTECH_PRODUCT_6001 0x6001 @@ -567,6 +568,9 @@ static void option_instat_callback(struct urb *urb); /* Interface must have two endpoints */ #define NUMEP2 BIT(16) +/* Device needs ZLP */ +#define ZLP BIT(17) + static const struct usb_device_id option_ids[] = { { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, @@ -1101,6 +1105,11 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), + .driver_info = ZLP }, + { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003), @@ -1172,6 +1181,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1102, 0xff), /* Telit ME910 (ECM) */ .driver_info = NCTRL(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x110a, 0xff), /* Telit ME910G1 */ + .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), @@ -1196,6 +1207,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1901, 0xff), /* Telit LN940 (MBIM) */ .driver_info = NCTRL(0) }, + { USB_DEVICE(TELIT_VENDOR_ID, 0x9010), /* Telit SBL FN980 flashing device */ + .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, @@ -2097,6 +2110,9 @@ static int option_attach(struct usb_serial *serial) if (!(device_flags & NCTRL(iface_desc->bInterfaceNumber))) data->use_send_setup = 1; + if (device_flags & ZLP) + data->use_zlp = 1; + spin_lock_init(&data->susp_lock); usb_set_serial_data(serial, data); diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c index a62981ca7a73..f93b81a297d6 100644 --- a/drivers/usb/serial/quatech2.c +++ b/drivers/usb/serial/quatech2.c @@ -841,7 +841,10 @@ static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch) u8 newMSR = (u8) *ch; unsigned long flags; + /* May be called from qt2_process_read_urb() for an unbound port. */ port_priv = usb_get_serial_port_data(port); + if (!port_priv) + return; spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowMSR = newMSR; @@ -869,7 +872,10 @@ static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch) unsigned long flags; u8 newLSR = (u8) *ch; + /* May be called from qt2_process_read_urb() for an unbound port. */ port_priv = usb_get_serial_port_data(port); + if (!port_priv) + return; if (newLSR & UART_LSR_BI) newLSR &= (u8) (UART_LSR_OE | UART_LSR_BI); diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index edbbb13d6de6..bd23a7cb1be2 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -86,6 +86,8 @@ DEVICE(moto_modem, MOTO_IDS); #define MOTOROLA_TETRA_IDS() \ { USB_DEVICE(0x0cad, 0x9011) }, /* Motorola Solutions TETRA PEI */ \ { USB_DEVICE(0x0cad, 0x9012) }, /* MTP6550 */ \ + { USB_DEVICE(0x0cad, 0x9013) }, /* MTP3xxx */ \ + { USB_DEVICE(0x0cad, 0x9015) }, /* MTP85xx */ \ { USB_DEVICE(0x0cad, 0x9016) } /* TPG2200 */ DEVICE(motorola_tetra, MOTOROLA_TETRA_IDS); diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 8f066bb55d7d..dc7a65b9ec98 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1317,6 +1317,9 @@ static int usb_serial_register(struct usb_serial_driver *driver) return -EINVAL; } + /* Prevent individual ports from being unbound. */ + driver->driver.suppress_bind_attrs = true; + usb_serial_operations_init(driver); /* Add this device to our list of devices */ diff --git a/drivers/usb/serial/usb-wwan.h b/drivers/usb/serial/usb-wwan.h index 1c120eaf4091..934e9361cf6b 100644 --- a/drivers/usb/serial/usb-wwan.h +++ b/drivers/usb/serial/usb-wwan.h @@ -38,6 +38,7 @@ struct usb_wwan_intf_private { spinlock_t susp_lock; unsigned int suspended:1; unsigned int use_send_setup:1; + unsigned int use_zlp:1; int in_flight; unsigned int open_ports; void *private; diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c index 7e855c87e4f7..13be21aad2f4 100644 --- a/drivers/usb/serial/usb_wwan.c +++ b/drivers/usb/serial/usb_wwan.c @@ -461,6 +461,7 @@ static struct urb *usb_wwan_setup_urb(struct usb_serial_port *port, void (*callback) (struct urb *)) { struct usb_serial *serial = port->serial; + struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; urb = usb_alloc_urb(0, GFP_KERNEL); /* No ISO */ @@ -471,6 +472,9 @@ static struct urb *usb_wwan_setup_urb(struct usb_serial_port *port, usb_sndbulkpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx); + if (intfdata->use_zlp && dir == USB_DIR_OUT) + urb->transfer_flags |= URB_ZERO_PACKET; + return urb; } diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c index c1f7073a56de..8b4ff9fff340 100644 --- a/drivers/usb/typec/tcpm/tcpci.c +++ b/drivers/usb/typec/tcpm/tcpci.c @@ -432,20 +432,30 @@ irqreturn_t tcpci_irq(struct tcpci *tcpci) if (status & TCPC_ALERT_RX_STATUS) { struct pd_message msg; - unsigned int cnt; + unsigned int cnt, payload_cnt; u16 header; regmap_read(tcpci->regmap, TCPC_RX_BYTE_CNT, &cnt); + /* + * 'cnt' corresponds to READABLE_BYTE_COUNT in section 4.4.14 + * of the TCPCI spec [Rev 2.0 Ver 1.0 October 2017] and is + * defined in table 4-36 as one greater than the number of + * bytes received. And that number includes the header. So: + */ + if (cnt > 3) + payload_cnt = cnt - (1 + sizeof(msg.header)); + else + payload_cnt = 0; tcpci_read16(tcpci, TCPC_RX_HDR, &header); msg.header = cpu_to_le16(header); - if (WARN_ON(cnt > sizeof(msg.payload))) - cnt = sizeof(msg.payload); + if (WARN_ON(payload_cnt > sizeof(msg.payload))) + payload_cnt = sizeof(msg.payload); - if (cnt > 0) + if (payload_cnt > 0) regmap_raw_read(tcpci->regmap, TCPC_RX_DATA, - &msg.payload, cnt); + &msg.payload, payload_cnt); /* Read complete, clear RX status alert bit */ tcpci_write16(tcpci, TCPC_ALERT, TCPC_ALERT_RX_STATUS); diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index 8569bbd3762f..831c9470bdc1 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -94,15 +94,15 @@ void ucsi_connector_change(struct ucsi *ucsi, u8 num); #define UCSI_ENABLE_NTFY_CMD_COMPLETE BIT(16) #define UCSI_ENABLE_NTFY_EXT_PWR_SRC_CHANGE BIT(17) #define UCSI_ENABLE_NTFY_PWR_OPMODE_CHANGE BIT(18) -#define UCSI_ENABLE_NTFY_CAP_CHANGE BIT(19) -#define UCSI_ENABLE_NTFY_PWR_LEVEL_CHANGE BIT(20) -#define UCSI_ENABLE_NTFY_PD_RESET_COMPLETE BIT(21) -#define UCSI_ENABLE_NTFY_CAM_CHANGE BIT(22) -#define UCSI_ENABLE_NTFY_BAT_STATUS_CHANGE BIT(23) -#define UCSI_ENABLE_NTFY_PARTNER_CHANGE BIT(24) -#define UCSI_ENABLE_NTFY_PWR_DIR_CHANGE BIT(25) -#define UCSI_ENABLE_NTFY_CONNECTOR_CHANGE BIT(26) -#define UCSI_ENABLE_NTFY_ERROR BIT(27) +#define UCSI_ENABLE_NTFY_CAP_CHANGE BIT(21) +#define UCSI_ENABLE_NTFY_PWR_LEVEL_CHANGE BIT(22) +#define UCSI_ENABLE_NTFY_PD_RESET_COMPLETE BIT(23) +#define UCSI_ENABLE_NTFY_CAM_CHANGE BIT(24) +#define UCSI_ENABLE_NTFY_BAT_STATUS_CHANGE BIT(25) +#define UCSI_ENABLE_NTFY_PARTNER_CHANGE BIT(27) +#define UCSI_ENABLE_NTFY_PWR_DIR_CHANGE BIT(28) +#define UCSI_ENABLE_NTFY_CONNECTOR_CHANGE BIT(30) +#define UCSI_ENABLE_NTFY_ERROR BIT(31) #define UCSI_ENABLE_NTFY_ALL 0xdbe70000 /* SET_UOR command bits */ diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 1679e0dc869b..cec868f8db3f 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -687,6 +687,7 @@ config MAX63XX_WATCHDOG config MAX77620_WATCHDOG tristate "Maxim Max77620 Watchdog Timer" depends on MFD_MAX77620 || COMPILE_TEST + select WATCHDOG_CORE help This is the driver for the Max77620 watchdog timer. Say 'Y' here to enable the watchdog timer support for @@ -1444,6 +1445,7 @@ config SMSC37B787_WDT config TQMX86_WDT tristate "TQ-Systems TQMX86 Watchdog Timer" depends on X86 + select WATCHDOG_CORE help This is the driver for the hardware watchdog timer in the TQMX86 IO controller found on some of their ComExpress Modules. diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c index 0a87c6f4bab2..11b9e7c6b7f5 100644 --- a/drivers/watchdog/imx7ulp_wdt.c +++ b/drivers/watchdog/imx7ulp_wdt.c @@ -112,7 +112,7 @@ static int imx7ulp_wdt_restart(struct watchdog_device *wdog, { struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog); - imx7ulp_wdt_enable(wdt->base, true); + imx7ulp_wdt_enable(wdog, true); imx7ulp_wdt_set_timeout(&wdt->wdd, 1); /* wait for wdog to fire */ diff --git a/drivers/watchdog/orion_wdt.c b/drivers/watchdog/orion_wdt.c index 1cccf8eb1c5d..8e6dfe76f9c9 100644 --- a/drivers/watchdog/orion_wdt.c +++ b/drivers/watchdog/orion_wdt.c @@ -602,7 +602,7 @@ static int orion_wdt_probe(struct platform_device *pdev) set_bit(WDOG_HW_RUNNING, &dev->wdt.status); /* Request the IRQ only after the watchdog is disabled */ - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq > 0) { /* * Not all supported platforms specify an interrupt for the @@ -617,7 +617,7 @@ static int orion_wdt_probe(struct platform_device *pdev) } /* Optional 2nd interrupt for pretimeout */ - irq = platform_get_irq(pdev, 1); + irq = platform_get_irq_optional(pdev, 1); if (irq > 0) { orion_wdt_info.options |= WDIOF_PRETIMEOUT; ret = devm_request_irq(&pdev->dev, irq, orion_wdt_pre_irq, diff --git a/drivers/watchdog/rn5t618_wdt.c b/drivers/watchdog/rn5t618_wdt.c index 234876047431..6e524c8e26a8 100644 --- a/drivers/watchdog/rn5t618_wdt.c +++ b/drivers/watchdog/rn5t618_wdt.c @@ -188,6 +188,7 @@ static struct platform_driver rn5t618_wdt_driver = { module_platform_driver(rn5t618_wdt_driver); +MODULE_ALIAS("platform:rn5t618-wdt"); MODULE_AUTHOR("Beniamino Galvani <b.galvani@gmail.com>"); MODULE_DESCRIPTION("RN5T618 watchdog driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c index fdf533fe0bb2..56a4a4030ca9 100644 --- a/drivers/watchdog/w83627hf_wdt.c +++ b/drivers/watchdog/w83627hf_wdt.c @@ -420,7 +420,7 @@ static int wdt_find(int addr) cr_wdt_csr = NCT6102D_WDT_CSR; break; case NCT6116_ID: - ret = nct6102; + ret = nct6116; cr_wdt_timeout = NCT6102D_WDT_TIMEOUT; cr_wdt_control = NCT6102D_WDT_CONTROL; cr_wdt_csr = NCT6102D_WDT_CSR; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 497f979018c2..5c794f4b051a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -908,6 +908,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid fid = {}; struct inode *inode; struct dentry *d; struct key *key; @@ -951,21 +952,18 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, afs_stat_v(dvnode, n_lookup); inode = afs_do_lookup(dir, dentry, key); key_put(key); - if (inode == ERR_PTR(-ENOENT)) { + if (inode == ERR_PTR(-ENOENT)) inode = afs_try_auto_mntpt(dentry, dir); - } else { - dentry->d_fsdata = - (void *)(unsigned long)dvnode->status.data_version; - } + + if (!IS_ERR_OR_NULL(inode)) + fid = AFS_FS_I(inode)->fid; + d = d_splice_alias(inode, dentry); if (!IS_ERR_OR_NULL(d)) { d->d_fsdata = dentry->d_fsdata; - trace_afs_lookup(dvnode, &d->d_name, - inode ? AFS_FS_I(inode) : NULL); + trace_afs_lookup(dvnode, &d->d_name, &fid); } else { - trace_afs_lookup(dvnode, &dentry->d_name, - IS_ERR_OR_NULL(inode) ? NULL - : AFS_FS_I(inode)); + trace_afs_lookup(dvnode, &dentry->d_name, &fid); } return d; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee834ef7beb4..43e1660f450f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); + kthread_associate_blkcg(blkcg_css); } refcount_set(&cb->pending_bios, 1); @@ -491,6 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -517,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f639dde2a679..ba4d8f375b3c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -500,11 +500,8 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - if (ret == -EINPROGRESS) { + if (ret == -EINPROGRESS) ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else if (ret != -ECANCELED) { - WARN_ON(ret); - } return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3c76645cad7..c70baafb2a39 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1479,10 +1479,10 @@ next_slot: disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* - * If extent we got ends before our range starts, skip - * to next extent + * If the extent we got ends before our current offset, + * skip to the next extent. */ - if (extent_end <= start) { + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } @@ -4238,18 +4238,30 @@ out: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, u64 objectid, - const char *name, int name_len) + struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; + const char *name = dentry->d_name.name; + int name_len = dentry->d_name.len; u64 index; int ret; + u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { + objectid = inode->root->root_key.objectid; + } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + objectid = inode->location.objectid; + } else { + WARN_ON(1); + return -EINVAL; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4271,13 +4283,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, - dir_ino, &index, name, name_len); - if (ret < 0) { - if (ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto out; - } + /* + * This is a placeholder inode for a subvolume we didn't have a + * reference to at the time of the snapshot creation. In the meantime + * we could have renamed the real subvol link into our snapshot, so + * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * Instead simply lookup the dir_index_item for this entry so we can + * remove it. Otherwise we know we have a ref to the root and we can + * call btrfs_del_root_ref, and it _shouldn't_ fail. + */ + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); if (IS_ERR_OR_NULL(di)) { @@ -4292,8 +4307,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; + btrfs_release_path(path); + } else { + ret = btrfs_del_root_ref(trans, objectid, + root->root_key.objectid, dir_ino, + &index, name, name_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } } - btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); if (ret) { @@ -4487,8 +4510,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid, - dentry->d_name.name, dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { err = ret; btrfs_abort_transaction(trans, ret); @@ -4583,10 +4605,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(trans); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, - BTRFS_I(inode)->location.objectid, - dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_subvol(trans, dir, dentry); goto out; } @@ -9536,7 +9555,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; - u64 root_objectid; int ret; bool root_log_pinned = false; bool dest_log_pinned = false; @@ -9642,10 +9660,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9661,10 +9676,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9862,7 +9874,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); u64 index = 0; - u64 root_objectid; int ret; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; @@ -9970,10 +9981,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9992,10 +10000,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - root_objectid = BTRFS_I(new_inode)->location.objectid; - ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 18e328ce4b54..12ae31e1813e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4252,7 +4252,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); - if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) + /* + * Copy scrub args to user space even if btrfs_scrub_dev() returned an + * error. This is important as it allows user space to know how much + * progress scrub has done. For example, if scrub is canceled we get + * -ECANCELED from btrfs_scrub_dev() and return that error back to user + * space. Later user space can inspect the progress from the structure + * btrfs_ioctl_scrub_args and resume scrub from where it left off + * previously (btrfs-progs does this). + * If we fail to copy the btrfs_ioctl_scrub_args structure to user space + * then return -EFAULT to signal the structure was not copied or it may + * be corrupt and unreliable due to a partial copy. + */ + if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4282e12f2a6..39fc8c3d3a75 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2423,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 nr_old_roots = 0; int ret = 0; + /* + * If quotas get disabled meanwhile, the resouces need to be freed and + * we can't just exit here. + */ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return 0; + goto out_free; if (new_roots) { if (!maybe_fs_roots(new_roots)) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c58245797f30..da5abd62db22 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -517,6 +517,34 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } +static bool reloc_root_is_dead(struct btrfs_root *root) +{ + /* + * Pair with set_bit/clear_bit in clean_dirty_subvols and + * btrfs_update_reloc_root. We need to see the updated bit before + * trying to access reloc_root + */ + smp_rmb(); + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return true; + return false; +} + +/* + * Check if this subvolume tree has valid reloc tree. + * + * Reloc tree after swap is considered dead, thus not considered as valid. + * This is enough for most callers, as they don't distinguish dead reloc root + * from no reloc root. But should_ignore_root() below is a special case. + */ +static bool have_reloc_root(struct btrfs_root *root) +{ + if (reloc_root_is_dead(root)) + return false; + if (!root->reloc_root) + return false; + return true; +} static int should_ignore_root(struct btrfs_root *root) { @@ -525,6 +553,10 @@ static int should_ignore_root(struct btrfs_root *root) if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; + /* This root has been merged with its reloc tree, we can ignore it */ + if (reloc_root_is_dead(root)) + return 1; + reloc_root = root->reloc_root; if (!reloc_root) return 0; @@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree */ - if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + if (reloc_root_is_dead(root)) return 0; if (root->reloc_root) { @@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *root_item; int ret; - if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || - !root->reloc_root) + if (!have_reloc_root(root)) goto out; reloc_root = root->reloc_root; @@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + /* + * Mark the tree as dead before we change reloc_root so + * have_reloc_root will not touch it from now on. + */ + smp_wmb(); __del_reloc_root(reloc_root); } @@ -2201,6 +2237,11 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + /* + * Need barrier to ensure clear_bit() only happens after + * root->reloc_root = NULL. Pairs with have_reloc_root. + */ + smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { @@ -4718,7 +4759,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, struct btrfs_root *root = pending->root; struct reloc_control *rc = root->fs_info->reloc_ctl; - if (!root->reloc_root || !rc) + if (!rc || !have_reloc_root(root)) return; if (!rc->merge_reloc_tree) @@ -4752,7 +4793,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; - if (!root->reloc_root || !rc) + if (!rc || !have_reloc_root(root)) return 0; rc = root->fs_info->reloc_ctl; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3b17b647d002..612411c74550 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -376,11 +376,13 @@ again: leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - - WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); - WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || + (btrfs_root_ref_name_len(leaf, ref) != name_len) || + memcmp_extent_buffer(leaf, name, ptr, name_len)) { + err = -ENOENT; + goto out; + } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 21de630b0730..fd266a2d15ec 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3577,17 +3577,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * This can easily boost the amount of SYSTEM chunks if cleaner * thread can't be triggered fast enough, and use up all space * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. */ - ret = btrfs_inc_block_group_ro(cache, false); - scrub_pause_off(fs_info); - + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); if (ret == 0) { ro_set = 1; - } else if (ret == -ENOSPC) { + } else if (ret == -ENOSPC && !sctx->is_dev_replace) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. - * It is not a problem for scrub/replace, because + * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. */ @@ -3596,9 +3606,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_warn(fs_info, "failed setting block group ro: %d", ret); btrfs_put_block_group(cache); + scrub_pause_off(fs_info); break; } + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, + cache->length); + } + + scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6d3f08bfff3..9b78e720c697 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3881,7 +3881,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = btrfs_num_devices(fs_info); + /* + * rw_devices will not change at the moment, device add/delete/replace + * are excluded by EXCL_OP + */ + num_devices = fs_info->fs_devices->rw_devices; /* * SINGLE profile on-disk has no profile bit, but in-memory we have a diff --git a/fs/buffer.c b/fs/buffer.c index d8c7242426bb..18a87ec8a465 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3031,11 +3031,9 @@ static void end_bio_bh_io_sync(struct bio *bio) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -void guard_bio_eod(int op, struct bio *bio) +void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = bio_last_bvec_all(bio); - unsigned truncated_bytes; struct hd_struct *part; rcu_read_lock(); @@ -3061,28 +3059,7 @@ void guard_bio_eod(int op, struct bio *bio) if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bio that straddles the device size! */ - truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); - - /* - * The bio contains more than one segment which spans EOD, just return - * and let IO layer turn it into an EIO - */ - if (truncated_bytes > bvec->bv_len) - return; - - /* Truncate the bio.. */ - bio->bi_iter.bi_size -= truncated_bytes; - bvec->bv_len -= truncated_bytes; - - /* ..and clear the end of the buffer for reads */ - if (op == REQ_OP_READ) { - struct bio_vec bv; - - mp_bvec_last_segment(bvec, &bv); - zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, - truncated_bytes); - } + bio_truncate(bio, maxsector << 9); } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, @@ -3118,15 +3095,15 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - /* Take care of bh's that straddle the end of the device */ - guard_bio_eod(op, bio); - if (buffer_meta(bh)) op_flags |= REQ_META; if (buffer_prio(bh)) op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + /* Take care of bh's that straddle the end of the device */ + guard_bio_eod(bio); + if (wbc) { wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 374db1bd57d1..145d46ba25ae 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -708,8 +708,10 @@ void ceph_mdsc_release_request(struct kref *kref) /* avoid calling iput_final() in mds dispatch threads */ ceph_async_iput(req->r_inode); } - if (req->r_parent) + if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ceph_async_iput(req->r_parent); + } ceph_async_iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -2676,8 +2678,10 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_parent) + if (req->r_parent) { ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ihold(req->r_parent); + } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); diff --git a/fs/char_dev.c b/fs/char_dev.c index 00dfe17871ac..c5e6eff5a381 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -352,7 +352,7 @@ static struct kobject *cdev_get(struct cdev *p) if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&p->kobj); + kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0ec4f270139f..00b4d15bb811 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,6 +39,8 @@ #include <linux/atomic.h> #include <linux/prefetch.h> +#include "internal.h" + /* * How many user pages to map in one call to get_user_pages(). This determines * the size of a structure in the slab cache diff --git a/fs/file.c b/fs/file.c index 2f4fcf985079..3da91a112bab 100644 --- a/fs/file.c +++ b/fs/file.c @@ -960,7 +960,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) return ksys_dup3(oldfd, newfd, 0); } -SYSCALL_DEFINE1(dup, unsigned int, fildes) +int ksys_dup(unsigned int fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -975,6 +975,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) return ret; } +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + return ksys_dup(fildes); +} + int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a63d779eac10..ce715380143c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -882,6 +882,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; + ssize_t res; int err; ap->args.out_pages = true; @@ -896,7 +897,8 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) if (!err) return; } else { - err = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fc, &ap->args); + err = res < 0 ? res : 0; } fuse_readpages_end(fc, &ap->args, err); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d5c2a3158610..a66e425884d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void) /* other hstates are optional */ i = 0; for_each_hstate(h) { - if (i == default_hstate_idx) + if (i == default_hstate_idx) { + i++; continue; + } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) diff --git a/fs/internal.h b/fs/internal.h index 4a7da1df573d..e3fa69544b66 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -38,7 +38,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) /* * buffer.c */ -extern void guard_bio_eod(int rw, struct bio *bio); +extern void guard_bio_eod(struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); diff --git a/fs/io-wq.c b/fs/io-wq.c index 541c8a3e0bbb..5147d2213b01 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -445,10 +445,14 @@ next: task_unlock(current); } if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm && mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; + wq->mm) { + if (mmget_not_zero(wq->mm)) { + use_mm(wq->mm); + set_fs(USER_DS); + worker->mm = wq->mm; + } else { + work->flags |= IO_WQ_WORK_CANCEL; + } } if (!worker->creds) worker->creds = override_creds(wq->creds); diff --git a/fs/io_uring.c b/fs/io_uring.c index 562e3a1a1bf9..5953d7f13690 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1786,6 +1786,9 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) + return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; @@ -1840,6 +1843,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1863,18 +1867,6 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, else ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); - /* - * In case of a short read, punt to async. This can happen - * if we have data partially cached. Alternatively we can - * return the short read, in which case the application will - * need to issue another SQE and wait for it. That SQE will - * need async punt anyway, so it's more efficient to do it - * here. - */ - if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && - (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < io_size) - ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); @@ -1939,6 +1931,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -2046,6 +2039,28 @@ static bool io_req_cancelled(struct io_kiocb *req) return false; } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; + + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} + +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); + *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } +} + static void io_fsync_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); @@ -2064,7 +2079,7 @@ static void io_fsync_finish(struct io_wq_work **workptr) io_cqring_add_event(req, ret); io_put_req_find_next(req, &nxt); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); } static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, @@ -2120,7 +2135,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) io_cqring_add_event(req, ret); io_put_req_find_next(req, &nxt); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); } static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, @@ -2386,7 +2401,7 @@ static void io_accept_finish(struct io_wq_work **workptr) return; __io_accept(req, &nxt, false); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); } #endif @@ -2617,7 +2632,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -3271,24 +3286,24 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; if (ctx->flags & IORING_SETUP_IOPOLL) { + const bool in_async = io_wq_current_is_worker(); + if (req->result == -EAGAIN) return -EAGAIN; + /* workqueue context doesn't hold uring_lock, grab it now */ + if (in_async) + mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); + + if (in_async) + mutex_unlock(&ctx->uring_lock); } return 0; } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; @@ -3325,17 +3340,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } /* if a dependent link is ready, pass it back */ - if (!ret && nxt) { - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } - } + if (!ret && nxt) + io_wq_assign_next(workptr, nxt); } static bool io_req_op_valid(int op) @@ -4457,13 +4463,15 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; + if (up.resv) + return -EINVAL; if (check_add_overflow(up.offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = (__s32 __user *) up.fds; + fds = u64_to_user_ptr(up.fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -5153,6 +5161,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; + if (current->mm != ctx->sqo_mm || + current_cred() != ctx->creds) { + ret = -EPERM; + goto out; + } + to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ diff --git a/fs/mpage.c b/fs/mpage.c index a63620cdb73a..ccba3c4c4479 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -62,7 +62,7 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) { bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, op, op_flags); - guard_bio_eod(op, bio); + guard_bio_eod(bio); submit_bio(bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index d6c91d1e88cb..d2720dc71d0e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1232,6 +1232,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); + flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } @@ -1649,17 +1650,15 @@ again: if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { - if (!(flags & LOOKUP_NO_REVAL)) { - int error = d_revalidate(dentry, flags); - if (unlikely(error <= 0)) { - if (!error) { - d_invalidate(dentry); - dput(dentry); - goto again; - } + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) { + d_invalidate(dentry); dput(dentry); - dentry = ERR_PTR(error); + goto again; } + dput(dentry); + dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); @@ -2618,72 +2617,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, EXPORT_SYMBOL(user_path_at_empty); /** - * mountpoint_last - look up last component for umount - * @nd: pathwalk nameidata - currently pointing at parent directory of "last" - * - * This is a special lookup_last function just for umount. In this case, we - * need to resolve the path without doing any revalidation. - * - * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since - * mountpoints are always pinned in the dcache, their ancestors are too. Thus, - * in almost all cases, this lookup will be served out of the dcache. The only - * cases where it won't are if nd->last refers to a symlink or the path is - * bogus and it doesn't exist. - * - * Returns: - * -error: if there was an error during lookup. This includes -ENOENT if the - * lookup found a negative dentry. - * - * 0: if we successfully resolved nd->last and found it to not to be a - * symlink that needs to be followed. - * - * 1: if we successfully resolved nd->last and found it to be a symlink - * that needs to be followed. - */ -static int -mountpoint_last(struct nameidata *nd) -{ - int error = 0; - struct dentry *dir = nd->path.dentry; - struct path path; - - /* If we're in rcuwalk, drop out of it to handle last component */ - if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd)) - return -ECHILD; - } - - nd->flags &= ~LOOKUP_PARENT; - - if (unlikely(nd->last_type != LAST_NORM)) { - error = handle_dots(nd, nd->last_type); - if (error) - return error; - path.dentry = dget(nd->path.dentry); - } else { - path.dentry = d_lookup(dir, &nd->last); - if (!path.dentry) { - /* - * No cached dentry. Mounted dentries are pinned in the - * cache, so that means that this dentry is probably - * a symlink or the path doesn't actually point - * to a mounted dentry. - */ - path.dentry = lookup_slow(&nd->last, dir, - nd->flags | LOOKUP_NO_REVAL); - if (IS_ERR(path.dentry)) - return PTR_ERR(path.dentry); - } - } - if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) { - dput(path.dentry); - return -ENOENT; - } - path.mnt = nd->path.mnt; - return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0); -} - -/** * path_mountpoint - look up a path to be umounted * @nd: lookup context * @flags: lookup flags @@ -2699,14 +2632,17 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) int err; while (!(err = link_path_walk(s, nd)) && - (err = mountpoint_last(nd)) > 0) { + (err = lookup_last(nd)) > 0) { s = trailing_symlink(nd); } + if (!err && (nd->flags & LOOKUP_RCU)) + err = unlazy_walk(nd); + if (!err) + err = handle_lookup_down(nd); if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; - follow_mount(path); } terminate_walk(nd); return err; diff --git a/fs/namespace.c b/fs/namespace.c index be601d3a8008..5e1bf611a9eb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index f64a33d2a1d1..2a82dcce5fc1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -206,7 +206,6 @@ TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT); TRACE_DEFINE_ENUM(LOOKUP_PARENT); TRACE_DEFINE_ENUM(LOOKUP_REVAL); TRACE_DEFINE_ENUM(LOOKUP_RCU); -TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL); TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); @@ -224,7 +223,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_PARENT, "PARENT" }, \ { LOOKUP_REVAL, "REVAL" }, \ { LOOKUP_RCU, "RCU" }, \ - { LOOKUP_NO_REVAL, "NO_REVAL" }, \ { LOOKUP_OPEN, "OPEN" }, \ { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ diff --git a/fs/nsfs.c b/fs/nsfs.c index a0431642c6b5..f75767bd623a 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -3,6 +3,7 @@ #include <linux/pseudo_fs.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> @@ -11,6 +12,8 @@ #include <linux/nsfs.h> #include <linux/uaccess.h> +#include "internal.h" + static struct vfsmount *nsfs_mnt; static long ns_ioctl(struct file *filp, unsigned int ioctl, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c4c51f3df60..cda1027d0819 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); + ocfs2_get_dlm_debug(dlm_debug); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1afe57f425a0..68ba354cf361 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + if (replayed) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84ad1c90d535..249672bf54fe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new - * file mode, set *acl to NULL to indicate that no ACL should be set. + * file mode, set *@acl to NULL to indicate that no ACL should be set. * - * As with chmod, clear the setgit bit if the caller is not in the owning group + * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * Called from set_acl inode operations. diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8caff834f002..013486b5125e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; + /* + * Since this is a new crash dump, we need to reset the buffer in + * case it still has an old dump present. Without this, the new dump + * will get appended, which would seriously confuse anything trying + * to check dump file contents. Specifically, ramoops_read_kmsg_hdr() + * expects to find a dump header in the beginning of buffer data, so + * we must to reset the buffer values, in order to ensure that the + * header will be written to the beginning of the buffer. + */ + persistent_ram_zap(prz); + /* Build header and append record contents. */ hlen = ramoops_write_kmsg_hdr(prz, record); if (!hlen) @@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name, prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, cxt->memtype, flags, label); + kfree(label); if (IS_ERR(prz_ar[i])) { err = PTR_ERR(prz_ar[i]); dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", @@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name, label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype, PRZ_FLAG_ZAP_OLD, label); + kfree(label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 8823f65888f0..1f4d8c06f9be 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, /* Initialize general buffer state. */ raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; - prz->label = label; + prz->label = kstrdup(label, GFP_KERNEL); ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) diff --git a/fs/readdir.c b/fs/readdir.c index d26d5ea4de7b..de2eceffdee8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,10 +102,14 @@ EXPORT_SYMBOL(iterate_dir); * filename length, and the above "soft error" worry means * that it's probably better left alone until we have that * issue clarified. + * + * Note the PATH_MAX check - it's arbitrary but the real + * kernel limit on a possible path component, not NAME_MAX, + * which is the technical standard limit. */ static int verify_dirent_name(const char *name, int len) { - if (!len) + if (len <= 0 || len >= PATH_MAX) return -EIO; if (memchr(name, '/', len)) return -EIO; @@ -206,7 +210,7 @@ struct linux_dirent { struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; - struct linux_dirent __user * previous; + int prev_reclen; int count; int error; }; @@ -214,12 +218,13 @@ struct getdents_callback { static int filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct linux_dirent __user * dirent; + struct linux_dirent __user *dirent, *prev; struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -232,28 +237,24 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = -EOVERFLOW; return -EOVERFLOW; } - dirent = buf->previous; - if (dirent && signal_pending(current)) + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) return -EINTR; - - /* - * Note! This range-checks 'previous' (which may be NULL). - * The real range was checked in getdents - */ - if (!user_access_begin(dirent, sizeof(*dirent))) - goto efault; - if (dirent) - unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; + prev = (void __user *) dirent - prev_reclen; + if (!user_access_begin(prev, reclen + prev_reclen)) + goto efault; + + /* This might be 'dirent->d_off', but if so it will get overwritten */ + unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(d_ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_access_end(); - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + buf->current_dir = (void __user *)dirent + reclen; + buf->prev_reclen = reclen; buf->count -= reclen; return 0; efault_end: @@ -267,7 +268,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { struct fd f; - struct linux_dirent __user * lastdirent; struct getdents_callback buf = { .ctx.actor = filldir, .count = count, @@ -285,8 +285,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct linux_dirent __user * lastdirent; + lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else @@ -299,7 +301,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; - struct linux_dirent64 __user * previous; + int prev_reclen; int count; int error; }; @@ -307,11 +309,12 @@ struct getdents_callback64 { static int filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct linux_dirent64 __user *dirent; + struct linux_dirent64 __user *dirent, *prev; struct getdents_callback64 *buf = container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + int prev_reclen; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) @@ -319,30 +322,27 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; - dirent = buf->previous; - if (dirent && signal_pending(current)) + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) return -EINTR; - - /* - * Note! This range-checks 'previous' (which may be NULL). - * The real range was checked in getdents - */ - if (!user_access_begin(dirent, sizeof(*dirent))) - goto efault; - if (dirent) - unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; + prev = (void __user *)dirent - prev_reclen; + if (!user_access_begin(prev, reclen + prev_reclen)) + goto efault; + + /* This might be 'dirent->d_off', but if so it will get overwritten */ + unsafe_put_user(offset, &prev->d_off, efault_end); unsafe_put_user(ino, &dirent->d_ino, efault_end); unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_access_end(); - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + buf->prev_reclen = reclen; + buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return 0; + efault_end: user_access_end(); efault: @@ -354,7 +354,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, unsigned int count) { struct fd f; - struct linux_dirent64 __user * lastdirent; struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -372,9 +371,11 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct linux_dirent64 __user * lastdirent; typeof(lastdirent->d_off) d_off = buf.ctx.pos; + + lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; if (__put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 62b40df36c98..28b241cd6987 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -319,8 +319,12 @@ static int reiserfs_for_each_xattr(struct inode *inode, out_dir: dput(dir); out: - /* -ENODATA isn't an error */ - if (err == -ENODATA) + /* + * -ENODATA: this object doesn't have any xattrs + * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk. + * Neither are errors + */ + if (err == -ENODATA || err == -EOPNOTSUPP) err = 0; return err; } diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index a950a22c4890..cac7404b2bdd 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -11,71 +11,102 @@ * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory */ +#ifndef flush_cache_all static inline void flush_cache_all(void) { } +#endif +#ifndef flush_cache_mm static inline void flush_cache_mm(struct mm_struct *mm) { } +#endif +#ifndef flush_cache_dup_mm static inline void flush_cache_dup_mm(struct mm_struct *mm) { } +#endif +#ifndef flush_cache_range static inline void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } +#endif +#ifndef flush_cache_page static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { } +#endif +#ifndef flush_dcache_page static inline void flush_dcache_page(struct page *page) { } +#endif +#ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } +#endif +#ifndef flush_dcache_mmap_unlock static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } +#endif +#ifndef flush_icache_range static inline void flush_icache_range(unsigned long start, unsigned long end) { } +#endif +#ifndef flush_icache_page static inline void flush_icache_page(struct vm_area_struct *vma, struct page *page) { } +#endif +#ifndef flush_icache_user_range static inline void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { } +#endif +#ifndef flush_cache_vmap static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } +#endif +#ifndef flush_cache_vunmap static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { } +#endif -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ +#ifndef copy_to_user_page +#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ flush_icache_user_range(vma, page, vaddr, len); \ } while (0) +#endif + +#ifndef copy_from_user_page #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) +#endif #endif /* __ASM_CACHEFLUSH_H */ diff --git a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h index d5fc90b30487..c1bda7030e2d 100644 --- a/include/drm/drm_dp_mst_helper.h +++ b/include/drm/drm_dp_mst_helper.h @@ -605,6 +605,12 @@ struct drm_dp_mst_topology_mgr { * &drm_dp_sideband_msg_tx.state once they are queued */ struct mutex qlock; + + /** + * @is_waiting_for_dwn_reply: indicate whether is waiting for down reply + */ + bool is_waiting_for_dwn_reply; + /** * @tx_msg_downq: List of pending down replies. */ diff --git a/include/dt-bindings/reset/amlogic,meson8b-reset.h b/include/dt-bindings/reset/amlogic,meson8b-reset.h index c614438bcbdb..fbc524a900da 100644 --- a/include/dt-bindings/reset/amlogic,meson8b-reset.h +++ b/include/dt-bindings/reset/amlogic,meson8b-reset.h @@ -46,9 +46,9 @@ #define RESET_VD_RMEM 64 #define RESET_AUDIN 65 #define RESET_DBLK 66 -#define RESET_PIC_DC 66 -#define RESET_PSC 66 -#define RESET_NAND 66 +#define RESET_PIC_DC 67 +#define RESET_PSC 68 +#define RESET_NAND 69 #define RESET_GE2D 70 #define RESET_PARSER_REG 71 #define RESET_PARSER_FETCH 72 diff --git a/include/linux/bio.h b/include/linux/bio.h index 3cdb84cdc488..853d92ceee64 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -470,6 +470,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); +void bio_truncate(struct bio *bio, unsigned new_size); static inline void zero_fill_bio(struct bio *bio) { diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ff335b22f23c..f0f3a9fffa6a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -53,6 +53,7 @@ * bitmap_find_next_zero_area_off(buf, len, pos, n, mask) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n + * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) @@ -133,6 +134,9 @@ extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); +extern void bitmap_cut(unsigned long *dst, const unsigned long *src, + unsigned int first, unsigned int cut, + unsigned int nbits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 47eb22a3b7f9..4c636c42ad68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -328,6 +328,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_segment_size; unsigned int physical_block_size; + unsigned int logical_block_size; unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; @@ -338,7 +339,6 @@ struct queue_limits { unsigned int discard_granularity; unsigned int discard_alignment; - unsigned short logical_block_size; unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; @@ -1077,7 +1077,7 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); @@ -1291,7 +1291,7 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned short queue_logical_block_size(const struct request_queue *q) +static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; @@ -1301,7 +1301,7 @@ static inline unsigned short queue_logical_block_size(const struct request_queue return retval; } -static inline unsigned short bdev_logical_block_size(struct block_device *bdev) +static inline unsigned int bdev_logical_block_size(struct block_device *bdev) { return queue_logical_block_size(bdev_get_queue(bdev)); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b14e51d56a82..a9687861fd7e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -17,6 +17,7 @@ #include <linux/u64_stats_sync.h> #include <linux/refcount.h> #include <linux/mutex.h> +#include <linux/module.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -43,6 +44,15 @@ struct bpf_map_ops { int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); + int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); + int (*map_lookup_and_delete_batch)(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); + int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); + int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); @@ -106,6 +116,7 @@ struct bpf_map { struct btf *btf; struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; + u32 btf_vmlinux_value_type_id; bool unpriv_array; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -183,7 +194,8 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->btf && map->ops->map_seq_show_elem; + return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && + map->ops->map_seq_show_elem; } int map_check_no_btf(const struct bpf_map *map, @@ -349,6 +361,10 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); + int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id); }; struct bpf_prog_offload_ops { @@ -437,7 +453,8 @@ struct btf_func_model { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +int arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call); @@ -448,7 +465,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, - BPF_TRAMP_MAX + BPF_TRAMP_MAX, + BPF_TRAMP_REPLACE, /* more than MAX */ }; struct bpf_trampoline { @@ -463,6 +481,11 @@ struct bpf_trampoline { void *addr; bool ftrace_managed; } func; + /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF + * program by replacing one of its functions. func.addr is the address + * of the function it replaced. + */ + struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ @@ -558,6 +581,7 @@ static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, #endif struct bpf_func_info_aux { + u16 linkage; bool unreliable; }; @@ -668,6 +692,73 @@ struct bpf_array_aux { struct work_struct work; }; +struct bpf_struct_ops_value; +struct btf_type; +struct btf_member; + +#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 +struct bpf_struct_ops { + const struct bpf_verifier_ops *verifier_ops; + int (*init)(struct btf *btf); + int (*check_member)(const struct btf_type *t, + const struct btf_member *member); + int (*init_member)(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata); + int (*reg)(void *kdata); + void (*unreg)(void *kdata); + const struct btf_type *type; + const struct btf_type *value_type; + const char *name; + struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; + u32 type_id; + u32 value_id; +}; + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); +void bpf_struct_ops_init(struct btf *btf); +bool bpf_struct_ops_get(const void *kdata); +void bpf_struct_ops_put(const void *kdata); +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value); +static inline bool bpf_try_module_get(const void *data, struct module *owner) +{ + if (owner == BPF_MODULE_OWNER) + return bpf_struct_ops_get(data); + else + return try_module_get(owner); +} +static inline void bpf_module_put(const void *data, struct module *owner) +{ + if (owner == BPF_MODULE_OWNER) + bpf_struct_ops_put(data); + else + module_put(owner); +} +#else +static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + return NULL; +} +static inline void bpf_struct_ops_init(struct btf *btf) { } +static inline bool bpf_try_module_get(const void *data, struct module *owner) +{ + return try_module_get(owner); +} +static inline void bpf_module_put(const void *data, struct module *owner) +{ + module_put(owner); +} +static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, + void *key, + void *value) +{ + return -EINVAL; +} +#endif + struct bpf_array { struct bpf_map map; u32 elem_size; @@ -906,6 +997,15 @@ void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr); extern int sysctl_unprivileged_bpf_disabled; @@ -962,7 +1062,9 @@ struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_flush(void); +void __dev_flush(void); +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -1006,7 +1108,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); +struct bpf_reg_state; +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg); +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); @@ -1071,7 +1179,7 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map return NULL; } -static inline void __dev_map_flush(void) +static inline void __dev_flush(void) { } @@ -1079,6 +1187,13 @@ struct xdp_buff; struct bpf_dtab_netdev; static inline +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} + +static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -1299,6 +1414,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; +extern const struct bpf_func_proto bpf_jiffies64_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 93740b3614d7..c81d4ece79a4 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -65,6 +65,12 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) #endif +#if defined(CONFIG_BPF_JIT) +BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, + void *, void *) +BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, + void *, void *) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) @@ -105,3 +111,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) +#if defined(CONFIG_BPF_JIT) +BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) +#endif diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 26e40de9ef55..5406e6e96585 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -304,11 +304,13 @@ struct bpf_insn_aux_data { u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ - bool seen; /* this insn was processed by the verifier */ + u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ - bool prune_point; + + /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ + bool prune_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -379,6 +381,7 @@ struct bpf_verifier_env { int *insn_stack; int cur_stack; } cfg; + u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 prev_insn_processed, insn_processed; @@ -428,4 +431,7 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 79d4abc2556a..5c1ea99b480f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,8 @@ #include <linux/types.h> #include <uapi/linux/btf.h> +#define BTF_TYPE_EMIT(type) ((void)(type *)0) + struct btf; struct btf_member; struct btf_type; @@ -53,6 +55,22 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id); +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id); +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id); +const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *total_nelems); + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) static inline bool btf_type_is_ptr(const struct btf_type *t) { @@ -84,6 +102,40 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline u16 btf_func_linkage(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + +static inline const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 679a42253170..a81c13ac1972 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -153,26 +153,4 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } -/* - * Get the last single-page segment from the multi-page bvec and store it - * in @seg - */ -static inline void mp_bvec_last_segment(const struct bio_vec *bvec, - struct bio_vec *seg) -{ - unsigned total = bvec->bv_offset + bvec->bv_len; - unsigned last_page = (total - 1) / PAGE_SIZE; - - seg->bv_page = bvec->bv_page + last_page; - - /* the whole segment is inside the last page */ - if (bvec->bv_offset >= last_page * PAGE_SIZE) { - seg->bv_offset = bvec->bv_offset % PAGE_SIZE; - seg->bv_len = bvec->bv_len; - } else { - seg->bv_offset = 0; - seg->bv_len = total - last_page * PAGE_SIZE; - } -} - #endif /* __LINUX_BVEC_ITER_H */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 9b3c720a31b1..5e3d45525bd3 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -18,6 +18,7 @@ #include <linux/can/error.h> #include <linux/can/led.h> #include <linux/can/netlink.h> +#include <linux/can/skb.h> #include <linux/netdevice.h> /* @@ -91,6 +92,36 @@ struct can_priv { #define get_can_dlc(i) (min_t(__u8, (i), CAN_MAX_DLC)) #define get_canfd_dlc(i) (min_t(__u8, (i), CANFD_MAX_DLC)) +/* Check for outgoing skbs that have not been created by the CAN subsystem */ +static inline bool can_skb_headroom_valid(struct net_device *dev, + struct sk_buff *skb) +{ + /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ + if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) + return false; + + /* af_packet does not apply CAN skb specific settings */ + if (skb->ip_summed == CHECKSUM_NONE) { + /* init headroom */ + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* preform proper loopback on capable devices */ + if (dev->flags & IFF_ECHO) + skb->pkt_type = PACKET_LOOPBACK; + else + skb->pkt_type = PACKET_HOST; + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + } + + return true; +} + /* Drop a given socketbuffer if it does not contain a valid CAN frame. */ static inline bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) @@ -108,6 +139,9 @@ static inline bool can_dropped_invalid_skb(struct net_device *dev, } else goto inval_skb; + if (!can_skb_headroom_valid(dev, skb)) + goto inval_skb; + return false; inval_skb: diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8fcdee1c0cf9..dad4a68fa009 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1364,8 +1364,11 @@ static inline int dma_get_slave_caps(struct dma_chan *chan, static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx) { struct dma_slave_caps caps; + int ret; - dma_get_slave_caps(tx->chan, &caps); + ret = dma_get_slave_caps(tx->chan, &caps); + if (ret) + return ret; if (caps.descriptor_reuse) { tx->flags |= DMA_CTRL_REUSE; diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index c0b6a603ea8c..fa5735c353cd 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -53,10 +53,12 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) struct sja1105_port { + struct kthread_worker *xmit_worker; + struct kthread_work xmit_work; + struct sk_buff_head xmit_queue; struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; - int mgmt_slot; }; #endif /* _NET_DSA_SJA1105_H */ diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index f6564b572d77..8801f1f986e5 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -43,7 +43,6 @@ __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); -int eth_change_mtu(struct net_device *dev, int new_mtu); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e6dd960bca..f349e2c0884c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -843,6 +843,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); @@ -916,7 +918,7 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, return 0; } -/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the +/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. @@ -927,7 +929,13 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); -void xdp_do_flush_map(void); +void xdp_do_flush(void); + +/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as + * it is no longer only flushing maps. Keep this define for compatibility + * until all drivers are updated - do not use xdp_do_flush_map() in new code! + */ +#define xdp_do_flush_map xdp_do_flush void bpf_warn_invalid_xdp_action(u32 act); diff --git a/include/linux/fsl/enetc_mdio.h b/include/linux/fsl/enetc_mdio.h new file mode 100644 index 000000000000..4875dd38af7e --- /dev/null +++ b/include/linux/fsl/enetc_mdio.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2019 NXP */ + +#ifndef _FSL_ENETC_MDIO_H_ +#define _FSL_ENETC_MDIO_H_ + +#include <linux/phy.h> + +/* PCS registers */ +#define ENETC_PCS_LINK_TIMER1 0x12 +#define ENETC_PCS_LINK_TIMER1_VAL 0x06a0 +#define ENETC_PCS_LINK_TIMER2 0x13 +#define ENETC_PCS_LINK_TIMER2_VAL 0x0003 +#define ENETC_PCS_IF_MODE 0x14 +#define ENETC_PCS_IF_MODE_SGMII_EN BIT(0) +#define ENETC_PCS_IF_MODE_USE_SGMII_AN BIT(1) +#define ENETC_PCS_IF_MODE_SGMII_SPEED(x) (((x) << 2) & GENMASK(3, 2)) + +/* Not a mistake, the SerDes PLL needs to be set at 3.125 GHz by Reset + * Configuration Word (RCW, outside Linux control) for 2.5G SGMII mode. The PCS + * still thinks it's at gigabit. + */ +enum enetc_pcs_speed { + ENETC_PCS_SPEED_10 = 0, + ENETC_PCS_SPEED_100 = 1, + ENETC_PCS_SPEED_1000 = 2, + ENETC_PCS_SPEED_2500 = 2, +}; + +struct enetc_hw; + +struct enetc_mdio_priv { + struct enetc_hw *hw; + int mdio_base; +}; + +#if IS_REACHABLE(CONFIG_FSL_ENETC_MDIO) + +int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum); +int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 value); +struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs); + +#else + +static inline int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum) +{ return -EINVAL; } +static inline int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum, + u16 value) +{ return -EINVAL; } +struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) +{ return ERR_PTR(-EINVAL); } + +#endif + +#endif diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 76cf11e905e1..8a9792a6427a 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -24,6 +24,14 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) return (struct ethhdr *)skb_mac_header(skb); } +/* Prefer this version in TX path, instead of + * skb_reset_mac_header() + eth_hdr() + */ +static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb->data; +} + static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3adcb39fa6f5..0d9db2a14f44 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -79,15 +79,6 @@ */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) -/** - * FIELD_SIZEOF - get the size of a struct's field - * @t: the target struct - * @f: the target struct's field - * Return: the size of @f in the struct definition without having a - * declared instance of @t. - */ -#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) - #define typeof_member(T, m) typeof(((T*)0)->m) #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3a08ecdfca11..ba0dca6aac6e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -122,8 +122,8 @@ static inline bool movable_node_is_enabled(void) extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); -extern void __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap); /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, @@ -342,6 +342,9 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); +extern void remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h index f84b9163c0ee..7dfb63b81373 100644 --- a/include/linux/mfd/mt6397/rtc.h +++ b/include/linux/mfd/mt6397/rtc.h @@ -46,6 +46,14 @@ #define RTC_AL_SEC 0x0018 +#define RTC_AL_SEC_MASK 0x003f +#define RTC_AL_MIN_MASK 0x003f +#define RTC_AL_HOU_MASK 0x001f +#define RTC_AL_DOM_MASK 0x001f +#define RTC_AL_DOW_MASK 0x0007 +#define RTC_AL_MTH_MASK 0x000f +#define RTC_AL_YEA_MASK 0x007f + #define RTC_PDN2 0x002e #define RTC_PDN2_PWRON_ALARM BIT(4) diff --git a/include/linux/mii.h b/include/linux/mii.h index 4ce8901a1af6..18c6208f56fc 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -373,6 +373,56 @@ static inline u32 mii_lpa_to_ethtool_lpa_x(u32 lpa) } /** + * mii_lpa_mod_linkmode_adv_sgmii + * @lp_advertising: pointer to destination link mode. + * @lpa: value of the MII_LPA register + * + * A small helper function that translates MII_LPA bits to + * linkmode advertisement settings for SGMII. + * Leaves other bits unchanged. + */ +static inline void +mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa) +{ + u32 speed_duplex = lpa & LPA_SGMII_DPX_SPD_MASK; + + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, lp_advertising, + speed_duplex == LPA_SGMII_1000HALF); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, lp_advertising, + speed_duplex == LPA_SGMII_1000FULL); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, lp_advertising, + speed_duplex == LPA_SGMII_100HALF); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, lp_advertising, + speed_duplex == LPA_SGMII_100FULL); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, lp_advertising, + speed_duplex == LPA_SGMII_10HALF); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, lp_advertising, + speed_duplex == LPA_SGMII_10FULL); +} + +/** + * mii_lpa_to_linkmode_adv_sgmii + * @advertising: pointer to destination link mode. + * @lpa: value of the MII_LPA register + * + * A small helper function that translates MII_ADVERTISE bits + * to linkmode advertisement settings when in SGMII mode. + * Clears the old value of advertising. + */ +static inline void mii_lpa_to_linkmode_lpa_sgmii(unsigned long *lp_advertising, + u32 lpa) +{ + linkmode_zero(lp_advertising); + + mii_lpa_mod_linkmode_lpa_sgmii(lp_advertising, lpa); +} + +/** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 36e412c3d657..20372de0b587 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -47,7 +47,7 @@ #define DEFAULT_UAR_PAGE_SHIFT 12 #define MAX_MSIX_P_PORT 17 -#define MAX_MSIX 64 +#define MAX_MSIX 128 #define MIN_MSIX_P_PORT 5 #define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \ (dev_cap).num_ports * MIN_MSIX_P_PORT) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index cc1c230f10ee..0e62c3db45e5 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1105,6 +1105,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_MEM, MLX5_CAP_RESERVED_16, MLX5_CAP_TLS, + MLX5_CAP_VDPA_EMULATION = 0x13, MLX5_CAP_DEV_EVENT = 0x14, /* NUM OF CAP Types */ MLX5_CAP_NUM @@ -1120,6 +1121,9 @@ enum mlx5_pcam_feature_groups { enum mlx5_mcam_reg_groups { MLX5_MCAM_REGS_FIRST_128 = 0x0, + MLX5_MCAM_REGS_0x9080_0x90FF = 0x1, + MLX5_MCAM_REGS_0x9100_0x917F = 0x2, + MLX5_MCAM_REGS_NUM = 0x3, }; enum mlx5_mcam_feature_groups { @@ -1268,7 +1272,16 @@ enum mlx5_qcam_feature_groups { MLX5_GET(pcam_reg, (mdev)->caps.pcam, port_access_reg_cap_mask.regs_5000_to_507f.reg) #define MLX5_CAP_MCAM_REG(mdev, reg) \ - MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_access_reg_cap_mask.access_regs.reg) + MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_FIRST_128], \ + mng_access_reg_cap_mask.access_regs.reg) + +#define MLX5_CAP_MCAM_REG1(mdev, reg) \ + MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9080_0x90FF], \ + mng_access_reg_cap_mask.access_regs1.reg) + +#define MLX5_CAP_MCAM_REG2(mdev, reg) \ + MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9100_0x917F], \ + mng_access_reg_cap_mask.access_regs2.reg) #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) @@ -1297,6 +1310,14 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_DEV_EVENT(mdev, cap)\ MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap) +#define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\ + MLX5_GET(device_virtio_emulation_cap, \ + (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) + +#define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\ + MLX5_GET64(device_virtio_emulation_cap, \ + (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 27200dea0297..22bd0d5024c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -145,6 +145,8 @@ enum { MLX5_REG_MCC = 0x9062, MLX5_REG_MCDA = 0x9063, MLX5_REG_MCAM = 0x907f, + MLX5_REG_MIRC = 0x9162, + MLX5_REG_RESOURCE_DUMP = 0xC000, }; enum mlx5_qpts_trust_state { @@ -684,7 +686,7 @@ struct mlx5_core_dev { u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; - u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; + u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; u8 embedded_cpu; @@ -928,8 +930,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); -int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, - struct mlx5_frag_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf); void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 4e5b84e66822..4cae16016b2b 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -48,6 +48,7 @@ enum { MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT = BIT(0), MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1), MLX5_FLOW_TABLE_TERMINATION = BIT(2), + MLX5_FLOW_TABLE_UNMANAGED = BIT(3), }; #define LEFTOVERS_RULE_NUM 2 @@ -145,19 +146,17 @@ mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type, int vport); -struct mlx5_flow_table * -mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - int max_num_groups, - u32 level, - u32 flags); - struct mlx5_flow_table_attr { int prio; int max_fte; u32 level; u32 flags; + struct mlx5_flow_table *next_ft; + + struct { + int max_num_groups; + int num_reserved_entries; + } autogroup; }; struct mlx5_flow_table * @@ -165,6 +164,10 @@ mlx5_create_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr *ft_attr); struct mlx5_flow_table * +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr); + +struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, int num_flow_table_entries, @@ -194,6 +197,7 @@ struct mlx5_fs_vlan { enum { FLOW_ACT_NO_APPEND = BIT(0), + FLOW_ACT_IGNORE_FLOW_LEVEL = BIT(1), }; struct mlx5_flow_act { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5d54fccf87fc..ee0a34d66c7c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -87,6 +87,7 @@ enum { enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), }; enum { @@ -374,8 +375,17 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_esp_spi[0x1]; u8 reserved_at_58[0x2]; u8 bth_dst_qp[0x1]; + u8 reserved_at_5b[0x5]; - u8 reserved_at_5b[0x25]; + u8 reserved_at_60[0x18]; + u8 metadata_reg_c_7[0x1]; + u8 metadata_reg_c_6[0x1]; + u8 metadata_reg_c_5[0x1]; + u8 metadata_reg_c_4[0x1]; + u8 metadata_reg_c_3[0x1]; + u8 metadata_reg_c_2[0x1]; + u8 metadata_reg_c_1[0x1]; + u8 metadata_reg_c_0[0x1]; }; struct mlx5_ifc_flow_table_prop_layout_bits { @@ -400,7 +410,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reformat_l3_tunnel_to_l2[0x1]; u8 reformat_l2_to_l3_tunnel[0x1]; u8 reformat_and_modify_action[0x1]; - u8 reserved_at_15[0x2]; + u8 ignore_flow_level[0x1]; + u8 reserved_at_16[0x1]; u8 table_miss_action_domain[0x1]; u8 termination_table[0x1]; u8 reserved_at_19[0x7]; @@ -721,7 +732,9 @@ enum { struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 fdb_to_vport_reg_c_id[0x8]; - u8 reserved_at_8[0xf]; + u8 reserved_at_8[0xd]; + u8 fdb_modify_header_fwd_to_table[0x1]; + u8 reserved_at_16[0x1]; u8 flow_source[0x1]; u8 reserved_at_18[0x2]; u8 multi_fdb_encap[0x1]; @@ -822,7 +835,9 @@ struct mlx5_ifc_qos_cap_bits { struct mlx5_ifc_debug_cap_bits { u8 core_dump_general[0x1]; u8 core_dump_qp[0x1]; - u8 reserved_at_2[0x1e]; + u8 reserved_at_2[0x7]; + u8 resource_dump[0x1]; + u8 reserved_at_a[0x16]; u8 reserved_at_20[0x2]; u8 stall_detect[0x1]; @@ -953,6 +968,19 @@ struct mlx5_ifc_device_event_cap_bits { u8 user_unaffiliated_events[4][0x40]; }; +struct mlx5_ifc_device_virtio_emulation_cap_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x13]; + u8 log_doorbell_stride[0x5]; + u8 reserved_at_38[0x3]; + u8 log_doorbell_bar_size[0x5]; + + u8 doorbell_bar_offset[0x40]; + + u8 reserved_at_80[0x780]; +}; + enum { MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, @@ -1753,6 +1781,132 @@ struct mlx5_ifc_resize_field_select_bits { u8 resize_field_select[0x20]; }; +struct mlx5_ifc_resource_dump_bits { + u8 more_dump[0x1]; + u8 inline_dump[0x1]; + u8 reserved_at_2[0xa]; + u8 seq_num[0x4]; + u8 segment_type[0x10]; + + u8 reserved_at_20[0x10]; + u8 vhca_id[0x10]; + + u8 index1[0x20]; + + u8 index2[0x20]; + + u8 num_of_obj1[0x10]; + u8 num_of_obj2[0x10]; + + u8 reserved_at_a0[0x20]; + + u8 device_opaque[0x40]; + + u8 mkey[0x20]; + + u8 size[0x20]; + + u8 address[0x40]; + + u8 inline_data[52][0x20]; +}; + +struct mlx5_ifc_resource_dump_menu_record_bits { + u8 reserved_at_0[0x4]; + u8 num_of_obj2_supports_active[0x1]; + u8 num_of_obj2_supports_all[0x1]; + u8 must_have_num_of_obj2[0x1]; + u8 support_num_of_obj2[0x1]; + u8 num_of_obj1_supports_active[0x1]; + u8 num_of_obj1_supports_all[0x1]; + u8 must_have_num_of_obj1[0x1]; + u8 support_num_of_obj1[0x1]; + u8 must_have_index2[0x1]; + u8 support_index2[0x1]; + u8 must_have_index1[0x1]; + u8 support_index1[0x1]; + u8 segment_type[0x10]; + + u8 segment_name[4][0x20]; + + u8 index1_name[4][0x20]; + + u8 index2_name[4][0x20]; +}; + +struct mlx5_ifc_resource_dump_segment_header_bits { + u8 length_dw[0x10]; + u8 segment_type[0x10]; +}; + +struct mlx5_ifc_resource_dump_command_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; + + u8 segment_called[0x10]; + u8 vhca_id[0x10]; + + u8 index1[0x20]; + + u8 index2[0x20]; + + u8 num_of_obj1[0x10]; + u8 num_of_obj2[0x10]; +}; + +struct mlx5_ifc_resource_dump_error_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; + + u8 reserved_at_20[0x10]; + u8 syndrome_id[0x10]; + + u8 reserved_at_40[0x40]; + + u8 error[8][0x20]; +}; + +struct mlx5_ifc_resource_dump_info_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; + + u8 reserved_at_20[0x18]; + u8 dump_version[0x8]; + + u8 hw_version[0x20]; + + u8 fw_version[0x20]; +}; + +struct mlx5_ifc_resource_dump_menu_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; + + u8 reserved_at_20[0x10]; + u8 num_of_records[0x10]; + + struct mlx5_ifc_resource_dump_menu_record_bits record[0]; +}; + +struct mlx5_ifc_resource_dump_resource_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; + + u8 reserved_at_20[0x20]; + + u8 index1[0x20]; + + u8 index2[0x20]; + + u8 payload[0][0x20]; +}; + +struct mlx5_ifc_resource_dump_terminate_segment_bits { + struct mlx5_ifc_resource_dump_segment_header_bits segment_header; +}; + +struct mlx5_ifc_menu_resource_dump_response_bits { + struct mlx5_ifc_resource_dump_info_segment_bits info; + struct mlx5_ifc_resource_dump_command_segment_bits cmd; + struct mlx5_ifc_resource_dump_menu_segment_bits menu; + struct mlx5_ifc_resource_dump_terminate_segment_bits terminate; +}; + enum { MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_PERIOD = 0x1, MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_MAX_COUNT = 0x2, @@ -2026,7 +2180,9 @@ struct mlx5_ifc_eth_per_prio_grp_data_layout_bits { u8 rx_pause_transition_low[0x20]; - u8 reserved_at_3c0[0x40]; + u8 rx_discards_high[0x20]; + + u8 rx_discards_low[0x20]; u8 device_stall_minor_watermark_cnt_high[0x20]; @@ -2751,6 +2907,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_fpga_cap_bits fpga_cap; struct mlx5_ifc_tls_cap_bits tls_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; + struct mlx5_ifc_device_virtio_emulation_cap_bits virtio_emulation_cap; u8 reserved_at_0[0x8000]; }; @@ -3998,7 +4155,8 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_at_a0[0x8]; u8 table_id[0x18]; - u8 reserved_at_c0[0x18]; + u8 ignore_flow_level[0x1]; + u8 reserved_at_c1[0x17]; u8 modify_enable_mask[0x8]; u8 reserved_at_e0[0x20]; @@ -5466,15 +5624,32 @@ struct mlx5_ifc_add_action_in_bits { u8 data[0x20]; }; +struct mlx5_ifc_copy_action_in_bits { + u8 action_type[0x4]; + u8 src_field[0xc]; + u8 reserved_at_10[0x3]; + u8 src_offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 reserved_at_20[0x4]; + u8 dst_field[0xc]; + u8 reserved_at_30[0x3]; + u8 dst_offset[0x5]; + u8 reserved_at_38[0x8]; +}; + union mlx5_ifc_set_action_in_add_action_in_auto_bits { struct mlx5_ifc_set_action_in_bits set_action_in; struct mlx5_ifc_add_action_in_bits add_action_in; + struct mlx5_ifc_copy_action_in_bits copy_action_in; u8 reserved_at_0[0x40]; }; enum { MLX5_ACTION_TYPE_SET = 0x1, MLX5_ACTION_TYPE_ADD = 0x2, + MLX5_ACTION_TYPE_COPY = 0x3, }; enum { @@ -5510,6 +5685,8 @@ enum { MLX5_ACTION_IN_FIELD_METADATA_REG_C_3 = 0x54, MLX5_ACTION_IN_FIELD_METADATA_REG_C_4 = 0x55, MLX5_ACTION_IN_FIELD_METADATA_REG_C_5 = 0x56, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_6 = 0x57, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_7 = 0x58, MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM = 0x59, MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM = 0x5B, }; @@ -8406,6 +8583,18 @@ struct mlx5_ifc_pplm_reg_bits { u8 fec_override_admin_50g[0x4]; u8 fec_override_admin_25g[0x4]; u8 fec_override_admin_10g_40g[0x4]; + + u8 fec_override_cap_400g_8x[0x10]; + u8 fec_override_cap_200g_4x[0x10]; + + u8 fec_override_cap_100g_2x[0x10]; + u8 fec_override_cap_50g_1x[0x10]; + + u8 fec_override_admin_400g_8x[0x10]; + u8 fec_override_admin_200g_4x[0x10]; + + u8 fec_override_admin_100g_2x[0x10]; + u8 fec_override_admin_50g_1x[0x10]; }; struct mlx5_ifc_ppcnt_reg_bits { @@ -8732,7 +8921,9 @@ struct mlx5_ifc_mpegc_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x6d]; + u8 reserved_at_0[0x68]; + u8 fec_50G_per_lane_in_pplm[0x1]; + u8 reserved_at_69[0x4]; u8 rx_icrc_encapsulated_counter[0x1]; u8 reserved_at_6e[0x4]; u8 ptys_extended_ethernet[0x1]; @@ -8817,6 +9008,28 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_31_to_0[0x20]; }; +struct mlx5_ifc_mcam_access_reg_bits1 { + u8 regs_127_to_96[0x20]; + + u8 regs_95_to_64[0x20]; + + u8 regs_63_to_32[0x20]; + + u8 regs_31_to_0[0x20]; +}; + +struct mlx5_ifc_mcam_access_reg_bits2 { + u8 regs_127_to_99[0x1d]; + u8 mirc[0x1]; + u8 regs_97_to_96[0x2]; + + u8 regs_95_to_64[0x20]; + + u8 regs_63_to_32[0x20]; + + u8 regs_31_to_0[0x20]; +}; + struct mlx5_ifc_mcam_reg_bits { u8 reserved_at_0[0x8]; u8 feature_group[0x8]; @@ -8827,6 +9040,8 @@ struct mlx5_ifc_mcam_reg_bits { union { struct mlx5_ifc_mcam_access_reg_bits access_regs; + struct mlx5_ifc_mcam_access_reg_bits1 access_regs1; + struct mlx5_ifc_mcam_access_reg_bits2 access_regs2; u8 reserved_at_0[0x80]; } mng_access_reg_cap_mask; @@ -9432,6 +9647,13 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[0][0x20]; }; +struct mlx5_ifc_mirc_reg_bits { + u8 reserved_at_0[0x18]; + u8 status_code[0x8]; + + u8 reserved_at_20[0x20]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -9487,6 +9709,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mcqi_reg_bits mcqi_reg; struct mlx5_ifc_mcc_reg_bits mcc_reg; struct mlx5_ifc_mcda_reg_bits mcda_reg; + struct mlx5_ifc_mirc_reg_bits mirc_reg; u8 reserved_at_0[0x60e0]; }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 80a9162b406c..cfaa8feecfe8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,14 +2658,26 @@ static inline bool want_init_on_free(void) !page_poisoning_enabled(); } -#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT -DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +#ifdef CONFIG_DEBUG_PAGEALLOC +extern void init_debug_pagealloc(void); #else -DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +static inline void init_debug_pagealloc(void) {} #endif +extern bool _debug_pagealloc_enabled_early; +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { + return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + _debug_pagealloc_enabled_early; +} + +/* + * For use in fast paths after init_debug_pagealloc() has run, or when a + * false negative result is not harmful when called too early. + */ +static inline bool debug_pagealloc_enabled_static(void) +{ if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 89d8ff06c9ce..5334ad8fc7bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -215,9 +215,8 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ - NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */ - NR_SLAB_UNRECLAIMABLE, /* and this one without looking at - * memcg_flush_percpu_vmstats() first. */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h index ecc88a41792a..c04f690871ca 100644 --- a/include/linux/mtd/flashchip.h +++ b/include/linux/mtd/flashchip.h @@ -40,7 +40,7 @@ typedef enum { FL_READING, FL_CACHEDPRG, /* These 4 come from onenand_state_t, which has been unified here */ - FL_RESETING, + FL_RESETTING, FL_OTPING, FL_PREPARING_ERASE, FL_VERIFYING_ERASE, diff --git a/include/linux/namei.h b/include/linux/namei.h index 7fe7b87a3ded..07bfb0874033 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -34,7 +34,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; /* internal use only */ #define LOOKUP_PARENT 0x0010 -#define LOOKUP_NO_REVAL 0x0080 #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_ROOT_GRABBED 0x0008 diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 4b19c544c59a..34d050bb1ae6 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -53,8 +53,9 @@ enum { NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */ + NETIF_F_GSO_FRAGLIST_BIT, /* ... Fraglist GSO */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_UDP_L4_BIT, + NETIF_F_GSO_FRAGLIST_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -80,6 +81,7 @@ enum { NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ + NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */ /* * Add your fresh new feature above and remember to update @@ -150,6 +152,8 @@ enum { #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) #define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) +#define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST) +#define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) /* Finds the next feature with the highest number of the range of start till 0. */ @@ -226,6 +230,9 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* changeable features with no special hardware requirements */ #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) +/* Changeable features with no special hardware requirements that defaults to off. */ +#define NETIF_F_SOFT_FEATURES_OFF NETIF_F_GRO_FRAGLIST + #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2741aa35bec6..4626188a754b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -850,6 +850,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, + TC_SETUP_QDISC_TBF, }; /* These structures hold the attributes of bpf state that are being passed @@ -876,6 +877,7 @@ enum bpf_netdev_command { struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; +struct xdp_dev_bulk_queue; struct netdev_bpf { enum bpf_netdev_command command; @@ -937,6 +939,11 @@ struct netdev_name_node { int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); +struct netdev_net_notifier { + struct list_head list; + struct notifier_block *nb; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1791,6 +1798,10 @@ enum netdev_priv_flags { * * @wol_enabled: Wake-on-LAN is enabled * + * @net_notifier_list: List of per-net netdev notifier block + * that follow this device when it is moved + * to another network namespace. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -1986,12 +1997,10 @@ struct net_device { unsigned int num_tx_queues; unsigned int real_num_tx_queues; struct Qdisc *qdisc; -#ifdef CONFIG_NET_SCHED - DECLARE_HASHTABLE (qdisc_hash, 4); -#endif unsigned int tx_queue_len; spinlock_t tx_global_lock; - int watchdog_timeo; + + struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_cpus_map; @@ -2001,11 +2010,15 @@ struct net_device { struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NET_SCHED + DECLARE_HASHTABLE (qdisc_hash, 4); +#endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; + int watchdog_timeo; - int __percpu *pcpu_refcnt; struct list_head todo_list; + int __percpu *pcpu_refcnt; struct list_head link_watch_list; @@ -2081,6 +2094,8 @@ struct net_device { struct lock_class_key addr_list_lock_key; bool proto_down; unsigned wol_enabled:1; + + struct list_head net_notifier_list; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2322,7 +2337,8 @@ struct napi_gro_cb { /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; - /* 1 bit hole */ + /* GRO is done by frag_list pointer chaining. */ + u8 is_flist:1; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2524,6 +2540,12 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn); +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn); struct netdev_notifier_info { struct net_device *dev; @@ -2690,6 +2712,7 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -3701,6 +3724,8 @@ int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); int dev_set_mtu_ext(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); @@ -4564,6 +4589,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 4d8b1eaf7708..908d38dbcb91 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -426,13 +426,6 @@ ip6addrptr(const struct sk_buff *skb, bool src, struct in6_addr *addr) sizeof(*addr)); } -/* Calculate the bytes required to store the inclusive range of a-b */ -static inline int -bitmap_bytes(u32 a, u32 b) -{ - return 4 * ((((b - a + 8) / 8) + 3) / 4); -} - /* How often should the gc be run by default */ #define IPSET_GC_TIME (3 * 60) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index cf09ab37b45b..851425c3178f 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -31,7 +31,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); - int (*abort)(struct net *net, struct sk_buff *skb); + int (*abort)(struct net *net, struct sk_buff *skb, bool autoload); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/include/linux/phy.h b/include/linux/phy.h index 30e599c454db..c570e162e05e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -22,6 +22,7 @@ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> +#include <linux/u64_stats_sync.h> #include <linux/atomic.h> @@ -100,9 +101,11 @@ typedef enum { PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, - /* 10GBASE-KR, XFI, SFI - single lane 10G Serdes */ - PHY_INTERFACE_MODE_10GKR, + /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ + PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_USXGMII, + /* 10GBASE-KR - with Clause 73 AN */ + PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -176,10 +179,12 @@ static inline const char *phy_modes(phy_interface_t interface) return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; - case PHY_INTERFACE_MODE_10GKR: - return "10gbase-kr"; + case PHY_INTERFACE_MODE_10GBASER: + return "10gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; + case PHY_INTERFACE_MODE_10GKR: + return "10gbase-kr"; default: return "unknown"; } @@ -208,6 +213,15 @@ struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; +struct mdio_bus_stats { + u64_stats_t transfers; + u64_stats_t errors; + u64_stats_t writes; + u64_stats_t reads; + /* Must be last, add new statistics above */ + struct u64_stats_sync syncp; +}; + /* * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure @@ -220,6 +234,7 @@ struct mii_bus { int (*read)(struct mii_bus *bus, int addr, int regnum); int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); int (*reset)(struct mii_bus *bus); + struct mdio_bus_stats stats[PHY_MAX_ADDR]; /* * A lock to ensure that only one thing can read/write @@ -328,6 +343,9 @@ struct phy_c45_device_ids { u32 device_ids[8]; }; +struct macsec_context; +struct macsec_ops; + /* phy_device: An instance of a PHY * * drv: Pointer to the driver for this PHY instance @@ -350,6 +368,7 @@ struct phy_c45_device_ids { * attached_dev: The attached enet driver's device instance ptr * adjust_link: Callback for the enet controller to respond to * changes in the link state. + * macsec_ops: MACsec offloading ops. * * speed, duplex, pause, supported, advertising, lp_advertising, * and autoneg are used like in mii_if_info @@ -449,6 +468,11 @@ struct phy_device { void (*phy_link_change)(struct phy_device *, bool up, bool do_carrier); void (*adjust_link)(struct net_device *dev); + +#if IS_ENABLED(CONFIG_MACSEC) + /* MACsec management functions */ + const struct macsec_ops *macsec_ops; +#endif }; #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) @@ -1127,6 +1151,8 @@ static inline void phy_unlock_mdio_bus(struct phy_device *phydev) void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); +char *phy_attached_info_irq(struct phy_device *phydev) + __malloc; void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ @@ -1216,6 +1242,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, int phy_ethtool_ksettings_set(struct phy_device *phydev, const struct ethtool_link_ksettings *cmd); int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); +int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); +int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index fed5488e3c75..523209e70947 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -63,10 +63,12 @@ enum phylink_op_type { * struct phylink_config - PHYLINK configuration structure * @dev: a pointer to a struct device associated with the MAC * @type: operation type of PHYLINK instance + * @pcs_poll: MAC PCS cannot provide link change interrupt */ struct phylink_config { struct device *dev; enum phylink_op_type type; + bool pcs_poll; }; /** diff --git a/include/linux/platform_data/eth_ixp4xx.h b/include/linux/platform_data/eth_ixp4xx.h new file mode 100644 index 000000000000..6f652ea0c6ae --- /dev/null +++ b/include/linux/platform_data/eth_ixp4xx.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_DATA_ETH_IXP4XX +#define __PLATFORM_DATA_ETH_IXP4XX + +#include <linux/types.h> + +#define IXP4XX_ETH_NPEA 0x00 +#define IXP4XX_ETH_NPEB 0x10 +#define IXP4XX_ETH_NPEC 0x20 + +/* Information about built-in Ethernet MAC interfaces */ +struct eth_plat_info { + u8 phy; /* MII PHY ID, 0 - 31 */ + u8 rxq; /* configurable, currently 0 - 31 only */ + u8 txreadyq; + u8 hwaddr[6]; +}; + +#endif diff --git a/include/linux/platform_data/wan_ixp4xx_hss.h b/include/linux/platform_data/wan_ixp4xx_hss.h new file mode 100644 index 000000000000..d525a0feb9e1 --- /dev/null +++ b/include/linux/platform_data/wan_ixp4xx_hss.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_DATA_WAN_IXP4XX_HSS_H +#define __PLATFORM_DATA_WAN_IXP4XX_HSS_H + +#include <linux/types.h> + +/* Information about built-in HSS (synchronous serial) interfaces */ +struct hss_plat_info { + int (*set_clock)(int port, unsigned int clock_type); + int (*open)(int port, void *pdev, + void (*set_carrier_cb)(void *pdev, int carrier)); + void (*close)(int port, void *pdev); + u8 txreadyq; + u32 timer_freq; +}; + +#endif diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 0abe9a4fc842..417db0a79a62 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -23,6 +23,7 @@ #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> +#include <linux/mm.h> #include <asm/errno.h> #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 467d26046416..716ad1d8d95e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1929,11 +1929,11 @@ static inline void rseq_migrate(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the - * child inherits. Only applies when forking a process, not a thread. + * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { - if (clone_flags & CLONE_THREAD) { + if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_sig = 0; t->rseq_event_mask = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e9133bcf0544..3d13a4b717e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -592,6 +592,8 @@ enum { SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, + + SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 @@ -1478,6 +1480,11 @@ static inline void skb_mark_not_on_list(struct sk_buff *skb) skb->next = NULL; } +/* Iterate through singly-linked GSO fragments of an skb. */ +#define skb_list_walk_safe(first, skb, next_skb) \ + for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ + (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) + static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); @@ -3459,7 +3466,8 @@ static inline void skb_frag_list_init(struct sk_buff *skb) for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) -int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, @@ -3468,12 +3476,16 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last); -struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last); -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err); @@ -3523,6 +3535,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); +struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, + unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); @@ -4092,6 +4106,9 @@ enum skb_ext_id { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif +#if IS_ENABLED(CONFIG_MPTCP) + SKB_EXT_MPTCP, +#endif SKB_EXT_NUM, /* must be last */ }; @@ -4112,6 +4129,9 @@ struct skb_ext { char data[0] __aligned(8); }; +struct skb_ext *__skb_ext_alloc(void); +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ef7031f8a304..14d61bba0b79 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -358,17 +358,22 @@ static inline void sk_psock_update_proto(struct sock *sk, static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { - sk->sk_write_space = psock->saved_write_space; + sk->sk_prot->unhash = psock->saved_unhash; if (psock->sk_proto) { struct inet_connection_sock *icsk = inet_csk(sk); bool has_ulp = !!icsk->icsk_ulp_data; - if (has_ulp) - tcp_update_ulp(sk, psock->sk_proto); - else + if (has_ulp) { + tcp_update_ulp(sk, psock->sk_proto, + psock->saved_write_space); + } else { sk->sk_prot = psock->sk_proto; + sk->sk_write_space = psock->saved_write_space; + } psock->sk_proto = NULL; + } else { + sk->sk_write_space = psock->saved_write_space; } } diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 98fe8663033a..3a67a7e45633 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -689,10 +689,10 @@ extern void spi_finalize_current_transfer(struct spi_controller *ctlr); /* Helper calls for driver to timestamp transfer */ void spi_take_timestamp_pre(struct spi_controller *ctlr, struct spi_transfer *xfer, - const void *tx, bool irqs_off); + size_t progress, bool irqs_off); void spi_take_timestamp_post(struct spi_controller *ctlr, struct spi_transfer *xfer, - const void *tx, bool irqs_off); + size_t progress, bool irqs_off); /* the spi driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 0531afa9b21e..19190c609282 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -139,6 +139,7 @@ struct stmmac_txq_cfg { u32 low_credit; bool use_prio; u32 prio; + int tbs_en; }; struct plat_stmmacenet_data { diff --git a/include/linux/sxgbe_platform.h b/include/linux/sxgbe_platform.h index 85ec745767bd..966146f7267a 100644 --- a/include/linux/sxgbe_platform.h +++ b/include/linux/sxgbe_platform.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * 10G controller driver for Samsung EXYNOS SoCs + * 10G controller driver for Samsung Exynos SoCs * * Copyright (C) 2013 Samsung Electronics Co., Ltd. * http://www.samsung.com diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2960dedcfde8..5262b7a76d39 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1232,6 +1232,7 @@ asmlinkage long sys_ni_syscall(void); */ int ksys_umount(char __user *name, int flags); +int ksys_dup(unsigned int fildes); int ksys_chroot(const char __user *filename); ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count); int ksys_chdir(const char __user *filename); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca6f01531e64..1cf73e6f85ca 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -78,6 +78,27 @@ struct tcp_sack_block { #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ +#if IS_ENABLED(CONFIG_MPTCP) +struct mptcp_options_received { + u64 sndr_key; + u64 rcvr_key; + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u8 mp_capable : 1, + mp_join : 1, + dss : 1; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + __unused:2; +}; +#endif + struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -95,6 +116,9 @@ struct tcp_options_received { u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ +#if IS_ENABLED(CONFIG_MPTCP) + struct mptcp_options_received mptcp; +#endif }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) @@ -104,6 +128,11 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif +#if IS_ENABLED(CONFIG_MPTCP) + rx_opt->mptcp.mp_capable = 0; + rx_opt->mptcp.mp_join = 0; + rx_opt->mptcp.dss = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -119,6 +148,9 @@ struct tcp_request_sock { const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; @@ -354,6 +386,8 @@ struct tcp_sock { #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif + u16 timeout_rehash; /* Timeout-triggered rehash attempts */ + u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ @@ -379,6 +413,9 @@ struct tcp_sock { u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c17af77f3fae..ea627d1ab7e3 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -30,7 +30,7 @@ struct tnum tnum_lshift(struct tnum a, u8 shift); /* Shift (rsh) a tnum right (by a fixed shift) */ struct tnum tnum_rshift(struct tnum a, u8 shift); /* Shift (arsh) a tnum right (by a fixed min_shift) */ -struct tnum tnum_arshift(struct tnum a, u8 min_shift); +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness); /* Add two tnums, return @a + @b */ struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 86eecbd98e84..f73e1775ded0 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -417,6 +417,36 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) } /** + * xa_for_each_range() - Iterate over a portion of an XArray. + * @xa: XArray. + * @index: Index of @entry. + * @entry: Entry retrieved from array. + * @start: First index to retrieve from array. + * @last: Last index to retrieve from array. + * + * During the iteration, @entry will have the value of the entry stored + * in @xa at @index. You may modify @index during the iteration if you + * want to skip or reprocess indices. It is safe to modify the array + * during the iteration. At the end of the iteration, @entry will be set + * to NULL and @index will have a value less than or equal to max. + * + * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). + * xa_for_each_range() will spin if it hits a retry entry; if you intend to + * see retry entries, you should use the xas_for_each() iterator instead. + * The xas_for_each() iterator will expand into more inline code than + * xa_for_each_range(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each_range(xa, index, entry, start, last) \ + for (index = start, \ + entry = xa_find(xa, &index, last, XA_PRESENT); \ + entry; \ + entry = xa_find_after(xa, &index, last, XA_PRESENT)) + +/** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. @@ -439,11 +469,8 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) * * Context: Any context. Takes and releases the RCU lock. */ -#define xa_for_each_start(xa, index, entry, start) \ - for (index = start, \ - entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT); \ - entry; \ - entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT)) +#define xa_for_each_start(xa, index, entry, start) \ + xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. @@ -508,6 +535,14 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) +#define xa_lock_nested(xa, subclass) \ + spin_lock_nested(&(xa)->xa_lock, subclass) +#define xa_lock_bh_nested(xa, subclass) \ + spin_lock_bh_nested(&(xa)->xa_lock, subclass) +#define xa_lock_irq_nested(xa, subclass) \ + spin_lock_irq_nested(&(xa)->xa_lock, subclass) +#define xa_lock_irqsave_nested(xa, flags, subclass) \ + spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 059524b87c4c..f22bd6c838a3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3548,6 +3548,9 @@ struct cfg80211_update_owe_info { * * @start_radar_detection: Start radar detection in the driver. * + * @end_cac: End running CAC, probably because a related CAC + * was finished on another phy. + * * @update_ft_ies: Provide updated Fast BSS Transition information to the * driver. If the SME is in the driver/firmware, this information can be * used in building Authentication and Reassociation Request frames. @@ -3874,6 +3877,8 @@ struct cfg80211_ops { struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms); + void (*end_cac)(struct wiphy *wiphy, + struct net_device *dev); int (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie); int (*crit_proto_start)(struct wiphy *wiphy, diff --git a/include/net/devlink.h b/include/net/devlink.h index 47f87b2fcf63..5e46c24bb6e6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -485,6 +485,8 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_UNDI "fw.undi" /* NCSI support/handler version */ #define DEVLINK_INFO_VERSION_GENERIC_FW_NCSI "fw.ncsi" +/* FW parameter set id */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_PSID "fw.psid" struct devlink_region; struct devlink_info_req; @@ -562,7 +564,7 @@ struct devlink_trap { }; /* All traps must be documented in - * Documentation/networking/devlink-trap.rst + * Documentation/networking/devlink/devlink-trap.rst */ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_SMAC_MC, @@ -589,6 +591,9 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -596,12 +601,13 @@ enum devlink_trap_generic_id { }; /* All trap groups must be documented in - * Documentation/networking/devlink-trap.rst + * Documentation/networking/devlink/devlink-trap.rst */ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -657,6 +663,12 @@ enum devlink_trap_group_generic_id { "ipv4_lpm_miss" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ "ipv6_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_NON_ROUTABLE \ + "non_routable_packet" +#define DEVLINK_TRAP_GENERIC_NAME_DECAP_ERROR \ + "decap_error" +#define DEVLINK_TRAP_GENERIC_NAME_OVERLAY_SMAC_MC \ + "overlay_smac_is_mc" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -664,6 +676,8 @@ enum devlink_trap_group_generic_id { "l3_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ + "tunnel_drops" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group, _metadata_cap) \ { \ @@ -938,7 +952,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, u32 region_max_snapshots, u64 region_size); void devlink_region_destroy(struct devlink_region *region); -u32 devlink_region_shapshot_id_get(struct devlink *devlink); +u32 devlink_region_snapshot_id_get(struct devlink *devlink); int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor); @@ -1000,6 +1014,8 @@ int devlink_health_report(struct devlink_health_reporter *reporter, void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); diff --git a/include/net/dsa.h b/include/net/dsa.h index da5578db228e..63495e3443ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -90,7 +90,6 @@ struct dsa_device_ops { struct dsa_skb_cb { struct sk_buff *clone; - bool deferred_xmit; }; struct __dsa_skb_cb { @@ -192,9 +191,6 @@ struct dsa_port { struct phylink *pl; struct phylink_config pl_config; - struct work_struct xmit_work; - struct sk_buff_head xmit_queue; - struct list_head list; /* @@ -283,6 +279,11 @@ struct dsa_switch { */ bool vlan_filtering; + /* MAC PCS does not provide link state change interrupt, and requires + * polling. Flag passed on to PHYLINK. + */ + bool pcs_poll; + size_t num_ports; }; @@ -379,7 +380,8 @@ typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, - int port); + int port, + enum dsa_tag_protocol mprot); int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); @@ -564,11 +566,6 @@ struct dsa_switch_ops { bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); - /* - * Deferred frame Tx - */ - netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, - struct sk_buff *skb); /* Devlink parameters */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); diff --git a/include/net/espintcp.h b/include/net/espintcp.h new file mode 100644 index 000000000000..dd7026a00066 --- /dev/null +++ b/include/net/espintcp.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_ESPINTCP_H +#define _NET_ESPINTCP_H + +#include <net/strparser.h> +#include <linux/skmsg.h> + +void __init espintcp_init(void); + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb); +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb); +bool tcp_is_ulp_esp(struct sock *sk); + +struct espintcp_msg { + struct sk_buff *skb; + struct sk_msg skmsg; + int offset; + int len; +}; + +struct espintcp_ctx { + struct strparser strp; + struct sk_buff_head ike_queue; + struct sk_buff_head out_queue; + struct espintcp_msg partial; + void (*saved_data_ready)(struct sock *sk); + void (*saved_write_space)(struct sock *sk); + struct work_struct work; + bool tx_running; +}; + +static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* RCU is only needed for diag */ + return (__force void *)icsk->icsk_ulp_data; +} +#endif diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index b579faea41e9..fd60a8ac02ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -192,7 +192,9 @@ struct fib6_info { dst_nopolicy:1, dst_host:1, fib6_destroying:1, - unused:3; + offload:1, + trap:1, + unused:1; struct rcu_head rcu; struct nexthop *nh; @@ -329,6 +331,13 @@ static inline void fib6_info_release(struct fib6_info *f6i) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } +static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, + bool trap) +{ + f6i->offload = offload; + f6i->trap = trap; +} + enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b9cba41c6d4f..6a1ae49809de 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -204,6 +204,18 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) +struct fib_rt_info { + struct fib_info *fi; + u32 tb_id; + __be32 dst; + int dst_len; + u8 tos; + u8 type; + u8 offload:1, + trap:1, + unused:6; +}; + struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; @@ -464,6 +476,7 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap, void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4e95f6df508c..cec1a54401f2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1113,6 +1113,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags); /* * reassembly.c diff --git a/include/net/macsec.h b/include/net/macsec.h new file mode 100644 index 000000000000..92e43db8b566 --- /dev/null +++ b/include/net/macsec.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * MACsec netdev header, used for h/w accelerated implementations. + * + * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> + */ +#ifndef _NET_MACSEC_H_ +#define _NET_MACSEC_H_ + +#include <linux/u64_stats_sync.h> +#include <uapi/linux/if_link.h> +#include <uapi/linux/if_macsec.h> + +typedef u64 __bitwise sci_t; + +#define MACSEC_NUM_AN 4 /* 2 bits for the association number */ + +/** + * struct macsec_key - SA key + * @id: user-provided key identifier + * @tfm: crypto struct, key storage + */ +struct macsec_key { + u8 id[MACSEC_KEYID_LEN]; + struct crypto_aead *tfm; +}; + +struct macsec_rx_sc_stats { + __u64 InOctetsValidated; + __u64 InOctetsDecrypted; + __u64 InPktsUnchecked; + __u64 InPktsDelayed; + __u64 InPktsOK; + __u64 InPktsInvalid; + __u64 InPktsLate; + __u64 InPktsNotValid; + __u64 InPktsNotUsingSA; + __u64 InPktsUnusedSA; +}; + +struct macsec_rx_sa_stats { + __u32 InPktsOK; + __u32 InPktsInvalid; + __u32 InPktsNotValid; + __u32 InPktsNotUsingSA; + __u32 InPktsUnusedSA; +}; + +struct macsec_tx_sa_stats { + __u32 OutPktsProtected; + __u32 OutPktsEncrypted; +}; + +struct macsec_tx_sc_stats { + __u64 OutPktsProtected; + __u64 OutPktsEncrypted; + __u64 OutOctetsProtected; + __u64 OutOctetsEncrypted; +}; + +/** + * struct macsec_rx_sa - receive secure association + * @active: + * @next_pn: packet number expected for the next packet + * @lock: protects next_pn manipulations + * @key: key structure + * @stats: per-SA stats + */ +struct macsec_rx_sa { + struct macsec_key key; + spinlock_t lock; + u32 next_pn; + refcount_t refcnt; + bool active; + struct macsec_rx_sa_stats __percpu *stats; + struct macsec_rx_sc *sc; + struct rcu_head rcu; +}; + +struct pcpu_rx_sc_stats { + struct macsec_rx_sc_stats stats; + struct u64_stats_sync syncp; +}; + +struct pcpu_tx_sc_stats { + struct macsec_tx_sc_stats stats; + struct u64_stats_sync syncp; +}; + +/** + * struct macsec_rx_sc - receive secure channel + * @sci: secure channel identifier for this SC + * @active: channel is active + * @sa: array of secure associations + * @stats: per-SC stats + */ +struct macsec_rx_sc { + struct macsec_rx_sc __rcu *next; + sci_t sci; + bool active; + struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; + struct pcpu_rx_sc_stats __percpu *stats; + refcount_t refcnt; + struct rcu_head rcu_head; +}; + +/** + * struct macsec_tx_sa - transmit secure association + * @active: + * @next_pn: packet number to use for the next packet + * @lock: protects next_pn manipulations + * @key: key structure + * @stats: per-SA stats + */ +struct macsec_tx_sa { + struct macsec_key key; + spinlock_t lock; + u32 next_pn; + refcount_t refcnt; + bool active; + struct macsec_tx_sa_stats __percpu *stats; + struct rcu_head rcu; +}; + +/** + * struct macsec_tx_sc - transmit secure channel + * @active: + * @encoding_sa: association number of the SA currently in use + * @encrypt: encrypt packets on transmit, or authenticate only + * @send_sci: always include the SCI in the SecTAG + * @end_station: + * @scb: single copy broadcast flag + * @sa: array of secure associations + * @stats: stats for this TXSC + */ +struct macsec_tx_sc { + bool active; + u8 encoding_sa; + bool encrypt; + bool send_sci; + bool end_station; + bool scb; + struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; + struct pcpu_tx_sc_stats __percpu *stats; +}; + +/** + * struct macsec_secy - MACsec Security Entity + * @netdev: netdevice for this SecY + * @n_rx_sc: number of receive secure channels configured on this SecY + * @sci: secure channel identifier used for tx + * @key_len: length of keys used by the cipher suite + * @icv_len: length of ICV used by the cipher suite + * @validate_frames: validation mode + * @operational: MAC_Operational flag + * @protect_frames: enable protection for this SecY + * @replay_protect: enable packet number checks on receive + * @replay_window: size of the replay window + * @tx_sc: transmit secure channel + * @rx_sc: linked list of receive secure channels + */ +struct macsec_secy { + struct net_device *netdev; + unsigned int n_rx_sc; + sci_t sci; + u16 key_len; + u16 icv_len; + enum macsec_validation_type validate_frames; + bool operational; + bool protect_frames; + bool replay_protect; + u32 replay_window; + struct macsec_tx_sc tx_sc; + struct macsec_rx_sc __rcu *rx_sc; +}; + +/** + * struct macsec_context - MACsec context for hardware offloading + */ +struct macsec_context { + struct phy_device *phydev; + enum macsec_offload offload; + + struct macsec_secy *secy; + struct macsec_rx_sc *rx_sc; + struct { + unsigned char assoc_num; + u8 key[MACSEC_KEYID_LEN]; + union { + struct macsec_rx_sa *rx_sa; + struct macsec_tx_sa *tx_sa; + }; + } sa; + + u8 prepare:1; +}; + +/** + * struct macsec_ops - MACsec offloading operations + */ +struct macsec_ops { + /* Device wide */ + int (*mdo_dev_open)(struct macsec_context *ctx); + int (*mdo_dev_stop)(struct macsec_context *ctx); + /* SecY */ + int (*mdo_add_secy)(struct macsec_context *ctx); + int (*mdo_upd_secy)(struct macsec_context *ctx); + int (*mdo_del_secy)(struct macsec_context *ctx); + /* Security channels */ + int (*mdo_add_rxsc)(struct macsec_context *ctx); + int (*mdo_upd_rxsc)(struct macsec_context *ctx); + int (*mdo_del_rxsc)(struct macsec_context *ctx); + /* Security associations */ + int (*mdo_add_rxsa)(struct macsec_context *ctx); + int (*mdo_upd_rxsa)(struct macsec_context *ctx); + int (*mdo_del_rxsa)(struct macsec_context *ctx); + int (*mdo_add_txsa)(struct macsec_context *ctx); + int (*mdo_upd_txsa)(struct macsec_context *ctx); + int (*mdo_del_txsa)(struct macsec_context *ctx); +}; + +void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); + +#endif /* _NET_MACSEC_H_ */ diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 000000000000..27627e2d1bc2 --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __NET_MPTCP_H +#define __NET_MPTCP_H + +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/types.h> + +/* MPTCP sk_buff extension data */ +struct mptcp_ext { + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + __unused:2; + /* one byte hole */ +}; + +struct mptcp_out_options { +#if IS_ENABLED(CONFIG_MPTCP) + u16 suboptions; + u64 sndr_key; + u64 rcvr_key; + struct mptcp_ext ext_copy; +#endif +}; + +#ifdef CONFIG_MPTCP + +void mptcp_init(void); + +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return tcp_sk(sk)->is_mptcp; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp; +} + +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts); +void mptcp_rcv_synsent(struct sock *sk); +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts); +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts); +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); + +/* move the skb extension owership, with the assumption that 'to' is + * newly allocated + */ +static inline void mptcp_skb_ext_move(struct sk_buff *to, + struct sk_buff *from) +{ + if (!skb_ext_exist(from, SKB_EXT_MPTCP)) + return; + + if (WARN_ON_ONCE(to->active_extensions)) + skb_ext_put(to); + + to->active_extensions = from->active_extensions; + to->extensions = from->extensions; + from->active_extensions = 0; +} + +static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, + const struct mptcp_ext *from_ext) +{ + /* MPTCP always clears the ext when adding it to the skb, so + * holes do not bother us here + */ + return !from_ext || + (to_ext && from_ext && + !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); +} + +/* check if skbs can be collapsed. + * MPTCP collapse is allowed if neither @to or @from carry an mptcp data + * mapping, or if the extension of @to is the same as @from. + * Collapsing is not possible if @to lacks an extension, but @from carries one. + */ +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), + skb_ext_find(from, SKB_EXT_MPTCP)); +} + +#else + +static inline void mptcp_init(void) +{ +} + +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return false; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return false; +} + +static inline void mptcp_parse_option(const struct sk_buff *skb, + const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx) +{ +} + +static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_rcv_synsent(struct sock *sk) +{ +} + +static inline bool mptcp_synack_options(const struct request_sock *req, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline bool mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_incoming_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ +} + +static inline void mptcp_skb_ext_move(struct sk_buff *to, + const struct sk_buff *from) +{ +} + +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return true; +} + +#endif /* CONFIG_MPTCP */ + +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped); + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcpv6_init(void); +#elif IS_ENABLED(CONFIG_IPV6) +static inline int mptcpv6_init(void) +{ + return 0; +} +#endif + +#endif /* __NET_MPTCP_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b8ceaf0cd997..854d39ef1ca3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -347,9 +347,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); -int peernet2id(struct net *net, struct net *peer); -bool peernet_has_id(struct net *net, struct net *peer); -struct net *get_net_ns_by_id(struct net *net, int id); +int peernet2id(const struct net *net, struct net *peer); +bool peernet_has_id(const struct net *net, struct net *peer); +struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; @@ -427,7 +427,7 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif -static inline int rt_genid_ipv4(struct net *net) +static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } @@ -459,7 +459,7 @@ static inline void rt_genid_bump_all(struct net *net) rt_genid_bump_ipv6(net); } -static inline int fnhe_genid(struct net *net) +static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f0897b3c97fb..e0f709d9d547 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -47,6 +47,11 @@ struct nf_flowtable { possible_net_t net; }; +static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable) +{ + return flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD; +} + enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, @@ -83,13 +88,15 @@ struct flow_offload_tuple_rhash { struct flow_offload_tuple tuple; }; -#define FLOW_OFFLOAD_SNAT 0x1 -#define FLOW_OFFLOAD_DNAT 0x2 -#define FLOW_OFFLOAD_DYING 0x4 -#define FLOW_OFFLOAD_TEARDOWN 0x8 -#define FLOW_OFFLOAD_HW 0x10 -#define FLOW_OFFLOAD_HW_DYING 0x20 -#define FLOW_OFFLOAD_HW_DEAD 0x40 +enum nf_flow_flags { + NF_FLOW_SNAT, + NF_FLOW_DNAT, + NF_FLOW_TEARDOWN, + NF_FLOW_HW, + NF_FLOW_HW_DYING, + NF_FLOW_HW_DEAD, + NF_FLOW_HW_REFRESH, +}; enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC = 0, @@ -99,13 +106,19 @@ enum flow_offload_type { struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; - u16 flags; + unsigned long flags; u16 type; u32 timeout; struct rcu_head rcu_head; }; #define NF_FLOW_TIMEOUT (30 * HZ) +#define nf_flowtable_time_stamp (u32)jiffies + +static inline __s32 nf_flow_timeout_delta(unsigned int timeout) +{ + return (__s32)(timeout - nf_flowtable_time_stamp); +} struct nf_flow_route { struct { @@ -128,10 +141,6 @@ int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); void flow_offload_teardown(struct flow_offload *flow); -static inline void flow_offload_dead(struct flow_offload *flow) -{ - flow->flags |= FLOW_OFFLOAD_DYING; -} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fe7c50acc681..4170c033d461 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -231,6 +231,7 @@ struct nft_userdata { * struct nft_set_elem - generic representation of set elements * * @key: element key + * @key_end: closing element key * @priv: element private data and extensions */ struct nft_set_elem { @@ -238,6 +239,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } key_end; void *priv; }; @@ -259,11 +264,15 @@ struct nft_set_iter { * @klen: key length * @dlen: data length * @size: number of set elements + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element */ struct nft_set_desc { unsigned int klen; unsigned int dlen; unsigned int size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; }; /** @@ -404,6 +413,8 @@ void nft_unregister_set(struct nft_set_type *type); * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -430,6 +441,8 @@ struct nft_set { u32 dtype; u32 objtype; u32 size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; u32 use; atomic_t nelems; u32 ndeact; @@ -502,6 +515,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key + * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout @@ -513,6 +527,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); */ enum nft_set_extensions { NFT_SET_EXT_KEY, + NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, @@ -606,6 +621,11 @@ static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_KEY); } +static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_KEY_END); +} + static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); @@ -655,7 +675,7 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, + const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 2656155b4069..29e7e1021267 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -74,6 +74,7 @@ extern struct nft_set_type nft_set_hash_type; extern struct nft_set_type nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +extern struct nft_set_type nft_set_pipapo_type; struct nft_expr; struct nft_regs; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 286fd960896f..a1a8d45adb42 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,6 +7,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; + struct list_head module_list; struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; diff --git a/include/net/pie.h b/include/net/pie.h new file mode 100644 index 000000000000..fd5a37cb7993 --- /dev/null +++ b/include/net/pie.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __NET_SCHED_PIE_H +#define __NET_SCHED_PIE_H + +#include <linux/ktime.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <net/inet_ecn.h> +#include <net/pkt_sched.h> + +#define MAX_PROB U64_MAX +#define DTIME_INVALID U64_MAX +#define QUEUE_THRESHOLD 16384 +#define DQCOUNT_INVALID -1 +#define PIE_SCALE 8 + +/** + * struct pie_params - contains pie parameters + * @target: target delay in pschedtime + * @tudpate: interval at which drop probability is calculated + * @limit: total number of packets that can be in the queue + * @alpha: parameter to control drop probability + * @beta: parameter to control drop probability + * @ecn: is ECN marking of packets enabled + * @bytemode: is drop probability scaled based on pkt size + * @dq_rate_estimator: is Little's law used for qdelay calculation + */ +struct pie_params { + psched_time_t target; + u32 tupdate; + u32 limit; + u32 alpha; + u32 beta; + u8 ecn; + u8 bytemode; + u8 dq_rate_estimator; +}; + +/** + * struct pie_vars - contains pie variables + * @qdelay: current queue delay + * @qdelay_old: queue delay in previous qdelay calculation + * @burst_time: burst time allowance + * @dq_tstamp: timestamp at which dq rate was last calculated + * @prob: drop probability + * @accu_prob: accumulated drop probability + * @dq_count: number of bytes dequeued in a measurement cycle + * @avg_dq_rate: calculated average dq rate + * @qlen_old: queue length during previous qdelay calculation + * @accu_prob_overflows: number of times accu_prob overflows + */ +struct pie_vars { + psched_time_t qdelay; + psched_time_t qdelay_old; + psched_time_t burst_time; + psched_time_t dq_tstamp; + u64 prob; + u64 accu_prob; + u64 dq_count; + u32 avg_dq_rate; + u32 qlen_old; + u8 accu_prob_overflows; +}; + +/** + * struct pie_stats - contains pie stats + * @packets_in: total number of packets enqueued + * @dropped: packets dropped due to pie action + * @overlimit: packets dropped due to lack of space in queue + * @ecn_mark: packets marked with ECN + * @maxq: maximum queue size + */ +struct pie_stats { + u32 packets_in; + u32 dropped; + u32 overlimit; + u32 ecn_mark; + u32 maxq; +}; + +/** + * struct pie_skb_cb - contains private skb vars + * @enqueue_time: timestamp when the packet is enqueued + * @mem_usage: size of the skb during enqueue + */ +struct pie_skb_cb { + psched_time_t enqueue_time; + u32 mem_usage; +}; + +static inline void pie_params_init(struct pie_params *params) +{ + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ + params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ + params->limit = 1000; + params->alpha = 2; + params->beta = 20; + params->ecn = false; + params->bytemode = false; + params->dq_rate_estimator = false; +} + +static inline void pie_vars_init(struct pie_vars *vars) +{ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); /* 150 ms */ + vars->dq_tstamp = DTIME_INVALID; + vars->accu_prob = 0; + vars->dq_count = DQCOUNT_INVALID; + vars->avg_dq_rate = 0; + vars->accu_prob_overflows = 0; +} + +static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); + return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) +{ + return get_pie_cb(skb)->enqueue_time; +} + +static inline void pie_set_enqueue_time(struct sk_buff *skb) +{ + get_pie_cb(skb)->enqueue_time = psched_get_time(); +} + +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size); + +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen); + +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen); + +#endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 47b115e2012a..a972244ab193 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -141,31 +141,38 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline unsigned long -cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) +static inline void +__tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { - unsigned long old_cl; + unsigned long cl; - sch_tree_lock(q); - old_cl = __cls_set_class(clp, cl); - sch_tree_unlock(q); - return old_cl; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = __cls_set_class(&r->class, cl); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; - cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); - cl = cls_set_class(q, &r->class, cl); - if (cl) + sch_tree_lock(q); + __tcf_bind_filter(q, r, base); + sch_tree_unlock(q); +} + +static inline void +__tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) +{ + unsigned long cl; + + if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } @@ -173,12 +180,10 @@ static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; if (!q) return; - if ((cl = __cls_set_class(&r->class, 0)) != 0) - q->ops->cl_ops->unbind_tcf(q, cl); + __tcf_unbind_filter(q, r); } struct tcf_exts { @@ -854,4 +859,26 @@ struct tc_ets_qopt_offload { }; }; +enum tc_tbf_command { + TC_TBF_REPLACE, + TC_TBF_DESTROY, + TC_TBF_STATS, +}; + +struct tc_tbf_qopt_offload_replace_params { + struct psched_ratecfg rate; + u32 max_size; + struct gnet_stats_queue *qstats; +}; + +struct tc_tbf_qopt_offload { + enum tc_tbf_command command; + u32 handle; + u32 parent; + union { + struct tc_tbf_qopt_offload_replace_params replace_params; + struct tc_qopt_offload_stats stats; + }; +}; + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index fceddf89592a..151208704ed2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -318,7 +318,8 @@ struct tcf_proto_ops { void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); - void (*bind_class)(void *, u32, unsigned long); + void (*bind_class)(void *, u32, unsigned long, + void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..02162b0378f7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,31 +436,15 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. */ - unsigned int __sk_flags_offset[0]; -#ifdef __BIG_ENDIAN_BITFIELD -#define SK_FL_PROTO_SHIFT 16 -#define SK_FL_PROTO_MASK 0x00ff0000 - -#define SK_FL_TYPE_SHIFT 0 -#define SK_FL_TYPE_MASK 0x0000ffff -#else -#define SK_FL_PROTO_SHIFT 8 -#define SK_FL_PROTO_MASK 0x0000ff00 - -#define SK_FL_TYPE_SHIFT 16 -#define SK_FL_TYPE_MASK 0xffff0000 -#endif - - unsigned int sk_padding : 1, + u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, - sk_userlocks : 4, - sk_protocol : 8, - sk_type : 16; -#define SK_PROTOCOL_MAX U8_MAX - u16 sk_gso_max_segs; + sk_userlocks : 4; u8 sk_pacing_shift; + u16 sk_type; + u16 sk_protocol; + u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; @@ -1480,6 +1464,7 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { + skb_ext_reset(skb); skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; @@ -2612,4 +2597,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) return false; } +void sock_def_readable(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..a5ea27df3c2b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -39,6 +39,7 @@ #include <net/tcp_states.h> #include <net/inet_ecn.h> #include <net/dst.h> +#include <net/mptcp.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> @@ -182,6 +183,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP @@ -328,6 +330,9 @@ int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_send_mss(struct sock *sk, int *size_goal, int flags); +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, + int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); @@ -977,6 +982,13 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) return likely(!TCP_SKB_CB(skb)->eor); } +static inline bool tcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(tcp_skb_can_collapse_to(to) && + mptcp_skb_can_collapse(to, from)); +} + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ @@ -1007,6 +1019,7 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1101,6 +1114,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET @@ -2002,6 +2016,11 @@ struct tcp_request_sock_ops { enum tcp_synack_type synack_type); }; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +#if IS_ENABLED(CONFIG_IPV6) +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; +#endif + #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, @@ -2147,12 +2166,16 @@ struct tcp_ulp_ops { /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ - void (*update)(struct sock *sk, struct proto *p); + void (*update)(struct sock *sk, struct proto *p, + void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); + /* clone ulp */ + void (*clone)(const struct request_sock *req, struct sock *newsk, + const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; @@ -2162,7 +2185,8 @@ void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); -void tcp_update_ulp(struct sock *sk, struct proto *p); +void tcp_update_ulp(struct sock *sk, struct proto *p, + void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ __MODULE_INFO(alias, alias_userspace, name); \ diff --git a/include/net/udp.h b/include/net/udp.h index bad74f780831..44e0e52b585c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -167,7 +167,7 @@ typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); + struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dda3c025452e..8f71c111e65a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -193,6 +193,7 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; + struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; @@ -1547,6 +1548,9 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 753f54e17e0a..e3518fd6b95b 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -69,7 +69,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_USER_MEM -struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); @@ -83,7 +83,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, #include <linux/err.h> -static inline struct ib_umem *ib_umem_get(struct ib_udata *udata, +static inline struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access) { diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 81429acc8257..64314ff76612 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -114,9 +114,9 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * -ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size, +ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops); -struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, int access); struct ib_umem_odp * ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr, @@ -134,7 +134,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline struct ib_umem_odp * -ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size, +ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops) { return ERR_PTR(-EINVAL); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5608e14e3aad..e2cc62217cc2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4153,6 +4153,15 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); } +/* ib_reg_user_mr - register a memory region for virtual addresses from kernel + * space. This function should be called when 'current' is the owning MM. + */ +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags); + +/* ib_advise_mr - give an advice about an address range in a memory region */ +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 64cbbbe74a36..068f96b1a83e 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -420,6 +420,8 @@ struct ocelot_port { u8 ptp_cmd; struct sk_buff_head tx_skbs; u8 ts_id; + + phy_interface_t phy_mode; }; struct ocelot { diff --git a/drivers/net/ethernet/mscc/ocelot_ana.h b/include/soc/mscc/ocelot_ana.h index 841c6ec22b64..841c6ec22b64 100644 --- a/drivers/net/ethernet/mscc/ocelot_ana.h +++ b/include/soc/mscc/ocelot_ana.h diff --git a/drivers/net/ethernet/mscc/ocelot_dev.h b/include/soc/mscc/ocelot_dev.h index 0a50d53bbd3f..0a50d53bbd3f 100644 --- a/drivers/net/ethernet/mscc/ocelot_dev.h +++ b/include/soc/mscc/ocelot_dev.h diff --git a/drivers/net/ethernet/mscc/ocelot_qsys.h b/include/soc/mscc/ocelot_qsys.h index d8c63aa761be..d8c63aa761be 100644 --- a/drivers/net/ethernet/mscc/ocelot_qsys.h +++ b/include/soc/mscc/ocelot_qsys.h diff --git a/arch/riscv/include/asm/sifive_l2_cache.h b/include/soc/sifive/sifive_l2_cache.h index 04f6748fc50b..92ade10ed67e 100644 --- a/arch/riscv/include/asm/sifive_l2_cache.h +++ b/include/soc/sifive/sifive_l2_cache.h @@ -4,8 +4,8 @@ * */ -#ifndef _ASM_RISCV_SIFIVE_L2_CACHE_H -#define _ASM_RISCV_SIFIVE_L2_CACHE_H +#ifndef __SOC_SIFIVE_L2_CACHE_H +#define __SOC_SIFIVE_L2_CACHE_H extern int register_sifive_l2_error_notifier(struct notifier_block *nb); extern int unregister_sifive_l2_error_notifier(struct notifier_block *nb); @@ -13,4 +13,4 @@ extern int unregister_sifive_l2_error_notifier(struct notifier_block *nb); #define SIFIVE_L2_ERR_TYPE_CE 0 #define SIFIVE_L2_ERR_TYPE_UE 1 -#endif /* _ASM_RISCV_SIFIVE_L2_CACHE_H */ +#endif /* __SOC_SIFIVE_L2_CACHE_H */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index d5ec4fac82ae..564ba1b5cf57 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -915,9 +915,9 @@ TRACE_EVENT(afs_call_state, TRACE_EVENT(afs_lookup, TP_PROTO(struct afs_vnode *dvnode, const struct qstr *name, - struct afs_vnode *vnode), + struct afs_fid *fid), - TP_ARGS(dvnode, name, vnode), + TP_ARGS(dvnode, name, fid), TP_STRUCT__entry( __field_struct(struct afs_fid, dfid ) @@ -928,13 +928,7 @@ TRACE_EVENT(afs_lookup, TP_fast_assign( int __len = min_t(int, name->len, 23); __entry->dfid = dvnode->fid; - if (vnode) { - __entry->fid = vnode->fid; - } else { - __entry->fid.vid = 0; - __entry->fid.vnode = 0; - __entry->fid.unique = 0; - } + __entry->fid = *fid; memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; ), diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index dd4db334bd63..d82a0f4e824d 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -31,7 +31,8 @@ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ - EMe(SCAN_TRUNCATED, "truncated") \ + EM( SCAN_TRUNCATED, "truncated") \ + EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ #undef EM #undef EMe diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 95fba0471e5b..3f249e150c0c 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -18,13 +18,13 @@ DECLARE_EVENT_CLASS(preemptirq_template, TP_ARGS(ip, parent_ip), TP_STRUCT__entry( - __field(u32, caller_offs) - __field(u32, parent_offs) + __field(s32, caller_offs) + __field(s32, parent_offs) ), TP_fast_assign( - __entry->caller_offs = (u32)(ip - (unsigned long)_stext); - __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); + __entry->caller_offs = (s32)(ip - (unsigned long)_stext); + __entry->parent_offs = (s32)(parent_ip - (unsigned long)_stext); ), TP_printk("caller=%pS parent=%pS", diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 51fe9f6719eb..a966d4b5ab37 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,8 @@ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ - EMe(IPPROTO_SCTP) + EM(IPPROTO_SCTP) \ + EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ @@ -147,7 +148,7 @@ TRACE_EVENT(inet_sock_set_state, __field(__u16, sport) __field(__u16, dport) __field(__u16, family) - __field(__u8, protocol) + __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index a7378bcd9928..b95d65e8c628 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -79,14 +79,26 @@ TRACE_EVENT(xdp_bulk_tx, __entry->sent, __entry->drops, __entry->err) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + +#define devmap_ifindex(tgt, map) \ + (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \ + ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), + const void *tgt, int err, + const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), + TP_ARGS(dev, xdp, tgt, err, map, index), TP_STRUCT__entry( __field(int, prog_id) @@ -103,90 +115,65 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = to_ifindex; + __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : + index; __entry->map_id = map ? map->id : 0; - __entry->map_index = map_index; + __entry->map_index = map ? index : 0; ), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, - __entry->err) + __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); + trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to); #define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); + trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, +#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map, index); + +#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map, index); + +/* not used anymore, but kept around so as not to break old programs */ +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -#ifndef __DEVMAP_OBJ_TYPE -#define __DEVMAP_OBJ_TYPE -struct _bpf_dtab_netdev { - struct net_device *dev; -}; -#endif /* __DEVMAP_OBJ_TYPE */ - -#define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) - -#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ - 0, map, idx) - -#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ - err, map, idx) - TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, @@ -259,43 +246,38 @@ TRACE_EVENT(xdp_cpumap_enqueue, TRACE_EVENT(xdp_devmap_xmit, - TP_PROTO(const struct bpf_map *map, u32 map_index, - int sent, int drops, - const struct net_device *from_dev, - const struct net_device *to_dev, int err), + TP_PROTO(const struct net_device *from_dev, + const struct net_device *to_dev, + int sent, int drops, int err), - TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), + TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( - __field(int, map_id) + __field(int, from_ifindex) __field(u32, act) - __field(u32, map_index) + __field(int, to_ifindex) __field(int, drops) __field(int, sent) - __field(int, from_ifindex) - __field(int, to_ifindex) __field(int, err) ), TP_fast_assign( - __entry->map_id = map->id; + __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; - __entry->map_index = map_index; + __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; - __entry->from_ifindex = from_dev->ifindex; - __entry->to_ifindex = to_dev->ifindex; __entry->err = err; ), TP_printk("ndo_xdp_xmit" - " map_id=%d map_index=%d action=%s" + " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" - " from_ifindex=%d to_ifindex=%d err=%d", - __entry->map_id, __entry->map_index, + " err=%d", + __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, - __entry->from_ifindex, __entry->to_ifindex, __entry->err) + __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index 9a0e8af21310..a5ccfa67bc5c 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -66,7 +66,11 @@ TRACE_EVENT(xen_mc_callback, TP_PROTO(xen_mc_callback_fn_t fn, void *data), TP_ARGS(fn, data), TP_STRUCT__entry( - __field(xen_mc_callback_fn_t, fn) + /* + * Use field_struct to avoid is_signed_type() + * comparison of a function pointer. + */ + __field_struct(xen_mc_callback_fn_t, fn) __field(void *, data) ), TP_fast_assign( diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 2a15f01c2243..0ae34c85ef9e 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 67f4636758af..617c180ff0c8 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7df436da542d..f1d74a2bd234 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -107,6 +107,10 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, }; enum bpf_map_type { @@ -136,6 +140,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, }; /* Note that tracing related programs such as @@ -174,6 +179,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, }; enum bpf_attach_type { @@ -357,7 +364,12 @@ enum bpf_attach_type { /* Enable memory-mapping BPF map */ #define BPF_F_MMAPABLE (1U << 10) -/* flags for BPF_PROG_QUERY */ +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are returned only for directly attached programs. + */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) enum bpf_stack_build_id_status { @@ -397,6 +409,10 @@ union bpf_attr { __u32 btf_fd; /* fd pointing to a BTF type data */ __u32 btf_key_type_id; /* BTF type_id of the key */ __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -409,6 +425,23 @@ union bpf_attr { __u64 flags; }; + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + struct { /* anonymous struct used by BPF_PROG_LOAD command */ __u32 prog_type; /* one of enum bpf_prog_type */ __u32 insn_cnt; @@ -2703,7 +2736,8 @@ union bpf_attr { * * int bpf_send_signal(u32 sig) * Description - * Send signal *sig* to the current task. + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. * Return * 0 on success or successfully queued. * @@ -2831,6 +2865,33 @@ union bpf_attr { * Return * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. + * + * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2948,7 +3009,10 @@ union bpf_attr { FN(probe_read_user), \ FN(probe_read_kernel), \ FN(probe_read_user_str), \ - FN(probe_read_kernel_str), + FN(probe_read_kernel_str), \ + FN(tcp_send_ack), \ + FN(send_signal_thread), \ + FN(jiffies64), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3349,7 +3413,7 @@ struct bpf_map_info { __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; - __u32 :32; + __u32 btf_vmlinux_value_type_id; __u64 netns_dev; __u64 netns_ino; __u32 btf_id; diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 1a2898c482ee..5a667107ad2c 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -146,6 +146,12 @@ enum { BTF_VAR_GLOBAL_EXTERN = 2, }; +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe * additional information related to the variable such as its linkage. */ diff --git a/include/uapi/linux/hdlc/ioctl.h b/include/uapi/linux/hdlc/ioctl.h index 0fe4238e8246..b06341acab5e 100644 --- a/include/uapi/linux/hdlc/ioctl.h +++ b/include/uapi/linux/hdlc/ioctl.h @@ -79,6 +79,15 @@ typedef struct { unsigned int timeout; } cisco_proto; +typedef struct { + unsigned short dce; /* 1 for DCE (network side) operation */ + unsigned int modulo; /* modulo (8 = basic / 128 = extended) */ + unsigned int window; /* frame window size */ + unsigned int t1; /* timeout t1 */ + unsigned int t2; /* timeout t2 */ + unsigned int n2; /* frame retry counter */ +} x25_hdlc_proto; + /* PPP doesn't need any info now - supply length = 0 to ioctl */ #endif /* __ASSEMBLY__ */ diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 4bf33344aab1..be714cd8c826 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -213,6 +213,7 @@ struct if_settings { fr_proto __user *fr; fr_proto_pvc __user *fr_pvc; fr_proto_pvc_info __user *fr_pvc_info; + x25_hdlc_proto __user *x25; /* interface settings */ sync_serial_settings __user *sync; diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 4a58e3d7de46..42f7ca38ad80 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -130,6 +130,7 @@ enum { #define BRIDGE_VLAN_INFO_RANGE_BEGIN (1<<3) /* VLAN is start of vlan range */ #define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */ #define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */ +#define BRIDGE_VLAN_INFO_ONLY_OPTS (1<<6) /* Skip create/delete/flags */ struct bridge_vlan_info { __u16 flags; @@ -165,6 +166,36 @@ struct bridge_stp_xstats { __u64 tx_tcn; }; +/* Bridge vlan RTM header */ +struct br_vlan_msg { + __u8 family; + __u8 reserved1; + __u16 reserved2; + __u32 ifindex; +}; + +/* Bridge vlan RTM attributes + * [BRIDGE_VLANDB_ENTRY] = { + * [BRIDGE_VLANDB_ENTRY_INFO] + * ... + * } + */ +enum { + BRIDGE_VLANDB_UNSPEC, + BRIDGE_VLANDB_ENTRY, + __BRIDGE_VLANDB_MAX, +}; +#define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1) + +enum { + BRIDGE_VLANDB_ENTRY_UNSPEC, + BRIDGE_VLANDB_ENTRY_INFO, + BRIDGE_VLANDB_ENTRY_RANGE, + BRIDGE_VLANDB_ENTRY_STATE, + __BRIDGE_VLANDB_ENTRY_MAX, +}; +#define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) + /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1d69f637c5d6..024af2d1d0af 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -486,6 +486,13 @@ enum macsec_validation_type { MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, }; +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 98e4d5d7c45c..1d63c43c38cc 100644 --- a/include/uapi/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h @@ -45,6 +45,7 @@ enum macsec_attrs { MACSEC_ATTR_RXSC_LIST, /* dump, nested, macsec_rxsc_attrs for each RXSC */ MACSEC_ATTR_TXSC_STATS, /* dump, nested, macsec_txsc_stats_attr */ MACSEC_ATTR_SECY_STATS, /* dump, nested, macsec_secy_stats_attr */ + MACSEC_ATTR_OFFLOAD, /* config, nested, macsec_offload_attrs */ __MACSEC_ATTR_END, NUM_MACSEC_ATTR = __MACSEC_ATTR_END, MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1, @@ -97,6 +98,15 @@ enum macsec_sa_attrs { MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1, }; +enum macsec_offload_attrs { + MACSEC_OFFLOAD_ATTR_UNSPEC, + MACSEC_OFFLOAD_ATTR_TYPE, /* config/dump, u8 0..2 */ + MACSEC_OFFLOAD_ATTR_PAD, + __MACSEC_OFFLOAD_ATTR_END, + NUM_MACSEC_OFFLOAD_ATTR = __MACSEC_OFFLOAD_ATTR_END, + MACSEC_OFFLOAD_ATTR_MAX = __MACSEC_OFFLOAD_ATTR_END - 1, +}; + enum macsec_nl_commands { MACSEC_CMD_GET_TXSC, MACSEC_CMD_ADD_RXSC, @@ -108,6 +118,7 @@ enum macsec_nl_commands { MACSEC_CMD_ADD_RXSA, MACSEC_CMD_DEL_RXSA, MACSEC_CMD_UPD_RXSA, + MACSEC_CMD_UPD_OFFLOAD, }; /* u64 per-RXSC stats */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index e7ad9d350a28..1521073b6348 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -76,6 +76,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_MPTCP = 262, /* Multipath TCP connection */ +#define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX }; #endif diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index f056b2a00d5c..9a61c28ed3ae 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -34,6 +34,7 @@ struct input_event { __kernel_ulong_t __sec; #if defined(__sparc__) && defined(__arch64__) unsigned int __usec; + unsigned int __pad; #else __kernel_ulong_t __usec; #endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a3300e1b9a01..55cfcb71606d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,7 +178,8 @@ struct io_uring_params { struct io_uring_files_update { __u32 offset; - __s32 *fds; + __u32 resv; + __aligned_u64 /* __s32 * */ fds; }; #endif diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 409d3ad1e6e2..1d0350e44ae3 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -9,11 +9,11 @@ * and the comment before kcov_remote_start() for usage details. */ struct kcov_remote_arg { - unsigned int trace_mode; /* KCOV_TRACE_PC or KCOV_TRACE_CMP */ - unsigned int area_size; /* Length of coverage buffer in words */ - unsigned int num_handles; /* Size of handles array */ - __u64 common_handle; - __u64 handles[0]; + __u32 trace_mode; /* KCOV_TRACE_PC or KCOV_TRACE_CMP */ + __u32 area_size; /* Length of coverage buffer in words */ + __u32 num_handles; /* Size of handles array */ + __aligned_u64 common_handle; + __aligned_u64 handles[0]; }; #define KCOV_REMOTE_MAX_HANDLES 0x100 diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 51b48e4be1f2..0b9c3beda345 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -131,6 +131,18 @@ #define NWAYTEST_LOOPBACK 0x0100 /* Enable loopback for N-way */ #define NWAYTEST_RESV2 0xfe00 /* Unused... */ +/* MAC and PHY tx_config_Reg[15:0] for SGMII in-band auto-negotiation.*/ +#define ADVERTISE_SGMII 0x0001 /* MAC can do SGMII */ +#define LPA_SGMII 0x0001 /* PHY can do SGMII */ +#define LPA_SGMII_DPX_SPD_MASK 0x1C00 /* SGMII duplex and speed bits */ +#define LPA_SGMII_10HALF 0x0000 /* Can do 10mbps half-duplex */ +#define LPA_SGMII_10FULL 0x1000 /* Can do 10mbps full-duplex */ +#define LPA_SGMII_100HALF 0x0400 /* Can do 100mbps half-duplex */ +#define LPA_SGMII_100FULL 0x1400 /* Can do 100mbps full-duplex */ +#define LPA_SGMII_1000HALF 0x0800 /* Can do 1000mbps half-duplex */ +#define LPA_SGMII_1000FULL 0x1800 /* Can do 1000mbps full-duplex */ +#define LPA_SGMII_LINK 0x8000 /* PHY link with copper-side partner */ + /* 1000BASE-T Control register */ #define ADVERTISE_1000FULL 0x0200 /* Advertise 1000BASE-T full duplex */ #define ADVERTISE_1000HALF 0x0100 /* Advertise 1000BASE-T half duplex */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e237ecbdcd8a..065218a20bb7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -48,6 +48,7 @@ enum nft_registers { #define NFT_REG_SIZE 16 #define NFT_REG32_SIZE 4 +#define NFT_REG32_COUNT (NFT_REG32_15 - NFT_REG32_00 + 1) /** * enum nft_verdicts - nf_tables internal verdicts @@ -301,15 +302,29 @@ enum nft_set_policies { * enum nft_set_desc_attributes - set element description * * @NFTA_SET_DESC_SIZE: number of elements in set (NLA_U32) + * @NFTA_SET_DESC_CONCAT: description of field concatenation (NLA_NESTED) */ enum nft_set_desc_attributes { NFTA_SET_DESC_UNSPEC, NFTA_SET_DESC_SIZE, + NFTA_SET_DESC_CONCAT, __NFTA_SET_DESC_MAX }; #define NFTA_SET_DESC_MAX (__NFTA_SET_DESC_MAX - 1) /** + * enum nft_set_field_attributes - attributes of concatenated fields + * + * @NFTA_SET_FIELD_LEN: length of single field, in bits (NLA_U32) + */ +enum nft_set_field_attributes { + NFTA_SET_FIELD_UNSPEC, + NFTA_SET_FIELD_LEN, + __NFTA_SET_FIELD_MAX +}; +#define NFTA_SET_FIELD_MAX (__NFTA_SET_FIELD_MAX - 1) + +/** * enum nft_set_attributes - nf_tables set netlink attributes * * @NFTA_SET_TABLE: table name (NLA_STRING) @@ -370,6 +385,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING) + * @NFTA_SET_ELEM_KEY_END: closing key value (NLA_NESTED: nft_data) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -382,6 +398,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_EXPR, NFTA_SET_ELEM_PAD, NFTA_SET_ELEM_OBJREF, + NFTA_SET_ELEM_KEY_END, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) @@ -485,6 +502,20 @@ enum nft_immediate_attributes { #define NFTA_IMMEDIATE_MAX (__NFTA_IMMEDIATE_MAX - 1) /** + * enum nft_bitwise_ops - nf_tables bitwise operations + * + * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and + * XOR boolean operations + * @NFT_BITWISE_LSHIFT: left-shift operation + * @NFT_BITWISE_RSHIFT: right-shift operation + */ +enum nft_bitwise_ops { + NFT_BITWISE_BOOL, + NFT_BITWISE_LSHIFT, + NFT_BITWISE_RSHIFT, +}; + +/** * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes * * @NFTA_BITWISE_SREG: source register (NLA_U32: nft_registers) @@ -492,16 +523,20 @@ enum nft_immediate_attributes { * @NFTA_BITWISE_LEN: length of operands (NLA_U32) * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes) * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) + * @NFTA_BITWISE_DATA: argument for non-boolean operations + * (NLA_NESTED: nft_data_attributes) * - * The bitwise expression performs the following operation: + * The bitwise expression supports boolean and shift operations. It implements + * the boolean operations by performing the following operation: * * dreg = (sreg & mask) ^ xor * - * which allow to express all bitwise operations: + * with these mask and xor values: * * mask xor * NOT: 1 1 - * OR: 0 x + * OR: ~x x * XOR: 1 x * AND: x 0 */ @@ -512,6 +547,8 @@ enum nft_bitwise_attributes { NFTA_BITWISE_LEN, NFTA_BITWISE_MASK, NFTA_BITWISE_XOR, + NFTA_BITWISE_OP, + NFTA_BITWISE_DATA, __NFTA_BITWISE_MAX }; #define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bf5a5b1dfb0b..bbe791b24168 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -971,6 +971,37 @@ struct tc_pie_xstats { __u32 ecn_mark; /* packets marked with ecn*/ }; +/* FQ PIE */ +enum { + TCA_FQ_PIE_UNSPEC, + TCA_FQ_PIE_LIMIT, + TCA_FQ_PIE_FLOWS, + TCA_FQ_PIE_TARGET, + TCA_FQ_PIE_TUPDATE, + TCA_FQ_PIE_ALPHA, + TCA_FQ_PIE_BETA, + TCA_FQ_PIE_QUANTUM, + TCA_FQ_PIE_MEMORY_LIMIT, + TCA_FQ_PIE_ECN_PROB, + TCA_FQ_PIE_ECN, + TCA_FQ_PIE_BYTEMODE, + TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + __TCA_FQ_PIE_MAX +}; +#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1) + +struct tc_fq_pie_xstats { + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to fq_pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 overmemory; /* dropped due to lack of memory in queue */ + __u32 ecn_mark; /* packets marked with ecn */ + __u32 new_flow_count; /* count of new flows created by packets */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 memory_usage; /* total memory across all queues */ +}; + /* CBS */ struct tc_cbs_qopt { __u8 offload; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 1418a8362bb7..4a8c5b745157 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -171,6 +171,13 @@ enum { RTM_GETLINKPROP, #define RTM_GETLINKPROP RTM_GETLINKPROP + RTM_NEWVLAN = 112, +#define RTM_NEWNVLAN RTM_NEWVLAN + RTM_DELVLAN, +#define RTM_DELVLAN RTM_DELVLAN + RTM_GETVLAN, +#define RTM_GETVLAN RTM_GETVLAN + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -309,6 +316,8 @@ enum rt_scope_t { #define RTM_F_PREFIX 0x800 /* Prefix addresses */ #define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ #define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ +#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ +#define RTM_F_TRAP 0x8000 /* route is trapping packets */ /* Reserved table identifiers */ @@ -721,6 +730,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R RTNLGRP_NEXTHOP, #define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7eee233e78d2..7d91f4debc48 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -285,6 +285,8 @@ enum LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ + LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ + LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d87184e673ca..fd9eb8f6bcae 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -311,6 +311,7 @@ enum { TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ TCP_NLA_REORD_SEEN, /* reordering events seen */ TCP_NLA_SRTT, /* smoothed RTT in usecs */ + TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index 30baccb6c9c4..4828794efcf8 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -42,5 +42,6 @@ struct udphdr { #define UDP_ENCAP_GTP0 4 /* GSM TS 09.60 */ #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ #define UDP_ENCAP_RXRPC 6 +#define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types. */ #endif /* _UAPI_LINUX_UDP_H */ diff --git a/init/main.c b/init/main.c index 1ecfd43ed464..da1bc0b60a7d 100644 --- a/init/main.c +++ b/init/main.c @@ -93,7 +93,6 @@ #include <linux/rodata_test.h> #include <linux/jump_label.h> #include <linux/mem_encrypt.h> -#include <linux/file.h> #include <asm/io.h> #include <asm/bugs.h> @@ -554,6 +553,7 @@ static void __init mm_init(void) * bigger than MAX_ORDER unless SPARSEMEM. */ page_ext_init_flatmem(); + init_debug_pagealloc(); report_meminit(); mem_init(); kmem_cache_init(); @@ -1158,26 +1158,13 @@ static int __ref kernel_init(void *unused) void console_on_rootfs(void) { - struct file *file; - unsigned int i; - - /* Open /dev/console in kernelspace, this should never fail */ - file = filp_open("/dev/console", O_RDWR, 0); - if (IS_ERR(file)) - goto err_out; - - /* create stdin/stdout/stderr, this should never fail */ - for (i = 0; i < 3; i++) { - if (f_dupfd(i, file, 0) != i) - goto err_out; - } - - return; + /* Open the /dev/console as stdin, this should never fail */ + if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + pr_err("Warning: unable to open an initial console.\n"); -err_out: - /* no panic -- this might not be fatal */ - pr_err("Warning: unable to open an initial console.\n"); - return; + /* create stdout/stderr */ + (void) ksys_dup(0); + (void) ksys_dup(0); } static noinline void __init kernel_init_freeable(void) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d4f330351f87..046ce5d98033 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -27,3 +27,6 @@ endif ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f0d19bbb9211..95d77770353c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = { .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c new file mode 100644 index 000000000000..8ad1c9ea26b2 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ + +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/slab.h> +#include <linux/numa.h> +#include <linux/seq_file.h> +#include <linux/refcount.h> +#include <linux/mutex.h> + +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, +}; + +#define BPF_STRUCT_OPS_COMMON_VALUE \ + refcount_t refcnt; \ + enum bpf_struct_ops_state state + +struct bpf_struct_ops_value { + BPF_STRUCT_OPS_COMMON_VALUE; + char data[0] ____cacheline_aligned_in_smp; +}; + +struct bpf_struct_ops_map { + struct bpf_map map; + const struct bpf_struct_ops *st_ops; + /* protect map_update */ + struct mutex lock; + /* progs has all the bpf_prog that is populated + * to the func ptr of the kernel's struct + * (in kvalue.data). + */ + struct bpf_prog **progs; + /* image is a page that has all the trampolines + * that stores the func args before calling the bpf_prog. + * A PAGE_SIZE "image" is enough to store all trampoline for + * "progs[]". + */ + void *image; + /* uvalue->data stores the kernel struct + * (e.g. tcp_congestion_ops) that is more useful + * to userspace than the kvalue. For example, + * the bpf_prog's id is stored instead of the kernel + * address of a func ptr. + */ + struct bpf_struct_ops_value *uvalue; + /* kvalue.data stores the actual kernel's struct + * (e.g. tcp_congestion_ops) that will be + * registered to the kernel subsystem. + */ + struct bpf_struct_ops_value kvalue; +}; + +#define VALUE_PREFIX "bpf_struct_ops_" +#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) + +/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is + * the map's value exposed to the userspace and its btf-type-id is + * stored at the map->btf_vmlinux_value_type_id. + * + */ +#define BPF_STRUCT_OPS_TYPE(_name) \ +extern struct bpf_struct_ops bpf_##_name; \ + \ +struct bpf_struct_ops_##_name { \ + BPF_STRUCT_OPS_COMMON_VALUE; \ + struct _name data ____cacheline_aligned_in_smp; \ +}; +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + +enum { +#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + __NR_BPF_STRUCT_OPS_TYPE, +}; + +static struct bpf_struct_ops * const bpf_struct_ops[] = { +#define BPF_STRUCT_OPS_TYPE(_name) \ + [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE +}; + +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { +}; + +const struct bpf_prog_ops bpf_struct_ops_prog_ops = { +}; + +static const struct btf_type *module_type; + +void bpf_struct_ops_init(struct btf *btf) +{ + s32 type_id, value_id, module_id; + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_verifier_log log = {}; + const struct btf_type *t; + char value_name[128]; + const char *mname; + u32 i, j; + + /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ +#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + + module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); + if (module_id < 0) { + pr_warn("Cannot find struct module in btf_vmlinux\n"); + return; + } + module_type = btf_type_by_id(btf, module_id); + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + st_ops = bpf_struct_ops[i]; + + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + continue; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + value_name); + continue; + } + + type_id = btf_find_by_name_kind(btf, st_ops->name, + BTF_KIND_STRUCT); + if (type_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + st_ops->name); + continue; + } + t = btf_type_by_id(btf, type_id); + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { + pr_warn("Cannot support #%u members in struct %s\n", + btf_type_vlen(t), st_ops->name); + continue; + } + + for_each_member(j, t, member) { + const struct btf_type *func_proto; + + mname = btf_name_by_offset(btf, member->name_off); + if (!*mname) { + pr_warn("anon member in struct %s is not supported\n", + st_ops->name); + break; + } + + if (btf_member_bitfield_size(t, member)) { + pr_warn("bit field member %s in struct %s is not supported\n", + mname, st_ops->name); + break; + } + + func_proto = btf_type_resolve_func_ptr(btf, + member->type, + NULL); + if (func_proto && + btf_distill_func_proto(&log, btf, + func_proto, mname, + &st_ops->func_models[j])) { + pr_warn("Error in parsing func ptr %s in struct %s\n", + mname, st_ops->name); + break; + } + } + + if (j == btf_type_vlen(t)) { + if (st_ops->init(btf)) { + pr_warn("Error in init bpf_struct_ops %s\n", + st_ops->name); + } else { + st_ops->type_id = type_id; + st_ops->type = t; + st_ops->value_id = value_id; + st_ops->value_type = btf_type_by_id(btf, + value_id); + } + } + } +} + +extern struct btf *btf_vmlinux; + +static const struct bpf_struct_ops * +bpf_struct_ops_find_value(u32 value_id) +{ + unsigned int i; + + if (!value_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->value_id == value_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + unsigned int i; + + if (!type_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->type_id == type_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + +static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + if (key && *(u32 *)key == 0) + return -ENOENT; + + *(u32 *)next_key = 0; + return 0; +} + +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + struct bpf_struct_ops_value *uvalue, *kvalue; + enum bpf_struct_ops_state state; + + if (unlikely(*(u32 *)key != 0)) + return -ENOENT; + + kvalue = &st_map->kvalue; + /* Pair with smp_store_release() during map_update */ + state = smp_load_acquire(&kvalue->state); + if (state == BPF_STRUCT_OPS_STATE_INIT) { + memset(value, 0, map->value_size); + return 0; + } + + /* No lock is needed. state and refcnt do not need + * to be updated together under atomic context. + */ + uvalue = (struct bpf_struct_ops_value *)value; + memcpy(uvalue, st_map->uvalue, map->value_size); + uvalue->state = state; + refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + return 0; +} + +static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) +{ + const struct btf_type *t = st_map->st_ops->type; + u32 i; + + for (i = 0; i < btf_type_vlen(t); i++) { + if (st_map->progs[i]) { + bpf_prog_put(st_map->progs[i]); + st_map->progs[i] = NULL; + } + } +} + +static int check_zero_holes(const struct btf_type *t, void *data) +{ + const struct btf_member *member; + u32 i, moff, msize, prev_mend = 0; + const struct btf_type *mtype; + + for_each_member(i, t, member) { + moff = btf_member_bit_offset(t, member) / 8; + if (moff > prev_mend && + memchr_inv(data + prev_mend, 0, moff - prev_mend)) + return -EINVAL; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + prev_mend = moff + msize; + } + + if (t->size > prev_mend && + memchr_inv(data + prev_mend, 0, t->size - prev_mend)) + return -EINVAL; + + return 0; +} + +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_member *member; + const struct btf_type *t = st_ops->type; + void *udata, *kdata; + int prog_fd, err = 0; + void *image; + u32 i; + + if (flags) + return -EINVAL; + + if (*(u32 *)key != 0) + return -E2BIG; + + err = check_zero_holes(st_ops->value_type, value); + if (err) + return err; + + uvalue = (struct bpf_struct_ops_value *)value; + err = check_zero_holes(t, uvalue->data); + if (err) + return err; + + if (uvalue->state || refcount_read(&uvalue->refcnt)) + return -EINVAL; + + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + mutex_lock(&st_map->lock); + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + err = -EBUSY; + goto unlock; + } + + memcpy(uvalue, value, map->value_size); + + udata = &uvalue->data; + kdata = &kvalue->data; + image = st_map->image; + + for_each_member(i, t, member) { + const struct btf_type *mtype, *ptype; + struct bpf_prog *prog; + u32 moff; + + moff = btf_member_bit_offset(t, member) / 8; + ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + if (ptype == module_type) { + if (*(void **)(udata + moff)) + goto reset_unlock; + *(void **)(kdata + moff) = BPF_MODULE_OWNER; + continue; + } + + err = st_ops->init_member(t, member, kdata, udata); + if (err < 0) + goto reset_unlock; + + /* The ->init_member() has handled this member */ + if (err > 0) + continue; + + /* If st_ops->init_member does not handle it, + * we will only handle func ptrs and zero-ed members + * here. Reject everything else. + */ + + /* All non func ptr member must be 0 */ + if (!ptype || !btf_type_is_func_proto(ptype)) { + u32 msize; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) { + err = PTR_ERR(mtype); + goto reset_unlock; + } + + if (memchr_inv(udata + moff, 0, msize)) { + err = -EINVAL; + goto reset_unlock; + } + + continue; + } + + prog_fd = (int)(*(unsigned long *)(udata + moff)); + /* Similar check as the attr->attach_prog_fd */ + if (!prog_fd) + continue; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto reset_unlock; + } + st_map->progs[i] = prog; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->aux->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != i) { + err = -EINVAL; + goto reset_unlock; + } + + err = arch_prepare_bpf_trampoline(image, + st_map->image + PAGE_SIZE, + &st_ops->func_models[i], 0, + &prog, 1, NULL, 0, NULL); + if (err < 0) + goto reset_unlock; + + *(void **)(kdata + moff) = image; + image += err; + + /* put prog_id to udata */ + *(unsigned long *)(udata + moff) = prog->aux->id; + } + + refcount_set(&kvalue->refcnt, 1); + bpf_map_inc(map); + + set_memory_ro((long)st_map->image, 1); + set_memory_x((long)st_map->image, 1); + err = st_ops->reg(kdata); + if (likely(!err)) { + /* Pair with smp_load_acquire() during lookup_elem(). + * It ensures the above udata updates (e.g. prog->aux->id) + * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + + /* Error during st_ops->reg(). It is very unlikely since + * the above init_member() should have caught it earlier + * before reg(). The only possibility is if there was a race + * in registering the struct_ops (under the same name) to + * a sub-system through different struct_ops's maps. + */ + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(map); + +reset_unlock: + bpf_struct_ops_map_put_progs(st_map); + memset(uvalue, 0, map->value_size); + memset(kvalue, 0, map->value_size); +unlock: + mutex_unlock(&st_map->lock); + return err; +} + +static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +{ + enum bpf_struct_ops_state prev_state; + struct bpf_struct_ops_map *st_map; + + st_map = (struct bpf_struct_ops_map *)map; + prev_state = cmpxchg(&st_map->kvalue.state, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE); + if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + st_map->st_ops->unreg(&st_map->kvalue.data); + if (refcount_dec_and_test(&st_map->kvalue.refcnt)) + bpf_map_put(map); + } + + return 0; +} + +static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + int err; + + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + if (!value) + return; + + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + if (!err) { + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); + } + + kfree(value); +} + +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + if (st_map->progs) + bpf_struct_ops_map_put_progs(st_map); + bpf_map_area_free(st_map->progs); + bpf_jit_free_exec(st_map->image); + bpf_map_area_free(st_map->uvalue); + bpf_map_area_free(st_map); +} + +static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) +{ + if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || + attr->map_flags || !attr->btf_vmlinux_value_type_id) + return -EINVAL; + return 0; +} + +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) +{ + const struct bpf_struct_ops *st_ops; + size_t map_total_size, st_map_size; + struct bpf_struct_ops_map *st_map; + const struct btf_type *t, *vt; + struct bpf_map_memory mem; + struct bpf_map *map; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); + if (!st_ops) + return ERR_PTR(-ENOTSUPP); + + vt = st_ops->value_type; + if (attr->value_size != vt->size) + return ERR_PTR(-EINVAL); + + t = st_ops->type; + + st_map_size = sizeof(*st_map) + + /* kvalue stores the + * struct bpf_struct_ops_tcp_congestions_ops + */ + (vt->size - sizeof(struct bpf_struct_ops_value)); + map_total_size = st_map_size + + /* uvalue */ + sizeof(vt->size) + + /* struct bpf_progs **progs */ + btf_type_vlen(t) * sizeof(struct bpf_prog *); + err = bpf_map_charge_init(&mem, map_total_size); + if (err < 0) + return ERR_PTR(err); + + st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); + if (!st_map) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + st_map->st_ops = st_ops; + map = &st_map->map; + + st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->progs = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + NUMA_NO_NODE); + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->uvalue || !st_map->progs || !st_map->image) { + bpf_struct_ops_map_free(map); + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&st_map->lock); + set_vm_flush_reset_perms(st_map->image); + bpf_map_init_from_attr(map, attr); + bpf_map_charge_move(&map->memory, &mem); + + return map; +} + +const struct bpf_map_ops bpf_struct_ops_map_ops = { + .map_alloc_check = bpf_struct_ops_map_alloc_check, + .map_alloc = bpf_struct_ops_map_alloc, + .map_free = bpf_struct_ops_map_free, + .map_get_next_key = bpf_struct_ops_map_get_next_key, + .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_delete_elem = bpf_struct_ops_map_delete_elem, + .map_update_elem = bpf_struct_ops_map_update_elem, + .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +}; + +/* "const void *" because some subsystem is + * passing a const (e.g. const struct tcp_congestion_ops *) + */ +bool bpf_struct_ops_get(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + + return refcount_inc_not_zero(&kvalue->refcnt); +} + +void bpf_struct_ops_put(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + if (refcount_dec_and_test(&kvalue->refcnt)) { + struct bpf_struct_ops_map *st_map; + + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h new file mode 100644 index 000000000000..066d83ea1c99 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* internal file - do not include directly */ + +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include <net/tcp.h> +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed2075884724..32963b6d5a9c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -180,11 +180,6 @@ */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -#define for_each_member(i, struct_type, member) \ - for (i = 0, member = btf_type_member(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -281,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", }; +static const char *btf_type_str(const struct btf_type *t) +{ + return btf_kind_str[BTF_INFO_KIND(t->info)]; +} + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -382,6 +382,65 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +{ + const struct btf_type *t; + const char *tname; + u32 i; + + for (i = 1; i <= btf->nr_types; i++) { + t = btf->types[i]; + if (BTF_INFO_KIND(t->info) != kind) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, name)) + return i; + } + + return -ENOENT; +} + +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t)) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + if (res_id) + *res_id = id; + + return t; +} + +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, id, NULL); + if (!btf_type_is_ptr(t)) + return NULL; + + return btf_type_skip_modifiers(btf, t->type, res_id); +} + +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *ptype; + + ptype = btf_type_resolve_ptr(btf, id, res_id); + if (ptype && btf_type_is_func_proto(ptype)) + return ptype; + + return NULL; +} + /* Types that act only as a source, not sink or intermediate * type when resolving. */ @@ -446,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u16 btf_type_vlen(const struct btf_type *t) -{ - return BTF_INFO_VLEN(t->info); -} - -static bool btf_type_kflag(const struct btf_type *t) -{ - return BTF_INFO_KFLAG(t->info); -} - -static u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) - : member->offset; -} - -static u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) - : 0; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -480,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t) return (const struct btf_array *)(t + 1); } -static const struct btf_member *btf_type_member(const struct btf_type *t) -{ - return (const struct btf_member *)(t + 1); -} - static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); @@ -1057,7 +1087,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -static const struct btf_type * +const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *total_nelems) @@ -1111,8 +1141,10 @@ resolved: return ERR_PTR(-EINVAL); *type_size = nelems * size; - *total_nelems = nelems; - *elem_type = type; + if (total_nelems) + *total_nelems = nelems; + if (elem_type) + *elem_type = type; return array_type ? : type; } @@ -1826,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) { - t = btf_type_id_resolve(btf, &type_id); + if (btf->resolved_ids) + t = btf_type_id_resolve(btf, &type_id); + else + t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } @@ -2621,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); + if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { + btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } @@ -3476,7 +3511,8 @@ static u8 bpf_ctx_convert_map[] = { static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type) + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *conv_struct; const struct btf_type *ctx_struct; @@ -3497,12 +3533,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - bpf_log(log, "BPF program ctx type is not a struct\n"); + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } /* prog_type is valid bpf program type. No need for bounds check. */ @@ -3535,11 +3572,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, - enum bpf_prog_type prog_type) + enum bpf_prog_type prog_type, + int arg) { const struct btf_member *prog_ctx_type, *kern_ctx_type; - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); if (!prog_ctx_type) return -ENOENT; kern_ctx_type = prog_ctx_type + 1; @@ -3605,6 +3643,8 @@ struct btf *btf_parse_vmlinux(void) goto errout; } + bpf_struct_ops_init(btf); + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -3677,7 +3717,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t)) + if (btf_type_is_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -3697,10 +3737,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; - info->btf_id = t->type; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -3708,10 +3747,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } } + + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ - while (btf_type_is_modifier(t)) + while (btf_type_is_modifier(t)) { + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3737,23 +3780,57 @@ int btf_struct_access(struct bpf_verifier_log *log, again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); if (!btf_type_is_struct(t)) { - bpf_log(log, "Type '%s' is not a struct", tname); + bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } - for_each_member(i, t, member) { - if (btf_member_bitfield_size(t, member)) - /* bitfields are not supported yet */ - continue; + if (off + size > t->size) { + bpf_log(log, "access beyond struct %s at off %u size %u\n", + tname, off, size); + return -EACCES; + } + for_each_member(i, t, member) { /* offset of the field in bytes */ moff = btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; + + if (btf_member_bitfield_size(t, member)) { + u32 end_bit = btf_member_bit_offset(t, member) + + btf_member_bitfield_size(t, member); + + /* off <= moff instead of off == moff because clang + * does not generate a BTF member for anonymous + * bitfield like the ":16" here: + * struct { + * int :16; + * int x:8; + * }; + */ + if (off <= moff && + BITS_ROUNDUP_BYTES(end_bit) <= off + size) + return SCALAR_VALUE; + + /* off may be accessing a following member + * + * or + * + * Doing partial access at either end of this + * bitfield. Continue on this case also to + * treat it as not accessing this bitfield + * and eventually error out as field not + * found to keep it simple. + * It could be relaxed if there was a legit + * partial access case later. + */ + continue; + } + /* In case of "off" is pointing to holes of a struct */ if (off < moff) - continue; + break; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); @@ -4043,11 +4120,158 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +/* Compare BTFs of two functions assuming only scalars and pointers to context. + * t1 points to BTF_KIND_FUNC in btf1 + * t2 points to BTF_KIND_FUNC in btf2 + * Returns: + * EINVAL - function prototype mismatch + * EFAULT - verifier bug + * 0 - 99% match. The last 1% is validated by the verifier. + */ +int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) +{ + const struct btf_param *args1, *args2; + const char *fn1, *fn2, *s1, *s2; + u32 nargs1, nargs2, i; + + fn1 = btf_name_by_offset(btf1, t1->name_off); + fn2 = btf_name_by_offset(btf2, t2->name_off); + + if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn1); + return -EINVAL; + } + if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn2); + return -EINVAL; + } + + t1 = btf_type_by_id(btf1, t1->type); + if (!t1 || !btf_type_is_func_proto(t1)) + return -EFAULT; + t2 = btf_type_by_id(btf2, t2->type); + if (!t2 || !btf_type_is_func_proto(t2)) + return -EFAULT; + + args1 = (const struct btf_param *)(t1 + 1); + nargs1 = btf_type_vlen(t1); + args2 = (const struct btf_param *)(t2 + 1); + nargs2 = btf_type_vlen(t2); + + if (nargs1 != nargs2) { + bpf_log(log, "%s() has %d args while %s() has %d args\n", + fn1, nargs1, fn2, nargs2); + return -EINVAL; + } + + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (t1->info != t2->info) { + bpf_log(log, + "Return type %s of %s() doesn't match type %s of %s()\n", + btf_type_str(t1), fn1, + btf_type_str(t2), fn2); + return -EINVAL; + } + + for (i = 0; i < nargs1; i++) { + t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); + t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); + + if (t1->info != t2->info) { + bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", + i, fn1, btf_type_str(t1), + fn2, btf_type_str(t2)); + return -EINVAL; + } + if (btf_type_has_size(t1) && t1->size != t2->size) { + bpf_log(log, + "arg%d in %s() has size %d while %s() has %d\n", + i, fn1, t1->size, + fn2, t2->size); + return -EINVAL; + } + + /* global functions are validated with scalars and pointers + * to context only. And only global functions can be replaced. + * Hence type check only those types. + */ + if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + continue; + if (!btf_type_is_ptr(t1)) { + bpf_log(log, + "arg%d in %s() has unrecognized type\n", + i, fn1); + return -EINVAL; + } + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (!btf_type_is_struct(t1)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn1); + return -EINVAL; + } + if (!btf_type_is_struct(t2)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn2); + return -EINVAL; + } + /* This is an optional check to make program writing easier. + * Compare names of structs and report an error to the user. + * btf_prepare_func_args() already checked that t2 struct + * is a context type. btf_prepare_func_args() will check + * later that t1 struct is a context type as well. + */ + s1 = btf_name_by_offset(btf1, t1->name_off); + s2 = btf_name_by_offset(btf2, t2->name_off); + if (strcmp(s1, s2)) { + bpf_log(log, + "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", + i, fn1, s1, fn2, s2); + return -EINVAL; + } + } + return 0; +} + +/* Compare BTFs of given program with BTF of target program */ +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf2, const struct btf_type *t2) +{ + struct btf *btf1 = prog->aux->btf; + const struct btf_type *t1; + u32 btf_id = 0; + + if (!prog->aux->func_info) { + bpf_log(&env->log, "Program extension requires BTF\n"); + return -EINVAL; + } + + btf_id = prog->aux->func_info[0].type_id; + if (!btf_id) + return -EFAULT; + + t1 = btf_type_by_id(btf1, btf_id); + if (!t1 || !btf_type_is_func(t1)) + return -EFAULT; + + return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); +} + +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) { - struct bpf_verifier_state *st = env->cur_state; - struct bpf_func_state *func = st->frame[st->curframe]; - struct bpf_reg_state *reg = func->regs; struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; @@ -4057,27 +4281,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) const char *tname; if (!prog->aux->func_info) - return 0; + return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) - return 0; + return -EFAULT; if (prog->aux->func_info_aux[subprog].unreliable) - return 0; + return -EINVAL; t = btf_type_by_id(btf, btf_id); if (!t || !btf_type_is_func(t)) { - bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); - return -EINVAL; + return -EFAULT; } tname = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid type of func %s\n", tname); - return -EINVAL; + bpf_log(log, "Invalid BTF of func %s\n", tname); + return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); @@ -4103,25 +4330,130 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) bpf_log(log, "R%d is not a pointer\n", i + 1); goto out; } - /* If program is passing PTR_TO_CTX into subprogram - * check that BTF type matches. + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. */ - if (reg[i + 1].type == PTR_TO_CTX && - !btf_get_prog_ctx_type(log, btf, t, prog->type)) - goto out; - /* All other pointers are ok */ - continue; + if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + if (reg[i + 1].type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + if (check_ctx_reg(env, ®[i + 1], i + 1)) + goto out; + continue; + } } - bpf_log(log, "Unrecognized argument type %s\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } return 0; out: - /* LLVM optimizations can remove arguments from static functions. */ - bpf_log(log, - "Type info disagrees with actual arguments due to compiler optimizations\n"); + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ prog->aux->func_info_aux[subprog].unreliable = true; + return -EINVAL; +} + +/* Convert BTF of a function into bpf_reg_state if possible + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - cannot convert BTF. + * 0 - Successfully converted BTF into bpf_reg_state + * (either PTR_TO_CTX or SCALAR_VALUE). + */ +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) +{ + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info || + prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + bpf_log(log, "Verifier bug\n"); + return -EFAULT; + } + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) { + bpf_log(log, "Global functions need valid BTF\n"); + return -EFAULT; + } + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", + subprog); + return -EFAULT; + } + tname = btf_name_by_offset(btf, t->name_off); + + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "Validating %s() func#%d...\n", + tname, subprog); + + if (prog->aux->func_info_aux[subprog].unreliable) { + bpf_log(log, "Verifier bug in function %s()\n", tname); + return -EFAULT; + } + if (prog_type == BPF_PROG_TYPE_EXT) + prog_type = prog->aux->linked_prog->type; + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of function %s()\n", tname); + return -EFAULT; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", + tname, nargs); + return -EINVAL; + } + /* check that function returns int */ + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + bpf_log(log, + "Global function %s() doesn't return scalar. Only those are supported.\n", + tname); + return -EINVAL; + } + /* Convert BTF function arguments into verifier types. + * Only PTR_TO_CTX and SCALAR are supported atm. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + reg[i + 1].type = SCALAR_VALUE; + continue; + } + if (btf_type_is_ptr(t) && + btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg[i + 1].type = PTR_TO_CTX; + continue; + } + bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + return -EINVAL; + } return 0; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ee871d4a51f7..9a500fadbef5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp) */ static void cgroup_bpf_release(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, - bpf.release_work); + struct cgroup *p, *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_unlock(&cgroup_mutex); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -198,6 +201,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array *arrays[NR] = {}; + struct cgroup *p; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -205,6 +209,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) if (ret) return ret; + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_get(p); + for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..973a20d49749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2137,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; +const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index da9c832fc5c8..de630f980282 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,13 +53,11 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 -struct bpf_dtab_netdev; - -struct xdp_bulk_queue { +struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; + struct net_device *dev; struct net_device *dev_rx; - struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -67,9 +65,8 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; - struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; - unsigned int idx; /* keep track of map index for tracepoint */ + unsigned int idx; }; struct bpf_dtab { @@ -84,7 +81,7 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); +static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) +static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { - struct bpf_dtab_netdev *obj = bq->obj; - struct net_device *dev = obj->dev; + struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; @@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, - sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; @@ -364,17 +357,17 @@ error: goto out; } -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(void) +void __dev_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq, *tmp; + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); + struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) @@ -401,12 +394,11 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, +static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) - { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -426,10 +418,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct net_device *dev = dst->dev; struct xdp_frame *xdpf; int err; @@ -444,7 +435,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf, dev_rx); + return bq_enqueue(dev, xdpf, dev_rx); +} + +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return __xdp_enqueue(dev, xdp, dev_rx); +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + + return __xdp_enqueue(dev, xdp, dev_rx); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -483,7 +488,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -538,30 +542,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, u32 ifindex, unsigned int idx) { - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev; - struct xdp_bulk_queue *bq; - int cpu; - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { - free_percpu(dev->bulkq); kfree(dev); return ERR_PTR(-EINVAL); } @@ -721,9 +710,23 @@ static int dev_map_notification(struct notifier_block *notifier, { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; - int i; + int i, cpu; switch (event) { + case NETDEV_REGISTER: + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) + break; + + /* will be freed in free_netdev() */ + netdev->xdp_bulkq = + __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), + sizeof(void *), GFP_ATOMIC); + if (!netdev->xdp_bulkq) + return NOTIFY_BAD; + + for_each_possible_cpu(cpu) + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete @@ -771,7 +774,7 @@ static int __init dev_map_init(void) register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 22066a62c8c9..2d182c4ee9d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,6 +17,16 @@ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +#define BATCH_OPS(_name) \ + .map_lookup_batch = \ + _name##_map_lookup_batch, \ + .map_lookup_and_delete_batch = \ + _name##_map_lookup_and_delete_batch, \ + .map_update_batch = \ + generic_map_update_batch, \ + .map_delete_batch = \ + generic_map_delete_batch + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int +__htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete, bool is_lru_map, + bool is_percpu) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + u32 batch, max_count, size, bucket_size; + u64 elem_map_flags, map_flags; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + unsigned long flags; + struct htab_elem *l; + struct bucket *b; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + if ((elem_map_flags & ~BPF_F_LOCK) || + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return -EINVAL; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + batch = 0; + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) + return -EFAULT; + + if (batch >= htab->n_buckets) + return -ENOENT; + + key_size = htab->map.key_size; + roundup_key_size = round_up(htab->map.key_size, 8); + value_size = htab->map.value_size; + size = round_up(value_size, 8); + if (is_percpu) + value_size = size * num_possible_cpus(); + total = 0; + /* while experimenting with hash tables with sizes ranging from 10 to + * 1000, it was observed that a bucket can have upto 5 entries. + */ + bucket_size = 5; + +alloc: + /* We cannot do copy_from_user or copy_to_user inside + * the rcu_read_lock. Allocate enough space here. + */ + keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + if (!keys || !values) { + ret = -ENOMEM; + goto after_loop; + } + +again: + preempt_disable(); + this_cpu_inc(bpf_prog_active); + rcu_read_lock(); +again_nocopy: + dst_key = keys; + dst_val = values; + b = &htab->buckets[batch]; + head = &b->head; + raw_spin_lock_irqsave(&b->lock, flags); + + bucket_cnt = 0; + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + bucket_cnt++; + + if (bucket_cnt > (max_count - total)) { + if (total == 0) + ret = -ENOSPC; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + goto after_loop; + } + + if (bucket_cnt > bucket_size) { + bucket_size = bucket_cnt; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + kvfree(keys); + kvfree(values); + goto alloc; + } + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + memcpy(dst_key, l->key, key_size); + + if (is_percpu) { + int off = 0, cpu; + void __percpu *pptr; + + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(dst_val + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + } else { + value = l->key + roundup_key_size; + if (elem_map_flags & BPF_F_LOCK) + copy_map_value_locked(map, dst_val, value, + true); + else + copy_map_value(map, dst_val, value); + check_and_init_map_lock(map, dst_val); + } + if (do_delete) { + hlist_nulls_del_rcu(&l->hash_node); + if (is_lru_map) + bpf_lru_push_free(&htab->lru, &l->lru_node); + else + free_htab_elem(htab, l); + } + dst_key += key_size; + dst_val += value_size; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + /* If we are not copying data, we can go to next bucket and avoid + * unlocking the rcu. + */ + if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { + batch++; + goto again_nocopy; + } + + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, + key_size * bucket_cnt) || + copy_to_user(uvalues + total * value_size, values, + value_size * bucket_cnt))) { + ret = -EFAULT; + goto after_loop; + } + + total += bucket_cnt; + batch++; + if (batch >= htab->n_buckets) { + ret = -ENOENT; + goto after_loop; + } + goto again; + +after_loop: + if (ret == -EFAULT) + goto out; + + /* copy # of entries and next batch */ + ubatch = u64_to_user_ptr(attr->batch.out_batch); + if (copy_to_user(ubatch, &batch, sizeof(batch)) || + put_user(total, &uattr->batch.count)) + ret = -EFAULT; + +out: + kvfree(keys); + kvfree(values); + return ret; +} + +static int +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, true); +} + +static int +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, true); +} + +static int +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, false); +} + +static int +htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, false); +} + +static int +htab_lru_percpu_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, true); +} + +static int +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, true); +} + +static int +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, false); +} + +static int +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, false); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab), }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab_lru), }; /* Called from eBPF program */ @@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_percpu), }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_lru_percpu), }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cada974c9f4e..d8b7b110a1c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ #include <linux/uidgid.h> #include <linux/filter.h> #include <linux/ctype.h> +#include <linux/jiffies.h> #include "../../lib/kstrtox.h" @@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, preempt_enable(); } +BPF_CALL_0(bpf_jiffies64) +{ + return get_jiffies_64(); +} + +const struct bpf_func_proto bpf_jiffies64_proto = { + .func = bpf_jiffies64, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index ecf42bec38c0..e11059b99f18 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -380,7 +380,7 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; -static int bpf_obj_do_pin(const struct filename *pathname, void *raw, +static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -389,7 +389,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, umode_t mode; int ret; - dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + dentry = user_path_create(AT_FDCWD, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -422,30 +422,22 @@ out: int bpf_obj_pin_user(u32 ufd, const char __user *pathname) { - struct filename *pname; enum bpf_type type; void *raw; int ret; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - raw = bpf_fd_probe_obj(ufd, &type); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + if (IS_ERR(raw)) + return PTR_ERR(raw); - ret = bpf_obj_do_pin(pname, raw, type); + ret = bpf_obj_do_pin(pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); -out: - putname(pname); + return ret; } -static void *bpf_obj_do_get(const struct filename *pathname, +static void *bpf_obj_do_get(const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -453,7 +445,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, void *raw; int ret; - ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -480,36 +472,27 @@ out: int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; - struct filename *pname; - int ret = -ENOENT; int f_flags; void *raw; + int ret; f_flags = bpf_get_file_flag(flags); if (f_flags < 0) return f_flags; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - - raw = bpf_obj_do_get(pname, &type, f_flags); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + raw = bpf_obj_do_get(pathname, &type, f_flags); + if (IS_ERR(raw)) + return PTR_ERR(raw); if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else - goto out; + return -ENOENT; if (ret < 0) bpf_any_put(raw, type); -out: - putname(pname); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5e9366b33f0f..b3c48d1533cb 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 81ee8595dfee..a91ad518c050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -129,6 +129,152 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static u32 bpf_map_value_size(struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + return sizeof(u32); + else + return map->value_size; +} + +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + +static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, + void *value, __u64 flags) +{ + int err; + + /* Need to create a kthread, thus must support schedule */ + if (bpf_map_is_dev_bound(map)) { + return bpf_map_offload_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + return map->ops->map_update_elem(map, key, value, flags); + } else if (IS_FD_PROG_ARRAY(map)) { + return bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + } + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + flags); + } else if (IS_FD_ARRAY(map)) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + +static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, + __u64 flags) +{ + void *ptr; + int err; + + if (bpf_map_is_dev_bound(map)) + return bpf_map_offload_lookup_elem(map, key, value); + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + } else { + rcu_read_lock(); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; + if (flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); + } + rcu_read_unlock(); + } + + this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -628,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +788,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +818,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -816,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; @@ -848,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else if (IS_FD_MAP(map)) - value_size = sizeof(u32); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - goto done; - } - - preempt_disable(); - this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_lookup_elem(map, key, value); - } else if (IS_FD_HASH(map)) { - err = bpf_fd_htab_map_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - err = bpf_fd_reuseport_array_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_peek_elem(map, value); - } else { - rcu_read_lock(); - if (map->ops->map_lookup_elem_sys_only) - ptr = map->ops->map_lookup_elem_sys_only(map, key); - else - ptr = map->ops->map_lookup_elem(map, key); - if (IS_ERR(ptr)) { - err = PTR_ERR(ptr); - } else if (!ptr) { - err = -ENOENT; - } else { - err = 0; - if (attr->flags & BPF_F_LOCK) - /* lock 'ptr' and copy everything but lock */ - copy_map_value_locked(map, value, ptr, true); - else - copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); - } - rcu_read_unlock(); - } - this_cpu_dec(bpf_prog_active); - preempt_enable(); - -done: + err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; @@ -932,16 +1031,6 @@ err_put: return err; } -static void maybe_wait_bpf_programs(struct bpf_map *map) -{ - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. - */ - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); -} #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags @@ -997,60 +1086,8 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_update_elem(map, key, value, attr->flags); - goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { - err = map->ops->map_update_elem(map, key, value, attr->flags); - goto out; - } else if (IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - goto out; - } + err = bpf_map_update_value(map, f, key, value, attr->flags); - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_update(map, key, value, - attr->flags); - } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - /* rcu_read_lock() is not needed */ - err = bpf_fd_reuseport_array_update_elem(map, key, value, - attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_push_elem(map, value, attr->flags); - } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); - } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); - maybe_wait_bpf_programs(map); -out: free_value: kfree(value); free_key: @@ -1092,7 +1129,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -1179,6 +1218,220 @@ err_put: return err; } +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 cp, max_count; + int err = 0; + void *key; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + max_count = attr->batch.count; + if (!max_count) + return 0; + + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + + for (cp = 0; cp < max_count; cp++) { + err = -EFAULT; + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size)) + break; + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + break; + } + + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + if (err) + break; + } + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + + kfree(key); + return err; +} + +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 value_size, cp, max_count; + int ufd = attr->map_fd; + void *key, *value; + struct fd f; + int err = 0; + + f = fdget(ufd); + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) { + kfree(key); + return -ENOMEM; + } + + for (cp = 0; cp < max_count; cp++) { + err = -EFAULT; + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size) || + copy_from_user(value, values + cp * value_size, value_size)) + break; + + err = bpf_map_update_value(map, f, key, value, + attr->batch.elem_flags); + + if (err) + break; + } + + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + + kfree(value); + kfree(key); + return err; +} + +#define MAP_LOOKUP_RETRIES 3 + +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + void *buf, *buf_prevkey, *prev_key, *key, *value; + int err, retry = MAP_LOOKUP_RETRIES; + u32 value_size, cp, max_count; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) + return -EINVAL; + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!buf_prevkey) + return -ENOMEM; + + buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + if (!buf) { + kvfree(buf_prevkey); + return -ENOMEM; + } + + err = -EFAULT; + prev_key = NULL; + if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) + goto free_buf; + key = buf; + value = key + map->key_size; + if (ubatch) + prev_key = buf_prevkey; + + for (cp = 0; cp < max_count;) { + rcu_read_lock(); + err = map->ops->map_get_next_key(map, prev_key, key); + rcu_read_unlock(); + if (err) + break; + err = bpf_map_copy_value(map, key, value, + attr->batch.elem_flags); + + if (err == -ENOENT) { + if (retry) { + retry--; + continue; + } + err = -EINTR; + break; + } + + if (err) + goto free_buf; + + if (copy_to_user(keys + cp * map->key_size, key, + map->key_size)) { + err = -EFAULT; + goto free_buf; + } + if (copy_to_user(values + cp * value_size, value, value_size)) { + err = -EFAULT; + goto free_buf; + } + + if (!prev_key) + prev_key = buf_prevkey; + + swap(prev_key, key); + retry = MAP_LOOKUP_RETRIES; + cp++; + } + + if (err == -EFAULT) + goto free_buf; + + if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || + (cp && copy_to_user(uobatch, prev_key, map->key_size)))) + err = -EFAULT; + +free_buf: + kfree(buf_prevkey); + kfree(buf); + return err; +} + #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value static int map_lookup_and_delete_elem(union bpf_attr *attr) @@ -1672,17 +1925,24 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id, u32 prog_fd) { - switch (prog_type) { - case BPF_PROG_TYPE_TRACING: + if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; - break; - default: - if (btf_id || prog_fd) + + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_EXT: + break; + default: return -EINVAL; - break; + } } + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + prog_type != BPF_PROG_TYPE_EXT) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { @@ -1723,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_EXT: + if (expected_attach_type) + return -EINVAL; + /* fallthrough */ default: return 0; } @@ -1925,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) int tr_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT) { + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } @@ -1992,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_EXT && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_TRACING) { + if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. @@ -2817,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); @@ -3029,6 +3297,61 @@ out: return err; } +#define BPF_MAP_BATCH_LAST_FIELD batch.flags + +#define BPF_DO_BATCH(fn) \ + do { \ + if (!fn) { \ + err = -ENOTSUPP; \ + goto err_put; \ + } \ + err = fn(map, attr, uattr); \ + } while (0) + +static int bpf_map_do_batch(const union bpf_attr *attr, + union bpf_attr __user *uattr, + int cmd) +{ + struct bpf_map *map; + int err, ufd; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_BATCH)) + return -EINVAL; + + ufd = attr->batch.map_fd; + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if ((cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && + !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + + if (cmd != BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + if (cmd == BPF_MAP_LOOKUP_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); + else if (cmd == BPF_MAP_UPDATE_BATCH) + BPF_DO_BATCH(map->ops->map_update_batch); + else + BPF_DO_BATCH(map->ops->map_delete_batch); + +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3126,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; + case BPF_MAP_LOOKUP_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + break; + case BPF_MAP_LOOKUP_AND_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, + BPF_MAP_LOOKUP_AND_DELETE_BATCH); + break; + case BPF_MAP_UPDATE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + break; + case BPF_MAP_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index ca52b9642943..d4f335a9a899 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } -struct tnum tnum_arshift(struct tnum a, u8 min_shift) +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ - return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); + if (insn_bitness == 32) + return TNUM((u32)(((s32)a.value) >> min_shift), + (u32)(((s32)a.mask) >> min_shift)); + else + return TNUM((s64)a.value >> min_shift, + (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 505f4e4b31d2..eb64c245052b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,12 @@ #include <linux/filter.h> #include <linux/ftrace.h> +/* dummy _ops. The verifier will operate on target program's ops. */ +const struct bpf_verifier_ops bpf_extension_verifier_ops = { +}; +const struct bpf_prog_ops bpf_extension_prog_ops = { +}; + /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) @@ -160,11 +166,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + /* Though the second half of trampoline page is unused a task could be + * preempted in the middle of the first half of trampoline and two + * updates to trampoline would change the code from underneath the + * preempted task. Hence wait for tasks to voluntarily schedule or go + * to userspace. + */ + synchronize_rcu_tasks(); + + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, tr->func.addr); - if (err) + if (err < 0) goto out; if (tr->selector) @@ -185,8 +200,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; - default: + case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + default: + return BPF_TRAMP_REPLACE; } } @@ -195,12 +212,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) enum bpf_tramp_prog_type kind; struct bpf_trampoline *tr; int err = 0; + int cnt; tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); - if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] - >= BPF_MAX_TRAMP_PROGS) { + if (tr->extension_prog) { + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + err = -EBUSY; + goto out; + } + cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. */ + if (cnt) { + err = -EBUSY; + goto out; + } + tr->extension_prog = prog; + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + prog->bpf_func); + goto out; + } + if (cnt >= BPF_MAX_TRAMP_PROGS) { err = -E2BIG; goto out; } @@ -231,9 +267,17 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); + if (kind == BPF_TRAMP_REPLACE) { + WARN_ON_ONCE(!tr->extension_prog); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + tr->extension_prog->bpf_func, NULL); + tr->extension_prog = NULL; + goto out; + } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(prog->aux->trampoline); +out: mutex_unlock(&tr->mutex); return err; } @@ -250,6 +294,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + /* wait for tasks to get out of trampoline before freeing it */ + synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); @@ -296,7 +342,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) } int __weak -arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f63ae7a370c..1cc945daa9c8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; - - /* 1st arg to a function */ - regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(env, regs, BPF_REG_1); } #define BPF_MAIN_FUNC (-1) @@ -1916,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: return true; default: return false; @@ -2738,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { /* Access to ctx or passing it to a helper is only allowed in * its original, unmodified form. @@ -2858,11 +2855,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; - if (atype != BPF_READ) { - verbose(env, "only read is supported\n"); - return -EACCES; - } - if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -2879,17 +2871,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (env->ops->btf_struct_access) { + ret = env->ops->btf_struct_access(&env->log, t, off, size, + atype, &btf_id); + } else { + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, + &btf_id); + } + if (ret < 0) return ret; - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; + if (atype == BPF_READ) { + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + return 0; } @@ -3945,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void clear_caller_saved_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + int i; + + /* after the call registers r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int i, err, subprog, target_insn; + bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -3973,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + func_info_aux = env->prog->aux->func_info_aux; + if (func_info_aux) + is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (is_global) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", + subprog); + return err; + } else { + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, + "Func#%d is global and valid. Skipping.\n", + subprog); + clear_caller_saved_regs(env, caller->regs); + + /* All global functions return SCALAR_VALUE */ + mark_reg_unknown(env, caller->regs, BPF_REG_0); + + /* continue with next insn after call */ + return 0; + } + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -3999,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call registers r0 - r5 were scratched */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, caller->regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); - } + clear_caller_saved_regs(env, caller->regs); /* only increment it after check_reg_arg() finished */ state->curframe++; - if (btf_check_func_arg_match(env, subprog)) - return -EINVAL; - /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -5049,9 +5089,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + if (insn_bitness == 32) { + dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); + dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); + } else { + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + } + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, + insn_bitness); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. @@ -6264,6 +6311,7 @@ static bool may_access_skb(enum bpf_prog_type type) static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); + static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; @@ -6297,7 +6345,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(env, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; @@ -6316,7 +6364,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (regs[BPF_REG_6].type != PTR_TO_CTX) { + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; @@ -6329,6 +6377,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + if (err < 0) + return err; + /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -6348,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + int err; + + /* The struct_ops func-ptr's return type could be "void" */ + if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + !prog->aux->attach_func_proto->type) + return 0; + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R0 leaks addr as return value\n"); + return -EACCES; + } switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -6738,12 +6812,13 @@ static int check_btf_func(struct bpf_verifier_env *env, /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; goto err_free; } + info_aux[i].linkage = BTF_INFO_VLEN(type->info); prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7723,35 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; - env->prev_linfo = NULL; - - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); - if (!state) - return -ENOMEM; - state->curframe = 0; - state->speculative = false; - state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); - if (!state->frame[0]) { - kfree(state); - return -ENOMEM; - } - env->cur_state = state; - init_func_state(env, state->frame[0], - BPF_MAIN_FUNC /* callsite */, - 0 /* frameno */, - 0 /* subprogno, zero == main subprog */); - - if (btf_check_func_arg_match(env, 0)) - return -EINVAL; - for (;;) { struct bpf_insn *insn; u8 class; @@ -7829,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -8015,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - /* eBPF calling convetion is such that R0 is used - * to return the value from eBPF program. - * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); - return -EACCES; - } - err = check_return_code(env); if (err) return err; @@ -8064,7 +8102,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -8077,7 +8115,6 @@ process_bpf_exit: env->insn_idx++; } - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8137,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + verbose(env, "bpf_struct_ops map cannot be used in prog\n"); + return -EINVAL; + } + return 0; } @@ -8349,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = true; + new_data[i].seen = env->pass_cnt; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -8828,12 +8870,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: - if (type == BPF_WRITE) { + if (type == BPF_READ) { + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; + } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } - insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; continue; default: continue; @@ -9413,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed @@ -9459,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->free_list = NULL; if (!env->explored_states) return; @@ -9472,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->explored_states[i] = NULL; } +} - kvfree(env->explored_states); +/* The verifier is using insn_aux_data[] to store temporary data during + * verification and to store information for passes that run after the + * verification like dead code sanitization. do_check_common() for subprogram N + * may analyze many other subprograms. sanitize_insn_aux_data() clears all + * temporary data after do_check_common() finds that subprogram N cannot be + * verified independently. pass_cnt counts the number of times + * do_check_common() was run and insn->aux->seen tells the pass number + * insn_aux_data was touched. These variables are compared to clear temporary + * data from failed pass. For testing and experiments do_check_common() can be + * run multiple times even when prior attempt to verify is unsuccessful. + */ +static void sanitize_insn_aux_data(struct bpf_verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_insn_aux_data *aux; + int i, class; + + for (i = 0; i < env->prog->len; i++) { + class = BPF_CLASS(insn[i].code); + if (class != BPF_LDX && class != BPF_STX) + continue; + aux = &env->insn_aux_data[i]; + if (aux->seen != env->pass_cnt) + continue; + memset(aux, 0, offsetof(typeof(*aux), orig_idx)); + } +} + +static int do_check_common(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *state; + struct bpf_reg_state *regs; + int ret, i; + + env->prev_linfo = NULL; + env->pass_cnt++; + + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + state->curframe = 0; + state->speculative = false; + state->branches = 1; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + subprog); + + regs = state->frame[state->curframe]->regs; + if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { + ret = btf_prepare_func_args(env, subprog, regs); + if (ret) + goto out; + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (regs[i].type == PTR_TO_CTX) + mark_reg_known_zero(env, regs, i); + else if (regs[i].type == SCALAR_VALUE) + mark_reg_unknown(env, regs, i); + } + } else { + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; + mark_reg_known_zero(env, regs, BPF_REG_1); + ret = btf_check_func_arg_match(env, subprog, regs); + if (ret == -EFAULT) + /* unlikely verifier bug. abort. + * ret == 0 and ret < 0 are sadly acceptable for + * main() function due to backward compatibility. + * Like socket filter program may be written as: + * int bpf_prog(struct pt_regs *ctx) + * and never dereference that ctx in the program. + * 'struct pt_regs' is a type mismatch for socket + * filter that should be using 'struct __sk_buff'. + */ + goto out; + } + + ret = do_check(env); +out: + /* check for NULL is necessary, since cur_state can be freed inside + * do_check() under memory pressure. + */ + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } + while (!pop_stack(env, NULL, NULL)); + free_states(env); + if (ret) + /* clean aux data in case subprog was rejected */ + sanitize_insn_aux_data(env); + return ret; } +/* Verify all global functions in a BPF program one by one based on their BTF. + * All global functions must pass verification. Otherwise the whole program is rejected. + * Consider: + * int bar(int); + * int foo(int f) + * { + * return bar(f); + * } + * int bar(int b) + * { + * ... + * } + * foo() will be verified first for R1=any_scalar_value. During verification it + * will be assumed that bar() already verified successfully and call to bar() + * from foo() will be checked for type match only. Later bar() will be verified + * independently to check that it's safe for R1=any_scalar_value. + */ +static int do_check_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + int i, ret; + + if (!aux->func_info) + return 0; + + for (i = 1; i < env->subprog_cnt; i++) { + if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + continue; + env->insn_idx = env->subprog_info[i].start; + WARN_ON_ONCE(env->insn_idx == 0); + ret = do_check_common(env, i); + if (ret) { + return ret; + } else if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, + "Func#%d is safe for any args that match its prototype\n", + i); + } + } + return 0; +} + +static int do_check_main(struct bpf_verifier_env *env) +{ + int ret; + + env->insn_idx = 0; + ret = do_check_common(env, 0); + if (!ret) + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + return ret; +} + + static void print_verification_stats(struct bpf_verifier_env *env) { int i; @@ -9501,9 +9723,62 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_struct_ops_btf_id(struct bpf_verifier_env *env) +{ + const struct btf_type *t, *func_proto; + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + struct bpf_prog *prog = env->prog; + u32 btf_id, member_idx; + const char *mname; + + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) { + verbose(env, "attach_btf_id %u is not a supported struct\n", + btf_id); + return -ENOTSUPP; + } + + t = st_ops->type; + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) { + verbose(env, "attach to invalid member idx %u of struct %s\n", + member_idx, st_ops->name); + return -EINVAL; + } + + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type, + NULL); + if (!func_proto) { + verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", + mname, member_idx, st_ops->name); + return -EINVAL; + } + + if (st_ops->check_member) { + int err = st_ops->check_member(t, member); + + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + } + + prog->aux->attach_func_proto = func_proto; + prog->aux->attach_func_name = mname; + env->ops = st_ops->verifier_ops; + + return 0; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; @@ -9516,7 +9791,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; - if (prog->type != BPF_PROG_TYPE_TRACING) + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + + if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) return 0; if (!btf_id) { @@ -9552,8 +9830,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; + if (prog_extension) { + if (conservative) { + verbose(env, + "Cannot replace static functions\n"); + return -EINVAL; + } + if (!prog->jit_requested) { + verbose(env, + "Extension programs should be JITed\n"); + return -EINVAL; + } + env->ops = bpf_verifier_ops[tgt_prog->type]; + } + if (!tgt_prog->jited) { + verbose(env, "Can attach to only JITed progs\n"); + return -EINVAL; + } + if (tgt_prog->type == prog->type) { + /* Cannot fentry/fexit another fentry/fexit program. + * Cannot attach program extension to another extension. + * It's ok to attach fentry/fexit to extension program. + */ + verbose(env, "Cannot recursively attach\n"); + return -EINVAL; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING && + prog_extension && + (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + /* Program extensions can extend all program types + * except fentry/fexit. The reason is the following. + * The fentry/fexit programs are used for performance + * analysis, stats and can be attached to any program + * type except themselves. When extension program is + * replacing XDP function it is necessary to allow + * performance analysis of all functions. Both original + * XDP program and its program extension. Hence + * attaching fentry/fexit to BPF_PROG_TYPE_EXT is + * allowed. If extending of fentry/fexit was allowed it + * would be possible to create long call chain + * fentry->extension->fentry->extension beyond + * reasonable stack size. Hence extending fentry is not + * allowed. + */ + verbose(env, "Cannot extend fentry/fexit\n"); + return -EINVAL; + } key = ((u64)aux->id) << 32 | btf_id; } else { + if (prog_extension) { + verbose(env, "Cannot replace kernel functions\n"); + return -EINVAL; + } key = btf_id; } @@ -9591,6 +9920,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + default: + if (!prog_extension) + return -EINVAL; + /* fallthrough */ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -9598,6 +9931,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } + if (prog_extension && + btf_check_type_match(env, prog, btf, t)) + return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; @@ -9621,18 +9957,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (ret < 0) goto out; if (tgt_prog) { - if (!tgt_prog->jited) { - /* for now */ - verbose(env, "Can trace only JITed BPF progs\n"); - ret = -EINVAL; - goto out; - } - if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { - /* prevent cycles */ - verbose(env, "Cannot recursively attach\n"); - ret = -EINVAL; - goto out; - } if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -9654,8 +9978,6 @@ out: if (ret) bpf_trampoline_put(tr); return ret; - default: - return -EINVAL; } } @@ -9725,10 +10047,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; @@ -9765,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } + ret = do_check_subprogs(env); + ret = ret ?: do_check_main(env); if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); + kvfree(env->explored_states); if (ret == 0) ret = check_max_stack_depth(env); diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..4dc279ed3b2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1909,6 +1909,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } EXPORT_SYMBOL(__cpuhp_remove_state); +#ifdef CONFIG_HOTPLUG_SMT +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +int cpuhp_smt_enable(void) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } + cpu_maps_update_done(); + return ret; +} +#endif + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) @@ -2063,77 +2135,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static void cpuhp_offline_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = true; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); -} - -static void cpuhp_online_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = false; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_ONLINE); -} - -int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - for_each_online_cpu(cpu) { - if (topology_is_primary_thread(cpu)) - continue; - ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); - if (ret) - break; - /* - * As this needs to hold the cpu maps lock it's impossible - * to call device_offline() because that ends up calling - * cpu_down() which takes cpu maps lock. cpu maps lock - * needs to be held as this might race against in kernel - * abusers of the hotplug machinery (thermal management). - * - * So nothing would update device:offline state. That would - * leave the sysfs entry stale and prevent onlining after - * smt control has been changed to 'off' again. This is - * called under the sysfs hotplug lock, so it is properly - * serialized against the regular offline usage. - */ - cpuhp_offline_cpu_device(cpu); - } - if (!ret) - cpu_smt_control = ctrlval; - cpu_maps_update_done(); - return ret; -} - -int cpuhp_smt_enable(void) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - cpu_smt_control = CPU_SMT_ENABLED; - for_each_present_cpu(cpu) { - /* Skip online CPUs and CPUs on offline nodes */ - if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) - continue; - ret = _cpu_up(cpu, 0, CPUHP_ONLINE); - if (ret) - break; - /* See comment in cpuhp_smt_disable() */ - cpuhp_online_cpu_device(cpu); - } - cpu_maps_update_done(); - return ret; -} - - static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) diff --git a/kernel/cred.c b/kernel/cred.c index c0a4c12d38b2..809a985b1793 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk) put_cred(cred); #ifdef CONFIG_KEYS_REQUEST_CACHE - key_put(current->cached_requested_key); - current->cached_requested_key = NULL; + key_put(tsk->cached_requested_key); + tsk->cached_requested_key = NULL; #endif } @@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void) new->magic = CRED_MAGIC; #endif - if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; @@ -282,7 +282,7 @@ struct cred *prepare_creds(void) new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; validate_creds(new); return new; @@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) #ifdef CONFIG_SECURITY new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); diff --git a/kernel/events/core.c b/kernel/events/core.c index a1f8bde19b56..2173c23c25b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; goto err_locked; + } /* * Must be under the same ctx::mutex as perf_install_in_context(), diff --git a/kernel/exit.c b/kernel/exit.c index bcbd59888e67..2833ffb0c211 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -517,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father, } write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: father->exit_code); - } list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); @@ -766,6 +762,14 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..080809560072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2578,6 +2578,16 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #endif #ifdef __ARCH_WANT_SYS_CLONE3 + +/* + * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from + * the registers containing the syscall arguments for clone. This doesn't work + * with clone3 since the TLS value is passed in clone_args instead. + */ +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +#error clone3 requires copy_thread_tls support in arch +#endif + noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) diff --git a/kernel/futex.c b/kernel/futex.c index 03c518e9747e..0cf84c8664f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1178,6 +1178,7 @@ out_error: /** * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32282e7112d3..32406ef0d6a2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void) struct lock_trace *trace, *t2; struct hlist_head *hash_head; u32 hash; - unsigned int max_entries; + int max_entries; BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); @@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void) trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - LOCK_TRACE_SIZE_IN_LONGS; - trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - - LOCK_TRACE_SIZE_IN_LONGS - 1) { + if (max_entries <= 0) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void) return NULL; } + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); hash = jhash(trace->entries, trace->nr_entries * sizeof(trace->entries[0]), 0); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44e68761f432..0d9b6be9ecc8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1226,8 +1226,8 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if ((wstate == WRITER_HANDOFF) && - (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + if (wstate == WRITER_HANDOFF && + rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 26b9168321e7..d65f2d5ab694 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1147,24 +1147,24 @@ void free_basic_memory_bitmaps(void) void clear_free_pages(void) { -#ifdef CONFIG_PAGE_POISONING_ZERO struct memory_bitmap *bm = free_pages_map; unsigned long pfn; if (WARN_ON(!(free_pages_map))) return; - memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); - while (pfn != BM_END_OF_MAP) { - if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); - + if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); } - memory_bm_position_reset(bm); - pr_info("free pages cleared after restore\n"); -#endif /* PAGE_POISONING_ZERO */ } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cb9ddcc08119..43d6179508d6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, + unsigned int mode) { + int ret; + if (mode & PTRACE_MODE_NOAUDIT) - return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); else - return has_ns_capability(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); + + return ret == 0; } /* Returns 0 on success, -errno on denial. */ @@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user_ns, mode)) + if (ptrace_has_cap(cred, tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -340,7 +345,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) + !ptrace_has_cap(cred, mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/kernel/rseq.c b/kernel/rseq.c index 27c48eb7de40..a4f86a9d6937 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int ret; if (flags & RSEQ_FLAG_UNREGISTER) { + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 12d2227e5786..b6ea3dcb57bf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1026,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, struct seccomp_notif unotif; ssize_t ret; + /* Verify that we're not given garbage to keep struct extensible. */ + ret = check_zeroed_user(buf, sizeof(unotif)); + if (ret < 0) + return ret; + if (!ret) + return -EINVAL; + memset(&unotif, 0, sizeof(unotif)); ret = down_interruptible(&filter->notif->request); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13a0f2e6ebc2..e2ac0e37c4ae 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; - struct taskstats *stats; + struct taskstats *stats_new, *stats; - if (sig->stats || thread_group_empty(tsk)) - goto ret; + /* Pairs with smp_store_release() below. */ + stats = smp_load_acquire(&sig->stats); + if (stats || thread_group_empty(tsk)) + return stats; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; + stats = sig->stats; + if (!stats) { + /* + * Pairs with smp_store_release() above and order the + * kmem_cache_zalloc(). + */ + smp_store_release(&sig->stats, stats_new); + stats = stats_new; + stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; + if (stats_new) + kmem_cache_free(taskstats_cache, stats_new); + + return stats; } /* Send pid data out on exit */ diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..20c65a7d4e3a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -151,6 +151,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); +#endif + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8b192e67aabc..a792d21cac64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. */ - delta = ktime_sub(now, last_jiffies_update); + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; @@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); } do_timer(++ticks); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e5ef4ae9edb5..19e793aa441a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -703,6 +703,7 @@ struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; + enum pid_type type; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry) struct send_signal_irq_work *work; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); } -BPF_CALL_1(bpf_send_signal, u32, sig) +static int bpf_send_signal_common(u32 sig, enum pid_type type) { struct send_signal_irq_work *work = NULL; @@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig) */ work->task = current; work->sig = sig; + work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); + return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_TGID); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_send_signal_thread, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_PID); +} + +static const struct bpf_func_proto bpf_send_signal_thread_proto = { + .func = bpf_send_signal_thread, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; default: return NULL; } diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index a2659735db73..1af321dec0f1 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -96,6 +96,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return 0; } +/* + * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct + * functions. But those archs currently don't support direct functions + * anyway, and ftrace_find_rec_direct() is just a stub for them. + * Define MCOUNT_INSN_SIZE to keep those archs compiling. + */ +#ifndef MCOUNT_INSN_SIZE +/* Make sure this only works without direct calls */ +# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +# error MCOUNT_INSN_SIZE not defined with direct calls enabled +# endif +# define MCOUNT_INSN_SIZE 0 +#endif + int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac99a3500076..9bf1f2cd515e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -526,8 +526,7 @@ static int function_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER - avg = rec->time; - do_div(avg, rec->counter); + avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) goto out; #endif @@ -553,7 +552,8 @@ static int function_stat_show(struct seq_file *m, void *v) * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ddb7e7f5fe8d..5b6ee4aadc26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9420,6 +9420,11 @@ __init static int tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not set tracing clock due to lockdown\n"); + return -EPERM; + } + printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f62de5f43e79..6ac35b9e195d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -116,6 +116,7 @@ struct hist_field { struct ftrace_event_field *field; unsigned long flags; hist_field_fn_t fn; + unsigned int ref; unsigned int size; unsigned int offset; unsigned int is_signed; @@ -1766,11 +1767,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, struct event_trigger_data *test; struct hist_field *hist_field; + lockdep_assert_held(&event_mutex); + hist_field = find_var_field(hist_data, var_name); if (hist_field) return hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -1820,7 +1823,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file, struct event_trigger_data *test; struct hist_field *hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -2423,8 +2428,16 @@ static int contains_operator(char *str) return field_op; } +static void get_hist_field(struct hist_field *hist_field) +{ + hist_field->ref++; +} + static void __destroy_hist_field(struct hist_field *hist_field) { + if (--hist_field->ref > 1) + return; + kfree(hist_field->var.name); kfree(hist_field->name); kfree(hist_field->type); @@ -2466,6 +2479,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field) return NULL; + hist_field->ref = 1; + hist_field->hist_data = hist_data; if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) @@ -2661,6 +2676,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, { unsigned long flags = HIST_FIELD_FL_VAR_REF; struct hist_field *ref_field; + int i; + + /* Check if the variable already exists */ + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) { + get_hist_field(ref_field); + return ref_field; + } + } ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { @@ -3115,7 +3141,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, { struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (test->private_data == hist_data) return test->filter_str; @@ -3166,9 +3194,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data, struct event_trigger_data *test; unsigned int n_keys; + lockdep_assert_held(&event_mutex); + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; @@ -5528,7 +5558,7 @@ static int hist_show(struct seq_file *m, void *v) goto out_unlock; } - list_for_each_entry_rcu(data, &event_file->triggers, list) { + list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } @@ -5921,7 +5951,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->attrs->name && !named_data) goto new; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6005,10 +6037,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data, struct event_trigger_data *test, *named_data = NULL; bool match = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (hist_trigger_match(data, test, named_data, false)) { match = true; @@ -6026,10 +6060,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6051,10 +6087,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6080,7 +6118,9 @@ static bool hist_file_check_refs(struct trace_event_file *file) struct hist_trigger_data *hist_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; if (check_var_refs(hist_data)) @@ -6323,7 +6363,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + list_for_each_entry_rcu(test, &enable_data->file->triggers, list, + lockdep_is_held(&event_mutex)) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (enable_data->enable) test->paused = false; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d45079ee62f8..22bcf7c51d1e 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -195,7 +195,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) unsigned long irq_flags; void *entry = NULL; int entry_size; - u64 val; + u64 val = 0; int len; entry = trace_alloc_entry(call, &entry_size); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2cd53ca21b51..40106fff06a4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file) struct event_trigger_data *data; bool set_cond = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->filter || event_command_post_trigger(data->cmd_ops) || event_command_needs_rec(data->cmd_ops)) { set_cond = true; @@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { ret = -EEXIST; goto out; @@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); @@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && (test->cmd_ops->trigger_type == @@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && (data->cmd_ops->trigger_type == diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f890262c8a3..3f54dc2f6e1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group); + ret = trace_probe_init(&tk->tp, event, group, false); if (ret < 0) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 905b10af5d5c..9ae87be422f2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -984,15 +984,19 @@ void trace_probe_cleanup(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group) + const char *group, bool alloc_filter) { struct trace_event_call *call; + size_t size = sizeof(struct trace_probe_event); int ret = 0; if (!event || !group) return -EINVAL; - tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + if (alloc_filter) + size += sizeof(struct trace_uprobe_filter); + + tp->event = kzalloc(size, GFP_KERNEL); if (!tp->event) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4ee703728aec..a0ff9e200ef6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -223,6 +223,12 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* Event call and class holder */ struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ @@ -230,6 +236,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; + struct trace_uprobe_filter filter[0]; }; struct trace_probe { @@ -322,7 +329,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group); + const char *group, bool alloc_filter); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5e43b9664eca..617e297f46dc 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -630,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr) if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); - return; + goto fail_deprobe_sched_switch; } wakeup_reset(tr); @@ -648,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr) printk(KERN_ERR "failed to start wakeup tracer\n"); return; +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 344e4c1aa09c..87de6edafd14 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -381,7 +381,7 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { - unsigned int save_len = s->seq.len; + unsigned int save_len = s->seq.len; if (s->full) return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4df9a209f7ca..c557f42a9397 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -283,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack) local_irq_restore(flags); } +/* Some archs may not define MCOUNT_INSN_SIZE */ +#ifndef MCOUNT_INSN_SIZE +# define MCOUNT_INSN_SIZE 0 +#endif + static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..2619bc5ed520 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -34,12 +34,6 @@ struct uprobe_trace_entry_head { #define DATAOF_TRACE_ENTRY(entry, is_return) \ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) -struct trace_uprobe_filter { - rwlock_t rwlock; - int nr_systemwide; - struct list_head perf_events; -}; - static int trace_uprobe_create(int argc, const char **argv); static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); @@ -60,7 +54,6 @@ static struct dyn_event_operations trace_uprobe_ops = { */ struct trace_uprobe { struct dyn_event devent; - struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct path path; struct inode *inode; @@ -351,7 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group); + ret = trace_probe_init(&tu->tp, event, group, true); if (ret < 0) goto error; @@ -359,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; - init_trace_uprobe_filter(&tu->filter); + init_trace_uprobe_filter(tu->tp.event->filter); return tu; error: @@ -1067,13 +1060,14 @@ static void __probe_event_disable(struct trace_probe *tp) struct trace_probe *pos; struct trace_uprobe *tu; + tu = container_of(tp, struct trace_uprobe, tp); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); if (!tu->inode) continue; - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->inode = NULL; } @@ -1108,7 +1102,7 @@ static int probe_event_enable(struct trace_event_call *call, } tu = container_of(tp, struct trace_uprobe, tp); - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); if (enabled) return 0; @@ -1205,39 +1199,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) } static inline bool -uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +trace_uprobe_filter_event(struct trace_uprobe_filter *filter, + struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); + return __uprobe_perf_filter(filter, event->hw.target->mm); } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { list_del(&event->hw.tp_list); - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || (event->hw.target->flags & PF_EXITING) || - uprobe_filter_event(tu, event); + trace_uprobe_filter_event(filter, event); } else { - tu->filter.nr_systemwide--; - done = tu->filter.nr_systemwide; + filter->nr_systemwide--; + done = filter->nr_systemwide; } - write_unlock(&tu->filter.rwlock); + write_unlock(&filter->rwlock); - if (!done) - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); - - return 0; + return done; } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +/* This returns true if the filter always covers target mm */ +static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - int err; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid @@ -1247,28 +1241,21 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) * attr.enable_on_exec means that exec/mmap will install the * breakpoints we need. */ - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || event->parent || event->attr.enable_on_exec || - uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); + trace_uprobe_filter_event(filter, event); + list_add(&event->hw.tp_list, &filter->perf_events); } else { - done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; + done = filter->nr_systemwide; + filter->nr_systemwide++; } - write_unlock(&tu->filter.rwlock); + write_unlock(&filter->rwlock); - err = 0; - if (!done) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - if (err) - uprobe_perf_close(tu, event); - } - return err; + return done; } -static int uprobe_perf_multi_call(struct trace_event_call *call, - struct perf_event *event, - int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +static int uprobe_perf_close(struct trace_event_call *call, + struct perf_event *event) { struct trace_probe *pos, *tp; struct trace_uprobe *tu; @@ -1278,25 +1265,59 @@ static int uprobe_perf_multi_call(struct trace_event_call *call, if (WARN_ON_ONCE(!tp)) return -ENODEV; + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) + return 0; + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); - ret = op(tu, event); + ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; } return ret; } + +static int uprobe_perf_open(struct trace_event_call *call, + struct perf_event *event) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int err = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_add(tu->tp.event->filter, event)) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) { + uprobe_perf_close(call, event); + break; + } + } + + return err; +} + static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { + struct trace_uprobe_filter *filter; struct trace_uprobe *tu; int ret; tu = container_of(uc, struct trace_uprobe, consumer); - read_lock(&tu->filter.rwlock); - ret = __uprobe_perf_filter(&tu->filter, mm); - read_unlock(&tu->filter.rwlock); + filter = tu->tp.event->filter; + + read_lock(&filter->rwlock); + ret = __uprobe_perf_filter(filter, mm); + read_unlock(&filter->rwlock); return ret; } @@ -1419,10 +1440,10 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_multi_call(event, data, uprobe_perf_open); + return uprobe_perf_open(event, data); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_multi_call(event, data, uprobe_perf_close); + return uprobe_perf_close(event, data); #endif default: diff --git a/lib/bitmap.c b/lib/bitmap.c index 4250519d7d1c..6e175fbd69a9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -168,6 +168,72 @@ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, } EXPORT_SYMBOL(__bitmap_shift_left); +/** + * bitmap_cut() - remove bit region from bitmap and right shift remaining bits + * @dst: destination bitmap, might overlap with src + * @src: source bitmap + * @first: start bit of region to be removed + * @cut: number of bits to remove + * @nbits: bitmap size, in bits + * + * Set the n-th bit of @dst iff the n-th bit of @src is set and + * n is less than @first, or the m-th bit of @src is set for any + * m such that @first <= n < nbits, and m = n + @cut. + * + * In pictures, example for a big-endian 32-bit architecture: + * + * @src: + * 31 63 + * | | + * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 + * | | | | + * 16 14 0 32 + * + * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is: + * + * 31 63 + * | | + * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 + * | | | + * 14 (bit 17 0 32 + * from @src) + * + * Note that @dst and @src might overlap partially or entirely. + * + * This is implemented in the obvious way, with a shift and carry + * step for each moved bit. Optimisation is left as an exercise + * for the compiler. + */ +void bitmap_cut(unsigned long *dst, const unsigned long *src, + unsigned int first, unsigned int cut, unsigned int nbits) +{ + unsigned int len = BITS_TO_LONGS(nbits); + unsigned long keep = 0, carry; + int i; + + memmove(dst, src, len * sizeof(*dst)); + + if (first % BITS_PER_LONG) { + keep = src[first / BITS_PER_LONG] & + (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG)); + } + + while (cut--) { + for (i = first / BITS_PER_LONG; i < len; i++) { + if (i < len - 1) + carry = dst[i + 1] & 1UL; + else + carry = 0; + + dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1)); + } + } + + dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG); + dst[first / BITS_PER_LONG] |= keep; +} +EXPORT_SYMBOL(bitmap_cut); + int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index dccb95af6003..706020b06617 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -30,13 +30,6 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; - /* - * Truncate 'max' to the user-specified limit, so that - * we only have one limit we need to check in the loop - */ - if (max > count) - max = count; - if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; @@ -114,6 +107,13 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + /* + * Truncate 'max' to the user-specified limit, so that + * we only have one limit we need to check in the loop + */ + if (max > count) + max = count; + kasan_check_write(dst, count); check_object_size(dst, count, false); if (user_access_begin(src, max)) { diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 6c0005d5dd5c..41670d4a5816 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -27,13 +27,6 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long c; /* - * Truncate 'max' to the user-specified limit, so that - * we only have one limit we need to check in the loop - */ - if (max > count) - max = count; - - /* * Do everything aligned. But that means that we * need to also expand the maximum.. */ @@ -109,6 +102,13 @@ long strnlen_user(const char __user *str, long count) unsigned long max = max_addr - src_addr; long retval; + /* + * Truncate 'max' to the user-specified limit, so that + * we only have one limit we need to check in the loop + */ + if (max > count) + max = count; + if (user_access_begin(str, max)) { retval = do_strnlen_user(str, count, max); user_access_end(); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 7df4f7f395bf..55c14e8c8859 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -2,6 +2,7 @@ /* * test_xarray.c: Test the XArray API * Copyright (c) 2017-2018 Microsoft Corporation + * Copyright (c) 2019-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ @@ -902,28 +903,34 @@ static noinline void check_store_iter(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } -static noinline void check_multi_find(struct xarray *xa) +static noinline void check_multi_find_1(struct xarray *xa, unsigned order) { #ifdef CONFIG_XARRAY_MULTI + unsigned long multi = 3 << order; + unsigned long next = 4 << order; unsigned long index; - xa_store_order(xa, 12, 2, xa_mk_value(12), GFP_KERNEL); - XA_BUG_ON(xa, xa_store_index(xa, 16, GFP_KERNEL) != NULL); + xa_store_order(xa, multi, order, xa_mk_value(multi), GFP_KERNEL); + XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_store_index(xa, next + 1, GFP_KERNEL) != NULL); index = 0; XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != - xa_mk_value(12)); - XA_BUG_ON(xa, index != 12); - index = 13; + xa_mk_value(multi)); + XA_BUG_ON(xa, index != multi); + index = multi + 1; XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != - xa_mk_value(12)); - XA_BUG_ON(xa, (index < 12) || (index >= 16)); + xa_mk_value(multi)); + XA_BUG_ON(xa, (index < multi) || (index >= next)); XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT) != - xa_mk_value(16)); - XA_BUG_ON(xa, index != 16); - - xa_erase_index(xa, 12); - xa_erase_index(xa, 16); + xa_mk_value(next)); + XA_BUG_ON(xa, index != next); + XA_BUG_ON(xa, xa_find_after(xa, &index, next, XA_PRESENT) != NULL); + XA_BUG_ON(xa, index != next); + + xa_erase_index(xa, multi); + xa_erase_index(xa, next); + xa_erase_index(xa, next + 1); XA_BUG_ON(xa, !xa_empty(xa)); #endif } @@ -1046,12 +1053,33 @@ static noinline void check_find_3(struct xarray *xa) xa_destroy(xa); } +static noinline void check_find_4(struct xarray *xa) +{ + unsigned long index = 0; + void *entry; + + xa_store_index(xa, ULONG_MAX, GFP_KERNEL); + + entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT); + XA_BUG_ON(xa, entry != xa_mk_index(ULONG_MAX)); + + entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT); + XA_BUG_ON(xa, entry); + + xa_erase_index(xa, ULONG_MAX); +} + static noinline void check_find(struct xarray *xa) { + unsigned i; + check_find_1(xa); check_find_2(xa); check_find_3(xa); - check_multi_find(xa); + check_find_4(xa); + + for (i = 2; i < 10; i++) + check_multi_find_1(xa, i); check_multi_find_2(xa); } @@ -1132,6 +1160,27 @@ static noinline void check_move_tiny(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_move_max(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + + xa_store_index(xa, ULONG_MAX, GFP_KERNEL); + rcu_read_lock(); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_index(ULONG_MAX)); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != NULL); + rcu_read_unlock(); + + xas_set(&xas, 0); + rcu_read_lock(); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_index(ULONG_MAX)); + xas_pause(&xas); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != NULL); + rcu_read_unlock(); + + xa_erase_index(xa, ULONG_MAX); + XA_BUG_ON(xa, !xa_empty(xa)); +} + static noinline void check_move_small(struct xarray *xa, unsigned long idx) { XA_STATE(xas, xa, 0); @@ -1240,6 +1289,7 @@ static noinline void check_move(struct xarray *xa) xa_destroy(xa); check_move_tiny(xa); + check_move_max(xa); for (i = 0; i < 16; i++) check_move_small(xa, 1UL << i); diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 9ecfd3b547ba..42bd8ab955fa 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -221,6 +221,7 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) return 0; } +static __maybe_unused int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) { int ret = __cvdso_clock_getres_common(clock, res); diff --git a/lib/xarray.c b/lib/xarray.c index 1237c213f52b..1d9fab7db8da 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0+ /* * XArray implementation - * Copyright (c) 2017 Microsoft Corporation + * Copyright (c) 2017-2018 Microsoft Corporation + * Copyright (c) 2018-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ @@ -967,6 +968,7 @@ void xas_pause(struct xa_state *xas) if (xas_invalid(xas)) return; + xas->xa_node = XAS_RESTART; if (node) { unsigned int offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { @@ -974,10 +976,11 @@ void xas_pause(struct xa_state *xas) break; } xas->xa_index += (offset - xas->xa_offset) << node->shift; + if (xas->xa_index == 0) + xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } - xas->xa_node = XAS_RESTART; } EXPORT_SYMBOL_GPL(xas_pause); @@ -1079,13 +1082,15 @@ void *xas_find(struct xa_state *xas, unsigned long max) { void *entry; - if (xas_error(xas)) + if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) return NULL; + if (xas->xa_index > max) + return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1; return set_bounds(xas); - } else if (xas_top(xas->xa_node)) { + } else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node)) return entry; @@ -1150,6 +1155,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) if (xas_error(xas)) return NULL; + if (xas->xa_index > max) + goto max; if (!xas->xa_node) { xas->xa_index = 1; @@ -1824,6 +1831,17 @@ void *xa_find(struct xarray *xa, unsigned long *indexp, } EXPORT_SYMBOL(xa_find); +static bool xas_sibling(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + unsigned long mask; + + if (!node) + return false; + mask = (XA_CHUNK_SIZE << node->shift) - 1; + return (xas->xa_index & mask) > (xas->xa_offset << node->shift); +} + /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. @@ -1847,21 +1865,20 @@ void *xa_find_after(struct xarray *xa, unsigned long *indexp, XA_STATE(xas, xa, *indexp + 1); void *entry; + if (xas.xa_index == 0) + return NULL; + rcu_read_lock(); for (;;) { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); - if (xas.xa_node == XAS_BOUNDS) + + if (xas_invalid(&xas)) break; - if (xas.xa_shift) { - if (xas.xa_index & ((1UL << xas.xa_shift) - 1)) - continue; - } else { - if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK)) - continue; - } + if (xas_sibling(&xas)) + continue; if (!xas_retry(&xas, entry)) break; } diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7dd602d7f8db..ad9d5b1c4473 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -26,6 +26,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, unsigned long i, nr_pages, addr, next; int nr; struct page **pages; + int ret = 0; if (gup->size > ULONG_MAX) return -EINVAL; @@ -63,7 +64,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, NULL); break; default: - return -1; + kvfree(pages); + ret = -EINVAL; + goto out; } if (nr <= 0) @@ -85,7 +88,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); - return 0; +out: + return ret; } static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 41a0fbddc96b..a88093213674 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,13 +527,13 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, +static unsigned long __thp_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { - unsigned long addr; loff_t off_end = off + len; loff_t off_align = round_up(off, size); - unsigned long len_pad; + unsigned long len_pad, ret; if (off_end <= off_align || (off_end - off_align) < size) return 0; @@ -542,30 +542,40 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long le if (len_pad < len || (off + len_pad) < off) return 0; - addr = current->mm->get_unmapped_area(filp, 0, len_pad, + ret = current->mm->get_unmapped_area(filp, addr, len_pad, off >> PAGE_SHIFT, flags); - if (IS_ERR_VALUE(addr)) + + /* + * The failure might be due to length padding. The caller will retry + * without the padding. + */ + if (IS_ERR_VALUE(ret)) return 0; - addr += (off - addr) & (size - 1); - return addr; + /* + * Do not try to align to THP boundary if allocation at the address + * hint succeeds. + */ + if (ret == addr) + return addr; + + ret += (off - ret) & (size - 1); + return ret; } unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - if (addr) - goto out; if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) goto out; - addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); - if (addr) - return addr; - - out: + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); + if (ret) + return ret; +out: return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac65bb5e38ac..dd8737a94bec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,7 @@ #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/numa.h> +#include <linux/llist.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -1136,7 +1137,7 @@ static inline void ClearPageHugeTemporary(struct page *page) page[2].mapping = NULL; } -void free_huge_page(struct page *page) +static void __free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1199,6 +1200,54 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } +/* + * As free_huge_page() can be called from a non-task context, we have + * to defer the actual freeing in a workqueue to prevent potential + * hugetlb_lock deadlock. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to + * be freed and frees them one-by-one. As the page->mapping pointer is + * going to be cleared in __free_huge_page() anyway, it is reused as the + * llist_node structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + struct page *page; + + node = llist_del_all(&hpage_freelist); + + while (node) { + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + __free_huge_page(page); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +void free_huge_page(struct page *page) +{ + /* + * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. + */ + if (!in_task()) { + /* + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the + * workfn hasn't retrieved the list yet. + */ + if (llist_add((struct llist_node *)&page->mapping, + &hpage_freelist)) + schedule_work(&free_hpage_work); + return; + } + + __free_huge_page(page); +} + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5b5f74cfd4d..6c83cf4ed970 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3287,49 +3287,34 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) { - unsigned long stat[MEMCG_NR_STAT]; + unsigned long stat[MEMCG_NR_STAT] = {0}; struct mem_cgroup *mi; int node, cpu, i; - int min_idx, max_idx; - - if (slab_only) { - min_idx = NR_SLAB_RECLAIMABLE; - max_idx = NR_SLAB_UNRECLAIMABLE; - } else { - min_idx = 0; - max_idx = MEMCG_NR_STAT; - } - - for (i = min_idx; i < max_idx; i++) - stat[i] = 0; for_each_online_cpu(cpu) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < MEMCG_NR_STAT; i++) stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < MEMCG_NR_STAT; i++) atomic_long_add(stat[i], &mi->vmstats[i]); - if (!slab_only) - max_idx = NR_VM_NODE_STAT_ITEMS; - for_each_node(node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; struct mem_cgroup_per_node *pi; - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) stat[i] += per_cpu( pn->lruvec_stat_cpu->count[i], cpu); for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = min_idx; i < max_idx; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) atomic_long_add(stat[i], &pi->lruvec_stat[i]); } } @@ -3403,13 +3388,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* - * Deactivate and reparent kmem_caches. Then flush percpu - * slab statistics to have precise values at the parent and - * all ancestor levels. It's required to keep slab stats - * accurate after the reparenting of kmem_caches. + * Deactivate and reparent kmem_caches. */ memcg_deactivate_kmem_caches(memcg, parent); - memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4913,7 +4894,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) * Flush percpu vmstats and vmevents to guarantee the value correctness * on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmstats(memcg); memcg_flush_percpu_vmevents(memcg); __mem_cgroup_free(memcg); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 55ac23ef11c1..a91a072f2b2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -483,8 +483,9 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages) +void __ref remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; @@ -499,28 +500,30 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn, return; #endif + clear_zone_contiguous(zone); + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); + + set_zone_contiguous(zone); } -static void __remove_section(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, + struct vmem_altmap *altmap) { struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - __remove_zone(zone, pfn, nr_pages); sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** - * __remove_pages() - remove sections of pages from a zone - * @zone: zone from which pages need to be removed + * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used @@ -530,16 +533,14 @@ static void __remove_section(struct zone *zone, unsigned long pfn, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -void __remove_pages(struct zone *zone, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) { unsigned long map_offset = 0; unsigned long nr, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); - clear_zone_contiguous(zone); - if (check_pfn_span(pfn, nr_pages, "remove")) return; @@ -551,13 +552,11 @@ void __remove_pages(struct zone *zone, unsigned long pfn, cond_resched(); pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - __remove_section(zone, pfn, pfns, map_offset, altmap); + __remove_section(pfn, pfns, map_offset, altmap); pfn += pfns; nr_pages -= pfns; map_offset = 0; } - - set_zone_contiguous(zone); } int set_online_page_callback(online_page_callback_t callback) @@ -869,6 +868,7 @@ failed_addition: (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + remove_pfn_range_from_zone(zone, pfn, nr_pages); mem_hotplug_done(); return ret; } @@ -1628,6 +1628,7 @@ static int __ref __offline_pages(unsigned long start_pfn, writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + remove_pfn_range_from_zone(zone, start_pfn, nr_pages); mem_hotplug_done(); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 067cf7d3daf5..b2920ae87a61 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2148,18 +2148,22 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); + /* + * First, try to allocate THP only on local node, but + * don't reclaim unnecessarily, just compact. + */ page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); + gfp | __GFP_THISNODE | __GFP_NORETRY, order); /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote - * memory as well. + * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_node(hpage_node, - gfp | __GFP_NORETRY, order); + gfp, order); goto out; } diff --git a/mm/memremap.c b/mm/memremap.c index 03ccbdfeb697..c51c6bd2fe34 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -120,7 +120,7 @@ void memunmap_pages(struct dev_pagemap *pgmap) mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(page_zone(first_page), PHYS_PFN(res->start), + __remove_pages(PHYS_PFN(res->start), PHYS_PFN(resource_size(res)), NULL); } else { arch_remove_memory(nid, res->start, resource_size(res), diff --git a/mm/migrate.c b/mm/migrate.c index eae1565285e3..86873b6f38a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1512,9 +1512,11 @@ static int do_move_pages_to_node(struct mm_struct *mm, /* * Resolves the given address to a struct page, isolates it from the LRU and * puts it to the given pagelist. - * Returns -errno if the page cannot be found/isolated or 0 when it has been - * queued or the page doesn't need to be migrated because it is already on - * the target node + * Returns: + * errno - if the page cannot be found/isolated + * 0 - when it doesn't have to be migrated because it is already on the + * target node + * 1 - when it has been queued */ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, int node, struct list_head *pagelist, bool migrate_all) @@ -1553,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { isolate_huge_page(page, pagelist); - err = 0; + err = 1; } } else { struct page *head; @@ -1563,7 +1565,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (err) goto out_putpage; - err = 0; + err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_cache(head), @@ -1640,8 +1642,17 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, */ err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) + + if (!err) { + /* The page is already on the target node */ + err = store_status(status, i, current_node, 1); + if (err) + goto out_flush; continue; + } else if (err > 0) { + /* The page is successfully queued for migration */ + continue; + } err = store_status(status, i, err, 1); if (err) diff --git a/mm/mmap.c b/mm/mmap.c index 9c648524e4dc..71e4ffc83bcd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -90,12 +90,6 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE: - * r: (no) no - * w: (no) no - * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 71e3acea7817..d58c481b3df8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -890,7 +890,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), - mm_pgtables_bytes(mm), victim->signal->oom_score_adj); + mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50055d2e4ea8..2caf780a42e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, if (this_bw < tot_bw) { if (min) { min *= this_bw; - do_div(min, tot_bw); + min = div64_ul(min, tot_bw); } if (max < 100) { max *= this_bw; - do_div(max, tot_bw); + max = div64_ul(max, tot_bw); } } @@ -766,7 +766,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; u64 wb_thresh; - long numerator, denominator; + unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* @@ -777,7 +777,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; - do_div(wb_thresh, denominator); + wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); @@ -1102,7 +1102,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { - do_div(bw, elapsed); + bw = div64_ul(bw, elapsed); avg = bw; goto out; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4785a8a2040e..d047bf7d8fd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -694,34 +694,27 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT -DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); -#else +bool _debug_pagealloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -#endif EXPORT_SYMBOL(_debug_pagealloc_enabled); DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static int __init early_debug_pagealloc(char *buf) { - bool enable = false; - - if (kstrtobool(buf, &enable)) - return -EINVAL; - - if (enable) - static_branch_enable(&_debug_pagealloc_enabled); - - return 0; + return kstrtobool(buf, &_debug_pagealloc_enabled_early); } early_param("debug_pagealloc", early_debug_pagealloc); -static void init_debug_guardpage(void) +void init_debug_pagealloc(void) { if (!debug_pagealloc_enabled()) return; + static_branch_enable(&_debug_pagealloc_enabled); + if (!debug_guardpage_minorder()) return; @@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) kernel_map_pages(page, 1 << order, 0); kasan_free_nondeferred_pages(page, order); @@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return free_pages_check(page); else return false; @@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct page *page) */ static bool free_pcp_prepare(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return free_pages_prepare(page, 0, true); else return free_pages_prepare(page, 0, false); @@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void) for_each_populated_zone(zone) set_zone_contiguous(zone); - -#ifdef CONFIG_DEBUG_PAGEALLOC - init_debug_guardpage(); -#endif } #ifdef CONFIG_CMA @@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed(void) */ static inline bool check_pcp_refill(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return check_new_page(page); else return false; @@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(struct page *page) } static inline bool check_new_pcp(struct page *page) { - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) return check_new_page(page); else return false; @@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) kernel_map_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); @@ -4476,8 +4465,11 @@ retry_cpuset: if (page) goto got_pg; - if (order >= pageblock_order && (gfp_mask & __GFP_IO) && - !(gfp_mask & __GFP_RETRY_MAYFAIL)) { + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes some THP page fault allocations + */ + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If allocating entire pageblock(s) and compaction * failed because all zones are below low watermarks @@ -4498,23 +4490,6 @@ retry_cpuset: if (compact_result == COMPACT_SKIPPED || compact_result == COMPACT_DEFERRED) goto nopage; - } - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * If compaction is deferred for high-order allocations, - * it is because sync compaction recently failed. If - * this is the case and the caller requested a THP - * allocation, we do not want to heavily disrupt the - * system, so we fail the allocation instead of entering - * direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; /* * Looks like reclaim/compaction is worth trying, but diff --git a/mm/shmem.c b/mm/shmem.c index 165fa6332993..8793e8cc1a48 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2107,9 +2107,10 @@ unsigned long shmem_get_unmapped_area(struct file *file, /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. - * But if caller specified an address hint, respect that as before. + * But if caller specified an address hint and we allocated area there + * successfully, respect that as before. */ - if (uaddr) + if (uaddr == addr) return addr; if (shmem_huge != SHMEM_HUGE_FORCE) { @@ -2143,7 +2144,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); + inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) diff --git a/mm/slab.c b/mm/slab.c index f1e1840af533..a89633603b2d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1416,7 +1416,7 @@ static void kmem_rcu_free(struct rcu_head *head) #if DEBUG static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) { - if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && (cachep->size % PAGE_SIZE) == 0) return true; @@ -2008,7 +2008,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && size >= 256 && cachep->object_size > cache_line_size()) { if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { size_t tmp_size = ALIGN(size, PAGE_SIZE); diff --git a/mm/slab_common.c b/mm/slab_common.c index f0ab6d4ceb4c..0d95ddea13b0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct kmem_cache *s) * deactivates the memcg kmem_caches through workqueue. Make sure all * previous workitems on workqueue are processed. */ - flush_workqueue(memcg_kmem_cache_wq); + if (likely(memcg_kmem_cache_wq)) + flush_workqueue(memcg_kmem_cache_wq); /* * If we're racing with children kmem_cache deactivation, it might diff --git a/mm/slub.c b/mm/slub.c index d11389710b12..8eafccf75940 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -288,7 +288,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) unsigned long freepointer_addr; void *p; - if (!debug_pagealloc_enabled()) + if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); freepointer_addr = (unsigned long)object + s->offset; diff --git a/mm/sparse.c b/mm/sparse.c index b20ab7cdac86..3822ecbd8a1f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -777,7 +777,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { unsigned long section_nr = pfn_to_section_nr(pfn); - if (!section_is_early) { + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. + */ + if (!PageReserved(virt_to_page(ms->usage))) { kfree(ms->usage); ms->usage = NULL; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9681dc4aa75..b29ad17edcf5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1383,7 +1383,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); @@ -1681,7 +1681,7 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); - if (debug_pagealloc_enabled()) + if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range((unsigned long)addr, (unsigned long)addr + size); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2b2b9aae8a3c..22d17ecfe7df 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2069,6 +2069,11 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, zs_pool_dec_isolated(pool); } + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } + reset_page(page); put_page(page); page = newpage; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index c46daf09a501..bb7ec1a3915d 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -126,6 +126,7 @@ int vlan_check_real_dev(struct net_device *real_dev, void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); +void vlan_dev_uninit(struct net_device *dev); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 5ff8059837b4..990b9fde28c6 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -586,7 +586,8 @@ static int vlan_dev_init(struct net_device *dev) return 0; } -static void vlan_dev_uninit(struct net_device *dev) +/* Note: this function might be called multiple times for the same device. */ +void vlan_dev_uninit(struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c482a6fe9393..0db85aeb119b 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -108,11 +108,13 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; - int rem; + int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); - vlan_dev_change_flags(dev, flags->flags, flags->mask); + err = vlan_dev_change_flags(dev, flags->flags, flags->mask); + if (err) + return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { @@ -123,7 +125,9 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); - vlan_dev_set_egress_priority(dev, m->from, m->to); + err = vlan_dev_set_egress_priority(dev, m->from, m->to); + if (err) + return err; } } return 0; @@ -179,10 +183,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; err = vlan_changelink(dev, tb, data, extack); - if (err < 0) - return err; - - return register_vlan_dev(dev, extack); + if (!err) + err = register_vlan_dev(dev, extack); + if (err) + vlan_dev_uninit(dev); + return err; } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/Kconfig b/net/Kconfig index 54916b7adb9b..b0937a700f01 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -91,6 +91,7 @@ if INET source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" +source "net/mptcp/Kconfig" endif # if INET diff --git a/net/Makefile b/net/Makefile index 848303d98d3d..07ea48160874 100644 --- a/net/Makefile +++ b/net/Makefile @@ -87,3 +87,4 @@ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ +obj-$(CONFIG_MPTCP) += mptcp/ diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 39b94ca5f65d..aa1b57161f3b 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -33,23 +33,17 @@ static ssize_t show_atmaddress(struct device *cdev, unsigned long flags; struct atm_dev *adev = to_atm_dev(cdev); struct atm_dev_addr *aaddr; - int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; - int i, j, count = 0; + int count = 0; spin_lock_irqsave(&adev->lock, flags); list_for_each_entry(aaddr, &adev->local, entry) { - for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { - if (j == *fmt) { - count += scnprintf(buf + count, - PAGE_SIZE - count, "."); - ++fmt; - j = 0; - } - count += scnprintf(buf + count, - PAGE_SIZE - count, "%02x", - aaddr->addr.sas_addr.prv[i]); - } - count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, + "%1phN.%2phN.%10phN.%6phN.%1phN\n", + &aaddr->addr.sas_addr.prv[0], + &aaddr->addr.sas_addr.prv[1], + &aaddr->addr.sas_addr.prv[3], + &aaddr->addr.sas_addr.prv[13], + &aaddr->addr.sas_addr.prv[19]); } spin_unlock_irqrestore(&adev->lock, flags); diff --git a/net/atm/lec.c b/net/atm/lec.c index b57368e70aab..25fa3a7b72bd 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -799,14 +799,9 @@ static const char *lec_arp_get_status_string(unsigned char status) static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) { - int i; - - for (i = 0; i < ETH_ALEN; i++) - seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff); - seq_printf(seq, " "); - for (i = 0; i < ATM_ESA_LEN; i++) - seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff); - seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status), + seq_printf(seq, "%pM ", entry->mac_addr); + seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr); + seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status), entry->flags & 0xffff); if (entry->vcc) seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); @@ -1354,7 +1349,7 @@ static void dump_arp_table(struct lec_priv *priv) { struct lec_arp_table *rulla; char buf[256]; - int i, j, offset; + int i, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { @@ -1362,14 +1357,10 @@ static void dump_arp_table(struct lec_priv *priv) &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); - offset += sprintf(buf + offset, "Mac: %pM", + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, - "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1392,12 +1383,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("No forward\n"); hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1417,12 +1405,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Empty ones\n"); hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1442,12 +1427,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Multicast Forward VCCs\n"); hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1973,17 +1955,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, * Vcc which we don't want to make default vcc, * attach it anyway. */ - pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; @@ -1999,17 +1972,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, dump_arp_table(priv); goto out; } - pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { diff --git a/net/atm/proc.c b/net/atm/proc.c index d79221fd4dae..c31896707313 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -134,8 +134,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v) static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { v = vcc_walk(seq, 1); - if (v) - (*pos)++; + (*pos)++; return v; } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 324306d6fde0..ff57ea89c27e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -808,7 +808,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index d5028af750d5..c762758a4649 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich @@ -100,7 +100,6 @@ config BATMAN_ADV_DEBUG config BATMAN_ADV_SYSFS bool "batman-adv sysfs entries" depends on BATMAN_ADV - default y help Say Y here if you want to enable batman-adv device configuration and status interface through sysfs attributes. It is replaced by the diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index fd63e116d9ff..daa49af7ff40 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index fa39eaaab9d7..382fbe51fd34 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 37898da8ad48..686a60bc9492 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5b0b20e6da95..f0209505e41a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index c7a9ba305bfc..0c57c1000c64 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4ff6cf1ecae7..0ecaf1bb0068 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 37833db098e6..5e0be10bc84e 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 2614a9caee00..1e3172db7492 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ @@ -107,10 +107,17 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) } if (ret) goto default_throughput; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))) - goto default_throughput; - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) + return sinfo.expected_throughput / 100; + + /* try to estimate the expected throughput based on reported tx + * rates + */ + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + + goto default_throughput; } /* if not a wifi interface, check if this device provides data via diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 1a29505f4f66..4358d436be2a 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 714ce56cfcc8..969466218999 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index bf16d040461d..0ae2575f70bb 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 7f04a6acf14e..4bc695cda397 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 84ad2d2b6ac9..533c6d44cb58 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 663a53b6d36e..41cc87f06b14 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ @@ -844,7 +844,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; - crc = ntohs(*((__be16 *)(&an_addr[4]))); + crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 02b24a861a85..41edb2c4a327 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 38c4d8e51155..452856c27d20 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 1c5afd301ce9..7e2e8f586f42 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b0af3a11d406..3d21dd83f8cc 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ @@ -246,7 +246,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); + return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); } /** @@ -270,7 +270,9 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4); + u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4; + + return *(__force __be32 *)dst; } /** @@ -285,16 +287,18 @@ static u32 batadv_hash_dat(const void *data, u32 size) u32 hash = 0; const struct batadv_dat_entry *dat = data; const unsigned char *key; + __be16 vid; u32 i; - key = (const unsigned char *)&dat->ip; + key = (__force const unsigned char *)&dat->ip; for (i = 0; i < sizeof(dat->ip); i++) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } - key = (const unsigned char *)&dat->vid; + vid = htons(dat->vid); + key = (__force const unsigned char *)&vid; for (i = 0; i < sizeof(dat->vid); i++) { hash += key[i]; hash += (hash << 10); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 67c7729add55..2bff2f4a325c 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 385fccdcf69d..7cad97644d05 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index abfe8c6556de..881ef328b6cd 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 47df4c678988..e22e49289677 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 0be8e7178ec7..88b5dba84354 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index fc55750542e4..16cd9450ceb1 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 211b14b37db8..c3a0c5a7f7e9 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index afb52282d5bd..c7e98a40dd33 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index bbb8a6f18d6b..bad2e50135e8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index a9d4e176f4de..68638e0450a6 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 57877f0b78e0..91ae9f32b580 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0a70b66e8770..ccb535c77e5d 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 27fafff586df..6abd0f4742ef 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 11941cf1adcc..a67b2b091447 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 741cfa3719ff..f9884dc56cf3 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -74,7 +74,7 @@ __printf(2, 3); * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ do { \ @@ -98,7 +98,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg() - Store debug output without ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 0, ## arg) @@ -107,7 +107,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg_ratelimited() - Store debug output with ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg_ratelimited(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 1, ## arg) @@ -116,7 +116,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_info() - Store message in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_info(net_dev, fmt, arg...) \ do { \ @@ -130,7 +130,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_err() - Store error in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_err(net_dev, fmt, arg...) \ do { \ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4811ec65bc43..d8a255c85e77 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c7b340ddd0e7..692306df7b6f 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.5" +#define BATADV_SOURCE_VERSION "2020.0" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index f9ec8e7507b6..9ebdc1e864b9 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus Lüssing */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 5d9e2bb29c97..ebf825991ecd 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus Lüssing */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 7e052d6f759b..02ed073f95a9 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index ddc674e47dbb..7ee48f916997 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 580609389f0f..8f0717c3f7b5 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 753fa49723cf..334289084127 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 38613487fb1b..5b0c2fffc214 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 512a1f99dd75..7bc01c138b3a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f0f864820dea..3632bd976c56 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index c20feac95107..2ed49db6eff5 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3ce5f7bad369..7f8ade04e08e 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 5fc0fd1e5d08..0d36e15589f6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 832e156c519e..5f05a728f347 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 29139ad769fe..534e08d6ad91 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index e5bbc28ed12c..c45962d8527b 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 5e466093dfa5..d987f8b30a98 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index dd6a9a40dbb9..bd2ac570c42c 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 78d310da0ad3..140105215aa2 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3cedd2c36528..3444d9e4e90d 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index d8f764521c0b..f631b1e01b89 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8a482c5ec67b..852932838ddc 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 4a98860d7f0e..b24d35b9226a 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index aae63f0d21eb..0963a43ad996 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 36985000a0a8..d509d00c7a23 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 47718a82eaf2..4a17a66cc572 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ @@ -457,7 +457,7 @@ struct batadv_orig_node { /** * @tt_lock: prevents from updating the table while reading it. Table * update is made up by two operations (data structure update and - * metdata -CRC/TTVN-recalculation) and they have to be executed + * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. */ @@ -1011,7 +1011,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading * the local table. The local TT commit is made up by two operations - * (data structure update and metdata -CRC/TTVN- recalculation) and + * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. */ diff --git a/net/bridge/Makefile b/net/bridge/Makefile index ac9ef337f0fa..49da7ae6f077 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index fb38add21b37..dc3d2c1dd9d5 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -32,6 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; + u8 state = BR_STATE_FORWARDING; const unsigned char *dest; struct ethhdr *eth; u16 vid = 0; @@ -56,7 +57,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) goto out; if (IS_ENABLED(CONFIG_INET) && diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 86637000f275..7629b63f6f30 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8944ceb47fe9..fcc260840028 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -76,11 +76,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb bool local_rcv, mcast_hit = false; struct net_bridge *br; u16 vid = 0; + u8 state; if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) + state = p->state; + if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, + &state)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -103,7 +106,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } } - if (p->state == BR_STATE_LEARNING) + if (state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 60136575aea4..43dab4066f91 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -561,52 +561,73 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, return err; } -static int br_process_vlan_info(struct net_bridge *br, - struct net_bridge_port *p, int cmd, - struct bridge_vlan_info *vinfo_curr, - struct bridge_vlan_info **vinfo_last, - bool *changed, - struct netlink_ext_ack *extack) +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack) { - if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) + int err, rtm_cmd; + + if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; + /* needed for vlan-only NEWVLAN/DELVLAN notifications */ + rtm_cmd = br_afspec_cmd_to_rtm(cmd); + if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - /* check if we are already processing a range */ - if (*vinfo_last) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; - /* don't allow range of pvids */ - if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID) - return -EINVAL; return 0; } if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; - int v, err; - - if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END)) - return -EINVAL; + int v, v_change_start = 0; - if (vinfo_curr->vid <= (*vinfo_last)->vid) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { + bool curr_change = false; + tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed, + err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; + if (curr_change) { + *changed = curr_change; + if (!v_change_start) + v_change_start = v; + } else { + /* nothing to notify yet */ + if (!v_change_start) + continue; + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + v_change_start = 0; + } } + /* v_change_start is set only if the last/whole range changed */ + if (v_change_start) + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + *vinfo_last = NULL; return err; } - return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + if (*changed) + br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); + + return err; } static int br_afspec(struct net_bridge *br, @@ -1664,6 +1685,7 @@ int __init br_netlink_init(void) int err; br_mdb_init(); + br_vlan_rtnl_init(); rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); @@ -1681,6 +1703,7 @@ out_af: void br_netlink_fini(void) { br_mdb_uninit(); + br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f540f3bdf294..5153ffe79a01 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -113,6 +113,7 @@ enum { * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags + * @state: STP state (e.g. blocking, learning, forwarding) * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -133,6 +134,7 @@ struct net_bridge_vlan { u16 vid; u16 flags; u16 priv_flags; + u8 state; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; @@ -157,6 +159,7 @@ struct net_bridge_vlan { * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id + * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking) * * IMPORTANT: Be careful when checking if there're VLAN entries using list * primitives because the bridge can have entries in its list which @@ -170,6 +173,7 @@ struct net_bridge_vlan_group { struct list_head vlan_list; u16 num_vlans; u16 pvid; + u8 pvid_state; }; /* bridge fdb flags */ @@ -507,6 +511,65 @@ static inline bool nbp_state_should_learn(const struct net_bridge_port *p) return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; } +static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack) +{ + bool ret = vid > 0 && vid < VLAN_VID_MASK; + + if (!ret) + NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid"); + + return ret; +} + +static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, + const struct bridge_vlan_info *last, + struct netlink_ext_ack *extack) +{ + /* pvid flag is not allowed in ranges */ + if (cur->flags & BRIDGE_VLAN_INFO_PVID) { + NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range"); + return false; + } + + /* when cur is the range end, check if: + * - it has range start flag + * - range ids are invalid (end is equal to or before start) + */ + if (last) { + if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one"); + return false; + } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing"); + return false; + } else if (cur->vid <= last->vid) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return false; + } + } + + /* check for required range flags */ + if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END))) { + NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing"); + return false; + } + + return true; +} + +static inline int br_afspec_cmd_to_rtm(int cmd) +{ + switch (cmd) { + case RTM_SETLINK: + return RTM_NEWVLAN; + case RTM_DELLINK: + return RTM_DELVLAN; + } + + return 0; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { @@ -876,7 +939,7 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid); + u16 *vid, u8 *state); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -911,6 +974,14 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); +void br_vlan_rtnl_init(void); +void br_vlan_rtnl_uninit(void); +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd); +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -962,11 +1033,15 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return vg->pvid; } +static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) +{ + return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags; +} #else static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { return true; } @@ -1105,6 +1180,70 @@ static inline int br_vlan_bridge_event(struct net_device *dev, { return 0; } + +static inline void br_vlan_rtnl_init(void) +{ +} + +static inline void br_vlan_rtnl_uninit(void) +{ +} + +static inline void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ +} +#endif + +/* br_vlan_options.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2); +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); +size_t br_vlan_opts_nl_size(void); +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack); + +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) +{ + return READ_ONCE(v->state); +} + +static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) +{ + WRITE_ONCE(v->state, state); +} + +static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) +{ + return READ_ONCE(vg->pvid_state); +} + +static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, + u8 state) +{ + WRITE_ONCE(vg->pvid_state, state); +} + +/* learn_allow is true at ingress and false at egress */ +static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) +{ + switch (state) { + case BR_STATE_LEARNING: + return learn_allow; + case BR_STATE_FORWARDING: + return true; + default: + return false; + } +} #endif struct nf_br_ops { @@ -1176,6 +1315,12 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bb98984cd27d..6b5deca08b89 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -34,13 +34,15 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, + const struct net_bridge_vlan *v) { - if (vg->pvid == vid) + if (vg->pvid == v->vid) return false; smp_wmb(); - vg->pvid = vid; + br_vlan_set_pvid_state(vg, v->state); + vg->pvid = v->vid; return true; } @@ -69,7 +71,7 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) vg = nbp_vlan_group(v->port); if (flags & BRIDGE_VLAN_INFO_PVID) - ret = __vlan_add_pvid(vg, v->vid); + ret = __vlan_add_pvid(vg, v); else ret = __vlan_delete_pvid(vg, v->vid); @@ -257,6 +259,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, &changed, extack); if (err) goto out_filt; + + if (changed) + br_vlan_notify(br, NULL, v->vid, 0, + RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); @@ -289,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, vg->num_vlans++; } + /* set the state before publishing */ + v->state = BR_STATE_FORWARDING; + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) @@ -380,13 +389,31 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg) kfree(vg); } -static void __vlan_flush(struct net_bridge_vlan_group *vg) +static void __vlan_flush(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; + u16 v_start = 0, v_end = 0; __vlan_delete_pvid(vg, vg->pvid); - list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { + /* take care of disjoint ranges */ + if (!v_start) { + v_start = vlan->vid; + } else if (vlan->vid - v_end != 1) { + /* found range end, notify and start next one */ + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); + v_start = vlan->vid; + } + v_end = vlan->vid; + __vlan_del(vlan); + } + + /* notify about the last/whole vlan range */ + if (v_start) + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, @@ -444,7 +471,8 @@ out: /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, - struct sk_buff *skb, u16 *vid) + struct sk_buff *skb, u16 *vid, + u8 *state) { struct br_vlan_stats *stats; struct net_bridge_vlan *v; @@ -510,13 +538,25 @@ static bool __allowed_ingress(const struct net_bridge *br, skb->vlan_tci |= pvid; /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) - return true; + if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_pvid_state(vg); + return br_vlan_state_allowed(*state, true); + } else { + return true; + } + } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_state(v); + if (!br_vlan_state_allowed(*state, true)) + goto drop; + } + if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); @@ -534,7 +574,7 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. @@ -544,7 +584,7 @@ bool br_allowed_ingress(const struct net_bridge *br, return true; } - return __allowed_ingress(br, vg, skb, vid); + return __allowed_ingress(br, vg, skb, vid, state); } /* Called under RCU. */ @@ -560,7 +600,8 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg, br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); - if (v && br_vlan_should_use(v)) + if (v && br_vlan_should_use(v) && + br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; @@ -571,6 +612,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; + struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) @@ -585,13 +627,15 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!*vid) { *vid = br_get_pvid(vg); - if (!*vid) + if (!*vid || + !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } - if (br_vlan_find(vg, *vid)) + v = br_vlan_find(vg, *vid); + if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; @@ -716,7 +760,7 @@ void br_vlan_flush(struct net_bridge *br) ASSERT_RTNL(); vg = br_vlan_group(br); - __vlan_flush(vg); + __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -925,12 +969,15 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. */ - if (vlan_default_pvid(br_vlan_group(br), pvid)) - br_vlan_delete(br, pvid); + if (vlan_default_pvid(br_vlan_group(br), pvid)) { + if (!br_vlan_delete(br, pvid)) + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); + } list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(nbp_vlan_group(p), pvid)) - nbp_vlan_delete(p, pvid); + if (vlan_default_pvid(nbp_vlan_group(p), pvid) && + !nbp_vlan_delete(p, pvid)) + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; @@ -972,7 +1019,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto out; - br_vlan_delete(br, old_pvid); + + if (br_vlan_delete(br, old_pvid)) + br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); set_bit(0, changed); } @@ -992,7 +1042,9 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto err_port; - nbp_vlan_delete(p, old_pvid); + if (nbp_vlan_delete(p, old_pvid)) + br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); set_bit(p->port_no, changed); } @@ -1007,22 +1059,28 @@ err_port: if (!test_bit(p->port_no, changed)) continue; - if (old_pvid) + if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); + br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); + } nbp_vlan_delete(p, pvid); + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { - if (old_pvid) + if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); + br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); + } br_vlan_delete(br, pvid); + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } @@ -1115,6 +1173,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) &changed, extack); if (ret) goto err_vlan_add; + br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; @@ -1196,7 +1255,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) ASSERT_RTNL(); vg = nbp_vlan_group(port); - __vlan_flush(vg); + __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -1462,8 +1521,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); - bool changed; - int ret = 0; + int vlcmd = 0, ret = 0; + bool changed = false; switch (event) { case NETDEV_REGISTER: @@ -1471,9 +1530,11 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); + vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: - br_vlan_delete(br, br->default_pvid); + changed = !br_vlan_delete(br, br->default_pvid); + vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; @@ -1487,6 +1548,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) br_vlan_link_state_change(dev, br); break; } + if (changed) + br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } @@ -1505,3 +1568,441 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) break; } } + +/* v_opts is used to dump the options which must be equal in the whole range */ +static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts, + u16 flags) +{ + struct bridge_vlan_info info; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); + if (!nest) + return false; + + memset(&info, 0, sizeof(info)); + info.vid = vid; + if (flags & BRIDGE_VLAN_INFO_UNTAGGED) + info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; + if (flags & BRIDGE_VLAN_INFO_PVID) + info.flags |= BRIDGE_VLAN_INFO_PVID; + + if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) + goto out_err; + + if (vid_range && vid < vid_range && + !(flags & BRIDGE_VLAN_INFO_PVID) && + nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) + goto out_err; + + if (v_opts && !br_vlan_opts_fill(skb, v_opts)) + goto out_err; + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static size_t rtnl_vlan_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + + br_vlan_opts_nl_size(); /* bridge vlan options */ +} + +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v = NULL; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + struct net *net; + u16 flags = 0; + int ifindex; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + if (p) { + ifindex = p->dev->ifindex; + vg = nbp_vlan_group(p); + net = dev_net(p->dev); + } else { + ifindex = br->dev->ifindex; + vg = br_vlan_group(br); + net = dev_net(br->dev); + } + + skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = ifindex; + + switch (cmd) { + case RTM_NEWVLAN: + /* need to find the vlan due to flags/options */ + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) + goto out_kfree; + + flags = v->flags; + if (br_get_pvid(vg) == v->vid) + flags |= BRIDGE_VLAN_INFO_PVID; + break; + case RTM_DELVLAN: + break; + default: + goto out_kfree; + } + + if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); +out_kfree: + kfree_skb(skb); +} + +/* check if v_curr can enter a range ending in range_end */ +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return v_curr->vid - range_end->vid == 1 && + range_end->flags == v_curr->flags && + br_vlan_opts_eq(v_curr, range_end); +} + +static int br_vlan_dump_dev(const struct net_device *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + struct net_bridge_vlan_group *vg; + int idx = 0, s_idx = cb->args[1]; + struct nlmsghdr *nlh = NULL; + struct net_bridge_port *p; + struct br_vlan_msg *bvm; + struct net_bridge *br; + int err = 0; + u16 pvid; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group_rcu(br); + p = NULL; + } else { + p = br_port_get_rcu(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group_rcu(p); + br = p->br; + } + + if (!vg) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = PF_BRIDGE; + bvm->ifindex = dev->ifindex; + pvid = br_get_pvid(vg); + + /* idx must stay at range's beginning until it is filled in */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (idx < s_idx) { + idx++; + continue; + } + + if (!range_start) { + range_start = v; + range_end = v; + continue; + } + + if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { + u16 flags = br_vlan_flags(range_start, pvid); + + if (!br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + flags)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } + range_end = v; + } + + /* err will be 0 and range_start will be set in 3 cases here: + * - first vlan (range_start == range_end) + * - last vlan (range_start == range_end, not in range) + * - last vlan range (range_start != range_end, in range) + */ + if (!err && range_start && + !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, + range_start, br_vlan_flags(range_start, pvid))) + err = -EMSGSIZE; + + cb->args[1] = err ? idx : 0; + + nlmsg_end(skb, nlh); + + return err; +} + +static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx = 0, err = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + + err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack); + if (err < 0) + return err; + + bvm = nlmsg_data(cb->nlh); + + rcu_read_lock(); + if (bvm->ifindex) { + dev = dev_get_by_index_rcu(net, bvm->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + err = br_vlan_dump_dev(dev, skb, cb); + if (err && err != -EMSGSIZE) + goto out_err; + } else { + for_each_netdev_rcu(net, dev) { + if (idx < s_idx) + goto skip; + + err = br_vlan_dump_dev(dev, skb, cb); + if (err == -EMSGSIZE) + break; +skip: + idx++; + } + } + cb->args[0] = idx; + rcu_read_unlock(); + + return skb->len; + +out_err: + rcu_read_unlock(); + + return err; +} + +static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { + [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct bridge_vlan_info) }, + [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, +}; + +static int br_vlan_rtm_process_one(struct net_device *dev, + const struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; + struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; + bool changed = false, skip_processing = false; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + int err = 0, cmdmap = 0; + struct net_bridge *br; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -ENODEV; + br = p->br; + vg = nbp_vlan_group(p); + } + + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, + br_vlan_db_policy, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); + return -EINVAL; + } + memset(&vrange_end, 0, sizeof(vrange_end)); + + vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); + if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); + return -EINVAL; + } + if (!br_vlan_valid_id(vinfo->vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { + vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); + /* validate user-provided flags without RANGE_BEGIN */ + vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; + vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + + /* vinfo_last is the range start, vinfo the range end */ + vinfo_last = vinfo; + vinfo = &vrange_end; + + if (!br_vlan_valid_id(vinfo->vid, extack) || + !br_vlan_valid_range(vinfo, vinfo_last, extack)) + return -EINVAL; + } + + switch (cmd) { + case RTM_NEWVLAN: + cmdmap = RTM_SETLINK; + skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); + break; + case RTM_DELVLAN: + cmdmap = RTM_DELLINK; + break; + } + + if (!skip_processing) { + struct bridge_vlan_info *tmp_last = vinfo_last; + + /* br_process_vlan_info may overwrite vinfo_last */ + err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, + &changed, extack); + + /* notify first if anything changed */ + if (changed) + br_ifinfo_notify(cmdmap, br, p); + + if (err) + return err; + } + + /* deal with options */ + if (cmd == RTM_NEWVLAN) { + struct net_bridge_vlan *range_start, *range_end; + + if (vinfo_last) { + range_start = br_vlan_find(vg, vinfo_last->vid); + range_end = br_vlan_find(vg, vinfo->vid); + } else { + range_start = br_vlan_find(vg, vinfo->vid); + range_end = range_start; + } + + err = br_vlan_process_options(br, p, range_start, range_end, + tb, extack); + } + + return err; +} + +static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + struct nlattr *attr; + int err, vlans = 0; + int rem; + + /* this should validate the header and check for remaining bytes */ + err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, + extack); + if (err < 0) + return err; + + bvm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, bvm->ifindex); + if (!dev) + return -ENODEV; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); + return -EINVAL; + } + + nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { + if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + continue; + + vlans++; + err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, + extack); + if (err) + break; + } + if (!vlans) { + NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); + err = -EINVAL; + } + + return err; +} + +void br_vlan_rtnl_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, + br_vlan_rtm_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, + br_vlan_rtm_process, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, + br_vlan_rtm_process, NULL, 0); +} + +void br_vlan_rtnl_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); + rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); + rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); +} diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c new file mode 100644 index 000000000000..cd2eb194eb98 --- /dev/null +++ b/net/bridge/br_vlan_options.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> + +#include "br_private.h" + +/* check if the options between two vlans are equal */ +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2) +{ + return v1->state == v2->state; +} + +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) +{ + return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, + br_vlan_get_state(v)); +} + +size_t br_vlan_opts_nl_size(void) +{ + return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */ +} + +static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + u8 state, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br; + + ASSERT_RTNL(); + + if (state > BR_STATE_BLOCKING) { + NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state"); + return -EINVAL; + } + + if (br_vlan_is_brentry(v)) + br = v->br; + else + br = v->port->br; + + if (br->stp_enabled == BR_KERNEL_STP) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP"); + return -EBUSY; + } + + if (v->state == state) + return 0; + + if (v->vid == br_get_pvid(vg)) + br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); + *changed = true; + + return 0; +} + +static int br_vlan_process_one_opts(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + int err; + + *changed = false; + if (tb[BRIDGE_VLANDB_ENTRY_STATE]) { + u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]); + + err = br_vlan_modify_state(vg, v, state, changed, extack); + if (err) + return err; + } + + return 0; +} + +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct net_bridge_vlan_group *vg; + int vid, err = 0; + u16 pvid; + + if (p) + vg = nbp_vlan_group(p); + else + vg = br_vlan_group(br); + + if (!range_start || !br_vlan_should_use(range_start)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options"); + return -ENOENT; + } + if (!range_end || !br_vlan_should_use(range_end)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options"); + return -ENOENT; + } + + pvid = br_get_pvid(vg); + for (vid = range_start->vid; vid <= range_end->vid; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (v->vid == pvid || + !br_vlan_can_enter_range(v, curr_end)) { + br_vlan_notify(br, p, curr_start->vid, + curr_end->vid, RTM_NEWVLAN); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + + return err; +} diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 76bd67891fb3..a0116b9503d9 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -62,7 +62,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { - pr_warn("Headroom to small\n"); + pr_warn("Headroom too small\n"); kfree_skb(skb); return -EIO; } diff --git a/net/core/datagram.c b/net/core/datagram.c index da3c24ed129c..a78e7f864c1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -84,7 +84,8 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i /* * Wait for the last received packet to be different from skb */ -int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, const struct sk_buff *skb) { int error; @@ -97,7 +98,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, if (error) goto out_err; - if (READ_ONCE(sk->sk_receive_queue.prev) != skb) + if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ @@ -209,6 +210,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket + * @queue: socket queue from which to receive * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue * @off: an offset in bytes to peek skb from. Returns an offset @@ -241,13 +243,14 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last) { - struct sk_buff_head *queue = &sk->sk_receive_queue; struct sk_buff *skb; unsigned long cpu_flags; /* @@ -278,7 +281,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (READ_ONCE(sk->sk_receive_queue.prev) != *last); + } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; @@ -288,7 +291,9 @@ no_packet: } EXPORT_SYMBOL(__skb_try_recv_datagram); -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err) @@ -299,15 +304,16 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, destructor, off, err, - &last); + skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor, + off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, err, &timeo, last)); + !__skb_wait_for_more_packets(sk, sk_queue, err, + &timeo, last)); return NULL; } @@ -318,7 +324,8 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, { int off = 0; - return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + return __skb_recv_datagram(sk, &sk->sk_receive_queue, + flags | (noblock ? MSG_DONTWAIT : 0), NULL, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index d99f88c58636..38bc35da39f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1764,7 +1764,6 @@ EXPORT_SYMBOL(register_netdevice_notifier); int unregister_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; struct net *net; int err; @@ -1775,16 +1774,9 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net(net) + call_netdevice_unregister_net_notifiers(nb, net); + unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); @@ -1792,6 +1784,42 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace @@ -1812,23 +1840,9 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) int err; rtnl_lock(); - err = raw_notifier_chain_register(&net->netdev_chain, nb); - if (err) - goto unlock; - if (dev_boot_phase) - goto unlock; - - err = call_netdevice_register_net_notifiers(nb, net); - if (err) - goto chain_unregister; - -unlock: + err = __register_netdevice_notifier_net(net, nb, false); rtnl_unlock(); return err; - -chain_unregister: - raw_notifier_chain_unregister(&netdev_chain, nb); - goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1854,17 +1868,53 @@ int unregister_netdevice_notifier_net(struct net *net, int err; rtnl_lock(); - err = raw_notifier_chain_unregister(&net->netdev_chain, nb); - if (err) - goto unlock; + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); - call_netdevice_unregister_net_notifiers(nb, net); +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; -unlock: + rtnl_lock(); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } rtnl_unlock(); return err; } -EXPORT_SYMBOL(unregister_netdevice_notifier_net); +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) { + __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); + __register_netdevice_notifier_net(net, nn->nb, true); + } +} /** * call_netdevice_notifiers_info - call all network notifier blocks @@ -3249,7 +3299,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -5489,9 +5539,29 @@ static void flush_all_backlogs(void) put_online_cpus(); } +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); -static int napi_gro_complete(struct sk_buff *skb) +static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5524,7 +5594,8 @@ static int napi_gro_complete(struct sk_buff *skb) } out: - return netif_receive_skb_internal(skb); + gro_normal_one(napi, skb); + return NET_RX_SUCCESS; } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, @@ -5537,7 +5608,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(skb); + napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } @@ -5639,7 +5710,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct list_head *head) +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; @@ -5655,7 +5726,7 @@ static void gro_flush_oldest(struct list_head *head) * SKB to the chain. */ skb_list_del_init(oldest); - napi_gro_complete(oldest); + napi_gro_complete(napi, oldest); } INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, @@ -5731,7 +5802,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); - napi_gro_complete(pp); + napi_gro_complete(napi, pp); napi->gro_hash[hash].count--; } @@ -5742,7 +5813,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(gro_head); + gro_flush_oldest(napi, gro_head); } else { napi->gro_hash[hash].count++; } @@ -5800,26 +5871,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -6198,8 +6249,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - gro_normal_list(n); - if (n->gro_bitmask) { unsigned long timeout = 0; @@ -6215,6 +6264,9 @@ bool napi_complete_done(struct napi_struct *n, int work_done) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } + + gro_normal_list(n); + if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); @@ -6546,8 +6598,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - gro_normal_list(n); - if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. @@ -6555,6 +6605,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) napi_gro_flush(n, HZ >= 1000); } + gro_normal_list(n); + /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ @@ -8192,6 +8244,22 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL(__dev_set_mtu); +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} + /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device @@ -8208,16 +8276,9 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, if (new_mtu == dev->mtu) return 0; - /* MTU must be positive, and in range */ - if (new_mtu < 0 || new_mtu < dev->min_mtu) { - NL_SET_ERR_MSG(extack, "mtu less than device minimum"); - return -EINVAL; - } - - if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { - NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); - return -EINVAL; - } + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; if (!netif_device_present(dev)) return -ENODEV; @@ -9192,22 +9253,10 @@ static void netdev_unregister_lockdep_key(struct net_device *dev) void netdev_update_lockdep_key(struct net_device *dev) { - struct netdev_queue *queue; - int i; - - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); lockdep_unregister_key(&dev->addr_list_lock_key); - - lockdep_register_key(&dev->qdisc_xmit_lock_key); lockdep_register_key(&dev->addr_list_lock_key); lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); - for (i = 0; i < dev->num_tx_queues; i++) { - queue = netdev_get_tx_queue(dev, i); - - lockdep_set_class(&queue->_xmit_lock, - &dev->qdisc_xmit_lock_key); - } } EXPORT_SYMBOL(netdev_update_lockdep_key); @@ -9284,7 +9333,7 @@ int register_netdevice(struct net_device *dev) /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ - dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->netdev_ops->ndo_udp_tunnel_add) { @@ -9329,8 +9378,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) + if (ret) { + dev->reg_state = NETREG_UNREGISTERED; goto err_uninit; + } dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -9777,6 +9828,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif @@ -9847,6 +9899,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; netdev_unregister_lockdep_key(dev); @@ -10038,6 +10092,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); + /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; diff --git a/net/core/devlink.c b/net/core/devlink.c index 4c63c9a4c09e..ca1df0ec3c97 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4843,22 +4843,100 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); -void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) { - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + struct nlattr *reporter_attr; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + reporter_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_end(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) return; - if (reporter->health_state == state) + err = devlink_nl_health_reporter_fill(msg, reporter->devlink, + reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); return; + } - reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(reporter->devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) +{ + reporter->recovery_count++; + reporter->last_recovery_ts = jiffies; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, @@ -4876,9 +4954,9 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, if (err) return err; - reporter->recovery_count++; + devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); return 0; } @@ -4945,6 +5023,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->error_count++; prev_health_state = reporter->health_state; reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ if (reporter->auto_recover && @@ -5027,68 +5106,23 @@ devlink_health_reporter_put(struct devlink_health_reporter *reporter) refcount_dec(&reporter->refcount); } -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state state) { - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - reporter_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, - reporter->dump_real_ts, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; + if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && + state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + return; - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; + if (reporter->health_state == state) + return; -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + reporter->health_state = state; + trace_devlink_health_reporter_state_update(reporter->devlink, + reporter->ops->name, state); + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); } +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) @@ -6406,7 +6440,7 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; } -#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30) +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { @@ -7563,7 +7597,7 @@ void devlink_region_destroy(struct devlink_region *region) EXPORT_SYMBOL_GPL(devlink_region_destroy); /** - * devlink_region_shapshot_id_get - get snapshot ID + * devlink_region_snapshot_id_get - get snapshot ID * * This callback should be called when adding a new snapshot, * Driver should use the same id for multiple snapshots taken @@ -7571,7 +7605,7 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy); * * @devlink: devlink */ -u32 devlink_region_shapshot_id_get(struct devlink *devlink) +u32 devlink_region_snapshot_id_get(struct devlink *devlink) { u32 id; @@ -7581,7 +7615,7 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink) return id; } -EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** * devlink_region_snapshot_create - create a new snapshot @@ -7674,6 +7708,9 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(NON_ROUTABLE, DROP), + DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), + DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -7686,6 +7723,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), + DEVLINK_TRAP_GROUP(TUNNEL_DROPS), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) diff --git a/net/core/filter.c b/net/core/filter.c index 42fd17c48c5f..792e3744b915 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2231,10 +2231,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; - offset += len; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2346,7 +2346,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; - u32 new, i = 0, l, space, copy = 0, offset = 0; + u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; @@ -2356,11 +2356,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2415,6 +2415,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); + sg_unmark_end(&rsge); sk_msg_iter_next(msg, end); } @@ -2506,7 +2507,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i) BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { - u32 i = 0, l, space, offset = 0; + u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; @@ -2516,11 +2517,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -3458,58 +3459,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) -{ - struct xdp_frame *xdpf; - int err, sent; - - if (!dev->netdev_ops->ndo_xdp_xmit) { - return -EOPNOTSUPP; - } - - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); - if (unlikely(err)) - return err; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); - if (sent <= 0) - return sent; - return 0; -} - -static noinline int -xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) -{ - struct net_device *fwd; - u32 index = ri->tgt_index; - int err; - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->tgt_index = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; -} - static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp) { @@ -3522,18 +3471,18 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_XSKMAP: return __xsk_map_redirect(fwd, xdp); default: - break; + return -EBADRQC; } return 0; } -void xdp_do_flush_map(void) +void xdp_do_flush(void) { - __dev_map_flush(); + __dev_flush(); __cpu_map_flush(); __xsk_map_flush(); } -EXPORT_SYMBOL_GPL(xdp_do_flush_map); +EXPORT_SYMBOL_GPL(xdp_do_flush); static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { @@ -3568,10 +3517,11 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } -static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map, - struct bpf_redirect_info *ri) +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->tgt_index; void *fwd = ri->tgt_value; int err; @@ -3580,7 +3530,18 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + if (unlikely(!map)) { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = dev_xdp_enqueue(fwd, xdp, dev); + } else { + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + } + if (unlikely(err)) goto err; @@ -3590,18 +3551,6 @@ err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } - -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - - if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - - return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); -} EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, @@ -5277,8 +5226,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5315,8 +5263,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5383,7 +5330,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (!sock_flag(sk, SOCK_RCU_FREE)) + /* Only full sockets have sk->sk_flags. */ + if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)) sock_gen_put(sk); return 0; } @@ -5935,7 +5883,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -static const struct bpf_func_proto * +const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -5975,6 +5923,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } @@ -7607,21 +7557,21 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, type): - BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - *target_size = 2; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); break; case offsetof(struct bpf_sock, protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); - *target_size = 1; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); break; case offsetof(struct bpf_sock, src_ip4): @@ -7903,20 +7853,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_addr, type): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): @@ -8835,11 +8778,11 @@ sk_reuseport_is_valid_access(int off, int size, skb, \ SKB_FIELD) -#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ - struct sock, \ - sk, \ - SK_FIELD, BPF_SIZE, EXTRA_OFF) +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -8863,16 +8806,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, - BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); - /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian - * aware. No further narrowing or masking is needed. - */ - *target_size = 1; + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2dbbb030fbed..f560b4902060 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -233,7 +233,7 @@ static bool icmp_has_id(u8 type) * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet - * @toff: offset to extract at + * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 920784a9b7ff..789a73aa7bd8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3290,6 +3290,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } + (*pos)++; return NULL; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 39402840025e..757cc1d084e7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -211,16 +211,10 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc - * is set to true, thus the caller knows that the new id must be notified via - * rtnl. - */ -static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) +/* Must be called from RCU-critical section or with nsid_lock held */ +static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); - bool alloc_it = *alloc; - - *alloc = false; /* Magic value for id 0. */ if (id == NET_ID_ZERO) @@ -228,23 +222,9 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) if (id > 0) return id; - if (alloc_it) { - id = alloc_netid(net, peer, -1); - *alloc = true; - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; - } - return NETNSA_NSID_NOT_ASSIGNED; } -/* should be called with nsid_lock held */ -static int __peernet2id(struct net *net, struct net *peer) -{ - bool no = false; - - return __peernet2id_alloc(net, peer, &no); -} - static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will @@ -252,38 +232,50 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { - bool alloc = false, alive = false; int id; if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); - /* - * When peer is obtained from RCU lists, we may race with + + spin_lock(&net->nsid_lock); + id = __peernet2id(net, peer); + if (id >= 0) { + spin_unlock(&net->nsid_lock); + return id; + } + + /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ - if (maybe_get_net(peer)) - alive = alloc = true; - id = __peernet2id_alloc(net, peer, &alloc); - spin_unlock_bh(&net->nsid_lock); - if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); - if (alive) - put_net(peer); + if (!maybe_get_net(peer)) { + spin_unlock(&net->nsid_lock); + return NETNSA_NSID_NOT_ASSIGNED; + } + + id = alloc_netid(net, peer, -1); + spin_unlock(&net->nsid_lock); + + put_net(peer); + if (id < 0) + return NETNSA_NSID_NOT_ASSIGNED; + + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); + return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id(const struct net *net, struct net *peer) { int id; - spin_lock_bh(&net->nsid_lock); + rcu_read_lock(); id = __peernet2id(net, peer); - spin_unlock_bh(&net->nsid_lock); + rcu_read_unlock(); + return id; } EXPORT_SYMBOL(peernet2id); @@ -291,12 +283,12 @@ EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ -bool peernet_has_id(struct net *net, struct net *peer) +bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } -struct net *get_net_ns_by_id(struct net *net, int id) +struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; @@ -528,20 +520,20 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -754,9 +746,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -765,7 +757,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); @@ -950,6 +942,7 @@ struct rtnl_net_dump_cb { int s_idx; }; +/* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; @@ -1034,19 +1027,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) goto end; } - spin_lock_bh(&net_cb.tgt_net->nsid_lock); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net) && - !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); - err = -EAGAIN; - goto end; - } + rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net)) - spin_unlock_bh(&net_cb.ref_net->nsid_lock); - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + rcu_read_unlock(); cb->args[0] = net_cb.idx; end: diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 294bfcf0ce0e..890be1b4877e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -323,6 +323,10 @@ struct pktgen_dev { struct in6_addr max_in6_daddr; struct in6_addr min_in6_saddr; struct in6_addr max_in6_saddr; + u64 max_in6_h; + u64 max_in6_l; + u64 min_in6_h; + u64 min_in6_l; /* If we're doing ranges, random or incremental, then this * defines the min/max for those ranges. @@ -1355,6 +1359,59 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } + if (!strcmp(name, "src6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->min_in6_saddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_saddr); + + memcpy(&pkt_dev->min_in6_h, pkt_dev->min_in6_saddr.s6_addr, 8); + memcpy(&pkt_dev->min_in6_l, pkt_dev->min_in6_saddr.s6_addr + 8, 8); + pkt_dev->min_in6_h = be64_to_cpu(pkt_dev->min_in6_h); + pkt_dev->min_in6_l = be64_to_cpu(pkt_dev->min_in6_l); + + pkt_dev->cur_in6_saddr = pkt_dev->min_in6_saddr; + if (debug) + pr_debug("src6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6_min=%s", buf); + return count; + } + if (!strcmp(name, "src6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->max_in6_saddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_saddr); + + memcpy(&pkt_dev->max_in6_h, pkt_dev->max_in6_saddr.s6_addr, 8); + memcpy(&pkt_dev->max_in6_l, pkt_dev->max_in6_saddr.s6_addr + 8, 8); + pkt_dev->max_in6_h = be64_to_cpu(pkt_dev->max_in6_h); + pkt_dev->max_in6_l = be64_to_cpu(pkt_dev->max_in6_l); + + if (debug) + pr_debug("src6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6_max=%s", buf); + return count; + } if (!strcmp(name, "src6")) { len = strn_len(&user_buffer[i], sizeof(buf) - 1); if (len < 0) @@ -2286,6 +2343,45 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; } +/* generate ipv6 source addr */ +static void set_src_in6_addr(struct pktgen_dev *pkt_dev) +{ + u64 min6, max6, rand, i; + struct in6_addr addr6; + __be64 addr_l, *t; + + min6 = pkt_dev->min_in6_l; + max6 = pkt_dev->max_in6_l; + + /* only generate source address in least significant 64 bits range + * most significant 64 bits must be equal + */ + if (pkt_dev->max_in6_h != pkt_dev->min_in6_h || min6 >= max6) + return; + + addr6 = pkt_dev->min_in6_saddr; + t = (__be64 *)addr6.s6_addr + 1; + + if (pkt_dev->flags & F_IPSRC_RND) { + do { + prandom_bytes(&rand, sizeof(rand)); + rand = rand % (max6 - min6) + min6; + addr_l = cpu_to_be64(rand); + memcpy(t, &addr_l, 8); + } while (ipv6_addr_loopback(&addr6) || + ipv6_addr_v4mapped(&addr6) || + ipv6_addr_is_multicast(&addr6)); + } else { + addr6 = pkt_dev->cur_in6_saddr; + i = be64_to_cpu(*t); + if (++i > max6) + i = min6; + addr_l = cpu_to_be64(i); + memcpy(t, &addr_l, 8); + } + pkt_dev->cur_in6_saddr = addr6; +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2454,6 +2550,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } } else { /* IPV6 * */ + set_src_in6_addr(pkt_dev); + if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) { int i; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 20bc406f3871..cdad6ed532c4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3053,8 +3053,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + int err; + + err = dev_validate_mtu(dev, mtu, extack); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + dev->mtu = mtu; + } if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44b0894d8ae1..864cb9e9622f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,6 +68,7 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> +#include <net/mptcp.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -3638,6 +3639,97 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) return head_frag; } +struct sk_buff *skb_segment_list(struct sk_buff *skb, + netdev_features_t features, + unsigned int offset) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb; + + skb_push(skb, -skb_network_offset(skb) + offset); + + skb_shinfo(skb)->frag_list = NULL; + + do { + nskb = list_skb; + list_skb = list_skb->next; + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + tail = nskb; + + delta_len += nskb->len; + delta_truesize += nskb->truesize; + + skb_push(nskb, -skb_network_offset(nskb) + offset); + + __copy_skb_header(nskb, skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + offset + tnl_hlen); + + if (skb_needs_linearize(nskb, features) && + __skb_linearize(nskb)) + goto err_linearize; + + } while (list_skb); + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto err_linearize; + + skb_get(skb); + + return skb; + +err_linearize: + kfree_skb_list(skb->next); + skb->next = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(skb_segment_list); + +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive_list); + /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment @@ -4109,6 +4201,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4123,6 +4218,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) skb_ext_type_len[TC_SKB_EXT] + #endif +#if IS_ENABLED(CONFIG_MPTCP) + skb_ext_type_len[SKB_EXT_MPTCP] + +#endif 0; } @@ -5980,7 +6078,14 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } -static struct skb_ext *skb_ext_alloc(void) +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(void) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); @@ -6021,6 +6126,30 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, } /** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} + +/** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer * @id: extension to allocate space for @@ -6053,7 +6182,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = skb_ext_alloc(); + new = __skb_ext_alloc(); if (!new) return NULL; } diff --git a/net/core/sock.c b/net/core/sock.c index 8459ad579f73..a4c8fac781ff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2786,7 +2786,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index eb114ee419b6..8998e356f423 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -241,8 +241,11 @@ static void sock_map_free(struct bpf_map *map) struct sock *sk; sk = xchg(psk, NULL); - if (sk) + if (sk) { + lock_sock(sk); sock_map_unref(sk, psk); + release_sock(sk); + } } raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); @@ -862,7 +865,9 @@ static void sock_hash_free(struct bpf_map *map) raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); + lock_sock(elem->sk); sock_map_unref(elem->sk, elem); + release_sock(elem->sk); } raw_spin_unlock_bh(&bucket->lock); } diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index f19f179538b9..91e9f2223c39 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -107,7 +107,6 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) if (!more_reuse) return NULL; - more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; diff --git a/net/core/utils.c b/net/core/utils.c index 6b6e51db9f3b..1f31a39236d5 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -438,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace4); +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr) @@ -449,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index e19a92a62e14..0a46ea3bddd5 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -670,7 +670,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c66abbed4daf..e7c30b472034 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -614,6 +614,32 @@ static int dsa_port_parse_dsa(struct dsa_port *dp) return 0; } +static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, + struct net_device *master) +{ + enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; + struct dsa_switch *mds, *ds = dp->ds; + unsigned int mdp_upstream; + struct dsa_port *mdp; + + /* It is possible to stack DSA switches onto one another when that + * happens the switch driver may want to know if its tagging protocol + * is going to work in such a configuration. + */ + if (dsa_slave_dev_check(master)) { + mdp = dsa_slave_to_port(master); + mds = mdp->ds; + mdp_upstream = dsa_upstream_port(mds, mdp->index); + tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, + DSA_TAG_PROTO_NONE); + } + + /* If the master device is not itself a DSA slave in a disjoint DSA + * tree, then return immediately. + */ + return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); +} + static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; @@ -621,20 +647,21 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); + tag_protocol = dsa_get_tag_protocol(dp, master); tag_ops = dsa_tag_driver_get(tag_protocol); if (IS_ERR(tag_ops)) { if (PTR_ERR(tag_ops) == -ENOPROTOOPT) return -EPROBE_DEFER; dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; return PTR_ERR(tag_ops); } + dp->master = master; dp->type = DSA_PORT_TYPE_CPU; dp->filter = tag_ops->filter; dp->rcv = tag_ops->rcv; dp->tag_ops = tag_ops; - dp->master = master; dp->dst = dst; return 0; @@ -822,6 +849,19 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) return dsa_switch_parse_ports(ds, cd); } +static void dsa_switch_release_ports(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp, *next; + + list_for_each_entry_safe(dp, next, &dst->ports, list) { + if (dp->ds != ds) + continue; + list_del(&dp->list); + kfree(dp); + } +} + static int dsa_switch_probe(struct dsa_switch *ds) { struct dsa_switch_tree *dst; @@ -838,12 +878,17 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; - if (np) + if (np) { err = dsa_switch_parse_of(ds, np); - else if (pdata) + if (err) + dsa_switch_release_ports(ds); + } else if (pdata) { err = dsa_switch_parse(ds, pdata); - else + if (err) + dsa_switch_release_ports(ds); + } else { err = -ENODEV; + } if (err) return err; @@ -851,8 +896,10 @@ static int dsa_switch_probe(struct dsa_switch *ds) dst = ds->dst; dsa_tree_get(dst); err = dsa_tree_setup(dst); - if (err) + if (err) { + dsa_switch_release_ports(ds); dsa_tree_put(dst); + } return err; } @@ -873,15 +920,9 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *dp, *next; dsa_tree_teardown(dst); - - list_for_each_entry_safe(dp, next, &dst->ports, list) { - list_del(&dp->list); - kfree(dp); - } - + dsa_switch_release_ports(ds); dsa_tree_put(dst); } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 09ea2fd78c74..a7662e7a691d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -157,13 +157,12 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); +bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev); - static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); diff --git a/net/dsa/port.c b/net/dsa/port.c index ffb5601f7ed6..774facb8d547 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -599,6 +599,7 @@ static int dsa_port_phylink_register(struct dsa_port *dp) dp->pl_config.dev = ds->dev; dp->pl_config.type = PHYLINK_DEV; + dp->pl_config.pcs_poll = ds->pcs_poll; dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 78ffc87dc25e..088c886e609e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,8 +22,6 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(const struct net_device *dev); - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -116,9 +114,6 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - phylink_stop(dp->pl); dsa_port_disable(dp); @@ -518,7 +513,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); - DSA_SKB_CB(skb)->deferred_xmit = false; DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the @@ -531,39 +525,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ nskb = p->xmit(skb, dev); if (!nskb) { - if (!DSA_SKB_CB(skb)->deferred_xmit) - kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - DSA_SKB_CB(skb)->deferred_xmit = true; - - skb_queue_tail(&dp->xmit_queue, skb); - schedule_work(&dp->xmit_work); - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_defer_xmit); - -static void dsa_port_xmit_work(struct work_struct *work) -{ - struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work); - struct dsa_switch *ds = dp->ds; - struct sk_buff *skb; - - if (unlikely(!ds->ops->port_deferred_xmit)) - return; - - while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL) - ds->ops->port_deferred_xmit(ds, dp->index, skb); -} - /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -1367,9 +1335,6 @@ int dsa_slave_suspend(struct net_device *slave_dev) if (!netif_running(slave_dev)) return 0; - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - netif_device_detach(slave_dev); rtnl_lock(); @@ -1455,8 +1420,6 @@ int dsa_slave_create(struct dsa_port *port) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - INIT_WORK(&port->xmit_work, dsa_port_xmit_work); - skb_queue_head_init(&port->xmit_queue); p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; @@ -1508,7 +1471,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(const struct net_device *dev) +bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index b678160bbd66..408d4af390a0 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -104,7 +104,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops gswip_netdev_ops = { - .name = "gwsip", + .name = "gswip", .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index c95885215525..c8a128c9e5e0 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -33,9 +33,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - if (skb_cow_head(skb, 0) < 0) return NULL; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 63ef2a14c934..5366ea430349 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -83,12 +83,24 @@ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) return false; } +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, + struct sk_buff *skb) +{ + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + + return NULL; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct dsa_switch *ds = dp->ds; - u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); @@ -97,7 +109,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, * is the .port_deferred_xmit driver callback. */ if (unlikely(sja1105_is_link_local(skb))) - return dsa_defer_xmit(skb, netdev); + return sja1105_defer_xmit(dp->priv, skb); /* If we are under a vlan_filtering bridge, IP termination on * switch ports based on 802.1Q tags is simply too brittle to diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9040fe55e0f5..c8b903302ff2 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -335,22 +335,6 @@ int eth_mac_addr(struct net_device *dev, void *p) } EXPORT_SYMBOL(eth_mac_addr); -/** - * eth_change_mtu - set new MTU size - * @dev: network device - * @new_mtu: new Maximum Transfer Unit - * - * Allow changing MTU size. Needs to be overridden for devices - * supporting jumbo frames. - */ -int eth_change_mtu(struct net_device *dev, int new_mtu) -{ - netdev_warn(dev, "%s is deprecated\n", __func__); - dev->mtu = new_mtu; - return 0; -} -EXPORT_SYMBOL(eth_change_mtu); - int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index e621b1694d2f..c7b8956c3827 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -59,6 +59,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", + [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", }; const char diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 4ca96c7b86b3..86b79f9bc08d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -319,9 +319,10 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) rtnl_unlock(); if (ret < 0) goto err_cleanup; - reply_len = ops->reply_size(req_info, reply_data); + ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; + reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); @@ -472,8 +473,8 @@ static int ethnl_default_start(struct netlink_callback *cb) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { - kfree(req_info); - return -ENOMEM; + ret = -ENOMEM; + goto free_req_info; } ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops, @@ -487,7 +488,7 @@ static int ethnl_default_start(struct netlink_callback *cb) req_info->dev = NULL; } if (ret < 0) - return ret; + goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; @@ -496,6 +497,13 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->pos_idx = 0; return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; } /* default ->done() handler for GET requests */ @@ -548,9 +556,10 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, ret = ops->prepare_data(req_info, reply_data, NULL); if (ret < 0) goto err_cleanup; - reply_len = ops->reply_size(req_info, reply_data); + ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; + reply_len = ret; ret = -ENOMEM; skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 9f2243329015..82a059c13c1c 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -239,6 +239,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return -EINVAL; } } + return 0; } ret = ethnl_ops_begin(dev); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 27dc65d7de67..364ea2cc028e 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -35,7 +35,6 @@ static bool seq_nr_after(u16 a, u16 b) } #define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index d40de84a637f..754d84b217f0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -191,7 +191,7 @@ void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else -static inline void void hsr_debugfs_rename(struct net_device *dev) +static inline void hsr_debugfs_rename(struct net_device *dev) { } static inline void hsr_debugfs_init(struct hsr_priv *priv, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fc816b187170..f96bd489b362 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD If unsure, say N. +config INET_ESPINTCP + bool "IP: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET_ESP + select STREAM_PARSER + select NET_SOCK_MSG + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv4 sockets. + + If unsure, say N. + config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d57ecfaf89d4..9d97bace13c8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o + +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o +endif diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c new file mode 100644 index 000000000000..574972bc7299 --- /dev/null +++ b/net/ipv4/bpf_tcp_ca.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/tcp.h> + +static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, init), + offsetof(struct tcp_congestion_ops, release), + offsetof(struct tcp_congestion_ops, set_state), + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), + offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), +}; + +static u32 unsupported_ops[] = { + offsetof(struct tcp_congestion_ops, get_info), +}; + +static const struct btf_type *tcp_sock_type; +static u32 tcp_sock_id, sock_id; + +static int bpf_tcp_ca_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + + return 0; +} + +static bool is_optional(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { + if (member_offset == optional_ops[i]) + return true; + } + + return false; +} + +static bool is_unsupported(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { + if (member_offset == unsupported_ops[i]) + return true; + } + + return false; +} + +extern struct btf *btf_vmlinux; + +static bool bpf_tcp_ca_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + if (!btf_ctx_access(off, size, type, prog, info)) + return false; + + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + /* promote it to tcp_sock */ + info->btf_id = tcp_sock_id; + + return true; +} + +static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + size_t end; + + if (atype == BPF_READ) + return btf_struct_access(log, t, off, size, atype, next_btf_id); + + if (t != tcp_sock_type) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); + break; + case offsetof(struct inet_connection_sock, icsk_ack.pending): + end = offsetofend(struct inet_connection_sock, + icsk_ack.pending); + break; + case offsetof(struct tcp_sock, snd_cwnd): + end = offsetofend(struct tcp_sock, snd_cwnd); + break; + case offsetof(struct tcp_sock, snd_cwnd_cnt): + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); + break; + case offsetof(struct tcp_sock, snd_ssthresh): + end = offsetofend(struct tcp_sock, snd_ssthresh); + break; + case offsetof(struct tcp_sock, ecn_flags): + end = offsetofend(struct tcp_sock, ecn_flags); + break; + default: + bpf_log(log, "no write support to tcp_sock at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + +BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) +{ + /* bpf_tcp_ca prog cannot have NULL tp */ + __tcp_send_ack((struct sock *)tp, rcv_nxt); + return 0; +} + +static const struct bpf_func_proto bpf_tcp_send_ack_proto = { + .func = bpf_tcp_send_ack, + .gpl_only = false, + /* In case we want to report error later */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_ANYTHING, + .btf_id = &tcp_sock_id, +}; + +static const struct bpf_func_proto * +bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_tcp_send_ack: + return &bpf_tcp_send_ack_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { + .get_func_proto = bpf_tcp_ca_get_func_proto, + .is_valid_access = bpf_tcp_ca_is_valid_access, + .btf_struct_access = bpf_tcp_ca_btf_struct_access, +}; + +static int bpf_tcp_ca_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct tcp_congestion_ops *utcp_ca; + struct tcp_congestion_ops *tcp_ca; + size_t tcp_ca_name_len; + int prog_fd; + u32 moff; + + utcp_ca = (const struct tcp_congestion_ops *)udata; + tcp_ca = (struct tcp_congestion_ops *)kdata; + + moff = btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct tcp_congestion_ops, flags): + if (utcp_ca->flags & ~TCP_CONG_MASK) + return -EINVAL; + tcp_ca->flags = utcp_ca->flags; + return 1; + case offsetof(struct tcp_congestion_ops, name): + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); + if (!tcp_ca_name_len || + tcp_ca_name_len == sizeof(utcp_ca->name)) + return -EINVAL; + if (tcp_ca_find(utcp_ca->name)) + return -EEXIST; + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); + return 1; + } + + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) + return 0; + + /* Ensure bpf_prog is provided for compulsory func ptr */ + prog_fd = (int)(*(unsigned long *)(udata + moff)); + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) + return -EINVAL; + + return 0; +} + +static int bpf_tcp_ca_check_member(const struct btf_type *t, + const struct btf_member *member) +{ + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + return -ENOTSUPP; + return 0; +} + +static int bpf_tcp_ca_reg(void *kdata) +{ + return tcp_register_congestion_control(kdata); +} + +static void bpf_tcp_ca_unreg(void *kdata) +{ + tcp_unregister_congestion_control(kdata); +} + +/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + +struct bpf_struct_ops bpf_tcp_congestion_ops = { + .verifier_ops = &bpf_tcp_ca_verifier_ops, + .reg = bpf_tcp_ca_reg, + .unreg = bpf_tcp_ca_unreg, + .check_member = bpf_tcp_ca_check_member, + .init_member = bpf_tcp_ca_init_member, + .init = bpf_tcp_ca_init, + .name = "tcp_congestion_ops", +}; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5c967764041f..103c7d599a3c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/udp.h> +#include <net/tcp.h> +#include <net/espintcp.h> #include <linux/highmem.h> @@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, + dport, x->props.saddr.a4, sport, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -225,45 +357,100 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) { - int encap_type; struct udphdr *uh; __be32 *udpdata32; - __be16 sport, dport; - struct xfrm_encap_tmpl *encap = x->encap; - struct ip_esp_hdr *esph = esp->esph; unsigned int len; - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - len = skb->len + esp->tailen - skb_transport_offset(skb); - if (len + sizeof(struct iphdr) >= IP_MAX_MTU) - return -EMSGSIZE; + if (len + sizeof(struct iphdr) > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); - uh = (struct udphdr *)esph; + uh = (struct udphdr *)esp->esph; uh->source = sport; uh->dest = dport; uh->len = htons(len); uh->check = 0; + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +#ifdef CONFIG_INET_ESPINTCP +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); + esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + case TCP_ENCAP_ESPINTCP: + esph = esp_output_tcp_encap(x, skb, esp); break; } - *skb_mac_header(skb) = IPPROTO_UDP; + if (IS_ERR(esph)) + return PTR_ERR(esph); + esp->esph = esph; return 0; @@ -279,9 +466,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct sk_buff *trailer; int tailen = esp->tailen; - /* this is non-NULL only with UDP Encapsulation */ + /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { - int err = esp_output_udp_encap(x, skb, esp); + int err = esp_output_encap(x, skb, esp); if (err < 0) return err; @@ -474,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -600,7 +790,23 @@ int esp_input_done2(struct sk_buff *skb, int err) if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; + struct tcphdr *th = (void *)(skb_network_header(skb) + ihl); struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); + __be16 source; + + switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } /* * 1) if the NAT-T peer's IP or port changed then @@ -609,11 +815,11 @@ int esp_input_done2(struct sk_buff *skb, int err) * SRC ports. */ if (iph->saddr != x->props.saddr.a4 || - uh->source != encap->encap_sport) { + source != encap->encap_sport) { xfrm_address_t ipaddr; ipaddr.a4 = iph->saddr; - km_new_mapping(x, &ipaddr, uh->source); + km_new_mapping(x, &ipaddr, source); /* XXX: perhaps add an extra * policy check here, to see @@ -988,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 0e4a7cf6bc87..e2e219c7854a 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -57,6 +57,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a68b5e21ec51..c092e9a55790 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -16,6 +16,9 @@ struct fib_alias { u8 fa_slen; u32 tb_id; s16 fa_default; + u8 offload:1, + trap:1, + unused:6; struct rcu_head rcu; }; @@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, - u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, - unsigned int); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f1888c683426..a803cdd9400a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { + struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; @@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, if (!skb) goto errout; - err = fib_dump_info(skb, info->portid, seq, event, tb_id, - fa->fa_type, key, dst_len, - fa->fa_tos, fa->fa_info, nlm_flags); + fri.fi = fa->fa_info; + fri.tb_id = tb_id; + fri.dst = key; + fri.dst_len = dst_len; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; + err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1725,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, - struct fib_info *fi, unsigned int flags) + struct fib_rt_info *fri, unsigned int flags) { - unsigned int nhs = fib_info_num_path(fi); + unsigned int nhs = fib_info_num_path(fri->fi); + struct fib_info *fi = fri->fi; + u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1738,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = dst_len; + rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; - rtm->rtm_tos = tos; + rtm->rtm_tos = fri->tos; if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; - rtm->rtm_type = type; + rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_in_addr(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) @@ -1795,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } + if (fri->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (fri->trap) + rtm->rtm_flags |= RTM_F_TRAP; + nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b92a42433a7d..ff0c24371e33 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1012,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, return NULL; } +static struct fib_alias * +fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) +{ + u8 slen = KEYLENGTH - fri->dst_len; + struct key_vector *l, *tp; + struct fib_table *tb; + struct fib_alias *fa; + struct trie *t; + + tb = fib_get_table(net, fri->tb_id); + if (!tb) + return NULL; + + t = (struct trie *)tb->tb_data; + l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); + if (!l) + return NULL; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && + fa->fa_tos == fri->tos && fa->fa_info == fri->fi && + fa->fa_type == fri->type) + return fa; + } + + return NULL; +} + +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) +{ + struct fib_alias *fa_match; + + rcu_read_lock(); + + fa_match = fib_find_matching_alias(net, fri); + if (!fa_match) + goto out; + + fa_match->offload = fri->offload; + fa_match->trap = fri->trap; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); + static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) @@ -1220,24 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; + + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, - tb->tb_id, true) == fa) { + tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); - if (err) + if (err) { + hlist_replace_rcu(&new_fa->fa_list, + &fa->fa_list); goto out_free_new_fa; + } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1275,6 +1326,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); @@ -2191,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_routes) { if (!s_fa) { + struct fib_rt_info fri; + + fri.fi = fi; + fri.tb_id = tb->tb_id; + fri.dst = xkey; + fri.dst_len = KEYLENGTH - fa->fa_slen; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, fi, flags); + RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } @@ -2238,6 +2297,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, int count = cb->args[2]; t_key key = cb->args[3]; + /* First time here, count and key are both always 0. Count > 0 + * and key == 0 means the dump has wrapped around and we are done. + */ + if (count && !key) + return skb->len; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 30fa771d382a..dcc79ff54b41 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -662,8 +662,8 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), }, [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18c0d5bffe12..a4db79b1b643 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -610,12 +610,6 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -#if IS_ENABLED(CONFIG_IPV6) -#define AF_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define AF_INET_FAMILY(fam) true -#endif - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -770,6 +764,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->clone) + icsk->icsk_ulp_ops->clone(req, newsk, priority); +} + /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone @@ -810,6 +816,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + inet_clone_ulp(req, newsk, priority); + security_inet_csk_clone(newsk, req); } return newsk; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 14db1e0b8a6e..d84819893db9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { + struct sk_buff *segs, *nskb; netdev_features_t features; - struct sk_buff *segs; int ret = 0; /* common case: seglen is <= mtu @@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, consume_skb(skb); - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, if (err && ret == 0) ret = err; - segs = nskb; - } while (segs); + } return ret; } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0fe2a5d3e258..74e1d964a615 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1236,10 +1236,8 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; - if (tunnel->collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (tunnel->collect_md) netif_keep_dst(dev); - } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e90b600c7a25..37cddd18f282 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -187,8 +187,17 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int mtu; if (!dst) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); } dst_hold(dst); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 214154b47d56..f1f78a742b36 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -384,10 +384,11 @@ next: ; return 1; } -static inline int check_target(struct arpt_entry *e, const char *name) +static int check_target(struct arpt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = arpt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -399,8 +400,9 @@ static inline int check_target(struct arpt_entry *e, const char *name) return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } -static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, +static int +find_check_entry(struct arpt_entry *e, struct net *net, const char *name, + unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; @@ -419,7 +421,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; return 0; @@ -494,12 +496,13 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return 0; } -static inline void cleanup_entry(struct arpt_entry *e) +static void cleanup_entry(struct arpt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; t = arpt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_ARP; @@ -512,7 +515,9 @@ static inline void cleanup_entry(struct arpt_entry *e) /* Checks and translates the user-supplied table segment (held in * newinfo). */ -static int translate_table(struct xt_table_info *newinfo, void *entry0, +static int translate_table(struct net *net, + struct xt_table_info *newinfo, + void *entry0, const struct arpt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; @@ -569,7 +574,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, repl->name, repl->size, + ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; @@ -580,7 +585,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; - cleanup_entry(iter); + cleanup_entry(iter, net); } return ret; } @@ -923,7 +928,7 @@ static int __do_replace(struct net *net, const char *name, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, @@ -974,7 +979,7 @@ static int do_replace(struct net *net, const void __user *user, goto free_newinfo; } - ret = translate_table(newinfo, loc_cpu_entry, &tmp); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -986,7 +991,7 @@ static int do_replace(struct net *net, const void __user *user, free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1149,7 +1154,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, } } -static int translate_compat_table(struct xt_table_info **pinfo, +static int translate_compat_table(struct net *net, + struct xt_table_info **pinfo, void **pentry0, const struct compat_arpt_replace *compatr) { @@ -1217,7 +1223,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; - ret = translate_table(newinfo, entry1, &repl); + ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; @@ -1270,7 +1276,7 @@ static int compat_do_replace(struct net *net, void __user *user, goto free_newinfo; } - ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp); + ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1282,7 +1288,7 @@ static int compat_do_replace(struct net *net, void __user *user, free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) - cleanup_entry(iter); + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1509,7 +1515,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -static void __arpt_unregister_table(struct xt_table *table) +static void __arpt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -1521,7 +1527,7 @@ static void __arpt_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); @@ -1546,7 +1552,7 @@ int arpt_register_table(struct net *net, loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(newinfo, loc_cpu_entry, repl); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -1561,7 +1567,7 @@ int arpt_register_table(struct net *net, ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret != 0) { - __arpt_unregister_table(new_table); + __arpt_unregister_table(net, new_table); *res = NULL; } @@ -1576,7 +1582,7 @@ void arpt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); - __arpt_unregister_table(table); + __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 511eaa94e2d1..d072c326dd64 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -321,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh) static size_t nh_nlmsg_size(struct nexthop *nh) { - size_t sz = nla_total_size(4); /* NHA_ID */ + size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); + + sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cc90243ccf76..2580303249e2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), + SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), + SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 87e979f2b74a..d5c57b3f77d5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -271,6 +271,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } + (*pos)++; return NULL; } @@ -3223,16 +3224,41 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + struct fib_rt_info fri; + if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } + fri.fi = res.fi; + fri.tb_id = table_id; + fri.dst = res.prefix; + fri.dst_len = res.prefixlen; + fri.tos = fl4.flowi4_tos; + fri.type = rt->rt_type; + fri.offload = 0; + fri.trap = 0; + if (res.fa_head) { + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { + u8 slen = 32 - fri.dst_len; + + if (fa->fa_slen == slen && + fa->tb_id == fri.tb_id && + fa->fa_tos == fri.tos && + fa->fa_info == res.fi && + fa->fa_type == fri.type) { + fri.offload = fa->offload; + fri.trap = fa->trap; + break; + } + } + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWROUTE, table_id, - rt->rt_type, res.prefix, res.prefixlen, - fl4.flowi4_tos, res.fi, 0); + nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f09fbc85b108..484485ae74c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> +#include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> @@ -690,8 +691,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -925,7 +926,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; @@ -2524,6 +2525,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); @@ -2614,7 +2616,6 @@ int tcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; - tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; @@ -3336,6 +3337,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 0; } @@ -3390,6 +3392,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); return stats; } @@ -4021,4 +4024,5 @@ void __init tcp_init(void) tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); + mptcp_init(); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a6545ef0d27b..6c4d79baff26 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -779,8 +779,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) * bandwidth sample. Delivered is in packets and interval_us in uS and * ratio will be <<1 for most connections. So delivered is first scaled. */ - bw = (u64)rs->delivered * BW_UNIT; - do_div(bw, rs->interval_us); + bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); /* If this sample is application-limited, it is likely to have a very * low delivered count that represents application behavior rather than diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index e38705165ac9..8a01428f80c1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -121,14 +121,14 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct sk_psock *psock; int copied, ret; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); msg_bytes_ready: copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); @@ -139,7 +139,7 @@ msg_bytes_ready: timeo = sock_rcvtimeo(sk, nonblock); data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); if (data) { - if (skb_queue_empty(&sk->sk_receive_queue)) + if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); @@ -315,10 +315,7 @@ more_data: */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); - if (msg->sg.size < delta) - delta -= msg->sg.size; - else - delta = 0; + delta -= msg->sg.size; } if (msg->cork_bytes && diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3737ec096650..3172e31987be 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -366,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, } else if (!load) { const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (try_module_get(ca->owner)) { + if (bpf_try_module_get(ca, ca->owner)) { if (reinit) { tcp_reinit_congestion_control(sk, ca); } else { icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); + bpf_module_put(old_ca, old_ca->owner); } } else { err = -EBUSY; } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { err = -EBUSY; } else { tcp_reinit_congestion_control(sk, ca); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1d1e3493965f..e8b840a4767e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,6 +79,7 @@ #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> +#include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -915,9 +916,10 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, /* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if (!tp->retransmit_skb_hint || - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || + (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; } @@ -1422,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; - if (!tcp_skb_can_collapse_to(prev)) + if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -1727,8 +1729,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } /* Ignore very old stuff early */ - if (!after(sp[used_sacks].end_seq, prior_snd_una)) + if (!after(sp[used_sacks].end_seq, prior_snd_una)) { + if (i == 0) + first_sack_index = -1; continue; + } used_sacks++; } @@ -3160,6 +3165,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -3920,6 +3926,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_option(skb, ptr, opsize, opt_rx); + break; + case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -4261,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { sk_rethink_txhash(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4420,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; + if (!mptcp_skb_can_collapse(to, from)) + return false; + #ifdef CONFIG_TLS_DEVICE if (from->decrypted != to->decrypted) return false; @@ -4758,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; @@ -4929,7 +4947,7 @@ restart: /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or - * overlaps to the next one. + * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || @@ -4938,7 +4956,7 @@ restart: break; } - if (n && n != tail && + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -4971,6 +4989,7 @@ restart: else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); + mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4990,6 +5009,7 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || + !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; #ifdef CONFIG_TLS_DEVICE @@ -5964,6 +5984,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ @@ -6329,8 +6352,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); break; + } /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: @@ -6586,6 +6612,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; +#if IS_ENABLED(CONFIG_MPTCP) + tcp_rsk(req)->is_mptcp = 0; +#endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4adac9c75343..df1166b76126 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1426,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, @@ -2678,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net) int cpu; if (net->ipv4.tcp_congestion_control) - module_put(net->ipv4.tcp_congestion_control->owner); + bpf_module_put(net->ipv4.tcp_congestion_control, + net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); @@ -2785,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net) /* Reno is always built in */ if (!net_eq(net, &init_net) && - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, + init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c802bc80c400..ad3b56d9fa71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -414,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; @@ -425,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || - !try_module_get(icsk->icsk_ca_ops->owner))) + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..306e25d743e8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> +#include <net/mptcp.h> #include <linux/compiler.h> #include <linux/gfp.h> @@ -414,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) +#define OPTION_MPTCP (1 << 10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -439,8 +441,17 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ + struct mptcp_out_options mptcp; }; +static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +{ +#if IS_ENABLED(CONFIG_MPTCP) + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_write_options(ptr, &opts->mptcp); +#endif +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -549,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } smc_options_write(ptr, &options); + + mptcp_options_write(ptr, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -584,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp, #endif } +static void mptcp_set_option_cond(const struct request_sock *req, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (rsk_is_mptcp(req)) { + unsigned int size; + + if (mptcp_synack_options(req, &size, &opts->mptcp)) { + if (*remaining >= size) { + opts->options |= OPTION_MPTCP; + *remaining -= size; + } + } + } +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -653,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -714,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } } + mptcp_set_option_cond(req, opts, &remaining); + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -751,16 +791,37 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. + */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size = 0; + + if (mptcp_established_options(sk, skb, &opt_size, remaining, + &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) + return size; + opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); - if (likely(opts->num_sack_blocks)) - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; @@ -2865,7 +2926,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; - if (!tcp_skb_can_collapse_to(to)) + if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; @@ -3232,6 +3293,7 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); @@ -3368,8 +3430,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { - module_put(icsk->icsk_ca_ops->owner); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1097b438befe..c3f26dcd6704 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -223,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; @@ -234,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = net->ipv4.sysctl_tcp_retries2; diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 12ab5db2b71c..38d3ad141161 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -99,17 +99,19 @@ void tcp_get_available_ulp(char *buf, size_t maxlen) rcu_read_unlock(); } -void tcp_update_ulp(struct sock *sk, struct proto *proto) +void tcp_update_ulp(struct sock *sk, struct proto *proto, + void (*write_space)(struct sock *sk)) { struct inet_connection_sock *icsk = inet_csk(sk); if (!icsk->icsk_ulp_ops) { + sk->sk_write_space = write_space; sk->sk_prot = proto; return; } if (icsk->icsk_ulp_ops->update) - icsk->icsk_ulp_ops->update(sk, proto); + icsk->icsk_ulp_ops->update(sk, proto, write_space); } void tcp_cleanup_ulp(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 93a355b6b092..db76b9609299 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1368,7 +1368,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2)) + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; @@ -1708,7 +1709,8 @@ busy_check: /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && - !__skb_wait_for_more_packets(sk, &error, &timeo, + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; @@ -2104,8 +2106,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b25e42100ceb..1a98583a79f4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -184,6 +184,20 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int mss = skb_shinfo(skb)->gso_size; + + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); + + return skb; +} + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features) { @@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) + return __udp_gso_segment_list(gso_skb, features); + mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); @@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct udphdr *uh2; struct sk_buff *p; unsigned int ulen; + int ret = 0; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); - skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) @@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, continue; } + if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) { + NAPI_GRO_CB(skb)->flush = 1; + return p; + } + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || - ulen != ntohs(uh2->len) || + if (ulen > ntohs(uh2->len)) { + pp = p; + } else { + if (NAPI_GRO_CB(skb)->is_flist) { + if (!pskb_may_pull(skb, skb_gro_offset(skb))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + if ((skb->ip_summed != p->ip_summed) || + (skb->csum_level != p->csum_level)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + ret = skb_gro_receive_list(p, skb); + } else { + skb_gro_postpull_rcsum(skb, uh, + sizeof(struct udphdr)); + + ret = skb_gro_receive(p, skb); + } + } + + if (ret || ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; @@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } -INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) + struct udphdr *uh, struct sock *sk) { struct sk_buff *pp = NULL; struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; - struct sock *sk; - rcu_read_lock(); - sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, - udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (!sk) - goto out_unlock; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; - if (udp_sk(sk)->gro_enabled) { + if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); - rcu_read_unlock(); return pp; } - if (NAPI_GRO_CB(skb)->encap_mark || + if (!sk || NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid) || !udp_sk(sk)->gro_receive) - goto out_unlock; + goto out; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; @@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -out_unlock: - rcu_read_unlock(); +out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -484,7 +521,11 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, rcu_read_lock(); sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_enabled) { - err = udp_gro_complete_segment(skb); - } else if (sk && udp_sk(sk)->gro_complete) { + if (sk && udp_sk(sk)->gro_complete) { skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } else { + err = udp_gro_complete_segment(skb); } rcu_read_unlock(); @@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8a4285712808..ea595c8549c7 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, if (!head) goto out; + if (!skb_dst(skb)) { + const struct iphdr *iph = ip_hdr(skb); + + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + goto drop; + } + for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; @@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: kfree_skb(skb); return 0; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e31626ffccd1..fd535053245b 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -79,6 +79,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (!x) goto out_reset; + skb->mark = xfrm_smark_get(skb->mark, x); + sp->xvec[sp->len++] = x; sp->olen++; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b1e9a10e1133..58fbde244381 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2571,14 +2571,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; + ++(*pos); if (!v) goto iter_table; n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); - if (n) { - ++*pos; + if (n) return n; - } iter_table: ipv6_route_check_sernum(iter); @@ -2586,8 +2585,6 @@ iter_table: r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { - if (v) - ++*pos; return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ee968d980746..55bfc5149d0c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1466,7 +1466,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) dev->mtu -= 8; if (tunnel->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; netif_keep_dst(dev); } ip6gre_tnl_init_features(dev); @@ -1894,7 +1893,6 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2197,7 +2195,6 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2f376dbc37d5..b5dd20c4599b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1877,10 +1877,8 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); - if (t->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; + if (t->parms.collect_md) netif_keep_dst(dev); - } return 0; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 6f08b760c2a7..524006aa0d78 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -449,8 +449,17 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; + if (!dst) { + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + } dst_hold(dst); dst = xfrm_lookup(t->net, dst, fl, NULL, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0253b702afb7..4fbdc60b4e07 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5576,6 +5576,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, expires -= jiffies; } + if (!dst) { + if (rt->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (rt->trap) + rtm->rtm_flags |= RTM_F_TRAP; + } + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 85a5447a3e8d..7cbc19731997 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -23,6 +23,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/dst_cache.h> +#include <net/ip_tunnels.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif @@ -135,7 +136,8 @@ static bool decap_and_validate(struct sk_buff *skb, int proto) skb_reset_network_header(skb); skb_reset_transport_header(skb); - skb->encapsulation = 0; + if (iptunnel_pull_offloads(skb)) + return false; return true; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 95e4e1e95db2..33a578a3eb3a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -75,7 +75,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; +const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; @@ -238,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; icsk->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -248,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; icsk->icsk_af_ops = &ipv6_specific; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_specific; @@ -819,7 +823,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG @@ -1203,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(newsk)) + mptcp_handle_ipv6_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG newtp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -1794,7 +1800,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, @@ -2163,9 +2169,16 @@ int __init tcpv6_init(void) ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; + + ret = mptcpv6_init(); + if (ret) + goto out_tcpv6_pernet_subsys; + out: return ret; +out_tcpv6_pernet_subsys: + unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9fec580c968e..5dc439a391fe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -690,8 +690,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, false); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udpv6_queue_rcv_one_skb(sk, skb); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index f0d5fc27d0b5..584157a07759 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -115,8 +115,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -132,7 +134,11 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; - return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -144,6 +150,23 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4fb7f1f12109..000c742d0527 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2954,6 +2954,28 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, return err; } +static void ieee80211_end_cac(struct wiphy *wiphy, + struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + + mutex_lock(&local->mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + /* it might be waiting for the local->mtx, but then + * by the time it gets it, sdata->wdev.cac_started + * will no longer be true + */ + cancel_delayed_work(&sdata->dfs_cac_timer_work); + + if (sdata->wdev.cac_started) { + ieee80211_vif_release_channel(sdata); + sdata->wdev.cac_started = false; + } + } + mutex_unlock(&local->mtx); +} + static struct cfg80211_beacon_data * cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) { @@ -4023,6 +4045,7 @@ const struct cfg80211_ops mac80211_config_ops = { #endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, + .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 68af62306385..d69983370381 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -328,6 +328,9 @@ u32 airtime_link_metric_get(struct ieee80211_local *local, unsigned long fail_avg = ewma_mesh_fail_avg_read(&sta->mesh->fail_avg); + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) + return MAX_METRIC; + /* Try to get rate based on HW/SW RC algorithm. * Rate is returned in units of Kbps, correct this * to comply with airtime calculation units diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 727dc9f3f3b3..e7f57bb18f6e 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -263,9 +263,21 @@ int ieee80211_tkip_decrypt_data(struct arc4_ctx *ctx, if ((keyid >> 6) != key->conf.keyidx) return TKIP_DECRYPT_INVALID_KEYIDX; - if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT && - (iv32 < rx_ctx->iv32 || - (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16))) + /* Reject replays if the received TSC is smaller than or equal to the + * last received value in a valid message, but with an exception for + * the case where a new key has been set and no valid frame using that + * key has yet received and the local RSC was initialized to 0. This + * exception allows the very first frame sent by the transmitter to be + * accepted even if that transmitter were to use TSC 0 (IEEE 802.11 + * described TSC to be initialized to 1 whenever a new key is taken into + * use). + */ + if (iv32 < rx_ctx->iv32 || + (iv32 == rx_ctx->iv32 && + (iv16 < rx_ctx->iv16 || + (iv16 == rx_ctx->iv16 && + (rx_ctx->iv32 || rx_ctx->iv16 || + rx_ctx->ctx.state != TKIP_STATE_NOT_INIT))))) return TKIP_DECRYPT_REPLAY; if (only_iv) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a8a7306a1f56..4bd1faf4f779 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3949,18 +3949,15 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, } } - next = skb; - while (next) { - skb = next; - next = skb->next; - - skb->prev = NULL; - skb->next = NULL; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags); - if (IS_ERR(skb)) + if (IS_ERR(skb)) { + kfree_skb_list(next); goto out; + } ieee80211_tx_stats(dev, skb->len); diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 000000000000..5db56d2218c5 --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,26 @@ + +config MPTCP + bool "MPTCP: Multipath TCP" + depends on INET + select SKB_EXTENSIONS + select CRYPTO_LIB_SHA256 + help + Multipath TCP (MPTCP) connections send and receive data over multiple + subflows in order to utilize multiple network paths. Each subflow + uses the TCP protocol, and TCP options carry header information for + MPTCP. + +config MPTCP_IPV6 + bool "MPTCP: IPv6 support for Multipath TCP" + depends on MPTCP + select IPV6 + default y + +config MPTCP_HMAC_TEST + bool "Tests for MPTCP HMAC implementation" + default n + help + This option enable boot time self-test for the HMAC implementation + used by the MPTCP code + + Say N if you are unsure. diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile new file mode 100644 index 000000000000..4e98d9edfd0a --- /dev/null +++ b/net/mptcp/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MPTCP) += mptcp.o + +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c new file mode 100644 index 000000000000..40d1bb18fd60 --- /dev/null +++ b/net/mptcp/crypto.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP cryptographic functions + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and + * mptcp_ipv6 from multipath-tcp.org, authored by: + * + * Sébastien Barré <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien Duchêne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#include <linux/kernel.h> +#include <crypto/sha.h> +#include <asm/unaligned.h> + +#include "protocol.h" + +#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) +{ + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be64 input = cpu_to_be64(key); + struct sha256_state state; + + sha256_init(&state); + sha256_update(&state, (__force u8 *)&input, sizeof(input)); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + if (token) + *token = be32_to_cpu(mptcp_hashed_key[0]); + if (idsn) + *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be32 *hash_out = (__force __be32 *)hmac; + struct sha256_state state; + u8 key1be[8]; + u8 key2be[8]; + int i; + + put_unaligned_be64(key1, key1be); + put_unaligned_be64(key2, key2be); + + /* Generate key xored with ipad */ + memset(input, 0x36, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); + put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + + /* emit sha256(K1 || msg) on the second input block, so we can + * reuse 'input' for the last hashing + */ + sha256_final(&state, &input[SHA256_BLOCK_SIZE]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + /* takes only first 160 bits */ + for (i = 0; i < 5; i++) + hash_out[i] = mptcp_hashed_key[i]; +} + +#ifdef CONFIG_MPTCP_HMAC_TEST +struct test_cast { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size, and we truncate the output. + */ +static struct test_cast tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da67", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", + }, +}; + +static int __init test_mptcp_crypto(void) +{ + char hmac[20], hmac_hex[41]; + u32 nonce1, nonce2; + u64 key1, key2; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + for (j = 0; j < 20; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[40] = 0; + + if (memcmp(hmac_hex, tests[i].result, 40)) + pr_err("test %d failed, got %s expected %s", i, + hmac_hex, tests[i].result); + else + pr_info("test %d [ ok ]", i); + } + return 0; +} + +late_initcall(test_mptcp_crypto); +#endif diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c new file mode 100644 index 000000000000..8e39585d37f3 --- /dev/null +++ b/net/mptcp/ctrl.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Tessares SA. + */ + +#include <linux/sysctl.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include "protocol.h" + +#define MPTCP_SYSCTL_PATH "net/mptcp" + +static int mptcp_pernet_id; +struct mptcp_pernet { + struct ctl_table_header *ctl_table_hdr; + + int mptcp_enabled; +}; + +static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +{ + return net_generic(net, mptcp_pernet_id); +} + +int mptcp_is_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->mptcp_enabled; +} + +static struct ctl_table mptcp_sysctl_table[] = { + { + .procname = "enabled", + .maxlen = sizeof(int), + .mode = 0644, + /* users with CAP_NET_ADMIN or root (not and) can change this + * value, same as other sysctl or the 'net' tree. + */ + .proc_handler = proc_dointvec, + }, + {} +}; + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; +} + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = mptcp_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); + if (!table) + goto err_alloc; + } + + table[0].data = &pernet->mptcp_enabled; + + hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); + if (!hdr) + goto err_reg; + + pernet->ctl_table_hdr = hdr; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) +{ + struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + + unregister_net_sysctl_table(pernet->ctl_table_hdr); + + kfree(table); +} + +static int __net_init mptcp_net_init(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_set_defaults(pernet); + + return mptcp_pernet_new_table(net, pernet); +} + +/* Note: the callback will only be called per extra netns */ +static void __net_exit mptcp_net_exit(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_del_table(pernet); +} + +static struct pernet_operations mptcp_pernet_ops = { + .init = mptcp_net_init, + .exit = mptcp_net_exit, + .id = &mptcp_pernet_id, + .size = sizeof(struct mptcp_pernet), +}; + +void __init mptcp_init(void) +{ + mptcp_proto_init(); + + if (register_pernet_subsys(&mptcp_pernet_ops) < 0) + panic("Failed to register MPTCP pernet subsystem.\n"); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int __init mptcpv6_init(void) +{ + int err; + + err = mptcp_proto_v6_init(); + + return err; +} +#endif diff --git a/net/mptcp/options.c b/net/mptcp/options.c new file mode 100644 index 000000000000..45acd877bef3 --- /dev/null +++ b/net/mptcp/options.c @@ -0,0 +1,586 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#include <linux/kernel.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static bool mptcp_cap_flag_sha256(u8 flags) +{ + return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; +} + +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt = &opt_rx->mptcp; + u8 subtype = *ptr >> 4; + int expected_opsize; + u8 version; + u8 flags; + + switch (subtype) { + case MPTCPOPT_MP_CAPABLE: + /* strict size checking */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (skb->len > tcp_hdr(skb)->doff << 2) + expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + } else { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; + else + expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + } + if (opsize != expected_opsize) + break; + + /* try to be gentle vs future versions on the initial syn */ + version = *ptr++ & MPTCP_VERSION_MASK; + if (opsize != TCPOLEN_MPTCP_MPC_SYN) { + if (version != MPTCP_SUPPORTED_VERSION) + break; + } else if (version < MPTCP_SUPPORTED_VERSION) { + break; + } + + flags = *ptr++; + if (!mptcp_cap_flag_sha256(flags) || + (flags & MPTCP_CAP_EXTENSIBILITY)) + break; + + /* RFC 6824, Section 3.1: + * "For the Checksum Required bit (labeled "A"), if either + * host requires the use of checksums, checksums MUST be used. + * In other words, the only way for checksums not to be used + * is if both hosts in their SYNs set A=0." + * + * Section 3.3.0: + * "If a checksum is not present when its use has been + * negotiated, the receiver MUST close the subflow with a RST as + * it is considered broken." + * + * We don't implement DSS checksum - fall back to TCP. + */ + if (flags & MPTCP_CAP_CHECKSUM_REQD) + break; + + mp_opt->mp_capable = 1; + if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + /* Section 3.1.: + * "the data parameters in a MP_CAPABLE are semantically + * equivalent to those in a DSS option and can be used + * interchangeably." + */ + mp_opt->dss = 1; + mp_opt->use_map = 1; + mp_opt->mpc_map = 1; + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + version, flags, opsize, mp_opt->sndr_key, + mp_opt->rcvr_key, mp_opt->data_len); + break; + + case MPTCPOPT_DSS: + pr_debug("DSS"); + ptr++; + + /* we must clear 'mpc_map' be able to detect MP_CAPABLE + * map vs DSS map in mptcp_incoming_options(), and reconstruct + * map info accordingly + */ + mp_opt->mpc_map = 0; + flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; + mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; + mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; + mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; + mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; + mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); + + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + mp_opt->data_fin, mp_opt->dsn64, + mp_opt->use_map, mp_opt->ack64, + mp_opt->use_ack); + + expected_opsize = TCPOLEN_MPTCP_DSS_BASE; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) + expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) + expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; + } + + /* RFC 6824, Section 3.3: + * If a checksum is present, but its use had + * not been negotiated in the MP_CAPABLE handshake, + * the checksum field MUST be ignored. + */ + if (opsize != expected_opsize && + opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) + break; + + mp_opt->dss = 1; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) { + mp_opt->data_ack = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_ack = get_unaligned_be32(ptr); + ptr += 4; + } + + pr_debug("data_ack=%llu", mp_opt->data_ack); + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) { + mp_opt->data_seq = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_seq = get_unaligned_be32(ptr); + ptr += 4; + } + + mp_opt->subflow_seq = get_unaligned_be32(ptr); + ptr += 4; + + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + mp_opt->data_seq, mp_opt->subflow_seq, + mp_opt->data_len); + } + + break; + + default: + break; + } +} + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (const unsigned char *)(th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP) + mptcp_parse_option(skb, ptr, opsize, opt_rx); + ptr += opsize - 2; + length -= opsize; + } + } +} + +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + /* we will use snd_isn to detect first pkt [re]transmission + * in mptcp_established_options_mp() + */ + subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct tcp_sock *tp = tcp_sk(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->can_ack = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } else { + tcp_sk(sk)->is_mptcp = 0; + } +} + +static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_ext *mpext; + unsigned int data_len; + + pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, + subflow->fourth_ack, subflow->snd_isn, + skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + + if (subflow->mp_capable && !subflow->fourth_ack && skb && + subflow->snd_isn == TCP_SKB_CB(skb)->seq) { + /* When skb is not available, we better over-estimate the + * emitted options len. A full DSS option is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit + * that. + */ + mpext = mptcp_get_ext(skb); + data_len = mpext ? mpext->data_len : 0; + + /* we will check ext_copy.data_len in mptcp_write_options() to + * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and + * TCPOLEN_MPTCP_MPC_ACK + */ + opts->ext_copy.data_len = data_len; + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + + /* Section 3.1. + * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK + * packets that start the first subflow of an MPTCP connection, + * as well as the first packet that carries data + */ + if (data_len > 0) + *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); + else + *size = TCPOLEN_MPTCP_MPC_ACK; + + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", + subflow, subflow->local_key, subflow->remote_key, + data_len); + + return true; + } + return false; +} + +static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, + struct mptcp_ext *ext) +{ + ext->data_fin = 1; + + if (!ext->use_map) { + /* RFC6824 requires a DSS mapping with specific values + * if DATA_FIN is set but no data payload is mapped + */ + ext->use_map = 1; + ext->dsn64 = 1; + ext->data_seq = mptcp_sk(subflow->conn)->write_seq; + ext->subflow_seq = 0; + ext->data_len = 1; + } else { + /* If there's an existing DSS mapping, DATA_FIN consumes + * 1 additional byte of mapping space. + */ + ext->data_len++; + } +} + +static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + unsigned int dss_size = 0; + struct mptcp_ext *mpext; + struct mptcp_sock *msk; + unsigned int ack_size; + bool ret = false; + u8 tcp_fin; + + if (skb) { + mpext = mptcp_get_ext(skb); + tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + } else { + mpext = NULL; + tcp_fin = 0; + } + + if (!skb || (mpext && mpext->use_map) || tcp_fin) { + unsigned int map_size; + + map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + + remaining -= map_size; + dss_size = map_size; + if (mpext) + opts->ext_copy = *mpext; + + if (skb && tcp_fin && + subflow->conn->sk_state != TCP_ESTABLISHED) + mptcp_write_data_fin(subflow, &opts->ext_copy); + ret = true; + } + + opts->ext_copy.use_ack = 0; + msk = mptcp_sk(subflow->conn); + if (!msk || !READ_ONCE(msk->can_ack)) { + *size = ALIGN(dss_size, 4); + return ret; + } + + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + + /* Add kind/length/subtype/flag overhead if mapping is not populated */ + if (dss_size == 0) + ack_size += TCPOLEN_MPTCP_DSS_BASE; + + dss_size += ack_size; + + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + opts->ext_copy.use_ack = 1; + + *size = ALIGN(dss_size, 4); + return true; +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + unsigned int opt_size = 0; + bool ret = false; + + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) + ret = true; + else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, + opts)) + ret = true; + + /* we reserved enough space for the above options, and exceeding the + * TCP option space would be fatal + */ + if (WARN_ON_ONCE(opt_size > remaining)) + return false; + + *size += opt_size; + remaining -= opt_size; + + return ret; +} + +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + if (subflow_req->mp_capable) { + opts->suboptions = OPTION_MPTCP_MPC_SYNACK; + opts->sndr_key = subflow_req->local_key; + *size = TCPOLEN_MPTCP_MPC_SYNACK; + pr_debug("subflow_req=%p, local_key=%llu", + subflow_req, subflow_req->local_key); + return true; + } + return false; +} + +static bool check_fourth_ack(struct mptcp_subflow_context *subflow, + struct sk_buff *skb, + struct mptcp_options_received *mp_opt) +{ + /* here we can process OoO, in-window pkts, only in-sequence 4th ack + * are relevant + */ + if (likely(subflow->fourth_ack || + TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) + return true; + + if (mp_opt->use_ack) + subflow->fourth_ack = 1; + + if (subflow->can_ack) + return true; + + /* If the first established packet does not contain MP_CAPABLE + data + * then fallback to TCP + */ + if (!mp_opt->mp_capable) { + subflow->mp_capable = 0; + tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + return false; + } + subflow->remote_key = mp_opt->sndr_key; + subflow->can_ack = 1; + return true; +} + +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_options_received *mp_opt; + struct mptcp_ext *mpext; + + mp_opt = &opt_rx->mptcp; + if (!check_fourth_ack(subflow, skb, mp_opt)) + return; + + if (!mp_opt->dss) + return; + + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (!mpext) + return; + + memset(mpext, 0, sizeof(*mpext)); + + if (mp_opt->use_map) { + if (mp_opt->mpc_map) { + /* this is an MP_CAPABLE carrying MPTCP data + * we know this map the first chunk of data + */ + mptcp_crypto_key_sha(subflow->remote_key, NULL, + &mpext->data_seq); + mpext->data_seq++; + mpext->subflow_seq = 1; + mpext->dsn64 = 1; + mpext->mpc_map = 1; + } else { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->dsn64 = mp_opt->dsn64; + } + mpext->data_len = mp_opt->data_len; + mpext->use_map = 1; + } + + if (mp_opt->use_ack) { + mpext->data_ack = mp_opt->data_ack; + mpext->use_ack = 1; + mpext->ack64 = mp_opt->ack64; + } + + mpext->data_fin = mp_opt->data_fin; +} + +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +{ + if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | + OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + u8 len; + + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYN; + else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYNACK; + else if (opts->ext_copy.data_len) + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + len = TCPOLEN_MPTCP_MPC_ACK; + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | + (MPTCPOPT_MP_CAPABLE << 12) | + (MPTCP_SUPPORTED_VERSION << 8) | + MPTCP_CAP_HMAC_SHA256); + + if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & + opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->sndr_key, ptr); + ptr += 2; + if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + if (!opts->ext_copy.data_len) + goto mp_capable_done; + + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + ptr += 1; + } + +mp_capable_done: + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + } + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | + (len << 16) | + (MPTCPOPT_DSS << 12) | + (flags)); + + if (mpext->use_ack) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c new file mode 100644 index 000000000000..39fdca79ce90 --- /dev/null +++ b/net/mptcp/protocol.c @@ -0,0 +1,1276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/sched/signal.h> +#include <linux/atomic.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/transp_v6.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +#define MPTCP_SAME_STATE TCP_MAX_STATES + +static void __mptcp_close(struct sock *sk, long timeout); + +static const struct proto_ops *tcp_proto_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return &inet6_stream_ops; +#endif + return &inet_stream_ops; +} + +/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing + * socket->sk and stream ops and destroying msk + * return the msk socket, as we can't access msk anymore after this function + * completes + * Called with msk lock held, releases such lock before returning + */ +static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct socket *sock; + struct sock *sk; + + sk = (struct sock *)msk; + sock = sk->sk_socket; + subflow = mptcp_subflow_ctx(ssk); + + /* detach the msk socket */ + list_del_init(&subflow->node); + sock_orphan(sk); + sock->sk = NULL; + + /* socket is now TCP */ + lock_sock(ssk); + sock_graft(ssk, sock); + if (subflow->conn) { + /* We can't release the ULP data on a live socket, + * restore the tcp callback + */ + mptcp_subflow_tcp_fallback(ssk, subflow); + sock_put(subflow->conn); + subflow->conn = NULL; + } + release_sock(ssk); + sock->ops = tcp_proto_ops(ssk); + + /* destroy the left-over msk sock */ + __mptcp_close(sk, 0); + return sock; +} + +/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not + * completed yet or has failed, return the subflow socket. + * Otherwise return NULL. + */ +static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +{ + if (!msk->subflow || READ_ONCE(msk->can_ack)) + return NULL; + + return msk->subflow; +} + +static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) +{ + return msk->first && !sk_is_mptcp(msk->first); +} + +/* if the mp_capable handshake has failed, it fallbacks msk to plain TCP, + * releases the socket lock and returns a reference to the now TCP socket. + * Otherwise returns NULL + */ +static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_needs_tcp_fallback(msk))) + return NULL; + + if (msk->subflow) { + /* the first subflow is an active connection, discart the + * paired socket + */ + msk->subflow->sk = NULL; + sock_release(msk->subflow); + msk->subflow = NULL; + } + + return __mptcp_fallback_to_tcp(msk, msk->first); +} + +static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) +{ + return !msk->first; +} + +static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int err; + + ssock = __mptcp_nmpc_socket(msk); + if (ssock) + goto set_state; + + if (!__mptcp_can_create_subflow(msk)) + return ERR_PTR(-EINVAL); + + err = mptcp_subflow_create_socket(sk, &ssock); + if (err) + return ERR_PTR(err); + + msk->first = ssock->sk; + msk->subflow = ssock; + subflow = mptcp_subflow_ctx(ssock->sk); + list_add(&subflow->node, &msk->conn_list); + subflow->request_mptcp = 1; + +set_state: + if (state != MPTCP_SAME_STATE) + inet_sk_state_store(sk, state); + return ssock; +} + +static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) +{ + if (!msk->cached_ext) + msk->cached_ext = __skb_ext_alloc(); + + return !!msk->cached_ext; +} + +static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + sock_owned_by_me(sk); + + mptcp_for_each_subflow(msk, subflow) { + if (subflow->data_avail) + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) +{ + if (!tcp_skb_can_collapse_to(skb)) + return false; + + /* can collapse only if MPTCP level sequence is in order */ + return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; +} + +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct msghdr *msg, long *timeo, int *pmss_now, + int *ps_goal) +{ + int mss_now, avail_size, size_goal, ret; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; + struct page_frag *pfrag; + size_t psize; + + /* use the mptcp page cache so that we can easily move the data + * from one substream to another, but do per subflow memory accounting + */ + pfrag = sk_page_frag(sk); + while (!sk_page_frag_refill(ssk, pfrag) || + !mptcp_ext_cache_refill(msk)) { + ret = sk_stream_wait_memory(ssk, timeo); + if (ret) + return ret; + if (unlikely(__mptcp_needs_tcp_fallback(msk))) + return 0; + } + + /* compute copy limit */ + mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); + *pmss_now = mss_now; + *ps_goal = size_goal; + avail_size = size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + + /* Limit the write to the size available in the + * current skb, if any, so that we create at most a new skb. + * Explicitly tells TCP internals to avoid collapsing on later + * queue management operation, to avoid breaking the ext <-> + * SSN association set here + */ + can_collapse = (size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(msk, skb, mpext); + if (!can_collapse) + TCP_SKB_CB(skb)->eor = 1; + else + avail_size = size_goal - skb->len; + } + psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, pfrag->offset, + min_t(size_t, msg_data_left(msg), psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + /* tell the TCP stack to delay the push so that we can safely + * access the skb after the sendpages call + */ + ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, + msg->msg_flags | MSG_SENDPAGE_NOTLAST); + if (ret <= 0) + return ret; + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + /* if the tail skb extension is still the cached one, collapsing + * really happened. Note: we can't check for 'same skb' as the sk_buff + * hdr on tail can be transmitted, freed and re-allocated by the + * do_tcp_sendpages() call + */ + tail = tcp_write_queue_tail(ssk); + if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + WARN_ON_ONCE(!can_collapse); + mpext->data_len += ret; + goto out; + } + + skb = tcp_write_queue_tail(ssk); + mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + msk->cached_ext = NULL; + + memset(mpext, 0, sizeof(*mpext)); + mpext->data_seq = msk->write_seq; + mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; + mpext->data_len = ret; + mpext->use_map = 1; + mpext->dsn64 = 1; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", + mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->dsn64); + +out: + pfrag->offset += ret; + msk->write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + + return ret; +} + +static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +{ + struct socket *sock; + + if (likely(sk_stream_is_writeable(ssk))) + return; + + sock = READ_ONCE(ssk->sk_socket); + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + /* set NOSPACE only after clearing SEND_SPACE flag */ + set_bit(SOCK_NOSPACE, &sock->flags); + } +} + +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + int mss_now = 0, size_goal = 0, ret = 0; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + size_t copied = 0; + struct sock *ssk; + long timeo; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback passthrough"); + ret = sock_sendmsg(ssock, msg); + return ret >= 0 ? ret + copied : (copied ? copied : ret); + } + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + pr_debug("conn_list->subflow=%p", ssk); + + lock_sock(ssk); + while (msg_data_left(msg)) { + ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + &size_goal); + if (ret < 0) + break; + if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { + release_sock(ssk); + ssock = __mptcp_tcp_fallback(msk); + goto fallback; + } + + copied += ret; + } + + if (copied) { + ret = copied; + tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + } + + ssk_check_wmem(msk, ssk); + release_sock(ssk); + release_sock(sk); + return ret; +} + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct mptcp_read_arg *arg = desc->arg.data; + size_t copy_len; + + copy_len = min(desc->count, len); + + if (likely(arg->msg)) { + int err; + + err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len); + if (err) { + pr_debug("error path"); + desc->error = err; + return err; + } + } else { + pr_debug("Flushing skb payload"); + } + + desc->count -= copy_len; + + pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count); + return copy_len; +} + +static void mptcp_wait_data(struct sock *sk, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct mptcp_sock *msk = mptcp_sk(sk); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + sk_wait_event(sk, timeo, + test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); +} + +static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + bool more_data_avail = false; + struct mptcp_read_arg arg; + read_descriptor_t desc; + bool wait_data = false; + struct socket *ssock; + struct tcp_sock *tp; + bool done = false; + struct sock *ssk; + int copied = 0; + int target; + long timeo; + + if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback-read subflow=%p", + mptcp_subflow_ctx(ssock->sk)); + copied = sock_recvmsg(ssock, msg, flags); + return copied; + } + + arg.msg = msg; + desc.arg.data = &arg; + desc.error = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + + len = min_t(size_t, len, INT_MAX); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + while (!done) { + u32 map_remaining; + int bytes_read; + + ssk = mptcp_subflow_recv_lookup(msk); + pr_debug("msk=%p ssk=%p", msk, ssk); + if (!ssk) + goto wait_for_data; + + subflow = mptcp_subflow_ctx(ssk); + tp = tcp_sk(ssk); + + lock_sock(ssk); + do { + /* try to read as much data as available */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + desc.count = min_t(size_t, len - copied, map_remaining); + pr_debug("reading %zu bytes, copied %d", desc.count, + copied); + bytes_read = tcp_read_sock(ssk, &desc, + mptcp_read_actor); + if (bytes_read < 0) { + if (!copied) + copied = bytes_read; + done = true; + goto next; + } + + pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq, + msk->ack_seq + bytes_read); + msk->ack_seq += bytes_read; + copied += bytes_read; + if (copied >= len) { + done = true; + goto next; + } + if (tp->urg_data && tp->urg_seq == tp->copied_seq) { + pr_err("Urgent data present, cannot proceed"); + done = true; + goto next; + } +next: + more_data_avail = mptcp_subflow_data_available(ssk); + } while (more_data_avail && !done); + release_sock(ssk); + continue; + +wait_for_data: + more_data_avail = false; + + /* only the master socket status is relevant here. The exit + * conditions mirror closely tcp_recvmsg() + */ + if (copied >= target) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + pr_debug("block timeout %ld", timeo); + wait_data = true; + mptcp_wait_data(sk, &timeo); + if (unlikely(__mptcp_tcp_fallback(msk))) + goto fallback; + } + + if (more_data_avail) { + if (!test_bit(MPTCP_DATA_READY, &msk->flags)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } else if (!wait_data) { + clear_bit(MPTCP_DATA_READY, &msk->flags); + + /* .. race-breaker: ssk might get new data after last + * data_available() returns false. + */ + ssk = mptcp_subflow_recv_lookup(msk); + if (unlikely(ssk)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } + + release_sock(sk); + return copied; +} + +/* subflow sockets can be either outgoing (connect) or incoming + * (accept). + * + * Outgoing subflows use in-kernel sockets. + * Incoming subflows do not have their own 'struct socket' allocated, + * so we need to use tcp_close() after detaching them from the mptcp + * parent socket. + */ +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) +{ + struct socket *sock = READ_ONCE(ssk->sk_socket); + + list_del(&subflow->node); + + if (sock && sock != sk->sk_socket) { + /* outgoing subflow */ + sock_release(sock); + } else { + /* incoming subflow */ + tcp_close(ssk, timeout); + } +} + +static int __mptcp_init_sock(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + INIT_LIST_HEAD(&msk->conn_list); + __set_bit(MPTCP_SEND_SPACE, &msk->flags); + + msk->first = NULL; + + return 0; +} + +static int mptcp_init_sock(struct sock *sk) +{ + if (!mptcp_is_enabled(sock_net(sk))) + return -ENOPROTOOPT; + + return __mptcp_init_sock(sk); +} + +static void mptcp_subflow_shutdown(struct sock *ssk, int how) +{ + lock_sock(ssk); + + switch (ssk->sk_state) { + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* fall through */ + case TCP_SYN_SENT: + tcp_disconnect(ssk, O_NONBLOCK); + break; + default: + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + break; + } + + /* Wake up anyone sleeping in poll. */ + ssk->sk_state_change(ssk); + release_sock(ssk); +} + +/* Called with msk lock held, releases such lock before returning */ +static void __mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_token_destroy(msk->token); + inet_sk_state_store(sk, TCP_CLOSE); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __mptcp_close_ssk(sk, ssk, subflow, timeout); + } + + if (msk->cached_ext) + __skb_ext_put(msk->cached_ext); + release_sock(sk); + sk_common_release(sk); +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __mptcp_close(sk, timeout); +} + +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); + struct ipv6_pinfo *msk6 = inet6_sk(msk); + + msk->sk_v6_daddr = ssk->sk_v6_daddr; + msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; + + if (msk6 && ssk6) { + msk6->saddr = ssk6->saddr; + msk6->flow_label = ssk6->flow_label; + } +#endif + + inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; + inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; + inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; + inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; + inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; + inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; +} + +static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, + bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *listener; + struct sock *newsk; + + listener = __mptcp_nmpc_socket(msk); + if (WARN_ON_ONCE(!listener)) { + *err = -EINVAL; + return NULL; + } + + pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); + newsk = inet_csk_accept(listener->sk, flags, err, kern); + if (!newsk) + return NULL; + + pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + + if (sk_is_mptcp(newsk)) { + struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + struct sock *ssk = newsk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(newsk); + lock_sock(sk); + + local_bh_disable(); + new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC); + if (!new_mptcp_sock) { + *err = -ENOBUFS; + local_bh_enable(); + release_sock(sk); + mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1); + tcp_close(newsk, 0); + return NULL; + } + + __mptcp_init_sock(new_mptcp_sock); + + msk = mptcp_sk(new_mptcp_sock); + msk->local_key = subflow->local_key; + msk->token = subflow->token; + msk->subflow = NULL; + msk->first = newsk; + + mptcp_token_update_accept(newsk, new_mptcp_sock); + + msk->write_seq = subflow->idsn + 1; + if (subflow->can_ack) { + msk->can_ack = true; + msk->remote_key = subflow->remote_key; + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); + ack_seq++; + msk->ack_seq = ack_seq; + } + newsk = new_mptcp_sock; + mptcp_copy_inaddrs(newsk, ssk); + list_add(&subflow->node, &msk->conn_list); + + /* will be fully established at mptcp_stream_accept() + * completion. + */ + inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); + bh_unlock_sock(new_mptcp_sock); + local_bh_enable(); + release_sock(sk); + + /* the subflow can already receive packet, avoid racing with + * the receive path and process the pending ones + */ + lock_sock(ssk); + subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; + subflow->conn = new_mptcp_sock; + if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue))) + mptcp_subflow_data_available(ssk); + release_sock(ssk); + } + + return newsk; +} + +static void mptcp_destroy(struct sock *sk) +{ +} + +static int mptcp_setsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + /* will be treated as __user in tcp_setsockopt */ + optval = (char __kernel __force *)uoptval; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_setsockopt(ssock, level, optname, optval, optlen); + } + release_sock(sk); + + return ret; +} + +static int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, int __user *uoption) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + int __kernel *option; + struct socket *ssock; + + /* will be treated as __user in tcp_getsockopt */ + optval = (char __kernel __force *)uoptval; + option = (int __kernel __force *)uoption; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of getsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_getsockopt(ssock, level, optname, optval, option); + } + release_sock(sk); + + return ret; +} + +static int mptcp_get_port(struct sock *sk, unsigned short snum) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + + ssock = __mptcp_nmpc_socket(msk); + pr_debug("msk=%p, subflow=%p", msk, ssock); + if (WARN_ON_ONCE(!ssock)) + return -EINVAL; + + return inet_csk_get_port(ssock->sk, snum); +} + +void mptcp_finish_connect(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + + if (!subflow->mp_capable) + return; + + sk = subflow->conn; + msk = mptcp_sk(sk); + + pr_debug("msk=%p, token=%u", sk, subflow->token); + + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; + subflow->rel_write_seq = 1; + + /* the socket is not connected yet, no msk/subflow ops can access/race + * accessing the field below + */ + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->token, subflow->token); + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->can_ack, 1); +} + +static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +{ + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_pointer(sk->sk_wq, &parent->wq); + sk_set_socket(sk, parent); + sk->sk_uid = SOCK_INODE(parent)->i_uid; + write_unlock_bh(&sk->sk_callback_lock); +} + +static bool mptcp_memory_free(const struct sock *sk, int wake) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; +} + +static struct proto mptcp_prot = { + .name = "MPTCP", + .owner = THIS_MODULE, + .init = mptcp_init_sock, + .close = mptcp_close, + .accept = mptcp_accept, + .setsockopt = mptcp_setsockopt, + .getsockopt = mptcp_getsockopt, + .shutdown = tcp_shutdown, + .destroy = mptcp_destroy, + .sendmsg = mptcp_sendmsg, + .recvmsg = mptcp_recvmsg, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = mptcp_get_port, + .stream_memory_free = mptcp_memory_free, + .obj_size = sizeof(struct mptcp_sock), + .no_autobind = true, +}; + +static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->bind(ssock, uaddr, addr_len); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; +#endif + + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcp_prot) { + /* we are being invoked from __sys_accept4, after + * mptcp_accept() has just accepted a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + sock->ops = &inet_stream_ops; + } + + return inet_getname(sock, uaddr, peer); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcpv6_prot) { + /* we are being invoked from __sys_accept4 after + * mptcp_accept() has accepted a non-mp-capable + * subflow: sk is a tcp_sk, not mptcp. + * + * Hand the socket over to tcp so all further + * socket ops bypass mptcp. + */ + sock->ops = &inet6_stream_ops; + } + + return inet6_getname(sock, uaddr, peer); +} +#endif + +static int mptcp_listen(struct socket *sock, int backlog) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_LISTEN); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->listen(ssock, backlog); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static bool is_tcp_proto(const struct proto *p) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + return p == &tcp_prot || p == &tcpv6_prot; +#else + return p == &tcp_prot; +#endif +} + +static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, + int flags, bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + if (sock->sk->sk_state != TCP_LISTEN) + goto unlock_fail; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto unlock_fail; + + sock_hold(ssock->sk); + release_sock(sock->sk); + + err = ssock->ops->accept(sock, newsock, flags, kern); + if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + struct mptcp_sock *msk = mptcp_sk(newsock->sk); + struct mptcp_subflow_context *subflow; + + /* set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. + */ + list_for_each_entry(subflow, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, newsock); + } + + inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); + } + + sock_put(ssock->sk); + return err; + +unlock_fail: + release_sock(sock->sk); + return -EINVAL; +} + +static __poll_t mptcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + struct sock *sk = sock->sk; + struct mptcp_sock *msk; + struct socket *ssock; + __poll_t mask = 0; + + msk = mptcp_sk(sk); + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + mask = ssock->ops->poll(file, ssock, wait); + release_sock(sk); + return mask; + } + + release_sock(sk); + sock_poll_wait(file, sock, wait); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock->ops->poll(file, ssock, NULL); + + if (test_bit(MPTCP_DATA_READY, &msk->flags)) + mask = EPOLLIN | EPOLLRDNORM; + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + + release_sock(sk); + + return mask; +} + +static int mptcp_shutdown(struct socket *sock, int how) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; + int ret = 0; + + pr_debug("sk=%p, how=%d", msk, how); + + lock_sock(sock->sk); + + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); + + how++; + + if ((how & ~SHUTDOWN_MASK) || !how) { + ret = -EINVAL; + goto out_unlock; + } + + if (sock->state == SS_CONNECTING) { + if ((1 << sock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(tcp_sk, how); + } + +out_unlock: + release_sock(sock->sk); + + return ret; +} + +static const struct proto_ops mptcp_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v4_getname, + .poll = mptcp_poll, + .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw mptcp_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_prot, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +void mptcp_proto_init(void) +{ + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + + mptcp_subflow_init(); + + if (proto_register(&mptcp_prot, 1) != 0) + panic("Failed to register MPTCP proto.\n"); + + inet_register_protosw(&mptcp_protosw); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static const struct proto_ops mptcp_v6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v6_getname, + .poll = mptcp_poll, + .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet6_sendmsg, + .recvmsg = inet6_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct proto mptcp_v6_prot; + +static void mptcp_v6_destroy(struct sock *sk) +{ + mptcp_destroy(sk); + inet6_destroy_sock(sk); +} + +static struct inet_protosw mptcp_v6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_v6_prot, + .ops = &mptcp_v6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +int mptcp_proto_v6_init(void) +{ + int err; + + mptcp_v6_prot = mptcp_prot; + strcpy(mptcp_v6_prot.name, "MPTCPv6"); + mptcp_v6_prot.slab = NULL; + mptcp_v6_prot.destroy = mptcp_v6_destroy; + mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + + sizeof(struct ipv6_pinfo); + + err = proto_register(&mptcp_v6_prot, 1); + if (err) + return err; + + err = inet6_register_protosw(&mptcp_v6_protosw); + if (err) + proto_unregister(&mptcp_v6_prot); + + return err; +} +#endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h new file mode 100644 index 000000000000..8a99a2930284 --- /dev/null +++ b/net/mptcp/protocol.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __MPTCP_PROTOCOL_H +#define __MPTCP_PROTOCOL_H + +#include <linux/random.h> +#include <net/tcp.h> +#include <net/inet_connection_sock.h> + +#define MPTCP_SUPPORTED_VERSION 1 + +/* MPTCP option bits */ +#define OPTION_MPTCP_MPC_SYN BIT(0) +#define OPTION_MPTCP_MPC_SYNACK BIT(1) +#define OPTION_MPTCP_MPC_ACK BIT(2) + +/* MPTCP option subtypes */ +#define MPTCPOPT_MP_CAPABLE 0 +#define MPTCPOPT_MP_JOIN 1 +#define MPTCPOPT_DSS 2 +#define MPTCPOPT_ADD_ADDR 3 +#define MPTCPOPT_RM_ADDR 4 +#define MPTCPOPT_MP_PRIO 5 +#define MPTCPOPT_MP_FAIL 6 +#define MPTCPOPT_MP_FASTCLOSE 7 + +/* MPTCP suboption lengths */ +#define TCPOLEN_MPTCP_MPC_SYN 4 +#define TCPOLEN_MPTCP_MPC_SYNACK 12 +#define TCPOLEN_MPTCP_MPC_ACK 20 +#define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK32 4 +#define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP32 10 +#define TCPOLEN_MPTCP_DSS_MAP64 14 +#define TCPOLEN_MPTCP_DSS_CHECKSUM 2 + +/* MPTCP MP_CAPABLE flags */ +#define MPTCP_VERSION_MASK (0x0F) +#define MPTCP_CAP_CHECKSUM_REQD BIT(7) +#define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_HMAC_SHA256 BIT(0) +#define MPTCP_CAP_FLAG_MASK (0x3F) + +/* MPTCP DSS flags */ +#define MPTCP_DSS_DATA_FIN BIT(4) +#define MPTCP_DSS_DSN64 BIT(3) +#define MPTCP_DSS_HAS_MAP BIT(2) +#define MPTCP_DSS_ACK64 BIT(1) +#define MPTCP_DSS_HAS_ACK BIT(0) +#define MPTCP_DSS_FLAG_MASK (0x1F) + +/* MPTCP socket flags */ +#define MPTCP_DATA_READY BIT(0) +#define MPTCP_SEND_SPACE BIT(1) + +/* MPTCP connection sock */ +struct mptcp_sock { + /* inet_connection_sock must be the first member */ + struct inet_connection_sock sk; + u64 local_key; + u64 remote_key; + u64 write_seq; + u64 ack_seq; + u32 token; + unsigned long flags; + bool can_ack; + struct list_head conn_list; + struct skb_ext *cached_ext; /* for the next sendmsg */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct sock *first; +}; + +#define mptcp_for_each_subflow(__msk, __subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + +static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) +{ + return (struct mptcp_sock *)sk; +} + +struct mptcp_subflow_request_sock { + struct tcp_request_sock sk; + u16 mp_capable : 1, + mp_join : 1, + backup : 1, + remote_key_valid : 1; + u64 local_key; + u64 remote_key; + u64 idsn; + u32 token; + u32 ssn_offset; +}; + +static inline struct mptcp_subflow_request_sock * +mptcp_subflow_rsk(const struct request_sock *rsk) +{ + return (struct mptcp_subflow_request_sock *)rsk; +} + +/* MPTCP subflow context */ +struct mptcp_subflow_context { + struct list_head node;/* conn_list of subflows */ + u64 local_key; + u64 remote_key; + u64 idsn; + u64 map_seq; + u32 snd_isn; + u32 token; + u32 rel_write_seq; + u32 map_subflow_seq; + u32 ssn_offset; + u32 map_data_len; + u32 request_mptcp : 1, /* send MP_CAPABLE */ + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + conn_finished : 1, + map_valid : 1, + mpc_map : 1, + data_avail : 1, + rx_eof : 1, + can_ack : 1; /* only after processing the remote a key */ + + struct sock *tcp_sock; /* tcp sk backpointer */ + struct sock *conn; /* parent mptcp_sock */ + const struct inet_connection_sock_af_ops *icsk_af_ops; + void (*tcp_data_ready)(struct sock *sk); + void (*tcp_state_change)(struct sock *sk); + void (*tcp_write_space)(struct sock *sk); + + struct rcu_head rcu; +}; + +static inline struct mptcp_subflow_context * +mptcp_subflow_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Use RCU on icsk_ulp_data only for sock diag code */ + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; +} + +static inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +static inline u64 +mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) +{ + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - + subflow->ssn_offset - + subflow->map_subflow_seq; +} + +static inline u64 +mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) +{ + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); +} + +int mptcp_is_enabled(struct net *net); +bool mptcp_subflow_data_available(struct sock *sk); +void mptcp_subflow_init(void); +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); + +static inline void mptcp_subflow_tcp_fallback(struct sock *sk, + struct mptcp_subflow_context *ctx) +{ + sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_state_change = ctx->tcp_state_change; + sk->sk_write_space = ctx->tcp_write_space; + + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; +} + +extern const struct inet_connection_sock_af_ops ipv4_specific; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +extern const struct inet_connection_sock_af_ops ipv6_specific; +#endif + +void mptcp_proto_init(void); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcp_proto_v6_init(void); +#endif + +struct mptcp_read_arg { + struct msghdr *msg; +}; + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len); + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_finish_connect(struct sock *sk); + +int mptcp_token_new_request(struct request_sock *req); +void mptcp_token_destroy_request(u32 token); +int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_accept(u32 token); +void mptcp_token_update_accept(struct sock *sk, struct sock *conn); +void mptcp_token_destroy(u32 token); + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); +static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hash_out); + +static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +{ + return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); +} + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + +#endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..1662e1178949 --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/ip6_route.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +static int subflow_rebuild_header(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + int err = 0; + + if (subflow->request_mptcp && !subflow->token) { + pr_debug("subflow=%p", sk); + err = mptcp_token_new_connect(sk); + } + + if (err) + return err; + + return subflow->icsk_af_ops->rebuild_header(sk); +} + +static void subflow_req_destructor(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + pr_debug("subflow_req=%p", subflow_req); + + if (subflow_req->mp_capable) + mptcp_token_destroy_request(subflow_req->token); + tcp_request_sock_ops.destructor(req); +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct tcp_options_received rx_opt; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); + mptcp_get_options(skb, &rx_opt); + + subflow_req->mp_capable = 0; + subflow_req->remote_key_valid = 0; + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return; +#endif + + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + int err; + + err = mptcp_token_new_request(req); + if (err == 0) + subflow_req->mp_capable = 1; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } +} + +static void subflow_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static void subflow_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} +#endif + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(sk); + subflow->conn_finished = 1; + + if (skb) { + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + } + } +} + +static struct request_sock_ops subflow_request_sock_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; + +static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + /* Never answer to SYNs sent to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv4_ops, + sk, skb); +drop: + tcp_listendrop(sk); + return 0; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; +static struct inet_connection_sock_af_ops subflow_v6_specific; +static struct inet_connection_sock_af_ops subflow_v6m_specific; + +static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + if (skb->protocol == htons(ETH_P_IP)) + return subflow_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv6_ops, sk, skb); + +drop: + tcp_listendrop(sk); + return 0; /* don't send reset */ +} +#endif + +static struct sock *subflow_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct mptcp_subflow_request_sock *subflow_req; + struct tcp_options_received opt_rx; + struct sock *child; + + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + + /* if the sk is MP_CAPABLE, we try to fetch the client key */ + subflow_req = mptcp_subflow_rsk(req); + if (subflow_req->mp_capable) { + if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { + /* here we can receive and accept an in-window, + * out-of-order pkt, which will not carry the MP_CAPABLE + * opt even on mptcp enabled paths + */ + goto create_child; + } + + opt_rx.mptcp.mp_capable = 0; + mptcp_get_options(skb, &opt_rx); + if (opt_rx.mptcp.mp_capable) { + subflow_req->remote_key = opt_rx.mptcp.sndr_key; + subflow_req->remote_key_valid = 1; + } else { + subflow_req->mp_capable = 0; + } + } + +create_child: + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); + + if (child && *own_req) { + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + + /* we have null ctx on TCP fallback, not fatal on MPC + * handshake + */ + if (!ctx) + return child; + + if (ctx->mp_capable) { + if (mptcp_token_new_accept(ctx->token)) + goto close_child; + } + } + + return child; + +close_child: + pr_debug("closing child socket"); + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; +} + +static struct inet_connection_sock_af_ops subflow_specific; + +enum mapping_status { + MAPPING_OK, + MAPPING_INVALID, + MAPPING_EMPTY, + MAPPING_DATA_FIN +}; + +static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) +{ + if ((u32)seq == (u32)old_seq) + return old_seq; + + /* Assume map covers data not mapped yet. */ + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); +} + +static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +{ + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); +} + +static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned int skb_consumed; + + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(skb_consumed >= skb->len)) + return true; + + return skb->len - skb_consumed <= subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); +} + +static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + + if (unlikely(before(ssn, subflow->map_subflow_seq))) { + /* Mapping covers data later in the subflow stream, + * currently unsupported. + */ + warn_bad_map(subflow, ssn); + return false; + } + if (unlikely(!before(ssn, subflow->map_subflow_seq + + subflow->map_data_len))) { + /* Mapping does covers past subflow data, invalid */ + warn_bad_map(subflow, ssn + skb->len); + return false; + } + return true; +} + +static enum mapping_status get_mapping_status(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_ext *mpext; + struct sk_buff *skb; + u16 data_len; + u64 map_seq; + + skb = skb_peek(&ssk->sk_receive_queue); + if (!skb) + return MAPPING_EMPTY; + + mpext = mptcp_get_ext(skb); + if (!mpext || !mpext->use_map) { + if (!subflow->map_valid && !skb->len) { + /* the TCP stack deliver 0 len FIN pkt to the receive + * queue, that is the only 0len pkts ever expected here, + * and we can admit no mapping only for 0 len pkts + */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + WARN_ONCE(1, "0len seq %d:%d flags %x", + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + TCP_SKB_CB(skb)->tcp_flags); + sk_eat_skb(ssk, skb); + return MAPPING_EMPTY; + } + + if (!subflow->map_valid) + return MAPPING_INVALID; + + goto validate_seq; + } + + pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", + mpext->data_seq, mpext->dsn64, mpext->subflow_seq, + mpext->data_len, mpext->data_fin); + + data_len = mpext->data_len; + if (data_len == 0) { + pr_err("Infinite mapping not handled"); + return MAPPING_INVALID; + } + + if (mpext->data_fin == 1) { + if (data_len == 1) { + pr_debug("DATA_FIN with no payload"); + if (subflow->map_valid) { + /* A DATA_FIN might arrive in a DSS + * option before the previous mapping + * has been fully consumed. Continue + * handling the existing mapping. + */ + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } else { + return MAPPING_DATA_FIN; + } + } + + /* Adjust for DATA_FIN using 1 byte of sequence space */ + data_len--; + } + + if (!mpext->dsn64) { + map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, + mpext->data_seq); + pr_debug("expanded seq=%llu", subflow->map_seq); + } else { + map_seq = mpext->data_seq; + } + + if (subflow->map_valid) { + /* Allow replacing only with an identical map */ + if (subflow->map_seq == map_seq && + subflow->map_subflow_seq == mpext->subflow_seq && + subflow->map_data_len == data_len) { + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } + + /* If this skb data are fully covered by the current mapping, + * the new map would need caching, which is not supported + */ + if (skb_is_fully_mapped(ssk, skb)) + return MAPPING_INVALID; + + /* will validate the next map after consuming the current one */ + return MAPPING_OK; + } + + subflow->map_seq = map_seq; + subflow->map_subflow_seq = mpext->subflow_seq; + subflow->map_data_len = data_len; + subflow->map_valid = 1; + subflow->mpc_map = mpext->mpc_map; + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_seq, subflow->map_subflow_seq, + subflow->map_data_len); + +validate_seq: + /* we revalidate valid mapping on new skb, because we must ensure + * the current skb is completely covered by the available mapping + */ + if (!validate_mapping(ssk, skb)) + return MAPPING_INVALID; + + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; +} + +static bool subflow_check_data_avail(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + enum mapping_status status; + struct mptcp_sock *msk; + struct sk_buff *skb; + + pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, + subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); + if (subflow->data_avail) + return true; + + if (!subflow->conn) + return false; + + msk = mptcp_sk(subflow->conn); + for (;;) { + u32 map_remaining; + size_t delta; + u64 ack_seq; + u64 old_ack; + + status = get_mapping_status(ssk); + pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + if (status == MAPPING_INVALID) { + ssk->sk_err = EBADMSG; + goto fatal; + } + + if (status != MAPPING_OK) + return false; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return false; + + /* if msk lacks the remote key, this subflow must provide an + * MP_CAPABLE-based mapping + */ + if (unlikely(!READ_ONCE(msk->can_ack))) { + if (!subflow->mpc_map) { + ssk->sk_err = EBADMSG; + goto fatal; + } + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->map_seq); + WRITE_ONCE(msk->can_ack, true); + } + + old_ack = READ_ONCE(msk->ack_seq); + ack_seq = mptcp_subflow_get_mapped_dsn(subflow); + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + ack_seq); + if (ack_seq == old_ack) + break; + + /* only accept in-sequence mapping. Old values are spurious + * retransmission; we can hit "future" values on active backup + * subflow switch, we relay on retransmissions to get + * in-sequence data. + * Cuncurrent subflows support will require subflow data + * reordering + */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + if (before64(ack_seq, old_ack)) + delta = min_t(size_t, old_ack - ack_seq, map_remaining); + else + delta = min_t(size_t, ack_seq - old_ack, map_remaining); + + /* discard mapped data */ + pr_debug("discarding %zu bytes, current map len=%d", delta, + map_remaining); + if (delta) { + struct mptcp_read_arg arg = { + .msg = NULL, + }; + read_descriptor_t desc = { + .count = delta, + .arg.data = &arg, + }; + int ret; + + ret = tcp_read_sock(ssk, &desc, mptcp_read_actor); + if (ret < 0) { + ssk->sk_err = -ret; + goto fatal; + } + if (ret < delta) + return false; + if (delta == map_remaining) + subflow->map_valid = 0; + } + } + return true; + +fatal: + /* fatal protocol error, close the socket */ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + ssk->sk_error_report(ssk); + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + return false; +} + +bool mptcp_subflow_data_available(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sk_buff *skb; + + /* check if current mapping is still valid */ + if (subflow->map_valid && + mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { + subflow->map_valid = 0; + subflow->data_avail = 0; + + pr_debug("Done with mapping: seq=%u data_len=%u", + subflow->map_subflow_seq, + subflow->map_data_len); + } + + if (!subflow_check_data_avail(sk)) { + subflow->data_avail = 0; + return false; + } + + skb = skb_peek(&sk->sk_receive_queue); + subflow->data_avail = skb && + before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq); + return subflow->data_avail; +} + +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + if (!parent || !subflow->mp_capable) { + subflow->tcp_data_ready(sk); + + if (parent) + parent->sk_data_ready(parent); + return; + } + + if (mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } +} + +static void subflow_write_space(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + sk_stream_write_space(sk); + if (parent && sk_stream_is_writeable(sk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); + smp_mb__after_atomic(); + /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ + sk_stream_write_space(parent); + } +} + +static struct inet_connection_sock_af_ops * +subflow_default_af_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (sk->sk_family == AF_INET6) + return &subflow_v6_specific; +#endif + return &subflow_specific; +} + +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock_af_ops *target; + + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); + + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + + if (likely(icsk->icsk_af_ops == target)) + return; + + subflow->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = target; +#endif +} + +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) +{ + struct mptcp_subflow_context *subflow; + struct net *net = sock_net(sk); + struct socket *sf; + int err; + + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, + &sf); + if (err) + return err; + + lock_sock(sf->sk); + + /* kernel sockets do not by default acquire net ref, but TCP timer + * needs it. + */ + sf->sk->sk_net_refcnt = 1; + get_net(net); + this_cpu_add(*net->core.sock_inuse, 1); + err = tcp_set_ulp(sf->sk, "mptcp"); + release_sock(sf->sk); + + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + pr_debug("subflow=%p", subflow); + + *new_sock = sf; + sock_hold(sk); + subflow->conn = sk; + + return 0; +} + +static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, + gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + + ctx = kzalloc(sizeof(*ctx), priority); + if (!ctx) + return NULL; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_LIST_HEAD(&ctx->node); + + pr_debug("subflow=%p", ctx); + + ctx->tcp_sock = sk; + + return ctx; +} + +static void __subflow_state_change(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +static void subflow_state_change(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = READ_ONCE(subflow->conn); + + __subflow_state_change(sk); + + /* as recvmsg() does not acquire the subflow socket for ssk selection + * a fin packet carrying a DSS can be unnoticed if we don't trigger + * the data available machinery here. + */ + if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } + + if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) && + !subflow->rx_eof && subflow_is_done(sk)) { + subflow->rx_eof = 1; + parent->sk_shutdown |= RCV_SHUTDOWN; + __subflow_state_change(parent); + } +} + +static int subflow_ulp_init(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + + /* disallow attaching ULP to a socket unless it has been + * created with sock_create_kern() + */ + if (!sk->sk_kern_sock) { + err = -EOPNOTSUPP; + goto out; + } + + ctx = subflow_create_ctx(sk, GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + + tp->is_mptcp = 1; + ctx->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = subflow_default_af_ops(sk); + ctx->tcp_data_ready = sk->sk_data_ready; + ctx->tcp_state_change = sk->sk_state_change; + ctx->tcp_write_space = sk->sk_write_space; + sk->sk_data_ready = subflow_data_ready; + sk->sk_write_space = subflow_write_space; + sk->sk_state_change = subflow_state_change; +out: + return err; +} + +static void subflow_ulp_release(struct sock *sk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + + if (!ctx) + return; + + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + +static void subflow_ulp_fallback(struct sock *sk, + struct mptcp_subflow_context *old_ctx) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_subflow_tcp_fallback(sk, old_ctx); + icsk->icsk_ulp_ops = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tcp_sk(sk)->is_mptcp = 0; +} + +static void subflow_ulp_clone(const struct request_sock *req, + struct sock *newsk, + const gfp_t priority) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); + struct mptcp_subflow_context *new_ctx; + + if (!subflow_req->mp_capable) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + new_ctx = subflow_create_ctx(newsk, priority); + if (!new_ctx) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully + * established only after we receive the remote key + */ + new_ctx->conn_finished = 1; + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; + new_ctx->tcp_state_change = old_ctx->tcp_state_change; + new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->mp_capable = 1; + new_ctx->fourth_ack = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; +} + +static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { + .name = "mptcp", + .owner = THIS_MODULE, + .init = subflow_ulp_init, + .release = subflow_ulp_release, + .clone = subflow_ulp_clone, +}; + +static int subflow_ops_init(struct request_sock_ops *subflow_ops) +{ + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); + subflow_ops->slab_name = "request_sock_subflow"; + + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, + subflow_ops->obj_size, 0, + SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU, + NULL); + if (!subflow_ops->slab) + return -ENOMEM; + + subflow_ops->destructor = subflow_req_destructor; + + return 0; +} + +void mptcp_subflow_init(void) +{ + subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&subflow_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow request sock ops\n"); + + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + + subflow_specific = ipv4_specific; + subflow_specific.conn_request = subflow_v4_conn_request; + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + + subflow_v6_specific = ipv6_specific; + subflow_v6_specific.conn_request = subflow_v6_conn_request; + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_rebuild_header; + + subflow_v6m_specific = subflow_v6_specific; + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; + subflow_v6m_specific.send_check = ipv4_specific.send_check; + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; + subflow_v6m_specific.net_frag_header_len = 0; +#endif + + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} diff --git a/net/mptcp/token.c b/net/mptcp/token.c new file mode 100644 index 000000000000..84d887806090 --- /dev/null +++ b/net/mptcp/token.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP token management + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, + * authored by: + * + * Sébastien Barré <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien Duchêne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/radix-tree.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/protocol.h> +#include <net/mptcp.h> +#include "protocol.h" + +static RADIX_TREE(token_tree, GFP_ATOMIC); +static RADIX_TREE(token_req_tree, GFP_ATOMIC); +static DEFINE_SPINLOCK(token_tree_lock); +static int token_used __read_mostly; + +/** + * mptcp_token_new_request - create new key/idsn/token for subflow_request + * @req - the request socket + * + * This function is called when a new mptcp connection is coming in. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * Returns 0 on success. + */ +int mptcp_token_new_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + + err = radix_tree_insert(&token_req_tree, + subflow_req->token, &token_used); + spin_unlock_bh(&token_tree_lock); + return err; +} + +/** + * mptcp_token_new_connect - create new key/idsn/token for subflow + * @sk - the socket that will initiate a connection + * + * This function is called when a new outgoing mptcp connection is + * initiated. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * On success, the mptcp connection can be found again using + * the computed token at a later time, this is needed to process + * join requests. + * + * returns 0 on success. + */ +int mptcp_token_new_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *mptcp_sock = subflow->conn; + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); + + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + + token = subflow->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_new_accept - insert token for later processing + * @token: the token to insert to the tree + * + * Called when a SYN packet creates a new logical connection, i.e. + * is not a join request. + * + * We don't have an mptcp socket yet at that point. + * This is paired with mptcp_token_update_accept, called on accept(). + */ +int mptcp_token_new_accept(u32 token) +{ + int err; + + spin_lock_bh(&token_tree_lock); + err = radix_tree_insert(&token_tree, token, &token_used); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_update_accept - update token to map to mptcp socket + * @conn: the new struct mptcp_sock + * @sk: the initial subflow for this mptcp socket + * + * Called when the first mptcp socket is created on accept to + * refresh the dummy mapping (done to reserve the token) with + * the mptcp_socket structure that wasn't allocated before. + */ +void mptcp_token_update_accept(struct sock *sk, struct sock *conn) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + void __rcu **slot; + + spin_lock_bh(&token_tree_lock); + slot = radix_tree_lookup_slot(&token_tree, subflow->token); + WARN_ON_ONCE(!slot); + if (slot) { + WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used); + radix_tree_replace_slot(&token_tree, slot, conn); + } + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy_request - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove not-yet-fully-established incoming connection identified + * by @token. + */ +void mptcp_token_destroy_request(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_req_tree, token); + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove the connection identified by @token. + */ +void mptcp_token_destroy(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_tree, token); + spin_unlock_bh(&token_tree_lock); +} diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index ad3fd7f1da75..e37102546be6 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -64,6 +64,17 @@ enum { NCSI_MODE_MAX }; +/* Supported media status bits for Mellanox Mac affinity command. + * Bit (0-2) for different protocol support; Bit 1 for RBT support, + * bit 1 for SMBUS support and bit 2 for PCIE support. Bit (3-5) + * for different protocol availability. Bit 4 for RBT, bit 4 for + * SMBUS and bit 5 for PCIE. + */ +enum { + MLX_MC_RBT_SUPPORT = 0x01, /* MC supports RBT */ + MLX_MC_RBT_AVL = 0x08, /* RBT medium is available */ +}; + /* OEM Vendor Manufacture ID */ #define NCSI_OEM_MFR_MLX_ID 0x8119 #define NCSI_OEM_MFR_BCM_ID 0x113d @@ -72,9 +83,15 @@ enum { /* Mellanox specific OEM Command */ #define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ #define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ +#define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ +#define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 +#define NCSI_OEM_MLX_CMD_SMAF_LEN 60 +/* Offset in OEM request */ +#define MLX_SMAF_MAC_ADDR_OFFSET 8 /* Offset for MAC in SMAF */ +#define MLX_SMAF_MED_SUPPORT_OFFSET 14 /* Offset for medium in SMAF */ /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 @@ -251,6 +268,8 @@ enum { ncsi_dev_state_probe_deselect = 0x0201, ncsi_dev_state_probe_package, ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_mlx_gma, + ncsi_dev_state_probe_mlx_smaf, ncsi_dev_state_probe_cis, ncsi_dev_state_probe_gvi, ncsi_dev_state_probe_gc, @@ -311,6 +330,7 @@ struct ncsi_dev_priv { struct list_head vlan_vids; /* List of active VLAN IDs */ bool multi_package; /* Enable multiple packages */ + bool mlx_multi_host; /* Enable multi host Mellanox */ u32 package_whitelist; /* Packages to configure */ }; diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 0187e65176c0..ba9ae482141b 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -369,7 +369,15 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) eh = skb_push(nr->cmd, sizeof(*eh)); eh->h_proto = htons(ETH_P_NCSI); eth_broadcast_addr(eh->h_dest); - eth_broadcast_addr(eh->h_source); + + /* If mac address received from device then use it for + * source address as unicast address else use broadcast + * address as source address + */ + if (nca->ndp->gma_flag == 1) + memcpy(eh->h_source, nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + else + eth_broadcast_addr(eh->h_source); /* Start the timer for the request that might not have * corresponding response. Given NCSI is an internal diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index e20b81514029..1f387be7827b 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -8,6 +8,8 @@ #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/of.h> +#include <linux/platform_device.h> #include <net/ncsi.h> #include <net/net_namespace.h> @@ -730,6 +732,34 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_SMAF_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_SMAF_LEN / sizeof(u32)]; + } u; + int ret = 0; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF; + u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM; + memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET], + nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + u.data_u8[MLX_SMAF_MED_SUPPORT_OFFSET] = + (MLX_MC_RBT_AVL | MLX_MC_RBT_SUPPORT); + + nca->payload = NCSI_OEM_MLX_CMD_SMAF_LEN; + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during probe\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; @@ -1310,8 +1340,38 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) break; } nd->state = ncsi_dev_state_probe_cis; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) && + ndp->mlx_multi_host) + nd->state = ncsi_dev_state_probe_mlx_gma; + schedule_work(&ndp->work); break; +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + case ncsi_dev_state_probe_mlx_gma: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_gma_handler_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_mlx_smaf; + break; + case ncsi_dev_state_probe_mlx_smaf: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_smaf_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1621,6 +1681,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, { struct ncsi_dev_priv *ndp; struct ncsi_dev *nd; + struct platform_device *pdev; + struct device_node *np; unsigned long flags; int i; @@ -1667,6 +1729,13 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, /* Set up generic netlink interface */ ncsi_init_netlink(dev); + pdev = to_platform_device(dev->dev.parent); + if (pdev) { + np = pdev->dev.of_node; + if (np && of_get_property(np, "mlx,multi-host", NULL)) + ndp->mlx_multi_host = true; + } + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5e9b2eb24349..3f572e5a975e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 1abd6f0dc227..26ab0e9612d8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -60,9 +60,9 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - ip_set_free(map->members); if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); + ip_set_free(map->members); ip_set_free(map); set->data = NULL; @@ -75,7 +75,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index abe8f77d7d23..0a2196f59106 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -322,7 +322,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index b618713297da..739e343efaf6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -42,7 +42,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -299,7 +299,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -360,7 +360,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 23d6095cb196..b49978dd810d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -30,7 +30,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -231,7 +231,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -271,7 +271,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 169e0a04f814..cf895bc80871 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1848,6 +1848,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; + u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || @@ -1864,7 +1865,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); - ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); + ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 8dc892a9dc91..605e0f68f8bd 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1239,7 +1239,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, p = msg_end; if (p + sizeof(s->v4) > buffer+buflen) { - IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); return; } s = (union ip_vs_sync_conn *)p; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b6b14db3955b..b3f4a334f9d7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -677,6 +677,9 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = dn->dccp_timeout; + /* set default DCCP timeouts. */ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fce3d93f1541..4f897b14b606 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, @@ -130,7 +130,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, @@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - ct->proto.sctp.state = new_state; + ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; @@ -594,6 +594,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; + if (!timeouts) + timeouts = sn->timeouts; + /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9889d52eda82..7e91989a1b55 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -61,9 +61,9 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) - flow->flags |= FLOW_OFFLOAD_SNAT; + __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) - flow->flags |= FLOW_OFFLOAD_DNAT; + __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; @@ -134,11 +134,6 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static inline __s32 nf_flow_timeout_delta(unsigned int timeout) -{ - return (__s32)(timeout - (u32)jiffies); -} - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; @@ -187,8 +182,6 @@ void flow_offload_free(struct flow_offload *flow) default: break; } - if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(flow->ct, 0, 0); nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } @@ -232,7 +225,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, @@ -250,8 +243,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD) + if (nf_flowtable_hw_offload(flow_table)) { + __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); + } return 0; } @@ -276,7 +271,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table, if (nf_flow_has_expired(flow)) flow_offload_fixup_ct(flow->ct); - else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) + else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); @@ -284,7 +279,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { - flow->flags |= FLOW_OFFLOAD_TEARDOWN; + set_bit(NF_FLOW_TEARDOWN, &flow->flags); flow_offload_fixup_ct_state(flow->ct); } @@ -305,7 +300,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) @@ -353,19 +348,18 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - if (flow->flags & FLOW_OFFLOAD_HW) - nf_flow_offload_stats(flow_table, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) { - if (flow->flags & FLOW_OFFLOAD_HW) { - if (!(flow->flags & FLOW_OFFLOAD_HW_DYING)) + test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (test_bit(NF_FLOW_HW, &flow->flags)) { + if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); - else if (flow->flags & FLOW_OFFLOAD_HW_DEAD) + else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_HW, &flow->flags)) { + nf_flow_offload_stats(flow_table, flow); } } @@ -529,7 +523,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) - flow_offload_dead(flow); + flow_offload_teardown(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index b9e7dd6e60ce..9e563fd3da0f 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; @@ -232,6 +232,13 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + return nf_flowtable_hw_offload(flow_table) && + test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags); +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -272,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -280,7 +290,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; iph = ip_hdr(skb); ip_decrease_ttl(iph); skb->tstamp = 0; @@ -414,11 +424,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; @@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -509,7 +522,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ipv6(flow, skb, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; ip6h = ipv6_hdr(skb); ip6h->hop_limit--; skb->tstamp = 0; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 0d72e5ccb47b..c8b70ffeef0c 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -24,6 +24,7 @@ struct flow_offload_work { }; struct nf_flow_key { + struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; union { @@ -55,6 +56,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); @@ -62,6 +64,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -105,7 +110,8 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) | + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | BIT(FLOW_DISSECTOR_KEY_BASIC) | BIT(FLOW_DISSECTOR_KEY_PORTS); return 0; @@ -166,24 +172,38 @@ static int flow_offload_eth_dst(struct net *net, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple; struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct dst_entry *dst_cache; + unsigned char ha[ETH_ALEN]; struct neighbour *n; u32 mask, val; + u8 nud_state; u16 val16; - n = dst_neigh_lookup(tuple->dst_cache, &tuple->dst_v4); + dst_cache = flow->tuplehash[dir].tuple.dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -ENOENT; + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + + if (!(nud_state & NUD_VALID)) { + neigh_release(n); + return -ENOENT; + } + mask = ~0xffffffff; - memcpy(&val, n->ha, 4); + memcpy(&val, ha, 4); flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, &val, &mask); mask = ~0x0000ffff; - memcpy(&val16, n->ha + 4, 2); + memcpy(&val16, ha + 4, 2); val = val16; flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); @@ -335,22 +355,26 @@ static void flow_offload_port_snat(struct net *net, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); - u32 mask = ~htonl(0xffff0000), port; + u32 mask, port; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); break; case FLOW_OFFLOAD_DIR_REPLY: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); break; default: return; } - port = htonl(port << 16); + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); } @@ -361,22 +385,26 @@ static void flow_offload_port_dnat(struct net *net, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); - u32 mask = ~htonl(0xffff), port; + u32 mask, port; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: - port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); - offset = 0; /* offsetof(struct tcphdr, source); */ + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); break; case FLOW_OFFLOAD_DIR_REPLY: - port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); - offset = 0; /* offsetof(struct tcphdr, dest); */ + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); break; default: return; } - port = htonl(port); + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); } @@ -422,16 +450,16 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; - if (flow->flags & FLOW_OFFLOAD_SNAT) { + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv4_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_DNAT) { + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv4_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_SNAT || - flow->flags & FLOW_OFFLOAD_DNAT) + if (test_bit(NF_FLOW_SNAT, &flow->flags) || + test_bit(NF_FLOW_DNAT, &flow->flags)) flow_offload_ipv4_checksum(net, flow, flow_rule); flow_offload_redirect(flow, dir, flow_rule); @@ -448,11 +476,11 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; - if (flow->flags & FLOW_OFFLOAD_SNAT) { + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv6_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_DNAT) { + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv6_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } @@ -564,23 +592,25 @@ static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, cls_flow->cookie = (unsigned long)tuple; } -static int flow_offload_tuple_add(struct flow_offload_work *offload, - struct nf_flow_rule *flow_rule, - enum flow_offload_tuple_dir dir) +static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, + struct flow_offload *flow, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir, + int priority, int cmd, + struct list_head *block_cb_list) { - struct nf_flowtable *flowtable = offload->flowtable; struct flow_cls_offload cls_flow = {}; struct flow_block_cb *block_cb; struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_REPLACE, - &offload->flow->tuplehash[dir].tuple, &extack); - cls_flow.rule = flow_rule->rule; + nf_flow_offload_init(&cls_flow, proto, priority, cmd, + &flow->tuplehash[dir].tuple, &extack); + if (cmd == FLOW_CLS_REPLACE) + cls_flow.rule = flow_rule->rule; - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) { + list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); if (err < 0) @@ -592,23 +622,22 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload, return i; } +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + return nf_flow_offload_tuple(offload->flowtable, offload->flow, + flow_rule, dir, offload->priority, + FLOW_CLS_REPLACE, + &offload->flowtable->flow_block.cb_list); +} + static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { - struct nf_flowtable *flowtable = offload->flowtable; - struct flow_cls_offload cls_flow = {}; - struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; - - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_DESTROY, - &offload->flow->tuplehash[dir].tuple, &extack); - - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) - block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); - - offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD; + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_DESTROY, + &offload->flowtable->flow_block.cb_list); } static int flow_offload_rule_add(struct flow_offload_work *offload, @@ -626,20 +655,20 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, return 0; } -static int flow_offload_work_add(struct flow_offload_work *offload) +static void flow_offload_work_add(struct flow_offload_work *offload) { struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; int err; err = nf_flow_offload_alloc(offload, flow_rule); if (err < 0) - return -ENOMEM; + return; err = flow_offload_rule_add(offload, flow_rule); + if (err < 0) + set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); nf_flow_offload_destroy(flow_rule); - - return err; } static void flow_offload_work_del(struct flow_offload_work *offload) @@ -684,7 +713,6 @@ static void flow_offload_work_handler(struct work_struct *work) { struct flow_offload_work *offload, *next; LIST_HEAD(offload_pending_list); - int ret; spin_lock_bh(&flow_offload_pending_list_lock); list_replace_init(&flow_offload_pending_list, &offload_pending_list); @@ -693,9 +721,7 @@ static void flow_offload_work_handler(struct work_struct *work) list_for_each_entry_safe(offload, next, &offload_pending_list, list) { switch (offload->cmd) { case FLOW_CLS_REPLACE: - ret = flow_offload_work_add(offload); - if (ret < 0) - offload->flow->flags &= ~FLOW_OFFLOAD_HW; + flow_offload_work_add(offload); break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); @@ -720,20 +746,33 @@ static void flow_offload_queue_work(struct flow_offload_work *offload) schedule_work(&nf_flow_offload_work); } -void nf_flow_offload_add(struct nf_flowtable *flowtable, - struct flow_offload *flow) +static struct flow_offload_work * +nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, + struct flow_offload *flow, unsigned int cmd) { struct flow_offload_work *offload; offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); if (!offload) - return; + return NULL; - offload->cmd = FLOW_CLS_REPLACE; + offload->cmd = cmd; offload->flow = flow; offload->priority = flowtable->priority; offload->flowtable = flowtable; - flow->flags |= FLOW_OFFLOAD_HW; + + return offload; +} + + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); + if (!offload) + return; flow_offload_queue_work(offload); } @@ -743,15 +782,11 @@ void nf_flow_offload_del(struct nf_flowtable *flowtable, { struct flow_offload_work *offload; - offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); if (!offload) return; - offload->cmd = FLOW_CLS_DESTROY; - offload->flow = flow; - offload->flow->flags |= FLOW_OFFLOAD_HW_DYING; - offload->flowtable = flowtable; - + set_bit(NF_FLOW_HW_DYING, &flow->flags); flow_offload_queue_work(offload); } @@ -759,27 +794,22 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, struct flow_offload *flow) { struct flow_offload_work *offload; - s64 delta; + __s32 delta; - delta = flow->timeout - jiffies; - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) || - flow->flags & FLOW_OFFLOAD_HW_DYING) + delta = nf_flow_timeout_delta(flow->timeout); + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) return; - offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); if (!offload) return; - offload->cmd = FLOW_CLS_STATS; - offload->flow = flow; - offload->flowtable = flowtable; - flow_offload_queue_work(offload); } void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { - if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD) + if (nf_flowtable_hw_offload(flowtable)) flush_work(&nf_flow_offload_work); } @@ -808,28 +838,44 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, return err; } -int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd) +static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) { - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; int err; - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + if (!nf_flowtable_hw_offload(flowtable)) return 0; if (!dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; - bo.net = dev_net(dev); - bo.block = &flowtable->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + memset(bo, 0, sizeof(*bo)); + bo->net = dev_net(dev); + bo->block = &flowtable->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + if (err < 0) + return err; + + return 0; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, &bo); + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); if (err < 0) return err; diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 0a59c14b5177..64eedc17037a 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -233,6 +233,19 @@ icmp_manip_pkt(struct sk_buff *skb, return false; hdr = (struct icmphdr *)(skb->data + hdroff); + switch (hdr->type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + break; + default: + return true; + } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 273f3838318b..d1318bdf49ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -22,6 +22,8 @@ #include <net/net_namespace.h> #include <net/sock.h> +#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -551,46 +553,70 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * +__nft_chain_type_get(u8 family, enum nft_chain_types type) +{ + if (family >= NFPROTO_NUMPROTO || + type >= NFT_CHAIN_T_MAX) + return NULL; + + return chain_type[family][type]; +} + +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { + const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { - if (chain_type[family][i] != NULL && - !nla_strcmp(nla, chain_type[family][i]->name)) - return chain_type[family][i]; + type = __nft_chain_type_get(family, i); + if (!type) + continue; + if (!nla_strcmp(nla, type->name)) + return type; } return NULL; } -/* - * Loading a module requires dropping mutex that guards the - * transaction. - * We first need to abort any pending transactions as once - * mutex is unlocked a different client could start a new - * transaction. It must not see any 'future generation' - * changes * as these changes will never happen. - */ -#ifdef CONFIG_MODULES -static int __nf_tables_abort(struct net *net); +struct nft_module_request { + struct list_head list; + char module[MODULE_NAME_LEN]; + bool done; +}; -static void nft_request_module(struct net *net, const char *fmt, ...) +#ifdef CONFIG_MODULES +static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nft_module_request *req; va_list args; int ret; - __nf_tables_abort(net); - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); - if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) - return; + if (ret >= MODULE_NAME_LEN) + return 0; - mutex_unlock(&net->nft.commit_mutex); - request_module("%s", module_name); - mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(req, &net->nft.module_list, list) { + if (!strcmp(req->module, module_name)) { + if (req->done) + return 0; + + /* A request to load this module already exists. */ + return -EAGAIN; + } + } + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->done = false; + strlcpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &net->nft.module_list); + + return -EAGAIN; } #endif @@ -614,10 +640,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nft_request_module(net, "nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - type = __nf_tables_chain_type_lookup(nla, family); - if (type != NULL) + if (nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), + (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -1045,12 +1070,18 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + if (!nft_is_active_next(ctx->net, flowtable)) + continue; + err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + if (!nft_is_active_next(ctx->net, obj)) + continue; + err = nft_delobj(ctx, obj); if (err < 0) goto out; @@ -1153,11 +1184,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) void nft_register_chain_type(const struct nft_chain_type *ctype) { - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return; - nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } @@ -1241,7 +1269,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; @@ -1676,6 +1705,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, goto err_hook; } if (nft_hook_list_find(hook_list, hook)) { + kfree(hook); err = -EEXIST; goto err_hook; } @@ -1757,7 +1787,10 @@ static int nft_chain_parse_hook(struct net *net, hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = chain_type[family][NFT_CHAIN_T_DEFAULT]; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, autoload); @@ -2317,9 +2350,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, static int nft_expr_type_request_module(struct net *net, u8 family, struct nlattr *nla) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return -EAGAIN; return 0; @@ -2345,9 +2377,9 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); - nft_request_module(net, "nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), + (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2355,7 +2387,8 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, } static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { - [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_NAME] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, }; @@ -2437,9 +2470,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) - nft_expr_type_request_module(ctx->net, - ctx->family, - tb[NFTA_EXPR_NAME]); + if (nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]) != -EAGAIN) + err = -ENOENT; #endif goto err1; } @@ -3276,8 +3310,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nft_request_module(ctx->net, "nft-set"); - if (!list_empty(&nf_tables_set_types)) + if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -3358,6 +3391,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, + [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, @@ -3524,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input) return cpu_to_be64(jiffies64_to_msecs(input)); } +static int nf_tables_fill_set_concat(struct sk_buff *skb, + const struct nft_set *set) +{ + struct nlattr *concat, *field; + int i; + + concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); + if (!concat) + return -ENOMEM; + + for (i = 0; i < set->field_count; i++) { + field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); + if (!field) + return -ENOMEM; + + if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, + htonl(set->field_len[i]))) + return -ENOMEM; + + nla_nest_end(skb, field); + } + + nla_nest_end(skb, concat); + + return 0; +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -3587,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (desc == NULL) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) goto nla_put_failure; + + if (set->field_count > 1 && + nf_tables_fill_set_concat(skb, set)) + goto nla_put_failure; + nla_nest_end(skb, desc); nlmsg_end(skb, nlh); @@ -3764,6 +3831,53 @@ err: return err; } +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_set_desc_concat_parse(const struct nlattr *attr, + struct nft_set_desc *desc) +{ + struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; + u32 len; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, + nft_concat_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_SET_FIELD_LEN]) + return -EINVAL; + + len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); + + if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT) + return -E2BIG; + + desc->field_len[desc->field_count++] = len; + + return 0; +} + +static int nft_set_desc_concat(struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *attr; + int rem, err; + + nla_for_each_nested(attr, nla, rem) { + if (nla_type(attr) != NFTA_LIST_ELEM) + return -EINVAL; + + err = nft_set_desc_concat_parse(attr, desc); + if (err < 0) + return err; + } + + return 0; +} + static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { @@ -3777,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + if (da[NFTA_SET_DESC_CONCAT]) + err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); - return 0; + return err; } static int nf_tables_newset(struct net *net, struct sock *nlsk, @@ -3801,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned char *udata; u16 udlen; int err; + int i; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -3979,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->gc_int = gc_int; set->handle = nf_tables_alloc_handle(table); + set->field_count = desc.field_count; + for (i = 0; i < desc.field_count; i++) + set->field_len[i] = desc.field_len[i]; + err = ops->init(set, &desc, nla); if (err < 0) goto err3; @@ -4182,6 +4303,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, + [NFT_SET_EXT_KEY_END] = { + .align = __alignof__(u32), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -4198,7 +4322,9 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4248,6 +4374,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), + NFT_DATA_VALUE, set->klen) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, @@ -4490,11 +4621,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } +static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data *key, struct nlattr *attr) +{ + struct nft_data_desc desc; + int err; + + err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); + if (err < 0) + return err; + + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(key, desc.type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; @@ -4513,15 +4661,16 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(&elem.key.val, desc.type); - return err; + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; } priv = set->ops->get(ctx->net, set, &elem, flags); @@ -4649,8 +4798,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp) + const u32 *key, const u32 *key_end, + const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4663,6 +4812,8 @@ void *nft_set_elem_init(const struct nft_set *set, nft_set_ext_init(ext, tmpl); memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -4722,13 +4873,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); - struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; struct nft_userdata *udata; + struct nft_data_desc desc; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; @@ -4794,15 +4945,22 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; - err = -EINVAL; - if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) - goto err2; + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + goto err_parse_key; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4812,27 +4970,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; - goto err2; + goto err_parse_key_end; } obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err2; + goto err_parse_key_end; } nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &d2, + err = nft_data_init(ctx, &data, sizeof(data), &desc, nla[NFTA_SET_ELEM_DATA]); if (err < 0) - goto err2; + goto err_parse_key_end; err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) - goto err3; + if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) + goto err_parse_data; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { @@ -4848,18 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_validate_register_store(&bind_ctx, dreg, &data, - d2.type, d2.len); + desc.type, desc.len); if (err < 0) - goto err3; + goto err_parse_data; - if (d2.type == NFT_DATA_VERDICT && + if (desc.type == NFT_DATA_VERDICT && (data.verdict.code == NFT_GOTO || data.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); } /* The full maximum length of userdata can exceed the maximum @@ -4875,10 +5033,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, data.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) - goto err3; + goto err_parse_data; ext = nft_set_elem_ext(set, elem.priv); if (flags) @@ -4895,7 +5054,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) - goto err4; + goto err_trans; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(ctx->net, set, &elem, &ext2); @@ -4906,7 +5065,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { err = -EBUSY; - goto err5; + goto err_element_clash; } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && @@ -4919,33 +5078,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; } - goto err5; + goto err_element_clash; } if (set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; - goto err6; + goto err_set_full; } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err6: +err_set_full: set->ops->remove(ctx->net, set, &elem); -err5: +err_element_clash: kfree(trans); -err4: +err_trans: if (obj) obj->use--; kfree(elem.priv); -err3: +err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, d2.type); -err2: - nft_data_release(&elem.key.val, d1.type); -err1: + nft_data_release(&data, desc.type); +err_parse_key_end: + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); +err_parse_key: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); + return err; } @@ -5040,7 +5201,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; - struct nft_data_desc desc; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; @@ -5051,11 +5211,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) - goto err1; + return err; - err = -EINVAL; if (nla[NFTA_SET_ELEM_KEY] == NULL) - goto err1; + return -EINVAL; nft_set_ext_prepare(&tmpl); @@ -5065,37 +5224,41 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; + return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - 0, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, NULL, 0, 0, + GFP_KERNEL); if (elem.priv == NULL) - goto err2; + goto fail_elem; ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) { - err = -ENOMEM; - goto err3; - } + if (trans == NULL) + goto fail_trans; priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; - goto err4; + goto fail_ops; } kfree(elem.priv); elem.priv = priv; @@ -5106,13 +5269,12 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err4: +fail_ops: kfree(trans); -err3: +fail_trans: kfree(elem.priv); -err2: - nft_data_release(&elem.key.val, desc.type); -err1: +fail_elem: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } @@ -5402,8 +5564,7 @@ nft_obj_type_get(struct net *net, u32 objtype) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-obj-%u", objtype); - if (__nft_obj_type_get(objtype)) + if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -5976,14 +6137,14 @@ nft_flowtable_type_get(struct net *net, u8 family) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nf-flowtable-%u", family); - if (__nft_flowtable_type_get(family)) + if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } +/* Only called from error and netdev event paths. */ static void nft_unregister_flowtable_hook(struct net *net, struct nft_flowtable *flowtable, struct nft_hook *hook) @@ -5999,7 +6160,7 @@ static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_hook *hook; list_for_each_entry(hook, &flowtable->hook_list, list) - nft_unregister_flowtable_hook(net, flowtable, hook); + nf_unregister_net_hook(net, &hook->ops); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -6448,12 +6609,14 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { struct nft_hook *hook, *next; + flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree(hook); } kfree(flowtable->name); - flowtable->data.type->free(&flowtable->data); module_put(flowtable->data.type->owner); kfree(flowtable); } @@ -6497,6 +6660,7 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, if (hook->ops.dev != dev) continue; + /* flow_offload_netdev_event() cleans up entries for us. */ nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); @@ -6975,6 +7139,18 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_module_autoload_cleanup(struct net *net) +{ + struct nft_module_request *req, *next; + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!req->done); + list_del(&req->list); + kfree(req); + } +} + static void nf_tables_commit_release(struct net *net) { struct nft_trans *trans; @@ -6987,6 +7163,7 @@ static void nf_tables_commit_release(struct net *net) * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&net->nft.commit_list)) { + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); return; } @@ -7001,6 +7178,7 @@ static void nf_tables_commit_release(struct net *net) list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); schedule_work(&trans_destroy_work); @@ -7192,6 +7370,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_module_autoload(struct net *net) +{ + struct nft_module_request *req, *next; + LIST_HEAD(module_list); + + list_splice_init(&net->nft.module_list, &module_list); + mutex_unlock(&net->nft.commit_mutex); + list_for_each_entry_safe(req, next, &module_list, list) { + if (req->done) { + list_del(&req->list); + kfree(req); + } else { + request_module("%s", req->module); + req->done = true; + } + } + mutex_lock(&net->nft.commit_mutex); + list_splice(&module_list, &net->nft.module_list); +} + static void nf_tables_abort_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -7221,7 +7419,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net) +static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -7343,6 +7541,11 @@ static int __nf_tables_abort(struct net *net) nf_tables_abort_release(trans); } + if (autoload) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + return 0; } @@ -7351,9 +7554,9 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) { - int ret = __nf_tables_abort(net); + int ret = __nf_tables_abort(net, autoload); mutex_unlock(&net->nft.commit_mutex); @@ -7948,6 +8151,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + INIT_LIST_HEAD(&net->nft.module_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7959,7 +8163,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net); + __nf_tables_abort(net, false); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index a9ea29afb09f..2bb28483af22 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -564,7 +564,7 @@ static void nft_indr_block_cb(struct net_device *dev, mutex_lock(&net->nft.commit_mutex); chain = __nft_offload_get_chain(dev); - if (chain) { + if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { struct nft_base_chain *basechain; basechain = nft_base_chain(chain); diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c index a9fce8d10051..586b621007eb 100644 --- a/net/netfilter/nf_tables_set_core.c +++ b/net/netfilter/nf_tables_set_core.c @@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void) nft_register_set(&nft_set_rhash_type); nft_register_set(&nft_set_bitmap_type); nft_register_set(&nft_set_rbtree_type); + nft_register_set(&nft_set_pipapo_type); return 0; } static void __exit nf_tables_set_module_exit(void) { + nft_unregister_set(&nft_set_pipapo_type); nft_unregister_set(&nft_set_rbtree_type); nft_unregister_set(&nft_set_bitmap_type); nft_unregister_set(&nft_set_rhash_type); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abbb452cf6c..99127e2d95a8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -476,7 +476,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + ss->abort(net, oskb, true); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -487,11 +487,11 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb); + ss->abort(net, oskb, false); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb); + ss->abort(net, oskb, false); } if (ss->cleanup) ss->cleanup(net); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index feabdfb22920..76535fd9278c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; + struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) queued++; else kfree_skb(segs); - segs = nskb; - } while (segs); + } if (queued) { if (err) /* some segments are already queued */ diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 10e9d50e4e19..0ed2281f03be 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -18,21 +18,66 @@ struct nft_bitwise { enum nft_registers sreg:8; enum nft_registers dreg:8; + enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; struct nft_data xor; + struct nft_data data; }; +static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; +} + +static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { + dst[i - 1] = (src[i - 1] << shift) | carry; + carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + } +} + +static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { + dst[i] = carry | (src[i] >> shift); + carry = src[i] << (BITS_PER_TYPE(u32) - shift); + } +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); const u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) - dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; + switch (priv->op) { + case NFT_BITWISE_BOOL: + nft_bitwise_eval_bool(dst, src, priv); + break; + case NFT_BITWISE_LSHIFT: + nft_bitwise_eval_lshift(dst, src, priv); + break; + case NFT_BITWISE_RSHIFT: + nft_bitwise_eval_rshift(dst, src, priv); + break; + } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { @@ -41,40 +86,22 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, + [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_bitwise_init_bool(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { - struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_data_desc d1, d2; - u32 len; int err; - if (tb[NFTA_BITWISE_SREG] == NULL || - tb[NFTA_BITWISE_DREG] == NULL || - tb[NFTA_BITWISE_LEN] == NULL || - tb[NFTA_BITWISE_MASK] == NULL || - tb[NFTA_BITWISE_XOR] == NULL) + if (tb[NFTA_BITWISE_DATA]) return -EINVAL; - err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); - if (err < 0) - return err; - - priv->len = len; - - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, priv->len); - if (err < 0) - return err; - - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); - if (err < 0) - return err; + if (!tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) + return -EINVAL; err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, tb[NFTA_BITWISE_MASK]); @@ -102,40 +129,151 @@ err1: return err; } -static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_bitwise_init_shift(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { - const struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_data_desc d; + int err; - if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) - goto nla_put_failure; - if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) - goto nla_put_failure; + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_DATA]) + return -EINVAL; + err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || + priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, d.type); + return -EINVAL; + } + + return 0; +} + +static int nft_bitwise_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_bitwise *priv = nft_expr_priv(expr); + u32 len; + int err; + + if (!tb[NFTA_BITWISE_SREG] || + !tb[NFTA_BITWISE_DREG] || + !tb[NFTA_BITWISE_LEN]) + return -EINVAL; + + err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + + priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); + err = nft_validate_register_load(priv->sreg, priv->len); + if (err < 0) + return err; + + priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, priv->len); + if (err < 0) + return err; + + if (tb[NFTA_BITWISE_OP]) { + priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); + switch (priv->op) { + case NFT_BITWISE_BOOL: + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + break; + default: + return -EOPNOTSUPP; + } + } else { + priv->op = NFT_BITWISE_BOOL; + } + + switch(priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_init_bool(priv, tb); + break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_init_shift(priv, tb); + break; + } + + return err; +} + +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return -1; if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return -1; return 0; +} + +static int nft_bitwise_dump_shift(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + return 0; +} + +static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + int err = 0; + + if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) + return -1; + if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) + return -1; + if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) + return -1; + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op))) + return -1; + + switch (priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_dump_bool(skb, priv); + break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_dump_shift(skb, priv); + break; + } -nla_put_failure: - return -1; + return err; } static struct nft_data zero; static int nft_bitwise_offload(struct nft_offload_ctx *ctx, - struct nft_flow_rule *flow, - const struct nft_expr *expr) + struct nft_flow_rule *flow, + const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + if (priv->op != NFT_BITWISE_BOOL) + return -EOPNOTSUPP; + if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || priv->sreg != priv->dreg || priv->len != reg->len) return -EOPNOTSUPP; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 8887295414dc..683785225a3e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, timeout = priv->timeout ? : set->timeout; elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], + ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); if (elem == NULL) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index dd82ff2ee19f..b70b48996801 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -200,9 +200,6 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct nft_flow_offload *priv = nft_expr_priv(expr); - - priv->flowtable->use--; nf_ct_netns_put(ctx->net, ctx->family); } diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index f54d6ae15bb1..b42247aa48a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; + if (!tb[NFTA_OSF_DREG]) + return -EINVAL; + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 087a056e34d1..87e8d9ba0c9b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[], } static int nft_bitmap_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const nla[]) + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) { struct nft_bitmap *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index b331a3c9a3a8..d350a7cd3af0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -645,7 +645,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, } static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) + struct nft_set_estimate *est) { if (!desc->size) return false; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c new file mode 100644 index 000000000000..f0cb1e13af50 --- /dev/null +++ b/net/netfilter/nft_set_pipapo.c @@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +/** + * DOC: Theory of Operation + * + * + * Problem + * ------- + * + * Match packet bytes against entries composed of ranged or non-ranged packet + * field specifiers, mapping them to arbitrary references. For example: + * + * :: + * + * --- fields ---> + * | [net],[port],[net]... => [reference] + * entries [net],[port],[net]... => [reference] + * | [net],[port],[net]... => [reference] + * V ... + * + * where [net] fields can be IP ranges or netmasks, and [port] fields are port + * ranges. Arbitrary packet fields can be matched. + * + * + * Algorithm Overview + * ------------------ + * + * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally + * relies on the consideration that every contiguous range in a space of b bits + * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], + * as also illustrated in Section 9 of [Kogan 2014]. + * + * Classification against a number of entries, that require matching given bits + * of a packet field, is performed by grouping those bits in sets of arbitrary + * size, and classifying packet bits one group at a time. + * + * Example: + * to match the source port (16 bits) of a packet, we can divide those 16 bits + * in 4 groups of 4 bits each. Given the entry: + * 0000 0001 0101 1001 + * and a packet with source port: + * 0000 0001 1010 1001 + * first and second groups match, but the third doesn't. We conclude that the + * packet doesn't match the given entry. + * + * Translate the set to a sequence of lookup tables, one per field. Each table + * has two dimensions: bit groups to be matched for a single packet field, and + * all the possible values of said groups (buckets). Input entries are + * represented as one or more rules, depending on the number of composing + * netmasks for the given field specifier, and a group match is indicated as a + * set bit, with number corresponding to the rule index, in all the buckets + * whose value matches the entry for a given group. + * + * Rules are mapped between fields through an array of x, n pairs, with each + * item mapping a matched rule to one or more rules. The position of the pair in + * the array indicates the matched rule to be mapped to the next field, x + * indicates the first rule index in the next field, and n the amount of + * next-field rules the current rule maps to. + * + * The mapping array for the last field maps to the desired references. + * + * To match, we perform table lookups using the values of grouped packet bits, + * and use a sequence of bitwise operations to progressively evaluate rule + * matching. + * + * A stand-alone, reference implementation, also including notes about possible + * future optimisations, is available at: + * https://pipapo.lameexcu.se/ + * + * Insertion + * --------- + * + * - For each packet field: + * + * - divide the b packet bits we want to classify into groups of size t, + * obtaining ceil(b / t) groups + * + * Example: match on destination IP address, with t = 4: 32 bits, 8 groups + * of 4 bits each + * + * - allocate a lookup table with one column ("bucket") for each possible + * value of a group, and with one row for each group + * + * Example: 8 groups, 2^4 buckets: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 + * 1 + * 2 + * 3 + * 4 + * 5 + * 6 + * 7 + * + * - map the bits we want to classify for the current field, for a given + * entry, to a single rule for non-ranged and netmask set items, and to one + * or multiple rules for ranges. Ranges are expanded to composing netmasks + * by pipapo_expand(). + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 + * - rule #0: 10.0.0.5 + * - rule #1: 192.168.1.0/24 + * - rule #2: 192.168.2.0/31 + * + * - insert references to the rules in the lookup table, selecting buckets + * according to bit values of a rule in the given group. This is done by + * pipapo_insert(). + * + * Example: given: + * - rule #0: 10.0.0.5 mapping to buckets + * < 0 10 0 0 0 0 0 5 > + * - rule #1: 192.168.1.0/24 mapping to buckets + * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > + * - rule #2: 192.168.2.0/31 mapping to buckets + * < 12 0 10 8 0 2 0 < 0..1 > > + * + * these bits are set in the lookup table: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * - if this is not the last field in the set, fill a mapping array that maps + * rules from the lookup table to rules belonging to the same entry in + * the next lookup table, done by pipapo_map(). + * + * Note that as rules map to contiguous ranges of rules, given how netmask + * expansion and insertion is performed, &union nft_pipapo_map_bucket stores + * this information as pairs of first rule index, rule count. + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, + * given lookup table #0 for field 0 (see example above): + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * and lookup table #1 for field 1 with: + * - rule #0: 1024 mapping to buckets + * < 0 0 4 0 > + * - rule #1: 2048 mapping to buckets + * < 0 0 5 0 > + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 + * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 + * (rules #1, #2) to 2048 in lookup table #2 (rule #1): + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * - if this is the last field in the set, fill a mapping array that maps + * rules from the last lookup table to element pointers, also done by + * pipapo_map(). + * + * Note that, in this implementation, we have two elements (start, end) for + * each entry. The pointer to the end element is stored in this array, and + * the pointer to the start element is linked from it. + * + * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem + * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. + * From the rules of lookup table #1 as mapped above: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * + * Matching + * -------- + * + * We use a result bitmap, with the size of a single lookup table bucket, to + * represent the matching state that applies at every algorithm step. This is + * done by pipapo_lookup(). + * + * - For each packet field: + * + * - start with an all-ones result bitmap (res_map in pipapo_lookup()) + * + * - perform a lookup into the table corresponding to the current field, + * for each group, and at every group, AND the current result bitmap with + * the value from the lookup table bucket + * + * :: + * + * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from + * insertion examples. + * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for + * convenience in this example. Initial result bitmap is 0xff, the steps + * below show the value of the result bitmap after each group is processed: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 + * + * 1 1,2 0 + * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 + * + * 2 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 + * + * 3 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 + * + * 4 0,1,2 + * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 + * + * 5 0 1 2 + * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 + * + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 + * + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 + * + * - at the next field, start with a new, all-zeroes result bitmap. For each + * bit set in the previous result bitmap, fill the new result bitmap + * (fill_map in pipapo_lookup()) with the rule indices from the + * corresponding buckets of the mapping field for this field, done by + * pipapo_refill() + * + * Example: with mapping table from insertion examples, with the current + * result bitmap from the previous example, 0x02: + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be + * set. + * + * We can now extend this example to cover the second iteration of the step + * above (lookup and AND bitmap): assuming the port field is + * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table + * for "port" field from pre-computation example: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] + * & 0x3 [bucket 0], resulting bitmap is 0x2. + * + * - if this is the last field in the set, look up the value from the mapping + * array corresponding to the final result bitmap + * + * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for + * last field from insertion example: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * the matching element is at 0x42. + * + * + * References + * ---------- + * + * [Ligatti 2010] + * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with + * Automatic Time-space Tradeoffs + * Jay Ligatti, Josh Kuhn, and Chris Gage. + * Proceedings of the IEEE International Conference on Computer + * Communication Networks (ICCCN), August 2010. + * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * + * [Rottenstreich 2010] + * Worst-Case TCAM Rule Expansion + * Ori Rottenstreich and Isaac Keslassy. + * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf + * + * [Kogan 2014] + * SAX-PAC (Scalable And eXpressive PAcket Classification) + * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, + * and Patrick Eugster. + * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. + * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <net/ipv6.h> /* For the maximum length of a field */ +#include <linux/bitmap.h> +#include <linux/bitops.h> + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Number of bits to be grouped together in lookup table buckets, arbitrary */ +#define NFT_PIPAPO_GROUP_BITS 4 +#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ + (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(x) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) + +/* Number of buckets, given by 2 ^ n, with n grouped bits */ +#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of 4-bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + +/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[0]; +}; + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @groups: Total amount of 4-bit groups for fields in this set + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int groups; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +/** + * pipapo_refill() - For each set bit, set bits from selected mapping table item + * @map: Bitmap to be scanned for set bits + * @len: Length of bitmap in longs + * @rules: Number of rules in field + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @match_only: Find a single bit and return, don't fill + * + * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. + * + * For each bit set in map, select the bucket from mapping table with index + * corresponding to the position of the bit set. Use start bit and amount of + * bits specified in bucket to fill region in dst. + * + * Return: -1 on no match, bit position on 'match_only', 0 otherwise. + */ +static int pipapo_refill(unsigned long *map, int len, int rules, + unsigned long *dst, union nft_pipapo_map_bucket *mt, + bool match_only) +{ + unsigned long bitset; + int k, ret = -1; + + for (k = 0; k < len; k++) { + bitset = map[k]; + while (bitset) { + unsigned long t = bitset & -bitset; + int r = __builtin_ctzl(bitset); + int i = k * BITS_PER_LONG + r; + + if (unlikely(i >= rules)) { + map[k] = 0; + return -1; + } + + if (unlikely(match_only)) { + bitmap_clear(map, i, 1); + return i; + } + + ret = 0; + + bitmap_set(dst, mt[i].to, mt[i].n); + + bitset ^= t; + } + map[k] = 0; + } + + return ret; +} + +/** + * nft_pipapo_lookup() - Lookup function + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation. + * + * Return: true on match, false otherwise. + */ +static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res_map, *fill_map; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i; + + local_bh_disable(); + + map_index = raw_cpu_read(nft_pipapo_scratch_index); + + m = rcu_dereference(priv->match); + + if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + goto out; + + res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); + fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group += 2) { + u8 v; + + v = *rp >> 4; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + + v = *rp & 0x0f; + rp++; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) { + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return false; + } + + if (last) { + *ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) + goto next_match; + + /* Last field: we're just returning the key without + * filling the initial bitmap for the next field, so the + * current inactive bitmap is clean and can be reused as + * *next* bitmap (not initial) for the next packet. + */ + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return true; + } + + /* Swap bitmap indices: res_map is the initial bitmap for the + * next field, and fill_map is guaranteed to be all-zeroes at + * this point. + */ + map_index = !map_index; + swap(res_map, fill_map); + + rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + } + +out: + local_bh_enable(); + return false; +} + +/** + * pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * + * This is essentially the same as the lookup function, except that it matches + * key data against the uncommitted copy and doesn't use preallocated maps for + * bitmap results. + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct net *net, + const struct nft_set *set, + const u8 *data, u8 genmask) +{ + struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + unsigned long *res_map, *fill_map = NULL; + struct nft_pipapo_field *f; + int i; + + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!res_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!fill_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group++) { + u8 v; + + if (group % 2) { + v = *data & 0x0f; + data++; + } else { + v = *data >> 4; + } + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) + goto out; + + if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext) || + (genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + + ret = f->mt[b].e; + goto out; + } + + data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + + /* Swap bitmap indices: fill_map will be the initial bitmap for + * the next field (i.e. the new res_map), and res_map is + * guaranteed to be all-zeroes at this point, ready to be filled + * according to the next mapping table. + */ + swap(res_map, fill_map); + } + +out: + kfree(fill_map); + kfree(res_map); + return ret; +} + +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + */ +void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +} + +/** + * pipapo_resize() - Resize lookup or mapping table, or both + * @f: Field containing lookup and mapping tables + * @old_rules: Previous amount of rules in field + * @rules: New amount of rules + * + * Increase, decrease or maintain tables size depending on new amount of rules, + * and copy data over. In case the new size is smaller, throw away data for + * highest-numbered rules. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ +static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +{ + long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; + union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; + size_t new_bucket_size, copy; + int group, bucket; + + new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); + + if (new_bucket_size == f->bsize) + goto mt; + + if (new_bucket_size > f->bsize) + copy = f->bsize; + else + copy = new_bucket_size; + + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * + sizeof(*new_lt), GFP_KERNEL); + if (!new_lt) + return -ENOMEM; + + new_p = new_lt; + old_p = old_lt; + for (group = 0; group < f->groups; group++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + memcpy(new_p, old_p, copy * sizeof(*new_p)); + new_p += copy; + old_p += copy; + + if (new_bucket_size > f->bsize) + new_p += new_bucket_size - f->bsize; + else + old_p += f->bsize - new_bucket_size; + } + } + +mt: + new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) { + kvfree(new_lt); + return -ENOMEM; + } + + memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } + + if (new_lt) { + f->bsize = new_bucket_size; + f->lt = new_lt; + kvfree(old_lt); + } + + f->mt = new_mt; + kvfree(old_mt); + + return 0; +} + +/** + * pipapo_bucket_set() - Set rule bit in bucket given group and group value + * @f: Field containing lookup table + * @rule: Rule index + * @group: Group index + * @v: Value of bit group + */ +static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, + int v) +{ + unsigned long *pos; + + pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos += f->bsize * v; + + __set_bit(rule, pos); +} + +/** + * pipapo_insert() - Insert new rule in field given input key and mask length + * @f: Field containing lookup table + * @k: Input key for classification, without nftables padding + * @mask_bits: Length of mask; matches field length for non-ranged entry + * + * Insert a new rule reference in lookup buckets corresponding to k and + * mask_bits. + * + * Return: 1 on success (one rule inserted), negative error code on failure. + */ +static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, + int mask_bits) +{ + int rule = f->rules++, group, ret; + + ret = pipapo_resize(f, f->rules - 1, f->rules); + if (ret) + return ret; + + for (group = 0; group < f->groups; group++) { + int i, v; + u8 mask; + + if (group % 2) + v = k[group / 2] & 0x0f; + else + v = k[group / 2] >> 4; + + if (mask_bits >= (group + 1) * 4) { + /* Not masked */ + pipapo_bucket_set(f, rule, group, v); + } else if (mask_bits <= group * 4) { + /* Completely masked */ + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + pipapo_bucket_set(f, rule, group, i); + } else { + /* The mask limit falls on this group */ + mask = 0x0f >> (mask_bits - group * 4); + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + if ((i & ~mask) == (v & ~mask)) + pipapo_bucket_set(f, rule, group, i); + } + } + } + + return 1; +} + +/** + * pipapo_step_diff() - Check if setting @step bit in netmask would change it + * @base: Mask we are expanding + * @step: Step bit for given expansion step + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if step bit changes mask (i.e. isn't set), false otherwise. + */ +static bool pipapo_step_diff(u8 *base, int step, int len) +{ + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); +#else + return !(BIT(step % BITS_PER_BYTE) & + base[len - 1 - step / BITS_PER_BYTE]); +#endif +} + +/** + * pipapo_step_after_end() - Check if mask exceeds range end with given step + * @base: Mask we are expanding + * @end: End of range + * @step: Step bit for given expansion step, highest bit to be set + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if mask exceeds range setting step bits, false otherwise. + */ +static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, + int len) +{ + u8 tmp[NFT_PIPAPO_MAX_BYTES]; + int i; + + memcpy(tmp, base, len); + + /* Network order, byte-addressed */ + for (i = 0; i <= step; i++) +#ifdef __BIG_ENDIAN__ + tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#else + tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#endif + + return memcmp(tmp, end, len) > 0; +} + +/** + * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry + * @base: Netmask base + * @step: Step bit to sum + * @len: Netmask length, bytes + */ +static void pipapo_base_sum(u8 *base, int step, int len) +{ + bool carry = false; + int i; + + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + for (i = step / BITS_PER_BYTE; i < len; i++) { +#else + for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { +#endif + if (carry) + base[i]++; + else + base[i] += 1 << (step % BITS_PER_BYTE); + + if (base[i]) + break; + + carry = true; + } +} + +/** + * pipapo_expand() - Expand to composing netmasks, insert into lookup table + * @f: Field containing lookup table + * @start: Start of range + * @end: End of range + * @len: Length of value in bits + * + * Expand range to composing netmasks and insert corresponding rule references + * in lookup buckets. + * + * Return: number of inserted rules on success, negative error code on failure. + */ +static int pipapo_expand(struct nft_pipapo_field *f, + const u8 *start, const u8 *end, int len) +{ + int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); + u8 base[NFT_PIPAPO_MAX_BYTES]; + + memcpy(base, start, bytes); + while (memcmp(base, end, bytes) <= 0) { + int err; + + step = 0; + while (pipapo_step_diff(base, step, bytes)) { + if (pipapo_step_after_end(base, end, step, bytes)) + break; + + step++; + if (step >= len) { + if (!masks) { + pipapo_insert(f, base, 0); + masks = 1; + } + goto out; + } + } + + err = pipapo_insert(f, base, len - step); + + if (err < 0) + return err; + + masks++; + pipapo_base_sum(base, step, bytes); + } +out: + return masks; +} + +/** + * pipapo_map() - Insert rules in mapping tables, mapping them between fields + * @m: Matching data, including mapping table + * @map: Table of rule maps: array of first rule and amount of rules + * in next field a given rule maps to, for each field + * @ext: For last field, nft_set_ext pointer matching rules map to + */ +static void pipapo_map(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], + struct nft_pipapo_elem *e) +{ + struct nft_pipapo_field *f; + int i, j; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { + for (j = 0; j < map[i].n; j++) { + f->mt[map[i].to + j].to = map[i + 1].to; + f->mt[map[i].to + j].n = map[i + 1].n; + } + } + + /* Last field: map to ext instead of mapping to next field */ + for (j = 0; j < map[i].n; j++) + f->mt[map[i].to + j].e = e; +} + +/** + * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results + * @clone: Copy of matching data with pending insertions and deletions + * @bsize_max Maximum bucket size, scratch maps cover two buckets + * + * Return: 0 on success, -ENOMEM on failure. + */ +static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, + unsigned long bsize_max) +{ + int i; + + for_each_possible_cpu(i) { + unsigned long *scratch; + + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + GFP_KERNEL, cpu_to_node(i)); + if (!scratch) { + /* On failure, there's no need to undo previous + * allocations: this means that some scratch maps have + * a bigger allocated size now (this is only called on + * insertion), but the extra space won't be used by any + * CPU as new elements are not inserted and m->bsize_max + * is not updated. + */ + return -ENOMEM; + } + + kfree(*per_cpu_ptr(clone->scratch, i)); + + *per_cpu_ptr(clone->scratch, i) = scratch; + } + + return 0; +} + +/** + * nft_pipapo_insert() - Validate and insert ranged elements + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * + * Return: 0 on success, error pointer on failure. + */ +static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext2) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *start = (const u8 *)elem->key.val.data, *end; + struct nft_pipapo_elem *e = elem->priv, *dup; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + u8 genmask = nft_genmask_next(net); + struct nft_pipapo_field *f; + int i, bsize_max, err = 0; + + dup = pipapo_get(net, set, start, genmask); + if (PTR_ERR(dup) == -ENOENT) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { + end = (const u8 *)nft_set_ext_key_end(ext)->data; + dup = pipapo_get(net, set, end, nft_genmask_next(net)); + } else { + end = start; + } + } + + if (PTR_ERR(dup) != -ENOENT) { + if (IS_ERR(dup)) + return PTR_ERR(dup); + *ext2 = &dup->ext; + return -EEXIST; + } + + /* Validate */ + nft_pipapo_for_each_field(f, i, m) { + const u8 *start_p = start, *end_p = end; + + if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; + + if (memcmp(start_p, end_p, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + return -EINVAL; + + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + /* Insert */ + priv->dirty = true; + + bsize_max = m->bsize_max; + + nft_pipapo_for_each_field(f, i, m) { + int ret; + + rulemap[i].to = f->rules; + + ret = memcmp(start, end, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + if (!ret) { + ret = pipapo_insert(f, start, + f->groups * NFT_PIPAPO_GROUP_BITS); + } else { + ret = pipapo_expand(f, start, end, + f->groups * NFT_PIPAPO_GROUP_BITS); + } + + if (f->bsize > bsize_max) + bsize_max = f->bsize; + + rulemap[i].n = ret; + + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + err = pipapo_realloc_scratch(m, bsize_max); + if (err) + return err; + + this_cpu_write(nft_pipapo_scratch_index, false); + + m->bsize_max = bsize_max; + } + + *ext2 = &e->ext; + + pipapo_map(m, rulemap, e); + + return 0; +} + +/** + * pipapo_clone() - Clone matching data to create new working copy + * @old: Existing matching data + * + * Return: copy of matching data passed as 'old', error pointer on failure + */ +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) +{ + struct nft_pipapo_field *dst, *src; + struct nft_pipapo_match *new; + int i; + + new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, + GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + new->field_count = old->field_count; + new->bsize_max = old->bsize_max; + + new->scratch = alloc_percpu(*new->scratch); + if (!new->scratch) + goto out_scratch; + + rcu_head_init(&new->rcu); + + src = old->f; + dst = new->f; + + for (i = 0; i < old->field_count; i++) { + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); + + dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * + src->bsize * sizeof(*dst->lt), + GFP_KERNEL); + if (!dst->lt) + goto out_lt; + + memcpy(dst->lt, src->lt, + src->bsize * sizeof(*dst->lt) * + src->groups * NFT_PIPAPO_BUCKETS); + + dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + src++; + dst++; + } + + return new; + +out_mt: + kvfree(dst->lt); +out_lt: + for (dst--; i > 0; i--) { + kvfree(dst->mt); + kvfree(dst->lt); + dst--; + } + free_percpu(new->scratch); +out_scratch: + kfree(new); + + return ERR_PTR(-ENOMEM); +} + +/** + * pipapo_rules_same_key() - Get number of rules originated from the same entry + * @f: Field containing mapping table + * @first: Index of first rule in set of rules mapping to same entry + * + * Using the fact that all rules in a field that originated from the same entry + * will map to the same set of rules in the next field, or to the same element + * reference, return the cardinality of the set of rules that originated from + * the same entry as the rule with index @first, @first rule included. + * + * In pictures: + * rules + * field #0 0 1 2 3 4 + * map to: 0 1 2-4 2-4 5-9 + * . . ....... . ... + * | | | | \ \ + * | | | | \ \ + * | | | | \ \ + * ' ' ' ' ' \ + * in field #1 0 1 2 3 4 5 ... + * + * if this is called for rule 2 on field #0, it will return 3, as also rules 2 + * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. + * + * For the last field in a set, we can rely on associated entries to map to the + * same element references. + * + * Return: Number of rules that originated from the same entry as @first. + */ +static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +{ + struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ + int r; + + for (r = first; r < f->rules; r++) { + if (r != first && e != f->mt[r].e) + return r - first; + + e = f->mt[r].e; + } + + if (r != first) + return r - first; + + return 0; +} + +/** + * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones + * @mt: Mapping array + * @rules: Original amount of rules in mapping table + * @start: First rule index to be removed + * @n: Amount of rules to be removed + * @to_offset: First rule index, in next field, this group of rules maps to + * @is_last: If this is the last field, delete reference from mapping array + * + * This is used to unmap rules from the mapping table for a single field, + * maintaining consistency and compactness for the existing ones. + * + * In pictures: let's assume that we want to delete rules 2 and 3 from the + * following mapping array: + * + * rules + * 0 1 2 3 4 + * map to: 4-10 4-10 11-15 11-15 16-18 + * + * the result will be: + * + * rules + * 0 1 2 + * map to: 4-10 4-10 11-13 + * + * for fields before the last one. In case this is the mapping table for the + * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: + * + * rules + * 0 1 2 3 4 + * element pointers: 0x42 0x42 0x33 0x33 0x44 + * + * the result will be: + * + * rules + * 0 1 2 + * element pointers: 0x42 0x42 0x44 + */ +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, + int start, int n, int to_offset, bool is_last) +{ + int i; + + memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); + memset(mt + rules - n, 0, n * sizeof(*mt)); + + if (is_last) + return; + + for (i = start; i < rules - n; i++) + mt[i].to -= to_offset; +} + +/** + * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map + * @m: Matching data + * @rulemap Table of rule maps, arrays of first rule and amount of rules + * in next field a given entry maps to, for each field + * + * For each rule in lookup table buckets mapping to this set of rules, drop + * all bits set in lookup table mapping. In pictures, assuming we want to drop + * rules 0 and 1 from this lookup table: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * rule 2 becomes rule 0, and the result will be: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 + * 1 0 + * 2 0 + * 3 0 + * 4 0 + * 5 0 + * 6 0 + * 7 0 0 + * + * once this is done, call unmap() to drop all the corresponding rule references + * from mapping tables. + */ +static void pipapo_drop(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket rulemap[]) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + int g; + + for (g = 0; g < f->groups; g++) { + unsigned long *pos; + int b; + + pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + bitmap_cut(pos, pos, rulemap[i].to, + rulemap[i].n, + f->bsize * BITS_PER_LONG); + + pos += f->bsize; + } + } + + pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, + rulemap[i + 1].n, i == m->field_count - 1); + if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { + /* We can ignore this, a failure to shrink tables down + * doesn't make tables invalid. + */ + ; + } + f->rules -= rulemap[i].n; + } +} + +/** + * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * @set: nftables API set representation + * @m: Matching data + */ +static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +{ + struct nft_pipapo *priv = nft_set_priv(set); + int rules_f0, first_rule = 0; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + struct nft_pipapo_field *f; + struct nft_pipapo_elem *e; + int i, start, rules_fx; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + if (i < m->field_count - 1) { + rules_fx = f->mt[start].n; + start = f->mt[start].to; + } + } + + /* Pick the last field, and its last index */ + f--; + i--; + e = f->mt[rulemap[i].to].e; + if (nft_set_elem_expired(&e->ext) && + !nft_set_elem_mark_busy(&e->ext)) { + priv->dirty = true; + pipapo_drop(m, rulemap); + + rcu_barrier(); + nft_set_elem_destroy(set, e, true); + + /* And check again current first rule, which is now the + * first we haven't checked. + */ + } else { + first_rule += rules_f0; + } + } + + priv->last_gc = jiffies; +} + +/** + * pipapo_free_fields() - Free per-field tables contained in matching data + * @m: Matching data + */ +static void pipapo_free_fields(struct nft_pipapo_match *m) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + kvfree(f->lt); + kvfree(f->mt); + } +} + +/** + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + int i; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + + for_each_possible_cpu(i) + kfree(*per_cpu_ptr(m->scratch, i)); + + free_percpu(m->scratch); + + pipapo_free_fields(m); + + kfree(m); +} + +/** + * pipapo_commit() - Replace lookup data with current working copy + * @set: nftables API set representation + * + * While at it, check if we should perform garbage collection on the working + * copy before committing it for lookup, and don't replace the table if the + * working copy doesn't have pending changes. + * + * We also need to create a new working copy for subsequent insertions and + * deletions. + */ +static void pipapo_commit(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); + + if (!priv->dirty) + return; + + new_clone = pipapo_clone(priv->clone); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + old = rcu_access_pointer(priv->match); + rcu_assign_pointer(priv->match, priv->clone); + if (old) + call_rcu(&old->rcu, pipapo_reclaim_match); + + priv->clone = new_clone; +} + +/** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently + * in use for lookups, and not directly inserted into current lookup data, so + * we'll take care of that by calling pipapo_commit() here. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. + */ +static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); + if (IS_ERR(e)) + return; + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); + + pipapo_commit(set); +} + +/** + * pipapo_deactivate() - Check that element is in set, mark as inactive + * @net: Network namespace + * @set: nftables API set representation + * @data: Input key data + * @ext: nftables API extension pointer, used to check for end element + * + * This is a convenience function that can be called from both + * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same + * operation. + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, + const u8 *data, const struct nft_set_ext *ext) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, nft_genmask_next(net)); + if (IS_ERR(e)) + return NULL; + + nft_set_elem_change_active(net, set, &e->ext); + + return e; +} + +/** + * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *nft_pipapo_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); +} + +/** + * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * This is functionally the same as nft_pipapo_deactivate(), with a slightly + * different interface, and it's also called once for each element in a set + * being flushed, so we can't implement, strictly speaking, a flush operation, + * which would otherwise be as simple as allocating an empty copy of the + * matching data. + * + * Note that we could in theory do that, mark the set as flushed, and ignore + * subsequent calls, but we would leak all the elements after the first one, + * because they wouldn't then be freed as result of API calls. + * + * Return: true if element was found and deactivated. + */ +static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, + void *elem) +{ + struct nft_pipapo_elem *e = elem; + + return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), + &e->ext); +} + +/** + * pipapo_get_boundaries() - Get byte interval for associated rules + * @f: Field including lookup table + * @first_rule: First rule (lowest index) + * @rule_count: Number of associated rules + * @left: Byte expression for left boundary (start of range) + * @right: Byte expression for right boundary (end of range) + * + * Given the first rule and amount of rules that originated from the same entry, + * build the original range associated with the entry, and calculate the length + * of the originating netmask. + * + * In pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 1,2 + * 1 1,2 + * 2 1,2 + * 3 1,2 + * 4 1,2 + * 5 1 2 + * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * + * this is the lookup table corresponding to the IPv4 range + * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, + * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. + * + * This function fills @left and @right with the byte values of the leftmost + * and rightmost bucket indices for the lowest and highest rule indices, + * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in + * nibbles: + * left: < 12, 0, 10, 8, 0, 1, 0, 0 > + * right: < 12, 0, 10, 8, 0, 2, 2, 1 > + * corresponding to bytes: + * left: < 192, 168, 1, 0 > + * right: < 192, 168, 2, 1 > + * with mask length irrelevant here, unused on return, as the range is already + * defined by its start and end points. The mask length is relevant for a single + * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore + * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes + * < 192, 168, 1, 255 >, and the mask length, calculated from the distances + * between leftmost and rightmost bucket indices for each group, would be 24. + * + * Return: mask length, in bits. + */ +static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, + int rule_count, u8 *left, u8 *right) +{ + u8 *l = left, *r = right; + int g, mask_len = 0; + + for (g = 0; g < f->groups; g++) { + int b, x0, x1; + + x0 = -1; + x1 = -1; + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + unsigned long *pos; + + pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + if (test_bit(first_rule, pos) && x0 == -1) + x0 = b; + if (test_bit(first_rule + rule_count - 1, pos)) + x1 = b; + } + + if (g % 2) { + *(l++) |= x0 & 0x0f; + *(r++) |= x1 & 0x0f; + } else { + *l |= x0 << 4; + *r |= x1 << 4; + } + + if (x1 - x0 == 0) + mask_len += 4; + else if (x1 - x0 == 1) + mask_len += 3; + else if (x1 - x0 == 3) + mask_len += 2; + else if (x1 - x0 == 7) + mask_len += 1; + } + + return mask_len; +} + +/** + * pipapo_match_field() - Match rules against byte ranges + * @f: Field including the lookup table + * @first_rule: First of associated rules originating from same entry + * @rule_count: Amount of associated rules + * @start: Start of range to be matched + * @end: End of range to be matched + * + * Return: true on match, false otherwise. + */ +static bool pipapo_match_field(struct nft_pipapo_field *f, + int first_rule, int rule_count, + const u8 *start, const u8 *end) +{ + u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; + u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; + + pipapo_get_boundaries(f, first_rule, rule_count, left, right); + + return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); +} + +/** + * nft_pipapo_remove() - Remove element given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Similarly to nft_pipapo_activate(), this is used as commit operation by the + * API, but it's called once per element in the pending transaction, so we can't + * implement this as a single commit operation. Closest we can get is to remove + * the matched element here, if any, and commit the updated matching data. + */ +static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const u8 *data = (const u8 *)elem->key.val.data; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, 0); + if (IS_ERR(e)) + return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; + struct nft_pipapo_field *f; + int i, start, rules_fx; + + match_start = data; + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + if (!pipapo_match_field(f, start, rules_fx, + match_start, match_end)) + break; + + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + rules_fx = f->mt[start].n; + start = f->mt[start].to; + + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (i == m->field_count) { + priv->dirty = true; + pipapo_drop(m, rulemap); + pipapo_commit(set); + return; + } + + first_rule += rules_f0; + } +} + +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * As elements are referenced in the mapping array for the last field, directly + * scan that array: there's no need to follow rule mappings from the first + * field. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r; + + rcu_read_lock(); + m = rcu_dereference(priv->match); + + if (unlikely(!m)) + goto out; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + struct nft_set_elem elem; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + if (iter->count < iter->skip) + goto cont; + + e = f->mt[r].e; + if (nft_set_elem_expired(&e->ext)) + goto cont; + + elem.priv = e; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + goto out; + +cont: + iter->count++; + } + +out: + rcu_read_unlock(); +} + +/** + * nft_pipapo_privsize() - Return the size of private data for the set + * @nla: netlink attributes, ignored as size doesn't depend on them + * @desc: Set description, ignored as size doesn't depend on it + * + * Return: size of private data for this set implementation, in bytes + */ +static u64 nft_pipapo_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) +{ + return sizeof(struct nft_pipapo); +} + +/** + * nft_pipapo_estimate() - Estimate set size, space and lookup complexity + * @desc: Set description, element count and field description used here + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. + * + * Return: true only for compatible range concatenations + */ +static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned long entry_size; + int i; + + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + return false; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return false; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + est->size = desc->size * entry_size; + if (est->size && div_u64(est->size, desc->size) != entry_size) + return false; + + est->size += sizeof(struct nft_pipapo) + + sizeof(struct nft_pipapo_match) * 2; + + est->size += sizeof(struct nft_pipapo_field) * desc->field_count; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_init() - Initialise data for a set instance + * @set: nftables API set representation + * @desc: Set description + * @nla: netlink attributes + * + * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink + * attributes, initialise internal set parameters, current instance of matching + * data and a copy for subsequent insertions. + * + * Return: 0 on success, negative error code on failure. + */ +static int nft_pipapo_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int err, i; + + if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + return -EINVAL; + + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + GFP_KERNEL); + if (!m) + return -ENOMEM; + + m->field_count = desc->field_count; + m->bsize_max = 0; + + m->scratch = alloc_percpu(unsigned long *); + if (!m->scratch) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch, i) = NULL; + + rcu_head_init(&m->rcu); + + nft_pipapo_for_each_field(f, i, m) { + f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; + priv->groups += f->groups; + + priv->width += round_up(desc->field_len[i], sizeof(u32)); + + f->bsize = 0; + f->rules = 0; + f->lt = NULL; + f->mt = NULL; + } + + /* Create an initial clone of matching data for next insertion */ + priv->clone = pipapo_clone(m); + if (IS_ERR(priv->clone)) { + err = PTR_ERR(priv->clone); + goto out_free; + } + + priv->dirty = false; + + rcu_assign_pointer(priv->match, m); + + return 0; + +out_free: + free_percpu(m->scratch); + kfree(m); + + return err; +} + +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r, cpu; + + m = rcu_dereference_protected(priv->match, true); + if (m) { + rcu_barrier(); + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + e = f->mt[r].e; + + nft_set_elem_destroy(set, e, true); + } + + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(m->scratch, cpu)); + free_percpu(m->scratch); + + pipapo_free_fields(m); + kfree(m); + priv->match = NULL; + } + + if (priv->clone) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + free_percpu(priv->clone->scratch); + + pipapo_free_fields(priv->clone); + kfree(priv->clone); + priv->clone = NULL; + } +} + +/** + * nft_pipapo_gc_init() - Initialise garbage collection + * @set: nftables API set representation + * + * Instead of actually setting up a periodic work for garbage collection, as + * this operation requires a swap of matching data with the working copy, we'll + * do that opportunistically with other commit operations if the interval is + * elapsed, so we just need to set the current jiffies timestamp here. + */ +static void nft_pipapo_gc_init(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +struct nft_set_type nft_set_pipapo_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index a9f804f7a04a..5000b938ab1e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -466,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set) static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (desc->field_count > 1) + return false; + if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 23cd163689d5..4c3f2e24c7cb 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -76,7 +76,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, struct nft_tunnel *priv = nft_expr_priv(expr); u32 len; - if (!tb[NFTA_TUNNEL_KEY] && + if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG]) return -EINVAL; @@ -267,6 +267,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, if (err < 0) return err; + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]) + return -EINVAL; + version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); switch (version) { case ERSPAN_VERSION: diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ced3fc8fad7c..bccd47cd7190 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -357,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, return 0; } -static bool select_all(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return true; -} - -static bool select_gc(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return time_after_eq(jiffies, he->expires); -} - -static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - bool (*select)(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he)) +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; @@ -381,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { - if ((*select)(ht, dh)) + if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); @@ -395,7 +381,7 @@ static void htable_gc(struct work_struct *work) ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); - htable_selective_cleanup(ht, select_gc); + htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); @@ -419,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) { cancel_delayed_work_sync(&hinfo->gc_work); htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, select_all); + htable_selective_cleanup(hinfo, true); kfree(hinfo->name); vfree(hinfo); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e3a37d22539c..659c2a790fe7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -321,8 +321,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } /* Queue all of the segments. */ - skb = segs; - do { + skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -330,17 +329,15 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (err) break; - } while ((skb = skb->next)); + } /* Free all of the segments. */ - skb = segs; - do { - nskb = skb->next; + skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); - } while ((skb = nskb)); + } return err; } diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 88f98f27ad88..5a8e42ad1504 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -8,6 +8,8 @@ #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/numa.h> +#include <linux/spinlock.h> +#include <linux/wait.h> #include <net/sock.h> @@ -97,10 +99,11 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) static unsigned int qrtr_local_nid = NUMA_NO_NODE; /* for node ids */ -static RADIX_TREE(qrtr_nodes, GFP_KERNEL); +static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); +static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); -/* lock for qrtr_nodes, qrtr_all_nodes and node reference */ +/* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ @@ -113,8 +116,9 @@ static DEFINE_MUTEX(qrtr_port_lock); * @ep: endpoint * @ref: reference count for node * @nid: node id + * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port + * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue - * @work: scheduled work struct for recv work * @item: list item for broadcast list */ struct qrtr_node { @@ -123,17 +127,36 @@ struct qrtr_node { struct kref ref; unsigned int nid; + struct radix_tree_root qrtr_tx_flow; + struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ + struct sk_buff_head rx_queue; - struct work_struct work; struct list_head item; }; +/** + * struct qrtr_tx_flow - tx flow control + * @resume_tx: waiters for a resume tx from the remote + * @pending: number of waiting senders + * @tx_failed: indicates that a message with confirm_rx flag was lost + */ +struct qrtr_tx_flow { + struct wait_queue_head resume_tx; + int pending; + int tx_failed; +}; + +#define QRTR_TX_FLOW_HIGH 10 +#define QRTR_TX_FLOW_LOW 5 + static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); +static struct qrtr_sock *qrtr_port_lookup(int port); +static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * @@ -143,15 +166,25 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); + struct radix_tree_iter iter; + unsigned long flags; + void __rcu **slot; + spin_lock_irqsave(&qrtr_nodes_lock, flags); if (node->nid != QRTR_EP_NID_AUTO) radix_tree_delete(&qrtr_nodes, node->nid); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); - cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); + + /* Free tx flow counters */ + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); + kfree(*slot); + } kfree(node); } @@ -171,6 +204,126 @@ static void qrtr_node_release(struct qrtr_node *node) kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } +/** + * qrtr_tx_resume() - reset flow control counter + * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on + * @skb: resume_tx packet + */ +static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) +{ + struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; + u64 remote_node = le32_to_cpu(pkt->client.node); + u32 remote_port = le32_to_cpu(pkt->client.port); + struct qrtr_tx_flow *flow; + unsigned long key; + + key = remote_node << 32 | remote_port; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock(&flow->resume_tx.lock); + flow->pending = 0; + spin_unlock(&flow->resume_tx.lock); + wake_up_interruptible_all(&flow->resume_tx); + } + + consume_skb(skb); +} + +/** + * qrtr_tx_wait() - flow control for outgoing packets + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * @type: type of message + * + * The flow control scheme is based around the low and high "watermarks". When + * the low watermark is passed the confirm_rx flag is set on the outgoing + * message, which will trigger the remote to send a control message of the type + * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit + * further transmision should be paused. + * + * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure + */ +static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, + int type) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + int confirm_rx = 0; + int ret; + + /* Never set confirm_rx on non-data packets */ + if (type != QRTR_TYPE_DATA) + return 0; + + mutex_lock(&node->qrtr_tx_lock); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + if (!flow) { + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (flow) { + init_waitqueue_head(&flow->resume_tx); + radix_tree_insert(&node->qrtr_tx_flow, key, flow); + } + } + mutex_unlock(&node->qrtr_tx_lock); + + /* Set confirm_rx if we where unable to find and allocate a flow */ + if (!flow) + return 1; + + spin_lock_irq(&flow->resume_tx.lock); + ret = wait_event_interruptible_locked_irq(flow->resume_tx, + flow->pending < QRTR_TX_FLOW_HIGH || + flow->tx_failed || + !node->ep); + if (ret < 0) { + confirm_rx = ret; + } else if (!node->ep) { + confirm_rx = -EPIPE; + } else if (flow->tx_failed) { + flow->tx_failed = 0; + confirm_rx = 1; + } else { + flow->pending++; + confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; + } + spin_unlock_irq(&flow->resume_tx.lock); + + return confirm_rx; +} + +/** + * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * + * Signal that the transmission of a message with confirm_rx flag failed. The + * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, + * at which point transmission would stall forever waiting for the resume TX + * message associated with the dropped confirm_rx message. + * Work around this by marking the flow as having a failed transmission and + * cause the next transmission attempt to be sent with the confirm_rx. + */ +static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, + int dest_port) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock_irq(&flow->resume_tx.lock); + flow->tx_failed = 1; + spin_unlock_irq(&flow->resume_tx.lock); + } +} + /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, @@ -179,6 +332,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc = -ENODEV; + int confirm_rx; + + confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); + if (confirm_rx < 0) { + kfree_skb(skb); + return confirm_rx; + } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); @@ -194,9 +354,9 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, } hdr->size = cpu_to_le32(len); - hdr->confirm_rx = 0; + hdr->confirm_rx = !!confirm_rx; - skb_put_padto(skb, ALIGN(len, 4)); + skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); mutex_lock(&node->ep_lock); if (node->ep) @@ -205,6 +365,11 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, kfree_skb(skb); mutex_unlock(&node->ep_lock); + /* Need to ensure that a subsequent message carries the otherwise lost + * confirm_rx flag if we dropped this one */ + if (rc && confirm_rx) + qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); + return rc; } @@ -215,11 +380,12 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; + unsigned long flags; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); return node; } @@ -231,13 +397,15 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { + unsigned long flags; + if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) return; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); node->nid = nid; - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** @@ -253,6 +421,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; + struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; unsigned int size; @@ -311,13 +480,26 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (len != ALIGN(size, 4) + hdrlen) goto err; - if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && + cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); - skb_queue_tail(&node->rx_queue, skb); - schedule_work(&node->work); + qrtr_node_assign(node, cb->src_node); + + if (cb->type == QRTR_TYPE_RESUME_TX) { + qrtr_tx_resume(node, skb); + } else { + ipc = qrtr_port_lookup(cb->dst_port); + if (!ipc) + goto err; + + if (sock_queue_rcv_skb(&ipc->sk, skb)) + goto err; + + qrtr_port_put(ipc); + } return 0; @@ -352,61 +534,6 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) return skb; } -static struct qrtr_sock *qrtr_port_lookup(int port); -static void qrtr_port_put(struct qrtr_sock *ipc); - -/* Handle and route a received packet. - * - * This will auto-reply with resume-tx packet as necessary. - */ -static void qrtr_node_rx_work(struct work_struct *work) -{ - struct qrtr_node *node = container_of(work, struct qrtr_node, work); - struct qrtr_ctrl_pkt *pkt; - struct sockaddr_qrtr dst; - struct sockaddr_qrtr src; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - struct qrtr_sock *ipc; - struct qrtr_cb *cb; - int confirm; - - cb = (struct qrtr_cb *)skb->cb; - src.sq_node = cb->src_node; - src.sq_port = cb->src_port; - dst.sq_node = cb->dst_node; - dst.sq_port = cb->dst_port; - confirm = !!cb->confirm_rx; - - qrtr_node_assign(node, cb->src_node); - - ipc = qrtr_port_lookup(cb->dst_port); - if (!ipc) { - kfree_skb(skb); - } else { - if (sock_queue_rcv_skb(&ipc->sk, skb)) - kfree_skb(skb); - - qrtr_port_put(ipc); - } - - if (confirm) { - skb = qrtr_alloc_ctrl_packet(&pkt); - if (!skb) - break; - - pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); - pkt->client.node = cpu_to_le32(dst.sq_node); - pkt->client.port = cpu_to_le32(dst.sq_port); - - if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, - &dst, &src)) - break; - } - } -} - /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register @@ -426,13 +553,15 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) if (!node) return -ENOMEM; - INIT_WORK(&node->work, qrtr_node_rx_work); kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; + INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); + mutex_init(&node->qrtr_tx_lock); + qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); @@ -453,8 +582,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; + struct qrtr_tx_flow *flow; struct sk_buff *skb; + void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; @@ -467,6 +599,14 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } + /* Wake up any transmitters waiting for resume-tx from the node */ + mutex_lock(&node->qrtr_tx_lock); + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + flow = *slot; + wake_up_interruptible_all(&flow->resume_tx); + } + mutex_unlock(&node->qrtr_tx_lock); + qrtr_node_release(node); ep->node = NULL; } @@ -483,11 +623,11 @@ static struct qrtr_sock *qrtr_port_lookup(int port) if (port == QRTR_PORT_CTRL) port = 0; - mutex_lock(&qrtr_port_lock); + rcu_read_lock(); ipc = idr_find(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); - mutex_unlock(&qrtr_port_lock); + rcu_read_unlock(); return ipc; } @@ -529,6 +669,10 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) mutex_lock(&qrtr_port_lock); idr_remove(&qrtr_ports, port); mutex_unlock(&qrtr_port_lock); + + /* Ensure that if qrtr_port_lookup() did enter the RCU read section we + * wait for it to up increment the refcount */ + synchronize_rcu(); } /* Assign port number to socket. @@ -816,6 +960,34 @@ out_node: return rc; } +static int qrtr_send_resume_tx(struct qrtr_cb *cb) +{ + struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; + struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; + struct qrtr_ctrl_pkt *pkt; + struct qrtr_node *node; + struct sk_buff *skb; + int ret; + + node = qrtr_node_lookup(remote.sq_node); + if (!node) + return -EINVAL; + + skb = qrtr_alloc_ctrl_packet(&pkt); + if (!skb) + return -ENOMEM; + + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(cb->dst_node); + pkt->client.port = cpu_to_le32(cb->dst_port); + + ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); + + qrtr_node_release(node); + + return ret; +} + static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -838,6 +1010,7 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, release_sock(sk); return rc; } + cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { @@ -851,7 +1024,6 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, rc = copied; if (addr) { - cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; @@ -859,6 +1031,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, } out: + if (cb->confirm_rx) + qrtr_send_resume_tx(cb); + skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/rds/ib.c b/net/rds/ib.c index 3fd5f40189bd..a792d8a3872a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device) has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && device->ops.map_phys_fmr && device->ops.unmap_fmr); rds_ibdev->use_fastreg = (has_fr && !has_fmr); + rds_ibdev->odp_capable = + !!(device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index 6e6f24753998..0296f1f7acda 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -247,7 +247,8 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ - bool use_fastreg; + u8 use_fastreg:1; + u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 9045a8c0edff..0c8252d7fe2b 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -67,6 +67,7 @@ struct rds_ib_frmr { /* This is stored as mr->r_trans_private. */ struct rds_ib_mr { + struct delayed_work work; struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct rds_ib_connection *ic; @@ -81,9 +82,11 @@ struct rds_ib_mr { unsigned int sg_len; int sg_dma_len; + u8 odp:1; union { struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; + struct ib_mr *mr; } u; }; @@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, u64 start, u64 length, + int need_odp); void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); int rds_ib_mr_init(void); void rds_ib_mr_exit(void); +u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index c8c1e3ae8d84..b34b24e237f8 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -37,8 +37,15 @@ #include "rds_single_path.h" #include "ib_mr.h" +#include "rds.h" struct workqueue_struct *rds_ib_mr_wq; +struct rds_ib_dereg_odp_mr { + struct work_struct work; + struct ib_mr *mr; +}; + +static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { @@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; + if (ibmr->odp) + return; + switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, @@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + if (ibmr->odp) { + /* A MR created and marked as use_once. We use delayed work, + * because there is a change that we are in interrupt and can't + * call to ib_dereg_mr() directly. + */ + INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); + queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); + return; + } + /* Return it to the pool's free list */ if (rds_ibdev->use_fastreg) rds_ib_free_frmr_list(ibmr); @@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void) up_read(&rds_ib_devices_lock); } +u32 rds_ib_get_lkey(void *trans_private) +{ + struct rds_ib_mr *ibmr = trans_private; + + return ibmr->u.mr->lkey; +} + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn) + struct rds_connection *conn, + u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; @@ -541,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } + if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { + u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; + int access_flags = + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_ON_DEMAND); + struct ib_sge sge = {}; + struct ib_mr *ib_mr; + + if (!rds_ibdev->odp_capable) { + ret = -EOPNOTSUPP; + goto out; + } + + ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, + access_flags); + + if (IS_ERR(ib_mr)) { + rdsdebug("rds_ib_get_user_mr returned %d\n", + IS_ERR(ib_mr)); + ret = PTR_ERR(ib_mr); + goto out; + } + if (key_ret) + *key_ret = ib_mr->rkey; + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + ib_dereg_mr(ib_mr); + ret = -ENOMEM; + goto out; + } + ibmr->u.mr = ib_mr; + ibmr->odp = 1; + + sge.addr = virt_addr; + sge.length = length; + sge.lkey = ib_mr->lkey; + + ib_advise_mr(rds_ibdev->pd, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); + return ibmr; + } + if (conn) ic = conn->c_transport_data; @@ -629,3 +702,12 @@ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } + +static void rds_ib_odp_mr_worker(struct work_struct *work) +{ + struct rds_ib_mr *ibmr; + + ibmr = container_of(work, struct rds_ib_mr, work.work); + ib_dereg_mr(ibmr->u.mr); + kfree(ibmr); +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d1cc1d7778d8..dfe778220657 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -39,6 +39,7 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" +#include "ib_mr.h" /* * Convert IB-specific error message to RDS error message and call core @@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; send->s_sge[0].length = sizeof(struct rds_header); + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); @@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[1].addr = sg_dma_address(scat); send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; bytes_sent += len; rm->data.op_dmaoff += len; @@ -858,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int ret; int num_sge; int nr_sig = 0; + u64 odp_addr = op->op_odp_addr; + u32 odp_lkey = 0; /* map the op the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; + if (!op->op_odp_mr) { + if (!op->op_mapped) { + op->op_count = + ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, + op->op_nents, + (op->op_write) ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, + op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + op->op_mapped = 1; } - - op->op_mapped = 1; + } else { + op->op_count = op->op_nents; + odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private); } /* @@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (j = 0; j < send->s_rdma_wr.wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = sg_dma_len(scat); - send->s_sge[j].addr = sg_dma_address(scat); + if (!op->op_odp_mr) { + send->s_sge[j].addr = sg_dma_address(scat); + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; + } else { + send->s_sge[j].addr = odp_addr; + send->s_sge[j].lkey = odp_lkey; + } send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); remote_addr += len; + odp_addr += len; scat++; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 916f5ec373d8..3341eee87bf9 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs) static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { + unsigned int gup_flags = FOLL_LONGTERM; int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, - pages); + if (write) + gup_flags |= FOLL_WRITE; + ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { while (ret--) put_page(pages[ret]); @@ -175,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; + struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; - struct scatterlist *sg; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; - unsigned int nents; + unsigned int nents = 0; + int need_odp = 0; long i; int ret; @@ -195,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || + PAGE_ALIGN(args->vec.addr + args->vec.bytes) < + (args->vec.addr + args->vec.bytes)) { + ret = -EINVAL; + goto out; + } + + if (!can_do_mlock()) { + ret = -EPERM; + goto out; + } + nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; @@ -248,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); - if (ret < 0) - goto out; - - nents = ret; - sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (!sg) { - ret = -ENOMEM; + if (ret == -EOPNOTSUPP) { + need_odp = 1; + } else if (ret <= 0) { goto out; - } - WARN_ON(!nents); - sg_init_table(sg, nents); - - /* Stick all pages into the scatterlist */ - for (i = 0 ; i < nents; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + } else { + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); - rdsdebug("RDS: trans_private nents is %u\n", nents); + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + rdsdebug("RDS: trans_private nents is %u\n", nents); + } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ - trans_private = rs->rs_transport->get_mr(sg, nents, rs, - &mr->r_key, - cp ? cp->cp_conn : NULL); + trans_private = rs->rs_transport->get_mr( + sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, + args->vec.addr, args->vec.bytes, + need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); - kfree(sg); + /* In ODP case, we don't GUP pages, so don't need + * to release anything. + */ + if (!need_odp) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + } ret = PTR_ERR(trans_private); goto out; } @@ -291,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ - cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (need_odp) + cookie = rds_rdma_make_cookie(mr->r_key, 0); + else + cookie = rds_rdma_make_cookie(mr->r_key, + args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; @@ -456,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->op_nents; i++) { - struct page *page = sg_page(&ro->op_sg[i]); - - /* Mark page dirty if it was possibly modified, which - * is the case for a RDMA_READ which copies from remote - * to local memory */ - if (!ro->op_write) { - WARN_ON(!page->mapping && irqs_disabled()); - set_page_dirty(page); + if (ro->op_odp_mr) { + rds_mr_put(ro->op_odp_mr); + } else { + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory + */ + if (!ro->op_write) + set_page_dirty(page); + put_page(page); } - put_page(page); } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; + ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) @@ -581,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_iovec *iovs; unsigned int i, j; int ret = 0; + bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) @@ -602,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EINVAL; goto out_ret; } + /* odp-mr is not supported for multiple requests within one message */ + if (args->nr_local != 1) + odp_supported = false; iovs = vec->iov; @@ -623,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; + op->op_odp_mr = NULL; + WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); if (!op->op_sg) @@ -672,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); - if (ret < 0) + if ((!odp_supported && ret <= 0) || + (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; - else - ret = 0; + + if (ret == -EOPNOTSUPP) { + struct rds_mr *local_odp_mr; + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out_pages; + } + local_odp_mr = + kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); + if (!local_odp_mr) { + ret = -ENOMEM; + goto out_pages; + } + RB_CLEAR_NODE(&local_odp_mr->r_rb_node); + refcount_set(&local_odp_mr->r_refcount, 1); + local_odp_mr->r_trans = rs->rs_transport; + local_odp_mr->r_sock = rs; + local_odp_mr->r_trans_private = + rs->rs_transport->get_mr( + NULL, 0, rs, &local_odp_mr->r_key, NULL, + iov->addr, iov->bytes, ODP_VIRTUAL); + if (IS_ERR(local_odp_mr->r_trans_private)) { + ret = IS_ERR(local_odp_mr->r_trans_private); + rdsdebug("get_mr ret %d %p\"", ret, + local_odp_mr->r_trans_private); + kfree(local_odp_mr); + ret = -EOPNOTSUPP; + goto out_pages; + } + rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", + local_odp_mr, local_odp_mr->r_trans_private); + op->op_odp_mr = local_odp_mr; + op->op_odp_addr = iov->addr; + } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); @@ -691,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); + sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); @@ -709,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_pages; } op->op_bytes = nr_bytes; + ret = 0; out_pages: kfree(pages); @@ -755,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { - mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + mr->r_trans->sync_mr(mr->r_trans_private, + DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; diff --git a/net/rds/rds.h b/net/rds/rds.h index 53e86911773a..e4a603523083 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -40,7 +40,6 @@ #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif - #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else @@ -478,6 +477,9 @@ struct rds_message { struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; + + u64 op_odp_addr; + struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; @@ -573,7 +575,8 @@ struct rds_transport { void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, + u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); @@ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn) (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } +enum { + ODP_NOT_NEEDED, + ODP_ZEROBASED, + ODP_VIRTUAL +}; + /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 46b8ff24020d..1e8eeb044b07 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1475,7 +1475,7 @@ static int __init rose_proto_init(void) int rc; if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { - printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n"); + printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); rc = -EINVAL; goto out; } diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c53307623236..5277631fa14c 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -696,7 +696,6 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; - failed = 0; goto out; } failed = 1; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 86bd133b4fa0..96d54e5bf7bc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -413,7 +413,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); enum rxrpc_call_state state; - unsigned int j; + unsigned int j, nr_subpackets; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack; bool immediate_ack = false, jumbo_bad = false; @@ -457,7 +457,8 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) call->ackr_prev_seq = seq0; hard_ack = READ_ONCE(call->rx_hard_ack); - if (sp->nr_subpackets > 1) { + nr_subpackets = sp->nr_subpackets; + if (nr_subpackets > 1) { if (call->nr_jumbo_bad > 3) { ack = RXRPC_ACK_NOSPACE; ack_serial = serial; @@ -465,11 +466,11 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } - for (j = 0; j < sp->nr_subpackets; j++) { + for (j = 0; j < nr_subpackets; j++) { rxrpc_serial_t serial = sp->hdr.serial + j; rxrpc_seq_t seq = seq0 + j; unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; - bool terminal = (j == sp->nr_subpackets - 1); + bool terminal = (j == nr_subpackets - 1); bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST); u8 flags, annotation = j; @@ -506,7 +507,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } if (call->rxtx_buffer[ix]) { - rxrpc_input_dup_data(call, seq, sp->nr_subpackets > 1, + rxrpc_input_dup_data(call, seq, nr_subpackets > 1, &jumbo_bad); if (ack != RXRPC_ACK_DUPLICATE) { ack = RXRPC_ACK_DUPLICATE; @@ -564,6 +565,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) * ring. */ skb = NULL; + sp = NULL; } if (last) { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b1e7ec726958..edde0e519438 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -366,6 +366,19 @@ config NET_SCH_PIE If unsure, say N. +config NET_SCH_FQ_PIE + depends on NET_SCH_PIE + tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)" + help + Say Y here if you want to use the Flow Queue Proportional Integral + controller Enhanced (FQ-PIE) packet scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc8033 + + To compile this driver as a module, choose M here: the module + will be called sch_fq_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index bc8856b865ff..31c367a6cd09 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 40038c321b4a..19649623493b 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -360,6 +360,16 @@ static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static void tcf_ctinfo_cleanup(struct tc_action *a) +{ + struct tcf_ctinfo *ci = to_ctinfo(a); + struct tcf_ctinfo_params *cp; + + cp = rcu_dereference_protected(ci->params, 1); + if (cp) + kfree_rcu(cp, rcu); +} + static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, @@ -367,6 +377,7 @@ static struct tc_action_ops act_ctinfo_ops = { .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, + .cleanup= tcf_ctinfo_cleanup, .walk = tcf_ctinfo_walker, .lookup = tcf_ctinfo_search, .size = sizeof(struct tcf_ctinfo), diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5e6379028fc3..c1fcd85719d6 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -537,6 +537,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&ife->metalist); + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; @@ -566,10 +569,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, p->eth_type = ife_type; } - - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); - if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 76e0d122616a..c2cdd0fc2e70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2055,9 +2055,8 @@ replay: &chain_info)); mutex_unlock(&chain->filter_chain_lock); - tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, chain, rtnl_held, - extack); + tp_new = tcf_proto_create(name, protocol, prio, chain, + rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 4aafbe3d435c..f256a7c69093 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -263,12 +263,17 @@ skip: } } -static void basic_bind_class(void *fh, u32 classid, unsigned long cl) +static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct basic_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 8229ed4a67be..6e3e63db0e01 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -631,12 +631,17 @@ nla_put_failure: return -1; } -static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) +static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; - if (prog && prog->res.classid == classid) - prog->res.class = cl; + if (prog && prog->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &prog->res, base); + else + __tcf_unbind_filter(q, &prog->res); + } } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b0f42e62dd76..f9c0d1e8d380 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2765,12 +2765,17 @@ nla_put_failure: return -EMSGSIZE; } -static void fl_bind_class(void *fh, u32 classid, unsigned long cl) +static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_fl_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static bool fl_delete_empty(struct tcf_proto *tp) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index c9496c920d6f..ec945294626a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -419,12 +419,17 @@ nla_put_failure: return -1; } -static void fw_bind_class(void *fh, u32 classid, unsigned long cl) +static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct fw_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 7fc2eb62aa98..039cc86974f4 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -393,12 +393,17 @@ nla_put_failure: return -1; } -static void mall_bind_class(void *fh, u32 classid, unsigned long cl) +static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_mall_head *head = fh; - if (head && head->res.classid == classid) - head->res.class = cl; + if (head && head->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &head->res, base); + else + __tcf_unbind_filter(q, &head->res); + } } static struct tcf_proto_ops cls_mall_ops __read_mostly = { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 2d9e0b4484ea..6f8786b06bde 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -641,12 +641,17 @@ nla_put_failure: return -1; } -static void route4_bind_class(void *fh, u32 classid, unsigned long cl) +static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct route4_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_route4_ops __read_mostly = { diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2f3c03b25d5d..c22624131949 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -738,12 +738,17 @@ nla_put_failure: return -1; } -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl) +static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct rsvp_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops RSVP_OPS __read_mostly = { diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e573e5a5c794..3d4a1280352f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -654,12 +654,17 @@ nla_put_failure: return -1; } -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) +static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct tcindex_filter_result *r = fh; - if (r && r->res.classid == classid) - r->res.class = cl; + if (r && r->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &r->res, base); + else + __tcf_unbind_filter(q, &r->res); + } } static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index a0e6fac613de..e15ff335953d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1255,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, return 0; } -static void u32_bind_class(void *fh, u32 classid, unsigned long cl) +static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct tc_u_knode *n = fh; - if (n && n->res.classid == classid) - n->res.class = cl; + if (n && n->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &n->res, base); + else + __tcf_unbind_filter(q, &n->res); + } } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 8f2ad706784d..dd3b8c11a2e0 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { + err = -EINVAL; + if (em_hdr->flags & TCF_EM_SIMPLE) + goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; @@ -263,12 +266,12 @@ static int tcf_em_validate(struct tcf_proto *tp, } em->data = (unsigned long) v; } + em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; - em->datalen = data_len; em->net = net; err = 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..50794125bf02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net, struct tcf_bind_args { struct tcf_walker w; - u32 classid; + unsigned long base; unsigned long cl; + u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -1903,28 +1904,30 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); - tp->ops->bind_class(n, a->classid, a->cl); + tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; } -static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, - unsigned long new_cl) +struct tc_bind_class_args { + struct qdisc_walker w; + unsigned long new_cl; + u32 portid; + u32 clid; +}; + +static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w) { + struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; - unsigned long cl; - cl = cops->find(q, portid); - if (!cl) - return; - if (!cops->tcf_block) - return; block = cops->tcf_block(q, cl, NULL); if (!block) - return; + return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { @@ -1935,11 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; - arg.classid = clid; - arg.cl = new_cl; + arg.classid = a->clid; + arg.base = cl; + arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } + + return 0; +} + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct tc_bind_class_args args = {}; + + if (!cops->tcf_block) + return; + args.portid = portid; + args.clid = clid; + args.new_cl = new_cl; + args.w.fn = tc_bind_class_walker; + q->ops->cl_ops->walk(q, &args.w); } #else diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 6cc3ab145513..1496e87cd07b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1682,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); @@ -1696,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, slen += segs->len; q->buffer_used += segs->truesize; b->packets++; - segs = nskb; } /* stats */ @@ -1768,7 +1766,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; - do_div(b, window_interval); + b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 2 : 8); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ff4c5e9d0d77..a5a295477ecc 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -786,10 +786,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); - if (quantum > 0) + if (quantum > 0 && quantum <= (1 << 20)) { q->quantum = quantum; - else + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; + } } if (tb[TCA_FQ_INITIAL_QUANTUM]) diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c new file mode 100644 index 000000000000..bbd0dea6b6b9 --- /dev/null +++ b/net/sched/sch_fq_pie.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Flow Queue PIE discipline + * + * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in> + * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com> + * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com> + * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com> + * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com> + * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com> + */ + +#include <linux/jhash.h> +#include <linux/sizes.h> +#include <linux/vmalloc.h> +#include <net/pkt_cls.h> +#include <net/pie.h> + +/* Flow Queue PIE + * + * Principles: + * - Packets are classified on flows. + * - This is a Stochastic model (as we use a hash, several flows might + * be hashed to the same slot) + * - Each flow has a PIE managed queue. + * - Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * - For a given flow, packets are not reordered. + * - Drops during enqueue only. + * - ECN capability is off by default. + * - ECN threshold (if ECN is enabled) is at 10% by default. + * - Uses timestamps to calculate queue delay by default. + */ + +/** + * struct fq_pie_flow - contains data for each flow + * @vars: pie vars associated with the flow + * @deficit: number of remaining byte credits + * @backlog: size of data in the flow + * @qlen: number of packets in the flow + * @flowchain: flowchain for the flow + * @head: first packet in the flow + * @tail: last packet in the flow + */ +struct fq_pie_flow { + struct pie_vars vars; + s32 deficit; + u32 backlog; + u32 qlen; + struct list_head flowchain; + struct sk_buff *head; + struct sk_buff *tail; +}; + +struct fq_pie_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct fq_pie_flow *flows; + struct Qdisc *sch; + struct list_head old_flows; + struct list_head new_flows; + struct pie_params p_params; + u32 ecn_prob; + u32 flows_cnt; + u32 quantum; + u32 memory_limit; + u32 new_flow_count; + u32 memory_usage; + u32 overmemory; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q, + struct sk_buff *skb) +{ + return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); +} + +static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return fq_pie_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_pie_flow *flow, + struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct fq_pie_flow *sel_flow; + int uninitialized_var(ret); + u8 memory_limited = false; + u8 enqueue = false; + u32 pkt_len; + u32 idx; + + /* Classifies packet into corresponding flow */ + idx = fq_pie_classify(skb, sch, &ret); + sel_flow = &q->flows[idx]; + + /* Checks whether adding a new packet would exceed memory limit */ + get_pie_cb(skb)->mem_usage = skb->truesize; + memory_limited = q->memory_usage > q->memory_limit + skb->truesize; + + /* Checks if the qdisc is full */ + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } else if (unlikely(memory_limited)) { + q->overmemory++; + } + + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, + sel_flow->backlog, skb->len)) { + enqueue = true; + } else if (q->p_params.ecn && + sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than the parameter ecn_prob, else drop it. + */ + q->stats.ecn_mark++; + enqueue = true; + } + if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->p_params.dq_rate_estimator) + pie_set_enqueue_time(skb); + + pkt_len = qdisc_pkt_len(skb); + q->stats.packets_in++; + q->memory_usage += skb->truesize; + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + flow_queue_add(sel_flow, skb); + if (list_empty(&sel_flow->flowchain)) { + list_add_tail(&sel_flow->flowchain, &q->new_flows); + q->new_flow_count++; + sel_flow->deficit = q->quantum; + sel_flow->qlen = 0; + sel_flow->backlog = 0; + } + sel_flow->qlen++; + sel_flow->backlog += pkt_len; + return NET_XMIT_SUCCESS; + } +out: + q->stats.dropped++; + sel_flow->vars.accu_prob = 0; + sel_flow->vars.accu_prob_overflows = 0; + __qdisc_drop(skb, to_free); + qdisc_qstats_drop(sch); + return NET_XMIT_CN; +} + +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = { + [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32}, + [TCA_FQ_PIE_TARGET] = {.type = NLA_U32}, + [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_FQ_PIE_BETA] = {.type = NLA_U32}, + [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32}, + [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN] = {.type = NLA_U32}, + [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, +}; + +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct fq_pie_flow *flow; + struct list_head *head; + u32 pkt_len; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + + flow = list_first_entry(head, struct fq_pie_flow, flowchain); + /* Flow has exhausted all its credits */ + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + if (flow->head) { + skb = dequeue_head(flow); + pkt_len = qdisc_pkt_len(skb); + sch->qstats.backlog -= pkt_len; + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + } + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if (head == &q->new_flows && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + + flow->qlen--; + flow->deficit -= pkt_len; + flow->backlog -= pkt_len; + q->memory_usage -= get_pie_cb(skb)->mem_usage; + pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog); + return skb; +} + +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int len_dropped = 0; + unsigned int num_dropped = 0; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); + if (err < 0) + return err; + + sch_tree_lock(sch); + if (tb[TCA_FQ_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); + + q->p_params.limit = limit; + sch->limit = limit; + } + if (tb[TCA_FQ_PIE_FLOWS]) { + if (q->flows) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows cannot be changed"); + goto flow_error; + } + q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); + if (!q->flows_cnt || q->flows_cnt > 65536) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows must be < 65536"); + goto flow_error; + } + } + + /* convert from microseconds to pschedtime */ + if (tb[TCA_FQ_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); + + /* convert to pschedtime */ + q->p_params.target = + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_FQ_PIE_TUPDATE]) + q->p_params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + + if (tb[TCA_FQ_PIE_ALPHA]) + q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + + if (tb[TCA_FQ_PIE_BETA]) + q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + + if (tb[TCA_FQ_PIE_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + + if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) + q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + + if (tb[TCA_FQ_PIE_ECN_PROB]) + q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + + if (tb[TCA_FQ_PIE_ECN]) + q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + + if (tb[TCA_FQ_PIE_BYTEMODE]) + q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) + q->p_params.dq_rate_estimator = + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + + /* Drop excess packets if new limit is lower */ + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + + kfree_skb(skb); + len_dropped += qdisc_pkt_len(skb); + num_dropped += 1; + } + qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + + sch_tree_unlock(sch); + return 0; + +flow_error: + sch_tree_unlock(sch); + return -EINVAL; +} + +static void fq_pie_timer(struct timer_list *t) +{ + struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + u16 idx; + + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + for (idx = 0; idx < q->flows_cnt; idx++) + pie_calculate_probability(&q->p_params, &q->flows[idx].vars, + q->flows[idx].backlog); + + /* reset the timer to fire after 'tupdate' jiffies. */ + if (q->p_params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate); + + spin_unlock(root_lock); +} + +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + int err; + u16 idx; + + pie_params_init(&q->p_params); + sch->limit = 10 * 1024; + q->p_params.limit = sch->limit; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->sch = sch; + q->ecn_prob = 10; + q->flows_cnt = 1024; + q->memory_limit = SZ_32M; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + + if (opt) { + err = fq_pie_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + goto init_failure; + + q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow), + GFP_KERNEL); + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + timer_setup(&q->adapt_timer, fq_pie_timer, 0); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + return 0; + +init_failure: + q->flows_cnt = 0; + + return err; +} + +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + return -EMSGSIZE; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + nla_put_u32(skb, TCA_FQ_PIE_TARGET, + ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, + jiffies_to_usecs(q->p_params.tupdate)) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + q->p_params.dq_rate_estimator)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tc_fq_pie_xstats st = { + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .overmemory = q->overmemory, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + .new_flow_count = q->new_flow_count, + .memory_usage = q->memory_usage, + }; + struct list_head *pos; + + sch_tree_lock(sch); + list_for_each(pos, &q->new_flows) + st.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.old_flows_len++; + sch_tree_unlock(sch); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void fq_pie_reset(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + u16 idx; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + /* Removes all packets from flow */ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + sch->q.qlen = 0; + sch->qstats.backlog = 0; +} + +static void fq_pie_destroy(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + + tcf_block_put(q->block); + del_timer_sync(&q->adapt_timer); + kvfree(q->flows); +} + +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = { + .id = "fq_pie", + .priv_size = sizeof(struct fq_pie_sched_data), + .enqueue = fq_pie_qdisc_enqueue, + .dequeue = fq_pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_pie_init, + .destroy = fq_pie_destroy, + .reset = fq_pie_reset, + .change = fq_pie_change, + .dump = fq_pie_dump, + .dump_stats = fq_pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_pie_module_init(void) +{ + return register_qdisc(&fq_pie_qdisc_ops); +} + +static void __exit fq_pie_module_exit(void) +{ + unregister_qdisc(&fq_pie_qdisc_ops); +} + +module_init(fq_pie_module_init); +module_exit(fq_pie_module_exit); + +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"); +MODULE_AUTHOR("Mohit P. Tahiliani"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b0b0dc46af61..915bcdb59a9f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -19,159 +19,76 @@ #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> - -#define QUEUE_THRESHOLD 16384 -#define DQCOUNT_INVALID -1 -#define DTIME_INVALID 0xffffffffffffffff -#define MAX_PROB 0xffffffffffffffff -#define PIE_SCALE 8 - -/* parameters used */ -struct pie_params { - psched_time_t target; /* user specified target delay in pschedtime */ - u32 tupdate; /* timer frequency (in jiffies) */ - u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between 0 and 32 */ - u32 beta; /* and are used for shift relative to 1 */ - bool ecn; /* true if ecn is enabled */ - bool bytemode; /* to scale drop early prob based on pkt size */ - u8 dq_rate_estimator; /* to calculate delay using Little's law */ -}; - -/* variables used */ -struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. */ - psched_time_t burst_time; - psched_time_t qdelay; - psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ - psched_time_t dq_tstamp; /* drain rate */ - u64 accu_prob; /* accumulated drop probability */ - u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ - u32 qlen_old; /* in bytes */ - u8 accu_prob_overflows; /* overflows of accu_prob */ -}; - -/* statistics gathering */ -struct pie_stats { - u32 packets_in; /* total number of packets enqueued */ - u32 dropped; /* packets dropped due to pie_action */ - u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ - u32 ecn_mark; /* packets marked with ECN */ -}; +#include <net/pie.h> /* private data for the Qdisc */ struct pie_sched_data { - struct pie_params params; struct pie_vars vars; + struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; -static void pie_params_init(struct pie_params *params) +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size) { - params->alpha = 2; - params->beta = 20; - params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ - params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ - params->ecn = false; - params->bytemode = false; - params->dq_rate_estimator = false; -} - -/* private skb vars */ -struct pie_skb_cb { - psched_time_t enqueue_time; -}; - -static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); - return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; -} - -static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) -{ - return get_pie_cb(skb)->enqueue_time; -} - -static void pie_set_enqueue_time(struct sk_buff *skb) -{ - get_pie_cb(skb)->enqueue_time = psched_get_time(); -} - -static void pie_vars_init(struct pie_vars *vars) -{ - vars->dq_count = DQCOUNT_INVALID; - vars->dq_tstamp = DTIME_INVALID; - vars->accu_prob = 0; - vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); - vars->accu_prob_overflows = 0; -} - -static bool drop_early(struct Qdisc *sch, u32 packet_size) -{ - struct pie_sched_data *q = qdisc_priv(sch); u64 rnd; - u64 local_prob = q->vars.prob; + u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ - if (q->vars.burst_time > 0) + if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.prob < MAX_PROB / 5)) + if ((vars->qdelay < params->target / 2) && + (vars->prob < MAX_PROB / 5)) return false; - /* If we have fewer than 2 mtu-sized packets, disable drop_early, + /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ - if (sch->qstats.backlog < 2 * mtu) + if (qlen < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. Smaller packets will have lower drop prob in this case */ - if (q->params.bytemode && packet_size <= mtu) + if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else - local_prob = q->vars.prob; + local_prob = vars->prob; if (local_prob == 0) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; } - if (local_prob > MAX_PROB - q->vars.accu_prob) - q->vars.accu_prob_overflows++; + if (local_prob > MAX_PROB - vars->accu_prob) + vars->accu_prob_overflows++; - q->vars.accu_prob += local_prob; + vars->accu_prob += local_prob; - if (q->vars.accu_prob_overflows == 0 && - q->vars.accu_prob < (MAX_PROB / 100) * 85) + if (vars->accu_prob_overflows == 0 && + vars->accu_prob < (MAX_PROB / 100) * 85) return false; - if (q->vars.accu_prob_overflows == 8 && - q->vars.accu_prob >= MAX_PROB / 2) + if (vars->accu_prob_overflows == 8 && + vars->accu_prob >= MAX_PROB / 2) return true; prandom_bytes(&rnd, 8); if (rnd < local_prob) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; return true; } return false; } +EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) @@ -184,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } - if (!drop_early(sch, skb->len)) { + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, + skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { @@ -216,14 +134,14 @@ out: } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { - [TCA_PIE_TARGET] = {.type = NLA_U32}, - [TCA_PIE_LIMIT] = {.type = NLA_U32}, - [TCA_PIE_TUPDATE] = {.type = NLA_U32}, - [TCA_PIE_ALPHA] = {.type = NLA_U32}, - [TCA_PIE_BETA] = {.type = NLA_U32}, - [TCA_PIE_ECN] = {.type = NLA_U32}, - [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, - [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -296,26 +214,25 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, return 0; } -static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - int qlen = sch->qstats.backlog; /* current queue size in bytes */ psched_time_t now = psched_get_time(); u32 dtime = 0; /* If dq_rate_estimator is disabled, calculate qdelay using the * packet timestamp. */ - if (!q->params.dq_rate_estimator) { - q->vars.qdelay = now - pie_get_enqueue_time(skb); + if (!params->dq_rate_estimator) { + vars->qdelay = now - pie_get_enqueue_time(skb); - if (q->vars.dq_tstamp != DTIME_INVALID) - dtime = now - q->vars.dq_tstamp; + if (vars->dq_tstamp != DTIME_INVALID) + dtime = now - vars->dq_tstamp; - q->vars.dq_tstamp = now; + vars->dq_tstamp = now; if (qlen == 0) - q->vars.qdelay = 0; + vars->qdelay = 0; if (dtime == 0) return; @@ -327,39 +244,39 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ - if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { - q->vars.dq_tstamp = psched_get_time(); - q->vars.dq_count = 0; + if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { + vars->dq_tstamp = psched_get_time(); + vars->dq_count = 0; } - /* Calculate the average drain rate from this value. If queue length - * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the - * drain rate anymore The following if block is entered only when we + * drain rate anymore. The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ - if (q->vars.dq_count != DQCOUNT_INVALID) { - q->vars.dq_count += skb->len; + if (vars->dq_count != DQCOUNT_INVALID) { + vars->dq_count += skb->len; - if (q->vars.dq_count >= QUEUE_THRESHOLD) { - u32 count = q->vars.dq_count << PIE_SCALE; + if (vars->dq_count >= QUEUE_THRESHOLD) { + u32 count = vars->dq_count << PIE_SCALE; - dtime = now - q->vars.dq_tstamp; + dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; - if (q->vars.avg_dq_rate == 0) - q->vars.avg_dq_rate = count; + if (vars->avg_dq_rate == 0) + vars->avg_dq_rate = count; else - q->vars.avg_dq_rate = - (q->vars.avg_dq_rate - - (q->vars.avg_dq_rate >> 3)) + (count >> 3); + vars->avg_dq_rate = + (vars->avg_dq_rate - + (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset @@ -367,10 +284,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * packet is dequeued */ if (qlen < QUEUE_THRESHOLD) { - q->vars.dq_count = DQCOUNT_INVALID; + vars->dq_count = DQCOUNT_INVALID; } else { - q->vars.dq_count = 0; - q->vars.dq_tstamp = psched_get_time(); + vars->dq_count = 0; + vars->dq_tstamp = psched_get_time(); } goto burst_allowance_reduction; @@ -380,18 +297,18 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) return; burst_allowance_reduction: - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; + if (vars->burst_time > 0) { + if (vars->burst_time > dtime) + vars->burst_time -= dtime; else - q->vars.burst_time = 0; + vars->burst_time = 0; } } +EXPORT_SYMBOL_GPL(pie_process_dequeue); -static void calculate_probability(struct Qdisc *sch) +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ @@ -400,21 +317,21 @@ static void calculate_probability(struct Qdisc *sch) u32 power; bool update_prob = true; - if (q->params.dq_rate_estimator) { - qdelay_old = q->vars.qdelay; - q->vars.qdelay_old = q->vars.qdelay; + if (params->dq_rate_estimator) { + qdelay_old = vars->qdelay; + vars->qdelay_old = vars->qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + if (vars->avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { - qdelay = q->vars.qdelay; - qdelay_old = q->vars.qdelay_old; + qdelay = vars->qdelay; + qdelay_old = vars->qdelay_old; } - /* If qdelay is zero and qlen is not, it means qlen is very small, less - * than dequeue_rate, so we do not update probabilty in this round + /* If qdelay is zero and qlen is not, it means qlen is very small, + * so we do not update probabilty in this round. */ if (qdelay == 0 && qlen != 0) update_prob = false; @@ -426,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch) * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ - alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ - if (q->vars.prob < MAX_PROB / 10) { + if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; - while (q->vars.prob < div_u64(MAX_PROB, power) && + while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; @@ -446,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch) } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * (u64)(qdelay - q->params.target); + delta += alpha * (u64)(qdelay - params->target); delta += beta * (u64)(qdelay - qdelay_old); - oldprob = q->vars.prob; + oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && - q->vars.prob >= MAX_PROB / 10) + vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: @@ -464,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - q->vars.prob += delta; + vars->prob += delta; if (delta > 0) { /* prevent overflow */ - if (q->vars.prob < oldprob) { - q->vars.prob = MAX_PROB; + if (vars->prob < oldprob) { + vars->prob = MAX_PROB; /* Prevent normalization error. If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next @@ -479,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch) } } else { /* prevent underflow */ - if (q->vars.prob > oldprob) - q->vars.prob = 0; + if (vars->prob > oldprob) + vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if @@ -489,10 +406,10 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - q->vars.prob -= q->vars.prob / 64u; + vars->prob -= vars->prob / 64; - q->vars.qdelay = qdelay; - q->vars.qlen_old = qlen; + vars->qdelay = qdelay; + vars->qlen_old = qlen; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods @@ -500,16 +417,17 @@ static void calculate_probability(struct Qdisc *sch) * 3. If average dq_rate_estimator is enabled, we have atleast one * estimate for the avg_dq_rate ie., is a non-zero value */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.qdelay_old < q->params.target / 2) && - q->vars.prob == 0 && - (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) { - pie_vars_init(&q->vars); + if ((vars->qdelay < params->target / 2) && + (vars->qdelay_old < params->target / 2) && + vars->prob == 0 && + (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { + pie_vars_init(vars); } - if (!q->params.dq_rate_estimator) - q->vars.qdelay_old = qdelay; + if (!params->dq_rate_estimator) + vars->qdelay_old = qdelay; } +EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { @@ -518,7 +436,7 @@ static void pie_timer(struct timer_list *t) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - calculate_probability(sch); + pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ if (q->params.tupdate) @@ -607,12 +525,13 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { + struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; - pie_process_dequeue(sch, skb); + pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } @@ -633,7 +552,7 @@ static void pie_destroy(struct Qdisc *sch) } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { - .id = "pie", + .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 18b884cfdfe8..647941702f9f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -292,8 +292,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; - if (new == NULL) - new = &noop_qdisc; + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, arg), extack); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } *old = qdisc_replace(sch, new, &q->queues[band]); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 5f72f3f916a5..78e79029dc63 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -15,6 +15,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> @@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, return len; } +static void tbf_offload_change(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.rate = q->rate; + qopt.replace_params.max_size = q->max_size; + qopt.replace_params.qstats = &sch->qstats; + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static void tbf_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static int tbf_offload_dump(struct Qdisc *sch) +{ + struct tc_tbf_qopt_offload qopt; + + qopt.command = TC_TBF_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -155,8 +202,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); nb = 0; - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; @@ -167,7 +213,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, } else { nb++; } - segs = nskb; } sch->q.qlen += nb; if (nb > 1) @@ -409,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_unlock(sch); err = 0; + + tbf_offload_change(sch); done: return err; } @@ -434,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); + tbf_offload_destroy(sch); qdisc_put(q->qdisc); } @@ -442,8 +490,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; + int err; + + err = tbf_offload_dump(sch); + if (err) + return err; - sch->qstats.backlog = q->qdisc->qstats.backlog; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index ce82699d0dca..2bc29463e1dc 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1359,8 +1359,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1382,7 +1384,8 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); - goto nomem; + error = -ENOMEM; + break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); @@ -1429,8 +1432,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, /* Generate a SHUTDOWN chunk. */ new_obj = sctp_make_shutdown(asoc, chunk); - if (!new_obj) - goto nomem; + if (!new_obj) { + error = -ENOMEM; + break; + } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; @@ -1766,11 +1771,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, break; } - if (error) + if (error) { + cmd = sctp_next_cmd(commands); + while (cmd) { + if (cmd->verb == SCTP_CMD_REPLY) + sctp_chunk_free(cmd->obj.chunk); + cmd = sctp_next_cmd(commands); + } break; + } } -out: /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. @@ -1785,7 +1796,4 @@ out: sp->data_ready_signalled = 0; return error; -nomem: - error = -ENOMEM; - goto out; } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 82dedf052d86..2a5ed47c3e08 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -611,7 +611,7 @@ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, + /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start diff --git a/net/socket.c b/net/socket.c index 51bf34995bcb..b79a05de7c6e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -128,7 +128,18 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -static void sock_show_fdinfo(struct seq_file *m, struct file *f); + +#ifdef CONFIG_PROC_FS +static void sock_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct socket *sock = f->private_data; + + if (sock->ops->show_fdinfo) + sock->ops->show_fdinfo(m, sock); +} +#else +#define sock_show_fdinfo NULL +#endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear @@ -151,9 +162,7 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, -#ifdef CONFIG_PROC_FS .show_fdinfo = sock_show_fdinfo, -#endif }; /* @@ -997,14 +1006,6 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } -static void sock_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct socket *sock = f->private_data; - - if (sock->ops->show_fdinfo) - sock->ops->show_fdinfo(m, sock); -} - /* * Atomic setting of ioctl hooks to avoid race * with module unload. diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 77c7dd7f05e8..fda3889993cb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -77,7 +77,7 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static struct rpcrdma_regbuf * @@ -244,6 +244,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ia->ri_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif + init_completion(&ia->ri_remove_done); set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; xprt_force_disconnect(xprt); @@ -297,7 +298,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) int rc; init_completion(&ia->ri_done); - init_completion(&ia->ri_remove_done); id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -421,7 +421,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. */ - rpcrdma_reps_destroy(buf); + rpcrdma_reps_unmap(r_xprt); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); @@ -599,6 +599,7 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, struct ib_qp_init_attr *qp_init_attr) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -613,6 +614,7 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); goto out2; } + memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); rc = -ENETUNREACH; err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); @@ -1090,6 +1092,7 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; + list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); return rep; out_free: @@ -1100,6 +1103,7 @@ out: static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) { + list_del(&rep->rr_all); rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } @@ -1118,10 +1122,16 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { - if (!rep->rr_temp) - llist_add(&rep->rr_node, &buf->rb_free_reps); - else - rpcrdma_rep_destroy(rep); + llist_add(&rep->rr_node, &buf->rb_free_reps); +} + +static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + + list_for_each_entry(rep, &buf->rb_all_reps, rr_all) + rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); } static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) @@ -1152,6 +1162,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); + INIT_LIST_HEAD(&buf->rb_all_reps); rc = -ENOMEM; for (i = 0; i < buf->rb_max_requests; i++) { @@ -1504,6 +1515,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) wr = NULL; while (needed) { rep = rpcrdma_rep_get_locked(buf); + if (rep && rep->rr_temp) { + rpcrdma_rep_destroy(rep); + continue; + } if (!rep) rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5d15140a0266..d796d68609ed 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -203,6 +203,7 @@ struct rpcrdma_rep { struct xdr_stream rr_stream; struct llist_node rr_node; struct ib_recv_wr rr_recv_wr; + struct list_head rr_all; }; /* To reduce the rate at which a transport invokes ib_post_recv @@ -368,6 +369,7 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; struct list_head rb_all_mrs; + struct list_head rb_all_reps; struct llist_head rb_free_reps; diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 11255e970dd4..ee49a9f1dd4f 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,7 +9,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - topsrv.o socket.o group.o trace.o + topsrv.o group.o trace.o CFLAGS_trace.o += -I$(src) @@ -20,5 +20,3 @@ tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o obj-$(CONFIG_TIPC_DIAG) += diag.o - -tipc_diag-y := diag.o diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 0254bb7e418b..217516357ef2 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -204,8 +204,8 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } - attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, - sizeof(struct nlattr *), GFP_KERNEL); + attrbuf = kcalloc(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) { err = -ENOMEM; goto err_out; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6552f986774c..f9b4fb92c0b1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -287,12 +287,12 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) * * Caller must hold socket lock */ -static void tsk_rej_rx_queue(struct sock *sk) +static void tsk_rej_rx_queue(struct sock *sk, int error) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sk->sk_receive_queue))) - tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); + tipc_sk_respond(sk, skb, error); } static bool tipc_sk_connected(struct sock *sk) @@ -545,34 +545,45 @@ static void __tipc_shutdown(struct socket *sock, int error) /* Remove pending SYN */ __skb_queue_purge(&sk->sk_write_queue); - /* Reject all unreceived messages, except on an active connection - * (which disconnects locally & sends a 'FIN+' to peer). - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - if (TIPC_SKB_CB(skb)->bytes_read) { - kfree_skb(skb); - continue; - } - if (!tipc_sk_type_connectionless(sk) && - sk->sk_state != TIPC_DISCONNECTING) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(net, dnode, tsk->portid); - } - tipc_sk_respond(sk, skb, error); + /* Remove partially received buffer if any */ + skb = skb_peek(&sk->sk_receive_queue); + if (skb && TIPC_SKB_CB(skb)->bytes_read) { + __skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); } - if (tipc_sk_type_connectionless(sk)) + /* Reject all unreceived messages if connectionless */ + if (tipc_sk_type_connectionless(sk)) { + tsk_rej_rx_queue(sk, error); return; + } - if (sk->sk_state != TIPC_DISCONNECTING) { + switch (sk->sk_state) { + case TIPC_CONNECTING: + case TIPC_ESTABLISHED: + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, dnode, tsk->portid); + /* Send a FIN+/- to its peer */ + skb = __skb_dequeue(&sk->sk_receive_queue); + if (skb) { + __skb_queue_purge(&sk->sk_receive_queue); + tipc_sk_respond(sk, skb, error); + break; + } skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, error); if (skb) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); - tipc_node_remove_conn(net, dnode, tsk->portid); - tipc_set_sk_state(sk, TIPC_DISCONNECTING); + break; + case TIPC_LISTEN: + /* Reject all SYN messages */ + tsk_rej_rx_queue(sk, error); + break; + default: + __skb_queue_purge(&sk->sk_receive_queue); + break; } } @@ -2432,8 +2443,8 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) return sock_intr_errno(*timeo_p); add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, - sk->sk_state != TIPC_CONNECTING, &wait); + done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk), + &wait); remove_wait_queue(sk_sleep(sk), &wait); } while (!done); return 0; @@ -2643,7 +2654,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, * Reject any stray messages received by new socket * before the socket lock was taken (very, very unlikely) */ - tsk_rej_rx_queue(new_sk); + tsk_rej_rx_queue(new_sk, TIPC_ERR_NO_PORT); /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index dac24c7aa7d4..94774c0e5ff3 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -732,15 +732,19 @@ out: return rc; } -static void tls_update(struct sock *sk, struct proto *p) +static void tls_update(struct sock *sk, struct proto *p, + void (*write_space)(struct sock *sk)) { struct tls_context *ctx; ctx = tls_get_ctx(sk); - if (likely(ctx)) + if (likely(ctx)) { + ctx->sk_write_space = write_space; ctx->sk_proto = p; - else + } else { sk->sk_prot = p; + sk->sk_write_space = write_space; + } } static int tls_get_info(const struct sock *sk, struct sk_buff *skb) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c6803a82b769..c98e602a1a2d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -256,8 +256,6 @@ static int tls_do_decryption(struct sock *sk, return ret; ret = crypto_wait_req(ret, &ctx->async_wait); - } else if (ret == -EBADMSG) { - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); } if (async) @@ -682,12 +680,32 @@ static int tls_push_record(struct sock *sk, int flags, split_point = msg_pl->apply_bytes; split = split_point && split_point < msg_pl->sg.size; + if (unlikely((!split && + msg_pl->sg.size + + prot->overhead_size > msg_en->sg.size) || + (split && + split_point + + prot->overhead_size > msg_en->sg.size))) { + split = true; + split_point = msg_en->sg.size; + } if (split) { rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, split_point, prot->overhead_size, &orig_end); if (rc < 0) return rc; + /* This can happen if above tls_split_open_record allocates + * a single large encryption buffer instead of two smaller + * ones. In this case adjust pointers and continue without + * split. + */ + if (!msg_pl->sg.size) { + tls_merge_open_record(sk, rec, tmp, orig_end); + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + split = false; + } sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } @@ -709,6 +727,12 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(sk_msg_elem(msg_pl, i)); } + if (msg_pl->sg.end < msg_pl->sg.start) { + sg_chain(&msg_pl->sg.data[msg_pl->sg.start], + MAX_SKB_FRAGS - msg_pl->sg.start + 1, + msg_pl->sg.data); + } + i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); @@ -772,7 +796,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); - if (err) { + if (err && err != -EINPROGRESS) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); } @@ -783,10 +807,7 @@ more_data: if (psock->eval == __SK_NONE) { delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); - if (delta < msg->sg.size) - delta -= msg->sg.size; - else - delta = 0; + delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { @@ -801,7 +822,7 @@ more_data: switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err < 0) { + if (err && err != -EINPROGRESS) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); goto out_err; @@ -1515,7 +1536,9 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (err == -EINPROGRESS) tls_advance_record_sn(sk, prot, &tls_ctx->rx); - + else if (err == -EBADMSG) + TLS_INC_STATS(sock_net(sk), + LINUX_MIB_TLSDECRYPTERROR); return err; } } else { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6756a3ccc392..321af97c7bbe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2100,8 +2100,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, scm_stat_del, - &skip, &err, &last); + skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, + scm_stat_del, &skip, &err, &last); if (skb) break; @@ -2110,7 +2110,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index b3bdae74c243..3492c021925f 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -138,28 +138,15 @@ struct hvsock { **************************************************************************** * The only valid Service GUIDs, from the perspectives of both the host and * * Linux VM, that can be connected by the other end, must conform to this * - * format: <port>-facb-11e6-bd58-64006a7986d3, and the "port" must be in * - * this range [0, 0x7FFFFFFF]. * + * format: <port>-facb-11e6-bd58-64006a7986d3. * **************************************************************************** * * When we write apps on the host to connect(), the GUID ServiceID is used. * When we write apps in Linux VM to connect(), we only need to specify the * port and the driver will form the GUID and use that to request the host. * - * From the perspective of Linux VM: - * 1. the local ephemeral port (i.e. the local auto-bound port when we call - * connect() without explicit bind()) is generated by __vsock_bind_stream(), - * and the range is [1024, 0xFFFFFFFF). - * 2. the remote ephemeral port (i.e. the auto-generated remote port for - * a connect request initiated by the host's connect()) is generated by - * hvs_remote_addr_init() and the range is [0x80000000, 0xFFFFFFFF). */ -#define MAX_LISTEN_PORT ((u32)0x7FFFFFFF) -#define MAX_VM_LISTEN_PORT MAX_LISTEN_PORT -#define MAX_HOST_LISTEN_PORT MAX_LISTEN_PORT -#define MIN_HOST_EPHEMERAL_PORT (MAX_HOST_LISTEN_PORT + 1) - /* 00000000-facb-11e6-bd58-64006a7986d3 */ static const guid_t srv_id_template = GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, @@ -184,34 +171,6 @@ static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id) vsock_addr_init(addr, VMADDR_CID_ANY, port); } -static void hvs_remote_addr_init(struct sockaddr_vm *remote, - struct sockaddr_vm *local) -{ - static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; - struct sock *sk; - - /* Remote peer is always the host */ - vsock_addr_init(remote, VMADDR_CID_HOST, VMADDR_PORT_ANY); - - while (1) { - /* Wrap around ? */ - if (host_ephemeral_port < MIN_HOST_EPHEMERAL_PORT || - host_ephemeral_port == VMADDR_PORT_ANY) - host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT; - - remote->svm_port = host_ephemeral_port++; - - sk = vsock_find_connected_socket(remote, local); - if (!sk) { - /* Found an available ephemeral port */ - return; - } - - /* Release refcnt got in vsock_find_connected_socket */ - sock_put(sk); - } -} - static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan) { set_channel_pending_send_size(chan, @@ -341,12 +300,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if_type = &chan->offermsg.offer.if_type; if_instance = &chan->offermsg.offer.if_instance; conn_from_host = chan->offermsg.offer.u.pipe.user_def[0]; - - /* The host or the VM should only listen on a port in - * [0, MAX_LISTEN_PORT] - */ - if (!is_valid_srv_id(if_type) || - get_port_by_srv_id(if_type) > MAX_LISTEN_PORT) + if (!is_valid_srv_id(if_type)) return; hvs_addr_init(&addr, conn_from_host ? if_type : if_instance); @@ -371,8 +325,11 @@ static void hvs_open_connection(struct vmbus_channel *chan) vnew = vsock_sk(new); hvs_addr_init(&vnew->local_addr, if_type); - hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr); + /* Remote peer is always the host */ + vsock_addr_init(&vnew->remote_addr, + VMADDR_CID_HOST, VMADDR_PORT_ANY); + vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance); ret = vsock_assign_transport(vnew, vsock_sk(sk)); /* Transport assigned (looking at remote_addr) must be the * same where we received the request. @@ -766,16 +723,6 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk) static bool hvs_stream_allow(u32 cid, u32 port) { - /* The host's port range [MIN_HOST_EPHEMERAL_PORT, 0xFFFFFFFF) is - * reserved as ephemeral ports, which are used as the host's ports - * when the host initiates connections. - * - * Perform this check in the guest so an immediate error is produced - * instead of a timeout. - */ - if (port > MAX_HOST_LISTEN_PORT) - return false; - if (cid == VMADDR_CID_HOST) return true; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa3526592c51..123b8d720a59 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10843,6 +10843,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (err) return err; + cfg80211_sinfo_release_content(&sinfo); if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) wdev->cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; @@ -13795,6 +13796,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) if (err) return err; + cfg80211_sinfo_release_content(&sinfo); + return rdev_probe_mesh_link(rdev, dev, dest, buf, len); } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e853a4fe6f97..e0d34f796d0b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -538,6 +538,10 @@ static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret; + + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; + trace_rdev_set_wiphy_params(&rdev->wiphy, changed); ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); @@ -1167,6 +1171,16 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev, return ret; } +static inline void +rdev_end_cac(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + trace_rdev_end_cac(&rdev->wiphy, dev); + if (rdev->ops->end_cac) + rdev->ops->end_cac(&rdev->wiphy, dev); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 446c76d44e65..fff9a74891fc 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2261,14 +2261,15 @@ static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) static void handle_channel_custom(struct wiphy *wiphy, struct ieee80211_channel *chan, - const struct ieee80211_regdomain *regd) + const struct ieee80211_regdomain *regd, + u32 min_bw) { u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; u32 bw; - for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) { + for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), regd, bw); if (!IS_ERR(reg_rule)) @@ -2324,8 +2325,14 @@ static void handle_band_custom(struct wiphy *wiphy, if (!sband) return; + /* + * We currently assume that you always want at least 20 MHz, + * otherwise channel 12 might get enabled if this rule is + * compatible to US, which permits 2402 - 2472 MHz. + */ for (i = 0; i < sband->n_channels; i++) - handle_channel_custom(wiphy, &sband->channels[i], regd); + handle_channel_custom(wiphy, &sband->channels[i], regd, + MHZ_TO_KHZ(20)); } /* Used by drivers prior to wiphy registration */ @@ -3885,6 +3892,25 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy) } EXPORT_SYMBOL(regulatory_pre_cac_allowed); +static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) +{ + struct wireless_dev *wdev; + /* If we finished CAC or received radar, we should end any + * CAC running on the same channels. + * the check !cfg80211_chandef_dfs_usable contain 2 options: + * either all channels are available - those the CAC_FINISHED + * event has effected another wdev state, or there is a channel + * in unavailable state in wdev chandef - those the RADAR_DETECTED + * event has effected another wdev state. + * In both cases we should end the CAC on the wdev. + */ + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (wdev->cac_started && + !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef)) + rdev_end_cac(rdev, wdev->netdev); + } +} + void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state, @@ -3911,8 +3937,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state); if (event == NL80211_RADAR_DETECTED || - event == NL80211_RADAR_CAC_FINISHED) + event == NL80211_RADAR_CAC_FINISHED) { cfg80211_sched_dfs_chan_update(rdev); + cfg80211_check_and_end_cac(rdev); + } nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL); } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 7a6c38ddc65a..d32a2ec4d96a 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1307,14 +1307,14 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - cfg80211_leave_ibss(rdev, wdev->netdev, false); + __cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, wdev->netdev, false); + __cfg80211_stop_ap(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_MESH_POINT: - cfg80211_leave_mesh(rdev, wdev->netdev); + __cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: diff --git a/net/wireless/trace.h b/net/wireless/trace.h index d98ad2b3143b..8677d7ab7d69 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -646,6 +646,11 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa, TP_ARGS(wiphy, netdev) ); +DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), + TP_ARGS(wiphy, netdev) +); + DECLARE_EVENT_CLASS(station_add_change, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), diff --git a/net/wireless/util.c b/net/wireless/util.c index 5b4ed5bbc542..8481e9ac33da 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -564,7 +564,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page, struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; - page_ref_inc(page); + get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 5e677dac2a0c..69102fda9ebd 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -657,7 +657,8 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) return NULL; } -static int iw_handler_get_iwstats(struct net_device * dev, +/* noinline to avoid a bogus warning with -O3 */ +static noinline int iw_handler_get_iwstats(struct net_device * dev, struct iw_request_info * info, union iwreq_data * wrqu, char * extra) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 2efe44a34644..d5b09bbff375 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -766,6 +766,10 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state == TCP_ESTABLISHED) goto out; + rc = -EALREADY; /* Do nothing if call is already in progress */ + if (sk->sk_state == TCP_SYN_SENT) + goto out; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; @@ -812,7 +816,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) - goto out_put_neigh; + goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 3049af269fbf..f93e917e0929 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -249,7 +249,7 @@ static void xdp_umem_release(struct xdp_umem *umem) xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kfree(umem->pages); + kvfree(umem->pages); umem->pages = NULL; xdp_umem_unaccount_pages(umem); @@ -409,7 +409,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), + GFP_KERNEL_ACCOUNT); if (!umem->pages) { err = -ENOMEM; goto out_pin; @@ -419,7 +420,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (!err) return 0; - kfree(umem->pages); + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 02ada7ab8c6e..df600487a68d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -217,7 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); - xs->sk.sk_data_ready(&xs->sk); + sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index fbc4552d17b8..212a4fcb4a88 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_INET_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c new file mode 100644 index 000000000000..f15d6a564b0e --- /dev/null +++ b/net/xfrm/espintcp.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/tcp.h> +#include <net/strparser.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <net/espintcp.h> +#include <linux/skmsg.h> +#include <net/inet_common.h> + +static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, + struct sock *sk) +{ + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf || + !sk_rmem_schedule(sk, skb, skb->truesize)) { + kfree_skb(skb); + return; + } + + skb_set_owner_r(skb, sk); + + memset(skb->cb, 0, sizeof(skb->cb)); + skb_queue_tail(&ctx->ike_queue, skb); + ctx->saved_data_ready(sk); +} + +static void handle_esp(struct sk_buff *skb, struct sock *sk) +{ + skb_reset_transport_header(skb); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + local_bh_disable(); + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + local_bh_enable(); + rcu_read_unlock(); +} + +static void espintcp_rcv(struct strparser *strp, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = container_of(strp, struct espintcp_ctx, + strp); + struct strp_msg *rxm = strp_msg(skb); + u32 nonesp_marker; + int err; + + err = skb_copy_bits(skb, rxm->offset + 2, &nonesp_marker, + sizeof(nonesp_marker)); + if (err < 0) { + kfree_skb(skb); + return; + } + + /* remove header, leave non-ESP marker/SPI */ + if (!__pskb_pull(skb, rxm->offset + 2)) { + kfree_skb(skb); + return; + } + + if (pskb_trim(skb, rxm->full_len - 2) != 0) { + kfree_skb(skb); + return; + } + + if (nonesp_marker == 0) + handle_nonesp(ctx, skb, strp->sk); + else + handle_esp(skb, strp->sk); +} + +static int espintcp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + __be16 blen; + u16 len; + int err; + + if (skb->len < rxm->offset + 2) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); + if (err < 0) + return err; + + len = be16_to_cpu(blen); + if (len < 6) + return -EINVAL; + + return len; +} + +static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff *skb; + int err = 0; + int copied; + int off = 0; + + flags |= nonblock ? MSG_DONTWAIT : 0; + + skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err); + if (!skb) + return err; + + copied = len; + if (copied > skb->len) + copied = skb->len; + else if (copied < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + if (flags & MSG_TRUNC) + copied = skb->len; + kfree_skb(skb); + return copied; +} + +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog) + return -ENOBUFS; + + __skb_queue_tail(&ctx->out_queue, skb); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_queue_out); + +/* espintcp length field is 2B and length includes the length field's size */ +#define MAX_ESPINTCP_MSG (((1 << 16) - 1) - 2) + +static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg, + int flags) +{ + do { + int ret; + + ret = skb_send_sock_locked(sk, emsg->skb, + emsg->offset, emsg->len); + if (ret < 0) + return ret; + + emsg->len -= ret; + emsg->offset += ret; + } while (emsg->len > 0); + + kfree_skb(emsg->skb); + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_sendskmsg_locked(struct sock *sk, + struct espintcp_msg *emsg, int flags) +{ + struct sk_msg *skmsg = &emsg->skmsg; + struct scatterlist *sg; + int done = 0; + int ret; + + flags |= MSG_SENDPAGE_NOTLAST; + sg = &skmsg->sg.data[skmsg->sg.start]; + do { + size_t size = sg->length - emsg->offset; + int offset = sg->offset + emsg->offset; + struct page *p; + + emsg->offset = 0; + + if (sg_is_last(sg)) + flags &= ~MSG_SENDPAGE_NOTLAST; + + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret < 0) { + emsg->offset = offset - sg->offset; + skmsg->sg.start += done; + return ret; + } + + if (ret != size) { + offset += ret; + size -= ret; + goto retry; + } + + done++; + put_page(p); + sk_mem_uncharge(sk, sg->length); + sg = sg_next(sg); + } while (sg); + + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_push_msgs(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + int err; + + if (!emsg->len) + return 0; + + if (ctx->tx_running) + return -EAGAIN; + ctx->tx_running = 1; + + if (emsg->skb) + err = espintcp_sendskb_locked(sk, emsg, 0); + else + err = espintcp_sendskmsg_locked(sk, emsg, 0); + if (err == -EAGAIN) { + ctx->tx_running = 0; + return 0; + } + if (!err) + memset(emsg, 0, sizeof(*emsg)); + + ctx->tx_running = 0; + + return err; +} + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + unsigned int len; + int offset; + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + return -ECONNRESET; + } + + offset = skb_transport_offset(skb); + len = skb->len - offset; + + espintcp_push_msgs(sk); + + if (emsg->len) { + kfree_skb(skb); + return -ENOBUFS; + } + + skb_set_owner_w(skb, sk); + + emsg->offset = offset; + emsg->len = len; + emsg->skb = skb; + + espintcp_push_msgs(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_push_skb); + +static int espintcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + struct iov_iter pfx_iter; + struct kvec pfx_iov = {}; + size_t msglen = size + 2; + char buf[2] = {0}; + int err, end; + + if (msg->msg_flags) + return -EOPNOTSUPP; + + if (size > MAX_ESPINTCP_MSG) + return -EMSGSIZE; + + if (msg->msg_controllen) + return -EOPNOTSUPP; + + lock_sock(sk); + + err = espintcp_push_msgs(sk); + if (err < 0) { + err = -ENOBUFS; + goto unlock; + } + + sk_msg_init(&emsg->skmsg); + while (1) { + /* only -ENOMEM is possible since we don't coalesce */ + err = sk_msg_alloc(sk, &emsg->skmsg, msglen, 0); + if (!err) + break; + + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto fail; + } + + *((__be16 *)buf) = cpu_to_be16(msglen); + pfx_iov.iov_base = buf; + pfx_iov.iov_len = sizeof(buf); + iov_iter_kvec(&pfx_iter, WRITE, &pfx_iov, 1, pfx_iov.iov_len); + + err = sk_msg_memcopy_from_iter(sk, &pfx_iter, &emsg->skmsg, + pfx_iov.iov_len); + if (err < 0) + goto fail; + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, &emsg->skmsg, size); + if (err < 0) + goto fail; + + end = emsg->skmsg.sg.end; + emsg->len = size; + sk_msg_iter_var_prev(end); + sg_mark_end(sk_msg_elem(&emsg->skmsg, end)); + + tcp_rate_check_app_limited(sk); + + err = espintcp_push_msgs(sk); + /* this message could be partially sent, keep it */ + if (err < 0) + goto unlock; + release_sock(sk); + + return size; + +fail: + sk_msg_free(sk, &emsg->skmsg); + memset(emsg, 0, sizeof(*emsg)); +unlock: + release_sock(sk); + return err; +} + +static struct proto espintcp_prot __ro_after_init; +static struct proto_ops espintcp_ops __ro_after_init; + +static void espintcp_data_ready(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + strp_data_ready(&ctx->strp); +} + +static void espintcp_tx_work(struct work_struct *work) +{ + struct espintcp_ctx *ctx = container_of(work, + struct espintcp_ctx, work); + struct sock *sk = ctx->strp.sk; + + lock_sock(sk); + if (!ctx->tx_running) + espintcp_push_msgs(sk); + release_sock(sk); +} + +static void espintcp_write_space(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + schedule_work(&ctx->work); + ctx->saved_write_space(sk); +} + +static void espintcp_destruct(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + kfree(ctx); +} + +bool tcp_is_ulp_esp(struct sock *sk) +{ + return sk->sk_prot == &espintcp_prot; +} +EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); + +static int espintcp_init_sk(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct strp_callbacks cb = { + .rcv_msg = espintcp_rcv, + .parse_msg = espintcp_parse, + }; + struct espintcp_ctx *ctx; + int err; + + /* sockmap is not compatible with espintcp */ + if (sk->sk_user_data) + return -EBUSY; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + err = strp_init(&ctx->strp, sk, &cb); + if (err) + goto free; + + __sk_dst_reset(sk); + + strp_check_rcv(&ctx->strp); + skb_queue_head_init(&ctx->ike_queue); + skb_queue_head_init(&ctx->out_queue); + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + ctx->saved_data_ready = sk->sk_data_ready; + ctx->saved_write_space = sk->sk_write_space; + sk->sk_data_ready = espintcp_data_ready; + sk->sk_write_space = espintcp_write_space; + sk->sk_destruct = espintcp_destruct; + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_WORK(&ctx->work, espintcp_tx_work); + + /* avoid using task_frag */ + sk->sk_allocation = GFP_ATOMIC; + + return 0; + +free: + kfree(ctx); + return err; +} + +static void espintcp_release(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&ctx->out_queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + espintcp_push_skb(sk, skb); + + tcp_release_cb(sk); +} + +static void espintcp_close(struct sock *sk, long timeout) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + + strp_stop(&ctx->strp); + + sk->sk_prot = &tcp_prot; + barrier(); + + cancel_work_sync(&ctx->work); + strp_done(&ctx->strp); + + skb_queue_purge(&ctx->out_queue); + skb_queue_purge(&ctx->ike_queue); + + if (emsg->len) { + if (emsg->skb) + kfree_skb(emsg->skb); + else + sk_msg_free(sk, &emsg->skmsg); + } + + tcp_close(sk, timeout); +} + +static __poll_t espintcp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (!skb_queue_empty(&ctx->ike_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static struct tcp_ulp_ops espintcp_ulp __read_mostly = { + .name = "espintcp", + .owner = THIS_MODULE, + .init = espintcp_init_sk, +}; + +void __init espintcp_init(void) +{ + memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); + memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); + espintcp_prot.sendmsg = espintcp_sendmsg; + espintcp_prot.recvmsg = espintcp_recvmsg; + espintcp_prot.close = espintcp_close; + espintcp_prot.release_cb = espintcp_release; + espintcp_ops.poll = espintcp_poll; + + tcp_register_ulp(&espintcp_ulp); +} diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 189ef15acbbc..50f567a88f45 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -78,7 +78,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int err; unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2; + struct sk_buff *skb2, *nskb; struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); @@ -148,11 +148,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - skb2 = skb; - - do { - struct sk_buff *nskb = skb2->next; - + skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); @@ -176,14 +172,11 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!skb) return NULL; - goto skip_push; + continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); - -skip_push: - skb2 = nskb; - } while (skb2); + } return skb; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2c86a2fc3915..aa35f23c4912 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -36,6 +36,7 @@ struct xfrm_trans_cb { #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); + struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) @@ -766,12 +767,13 @@ static void xfrm_trans_reinject(unsigned long data) skb_queue_splice_init(&trans->queue, &queue); while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); + XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, + NULL, skb); } -int xfrm_trans_queue(struct sk_buff *skb, - int (*finish)(struct net *, struct sock *, - struct sk_buff *)) +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) { struct xfrm_trans_tasklet *trans; @@ -780,11 +782,22 @@ int xfrm_trans_queue(struct sk_buff *skb, if (skb_queue_len(&trans->queue) >= netdev_max_backlog) return -ENOBUFS; + BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + XFRM_TRANS_SKB_CB(skb)->finish = finish; + XFRM_TRANS_SKB_CB(skb)->net = net; __skb_queue_tail(&trans->queue, skb); tasklet_schedule(&trans->tasklet); return 0; } +EXPORT_SYMBOL(xfrm_trans_queue_net); + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); +} EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 7ac1542feaf8..dc651a628dcf 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -268,9 +268,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int err = -1; int mtu; - if (!dst) - goto tx_err_link_failure; - dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { @@ -297,7 +294,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) @@ -343,6 +340,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -352,10 +350,33 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) case htons(ETH_P_IPV6): xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + if (!dst) { + fl.u.ip6.flowi6_oif = dev->ifindex; + fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, dst); + } break; case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (!dst) { + struct rtable *rt; + + fl.u.ip4.flowi4_oif = dev->ifindex; + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, &rt->dst); + } break; default: goto tx_err; @@ -563,12 +584,9 @@ static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->type = ARPHRD_NONE; - dev->hard_header_len = ETH_HLEN; - dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = ETH_DATA_LEN; - dev->addr_len = ETH_ALEN; + dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index b1db55b50ba1..fafc7aba705f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -533,7 +533,7 @@ static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct sk_buff *segs; + struct sk_buff *segs, *nskb; BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); @@ -544,8 +544,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb if (segs == NULL) return -EINVAL; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -555,9 +554,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb kfree_skb_list(nskb); return err; } - - segs = nskb; - } while (segs); + } return 0; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f2d1e573ea55..297d1eb79e5c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,6 +39,9 @@ #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif +#ifdef CONFIG_INET_ESPINTCP +#include <net/espintcp.h> +#endif #include "xfrm_hash.h" @@ -4157,6 +4160,10 @@ void __init xfrm_init(void) seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); +#ifdef CONFIG_INET_ESPINTCP + espintcp_init(); +#endif + RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f3423562d933..170d6e7f31d3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -670,6 +670,9 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + if (x->encap_sk) + sock_put(rcu_dereference_raw(x->encap_sk)); + xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 5b89c0370f33..b0e8adf7eb01 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -184,7 +184,6 @@ TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes TPROGS_CFLAGS += -I$(objtree)/usr/include -TPROGS_CFLAGS += -I$(srctree)/tools/lib/bpf/ TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ TPROGS_CFLAGS += -I$(srctree)/tools/lib/ TPROGS_CFLAGS += -I$(srctree)/tools/include @@ -254,7 +253,7 @@ all: clean: $(MAKE) -C ../../ M=$(CURDIR) clean - @rm -f *~ + @find $(CURDIR) -type f -name '*~' -delete $(LIBBPF): FORCE # Fix up variables inherited from Kbuild that tools/ build system won't like @@ -305,7 +304,7 @@ $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ - -I$(srctree)/tools/lib/bpf/ \ + -I$(srctree)/tools/lib/ \ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c index 68c84da065b1..a86a19d5f033 100644 --- a/samples/bpf/cpustat_kern.c +++ b/samples/bpf/cpustat_kern.c @@ -3,7 +3,7 @@ #include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* * The CPU number, cstate number and pstate number are based diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index 2d4b717726b6..d5992f787232 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -14,7 +14,7 @@ #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_insn.h" #include "sock_example.h" diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 829b68d87687..7d7153777678 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -50,8 +50,8 @@ #include "cgroup_helpers.h" #include "hbm.h" #include "bpf_util.h" -#include "bpf.h" -#include "libbpf.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> bool outFlag = true; int minRate = 1000; /* cgroup rate limit in Mbps */ diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index 4edaf47876ca..e00f26f6afba 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -22,8 +22,8 @@ #include <uapi/linux/pkt_cls.h> #include <net/ipv6.h> #include <net/inet_ecn.h> -#include "bpf_endian.h" -#include "bpf_helpers.h" +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> #include "hbm.h" #define DROP_PKT 0 diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c index f281df7e0089..3a91b4c1989a 100644 --- a/samples/bpf/ibumad_kern.c +++ b/samples/bpf/ibumad_kern.c @@ -13,7 +13,7 @@ #define KBUILD_MODNAME "ibumad_count_pkts_by_class" #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct bpf_map_def SEC("maps") read_count = { diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c index cb5a8f994849..fa06eef31a84 100644 --- a/samples/bpf/ibumad_user.c +++ b/samples/bpf/ibumad_user.c @@ -25,7 +25,7 @@ #include "bpf_load.h" #include "bpf_util.h" -#include "libbpf.h" +#include <bpf/libbpf.h> static void dump_counts(int fd) { diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c index 18fa088473cd..ca9c2e4e69aa 100644 --- a/samples/bpf/lathist_kern.c +++ b/samples/bpf/lathist_kern.c @@ -8,7 +8,7 @@ #include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define MAX_ENTRIES 20 #define MAX_CPU 4 diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c index df75383280f9..9ed63e10e170 100644 --- a/samples/bpf/lwt_len_hist_kern.c +++ b/samples/bpf/lwt_len_hist_kern.c @@ -14,7 +14,7 @@ #include <uapi/linux/if_ether.h> #include <uapi/linux/ip.h> #include <uapi/linux/in.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> # define printk(fmt, ...) \ ({ \ diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 281bcdaee58e..12e91ae64d4d 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -8,9 +8,9 @@ #include <linux/netdevice.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" -#include "bpf_tracing.h" +#include <bpf/bpf_tracing.h> #define MAX_ENTRIES 1000 #define MAX_NR_CPUS 1024 diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index 9cb5207a692f..c4ec10dbfc3b 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -5,8 +5,8 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include <uapi/linux/ptrace.h> #include <uapi/linux/perf_event.h> #include <linux/version.h> diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index fc8767d001f6..51c7da5341cc 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -12,7 +12,7 @@ #include <assert.h> #include <stdbool.h> #include <sys/resource.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_load.h" #include "trace_helpers.h" diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c index ef5892377beb..c6f65f90a097 100644 --- a/samples/bpf/parse_ldabs.c +++ b/samples/bpf/parse_ldabs.c @@ -11,7 +11,7 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" #define DEFAULT_PKTGEN_UDP_PORT 9 diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c index 10af53d33cc2..4a486cb1e0df 100644 --- a/samples/bpf/parse_simple.c +++ b/samples/bpf/parse_simple.c @@ -12,7 +12,7 @@ #include <linux/udp.h> #include <uapi/linux/bpf.h> #include <net/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define DEFAULT_PKTGEN_UDP_PORT 9 diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c index 0b6f22feb2c9..d8623846e810 100644 --- a/samples/bpf/parse_varlen.c +++ b/samples/bpf/parse_varlen.c @@ -14,7 +14,7 @@ #include <linux/udp.h> #include <uapi/linux/bpf.h> #include <net/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define DEFAULT_PKTGEN_UDP_PORT 9 #define DEBUG 0 diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c index 4a190893894f..e504dc308371 100644 --- a/samples/bpf/sampleip_kern.c +++ b/samples/bpf/sampleip_kern.c @@ -8,8 +8,8 @@ #include <linux/ptrace.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #define MAX_IPS 8192 diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 6b5dc26d9701..b0f115f938bc 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -15,7 +15,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <sys/ioctl.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_load.h" #include "perf-sys.h" #include "trace_helpers.h" diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c index 05dcdf8a4baa..6d0ac7569d6f 100644 --- a/samples/bpf/sock_flags_kern.c +++ b/samples/bpf/sock_flags_kern.c @@ -3,7 +3,7 @@ #include <linux/net.h> #include <uapi/linux/in.h> #include <uapi/linux/in6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("cgroup/sock1") int bpf_prog1(struct bpf_sock *sk) diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c index 2408dbfb7a21..431c956460ad 100644 --- a/samples/bpf/sockex1_kern.c +++ b/samples/bpf/sockex1_kern.c @@ -2,7 +2,7 @@ #include <uapi/linux/if_ether.h> #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" struct { diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index a219442afbee..3c83722877dc 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -3,7 +3,7 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index a7bcd03bf529..a41dd520bc53 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -1,5 +1,5 @@ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 6de383ddd08b..af925a5afd1d 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -3,7 +3,7 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 151dd842ecc0..36d4dac23549 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -5,7 +5,7 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c index 6e9478aa2938..f508af357251 100644 --- a/samples/bpf/spintest_kern.c +++ b/samples/bpf/spintest_kern.c @@ -9,8 +9,8 @@ #include <linux/version.h> #include <uapi/linux/bpf.h> #include <uapi/linux/perf_event.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_HASH, diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 2556af2d9b3e..fb430ea2ef51 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -5,7 +5,7 @@ #include <string.h> #include <assert.h> #include <sys/resource.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_load.h" #include "trace_helpers.h" diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c index 630ce8c4d5a2..5a62b03b1f88 100644 --- a/samples/bpf/syscall_tp_kern.c +++ b/samples/bpf/syscall_tp_kern.c @@ -2,7 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct syscalls_enter_open_args { unsigned long long unused; diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c index fb56fc2a3e5d..278ade5427c8 100644 --- a/samples/bpf/task_fd_query_kern.c +++ b/samples/bpf/task_fd_query_kern.c @@ -2,7 +2,7 @@ #include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("kprobe/blk_mq_start_request") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index 4c31b305e6ef..ff2e9c1c7266 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -15,7 +15,7 @@ #include <sys/stat.h> #include <linux/perf_event.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_load.h" #include "bpf_util.h" #include "perf-sys.h" diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c index 7ef2a12b25b2..fd2fa0004330 100644 --- a/samples/bpf/tc_l2_redirect_kern.c +++ b/samples/bpf/tc_l2_redirect_kern.c @@ -15,7 +15,7 @@ #include <uapi/linux/filter.h> #include <uapi/linux/pkt_cls.h> #include <net/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define _htonl __builtin_bswap32 diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index ff43341bdfce..e9356130f84e 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -7,7 +7,7 @@ #include <uapi/linux/tcp.h> #include <uapi/linux/filter.h> #include <uapi/linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" /* compiler workaround */ diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c index 9dba48c2b920..8dfe09a92fec 100644 --- a/samples/bpf/tcp_basertt_kern.c +++ b/samples/bpf/tcp_basertt_kern.c @@ -16,8 +16,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index af8486f33771..6a80d08952ad 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -17,8 +17,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index 26c0fd091f3c..e88bd9ab0695 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -17,8 +17,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index 6d4dc4c7dd1e..2311fc9dde85 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -16,8 +16,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c index 8557913106a0..e80d3afd24bd 100644 --- a/samples/bpf/tcp_dumpstats_kern.c +++ b/samples/bpf/tcp_dumpstats_kern.c @@ -4,8 +4,8 @@ */ #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define INTERVAL 1000000000ULL diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index da61d53378b3..d1444557358e 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -17,8 +17,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index d011e38b80d2..223d9c23b10c 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -16,8 +16,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 720d1950322d..d58004eef124 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -16,8 +16,8 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c index 369faca70a15..953fedc79ce1 100644 --- a/samples/bpf/tcp_tos_reflect_kern.c +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -15,8 +15,8 @@ #include <uapi/linux/ipv6.h> #include <uapi/linux/in.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c index 1547b36a7b7b..4dd532a312b9 100644 --- a/samples/bpf/test_cgrp2_tc_kern.c +++ b/samples/bpf/test_cgrp2_tc_kern.c @@ -10,7 +10,7 @@ #include <uapi/linux/ipv6.h> #include <uapi/linux/pkt_cls.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* copy of 'struct ethhdr' without __packed */ struct eth_hdr { diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c index 86b28d7d6c99..6dc4f41bb6cb 100644 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -8,7 +8,7 @@ #include <linux/ptrace.h> #include <uapi/linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include <uapi/linux/utsname.h> struct bpf_map_def SEC("maps") cgroup_map = { diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c index bacc8013436b..1b568575ad11 100644 --- a/samples/bpf/test_lwt_bpf.c +++ b/samples/bpf/test_lwt_bpf.c @@ -20,7 +20,7 @@ #include <linux/udp.h> #include <linux/icmpv6.h> #include <linux/if_ether.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include <string.h> # define printk(fmt, ...) \ diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index 32ee752f19df..6cee61e8ce9b 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -10,9 +10,9 @@ #include <linux/version.h> #include <uapi/linux/bpf.h> #include <uapi/linux/in6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" -#include "bpf_tracing.h" +#include <bpf/bpf_tracing.h> #define MAX_NR_PORTS 65536 diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c index 8d2518e68db9..8b811c29dc79 100644 --- a/samples/bpf/test_overhead_kprobe_kern.c +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -7,8 +7,8 @@ #include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c index d2af8bc1c805..8763181a32f3 100644 --- a/samples/bpf/test_overhead_raw_tp_kern.c +++ b/samples/bpf/test_overhead_raw_tp_kern.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("raw_tracepoint/task_rename") int prog(struct bpf_raw_tracepoint_args *ctx) diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c index 38f5c0b9da9f..eaa32693f8fc 100644 --- a/samples/bpf/test_overhead_tp_kern.c +++ b/samples/bpf/test_overhead_tp_kern.c @@ -5,7 +5,7 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* from /sys/kernel/debug/tracing/events/task/task_rename/format */ struct task_rename { diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c index b7c48f37132c..f033f36a13a3 100644 --- a/samples/bpf/test_probe_write_user_kern.c +++ b/samples/bpf/test_probe_write_user_kern.c @@ -8,8 +8,8 @@ #include <linux/netdevice.h> #include <uapi/linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct bpf_map_def SEC("maps") dnat_map = { .type = BPF_MAP_TYPE_HASH, diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 8dc18d233a27..da1d69e20645 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -9,8 +9,8 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> #include <uapi/linux/perf_event.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct key_t { char comm[TASK_COMM_LEN]; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 749a50f2f9f3..356171bc392b 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -15,7 +15,7 @@ #include <assert.h> #include <errno.h> #include <sys/resource.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_load.h" #include "perf-sys.h" #include "trace_helpers.h" diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index 9b96f4fb8cea..1d7d422cae6f 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -1,7 +1,7 @@ #include <linux/ptrace.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index 8ee47699a870..60a17dd05345 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -15,7 +15,7 @@ #include <sys/mman.h> #include <time.h> #include <signal.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include "bpf_load.h" #include "perf-sys.h" diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index 1a15f6605129..8e2610e14475 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -8,8 +8,8 @@ #include <linux/netdevice.h> #include <uapi/linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index d70b3ea79ea7..d865bb309bcb 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -8,8 +8,8 @@ #include <linux/netdevice.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_HASH, diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index 9af546bebfa9..fe21c14feb8d 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -8,8 +8,8 @@ #include <linux/netdevice.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_HASH, diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index 2a02cbe9d9a1..b1bb9df88f8e 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -7,8 +7,8 @@ #include <linux/ptrace.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct pair { u64 val; diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index b3557b21a8fe..481790fde864 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -10,8 +10,8 @@ #include <uapi/linux/seccomp.h> #include <uapi/linux/unistd.h> #include "syscall_nrs.h" -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index 46c557afac73..96c234efa852 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -1,7 +1,7 @@ #include <linux/ptrace.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct bpf_map_def SEC("maps") counters = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c index 1ab308a43e0f..c5a92df8ac31 100644 --- a/samples/bpf/tracex7_kern.c +++ b/samples/bpf/tracex7_kern.c @@ -1,7 +1,7 @@ #include <uapi/linux/ptrace.h> #include <uapi/linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("kprobe/open_ctree") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c index db6870aee42c..34b64394ed9c 100644 --- a/samples/bpf/xdp1_kern.c +++ b/samples/bpf/xdp1_kern.c @@ -12,7 +12,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 38a8852cb57f..c447ad9e3a1d 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -15,8 +15,8 @@ #include <net/if.h> #include "bpf_util.h" -#include "bpf.h" -#include "libbpf.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> static int ifindex; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c index c74b52c6d945..c787f4b49646 100644 --- a/samples/bpf/xdp2_kern.c +++ b/samples/bpf/xdp2_kern.c @@ -12,7 +12,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 0c12048ac79f..9b783316e860 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -12,7 +12,7 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* * This struct is stored in the XDP 'data_meta' area, which is located diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c index 0f707e0fb375..ffdd548627f0 100644 --- a/samples/bpf/xdp_adjust_tail_kern.c +++ b/samples/bpf/xdp_adjust_tail_kern.c @@ -18,7 +18,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/icmp.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define DEFAULT_TTL 64 #define MAX_PCKT_SIZE 600 diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index 008789eb6ada..ba482dc3da33 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -19,8 +19,8 @@ #include <netinet/ether.h> #include <unistd.h> #include <time.h> -#include "bpf.h" -#include "libbpf.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #define STATS_INTERVAL_S 2U #define MAX_PCKT_SIZE 600 diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index d013029aeaa2..54c099cbd639 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -19,7 +19,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index c30f9acfdb84..74a4583d0d86 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -24,7 +24,7 @@ #include <fcntl.h> #include <libgen.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include <bpf/bpf.h> static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index ad10fe700d7d..3d33cca2d48a 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -4,7 +4,7 @@ * XDP monitor tool, based on tracepoints */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct bpf_map_def SEC("maps") redirect_err_cnt = { .type = BPF_MAP_TYPE_PERCPU_ARRAY, @@ -222,14 +222,12 @@ struct bpf_map_def SEC("maps") devmap_xmit_cnt = { */ struct devmap_xmit_ctx { u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; + int from_ifindex; // offset:8; size:4; signed:1; u32 act; // offset:12; size:4; signed:0; - u32 map_index; // offset:16; size:4; signed:0; + int to_ifindex; // offset:16; size:4; signed:1; int drops; // offset:20; size:4; signed:1; int sent; // offset:24; size:4; signed:1; - int from_ifindex; // offset:28; size:4; signed:1; - int to_ifindex; // offset:32; size:4; signed:1; - int err; // offset:36; size:4; signed:1; + int err; // offset:28; size:4; signed:1; }; SEC("tracepoint/xdp/xdp_devmap_xmit") diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c index cfcc31e51197..313a8fe6d125 100644 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -12,7 +12,7 @@ #include <uapi/linux/udp.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "hash_func01.h" #define MAX_CPUS 64 /* WARNING - sync with _user.c */ diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 79a2fb7d16cb..15bdf047a222 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -30,7 +30,7 @@ static const char *__doc__ = #define MAX_PROG 6 #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include "bpf_util.h" diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c index 1f0b7d05abb2..d26ec3aa215e 100644 --- a/samples/bpf/xdp_redirect_kern.c +++ b/samples/bpf/xdp_redirect_kern.c @@ -17,7 +17,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_ARRAY); diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c index 4631b484c432..6489352ab7a4 100644 --- a/samples/bpf/xdp_redirect_map_kern.c +++ b/samples/bpf/xdp_redirect_map_kern.c @@ -17,7 +17,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_DEVMAP); diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index cc840661faab..35e16dee613e 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -17,7 +17,7 @@ #include "bpf_util.h" #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> static int ifindex_in; static int ifindex_out; diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 71dff8e3382a..9ca2bf457cda 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -17,7 +17,7 @@ #include "bpf_util.h" #include <bpf/bpf.h> -#include "libbpf.h" +#include <bpf/libbpf.h> static int ifindex_in; static int ifindex_out; diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c index bf11efc8e949..b37ca2b13063 100644 --- a/samples/bpf/xdp_router_ipv4_kern.c +++ b/samples/bpf/xdp_router_ipv4_kern.c @@ -12,7 +12,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include <linux/slab.h> #include <net/ip_fib.h> diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index fef286c5add2..c2da1b51ff95 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -21,7 +21,7 @@ #include <sys/ioctl.h> #include <sys/syscall.h> #include "bpf_util.h" -#include "libbpf.h" +#include <bpf/libbpf.h> #include <sys/resource.h> #include <libgen.h> diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c index 272d0f82a6b5..5e7459f9bf3e 100644 --- a/samples/bpf/xdp_rxq_info_kern.c +++ b/samples/bpf/xdp_rxq_info_kern.c @@ -6,7 +6,7 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/in.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* Config setup from with userspace * diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index fc4983fd6959..4fe47502ebed 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -22,8 +22,8 @@ static const char *__doc__ = " XDP RX-queue info extract example\n\n" #include <arpa/inet.h> #include <linux/if_link.h> -#include "bpf.h" -#include "libbpf.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "bpf_util.h" static int ifindex = -1; diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index 6c7c7e0aaeda..33377289e2a8 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -2,7 +2,7 @@ #include <linux/ptrace.h> #include <linux/version.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define SAMPLE_SIZE 64ul #define MAX_CPUS 128 diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 8c1af1b7372d..991ef6f0880b 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -10,7 +10,7 @@ #include <sys/sysinfo.h> #include <sys/ioctl.h> #include <signal.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include <bpf/bpf.h> #include <sys/resource.h> #include <libgen.h> diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c index 6db450a5c1ca..575d57e4b8d6 100644 --- a/samples/bpf/xdp_tx_iptunnel_kern.c +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -16,7 +16,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "xdp_tx_iptunnel_common.h" struct { diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 5f33b5530032..a419bee151a8 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -15,7 +15,7 @@ #include <netinet/ether.h> #include <unistd.h> #include <time.h> -#include "libbpf.h" +#include <bpf/libbpf.h> #include <bpf/bpf.h> #include "bpf_util.h" #include "xdp_tx_iptunnel_common.h" diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c index a06177c262cd..05430484375c 100644 --- a/samples/bpf/xdpsock_kern.c +++ b/samples/bpf/xdpsock_kern.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "xdpsock.h" /* This XDP program is only needed for the XDP_SHARED_UMEM mode. diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d74c4c83fc93..0b5acd722306 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -30,10 +30,10 @@ #include <time.h> #include <unistd.h> -#include "libbpf.h" -#include "xsk.h" -#include "xdpsock.h" +#include <bpf/libbpf.h> +#include <bpf/xsk.h> #include <bpf/bpf.h> +#include "xdpsock.h" #ifndef SOL_XDP #define SOL_XDP 283 diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c index 6d0125ca8af7..20291ec6489f 100644 --- a/samples/seccomp/user-trap.c +++ b/samples/seccomp/user-trap.c @@ -298,14 +298,14 @@ int main(void) req = malloc(sizes.seccomp_notif); if (!req) goto out_close; - memset(req, 0, sizeof(*req)); resp = malloc(sizes.seccomp_notif_resp); if (!resp) goto out_req; - memset(resp, 0, sizeof(*resp)); + memset(resp, 0, sizes.seccomp_notif_resp); while (1) { + memset(req, 0, sizes.seccomp_notif); if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { perror("ioctl recv"); goto out_resp; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7548569e8076..90baf7d70911 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -158,8 +158,6 @@ class HeaderParser(object): break self.reader.close() - print('Parsed description of %d helper function(s)' % len(self.helpers), - file=sys.stderr) ############################################################################### diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index d33de0b9f4f5..e3569543bdac 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig @@ -14,8 +14,8 @@ config HAVE_GCC_PLUGINS An arch should select this symbol if it supports building with GCC plugins. -config GCC_PLUGINS - bool +menuconfig GCC_PLUGINS + bool "GCC plugins" depends on HAVE_GCC_PLUGINS depends on PLUGIN_HOSTCC != "" default y @@ -25,8 +25,7 @@ config GCC_PLUGINS See Documentation/core-api/gcc-plugins.rst for details. -menu "GCC plugins" - depends on GCC_PLUGINS +if GCC_PLUGINS config GCC_PLUGIN_CYC_COMPLEXITY bool "Compute the cyclomatic complexity of a function" if EXPERT @@ -113,4 +112,4 @@ config GCC_PLUGIN_ARM_SSP_PER_TASK bool depends on GCC_PLUGINS && ARM -endmenu +endif diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 436379940356..408b5c0b99b1 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -108,13 +108,13 @@ gen_btf() local bin_arch if ! [ -x "$(command -v ${PAHOLE})" ]; then - info "BTF" "${1}: pahole (${PAHOLE}) is not available" + echo >&2 "BTF: ${1}: pahole (${PAHOLE}) is not available" return 1 fi pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') if [ "${pahole_ver}" -lt "113" ]; then - info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" + echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" return 1 fi diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 7c230016b08d..357dc56bcf30 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -136,7 +136,7 @@ mkdir -p debian/source/ echo "1.0" > debian/source/format echo $debarch > debian/arch -extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev)" +extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev:native)" extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" # Generate a simple changelog template diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 612268eabef4..7225107a9aaf 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -38,6 +38,10 @@ #define R_AARCH64_ABS64 257 #endif +#define R_ARM_PC24 1 +#define R_ARM_THM_CALL 10 +#define R_ARM_CALL 28 + static int fd_map; /* File descriptor for file being modified. */ static int mmap_failed; /* Boolean flag. */ static char gpfx; /* prefix for global symbol name (sometimes '_') */ @@ -418,6 +422,18 @@ static char const *already_has_rel_mcount = "success"; /* our work here is done! #define RECORD_MCOUNT_64 #include "recordmcount.h" +static int arm_is_fake_mcount(Elf32_Rel const *rp) +{ + switch (ELF32_R_TYPE(w(rp->r_info))) { + case R_ARM_THM_CALL: + case R_ARM_CALL: + case R_ARM_PC24: + return 0; + } + + return 1; +} + /* 64-bit EM_MIPS has weird ELF64_Rela.r_info. * http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf * We interpret Table 29 Relocation Operation (Elf64_Rel, Elf64_Rela) [p.40] @@ -523,6 +539,7 @@ static int do_file(char const *const fname) altmcount = "__gnu_mcount_nc"; make_nop = make_nop_arm; rel_type_nop = R_ARM_NONE; + is_fake_mcount32 = arm_is_fake_mcount; gpfx = 0; break; case EM_AARCH64: diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 09996f2552ee..47aff8700547 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -623,7 +623,7 @@ static __poll_t ns_revision_poll(struct file *file, poll_table *pt) void __aa_bump_ns_revision(struct aa_ns *ns) { - ns->revision++; + WRITE_ONCE(ns->revision, ns->revision + 1); wake_up_interruptible(&ns->wait); } diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 9be7ccb8379e..6ceb74e0f789 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -317,6 +317,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, if (!bprm || !profile->xattr_count) return 0; + might_sleep(); /* transition from exec match to xattr set */ state = aa_dfa_null_transition(profile->xmatch, state); @@ -361,10 +362,11 @@ out: } /** - * __attach_match_ - find an attachment match + * find_attach - do attachment search for unconfined processes * @bprm - binprm structure of transitioning task - * @name - to match against (NOT NULL) + * @ns: the current namespace (NOT NULL) * @head - profile list to walk (NOT NULL) + * @name - to match against (NOT NULL) * @info - info message if there was an error (NOT NULL) * * Do a linear search on the profiles in the list. There is a matching @@ -374,12 +376,11 @@ out: * * Requires: @head not be shared or have appropriate locks held * - * Returns: profile or NULL if no match found + * Returns: label or NULL if no match found */ -static struct aa_profile *__attach_match(const struct linux_binprm *bprm, - const char *name, - struct list_head *head, - const char **info) +static struct aa_label *find_attach(const struct linux_binprm *bprm, + struct aa_ns *ns, struct list_head *head, + const char *name, const char **info) { int candidate_len = 0, candidate_xattrs = 0; bool conflict = false; @@ -388,6 +389,8 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, AA_BUG(!name); AA_BUG(!head); + rcu_read_lock(); +restart: list_for_each_entry_rcu(profile, head, base.list) { if (profile->label.flags & FLAG_NULL && &profile->label == ns_unconfined(profile->ns)) @@ -413,16 +416,32 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, perm = dfa_user_allow(profile->xmatch, state); /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { - int ret; + int ret = 0; if (count < candidate_len) continue; - ret = aa_xattrs_match(bprm, profile, state); - /* Fail matching if the xattrs don't match */ - if (ret < 0) - continue; - + if (bprm && profile->xattr_count) { + long rev = READ_ONCE(ns->revision); + + if (!aa_get_profile_not0(profile)) + goto restart; + rcu_read_unlock(); + ret = aa_xattrs_match(bprm, profile, + state); + rcu_read_lock(); + aa_put_profile(profile); + if (rev != + READ_ONCE(ns->revision)) + /* policy changed */ + goto restart; + /* + * Fail matching if the xattrs don't + * match + */ + if (ret < 0) + continue; + } /* * TODO: allow for more flexible best match * @@ -445,43 +464,28 @@ static struct aa_profile *__attach_match(const struct linux_binprm *bprm, candidate_xattrs = ret; conflict = false; } - } else if (!strcmp(profile->base.name, name)) + } else if (!strcmp(profile->base.name, name)) { /* * old exact non-re match, without conditionals such * as xattrs. no more searching required */ - return profile; + candidate = profile; + goto out; + } } - if (conflict) { - *info = "conflicting profile attachments"; + if (!candidate || conflict) { + if (conflict) + *info = "conflicting profile attachments"; + rcu_read_unlock(); return NULL; } - return candidate; -} - -/** - * find_attach - do attachment search for unconfined processes - * @bprm - binprm structure of transitioning task - * @ns: the current namespace (NOT NULL) - * @list: list to search (NOT NULL) - * @name: the executable name to match against (NOT NULL) - * @info: info message if there was an error - * - * Returns: label or NULL if no match found - */ -static struct aa_label *find_attach(const struct linux_binprm *bprm, - struct aa_ns *ns, struct list_head *list, - const char *name, const char **info) -{ - struct aa_profile *profile; - - rcu_read_lock(); - profile = aa_get_profile(__attach_match(bprm, name, list, info)); +out: + candidate = aa_get_newest_profile(candidate); rcu_read_unlock(); - return profile ? &profile->label : NULL; + return &candidate->label; } static const char *next_name(int xtype, const char *name) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index fe2ebe5e865e..f1caf3674e1c 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -618,8 +618,7 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, fctx = file_ctx(file); rcu_read_lock(); - flabel = aa_get_newest_label(rcu_dereference(fctx->label)); - rcu_read_unlock(); + flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred @@ -631,9 +630,13 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, */ denied = request & ~fctx->allow; if (unconfined(label) || unconfined(flabel) || - (!denied && aa_label_is_subset(flabel, label))) + (!denied && aa_label_is_subset(flabel, label))) { + rcu_read_unlock(); goto done; + } + flabel = aa_get_newest_label(flabel); + rcu_read_unlock(); /* TODO: label cross check */ if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry)) @@ -643,8 +646,9 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, label, flabel, file, request, denied); -done: aa_put_label(flabel); + +done: return error; } diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 4ed6688f9d40..e0828ee7a345 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -442,7 +442,7 @@ int aa_bind_mount(struct aa_label *label, const struct path *path, buffer = aa_get_buffer(false); old_buffer = aa_get_buffer(false); error = -ENOMEM; - if (!buffer || old_buffer) + if (!buffer || !old_buffer) goto out; error = fn_for_each_confined(label, profile, diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 03104830c913..269f2f53c0b1 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -1125,8 +1125,8 @@ ssize_t aa_remove_profiles(struct aa_ns *policy_ns, struct aa_label *subj, if (!name) { /* remove namespace - can only happen if fqname[0] == ':' */ mutex_lock_nested(&ns->parent->lock, ns->level); - __aa_remove_ns(ns); __aa_bump_ns_revision(ns); + __aa_remove_ns(ns); mutex_unlock(&ns->parent->lock); } else { /* remove profile */ @@ -1138,9 +1138,9 @@ ssize_t aa_remove_profiles(struct aa_ns *policy_ns, struct aa_label *subj, goto fail_ns_lock; } name = profile->base.hname; + __aa_bump_ns_revision(ns); __remove_profile(profile); __aa_labelset_update_subtree(ns); - __aa_bump_ns_revision(ns); mutex_unlock(&ns->lock); } diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index c97fdae8f71b..b69231918686 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -85,6 +85,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -168,7 +171,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWLINKPROP + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; diff --git a/sound/core/seq/seq_timer.c b/sound/core/seq/seq_timer.c index 63dc7bdb622d..be59b59c9be4 100644 --- a/sound/core/seq/seq_timer.c +++ b/sound/core/seq/seq_timer.c @@ -471,15 +471,19 @@ void snd_seq_info_timer_read(struct snd_info_entry *entry, q = queueptr(idx); if (q == NULL) continue; - if ((tmr = q->timer) == NULL || - (ti = tmr->timeri) == NULL) { - queuefree(q); - continue; - } + mutex_lock(&q->timer_mutex); + tmr = q->timer; + if (!tmr) + goto unlock; + ti = tmr->timeri; + if (!ti) + goto unlock; snd_iprintf(buffer, "Timer for queue %i : %s\n", q->queue, ti->timer->name); resolution = snd_timer_resolution(ti) * tmr->ticks; snd_iprintf(buffer, " Period time : %lu.%09lu\n", resolution / 1000000000, resolution % 1000000000); snd_iprintf(buffer, " Skew : %u / %u\n", tmr->skew, tmr->skew_base); +unlock: + mutex_unlock(&q->timer_mutex); queuefree(q); } } diff --git a/sound/firewire/dice/dice-extension.c b/sound/firewire/dice/dice-extension.c index a63fcbc875ad..02f4a8318e38 100644 --- a/sound/firewire/dice/dice-extension.c +++ b/sound/firewire/dice/dice-extension.c @@ -159,8 +159,11 @@ int snd_dice_detect_extension_formats(struct snd_dice *dice) int j; for (j = i + 1; j < 9; ++j) { - if (pointers[i * 2] == pointers[j * 2]) + if (pointers[i * 2] == pointers[j * 2]) { + // Fallback to limited functionality. + err = -ENXIO; goto end; + } } } diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c index e80bb84c43f6..f823a2ab3544 100644 --- a/sound/firewire/tascam/amdtp-tascam.c +++ b/sound/firewire/tascam/amdtp-tascam.c @@ -157,14 +157,15 @@ static void read_status_messages(struct amdtp_stream *s, if ((before ^ after) & mask) { struct snd_firewire_tascam_change *entry = &tscm->queue[tscm->push_pos]; + unsigned long flag; - spin_lock_irq(&tscm->lock); + spin_lock_irqsave(&tscm->lock, flag); entry->index = index; entry->before = before; entry->after = after; if (++tscm->push_pos >= SND_TSCM_QUEUE_COUNT) tscm->push_pos = 0; - spin_unlock_irq(&tscm->lock); + spin_unlock_irqrestore(&tscm->lock, flag); wake_up(&tscm->hwdep_wait); } diff --git a/sound/hda/hdac_regmap.c b/sound/hda/hdac_regmap.c index 906b1e20bae0..286361ecd640 100644 --- a/sound/hda/hdac_regmap.c +++ b/sound/hda/hdac_regmap.c @@ -363,7 +363,6 @@ static const struct regmap_config hda_regmap_cfg = { .reg_write = hda_reg_write, .use_single_read = true, .use_single_write = true, - .disable_locking = true, }; /** diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index b856b89378ac..8ef223aa1e37 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -125,7 +125,7 @@ static char *patch[SNDRV_CARDS]; static bool beep_mode[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS-1)] = CONFIG_SND_HDA_INPUT_BEEP_MODE}; #endif -static bool dsp_driver = 1; +static bool dmic_detect = 1; module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for Intel HD audio interface."); @@ -160,9 +160,10 @@ module_param_array(beep_mode, bool, NULL, 0444); MODULE_PARM_DESC(beep_mode, "Select HDA Beep registration mode " "(0=off, 1=on) (default=1)."); #endif -module_param(dsp_driver, bool, 0444); -MODULE_PARM_DESC(dsp_driver, "Allow DSP driver selection (bypass this driver) " - "(0=off, 1=on) (default=1)"); +module_param(dmic_detect, bool, 0444); +MODULE_PARM_DESC(dmic_detect, "Allow DSP driver selection (bypass this driver) " + "(0=off, 1=on) (default=1); " + "deprecated, use snd-intel-dspcfg.dsp_driver option instead"); #ifdef CONFIG_PM static int param_set_xint(const char *val, const struct kernel_param *kp); @@ -282,12 +283,13 @@ enum { /* quirks for old Intel chipsets */ #define AZX_DCAPS_INTEL_ICH \ - (AZX_DCAPS_OLD_SSYNC | AZX_DCAPS_NO_ALIGN_BUFSIZE) + (AZX_DCAPS_OLD_SSYNC | AZX_DCAPS_NO_ALIGN_BUFSIZE |\ + AZX_DCAPS_SYNC_WRITE) /* quirks for Intel PCH */ #define AZX_DCAPS_INTEL_PCH_BASE \ (AZX_DCAPS_NO_ALIGN_BUFSIZE | AZX_DCAPS_COUNT_LPIB_DELAY |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) /* PCH up to IVB; no runtime PM; bind with i915 gfx */ #define AZX_DCAPS_INTEL_PCH_NOPM \ @@ -302,13 +304,13 @@ enum { #define AZX_DCAPS_INTEL_HASWELL \ (/*AZX_DCAPS_ALIGN_BUFSIZE |*/ AZX_DCAPS_COUNT_LPIB_DELAY |\ AZX_DCAPS_PM_RUNTIME | AZX_DCAPS_I915_COMPONENT |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) /* Broadwell HDMI can't use position buffer reliably, force to use LPIB */ #define AZX_DCAPS_INTEL_BROADWELL \ (/*AZX_DCAPS_ALIGN_BUFSIZE |*/ AZX_DCAPS_POSFIX_LPIB |\ AZX_DCAPS_PM_RUNTIME | AZX_DCAPS_I915_COMPONENT |\ - AZX_DCAPS_SNOOP_TYPE(SCH)) + AZX_DCAPS_SNOOP_TYPE(SCH) | AZX_DCAPS_SYNC_WRITE) #define AZX_DCAPS_INTEL_BAYTRAIL \ (AZX_DCAPS_INTEL_PCH_BASE | AZX_DCAPS_I915_COMPONENT) @@ -1410,7 +1412,17 @@ static bool atpx_present(void) acpi_handle dhandle, atpx_handle; acpi_status status; - while ((pdev = pci_get_class(PCI_BASE_CLASS_DISPLAY << 16, pdev)) != NULL) { + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { + dhandle = ACPI_HANDLE(&pdev->dev); + if (dhandle) { + status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); + if (!ACPI_FAILURE(status)) { + pci_dev_put(pdev); + return true; + } + } + } + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != NULL) { dhandle = ACPI_HANDLE(&pdev->dev); if (dhandle) { status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); @@ -2088,11 +2100,13 @@ static int azx_probe(struct pci_dev *pci, /* * stop probe if another Intel's DSP driver should be activated */ - if (dsp_driver) { + if (dmic_detect) { err = snd_intel_dsp_driver_probe(pci); if (err != SND_INTEL_DSP_DRIVER_ANY && err != SND_INTEL_DSP_DRIVER_LEGACY) return -ENODEV; + } else { + dev_warn(&pci->dev, "dmic_detect option is deprecated, pass snd-intel-dspcfg.dsp_driver=1 option instead\n"); } err = snd_card_new(&pci->dev, index[dev], id[dev], THIS_MODULE, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index dbfafee97931..f2ea3528bfb1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -412,6 +412,7 @@ static void alc_fill_eapd_coef(struct hda_codec *codec) case 0x10ec0672: alc_update_coef_idx(codec, 0xd, 0, 1<<14); /* EAPD Ctrl */ break; + case 0x10ec0222: case 0x10ec0623: alc_update_coef_idx(codec, 0x19, 1<<13, 0); break; @@ -430,6 +431,7 @@ static void alc_fill_eapd_coef(struct hda_codec *codec) break; case 0x10ec0899: case 0x10ec0900: + case 0x10ec0b00: case 0x10ec1168: case 0x10ec1220: alc_update_coef_idx(codec, 0x7, 1<<1, 0); @@ -501,6 +503,7 @@ static void alc_shutup_pins(struct hda_codec *codec) struct alc_spec *spec = codec->spec; switch (codec->core.vendor_id) { + case 0x10ec0283: case 0x10ec0286: case 0x10ec0288: case 0x10ec0298: @@ -2525,6 +2528,7 @@ static int patch_alc882(struct hda_codec *codec) case 0x10ec0882: case 0x10ec0885: case 0x10ec0900: + case 0x10ec0b00: case 0x10ec1220: break; default: @@ -5904,9 +5908,12 @@ enum { ALC256_FIXUP_ASUS_HEADSET_MIC, ALC256_FIXUP_ASUS_MIC_NO_PRESENCE, ALC299_FIXUP_PREDATOR_SPK, - ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, - ALC294_FIXUP_ASUS_INTSPK_GPIO, + ALC289_FIXUP_DELL_SPK2, + ALC289_FIXUP_DUAL_SPK, + ALC294_FIXUP_SPK2_TO_DAC1, + ALC294_FIXUP_ASUS_DUAL_SPK, + }; static const struct hda_fixup alc269_fixups[] = { @@ -6981,33 +6988,45 @@ static const struct hda_fixup alc269_fixups[] = { { } } }, - [ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC] = { + [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { - { 0x14, 0x411111f0 }, /* disable confusing internal speaker */ - { 0x19, 0x04a11150 }, /* use as headset mic, without its own jack detect */ + { 0x19, 0x04a11040 }, + { 0x21, 0x04211020 }, { } }, .chained = true, - .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC + .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE }, - [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { + [ALC289_FIXUP_DELL_SPK2] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { - { 0x19, 0x04a11040 }, - { 0x21, 0x04211020 }, + { 0x17, 0x90170130 }, /* bass spk */ { } }, .chained = true, - .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE + .chain_id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE }, - [ALC294_FIXUP_ASUS_INTSPK_GPIO] = { + [ALC289_FIXUP_DUAL_SPK] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC289_FIXUP_DELL_SPK2 + }, + [ALC294_FIXUP_SPK2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC294_FIXUP_ASUS_HEADSET_MIC + }, + [ALC294_FIXUP_ASUS_DUAL_SPK] = { .type = HDA_FIXUP_FUNC, /* The GPIO must be pulled to initialize the AMP */ .v.func = alc_fixup_gpio4, .chained = true, - .chain_id = ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC + .chain_id = ALC294_FIXUP_SPK2_TO_DAC1 }, + }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7080,6 +7099,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), + SND_PCI_QUIRK(0x1028, 0x097e, "Dell Precision", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x097d, "Dell Precision", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), @@ -7167,7 +7188,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC), - SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_INTSPK_GPIO), + SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), @@ -7239,6 +7260,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x224c, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x224d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x225d, "Thinkpad T480", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x17aa, 0x2292, "Thinkpad X1 Yoga 7th", ALC285_FIXUP_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x17aa, 0x2293, "Thinkpad X1 Carbon 7th", ALC285_FIXUP_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), @@ -9237,6 +9259,7 @@ static const struct hda_device_id snd_hda_id_realtek[] = { HDA_CODEC_ENTRY(0x10ec0892, "ALC892", patch_alc662), HDA_CODEC_ENTRY(0x10ec0899, "ALC898", patch_alc882), HDA_CODEC_ENTRY(0x10ec0900, "ALC1150", patch_alc882), + HDA_CODEC_ENTRY(0x10ec0b00, "ALCS1200A", patch_alc882), HDA_CODEC_ENTRY(0x10ec1168, "ALC1220", patch_alc882), HDA_CODEC_ENTRY(0x10ec1220, "ALC1220", patch_alc882), {} /* terminator */ diff --git a/sound/pci/ice1712/ice1724.c b/sound/pci/ice1712/ice1724.c index c80a16ee6e76..242542e23d28 100644 --- a/sound/pci/ice1712/ice1724.c +++ b/sound/pci/ice1712/ice1724.c @@ -647,6 +647,7 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, unsigned long flags; unsigned char mclk_change; unsigned int i, old_rate; + bool call_set_rate = false; if (rate > ice->hw_rates->list[ice->hw_rates->count - 1]) return -EINVAL; @@ -670,7 +671,7 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, * setting clock rate for internal clock mode */ old_rate = ice->get_rate(ice); if (force || (old_rate != rate)) - ice->set_rate(ice, rate); + call_set_rate = true; else if (rate == ice->cur_rate) { spin_unlock_irqrestore(&ice->reg_lock, flags); return 0; @@ -678,12 +679,14 @@ static int snd_vt1724_set_pro_rate(struct snd_ice1712 *ice, unsigned int rate, } ice->cur_rate = rate; + spin_unlock_irqrestore(&ice->reg_lock, flags); + + if (call_set_rate) + ice->set_rate(ice, rate); /* setting master clock */ mclk_change = ice->set_mclk(ice, rate); - spin_unlock_irqrestore(&ice->reg_lock, flags); - if (mclk_change && ice->gpio.i2s_mclk_changed) ice->gpio.i2s_mclk_changed(ice); if (ice->gpio.set_pro_rate) diff --git a/sound/soc/codecs/cros_ec_codec.c b/sound/soc/codecs/cros_ec_codec.c index 7b17f39a6a10..ce3ed056ea8b 100644 --- a/sound/soc/codecs/cros_ec_codec.c +++ b/sound/soc/codecs/cros_ec_codec.c @@ -10,6 +10,7 @@ #include <crypto/hash.h> #include <crypto/sha.h> +#include <linux/acpi.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/io.h> @@ -1047,10 +1048,17 @@ static const struct of_device_id cros_ec_codec_of_match[] = { MODULE_DEVICE_TABLE(of, cros_ec_codec_of_match); #endif +static const struct acpi_device_id cros_ec_codec_acpi_id[] = { + { "GOOG0013", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, cros_ec_codec_acpi_id); + static struct platform_driver cros_ec_codec_platform_driver = { .driver = { .name = "cros-ec-codec", .of_match_table = of_match_ptr(cros_ec_codec_of_match), + .acpi_match_table = ACPI_PTR(cros_ec_codec_acpi_id), }, .probe = cros_ec_codec_platform_probe, }; diff --git a/sound/soc/codecs/hdac_hda.c b/sound/soc/codecs/hdac_hda.c index 6803d39e09a5..43110151e928 100644 --- a/sound/soc/codecs/hdac_hda.c +++ b/sound/soc/codecs/hdac_hda.c @@ -588,7 +588,9 @@ static int hdac_hda_dev_remove(struct hdac_device *hdev) struct hdac_hda_priv *hda_pvt; hda_pvt = dev_get_drvdata(&hdev->dev); - cancel_delayed_work_sync(&hda_pvt->codec.jackpoll_work); + if (hda_pvt && hda_pvt->codec.registered) + cancel_delayed_work_sync(&hda_pvt->codec.jackpoll_work); + return 0; } diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c index f53235be77d9..1f7964beb20c 100644 --- a/sound/soc/codecs/msm8916-wcd-analog.c +++ b/sound/soc/codecs/msm8916-wcd-analog.c @@ -396,9 +396,6 @@ static int pm8916_wcd_analog_enable_micbias_int(struct snd_soc_component switch (event) { case SND_SOC_DAPM_PRE_PMU: - snd_soc_component_update_bits(component, CDC_A_MICB_1_INT_RBIAS, - MICB_1_INT_TX2_INT_RBIAS_EN_MASK, - MICB_1_INT_TX2_INT_RBIAS_EN_ENABLE); snd_soc_component_update_bits(component, reg, MICB_1_EN_PULL_DOWN_EN_MASK, 0); snd_soc_component_update_bits(component, CDC_A_MICB_1_EN, MICB_1_EN_OPA_STG2_TAIL_CURR_MASK, @@ -448,6 +445,14 @@ static int pm8916_wcd_analog_enable_micbias_int1(struct struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); struct pm8916_wcd_analog_priv *wcd = snd_soc_component_get_drvdata(component); + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, CDC_A_MICB_1_INT_RBIAS, + MICB_1_INT_TX1_INT_RBIAS_EN_MASK, + MICB_1_INT_TX1_INT_RBIAS_EN_ENABLE); + break; + } + return pm8916_wcd_analog_enable_micbias_int(component, event, w->reg, wcd->micbias1_cap_mode); } @@ -558,6 +563,11 @@ static int pm8916_wcd_analog_enable_micbias_int2(struct struct pm8916_wcd_analog_priv *wcd = snd_soc_component_get_drvdata(component); switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, CDC_A_MICB_1_INT_RBIAS, + MICB_1_INT_TX2_INT_RBIAS_EN_MASK, + MICB_1_INT_TX2_INT_RBIAS_EN_ENABLE); + break; case SND_SOC_DAPM_POST_PMU: pm8916_mbhc_configure_bias(wcd, true); break; @@ -938,10 +948,10 @@ static const struct snd_soc_dapm_widget pm8916_wcd_analog_dapm_widgets[] = { SND_SOC_DAPM_SUPPLY("MIC BIAS External1", CDC_A_MICB_1_EN, 7, 0, pm8916_wcd_analog_enable_micbias_ext1, - SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_SUPPLY("MIC BIAS External2", CDC_A_MICB_2_EN, 7, 0, pm8916_wcd_analog_enable_micbias_ext2, - SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_ADC_E("ADC1", NULL, CDC_A_TX_1_EN, 7, 0, pm8916_wcd_analog_enable_adc, diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 58b2468fb2a7..09fccacadd6b 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -586,6 +586,12 @@ static int msm8916_wcd_digital_enable_interpolator( snd_soc_component_write(component, rx_gain_reg[w->shift], snd_soc_component_read32(component, rx_gain_reg[w->shift])); break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_update_bits(component, LPASS_CDC_CLK_RX_RESET_CTL, + 1 << w->shift, 1 << w->shift); + snd_soc_component_update_bits(component, LPASS_CDC_CLK_RX_RESET_CTL, + 1 << w->shift, 0x0); + break; } return 0; } diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c index adbae1f36a8a..747ca248bf10 100644 --- a/sound/soc/codecs/rt5640.c +++ b/sound/soc/codecs/rt5640.c @@ -2432,6 +2432,13 @@ static void rt5640_disable_jack_detect(struct snd_soc_component *component) { struct rt5640_priv *rt5640 = snd_soc_component_get_drvdata(component); + /* + * soc_remove_component() force-disables jack and thus rt5640->jack + * could be NULL at the time of driver's module unloading. + */ + if (!rt5640->jack) + return; + disable_irq(rt5640->irq); rt5640_cancel_work(rt5640); diff --git a/sound/soc/fsl/fsl_audmix.c b/sound/soc/fsl/fsl_audmix.c index a1db1bce330f..5faecbeb5497 100644 --- a/sound/soc/fsl/fsl_audmix.c +++ b/sound/soc/fsl/fsl_audmix.c @@ -505,15 +505,20 @@ static int fsl_audmix_probe(struct platform_device *pdev) ARRAY_SIZE(fsl_audmix_dai)); if (ret) { dev_err(dev, "failed to register ASoC DAI\n"); - return ret; + goto err_disable_pm; } priv->pdev = platform_device_register_data(dev, mdrv, 0, NULL, 0); if (IS_ERR(priv->pdev)) { ret = PTR_ERR(priv->pdev); dev_err(dev, "failed to register platform %s: %d\n", mdrv, ret); + goto err_disable_pm; } + return 0; + +err_disable_pm: + pm_runtime_disable(dev); return ret; } @@ -521,6 +526,8 @@ static int fsl_audmix_remove(struct platform_device *pdev) { struct fsl_audmix *priv = dev_get_drvdata(&pdev->dev); + pm_runtime_disable(&pdev->dev); + if (priv->pdev) platform_device_unregister(priv->pdev); diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 46612331f5ea..54e97455d7f6 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -442,7 +442,8 @@ static const struct dmi_system_id byt_cht_es8316_quirk_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "IRBIS"), DMI_MATCH(DMI_PRODUCT_NAME, "NB41"), }, - .driver_data = (void *)(BYT_CHT_ES8316_INTMIC_IN2_MAP + .driver_data = (void *)(BYT_CHT_ES8316_SSP0 + | BYT_CHT_ES8316_INTMIC_IN2_MAP | BYT_CHT_ES8316_JD_INVERTED), }, { /* Teclast X98 Plus II */ diff --git a/sound/soc/intel/boards/cml_rt1011_rt5682.c b/sound/soc/intel/boards/cml_rt1011_rt5682.c index a22f97234201..5f1bf6d3800c 100644 --- a/sound/soc/intel/boards/cml_rt1011_rt5682.c +++ b/sound/soc/intel/boards/cml_rt1011_rt5682.c @@ -11,7 +11,6 @@ #include <linux/clk.h> #include <linux/dmi.h> #include <linux/slab.h> -#include <asm/cpu_device_id.h> #include <linux/acpi.h> #include <sound/core.h> #include <sound/jack.h> diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c index 9054558ce386..b94680fb26fa 100644 --- a/sound/soc/soc-component.c +++ b/sound/soc/soc-component.c @@ -539,6 +539,9 @@ void snd_soc_pcm_component_free(struct snd_soc_pcm_runtime *rtd) struct snd_soc_rtdcom_list *rtdcom; struct snd_soc_component *component; + if (!rtd->pcm) + return; + for_each_rtd_components(rtd, rtdcom, component) if (component->driver->pcm_destruct) component->driver->pcm_destruct(component, rtd->pcm); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 1c84ff1a5bf9..8ef0efeed0a7 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -479,6 +479,12 @@ static struct snd_soc_pcm_runtime *soc_new_pcm_runtime( goto free_rtd; rtd->dev = dev; + INIT_LIST_HEAD(&rtd->list); + INIT_LIST_HEAD(&rtd->component_list); + INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].be_clients); + INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_CAPTURE].be_clients); + INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].fe_clients); + INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_CAPTURE].fe_clients); dev_set_drvdata(dev, rtd); INIT_DELAYED_WORK(&rtd->delayed_work, close_delayed_work); @@ -494,12 +500,6 @@ static struct snd_soc_pcm_runtime *soc_new_pcm_runtime( /* * rtd remaining settings */ - INIT_LIST_HEAD(&rtd->component_list); - INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].be_clients); - INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_CAPTURE].be_clients); - INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].fe_clients); - INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_CAPTURE].fe_clients); - rtd->card = card; rtd->dai_link = dai_link; if (!rtd->dai_link->ops) @@ -1871,6 +1871,8 @@ match: /* convert non BE into BE */ dai_link->no_pcm = 1; + dai_link->dpcm_playback = 1; + dai_link->dpcm_capture = 1; /* override any BE fixups */ dai_link->be_hw_params_fixup = diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index b28613149b0c..92e4f4d08bfa 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -548,12 +548,12 @@ static void remove_link(struct snd_soc_component *comp, if (dobj->ops && dobj->ops->link_unload) dobj->ops->link_unload(comp, dobj); + list_del(&dobj->list); + snd_soc_remove_dai_link(comp->card, link); + kfree(link->name); kfree(link->stream_name); kfree(link->cpus->dai_name); - - list_del(&dobj->list); - snd_soc_remove_dai_link(comp->card, link); kfree(link); } diff --git a/sound/soc/sof/imx/imx8.c b/sound/soc/sof/imx/imx8.c index cfefcfd92798..aef6ca167b9c 100644 --- a/sound/soc/sof/imx/imx8.c +++ b/sound/soc/sof/imx/imx8.c @@ -209,7 +209,7 @@ static int imx8_probe(struct snd_sof_dev *sdev) priv->pd_dev = devm_kmalloc_array(&pdev->dev, priv->num_domains, sizeof(*priv->pd_dev), GFP_KERNEL); - if (!priv) + if (!priv->pd_dev) return -ENOMEM; priv->link = devm_kmalloc_array(&pdev->dev, priv->num_domains, @@ -304,6 +304,9 @@ static int imx8_probe(struct snd_sof_dev *sdev) } sdev->mailbox_bar = SOF_FW_BLK_TYPE_SRAM; + /* set default mailbox offset for FW ready message */ + sdev->dsp_box.offset = MBOX_OFFSET; + return 0; exit_pdev_unregister: diff --git a/sound/soc/sof/intel/hda-codec.c b/sound/soc/sof/intel/hda-codec.c index 827f84a0722e..fbfa225d1c5a 100644 --- a/sound/soc/sof/intel/hda-codec.c +++ b/sound/soc/sof/intel/hda-codec.c @@ -24,19 +24,18 @@ #define IDISP_VID_INTEL 0x80860000 /* load the legacy HDA codec driver */ -#ifdef MODULE -static void hda_codec_load_module(struct hda_codec *codec) +static int hda_codec_load_module(struct hda_codec *codec) { +#ifdef MODULE char alias[MODULE_NAME_LEN]; const char *module = alias; snd_hdac_codec_modalias(&codec->core, alias, sizeof(alias)); dev_dbg(&codec->core.dev, "loading codec module: %s\n", module); request_module(module); -} -#else -static void hda_codec_load_module(struct hda_codec *codec) {} #endif + return device_attach(hda_codec_dev(codec)); +} /* enable controller wake up event for all codecs with jack connectors */ void hda_codec_jack_wake_enable(struct snd_sof_dev *sdev) @@ -129,10 +128,16 @@ static int hda_codec_probe(struct snd_sof_dev *sdev, int address) if ((mach_params && mach_params->common_hdmi_codec_drv) || (resp & 0xFFFF0000) != IDISP_VID_INTEL) { hdev->type = HDA_DEV_LEGACY; - hda_codec_load_module(&hda_priv->codec); + ret = hda_codec_load_module(&hda_priv->codec); + /* + * handle ret==0 (no driver bound) as an error, but pass + * other return codes without modification + */ + if (ret == 0) + ret = -ENOENT; } - return 0; + return ret; #else hdev = devm_kzalloc(sdev->dev, sizeof(*hdev), GFP_KERNEL); if (!hdev) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index 8796f385be76..896d21984b73 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -216,6 +216,8 @@ static int hda_link_hw_params(struct snd_pcm_substream *substream, link_dev = hda_link_stream_assign(bus, substream); if (!link_dev) return -EBUSY; + + snd_soc_dai_set_dma_data(dai, substream, (void *)link_dev); } stream_tag = hdac_stream(link_dev)->stream_tag; @@ -228,8 +230,6 @@ static int hda_link_hw_params(struct snd_pcm_substream *substream, if (ret < 0) return ret; - snd_soc_dai_set_dma_data(dai, substream, (void *)link_dev); - link = snd_hdac_ext_bus_get_link(bus, codec_dai->component->name); if (!link) return -EINVAL; @@ -361,6 +361,13 @@ static int hda_link_hw_free(struct snd_pcm_substream *substream, bus = hstream->bus; rtd = snd_pcm_substream_chip(substream); link_dev = snd_soc_dai_get_dma_data(dai, substream); + + if (!link_dev) { + dev_dbg(dai->dev, + "%s: link_dev is not assigned\n", __func__); + return -EINVAL; + } + hda_stream = hstream_to_sof_hda_stream(link_dev); /* free the link DMA channel in the FW */ diff --git a/sound/soc/sof/intel/hda-loader.c b/sound/soc/sof/intel/hda-loader.c index b1783360fe10..bae7ac3581e5 100644 --- a/sound/soc/sof/intel/hda-loader.c +++ b/sound/soc/sof/intel/hda-loader.c @@ -329,13 +329,13 @@ int hda_dsp_cl_boot_firmware(struct snd_sof_dev *sdev) if (!ret) break; - dev_err(sdev->dev, "error: Error code=0x%x: FW status=0x%x\n", + dev_dbg(sdev->dev, "iteration %d of Core En/ROM load failed: %d\n", + i, ret); + dev_dbg(sdev->dev, "Error code=0x%x: FW status=0x%x\n", snd_sof_dsp_read(sdev, HDA_DSP_BAR, HDA_DSP_SRAM_REG_ROM_ERROR), snd_sof_dsp_read(sdev, HDA_DSP_BAR, HDA_DSP_SRAM_REG_ROM_STATUS)); - dev_err(sdev->dev, "error: iteration %d of Core En/ROM load failed: %d\n", - i, ret); } if (i == HDA_FW_BOOT_ATTEMPTS) { diff --git a/sound/soc/sof/ipc.c b/sound/soc/sof/ipc.c index 5994e1073364..5fdfbaa8c4ed 100644 --- a/sound/soc/sof/ipc.c +++ b/sound/soc/sof/ipc.c @@ -826,6 +826,9 @@ void snd_sof_ipc_free(struct snd_sof_dev *sdev) { struct snd_sof_ipc *ipc = sdev->ipc; + if (!ipc) + return; + /* disable sending of ipc's */ mutex_lock(&ipc->tx_mutex); ipc->disable_ipc_tx = true; diff --git a/sound/soc/sti/uniperif_player.c b/sound/soc/sti/uniperif_player.c index 48ea915b24ba..2ed92c990b97 100644 --- a/sound/soc/sti/uniperif_player.c +++ b/sound/soc/sti/uniperif_player.c @@ -226,7 +226,6 @@ static void uni_player_set_channel_status(struct uniperif *player, * sampling frequency. If no sample rate is already specified, then * set one. */ - mutex_lock(&player->ctrl_lock); if (runtime) { switch (runtime->rate) { case 22050: @@ -303,7 +302,6 @@ static void uni_player_set_channel_status(struct uniperif *player, player->stream_settings.iec958.status[3 + (n * 4)] << 24; SET_UNIPERIF_CHANNEL_STA_REGN(player, n, status); } - mutex_unlock(&player->ctrl_lock); /* Update the channel status */ if (player->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0) @@ -365,8 +363,10 @@ static int uni_player_prepare_iec958(struct uniperif *player, SET_UNIPERIF_CTRL_ZERO_STUFF_HW(player); + mutex_lock(&player->ctrl_lock); /* Update the channel status */ uni_player_set_channel_status(player, runtime); + mutex_unlock(&player->ctrl_lock); /* Clear the user validity user bits */ SET_UNIPERIF_USER_VALIDITY_VALIDITY_LR(player, 0); @@ -598,7 +598,6 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol, iec958->status[1] = ucontrol->value.iec958.status[1]; iec958->status[2] = ucontrol->value.iec958.status[2]; iec958->status[3] = ucontrol->value.iec958.status[3]; - mutex_unlock(&player->ctrl_lock); spin_lock_irqsave(&player->irq_lock, flags); if (player->substream && player->substream->runtime) @@ -608,6 +607,8 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol, uni_player_set_channel_status(player, NULL); spin_unlock_irqrestore(&player->irq_lock, flags); + mutex_unlock(&player->ctrl_lock); + return 0; } diff --git a/sound/soc/stm/stm32_adfsdm.c b/sound/soc/stm/stm32_adfsdm.c index 81c407da15c5..08696a4adb69 100644 --- a/sound/soc/stm/stm32_adfsdm.c +++ b/sound/soc/stm/stm32_adfsdm.c @@ -153,13 +153,13 @@ static const struct snd_soc_component_driver stm32_adfsdm_dai_component = { .name = "stm32_dfsdm_audio", }; -static void memcpy_32to16(void *dest, const void *src, size_t n) +static void stm32_memcpy_32to16(void *dest, const void *src, size_t n) { unsigned int i = 0; u16 *d = (u16 *)dest, *s = (u16 *)src; s++; - for (i = n; i > 0; i--) { + for (i = n >> 1; i > 0; i--) { *d++ = *s++; s++; } @@ -186,8 +186,8 @@ static int stm32_afsdm_pcm_cb(const void *data, size_t size, void *private) if ((priv->pos + src_size) > buff_size) { if (format == SNDRV_PCM_FORMAT_S16_LE) - memcpy_32to16(&pcm_buff[priv->pos], src_buff, - buff_size - priv->pos); + stm32_memcpy_32to16(&pcm_buff[priv->pos], src_buff, + buff_size - priv->pos); else memcpy(&pcm_buff[priv->pos], src_buff, buff_size - priv->pos); @@ -196,8 +196,8 @@ static int stm32_afsdm_pcm_cb(const void *data, size_t size, void *private) } if (format == SNDRV_PCM_FORMAT_S16_LE) - memcpy_32to16(&pcm_buff[priv->pos], - &src_buff[src_size - cur_size], cur_size); + stm32_memcpy_32to16(&pcm_buff[priv->pos], + &src_buff[src_size - cur_size], cur_size); else memcpy(&pcm_buff[priv->pos], &src_buff[src_size - cur_size], cur_size); diff --git a/sound/soc/stm/stm32_sai_sub.c b/sound/soc/stm/stm32_sai_sub.c index 48e629ac2d88..30bcd5d3a32a 100644 --- a/sound/soc/stm/stm32_sai_sub.c +++ b/sound/soc/stm/stm32_sai_sub.c @@ -184,6 +184,56 @@ static bool stm32_sai_sub_writeable_reg(struct device *dev, unsigned int reg) } } +static int stm32_sai_sub_reg_up(struct stm32_sai_sub_data *sai, + unsigned int reg, unsigned int mask, + unsigned int val) +{ + int ret; + + ret = clk_enable(sai->pdata->pclk); + if (ret < 0) + return ret; + + ret = regmap_update_bits(sai->regmap, reg, mask, val); + + clk_disable(sai->pdata->pclk); + + return ret; +} + +static int stm32_sai_sub_reg_wr(struct stm32_sai_sub_data *sai, + unsigned int reg, unsigned int mask, + unsigned int val) +{ + int ret; + + ret = clk_enable(sai->pdata->pclk); + if (ret < 0) + return ret; + + ret = regmap_write_bits(sai->regmap, reg, mask, val); + + clk_disable(sai->pdata->pclk); + + return ret; +} + +static int stm32_sai_sub_reg_rd(struct stm32_sai_sub_data *sai, + unsigned int reg, unsigned int *val) +{ + int ret; + + ret = clk_enable(sai->pdata->pclk); + if (ret < 0) + return ret; + + ret = regmap_read(sai->regmap, reg, val); + + clk_disable(sai->pdata->pclk); + + return ret; +} + static const struct regmap_config stm32_sai_sub_regmap_config_f4 = { .reg_bits = 32, .reg_stride = 4, @@ -295,7 +345,7 @@ static int stm32_sai_set_clk_div(struct stm32_sai_sub_data *sai, mask = SAI_XCR1_MCKDIV_MASK(SAI_XCR1_MCKDIV_WIDTH(version)); cr1 = SAI_XCR1_MCKDIV_SET(div); - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, mask, cr1); + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, mask, cr1); if (ret < 0) dev_err(&sai->pdev->dev, "Failed to update CR1 register\n"); @@ -372,8 +422,8 @@ static int stm32_sai_mclk_enable(struct clk_hw *hw) dev_dbg(&sai->pdev->dev, "Enable master clock\n"); - return regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_MCKEN, SAI_XCR1_MCKEN); + return stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_MCKEN, SAI_XCR1_MCKEN); } static void stm32_sai_mclk_disable(struct clk_hw *hw) @@ -383,7 +433,7 @@ static void stm32_sai_mclk_disable(struct clk_hw *hw) dev_dbg(&sai->pdev->dev, "Disable master clock\n"); - regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, SAI_XCR1_MCKEN, 0); + stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, SAI_XCR1_MCKEN, 0); } static const struct clk_ops mclk_ops = { @@ -446,15 +496,15 @@ static irqreturn_t stm32_sai_isr(int irq, void *devid) unsigned int sr, imr, flags; snd_pcm_state_t status = SNDRV_PCM_STATE_RUNNING; - regmap_read(sai->regmap, STM_SAI_IMR_REGX, &imr); - regmap_read(sai->regmap, STM_SAI_SR_REGX, &sr); + stm32_sai_sub_reg_rd(sai, STM_SAI_IMR_REGX, &imr); + stm32_sai_sub_reg_rd(sai, STM_SAI_SR_REGX, &sr); flags = sr & imr; if (!flags) return IRQ_NONE; - regmap_write_bits(sai->regmap, STM_SAI_CLRFR_REGX, SAI_XCLRFR_MASK, - SAI_XCLRFR_MASK); + stm32_sai_sub_reg_wr(sai, STM_SAI_CLRFR_REGX, SAI_XCLRFR_MASK, + SAI_XCLRFR_MASK); if (!sai->substream) { dev_err(&pdev->dev, "Device stopped. Spurious IRQ 0x%x\n", sr); @@ -503,8 +553,8 @@ static int stm32_sai_set_sysclk(struct snd_soc_dai *cpu_dai, int ret; if (dir == SND_SOC_CLOCK_OUT && sai->sai_mclk) { - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_NODIV, + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_NODIV, freq ? 0 : SAI_XCR1_NODIV); if (ret < 0) return ret; @@ -583,7 +633,7 @@ static int stm32_sai_set_dai_tdm_slot(struct snd_soc_dai *cpu_dai, u32 tx_mask, slotr_mask |= SAI_XSLOTR_SLOTEN_MASK; - regmap_update_bits(sai->regmap, STM_SAI_SLOTR_REGX, slotr_mask, slotr); + stm32_sai_sub_reg_up(sai, STM_SAI_SLOTR_REGX, slotr_mask, slotr); sai->slot_width = slot_width; sai->slots = slots; @@ -665,7 +715,7 @@ static int stm32_sai_set_dai_fmt(struct snd_soc_dai *cpu_dai, unsigned int fmt) cr1_mask |= SAI_XCR1_CKSTR; frcr_mask |= SAI_XFRCR_FSPOL; - regmap_update_bits(sai->regmap, STM_SAI_FRCR_REGX, frcr_mask, frcr); + stm32_sai_sub_reg_up(sai, STM_SAI_FRCR_REGX, frcr_mask, frcr); /* DAI clock master masks */ switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { @@ -693,7 +743,7 @@ static int stm32_sai_set_dai_fmt(struct snd_soc_dai *cpu_dai, unsigned int fmt) cr1_mask |= SAI_XCR1_SLAVE; conf_update: - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, cr1_mask, cr1); + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, cr1_mask, cr1); if (ret < 0) { dev_err(cpu_dai->dev, "Failed to update CR1 register\n"); return ret; @@ -730,12 +780,12 @@ static int stm32_sai_startup(struct snd_pcm_substream *substream, } /* Enable ITs */ - regmap_write_bits(sai->regmap, STM_SAI_CLRFR_REGX, - SAI_XCLRFR_MASK, SAI_XCLRFR_MASK); + stm32_sai_sub_reg_wr(sai, STM_SAI_CLRFR_REGX, + SAI_XCLRFR_MASK, SAI_XCLRFR_MASK); imr = SAI_XIMR_OVRUDRIE; if (STM_SAI_IS_CAPTURE(sai)) { - regmap_read(sai->regmap, STM_SAI_CR2_REGX, &cr2); + stm32_sai_sub_reg_rd(sai, STM_SAI_CR2_REGX, &cr2); if (cr2 & SAI_XCR2_MUTECNT_MASK) imr |= SAI_XIMR_MUTEDETIE; } @@ -745,8 +795,8 @@ static int stm32_sai_startup(struct snd_pcm_substream *substream, else imr |= SAI_XIMR_AFSDETIE | SAI_XIMR_LFSDETIE; - regmap_update_bits(sai->regmap, STM_SAI_IMR_REGX, - SAI_XIMR_MASK, imr); + stm32_sai_sub_reg_up(sai, STM_SAI_IMR_REGX, + SAI_XIMR_MASK, imr); return 0; } @@ -763,10 +813,10 @@ static int stm32_sai_set_config(struct snd_soc_dai *cpu_dai, * SAI fifo threshold is set to half fifo, to keep enough space * for DMA incoming bursts. */ - regmap_write_bits(sai->regmap, STM_SAI_CR2_REGX, - SAI_XCR2_FFLUSH | SAI_XCR2_FTH_MASK, - SAI_XCR2_FFLUSH | - SAI_XCR2_FTH_SET(STM_SAI_FIFO_TH_HALF)); + stm32_sai_sub_reg_wr(sai, STM_SAI_CR2_REGX, + SAI_XCR2_FFLUSH | SAI_XCR2_FTH_MASK, + SAI_XCR2_FFLUSH | + SAI_XCR2_FTH_SET(STM_SAI_FIFO_TH_HALF)); /* DS bits in CR1 not set for SPDIF (size forced to 24 bits).*/ if (STM_SAI_PROTOCOL_IS_SPDIF(sai)) { @@ -795,7 +845,7 @@ static int stm32_sai_set_config(struct snd_soc_dai *cpu_dai, if ((sai->slots == 2) && (params_channels(params) == 1)) cr1 |= SAI_XCR1_MONO; - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, cr1_mask, cr1); + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, cr1_mask, cr1); if (ret < 0) { dev_err(cpu_dai->dev, "Failed to update CR1 register\n"); return ret; @@ -809,7 +859,7 @@ static int stm32_sai_set_slots(struct snd_soc_dai *cpu_dai) struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); int slotr, slot_sz; - regmap_read(sai->regmap, STM_SAI_SLOTR_REGX, &slotr); + stm32_sai_sub_reg_rd(sai, STM_SAI_SLOTR_REGX, &slotr); /* * If SLOTSZ is set to auto in SLOTR, align slot width on data size @@ -831,16 +881,16 @@ static int stm32_sai_set_slots(struct snd_soc_dai *cpu_dai) sai->slots = 2; /* The number of slots in the audio frame is equal to NBSLOT[3:0] + 1*/ - regmap_update_bits(sai->regmap, STM_SAI_SLOTR_REGX, - SAI_XSLOTR_NBSLOT_MASK, - SAI_XSLOTR_NBSLOT_SET((sai->slots - 1))); + stm32_sai_sub_reg_up(sai, STM_SAI_SLOTR_REGX, + SAI_XSLOTR_NBSLOT_MASK, + SAI_XSLOTR_NBSLOT_SET((sai->slots - 1))); /* Set default slots mask if not already set from DT */ if (!(slotr & SAI_XSLOTR_SLOTEN_MASK)) { sai->slot_mask = (1 << sai->slots) - 1; - regmap_update_bits(sai->regmap, - STM_SAI_SLOTR_REGX, SAI_XSLOTR_SLOTEN_MASK, - SAI_XSLOTR_SLOTEN_SET(sai->slot_mask)); + stm32_sai_sub_reg_up(sai, + STM_SAI_SLOTR_REGX, SAI_XSLOTR_SLOTEN_MASK, + SAI_XSLOTR_SLOTEN_SET(sai->slot_mask)); } dev_dbg(cpu_dai->dev, "Slots %d, slot width %d\n", @@ -870,14 +920,14 @@ static void stm32_sai_set_frame(struct snd_soc_dai *cpu_dai) dev_dbg(cpu_dai->dev, "Frame length %d, frame active %d\n", sai->fs_length, fs_active); - regmap_update_bits(sai->regmap, STM_SAI_FRCR_REGX, frcr_mask, frcr); + stm32_sai_sub_reg_up(sai, STM_SAI_FRCR_REGX, frcr_mask, frcr); if ((sai->fmt & SND_SOC_DAIFMT_FORMAT_MASK) == SND_SOC_DAIFMT_LSB) { offset = sai->slot_width - sai->data_size; - regmap_update_bits(sai->regmap, STM_SAI_SLOTR_REGX, - SAI_XSLOTR_FBOFF_MASK, - SAI_XSLOTR_FBOFF_SET(offset)); + stm32_sai_sub_reg_up(sai, STM_SAI_SLOTR_REGX, + SAI_XSLOTR_FBOFF_MASK, + SAI_XSLOTR_FBOFF_SET(offset)); } } @@ -994,9 +1044,9 @@ static int stm32_sai_configure_clock(struct snd_soc_dai *cpu_dai, return -EINVAL; } - regmap_update_bits(sai->regmap, - STM_SAI_CR1_REGX, - SAI_XCR1_OSR, cr1); + stm32_sai_sub_reg_up(sai, + STM_SAI_CR1_REGX, + SAI_XCR1_OSR, cr1); div = stm32_sai_get_clk_div(sai, sai_clk_rate, sai->mclk_rate); @@ -1058,12 +1108,12 @@ static int stm32_sai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: dev_dbg(cpu_dai->dev, "Enable DMA and SAI\n"); - regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_DMAEN, SAI_XCR1_DMAEN); + stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_DMAEN, SAI_XCR1_DMAEN); /* Enable SAI */ - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_SAIEN, SAI_XCR1_SAIEN); + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_SAIEN, SAI_XCR1_SAIEN); if (ret < 0) dev_err(cpu_dai->dev, "Failed to update CR1 register\n"); break; @@ -1072,16 +1122,16 @@ static int stm32_sai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_STOP: dev_dbg(cpu_dai->dev, "Disable DMA and SAI\n"); - regmap_update_bits(sai->regmap, STM_SAI_IMR_REGX, - SAI_XIMR_MASK, 0); + stm32_sai_sub_reg_up(sai, STM_SAI_IMR_REGX, + SAI_XIMR_MASK, 0); - regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_SAIEN, - (unsigned int)~SAI_XCR1_SAIEN); + stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_SAIEN, + (unsigned int)~SAI_XCR1_SAIEN); - ret = regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, - SAI_XCR1_DMAEN, - (unsigned int)~SAI_XCR1_DMAEN); + ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, + SAI_XCR1_DMAEN, + (unsigned int)~SAI_XCR1_DMAEN); if (ret < 0) dev_err(cpu_dai->dev, "Failed to update CR1 register\n"); @@ -1101,7 +1151,7 @@ static void stm32_sai_shutdown(struct snd_pcm_substream *substream, struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); unsigned long flags; - regmap_update_bits(sai->regmap, STM_SAI_IMR_REGX, SAI_XIMR_MASK, 0); + stm32_sai_sub_reg_up(sai, STM_SAI_IMR_REGX, SAI_XIMR_MASK, 0); clk_disable_unprepare(sai->sai_ck); @@ -1169,7 +1219,7 @@ static int stm32_sai_dai_probe(struct snd_soc_dai *cpu_dai) cr1_mask |= SAI_XCR1_SYNCEN_MASK; cr1 |= SAI_XCR1_SYNCEN_SET(sai->sync); - return regmap_update_bits(sai->regmap, STM_SAI_CR1_REGX, cr1_mask, cr1); + return stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, cr1_mask, cr1); } static const struct snd_soc_dai_ops stm32_sai_pcm_dai_ops = { @@ -1322,8 +1372,13 @@ static int stm32_sai_sub_parse_of(struct platform_device *pdev, if (STM_SAI_HAS_PDM(sai) && STM_SAI_IS_SUB_A(sai)) sai->regmap_config = &stm32_sai_sub_regmap_config_h7; - sai->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "sai_ck", - base, sai->regmap_config); + /* + * Do not manage peripheral clock through regmap framework as this + * can lead to circular locking issue with sai master clock provider. + * Manage peripheral clock directly in driver instead. + */ + sai->regmap = devm_regmap_init_mmio(&pdev->dev, base, + sai->regmap_config); if (IS_ERR(sai->regmap)) { dev_err(&pdev->dev, "Failed to initialize MMIO\n"); return PTR_ERR(sai->regmap); @@ -1420,6 +1475,10 @@ static int stm32_sai_sub_parse_of(struct platform_device *pdev, return PTR_ERR(sai->sai_ck); } + ret = clk_prepare(sai->pdata->pclk); + if (ret < 0) + return ret; + if (STM_SAI_IS_F4(sai->pdata)) return 0; @@ -1501,22 +1560,48 @@ static int stm32_sai_sub_probe(struct platform_device *pdev) return 0; } +static int stm32_sai_sub_remove(struct platform_device *pdev) +{ + struct stm32_sai_sub_data *sai = dev_get_drvdata(&pdev->dev); + + clk_unprepare(sai->pdata->pclk); + + return 0; +} + #ifdef CONFIG_PM_SLEEP static int stm32_sai_sub_suspend(struct device *dev) { struct stm32_sai_sub_data *sai = dev_get_drvdata(dev); + int ret; + + ret = clk_enable(sai->pdata->pclk); + if (ret < 0) + return ret; regcache_cache_only(sai->regmap, true); regcache_mark_dirty(sai->regmap); + + clk_disable(sai->pdata->pclk); + return 0; } static int stm32_sai_sub_resume(struct device *dev) { struct stm32_sai_sub_data *sai = dev_get_drvdata(dev); + int ret; + + ret = clk_enable(sai->pdata->pclk); + if (ret < 0) + return ret; regcache_cache_only(sai->regmap, false); - return regcache_sync(sai->regmap); + ret = regcache_sync(sai->regmap); + + clk_disable(sai->pdata->pclk); + + return ret; } #endif /* CONFIG_PM_SLEEP */ @@ -1531,6 +1616,7 @@ static struct platform_driver stm32_sai_sub_driver = { .pm = &stm32_sai_sub_pm_ops, }, .probe = stm32_sai_sub_probe, + .remove = stm32_sai_sub_remove, }; module_platform_driver(stm32_sai_sub_driver); diff --git a/sound/soc/stm/stm32_spdifrx.c b/sound/soc/stm/stm32_spdifrx.c index 3fd28ee01675..3769d9ce5dbe 100644 --- a/sound/soc/stm/stm32_spdifrx.c +++ b/sound/soc/stm/stm32_spdifrx.c @@ -12,7 +12,6 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/of_platform.h> -#include <linux/pinctrl/consumer.h> #include <linux/regmap.h> #include <linux/reset.h> @@ -220,6 +219,7 @@ * @slave_config: dma slave channel runtime config pointer * @phys_addr: SPDIFRX registers physical base address * @lock: synchronization enabling lock + * @irq_lock: prevent race condition with IRQ on stream state * @cs: channel status buffer * @ub: user data buffer * @irq: SPDIFRX interrupt line @@ -240,6 +240,7 @@ struct stm32_spdifrx_data { struct dma_slave_config slave_config; dma_addr_t phys_addr; spinlock_t lock; /* Sync enabling lock */ + spinlock_t irq_lock; /* Prevent race condition on stream state */ unsigned char cs[SPDIFRX_CS_BYTES_NB]; unsigned char ub[SPDIFRX_UB_BYTES_NB]; int irq; @@ -320,6 +321,7 @@ static void stm32_spdifrx_dma_ctrl_stop(struct stm32_spdifrx_data *spdifrx) static int stm32_spdifrx_start_sync(struct stm32_spdifrx_data *spdifrx) { int cr, cr_mask, imr, ret; + unsigned long flags; /* Enable IRQs */ imr = SPDIFRX_IMR_IFEIE | SPDIFRX_IMR_SYNCDIE | SPDIFRX_IMR_PERRIE; @@ -327,7 +329,7 @@ static int stm32_spdifrx_start_sync(struct stm32_spdifrx_data *spdifrx) if (ret) return ret; - spin_lock(&spdifrx->lock); + spin_lock_irqsave(&spdifrx->lock, flags); spdifrx->refcount++; @@ -362,7 +364,7 @@ static int stm32_spdifrx_start_sync(struct stm32_spdifrx_data *spdifrx) "Failed to start synchronization\n"); } - spin_unlock(&spdifrx->lock); + spin_unlock_irqrestore(&spdifrx->lock, flags); return ret; } @@ -370,11 +372,12 @@ static int stm32_spdifrx_start_sync(struct stm32_spdifrx_data *spdifrx) static void stm32_spdifrx_stop(struct stm32_spdifrx_data *spdifrx) { int cr, cr_mask, reg; + unsigned long flags; - spin_lock(&spdifrx->lock); + spin_lock_irqsave(&spdifrx->lock, flags); if (--spdifrx->refcount) { - spin_unlock(&spdifrx->lock); + spin_unlock_irqrestore(&spdifrx->lock, flags); return; } @@ -393,7 +396,7 @@ static void stm32_spdifrx_stop(struct stm32_spdifrx_data *spdifrx) regmap_read(spdifrx->regmap, STM32_SPDIFRX_DR, ®); regmap_read(spdifrx->regmap, STM32_SPDIFRX_CSR, ®); - spin_unlock(&spdifrx->lock); + spin_unlock_irqrestore(&spdifrx->lock, flags); } static int stm32_spdifrx_dma_ctrl_register(struct device *dev, @@ -480,8 +483,6 @@ static int stm32_spdifrx_get_ctrl_data(struct stm32_spdifrx_data *spdifrx) memset(spdifrx->cs, 0, SPDIFRX_CS_BYTES_NB); memset(spdifrx->ub, 0, SPDIFRX_UB_BYTES_NB); - pinctrl_pm_select_default_state(&spdifrx->pdev->dev); - ret = stm32_spdifrx_dma_ctrl_start(spdifrx); if (ret < 0) return ret; @@ -513,7 +514,6 @@ static int stm32_spdifrx_get_ctrl_data(struct stm32_spdifrx_data *spdifrx) end: clk_disable_unprepare(spdifrx->kclk); - pinctrl_pm_select_sleep_state(&spdifrx->pdev->dev); return ret; } @@ -665,7 +665,6 @@ static const struct regmap_config stm32_h7_spdifrx_regmap_conf = { static irqreturn_t stm32_spdifrx_isr(int irq, void *devid) { struct stm32_spdifrx_data *spdifrx = (struct stm32_spdifrx_data *)devid; - struct snd_pcm_substream *substream = spdifrx->substream; struct platform_device *pdev = spdifrx->pdev; unsigned int cr, mask, sr, imr; unsigned int flags, sync_state; @@ -745,14 +744,19 @@ static irqreturn_t stm32_spdifrx_isr(int irq, void *devid) return IRQ_HANDLED; } - if (substream) - snd_pcm_stop(substream, SNDRV_PCM_STATE_DISCONNECTED); + spin_lock(&spdifrx->irq_lock); + if (spdifrx->substream) + snd_pcm_stop(spdifrx->substream, + SNDRV_PCM_STATE_DISCONNECTED); + spin_unlock(&spdifrx->irq_lock); return IRQ_HANDLED; } - if (err_xrun && substream) - snd_pcm_stop_xrun(substream); + spin_lock(&spdifrx->irq_lock); + if (err_xrun && spdifrx->substream) + snd_pcm_stop_xrun(spdifrx->substream); + spin_unlock(&spdifrx->irq_lock); return IRQ_HANDLED; } @@ -761,9 +765,12 @@ static int stm32_spdifrx_startup(struct snd_pcm_substream *substream, struct snd_soc_dai *cpu_dai) { struct stm32_spdifrx_data *spdifrx = snd_soc_dai_get_drvdata(cpu_dai); + unsigned long flags; int ret; + spin_lock_irqsave(&spdifrx->irq_lock, flags); spdifrx->substream = substream; + spin_unlock_irqrestore(&spdifrx->irq_lock, flags); ret = clk_prepare_enable(spdifrx->kclk); if (ret) @@ -839,8 +846,12 @@ static void stm32_spdifrx_shutdown(struct snd_pcm_substream *substream, struct snd_soc_dai *cpu_dai) { struct stm32_spdifrx_data *spdifrx = snd_soc_dai_get_drvdata(cpu_dai); + unsigned long flags; + spin_lock_irqsave(&spdifrx->irq_lock, flags); spdifrx->substream = NULL; + spin_unlock_irqrestore(&spdifrx->irq_lock, flags); + clk_disable_unprepare(spdifrx->kclk); } @@ -944,6 +955,7 @@ static int stm32_spdifrx_probe(struct platform_device *pdev) spdifrx->pdev = pdev; init_completion(&spdifrx->cs_completion); spin_lock_init(&spdifrx->lock); + spin_lock_init(&spdifrx->irq_lock); platform_set_drvdata(pdev, spdifrx); diff --git a/sound/usb/card.h b/sound/usb/card.h index 2991b9986f66..395403a2d33f 100644 --- a/sound/usb/card.h +++ b/sound/usb/card.h @@ -145,6 +145,7 @@ struct snd_usb_substream { struct snd_usb_endpoint *sync_endpoint; unsigned long flags; bool need_setup_ep; /* (re)configure EP at prepare? */ + bool need_setup_fmt; /* (re)configure fmt after resume? */ unsigned int speed; /* USB_SPEED_XXX */ u64 formats; /* format bitmasks (all or'ed) */ diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 9c8930bb00c8..0e4eab96e23e 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -370,7 +370,7 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, add_sync_ep_from_ifnum: iface = usb_ifnum_to_if(dev, ifnum); - if (!iface || iface->num_altsetting == 0) + if (!iface || iface->num_altsetting < 2) return -EINVAL; alts = &iface->altsetting[1]; @@ -506,15 +506,15 @@ static int set_format(struct snd_usb_substream *subs, struct audioformat *fmt) if (WARN_ON(!iface)) return -EINVAL; alts = usb_altnum_to_altsetting(iface, fmt->altsetting); - altsd = get_iface_desc(alts); - if (WARN_ON(altsd->bAlternateSetting != fmt->altsetting)) + if (WARN_ON(!alts)) return -EINVAL; + altsd = get_iface_desc(alts); - if (fmt == subs->cur_audiofmt) + if (fmt == subs->cur_audiofmt && !subs->need_setup_fmt) return 0; /* close the old interface */ - if (subs->interface >= 0 && subs->interface != fmt->iface) { + if (subs->interface >= 0 && (subs->interface != fmt->iface || subs->need_setup_fmt)) { if (!subs->stream->chip->keep_iface) { err = usb_set_interface(subs->dev, subs->interface, 0); if (err < 0) { @@ -528,6 +528,9 @@ static int set_format(struct snd_usb_substream *subs, struct audioformat *fmt) subs->altset_idx = 0; } + if (subs->need_setup_fmt) + subs->need_setup_fmt = false; + /* set interface */ if (iface->cur_altsetting != alts) { err = snd_usb_select_mode_quirk(subs, fmt); @@ -1728,6 +1731,13 @@ static int snd_usb_substream_playback_trigger(struct snd_pcm_substream *substrea subs->data_endpoint->retire_data_urb = retire_playback_urb; subs->running = 0; return 0; + case SNDRV_PCM_TRIGGER_SUSPEND: + if (subs->stream->chip->setup_fmt_after_resume_quirk) { + stop_endpoints(subs, true); + subs->need_setup_fmt = true; + return 0; + } + break; } return -EINVAL; @@ -1760,6 +1770,13 @@ static int snd_usb_substream_capture_trigger(struct snd_pcm_substream *substream subs->data_endpoint->retire_data_urb = retire_capture_urb; subs->running = 1; return 0; + case SNDRV_PCM_TRIGGER_SUSPEND: + if (subs->stream->chip->setup_fmt_after_resume_quirk) { + stop_endpoints(subs, true); + subs->need_setup_fmt = true; + return 0; + } + break; } return -EINVAL; diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 70c338f3ae24..d187aa6d50db 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -3466,7 +3466,8 @@ AU0828_DEVICE(0x2040, 0x7270, "Hauppauge", "HVR-950Q"), .vendor_name = "Dell", .product_name = "WD19 Dock", .profile_name = "Dell-WD15-Dock", - .ifnum = QUIRK_NO_INTERFACE + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_SETUP_FMT_AFTER_RESUME } }, /* MOTU Microbook II */ diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 349e1e52996d..82184036437b 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -508,6 +508,16 @@ static int create_standard_mixer_quirk(struct snd_usb_audio *chip, return snd_usb_create_mixer(chip, quirk->ifnum, 0); } + +static int setup_fmt_after_resume_quirk(struct snd_usb_audio *chip, + struct usb_interface *iface, + struct usb_driver *driver, + const struct snd_usb_audio_quirk *quirk) +{ + chip->setup_fmt_after_resume_quirk = 1; + return 1; /* Continue with creating streams and mixer */ +} + /* * audio-interface quirks * @@ -546,6 +556,7 @@ int snd_usb_create_quirk(struct snd_usb_audio *chip, [QUIRK_AUDIO_EDIROL_UAXX] = create_uaxx_quirk, [QUIRK_AUDIO_ALIGN_TRANSFER] = create_align_transfer_quirk, [QUIRK_AUDIO_STANDARD_MIXER] = create_standard_mixer_quirk, + [QUIRK_SETUP_FMT_AFTER_RESUME] = setup_fmt_after_resume_quirk, }; if (quirk->type < QUIRK_TYPE_COUNT) { @@ -1386,6 +1397,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ + case USB_ID(0x05a7, 0x1020): /* Bose Companion 5 */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ case USB_ID(0x1395, 0x740a): /* Sennheiser DECT */ case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index ff3cbf653de8..6fe3ab582ec6 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -33,7 +33,7 @@ struct snd_usb_audio { wait_queue_head_t shutdown_wait; unsigned int txfr_quirk:1; /* Subframe boundaries on transfers */ unsigned int tx_length_quirk:1; /* Put length specifier in transfers */ - + unsigned int setup_fmt_after_resume_quirk:1; /* setup the format to interface after resume */ int num_interfaces; int num_suspended_intf; int sample_rate_read_error; @@ -98,6 +98,7 @@ enum quirk_type { QUIRK_AUDIO_EDIROL_UAXX, QUIRK_AUDIO_ALIGN_TRANSFER, QUIRK_AUDIO_STANDARD_MIXER, + QUIRK_SETUP_FMT_AFTER_RESUME, QUIRK_TYPE_COUNT }; diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 5535650800ab..f897eeeb0b4f 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -38,7 +38,7 @@ FEATURE_TESTS = libbfd disassembler-four-args FEATURE_DISPLAY = libbfd disassembler-four-args check_feat := 1 -NON_CHECK_FEAT_TARGETS := clean bpftool_clean +NON_CHECK_FEAT_TARGETS := clean bpftool_clean runqslower_clean ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) check_feat := 0 @@ -73,7 +73,7 @@ $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm -all: $(PROGS) bpftool +all: $(PROGS) bpftool runqslower $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' $(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o @@ -89,7 +89,7 @@ $(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.yacc.o: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c -clean: bpftool_clean +clean: bpftool_clean runqslower_clean $(call QUIET_CLEAN, bpf-progs) $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* @@ -97,7 +97,7 @@ clean: bpftool_clean $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf $(Q)$(RM) -r -- $(OUTPUT)feature -install: $(PROGS) bpftool_install +install: $(PROGS) bpftool_install runqslower_install $(call QUIET_INSTALL, bpf_jit_disasm) $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin $(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm @@ -115,4 +115,14 @@ bpftool_install: bpftool_clean: $(call descend,bpftool,clean) -.PHONY: all install clean bpftool bpftool_install bpftool_clean +runqslower: + $(call descend,runqslower) + +runqslower_install: + $(call descend,runqslower,install) + +runqslower_clean: + $(call descend,runqslower,clean) + +.PHONY: all install clean bpftool bpftool_install bpftool_clean \ + runqslower runqslower_install runqslower_clean diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 86a87da97d0b..94d91322895a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -196,7 +196,7 @@ and global variables. #define __EXAMPLE_SKEL_H__ #include <stdlib.h> - #include <libbpf.h> + #include <bpf/libbpf.h> struct example { struct bpf_object_skeleton *skeleton; diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 39bc6f0f4f0b..c4e810335810 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -45,7 +45,7 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ -I$(srctree)/kernel/bpf/ \ -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ - -I$(srctree)/tools/lib/bpf \ + -I$(srctree)/tools/lib \ -I$(srctree)/tools/perf CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' ifneq ($(EXTRA_CFLAGS),) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index e5bc97b71ceb..4ba90d81b6a1 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -8,15 +8,15 @@ #include <stdio.h> #include <string.h> #include <unistd.h> -#include <bpf.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> +#include <bpf/libbpf.h> #include <linux/btf.h> #include <linux/hashtable.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> -#include "btf.h" #include "json_writer.h" #include "main.h" @@ -370,6 +370,10 @@ static int dump_btf_c(const struct btf *btf, if (IS_ERR(d)) return PTR_ERR(d); + printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); + printf("#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)\n"); + printf("#endif\n\n"); + if (root_type_cnt) { for (i = 0; i < root_type_cnt; i++) { err = btf_dump__dump_type(d, root_type_ids[i]); @@ -386,6 +390,10 @@ static int dump_btf_c(const struct btf *btf, } } + printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); + printf("#pragma clang attribute pop\n"); + printf("#endif\n"); + done: btf_dump__free(d); return err; @@ -524,7 +532,7 @@ static int do_dump(int argc, char **argv) if (IS_ERR(btf)) { err = PTR_ERR(btf); btf = NULL; - p_err("failed to load BTF from %s: %s", + p_err("failed to load BTF from %s: %s", *argv, strerror(err)); goto done; } diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index d66131f69689..01cc52b834fa 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -8,8 +8,8 @@ #include <linux/bitops.h> #include <linux/btf.h> #include <linux/err.h> +#include <bpf/btf.h> -#include "btf.h" #include "json_writer.h" #include "main.h" @@ -26,7 +26,7 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw, bool is_plain_text) { if (is_plain_text) - jsonw_printf(jw, "%p", data); + jsonw_printf(jw, "%p", *(void **)data); else jsonw_printf(jw, "%lu", *(unsigned long *)data); } diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 2f017caa678d..62c6a1d7cd18 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -14,7 +14,7 @@ #include <sys/types.h> #include <unistd.h> -#include <bpf.h> +#include <bpf/bpf.h> #include "main.h" diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 88264abaa738..b75b8ec5469c 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -20,8 +20,8 @@ #include <sys/stat.h> #include <sys/vfs.h> -#include <bpf.h> -#include <libbpf.h> /* libbpf_num_possible_cpus */ +#include <bpf/bpf.h> +#include <bpf/libbpf.h> /* libbpf_num_possible_cpus */ #include "main.h" diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 03bdc5b3ac49..446ba891f1e2 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -12,8 +12,8 @@ #include <linux/filter.h> #include <linux/limits.h> -#include <bpf.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include <zlib.h> #include "main.h" @@ -572,6 +572,18 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, printf("\n"); } +static void +probe_large_insn_limit(const char *define_prefix, __u32 ifindex) +{ + bool res; + + res = bpf_probe_large_insn_limit(ifindex); + print_bool_feature("have_large_insn_limit", + "Large program size limit", + "HAVE_LARGE_INSN_LIMIT", + res, define_prefix); +} + static int do_probe(int argc, char **argv) { enum probe_component target = COMPONENT_UNSPEC; @@ -724,6 +736,12 @@ static int do_probe(int argc, char **argv) probe_helpers_for_progtype(i, supported_types[i], define_prefix, ifindex); + print_end_then_start_section("misc", + "Scanning miscellaneous eBPF features...", + "/*** eBPF misc features ***/", + define_prefix); + probe_large_insn_limit(define_prefix, ifindex); + exit_close_json: if (json_output) { /* End current "section" of probes */ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7ce09a9a6999..f8113b3646f5 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -12,15 +12,15 @@ #include <stdio.h> #include <string.h> #include <unistd.h> -#include <bpf.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <unistd.h> +#include <bpf/btf.h> -#include "btf.h" -#include "libbpf_internal.h" +#include "bpf/libbpf_internal.h" #include "json_writer.h" #include "main.h" @@ -333,7 +333,7 @@ static int do_skeleton(int argc, char **argv) #define %2$s \n\ \n\ #include <stdlib.h> \n\ - #include <libbpf.h> \n\ + #include <bpf/libbpf.h> \n\ \n\ struct %1$s { \n\ struct bpf_object_skeleton *skeleton; \n\ diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index bfed711258ce..f7f5885aa3ba 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -24,7 +24,7 @@ #include <dis-asm.h> #include <sys/stat.h> #include <limits.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include "json_writer.h" #include "main.h" diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 1fe91c558508..6d41bbfc6459 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -9,8 +9,8 @@ #include <stdlib.h> #include <string.h> -#include <bpf.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "main.h" diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index c01f76fa6876..e6c85680b34d 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -15,9 +15,9 @@ #include <sys/types.h> #include <sys/stat.h> -#include <bpf.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> -#include "btf.h" #include "json_writer.h" #include "main.h" @@ -48,6 +48,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_QUEUE] = "queue", [BPF_MAP_TYPE_STACK] = "stack", [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); @@ -251,6 +252,7 @@ static int do_dump_btf(const struct btf_dumper *d, struct bpf_map_info *map_info, void *key, void *value) { + __u32 value_id; int ret; /* start of key-value pair */ @@ -264,9 +266,12 @@ static int do_dump_btf(const struct btf_dumper *d, goto err_end_obj; } + value_id = map_info->btf_vmlinux_value_type_id ? + : map_info->btf_value_type_id; + if (!map_is_per_cpu(map_info->type)) { jsonw_name(d->jw, "value"); - ret = btf_dumper_type(d, map_info->btf_value_type_id, value); + ret = btf_dumper_type(d, value_id, value); } else { unsigned int i, n, step; @@ -278,8 +283,7 @@ static int do_dump_btf(const struct btf_dumper *d, jsonw_start_object(d->jw); jsonw_int_field(d->jw, "cpu", i); jsonw_name(d->jw, "value"); - ret = btf_dumper_type(d, map_info->btf_value_type_id, - value + i * step); + ret = btf_dumper_type(d, value_id, value + i * step); jsonw_end_object(d->jw); if (ret) break; @@ -915,37 +919,63 @@ static int maps_have_btf(int *fds, int nb_fds) { struct bpf_map_info info = {}; __u32 len = sizeof(info); - struct btf *btf = NULL; int err, i; for (i = 0; i < nb_fds; i++) { err = bpf_obj_get_info_by_fd(fds[i], &info, &len); if (err) { p_err("can't get map info: %s", strerror(errno)); - goto err_close; - } - - err = btf__get_from_id(info.btf_id, &btf); - if (err) { - p_err("failed to get btf"); - goto err_close; + return -1; } - if (!btf) + if (!info.btf_id) return 0; } return 1; +} -err_close: - for (; i < nb_fds; i++) - close(fds[i]); - return -1; +static struct btf *btf_vmlinux; + +static struct btf *get_map_kv_btf(const struct bpf_map_info *info) +{ + struct btf *btf = NULL; + + if (info->btf_vmlinux_value_type_id) { + if (!btf_vmlinux) { + btf_vmlinux = libbpf_find_kernel_btf(); + if (IS_ERR(btf_vmlinux)) + p_err("failed to get kernel btf"); + } + return btf_vmlinux; + } else if (info->btf_value_type_id) { + int err; + + err = btf__get_from_id(info->btf_id, &btf); + if (err || !btf) { + p_err("failed to get btf"); + btf = err ? ERR_PTR(err) : ERR_PTR(-ESRCH); + } + } + + return btf; +} + +static void free_map_kv_btf(struct btf *btf) +{ + if (!IS_ERR(btf) && btf != btf_vmlinux) + btf__free(btf); +} + +static void free_btf_vmlinux(void) +{ + if (!IS_ERR(btf_vmlinux)) + btf__free(btf_vmlinux); } static int map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, - bool enable_btf, bool show_header) + bool show_header) { void *key, *value, *prev_key; unsigned int num_elems = 0; @@ -962,18 +992,13 @@ map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, prev_key = NULL; - if (enable_btf) { - err = btf__get_from_id(info->btf_id, &btf); - if (err || !btf) { - /* enable_btf is true only if we've already checked - * that all maps have BTF information. - */ - p_err("failed to get btf"); + if (wtr) { + btf = get_map_kv_btf(info); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); goto exit_free; } - } - if (wtr) { if (show_header) { jsonw_start_object(wtr); /* map object */ show_map_header_json(info, wtr); @@ -1012,7 +1037,7 @@ exit_free: free(key); free(value); close(fd); - btf__free(btf); + free_map_kv_btf(btf); return err; } @@ -1021,7 +1046,7 @@ static int do_dump(int argc, char **argv) { json_writer_t *wtr = NULL, *btf_wtr = NULL; struct bpf_map_info info = {}; - int nb_fds, i = 0, btf = 0; + int nb_fds, i = 0; __u32 len = sizeof(info); int *fds = NULL; int err = -1; @@ -1041,17 +1066,17 @@ static int do_dump(int argc, char **argv) if (json_output) { wtr = json_wtr; } else { - btf = maps_have_btf(fds, nb_fds); - if (btf < 0) + int do_plain_btf; + + do_plain_btf = maps_have_btf(fds, nb_fds); + if (do_plain_btf < 0) goto exit_close; - if (btf) { + + if (do_plain_btf) { btf_wtr = get_btf_writer(); - if (btf_wtr) { - wtr = btf_wtr; - } else { + wtr = btf_wtr; + if (!btf_wtr) p_info("failed to create json writer for btf. falling back to plain output"); - btf = 0; - } } } @@ -1062,7 +1087,7 @@ static int do_dump(int argc, char **argv) p_err("can't get map info: %s", strerror(errno)); break; } - err = map_dump(fds[i], &info, wtr, btf, nb_fds > 1); + err = map_dump(fds[i], &info, wtr, nb_fds > 1); if (!wtr && i != nb_fds - 1) printf("\n"); @@ -1073,13 +1098,14 @@ static int do_dump(int argc, char **argv) if (wtr && nb_fds > 1) jsonw_end_array(wtr); /* root array */ - if (btf) + if (btf_wtr) jsonw_destroy(&btf_wtr); exit_close: for (; i < nb_fds; i++) close(fds[i]); exit_free: free(fds); + free_btf_vmlinux(); return err; } diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 4c5531d1a450..d9b29c17fbb8 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -6,7 +6,7 @@ */ #include <errno.h> #include <fcntl.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include <poll.h> #include <signal.h> #include <stdbool.h> @@ -21,7 +21,7 @@ #include <sys/mman.h> #include <sys/syscall.h> -#include <bpf.h> +#include <bpf/bpf.h> #include <perf-sys.h> #include "main.h" diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index d93bee298e54..c5e3895b7c8b 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -7,7 +7,8 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include <net/if.h> #include <linux/if.h> #include <linux/rtnetlink.h> @@ -16,9 +17,8 @@ #include <sys/stat.h> #include <sys/types.h> -#include <bpf.h> -#include <nlattr.h> -#include "libbpf_internal.h" +#include "bpf/nlattr.h" +#include "bpf/libbpf_internal.h" #include "main.h" #include "netlink_dumper.h" diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c index 550a0f537eed..5f65140b003b 100644 --- a/tools/bpf/bpftool/netlink_dumper.c +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -3,11 +3,11 @@ #include <stdlib.h> #include <string.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include <linux/rtnetlink.h> #include <linux/tc_act/tc_bpf.h> -#include <nlattr.h> +#include "bpf/nlattr.h" #include "main.h" #include "netlink_dumper.h" diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index b2046f33e23f..3341aa14acda 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -13,7 +13,7 @@ #include <unistd.h> #include <ftw.h> -#include <bpf.h> +#include <bpf/bpf.h> #include "main.h" diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 47a61ac42dc0..a3521deca869 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -17,9 +17,9 @@ #include <linux/err.h> #include <linux/sizes.h> -#include <bpf.h> -#include <btf.h> -#include <libbpf.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> +#include <bpf/libbpf.h> #include "cfg.h" #include "main.h" diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 5b91ee65a080..8608cd68cdd0 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -7,7 +7,7 @@ #include <stdlib.h> #include <string.h> #include <sys/types.h> -#include <libbpf.h> +#include <bpf/libbpf.h> #include "disasm.h" #include "json_writer.h" diff --git a/tools/bpf/runqslower/.gitignore b/tools/bpf/runqslower/.gitignore new file mode 100644 index 000000000000..90a456a2a72f --- /dev/null +++ b/tools/bpf/runqslower/.gitignore @@ -0,0 +1 @@ +/.output diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile new file mode 100644 index 000000000000..faf5418609ea --- /dev/null +++ b/tools/bpf/runqslower/Makefile @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG := clang +LLC := llc +LLVM_STRIP := llvm-strip +DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +LIBBPF_SRC := $(abspath ../../lib/bpf) +BPFOBJ := $(OUTPUT)/libbpf.a +BPF_INCLUDE := $(OUTPUT) +INCLUDES := -I$(BPF_INCLUDE) -I$(OUTPUT) -I$(abspath ../../lib) +CFLAGS := -g -Wall + +# Try to detect best kernel BTF source +KERNEL_REL := $(shell uname -r) +VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ + $(wildcard $(VMLINUX_BTF_PATHS)))) + +abs_out := $(abspath $(OUTPUT)) +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 +endif + +.DELETE_ON_ERROR: + +.PHONY: all clean runqslower +all: runqslower + +runqslower: $(OUTPUT)/runqslower + +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) runqslower + +$(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) -lelf -lz $^ -o $@ + +$(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ + $(OUTPUT)/runqslower.bpf.o + +$(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h + +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +$(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + -c $(filter %.c,$^) -o $@ && \ + $(LLVM_STRIP) -g $@ + +$(OUTPUT)/%.o: %.c | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $(OUTPUT) + +$(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN,$@) + $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ + echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ + "specify its location." >&2; \ + exit 1;\ + fi + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ + +$(BPFOBJ): | $(OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ + OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + +$(DEFAULT_BPFTOOL): + $(Q)$(MAKE) $(submake_extras) -C ../bpftool \ + prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c new file mode 100644 index 000000000000..48a39f72fadf --- /dev/null +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include "runqslower.h" + +#define TASK_RUNNING 0 + +#define BPF_F_INDEX_MASK 0xffffffffULL +#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK + +const volatile __u64 min_us = 0; +const volatile pid_t targ_pid = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, u32); + __type(value, u64); +} start SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); + +/* record enqueue timestamp */ +__always_inline +static int trace_enqueue(u32 tgid, u32 pid) +{ + u64 ts; + + if (!pid || (targ_pid && targ_pid != pid)) + return 0; + + ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&start, &pid, &ts, 0); + return 0; +} + +SEC("tp_btf/sched_wakeup") +int handle__sched_wakeup(u64 *ctx) +{ + /* TP_PROTO(struct task_struct *p) */ + struct task_struct *p = (void *)ctx[0]; + + return trace_enqueue(p->tgid, p->pid); +} + +SEC("tp_btf/sched_wakeup_new") +int handle__sched_wakeup_new(u64 *ctx) +{ + /* TP_PROTO(struct task_struct *p) */ + struct task_struct *p = (void *)ctx[0]; + + return trace_enqueue(p->tgid, p->pid); +} + +SEC("tp_btf/sched_switch") +int handle__sched_switch(u64 *ctx) +{ + /* TP_PROTO(bool preempt, struct task_struct *prev, + * struct task_struct *next) + */ + struct task_struct *prev = (struct task_struct *)ctx[1]; + struct task_struct *next = (struct task_struct *)ctx[2]; + struct event event = {}; + u64 *tsp, delta_us; + long state; + u32 pid; + + /* ivcsw: treat like an enqueue event and store timestamp */ + if (prev->state == TASK_RUNNING) + trace_enqueue(prev->tgid, prev->pid); + + pid = next->pid; + + /* fetch timestamp and calculate delta */ + tsp = bpf_map_lookup_elem(&start, &pid); + if (!tsp) + return 0; /* missed enqueue */ + + delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; + if (min_us && delta_us <= min_us) + return 0; + + event.pid = pid; + event.delta_us = delta_us; + bpf_get_current_comm(&event.task, sizeof(event.task)); + + /* output */ + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + &event, sizeof(event)); + + bpf_map_delete_elem(&start, &pid); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c new file mode 100644 index 000000000000..d89715844952 --- /dev/null +++ b/tools/bpf/runqslower/runqslower.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2019 Facebook +#include <argp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <time.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "runqslower.h" +#include "runqslower.skel.h" + +struct env { + pid_t pid; + __u64 min_us; + bool verbose; +} env = { + .min_us = 10000, +}; + +const char *argp_program_version = "runqslower 0.1"; +const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; +const char argp_program_doc[] = +"runqslower Trace long process scheduling delays.\n" +" For Linux, uses eBPF, BPF CO-RE, libbpf, BTF.\n" +"\n" +"This script traces high scheduling delays between tasks being\n" +"ready to run and them running on CPU after that.\n" +"\n" +"USAGE: runqslower [-p PID] [min_us]\n" +"\n" +"EXAMPLES:\n" +" runqslower # trace run queue latency higher than 10000 us (default)\n" +" runqslower 1000 # trace run queue latency higher than 1000 us\n" +" runqslower -p 123 # trace pid 123 only\n"; + +static const struct argp_option opts[] = { + { "pid", 'p', "PID", 0, "Process PID to trace"}, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + int pid; + long long min_us; + + switch (key) { + case 'v': + env.verbose = true; + break; + case 'p': + errno = 0; + pid = strtol(arg, NULL, 10); + if (errno || pid <= 0) { + fprintf(stderr, "Invalid PID: %s\n", arg); + argp_usage(state); + } + env.pid = pid; + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + "Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + errno = 0; + min_us = strtoll(arg, NULL, 10); + if (errno || min_us <= 0) { + fprintf(stderr, "Invalid delay (in us): %s\n", arg); + argp_usage(state); + } + env.min_us = min_us; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +int libbpf_print_fn(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static int bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + return setrlimit(RLIMIT_MEMLOCK, &rlim_new); +} + +void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + const struct event *e = data; + struct tm *tm; + char ts[32]; + time_t t; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-8s %-16s %-6d %14llu\n", ts, e->task, e->pid, e->delta_us); +} + +void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) +{ + printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct perf_buffer_opts pb_opts; + struct perf_buffer *pb = NULL; + struct runqslower_bpf *obj; + int err; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + libbpf_set_print(libbpf_print_fn); + + err = bump_memlock_rlimit(); + if (err) { + fprintf(stderr, "failed to increase rlimit: %d", err); + return 1; + } + + obj = runqslower_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open and/or load BPF object\n"); + return 1; + } + + /* initialize global data (filtering options) */ + obj->rodata->targ_pid = env.pid; + obj->rodata->min_us = env.min_us; + + err = runqslower_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF object: %d\n", err); + goto cleanup; + } + + err = runqslower_bpf__attach(obj); + if (err) { + fprintf(stderr, "failed to attach BPF programs\n"); + goto cleanup; + } + + printf("Tracing run queue latency higher than %llu us\n", env.min_us); + printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)"); + + pb_opts.sample_cb = handle_event; + pb_opts.lost_cb = handle_lost_events; + pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64, &pb_opts); + err = libbpf_get_error(pb); + if (err) { + pb = NULL; + fprintf(stderr, "failed to open perf buffer: %d\n", err); + goto cleanup; + } + + while ((err = perf_buffer__poll(pb, 100)) >= 0) + ; + printf("Error polling perf buffer: %d\n", err); + +cleanup: + perf_buffer__free(pb); + runqslower_bpf__destroy(obj); + + return err != 0; +} diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h new file mode 100644 index 000000000000..9db225425e5f --- /dev/null +++ b/tools/bpf/runqslower/runqslower.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __RUNQSLOWER_H +#define __RUNQSLOWER_H + +#define TASK_COMM_LEN 16 + +struct event { + char task[TASK_COMM_LEN]; + __u64 delta_us; + pid_t pid; +}; + +#endif /* __RUNQSLOWER_H */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7df436da542d..f1d74a2bd234 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -107,6 +107,10 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, }; enum bpf_map_type { @@ -136,6 +140,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, }; /* Note that tracing related programs such as @@ -174,6 +179,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, }; enum bpf_attach_type { @@ -357,7 +364,12 @@ enum bpf_attach_type { /* Enable memory-mapping BPF map */ #define BPF_F_MMAPABLE (1U << 10) -/* flags for BPF_PROG_QUERY */ +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are returned only for directly attached programs. + */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) enum bpf_stack_build_id_status { @@ -397,6 +409,10 @@ union bpf_attr { __u32 btf_fd; /* fd pointing to a BTF type data */ __u32 btf_key_type_id; /* BTF type_id of the key */ __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -409,6 +425,23 @@ union bpf_attr { __u64 flags; }; + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + struct { /* anonymous struct used by BPF_PROG_LOAD command */ __u32 prog_type; /* one of enum bpf_prog_type */ __u32 insn_cnt; @@ -2703,7 +2736,8 @@ union bpf_attr { * * int bpf_send_signal(u32 sig) * Description - * Send signal *sig* to the current task. + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. * Return * 0 on success or successfully queued. * @@ -2831,6 +2865,33 @@ union bpf_attr { * Return * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. + * + * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2948,7 +3009,10 @@ union bpf_attr { FN(probe_read_user), \ FN(probe_read_kernel), \ FN(probe_read_user_str), \ - FN(probe_read_kernel_str), + FN(probe_read_kernel_str), \ + FN(tcp_send_ack), \ + FN(send_signal_thread), \ + FN(jiffies64), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3349,7 +3413,7 @@ struct bpf_map_info { __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; - __u32 :32; + __u32 btf_vmlinux_value_type_id; __u64 netns_dev; __u64 netns_ino; __u32 btf_id; diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 1a2898c482ee..5a667107ad2c 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -146,6 +146,12 @@ enum { BTF_VAR_GLOBAL_EXTERN = 2, }; +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe * additional information related to the variable such as its linkage. */ diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 8aec8769d944..024af2d1d0af 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -169,6 +169,7 @@ enum { IFLA_MAX_MTU, IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, __IFLA_MAX }; @@ -485,6 +486,13 @@ enum macsec_validation_type { MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, }; +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index e7ad9d350a28..1521073b6348 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -76,6 +76,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_MPTCP = 262, /* Multipath TCP connection */ +#define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX }; #endif diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index d87830e7ea63..aee7f1a83c77 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -183,7 +183,7 @@ $(BPF_IN_STATIC): force elfdep zdep bpfdep $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(Q)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) @@ -273,10 +273,11 @@ config-clean: $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null clean: - $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ - *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \ - *.pc LIBBPF-CFLAGS $(BPF_HELPER_DEFS) \ - $(SHARED_OBJDIR) $(STATIC_OBJDIR) + $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ + *~ .*.d .*.cmd LIBBPF-CFLAGS $(BPF_HELPER_DEFS) \ + $(SHARED_OBJDIR) $(STATIC_OBJDIR) \ + $(addprefix $(OUTPUT), \ + *.o *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) *.pc) $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a787d53699c8..c6dafe563176 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -32,6 +32,9 @@ #include "libbpf.h" #include "libbpf_internal.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + /* * When building perf, unistd.h is overridden. __NR_bpf is * required to be defined explicitly. @@ -95,7 +98,11 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) attr.btf_key_type_id = create_attr->btf_key_type_id; attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; - attr.inner_map_fd = create_attr->inner_map_fd; + if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS) + attr.btf_vmlinux_value_type_id = + create_attr->btf_vmlinux_value_type_id; + else + attr.inner_map_fd = create_attr->inner_map_fd; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } @@ -228,7 +235,10 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; - if (attr.prog_type == BPF_PROG_TYPE_TRACING) { + if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS) { + attr.attach_btf_id = load_attr->attach_btf_id; + } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || + attr.prog_type == BPF_PROG_TYPE_EXT) { attr.attach_btf_id = load_attr->attach_btf_id; attr.attach_prog_fd = load_attr->attach_prog_fd; } else { @@ -443,6 +453,64 @@ int bpf_map_freeze(int fd) return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr)); } +static int bpf_map_batch_common(int cmd, int fd, void *in_batch, + void *out_batch, void *keys, void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_map_batch_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.batch.map_fd = fd; + attr.batch.in_batch = ptr_to_u64(in_batch); + attr.batch.out_batch = ptr_to_u64(out_batch); + attr.batch.keys = ptr_to_u64(keys); + attr.batch.values = ptr_to_u64(values); + attr.batch.count = *count; + attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); + attr.batch.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(cmd, &attr, sizeof(attr)); + *count = attr.batch.count; + + return ret; +} + +int bpf_map_delete_batch(int fd, void *keys, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, + NULL, keys, NULL, count, opts); +} + +int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, in_batch, + out_batch, keys, values, count, opts); +} + +int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH, + fd, in_batch, out_batch, keys, values, + count, opts); +} + +int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, + keys, values, count, opts); +} + int bpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index f0ab8519986e..b976e77316cc 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -46,7 +46,10 @@ struct bpf_create_map_attr { __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 map_ifindex; - __u32 inner_map_fd; + union { + __u32 inner_map_fd; + __u32 btf_vmlinux_value_type_id; + }; }; LIBBPF_API int @@ -124,6 +127,28 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_map_freeze(int fd); + +struct bpf_map_batch_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 elem_flags; + __u64 flags; +}; +#define bpf_map_batch_opts__last_field flags + +LIBBPF_API int bpf_map_delete_batch(int fd, void *keys, + __u32 *count, + const struct bpf_map_batch_opts *opts); +LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); +LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, + void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); +LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts); + LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); LIBBPF_API int bpf_obj_get(const char *pathname); diff --git a/tools/lib/bpf/bpf_prog_linfo.c b/tools/lib/bpf/bpf_prog_linfo.c index 3ed1a27b5f7c..bafca49cb1e6 100644 --- a/tools/lib/bpf/bpf_prog_linfo.c +++ b/tools/lib/bpf/bpf_prog_linfo.c @@ -8,6 +8,9 @@ #include "libbpf.h" #include "libbpf_internal.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + struct bpf_prog_linfo { void *raw_linfo; void *raw_jited_linfo; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 5f04f56e1eb6..3d1c25fc97ae 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -8,6 +8,10 @@ #include <fcntl.h> #include <unistd.h> #include <errno.h> +#include <sys/utsname.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <linux/kernel.h> #include <linux/err.h> #include <linux/btf.h> #include <gelf.h> @@ -17,8 +21,11 @@ #include "libbpf_internal.h" #include "hashmap.h" -#define BTF_MAX_NR_TYPES 0x7fffffff -#define BTF_MAX_STR_OFFSET 0x7fffffff +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +#define BTF_MAX_NR_TYPES 0x7fffffffU +#define BTF_MAX_STR_OFFSET 0x7fffffffU static struct btf_type btf_void; @@ -50,7 +57,7 @@ static int btf_add_type(struct btf *btf, struct btf_type *t) if (btf->types_size == BTF_MAX_NR_TYPES) return -E2BIG; - expand_by = max(btf->types_size >> 2, 16); + expand_by = max(btf->types_size >> 2, 16U); new_size = min(BTF_MAX_NR_TYPES, btf->types_size + expand_by); new_types = realloc(btf->types, sizeof(*new_types) * new_size); @@ -286,7 +293,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: - return min(sizeof(void *), t->size); + return min(sizeof(void *), (size_t)t->size); case BTF_KIND_PTR: return sizeof(void *); case BTF_KIND_TYPEDEF: @@ -1398,7 +1405,7 @@ static int btf_dedup_hypot_map_add(struct btf_dedup *d, if (d->hypot_cnt == d->hypot_cap) { __u32 *new_list; - d->hypot_cap += max(16, d->hypot_cap / 2); + d->hypot_cap += max((size_t)16, d->hypot_cap / 2); new_list = realloc(d->hypot_list, sizeof(__u32) * d->hypot_cap); if (!new_list) return -ENOMEM; @@ -1694,7 +1701,7 @@ static int btf_dedup_strings(struct btf_dedup *d) if (strs.cnt + 1 > strs.cap) { struct btf_str_ptr *new_ptrs; - strs.cap += max(strs.cnt / 2, 16); + strs.cap += max(strs.cnt / 2, 16U); new_ptrs = realloc(strs.ptrs, sizeof(strs.ptrs[0]) * strs.cap); if (!new_ptrs) { @@ -2928,3 +2935,89 @@ static int btf_dedup_remap_types(struct btf_dedup *d) } return 0; } + +static struct btf *btf_load_raw(const char *path) +{ + struct btf *btf; + size_t read_cnt; + struct stat st; + void *data; + FILE *f; + + if (stat(path, &st)) + return ERR_PTR(-errno); + + data = malloc(st.st_size); + if (!data) + return ERR_PTR(-ENOMEM); + + f = fopen(path, "rb"); + if (!f) { + btf = ERR_PTR(-errno); + goto cleanup; + } + + read_cnt = fread(data, 1, st.st_size, f); + fclose(f); + if (read_cnt < st.st_size) { + btf = ERR_PTR(-EBADF); + goto cleanup; + } + + btf = btf__new(data, read_cnt); + +cleanup: + free(data); + return btf; +} + +/* + * Probe few well-known locations for vmlinux kernel image and try to load BTF + * data out of it to use for target BTF. + */ +struct btf *libbpf_find_kernel_btf(void) +{ + struct { + const char *path_fmt; + bool raw_btf; + } locations[] = { + /* try canonical vmlinux BTF through sysfs first */ + { "/sys/kernel/btf/vmlinux", true /* raw BTF */ }, + /* fall back to trying to find vmlinux ELF on disk otherwise */ + { "/boot/vmlinux-%1$s" }, + { "/lib/modules/%1$s/vmlinux-%1$s" }, + { "/lib/modules/%1$s/build/vmlinux" }, + { "/usr/lib/modules/%1$s/kernel/vmlinux" }, + { "/usr/lib/debug/boot/vmlinux-%1$s" }, + { "/usr/lib/debug/boot/vmlinux-%1$s.debug" }, + { "/usr/lib/debug/lib/modules/%1$s/vmlinux" }, + }; + char path[PATH_MAX + 1]; + struct utsname buf; + struct btf *btf; + int i; + + uname(&buf); + + for (i = 0; i < ARRAY_SIZE(locations); i++) { + snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release); + + if (access(path, R_OK)) + continue; + + if (locations[i].raw_btf) + btf = btf_load_raw(path); + else + btf = btf__parse_elf(path, NULL); + + pr_debug("loading kernel BTF '%s': %ld\n", + path, IS_ERR(btf) ? PTR_ERR(btf) : 0); + if (IS_ERR(btf)) + continue; + + return btf; + } + + pr_warn("failed to find valid kernel BTF\n"); + return ERR_PTR(-ESRCH); +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 8d73f7f5551f..70c1b7ec2bd0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -102,6 +102,8 @@ LIBBPF_API int btf_ext__reloc_line_info(const struct btf *btf, LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + struct btf_dedup_opts { unsigned int dedup_table_size; bool dont_resolve_fwds; diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index e95f7710f210..bd09ed1710f1 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -18,6 +18,9 @@ #include "libbpf.h" #include "libbpf_internal.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; @@ -139,6 +142,7 @@ struct btf_dump *btf_dump__new(const struct btf *btf, if (IS_ERR(d->type_names)) { err = PTR_ERR(d->type_names); d->type_names = NULL; + goto err; } d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); if (IS_ERR(d->ident_names)) { diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index 6122272943e6..54c30c802070 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -12,6 +12,9 @@ #include <linux/err.h> #include "hashmap.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + /* start with 4 buckets */ #define HASHMAP_MIN_CAP_BITS 2 diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7513165b104f..ae34b681ae82 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -55,6 +55,9 @@ #include "libbpf_internal.h" #include "hashmap.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + #ifndef EM_BPF #define EM_BPF 247 #endif @@ -70,6 +73,12 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); +static struct bpf_program *bpf_object__find_prog_by_idx(struct bpf_object *obj, + int idx); +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) { @@ -166,6 +175,8 @@ struct bpf_capabilities { __u32 btf_datasec:1; /* BPF_F_MMAPABLE is supported for arrays */ __u32 array_mmap:1; + /* BTF_FUNC_GLOBAL is supported */ + __u32 btf_func_global:1; }; enum reloc_type { @@ -229,10 +240,32 @@ struct bpf_program { __u32 prog_flags; }; +struct bpf_struct_ops { + const char *tname; + const struct btf_type *type; + struct bpf_program **progs; + __u32 *kern_func_off; + /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */ + void *data; + /* e.g. struct bpf_struct_ops_tcp_congestion_ops in + * btf_vmlinux's format. + * struct bpf_struct_ops_tcp_congestion_ops { + * [... some other kernel fields ...] + * struct tcp_congestion_ops data; + * } + * kern_vdata-size == sizeof(struct bpf_struct_ops_tcp_congestion_ops) + * bpf_map__init_kern_struct_ops() will populate the "kern_vdata" + * from "data". + */ + void *kern_vdata; + __u32 type_id; +}; + #define DATA_SEC ".data" #define BSS_SEC ".bss" #define RODATA_SEC ".rodata" #define KCONFIG_SEC ".kconfig" +#define STRUCT_OPS_SEC ".struct_ops" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, @@ -259,10 +292,12 @@ struct bpf_map { struct bpf_map_def def; __u32 btf_key_type_id; __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; void *priv; bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; void *mmaped; + struct bpf_struct_ops *st_ops; char *pin_path; bool pinned; bool reused; @@ -326,6 +361,7 @@ struct bpf_object { Elf_Data *data; Elf_Data *rodata; Elf_Data *bss; + Elf_Data *st_ops_data; size_t strtabidx; struct { GElf_Shdr shdr; @@ -339,6 +375,7 @@ struct bpf_object { int data_shndx; int rodata_shndx; int bss_shndx; + int st_ops_shndx; } efile; /* * All loaded bpf_object is linked in a list, which is @@ -348,6 +385,10 @@ struct bpf_object { struct list_head list; struct btf *btf; + /* Parse and load BTF vmlinux if any of the programs in the object need + * it at load time. + */ + struct btf *btf_vmlinux; struct btf_ext *btf_ext; void *priv; @@ -566,6 +607,348 @@ static __u32 get_kernel_version(void) return KERNEL_VERSION(major, minor, patch); } +static const struct btf_member * +find_member_by_offset(const struct btf_type *t, __u32 bit_offset) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (btf_member_bit_offset(t, i) == bit_offset) + return m; + } + + return NULL; +} + +static const struct btf_member * +find_member_by_name(const struct btf *btf, const struct btf_type *t, + const char *name) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (!strcmp(btf__name_by_offset(btf, m->name_off), name)) + return m; + } + + return NULL; +} + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind); + +static int +find_struct_ops_kern_types(const struct btf *btf, const char *tname, + const struct btf_type **type, __u32 *type_id, + const struct btf_type **vtype, __u32 *vtype_id, + const struct btf_member **data_member) +{ + const struct btf_type *kern_type, *kern_vtype; + const struct btf_member *kern_data_member; + __s32 kern_vtype_id, kern_type_id; + __u32 i; + + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", + tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + + /* Find the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, + * find "struct bpf_struct_ops_tcp_congestion_ops" from the + * btf_vmlinux. + */ + kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, + tname, BTF_KIND_STRUCT); + if (kern_vtype_id < 0) { + pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", + STRUCT_OPS_VALUE_PREFIX, tname); + return kern_vtype_id; + } + kern_vtype = btf__type_by_id(btf, kern_vtype_id); + + /* Find "struct tcp_congestion_ops" from + * struct bpf_struct_ops_tcp_congestion_ops { + * [ ... ] + * struct tcp_congestion_ops data; + * } + */ + kern_data_member = btf_members(kern_vtype); + for (i = 0; i < btf_vlen(kern_vtype); i++, kern_data_member++) { + if (kern_data_member->type == kern_type_id) + break; + } + if (i == btf_vlen(kern_vtype)) { + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", + tname, STRUCT_OPS_VALUE_PREFIX, tname); + return -EINVAL; + } + + *type = kern_type; + *type_id = kern_type_id; + *vtype = kern_vtype; + *vtype_id = kern_vtype_id; + *data_member = kern_data_member; + + return 0; +} + +static bool bpf_map__is_struct_ops(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_STRUCT_OPS; +} + +/* Init the map's fields that depend on kern_btf */ +static int bpf_map__init_kern_struct_ops(struct bpf_map *map, + const struct btf *btf, + const struct btf *kern_btf) +{ + const struct btf_member *member, *kern_member, *kern_data_member; + const struct btf_type *type, *kern_type, *kern_vtype; + __u32 i, kern_type_id, kern_vtype_id, kern_data_off; + struct bpf_struct_ops *st_ops; + void *data, *kern_data; + const char *tname; + int err; + + st_ops = map->st_ops; + type = st_ops->type; + tname = st_ops->tname; + err = find_struct_ops_kern_types(kern_btf, tname, + &kern_type, &kern_type_id, + &kern_vtype, &kern_vtype_id, + &kern_data_member); + if (err) + return err; + + pr_debug("struct_ops init_kern %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n", + map->name, st_ops->type_id, kern_type_id, kern_vtype_id); + + map->def.value_size = kern_vtype->size; + map->btf_vmlinux_value_type_id = kern_vtype_id; + + st_ops->kern_vdata = calloc(1, kern_vtype->size); + if (!st_ops->kern_vdata) + return -ENOMEM; + + data = st_ops->data; + kern_data_off = kern_data_member->offset / 8; + kern_data = st_ops->kern_vdata + kern_data_off; + + member = btf_members(type); + for (i = 0; i < btf_vlen(type); i++, member++) { + const struct btf_type *mtype, *kern_mtype; + __u32 mtype_id, kern_mtype_id; + void *mdata, *kern_mdata; + __s64 msize, kern_msize; + __u32 moff, kern_moff; + __u32 kern_member_idx; + const char *mname; + + mname = btf__name_by_offset(btf, member->name_off); + kern_member = find_member_by_name(kern_btf, kern_type, mname); + if (!kern_member) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + kern_member_idx = kern_member - btf_members(kern_type); + if (btf_member_bitfield_size(type, i) || + btf_member_bitfield_size(kern_type, kern_member_idx)) { + pr_warn("struct_ops init_kern %s: bitfield %s is not supported\n", + map->name, mname); + return -ENOTSUP; + } + + moff = member->offset / 8; + kern_moff = kern_member->offset / 8; + + mdata = data + moff; + kern_mdata = kern_data + kern_moff; + + mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_member->type, + &kern_mtype_id); + if (BTF_INFO_KIND(mtype->info) != + BTF_INFO_KIND(kern_mtype->info)) { + pr_warn("struct_ops init_kern %s: Unmatched member type %s %u != %u(kernel)\n", + map->name, mname, BTF_INFO_KIND(mtype->info), + BTF_INFO_KIND(kern_mtype->info)); + return -ENOTSUP; + } + + if (btf_is_ptr(mtype)) { + struct bpf_program *prog; + + mtype = skip_mods_and_typedefs(btf, mtype->type, &mtype_id); + kern_mtype = skip_mods_and_typedefs(kern_btf, + kern_mtype->type, + &kern_mtype_id); + if (!btf_is_func_proto(mtype) || + !btf_is_func_proto(kern_mtype)) { + pr_warn("struct_ops init_kern %s: non func ptr %s is not supported\n", + map->name, mname); + return -ENOTSUP; + } + + prog = st_ops->progs[i]; + if (!prog) { + pr_debug("struct_ops init_kern %s: func ptr %s is not set\n", + map->name, mname); + continue; + } + + prog->attach_btf_id = kern_type_id; + prog->expected_attach_type = kern_member_idx; + + st_ops->kern_func_off[i] = kern_data_off + kern_moff; + + pr_debug("struct_ops init_kern %s: func ptr %s is set to prog %s from data(+%u) to kern_data(+%u)\n", + map->name, mname, prog->name, moff, + kern_moff); + + continue; + } + + msize = btf__resolve_size(btf, mtype_id); + kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); + if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", + map->name, mname, (ssize_t)msize, + (ssize_t)kern_msize); + return -ENOTSUP; + } + + pr_debug("struct_ops init_kern %s: copy %s %u bytes from data(+%u) to kern_data(+%u)\n", + map->name, mname, (unsigned int)msize, + moff, kern_moff); + memcpy(kern_mdata, mdata, msize); + } + + return 0; +} + +static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + size_t i; + int err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!bpf_map__is_struct_ops(map)) + continue; + + err = bpf_map__init_kern_struct_ops(map, obj->btf, + obj->btf_vmlinux); + if (err) + return err; + } + + return 0; +} + +static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) +{ + const struct btf_type *type, *datasec; + const struct btf_var_secinfo *vsi; + struct bpf_struct_ops *st_ops; + const char *tname, *var_name; + __s32 type_id, datasec_id; + const struct btf *btf; + struct bpf_map *map; + __u32 i; + + if (obj->efile.st_ops_shndx == -1) + return 0; + + btf = obj->btf; + datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC, + BTF_KIND_DATASEC); + if (datasec_id < 0) { + pr_warn("struct_ops init: DATASEC %s not found\n", + STRUCT_OPS_SEC); + return -EINVAL; + } + + datasec = btf__type_by_id(btf, datasec_id); + vsi = btf_var_secinfos(datasec); + for (i = 0; i < btf_vlen(datasec); i++, vsi++) { + type = btf__type_by_id(obj->btf, vsi->type); + var_name = btf__name_by_offset(obj->btf, type->name_off); + + type_id = btf__resolve_type(obj->btf, vsi->type); + if (type_id < 0) { + pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", + vsi->type, STRUCT_OPS_SEC); + return -EINVAL; + } + + type = btf__type_by_id(obj->btf, type_id); + tname = btf__name_by_offset(obj->btf, type->name_off); + if (!tname[0]) { + pr_warn("struct_ops init: anonymous type is not supported\n"); + return -ENOTSUP; + } + if (!btf_is_struct(type)) { + pr_warn("struct_ops init: %s is not a struct\n", tname); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->sec_idx = obj->efile.st_ops_shndx; + map->sec_offset = vsi->offset; + map->name = strdup(var_name); + if (!map->name) + return -ENOMEM; + + map->def.type = BPF_MAP_TYPE_STRUCT_OPS; + map->def.key_size = sizeof(int); + map->def.value_size = type->size; + map->def.max_entries = 1; + + map->st_ops = calloc(1, sizeof(*map->st_ops)); + if (!map->st_ops) + return -ENOMEM; + st_ops = map->st_ops; + st_ops->data = malloc(type->size); + st_ops->progs = calloc(btf_vlen(type), sizeof(*st_ops->progs)); + st_ops->kern_func_off = malloc(btf_vlen(type) * + sizeof(*st_ops->kern_func_off)); + if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) + return -ENOMEM; + + if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) { + pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", + var_name, STRUCT_OPS_SEC); + return -EINVAL; + } + + memcpy(st_ops->data, + obj->efile.st_ops_data->d_buf + vsi->offset, + type->size); + st_ops->tname = tname; + st_ops->type = type; + st_ops->type_id = type_id; + + pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", + tname, type_id, var_name, vsi->offset); + } + + return 0; +} + static struct bpf_object *bpf_object__new(const char *path, const void *obj_buf, size_t obj_buf_sz, @@ -607,6 +990,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.data_shndx = -1; obj->efile.rodata_shndx = -1; obj->efile.bss_shndx = -1; + obj->efile.st_ops_shndx = -1; obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); @@ -630,6 +1014,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.data = NULL; obj->efile.rodata = NULL; obj->efile.bss = NULL; + obj->efile.st_ops_data = NULL; zfree(&obj->efile.reloc_sects); obj->efile.nr_reloc_sects = 0; @@ -735,16 +1120,6 @@ bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) return 0; } -static int compare_bpf_map(const void *_a, const void *_b) -{ - const struct bpf_map *a = _a; - const struct bpf_map *b = _b; - - if (a->sec_idx != b->sec_idx) - return a->sec_idx - b->sec_idx; - return a->sec_offset - b->sec_offset; -} - static bool bpf_map_type__is_map_in_map(enum bpf_map_type type) { if (type == BPF_MAP_TYPE_ARRAY_OF_MAPS || @@ -815,6 +1190,9 @@ int bpf_object__section_size(const struct bpf_object *obj, const char *name, } else if (!strcmp(name, RODATA_SEC)) { if (obj->efile.rodata) *size = obj->efile.rodata->d_size; + } else if (!strcmp(name, STRUCT_OPS_SEC)) { + if (obj->efile.st_ops_data) + *size = obj->efile.st_ops_data->d_size; } else { ret = bpf_object_search_section_size(obj, name, &d_size); if (!ret) @@ -898,7 +1276,7 @@ static size_t bpf_map_mmap_sz(const struct bpf_map *map) long page_sz = sysconf(_SC_PAGE_SIZE); size_t map_sz; - map_sz = roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = (size_t)roundup(map->def.value_size, 8) * map->def.max_entries; map_sz = roundup(map_sz, page_sz); return map_sz; } @@ -1440,6 +1818,20 @@ skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) return t; } +static const struct btf_type * +resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t; + + t = skip_mods_and_typedefs(btf, id, NULL); + if (!btf_is_ptr(t)) + return NULL; + + t = skip_mods_and_typedefs(btf, t->type, res_id); + + return btf_is_func_proto(t) ? t : NULL; +} + /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -1787,13 +2179,10 @@ static int bpf_object__init_maps(struct bpf_object *obj, err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); err = err ?: bpf_object__init_kconfig_map(obj); + err = err ?: bpf_object__init_struct_ops_maps(obj); if (err) return err; - if (obj->nr_maps) { - qsort(obj->maps, obj->nr_maps, sizeof(obj->maps[0]), - compare_bpf_map); - } return 0; } @@ -1817,13 +2206,14 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx) static void bpf_object__sanitize_btf(struct bpf_object *obj) { + bool has_func_global = obj->caps.btf_func_global; bool has_datasec = obj->caps.btf_datasec; bool has_func = obj->caps.btf_func; struct btf *btf = obj->btf; struct btf_type *t; int i, j, vlen; - if (!obj->btf || (has_func && has_datasec)) + if (!obj->btf || (has_func && has_datasec && has_func_global)) return; for (i = 1; i <= btf__get_nr_types(btf); i++) { @@ -1871,6 +2261,9 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj) } else if (!has_func && btf_is_func(t)) { /* replace FUNC with TYPEDEF */ t->info = BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0); + } else if (!has_func_global && btf_is_func(t)) { + /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ + t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); } } } @@ -1889,23 +2282,26 @@ static void bpf_object__sanitize_btf_ext(struct bpf_object *obj) static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj) { return obj->efile.btf_maps_shndx >= 0 || - obj->nr_extern > 0; + obj->efile.st_ops_shndx >= 0 || + obj->nr_extern > 0; } static int bpf_object__init_btf(struct bpf_object *obj, Elf_Data *btf_data, Elf_Data *btf_ext_data) { - bool btf_required = bpf_object__is_btf_mandatory(obj); - int err = 0; + int err = -ENOENT; if (btf_data) { obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); if (IS_ERR(obj->btf)) { + err = PTR_ERR(obj->btf); + obj->btf = NULL; pr_warn("Error loading ELF section %s: %d.\n", BTF_ELF_SEC, err); goto out; } + err = 0; } if (btf_ext_data) { if (!obj->btf) { @@ -1923,18 +2319,9 @@ static int bpf_object__init_btf(struct bpf_object *obj, } } out: - if (err || IS_ERR(obj->btf)) { - if (btf_required) - err = err ? : PTR_ERR(obj->btf); - else - err = 0; - if (!IS_ERR_OR_NULL(obj->btf)) - btf__free(obj->btf); - obj->btf = NULL; - } - if (btf_required && !obj->btf) { + if (err && bpf_object__is_btf_mandatory(obj)) { pr_warn("BTF is required, but is missing or corrupted.\n"); - return err == 0 ? -ENOENT : err; + return err; } return 0; } @@ -1963,6 +2350,41 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) return 0; } +static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) +{ + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return true; + + /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs + * also need vmlinux BTF + */ + if (prog->type == BPF_PROG_TYPE_TRACING && !prog->attach_prog_fd) + return true; + + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj) +{ + struct bpf_program *prog; + int err; + + bpf_object__for_each_program(prog, obj) { + if (libbpf_prog_needs_vmlinux_btf(prog)) { + obj->btf_vmlinux = libbpf_find_kernel_btf(); + if (IS_ERR(obj->btf_vmlinux)) { + err = PTR_ERR(obj->btf_vmlinux); + pr_warn("Error loading vmlinux BTF: %d\n", err); + obj->btf_vmlinux = NULL; + return err; + } + return 0; + } + } + + return 0; +} + static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { int err = 0; @@ -2088,6 +2510,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (strcmp(name, RODATA_SEC) == 0) { obj->efile.rodata = data; obj->efile.rodata_shndx = idx; + } else if (strcmp(name, STRUCT_OPS_SEC) == 0) { + obj->efile.st_ops_data = data; + obj->efile.st_ops_shndx = idx; } else { pr_debug("skip section(%d) %s\n", idx, name); } @@ -2097,7 +2522,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) int sec = sh.sh_info; /* points to other section */ /* Only do relo for section with exec instructions */ - if (!section_have_execinstr(obj, sec)) { + if (!section_have_execinstr(obj, sec) && + strcmp(name, ".rel" STRUCT_OPS_SEC)) { pr_debug("skip relo %s(%d) for section(%d)\n", name, idx, sec); continue; @@ -2599,8 +3025,12 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) __u32 key_type_id = 0, value_type_id = 0; int ret; - /* if it's BTF-defined map, we don't need to search for type IDs */ - if (map->sec_idx == obj->efile.btf_maps_shndx) + /* if it's BTF-defined map, we don't need to search for type IDs. + * For struct_ops map, it does not need btf_key_type_id and + * btf_value_type_id. + */ + if (map->sec_idx == obj->efile.btf_maps_shndx || + bpf_map__is_struct_ops(map)) return 0; if (!bpf_map__is_internal(map)) { @@ -2804,6 +3234,32 @@ static int bpf_object__probe_btf_func(struct bpf_object *obj) return 0; } +static int bpf_object__probe_btf_func_global(struct bpf_object *obj) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + int btf_fd; + + btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs)); + if (btf_fd >= 0) { + obj->caps.btf_func_global = 1; + close(btf_fd); + return 1; + } + + return 0; +} + static int bpf_object__probe_btf_datasec(struct bpf_object *obj) { static const char strs[] = "\0x\0.data"; @@ -2859,6 +3315,7 @@ bpf_object__probe_caps(struct bpf_object *obj) bpf_object__probe_name, bpf_object__probe_global_data, bpf_object__probe_btf_func, + bpf_object__probe_btf_func_global, bpf_object__probe_btf_datasec, bpf_object__probe_array_mmap, }; @@ -3025,6 +3482,9 @@ bpf_object__create_maps(struct bpf_object *obj) if (bpf_map_type__is_map_in_map(def->type) && map->inner_map_fd >= 0) create_attr.inner_map_fd = map->inner_map_fd; + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = + map->btf_vmlinux_value_type_id; if (obj->btf && !bpf_map_find_btf_info(obj, map)) { create_attr.btf_fd = btf__fd(obj->btf); @@ -3860,92 +4320,6 @@ static int bpf_core_reloc_insn(struct bpf_program *prog, return 0; } -static struct btf *btf_load_raw(const char *path) -{ - struct btf *btf; - size_t read_cnt; - struct stat st; - void *data; - FILE *f; - - if (stat(path, &st)) - return ERR_PTR(-errno); - - data = malloc(st.st_size); - if (!data) - return ERR_PTR(-ENOMEM); - - f = fopen(path, "rb"); - if (!f) { - btf = ERR_PTR(-errno); - goto cleanup; - } - - read_cnt = fread(data, 1, st.st_size, f); - fclose(f); - if (read_cnt < st.st_size) { - btf = ERR_PTR(-EBADF); - goto cleanup; - } - - btf = btf__new(data, read_cnt); - -cleanup: - free(data); - return btf; -} - -/* - * Probe few well-known locations for vmlinux kernel image and try to load BTF - * data out of it to use for target BTF. - */ -static struct btf *bpf_core_find_kernel_btf(void) -{ - struct { - const char *path_fmt; - bool raw_btf; - } locations[] = { - /* try canonical vmlinux BTF through sysfs first */ - { "/sys/kernel/btf/vmlinux", true /* raw BTF */ }, - /* fall back to trying to find vmlinux ELF on disk otherwise */ - { "/boot/vmlinux-%1$s" }, - { "/lib/modules/%1$s/vmlinux-%1$s" }, - { "/lib/modules/%1$s/build/vmlinux" }, - { "/usr/lib/modules/%1$s/kernel/vmlinux" }, - { "/usr/lib/debug/boot/vmlinux-%1$s" }, - { "/usr/lib/debug/boot/vmlinux-%1$s.debug" }, - { "/usr/lib/debug/lib/modules/%1$s/vmlinux" }, - }; - char path[PATH_MAX + 1]; - struct utsname buf; - struct btf *btf; - int i; - - uname(&buf); - - for (i = 0; i < ARRAY_SIZE(locations); i++) { - snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release); - - if (access(path, R_OK)) - continue; - - if (locations[i].raw_btf) - btf = btf_load_raw(path); - else - btf = btf__parse_elf(path, NULL); - - pr_debug("loading kernel BTF '%s': %ld\n", - path, IS_ERR(btf) ? PTR_ERR(btf) : 0); - if (IS_ERR(btf)) - continue; - - return btf; - } - - pr_warn("failed to find valid kernel BTF\n"); - return ERR_PTR(-ESRCH); -} - /* Output spec definition in the format: * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>, * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b @@ -4180,7 +4554,7 @@ bpf_core_reloc_fields(struct bpf_object *obj, const char *targ_btf_path) if (targ_btf_path) targ_btf = btf__parse_elf(targ_btf_path, NULL); else - targ_btf = bpf_core_find_kernel_btf(); + targ_btf = libbpf_find_kernel_btf(); if (IS_ERR(targ_btf)) { pr_warn("failed to get target BTF: %ld\n", PTR_ERR(targ_btf)); return PTR_ERR(targ_btf); @@ -4252,13 +4626,7 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, size_t new_cnt; int err; - if (prog->idx == obj->efile.text_shndx) { - pr_warn("relo in .text insn %d into off %d (insn #%d)\n", - relo->insn_idx, relo->sym_off, relo->sym_off / 8); - return -LIBBPF_ERRNO__RELOC; - } - - if (prog->main_prog_cnt == 0) { + if (prog->idx != obj->efile.text_shndx && prog->main_prog_cnt == 0) { text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx); if (!text) { pr_warn("no .text section found yet relo into text exist\n"); @@ -4288,6 +4656,7 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, text->insns_cnt, text->section_name, prog->section_name); } + insn = &prog->insns[relo->insn_idx]; insn->imm += relo->sym_off / 8 + prog->main_prog_cnt - relo->insn_idx; return 0; @@ -4367,8 +4736,28 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return err; } } + /* ensure .text is relocated first, as it's going to be copied as-is + * later for sub-program calls + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog->idx != obj->efile.text_shndx) + continue; + + err = bpf_program__relocate(prog, obj); + if (err) { + pr_warn("failed to relocate '%s'\n", prog->section_name); + return err; + } + break; + } + /* now relocate everything but .text, which by now is relocated + * properly, so we can copy raw sub-program instructions as is safely + */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; + if (prog->idx == obj->efile.text_shndx) + continue; err = bpf_program__relocate(prog, obj); if (err) { @@ -4379,6 +4768,10 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return 0; } +static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, + GElf_Shdr *shdr, + Elf_Data *data); + static int bpf_object__collect_reloc(struct bpf_object *obj) { int i, err; @@ -4399,6 +4792,15 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) return -LIBBPF_ERRNO__INTERNAL; } + if (idx == obj->efile.st_ops_shndx) { + err = bpf_object__collect_struct_ops_map_reloc(obj, + shdr, + data); + if (err) + return err; + continue; + } + prog = bpf_object__find_prog_by_idx(obj, idx); if (!prog) { pr_warn("relocation failed: no section(%d)\n", idx); @@ -4433,7 +4835,10 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.insns = insns; load_attr.insns_cnt = insns_cnt; load_attr.license = license; - if (prog->type == BPF_PROG_TYPE_TRACING) { + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + load_attr.attach_btf_id = prog->attach_btf_id; + } else if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { load_attr.attach_prog_fd = prog->attach_prog_fd; load_attr.attach_btf_id = prog->attach_btf_id; } else { @@ -4508,18 +4913,15 @@ out: return ret; } -static int libbpf_find_attach_btf_id(const char *name, - enum bpf_attach_type attach_type, - __u32 attach_prog_fd); +static int libbpf_find_attach_btf_id(struct bpf_program *prog); int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { int err = 0, fd, i, btf_id; - if (prog->type == BPF_PROG_TYPE_TRACING) { - btf_id = libbpf_find_attach_btf_id(prog->section_name, - prog->expected_attach_type, - prog->attach_prog_fd); + if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { + btf_id = libbpf_find_attach_btf_id(prog); if (btf_id <= 0) return btf_id; prog->attach_btf_id = btf_id; @@ -4679,6 +5081,9 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, enum bpf_prog_type prog_type; enum bpf_attach_type attach_type; + if (prog->type != BPF_PROG_TYPE_UNSPEC) + continue; + err = libbpf_prog_type_by_name(prog->section_name, &prog_type, &attach_type); if (err == -ESRCH) @@ -4689,7 +5094,8 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_program__set_type(prog, prog_type); bpf_program__set_expected_attach_type(prog, attach_type); - if (prog_type == BPF_PROG_TYPE_TRACING) + if (prog_type == BPF_PROG_TYPE_TRACING || + prog_type == BPF_PROG_TYPE_EXT) prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); } @@ -4774,8 +5180,11 @@ int bpf_object__unload(struct bpf_object *obj) if (!obj) return -EINVAL; - for (i = 0; i < obj->nr_maps; i++) + for (i = 0; i < obj->nr_maps; i++) { zclose(obj->maps[i].fd); + if (obj->maps[i].st_ops) + zfree(&obj->maps[i].st_ops->kern_vdata); + } for (i = 0; i < obj->nr_programs; i++) bpf_program__unload(&obj->programs[i]); @@ -4891,9 +5300,15 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__relocate(obj, attr->target_btf_path); err = err ? : bpf_object__load_progs(obj, attr->log_level); + + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; + if (err) goto out; @@ -5478,6 +5893,13 @@ void bpf_object__close(struct bpf_object *obj) map->mmaped = NULL; } + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + zfree(&map->name); zfree(&map->pin_path); } @@ -5746,6 +6168,8 @@ BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT); BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); +BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); +BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); enum bpf_attach_type bpf_program__get_expected_attach_type(struct bpf_program *prog) @@ -5845,6 +6269,9 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_FEXIT, .is_attach_btf = true, .attach_fn = attach_trace), + SEC_DEF("freplace/", EXT, + .is_attach_btf = true, + .attach_fn = attach_trace), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), @@ -5899,6 +6326,7 @@ static const struct bpf_sec_def section_defs[] = { BPF_CGROUP_GETSOCKOPT), BPF_EAPROG_SEC("cgroup/setsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT), + BPF_PROG_SEC("struct_ops", BPF_PROG_TYPE_STRUCT_OPS), }; #undef BPF_PROG_SEC_IMPL @@ -5975,34 +6403,182 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return -ESRCH; } -#define BTF_PREFIX "btf_trace_" +static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, + size_t offset) +{ + struct bpf_map *map; + size_t i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + if (!bpf_map__is_struct_ops(map)) + continue; + if (map->sec_offset <= offset && + offset - map->sec_offset < map->def.value_size) + return map; + } + + return NULL; +} + +/* Collect the reloc from ELF and populate the st_ops->progs[] */ +static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, + GElf_Shdr *shdr, + Elf_Data *data) +{ + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_program *prog; + unsigned int shdr_idx; + const struct btf *btf; + struct bpf_map *map; + Elf_Data *symbols; + unsigned int moff; + const char *name; + __u32 member_idx; + GElf_Sym sym; + GElf_Rel rel; + int i, nrels; + + symbols = obj->efile.symbols; + btf = obj->btf; + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + if (!gelf_getrel(data, i, &rel)) { + pr_warn("struct_ops reloc: failed to get %d reloc\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { + pr_warn("struct_ops reloc: symbol %zx not found\n", + (size_t)GELF_R_SYM(rel.r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, + sym.st_name) ? : "<?>"; + map = find_struct_ops_map_by_offset(obj, rel.r_offset); + if (!map) { + pr_warn("struct_ops reloc: cannot find map at rel.r_offset %zu\n", + (size_t)rel.r_offset); + return -EINVAL; + } + + moff = rel.r_offset - map->sec_offset; + shdr_idx = sym.st_shndx; + st_ops = map->st_ops; + pr_debug("struct_ops reloc %s: for %lld value %lld shdr_idx %u rel.r_offset %zu map->sec_offset %zu name %d (\'%s\')\n", + map->name, + (long long)(rel.r_info >> 32), + (long long)sym.st_value, + shdr_idx, (size_t)rel.r_offset, + map->sec_offset, sym.st_name, name); + + if (shdr_idx >= SHN_LORESERVE) { + pr_warn("struct_ops reloc %s: rel.r_offset %zu shdr_idx %u unsupported non-static function\n", + map->name, (size_t)rel.r_offset, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + member = find_member_by_offset(st_ops->type, moff * 8); + if (!member) { + pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", + map->name, moff); + return -EINVAL; + } + member_idx = member - btf_members(st_ops->type); + name = btf__name_by_offset(btf, member->name_off); + + if (!resolve_func_ptr(btf, member->type, NULL)) { + pr_warn("struct_ops reloc %s: cannot relocate non func ptr %s\n", + map->name, name); + return -EINVAL; + } + + prog = bpf_object__find_prog_by_idx(obj, shdr_idx); + if (!prog) { + pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", + map->name, shdr_idx, name); + return -EINVAL; + } + + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + const struct bpf_sec_def *sec_def; + + sec_def = find_sec_def(prog->section_name); + if (sec_def && + sec_def->prog_type != BPF_PROG_TYPE_STRUCT_OPS) { + /* for pr_warn */ + prog->type = sec_def->prog_type; + goto invalid_prog; + } + + prog->type = BPF_PROG_TYPE_STRUCT_OPS; + prog->attach_btf_id = st_ops->type_id; + prog->expected_attach_type = member_idx; + } else if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != member_idx) { + goto invalid_prog; + } + st_ops->progs[member_idx] = prog; + } + + return 0; + +invalid_prog: + pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", + map->name, prog->name, prog->section_name, prog->type, + prog->attach_btf_id, prog->expected_attach_type, name); + return -EINVAL; +} + +#define BTF_TRACE_PREFIX "btf_trace_" +#define BTF_MAX_NAME_SIZE 128 + +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind) +{ + char btf_type_name[BTF_MAX_NAME_SIZE]; + int ret; + + ret = snprintf(btf_type_name, sizeof(btf_type_name), + "%s%s", prefix, name); + /* snprintf returns the number of characters written excluding the + * the terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * indicates truncation. + */ + if (ret < 0 || ret >= sizeof(btf_type_name)) + return -ENAMETOOLONG; + return btf__find_by_name_kind(btf, btf_type_name, kind); +} + +static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, + enum bpf_attach_type attach_type) +{ + int err; + + if (attach_type == BPF_TRACE_RAW_TP) + err = find_btf_by_prefix_kind(btf, BTF_TRACE_PREFIX, name, + BTF_KIND_TYPEDEF); + else + err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + + return err; +} + int libbpf_find_vmlinux_btf_id(const char *name, enum bpf_attach_type attach_type) { - struct btf *btf = bpf_core_find_kernel_btf(); - char raw_tp_btf[128] = BTF_PREFIX; - char *dst = raw_tp_btf + sizeof(BTF_PREFIX) - 1; - const char *btf_name; - int err = -EINVAL; - __u32 kind; + struct btf *btf; + btf = libbpf_find_kernel_btf(); if (IS_ERR(btf)) { pr_warn("vmlinux BTF is not found\n"); return -EINVAL; } - if (attach_type == BPF_TRACE_RAW_TP) { - /* prepend "btf_trace_" prefix per kernel convention */ - strncat(dst, name, sizeof(raw_tp_btf) - sizeof(BTF_PREFIX)); - btf_name = raw_tp_btf; - kind = BTF_KIND_TYPEDEF; - } else { - btf_name = name; - kind = BTF_KIND_FUNC; - } - err = btf__find_by_name_kind(btf, btf_name, kind); - btf__free(btf); - return err; + return __find_vmlinux_btf_id(btf, name, attach_type); } static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) @@ -6038,10 +6614,11 @@ out: return err; } -static int libbpf_find_attach_btf_id(const char *name, - enum bpf_attach_type attach_type, - __u32 attach_prog_fd) +static int libbpf_find_attach_btf_id(struct bpf_program *prog) { + enum bpf_attach_type attach_type = prog->expected_attach_type; + __u32 attach_prog_fd = prog->attach_prog_fd; + const char *name = prog->section_name; int i, err; if (!name) @@ -6056,8 +6633,9 @@ static int libbpf_find_attach_btf_id(const char *name, err = libbpf_find_prog_btf_id(name + section_defs[i].len, attach_prog_fd); else - err = libbpf_find_vmlinux_btf_id(name + section_defs[i].len, - attach_type); + err = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, + name + section_defs[i].len, + attach_type); if (err <= 0) pr_warn("%s is not found in vmlinux BTF\n", name); return err; @@ -6805,6 +7383,58 @@ struct bpf_link *bpf_program__attach(struct bpf_program *prog) return sec_def->attach_fn(sec_def, prog); } +static int bpf_link__detach_struct_ops(struct bpf_link *link) +{ + struct bpf_link_fd *l = (void *)link; + __u32 zero = 0; + + if (bpf_map_delete_elem(l->fd, &zero)) + return -errno; + + return 0; +} + +struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + struct bpf_link_fd *link; + __u32 i, zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map) || map->fd == -1) + return ERR_PTR(-EINVAL); + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-EINVAL); + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } + + err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0); + if (err) { + err = -errno; + free(link); + return ERR_PTR(err); + } + + link->link.detach = bpf_link__detach_struct_ops; + link->fd = map->fd; + + return (struct bpf_link *)link; +} + enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index fe592ef48f1b..2a5e3b087002 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -239,6 +239,8 @@ bpf_program__attach_raw_tracepoint(struct bpf_program *prog, LIBBPF_API struct bpf_link * bpf_program__attach_trace(struct bpf_program *prog); +struct bpf_map; +LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); struct bpf_insn; /* @@ -315,6 +317,8 @@ LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, @@ -335,6 +339,8 @@ LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); /* * No need for __attribute__((packed)), all members of 'bpf_map_def' @@ -354,7 +360,6 @@ struct bpf_map_def { * The 'struct bpf_map' in include/linux/bpf.h is internal to the kernel, * so no need to worry about a name clash. */ -struct bpf_map; LIBBPF_API struct bpf_map * bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); @@ -521,6 +526,7 @@ LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex); +LIBBPF_API bool bpf_probe_large_insn_limit(__u32 ifindex); /* * Get bpf_prog_info in continuous memory diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e9713a574243..b035122142bb 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -213,14 +213,25 @@ LIBBPF_0.0.7 { global: btf_dump__emit_type_decl; bpf_link__disconnect; + bpf_map__attach_struct_ops; + bpf_map_delete_batch; + bpf_map_lookup_and_delete_batch; + bpf_map_lookup_batch; + bpf_map_update_batch; bpf_object__find_program_by_name; bpf_object__attach_skeleton; bpf_object__destroy_skeleton; bpf_object__detach_skeleton; bpf_object__load_skeleton; bpf_object__open_skeleton; + bpf_probe_large_insn_limit; bpf_prog_attach_xattr; bpf_program__attach; bpf_program__name; + bpf_program__is_extension; + bpf_program__is_struct_ops; + bpf_program__set_extension; + bpf_program__set_struct_ops; btf__align_of; + libbpf_find_kernel_btf; } LIBBPF_0.0.6; diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c index 4343e40588c6..0afb51f7a919 100644 --- a/tools/lib/bpf/libbpf_errno.c +++ b/tools/lib/bpf/libbpf_errno.c @@ -13,6 +13,9 @@ #include "libbpf.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + #define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c) #define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index a9eb8b322671..b782ebef6ac9 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -17,6 +17,9 @@ #include "libbpf.h" #include "libbpf_internal.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + static bool grep(const char *buffer, const char *pattern) { return !!strstr(buffer, pattern); @@ -103,6 +106,8 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_EXT: default: break; } @@ -251,6 +256,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) case BPF_MAP_TYPE_XSKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + case BPF_MAP_TYPE_STRUCT_OPS: default: break; } @@ -321,3 +327,24 @@ bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, return res; } + +/* + * Probe for availability of kernel commit (5.3): + * + * c04c0d2b968a ("bpf: increase complexity limit and maximum program size") + */ +bool bpf_probe_large_insn_limit(__u32 ifindex) +{ + struct bpf_insn insns[BPF_MAXINSNS + 1]; + int i; + + for (i = 0; i < BPF_MAXINSNS; i++) + insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); + insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); + + errno = 0; + probe_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0, + ifindex); + + return errno != E2BIG && errno != EINVAL; +} diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 5065c1aa1061..431bd25c6cdb 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -15,6 +15,9 @@ #include "libbpf_internal.h" #include "nlattr.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + #ifndef SOL_NETLINK #define SOL_NETLINK 270 #endif diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index 8db44bbfc66d..0ad41dfea8eb 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -13,6 +13,9 @@ #include <string.h> #include <stdio.h> +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = { [LIBBPF_NLA_U8] = sizeof(uint8_t), [LIBBPF_NLA_U16] = sizeof(uint16_t), diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index b8064eedc177..146da01979c7 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -4,6 +4,9 @@ #include <stdio.h> #include "str_error.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + /* * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl * libc, while checking strerror_r() return to avoid having to check this in diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 8e0ffa800a71..9807903f121e 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -32,6 +32,9 @@ #include "libbpf_internal.h" #include "xsk.h" +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + #ifndef SOL_XDP #define SOL_XDP 283 #endif diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index f3cbf86e51ac..20eed719542e 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1228,8 +1228,10 @@ filter_event(struct tep_event_filter *filter, struct tep_event *event, } filter_type = add_filter_type(filter, event->id); - if (filter_type == NULL) + if (filter_type == NULL) { + free_arg(arg); return TEP_ERRNO__MEM_ALLOC_FAILED; + } if (filter_type->filter) free_arg(filter_type->filter); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 387311c67264..de988589d99b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1076,6 +1076,7 @@ int cmd_report(int argc, const char **argv) struct stat st; bool has_br_stack = false; int branch_mode = -1; + int last_key = 0; bool branch_call_mode = false; #define CALLCHAIN_DEFAULT_OPT "graph,0.5,caller,function,percent" static const char report_callchain_help[] = "Display call graph (stack chain/backtrace):\n\n" @@ -1450,7 +1451,8 @@ repeat: sort_order = sort_tmp; } - if (setup_sorting(session->evlist) < 0) { + if ((last_key != K_SWITCH_INPUT_DATA) && + (setup_sorting(session->evlist) < 0)) { if (sort_order) parse_options_usage(report_usage, options, "s", 1); if (field_order) @@ -1530,6 +1532,7 @@ repeat: ret = __cmd_report(&report); if (ret == K_SWITCH_INPUT_DATA) { perf_session__delete(session); + last_key = K_SWITCH_INPUT_DATA; goto repeat; } else ret = 0; diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c index b9c203219691..49f4f84da485 100644 --- a/tools/perf/examples/bpf/5sec.c +++ b/tools/perf/examples/bpf/5sec.c @@ -39,7 +39,7 @@ Copyright (C) 2018 Red Hat, Inc., Arnaldo Carvalho de Melo <acme@redhat.com> */ -#include <bpf.h> +#include <bpf/bpf.h> int probe(hrtimer_nanosleep, rqtp->tv_sec)(void *ctx, int err, long sec) { diff --git a/tools/perf/examples/bpf/empty.c b/tools/perf/examples/bpf/empty.c index 3776d26db9e7..7d7fb0c9fe76 100644 --- a/tools/perf/examples/bpf/empty.c +++ b/tools/perf/examples/bpf/empty.c @@ -1,3 +1,3 @@ -#include <bpf.h> +#include <bpf/bpf.h> license(GPL); diff --git a/tools/perf/examples/bpf/sys_enter_openat.c b/tools/perf/examples/bpf/sys_enter_openat.c index 9cd124b09392..c4481c390d23 100644 --- a/tools/perf/examples/bpf/sys_enter_openat.c +++ b/tools/perf/examples/bpf/sys_enter_openat.c @@ -14,7 +14,7 @@ * the return value. */ -#include <bpf.h> +#include <bpf/bpf.h> struct syscall_enter_openat_args { unsigned long long unused; diff --git a/tools/perf/include/bpf/pid_filter.h b/tools/perf/include/bpf/pid_filter.h index 6e61c4bdf548..607189a315b2 100644 --- a/tools/perf/include/bpf/pid_filter.h +++ b/tools/perf/include/bpf/pid_filter.h @@ -3,7 +3,7 @@ #ifndef _PERF_BPF_PID_FILTER_ #define _PERF_BPF_PID_FILTER_ -#include <bpf.h> +#include <bpf/bpf.h> #define pid_filter(name) pid_map(name, bool) diff --git a/tools/perf/include/bpf/stdio.h b/tools/perf/include/bpf/stdio.h index 316af5b2ff35..7ca6fa5463ee 100644 --- a/tools/perf/include/bpf/stdio.h +++ b/tools/perf/include/bpf/stdio.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include <bpf.h> +#include <bpf/bpf.h> struct bpf_map SEC("maps") __bpf_stdout__ = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, diff --git a/tools/perf/include/bpf/unistd.h b/tools/perf/include/bpf/unistd.h index ca7877f9a976..d1a35b6c649d 100644 --- a/tools/perf/include/bpf/unistd.h +++ b/tools/perf/include/bpf/unistd.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: LGPL-2.1 -#include <bpf.h> +#include <bpf/bpf.h> static int (*bpf_get_current_pid_tgid)(void) = (void *)BPF_FUNC_get_current_pid_tgid; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 45286900aacb..0aa63aeb58ec 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -339,10 +339,10 @@ static inline void perf_hpp__prepend_sort_field(struct perf_hpp_fmt *format) list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list) #define hists__for_each_format(hists, format) \ - perf_hpp_list__for_each_format((hists)->hpp_list, fmt) + perf_hpp_list__for_each_format((hists)->hpp_list, format) #define hists__for_each_sort_list(hists, format) \ - perf_hpp_list__for_each_sort_list((hists)->hpp_list, fmt) + perf_hpp_list__for_each_sort_list((hists)->hpp_list, format) extern struct perf_hpp_fmt perf_hpp__format[]; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 6658fbf196e6..1965aefccb02 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -920,6 +920,9 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, if (curr_map == NULL) return -1; + if (curr_dso->kernel) + map__kmap(curr_map)->kmaps = kmaps; + if (adjust_kernel_syms) { curr_map->start = shdr->sh_addr + ref_reloc(kmap); curr_map->end = curr_map->start + shdr->sh_size; diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b001c602414b..b45a8f77ee24 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -32,6 +32,7 @@ TARGETS += memory-hotplug TARGETS += mount TARGETS += mqueue TARGETS += net +TARGETS += net/mptcp TARGETS += netfilter TARGETS += networking/timestamping TARGETS += nsfs diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 301ac12d5d69..ec464859c6b6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -22,17 +22,13 @@ get_cgroup_id_user test_skb_cgroup_id_user test_socket_cookie test_cgroup_storage -test_select_reuseport test_flow_dissector flow_dissector_load test_netcnt -test_section_names test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -libbpf.pc -libbpf.so.* test_hashmap test_btf_dump xdping @@ -41,4 +37,4 @@ test_cpp /no_alu32 /bpf_gcc /tools -bpf_helper_defs.h + diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f1f949cd8ed9..5f41f5bd8033 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,8 +20,8 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) \ - -I$(GENDIR) -I$(TOOLSINCDIR) -I$(CURDIR) \ +CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread @@ -73,7 +73,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp + test_lirc_mode2_user xdping test_cpp runqslower TEST_CUSTOM_PROGS = urandom_read @@ -83,20 +83,29 @@ TEST_CUSTOM_PROGS = urandom_read # $3 - target (assumed to be file); only file name will be emitted; # $4 - optional extra arg, emitted as-is, if provided. ifeq ($(V),1) +Q = msg = else -msg = @$(info $(1)$(if $(2), [$(2)]) $(notdir $(3)))$(if $(4), $(4)) +Q = @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 endif # override lib.mk's default rules OVERRIDE_TARGETS := 1 override define CLEAN - $(call msg, CLEAN) + $(call msg,CLEAN) $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) endef include ../lib.mk +SCRATCH_DIR := $(OUTPUT)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +INCLUDE_DIR := $(SCRATCH_DIR)/include +BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a + # Define simple and short `make test_progs`, `make test_sysctl`, etc targets # to build individual tests. # NOTE: Semicolon at the end is critical to override lib.mk's default static @@ -108,18 +117,25 @@ $(notdir $(TEST_GEN_PROGS) \ $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ; $(OUTPUT)/%:%.c - $(call msg, BINARY,,$@) + $(call msg,BINARY,,$@) $(LINK.c) $^ $(LDLIBS) -o $@ $(OUTPUT)/urandom_read: urandom_read.c - $(call msg, BINARY,,$@) - $(CC) -o $@ $< -Wl,--build-id + $(call msg,BINARY,,$@) + $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id -$(OUTPUT)/test_stub.o: test_stub.c - $(call msg, CC,,$@) +$(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) + $(call msg,CC,,$@) $(CC) -c $(CFLAGS) -o $@ $< -BPFOBJ := $(OUTPUT)/libbpf.a +VMLINUX_BTF_PATHS := $(abspath ../../../../vmlinux) \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF:= $(firstword $(wildcard $(VMLINUX_BTF_PATHS))) +$(OUTPUT)/runqslower: $(BPFOBJ) + $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ + OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ + BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) @@ -137,23 +153,22 @@ $(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c -.PHONY: force - -# force a rebuild of BPFOBJ when its dependencies are updated -force: - -DEFAULT_BPFTOOL := $(OUTPUT)/tools/usr/local/sbin/bpftool +DEFAULT_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) +$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ + OUTPUT=$(BUILD_DIR)/bpftool/ \ + prefix= DESTDIR=$(SCRATCH_DIR)/ install -$(DEFAULT_BPFTOOL): force - $(MAKE) -C $(BPFTOOLDIR) DESTDIR=$(OUTPUT)/tools install - -$(BPFOBJ): force - $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ +$(BPFOBJ): $(wildcard $(BPFDIR)/*.c $(BPFDIR)/*.h $(BPFDIR)/Makefile) \ + ../../../include/uapi/linux/bpf.h \ + | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + DESTDIR=$(SCRATCH_DIR) prefix= all install_headers -BPF_HELPERS := $(OUTPUT)/bpf_helper_defs.h $(wildcard $(BPFDIR)/bpf_*.h) -$(OUTPUT)/bpf_helper_defs.h: - $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ $(OUTPUT)/bpf_helper_defs.h +$(BUILD_DIR)/libbpf $(BUILD_DIR)/bpftool $(INCLUDE_DIR): + $(call msg,MKDIR,,$@) + mkdir -p $@ # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, @@ -173,8 +188,8 @@ MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ - -I. -I./include/uapi -I$(APIDIR) \ - -I$(BPFDIR) -I$(abspath $(OUTPUT)/../usr/include) + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(CURDIR)/include/uapi \ + -I$(APIDIR) -I$(abspath $(OUTPUT)/../usr/include) CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -190,28 +205,28 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h # $3 - CFLAGS # $4 - LDFLAGS define CLANG_BPF_BUILD_RULE - $(call msg, CLANG-LLC,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg, CLANG-LLC,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC define CLANG_NATIVE_BPF_BUILD_RULE - $(call msg, CLANG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE - $(call msg, GCC-BPF,$(TRUNNER_BINARY),$2) + $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef @@ -252,6 +267,7 @@ define DEFINE_TEST_RUNNER_RULES ifeq ($($(TRUNNER_OUTPUT)-dir),) $(TRUNNER_OUTPUT)-dir := y $(TRUNNER_OUTPUT): + $$(call msg,MKDIR,,$$@) mkdir -p $$@ endif @@ -262,7 +278,7 @@ $(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ - $$(BPF_HELPERS) | $(TRUNNER_OUTPUT) + $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS), \ $(TRUNNER_BPF_LDFLAGS)) @@ -270,7 +286,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ $(TRUNNER_OUTPUT)/%.o \ | $(BPFTOOL) $(TRUNNER_OUTPUT) - $$(call msg, GEN-SKEL,$(TRUNNER_BINARY),$$@) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) $$(BPFTOOL) gen skeleton $$< > $$@ endif @@ -278,7 +294,7 @@ endif ifeq ($($(TRUNNER_TESTS_DIR)-tests-hdr),) $(TRUNNER_TESTS_DIR)-tests-hdr := y $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c - $$(call msg, TEST-HDR,$(TRUNNER_BINARY),$$@) + $$(call msg,TEST-HDR,$(TRUNNER_BINARY),$$@) $$(shell ( cd $(TRUNNER_TESTS_DIR); \ echo '/* Generated header, do not edit */'; \ ls *.c 2> /dev/null | \ @@ -294,7 +310,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) - $$(call msg, TEST-OBJ,$(TRUNNER_BINARY),$$@) + $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ @@ -302,20 +318,20 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) - $$(call msg, EXTRA-OBJ,$(TRUNNER_BINARY),$$@) + $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ # only copy extra resources if in flavored build $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2,) - $$(call msg, EXTRAS-CP,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) + $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) cp -a $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ | $(TRUNNER_BINARY)-extras - $$(call msg, BINARY,,$$@) + $$(call msg,BINARY,,$$@) $$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ endef @@ -328,7 +344,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := -I. -I$(OUTPUT) $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) @@ -367,15 +383,15 @@ verifier/tests.h: verifier/*.c echo '#endif' \ ) > verifier/tests.h) $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) - $(call msg, BINARY,,$@) + $(call msg,BINARY,,$@) $(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) - $(call msg, CXX,,$@) + $(call msg,CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ -EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) \ +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature $(OUTPUT)/*.o $(OUTPUT)/no_alu32 $(OUTPUT)/bpf_gcc \ - tools *.skel.h + feature \ + $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc) diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h new file mode 100644 index 000000000000..8f21965ffc6c --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BPF_TCP_HELPERS_H +#define __BPF_TCP_HELPERS_H + +#include <stdbool.h> +#include <linux/types.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include "bpf_trace_helpers.h" + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, args) + +#define tcp_jiffies32 ((__u32)bpf_jiffies64()) + +struct sock_common { + unsigned char skc_state; +} __attribute__((preserve_access_index)); + +enum sk_pacing { + SK_PACING_NONE = 0, + SK_PACING_NEEDED = 1, + SK_PACING_FQ = 2, +}; + +struct sock { + struct sock_common __sk_common; + unsigned long sk_pacing_rate; + __u32 sk_pacing_status; /* see enum sk_pacing */ +} __attribute__((preserve_access_index)); + +struct inet_sock { + struct sock sk; +} __attribute__((preserve_access_index)); + +struct inet_connection_sock { + struct inet_sock icsk_inet; + __u8 icsk_ca_state:6, + icsk_ca_setsockopt:1, + icsk_ca_dst_locked:1; + struct { + __u8 pending; + } icsk_ack; + __u64 icsk_ca_priv[104 / sizeof(__u64)]; +} __attribute__((preserve_access_index)); + +struct tcp_sock { + struct inet_connection_sock inet_conn; + + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u8 ecn_flags; + __u32 delivered; + __u32 delivered_ce; + __u32 snd_cwnd; + __u32 snd_cwnd_cnt; + __u32 snd_cwnd_clamp; + __u32 snd_ssthresh; + __u8 syn_data:1, /* SYN includes data */ + syn_fastopen:1, /* SYN includes Fast Open option */ + syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ + syn_fastopen_ch:1, /* Active TFO re-enabling probe */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + save_syn:1, /* Save headers of SYN packet */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + syn_smc:1; /* SYN includes SMC */ + __u32 max_packets_out; + __u32 lsndtime; + __u32 prior_cwnd; + __u64 tcp_mstamp; /* most recent packet received/sent */ +} __attribute__((preserve_access_index)); + +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static __always_inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static __always_inline bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +#define TCP_ECN_OK 1 +#define TCP_ECN_QUEUE_CWR 2 +#define TCP_ECN_DEMAND_CWR 4 +#define TCP_ECN_SEEN 8 + +enum inet_csk_ack_state_t { + ICSK_ACK_SCHED = 1, + ICSK_ACK_TIMER = 2, + ICSK_ACK_PUSHED = 4, + ICSK_ACK_PUSHED2 = 8, + ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ +}; + +enum tcp_ca_event { + CA_EVENT_TX_START = 0, + CA_EVENT_CWND_RESTART = 1, + CA_EVENT_COMPLETE_CWR = 2, + CA_EVENT_LOSS = 3, + CA_EVENT_ECN_NO_CE = 4, + CA_EVENT_ECN_IS_CE = 5, +}; + +enum tcp_ca_state { + TCP_CA_Open = 0, + TCP_CA_Disorder = 1, + TCP_CA_CWR = 2, + TCP_CA_Recovery = 3, + TCP_CA_Loss = 4 +}; + +struct ack_sample { + __u32 pkts_acked; + __s32 rtt_us; + __u32 in_flight; +} __attribute__((preserve_access_index)); + +struct rate_sample { + __u64 prior_mstamp; /* starting timestamp for interval */ + __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + __s32 delivered; /* number of packets delivered over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + __u32 snd_interval_us; /* snd interval for delivered packets */ + __u32 rcv_interval_us; /* rcv interval for delivered packets */ + long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ + int losses; /* number of packets marked lost upon ACK */ + __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ + __u32 prior_in_flight; /* in flight before this ACK */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ +} __attribute__((preserve_access_index)); + +#define TCP_CA_NAME_MAX 16 +#define TCP_CONG_NEEDS_ECN 0x2 + +struct tcp_congestion_ops { + char name[TCP_CA_NAME_MAX]; + __u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); + + /* return slow start threshold (required) */ + __u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked); + /* call before changing ca_state (optional) */ + void (*set_state)(struct sock *sk, __u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ + void (*in_ack_event)(struct sock *sk, __u32 flags); + /* new value of cwnd after loss (required) */ + __u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ + __u32 (*min_tso_segs)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + __u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ + void (*cong_control)(struct sock *sk, const struct rate_sample *rs); +}; + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) +{ + __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); + + acked -= cwnd - tp->snd_cwnd; + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + + return acked; +} + +static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + +static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tcp_in_slow_start(tp)) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); +} + +static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) +{ + /* If credits accumulated at a higher w, apply them gently now. */ + if (tp->snd_cwnd_cnt >= w) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + + tp->snd_cwnd_cnt += acked; + if (tp->snd_cwnd_cnt >= w) { + __u32 delta = tp->snd_cwnd_cnt / w; + + tp->snd_cwnd_cnt -= delta * w; + tp->snd_cwnd += delta; + } + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +#endif diff --git a/tools/testing/selftests/bpf/bpf_trace_helpers.h b/tools/testing/selftests/bpf/bpf_trace_helpers.h index c76a214a53b0..c6f1354d93fb 100644 --- a/tools/testing/selftests/bpf/bpf_trace_helpers.h +++ b/tools/testing/selftests/bpf/bpf_trace_helpers.h @@ -1,58 +1,120 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #ifndef __BPF_TRACE_HELPERS_H #define __BPF_TRACE_HELPERS_H -#include "bpf_helpers.h" - -#define __BPF_MAP_0(i, m, v, ...) v -#define __BPF_MAP_1(i, m, v, t, a, ...) m(t, a, ctx[i]) -#define __BPF_MAP_2(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_1(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_3(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_2(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_4(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_3(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_5(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_4(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_6(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_5(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_7(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_6(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_8(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_7(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_9(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_8(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_10(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_9(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_11(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_10(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP_12(i, m, v, t, a, ...) m(t, a, ctx[i]), __BPF_MAP_11(i+1, m, v, __VA_ARGS__) -#define __BPF_MAP(n, ...) __BPF_MAP_##n(0, __VA_ARGS__) - -/* BPF sizeof(void *) is always 8, so no need to cast to long first - * for ptr to avoid compiler warning. +#include <bpf/bpf_helpers.h> + +#define ___bpf_concat(a, b) a ## b +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#define ___bpf_narg(...) \ + ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define ___bpf_empty(...) \ + ___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) + +#define ___bpf_ctx_cast0() ctx +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast(args...) \ + ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) + +/* + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and + * similar kinds of BPF programs, that accept input arguments as a single + * pointer to untyped u64 array, where each u64 can actually be a typed + * pointer or integer of different size. Instead of requring user to write + * manual casts and work with array elements by index, BPF_PROG macro + * allows user to declare a list of named and typed input arguments in the + * same syntax as for normal C function. All the casting is hidden and + * performed transparently, while user code can just assume working with + * function arguments of specified type and name. + * + * Original raw context argument is preserved as well as 'ctx' argument. + * This is useful when using BPF helpers that expect original context + * as one of the parameters (e.g., for bpf_perf_event_output()). */ -#define __BPF_CAST(t, a, ctx) (t) ctx -#define __BPF_V void -#define __BPF_N - -#define __BPF_DECL_ARGS(t, a, ctx) t a - -#define BPF_TRACE_x(x, sec_name, fname, ret_type, ...) \ -static __always_inline ret_type \ -____##fname(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ - \ -SEC(sec_name) \ -ret_type fname(__u64 *ctx) \ -{ \ - return ____##fname(__BPF_MAP(x, __BPF_CAST, __BPF_N, __VA_ARGS__));\ -} \ - \ -static __always_inline \ -ret_type ____##fname(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) - -#define BPF_TRACE_0(sec, fname, ...) BPF_TRACE_x(0, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_1(sec, fname, ...) BPF_TRACE_x(1, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_2(sec, fname, ...) BPF_TRACE_x(2, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_3(sec, fname, ...) BPF_TRACE_x(3, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_4(sec, fname, ...) BPF_TRACE_x(4, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_5(sec, fname, ...) BPF_TRACE_x(5, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_6(sec, fname, ...) BPF_TRACE_x(6, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_7(sec, fname, ...) BPF_TRACE_x(7, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_8(sec, fname, ...) BPF_TRACE_x(8, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_9(sec, fname, ...) BPF_TRACE_x(9, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_10(sec, fname, ...) BPF_TRACE_x(10, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_11(sec, fname, ...) BPF_TRACE_x(11, sec, fname, int, __VA_ARGS__) -#define BPF_TRACE_12(sec, fname, ...) BPF_TRACE_x(12, sec, fname, int, __VA_ARGS__) +#define BPF_PROG(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_ctx_cast(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args) + +struct pt_regs; + +#define ___bpf_kprobe_args0() ctx +#define ___bpf_kprobe_args1(x) \ + ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) \ + ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) \ + ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) \ + ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) \ + ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args(args...) \ + ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) +/* + * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for + * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific + * low-level way of getting kprobe input arguments from struct pt_regs, and + * provides a familiar typed and named function arguments syntax and + * semantics of accessing kprobe input paremeters. + * + * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + */ +#define BPF_KPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) + +#define ___bpf_kretprobe_args0() ctx +#define ___bpf_kretprobe_argsN(x, args...) \ + ___bpf_kprobe_args(args), (void *)PT_REGS_RET(ctx) +#define ___bpf_kretprobe_args(args...) \ + ___bpf_apply(___bpf_kretprobe_args, ___bpf_empty(args))(args) + +/* + * BPF_KRETPROBE is similar to BPF_KPROBE, except, in addition to listing all + * input kprobe arguments, one last extra argument has to be specified, which + * captures kprobe return value. + */ +#define BPF_KRETPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kretprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #endif diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index ec219f84e041..a3352a64c067 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -6,7 +6,7 @@ #include <stdlib.h> #include <string.h> #include <errno.h> -#include <libbpf.h> /* libbpf_num_possible_cpus */ +#include <bpf/libbpf.h> /* libbpf_num_possible_cpus */ static inline unsigned int bpf_num_possible_cpus(void) { diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c new file mode 100644 index 000000000000..f0a64d8ac59a --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <errno.h> +#include <string.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include <test_maps.h> + +static void map_batch_update(int map_fd, __u32 max_entries, int *keys, + int *values) +{ + int i, err; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + for (i = 0; i < max_entries; i++) { + keys[i] = i; + values[i] = i + 1; + } + + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); +} + +static void map_batch_verify(int *visited, __u32 max_entries, + int *keys, int *values) +{ + int i; + + memset(visited, 0, max_entries * sizeof(*visited)); + for (i = 0; i < max_entries; i++) { + CHECK(keys[i] + 1 != values[i], "key/value checking", + "error: i %d key %d value %d\n", i, keys[i], values[i]); + visited[i] = 1; + } + for (i = 0; i < max_entries; i++) { + CHECK(visited[i] != 1, "visited checking", + "error: keys array at index %d missing\n", i); + } +} + +void test_array_map_batch_ops(void) +{ + struct bpf_create_map_attr xattr = { + .name = "array_map", + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + }; + int map_fd, *keys, *values, *visited; + __u32 count, total, total_success; + const __u32 max_entries = 10; + bool nospace_err; + __u64 batch = 0; + int err, step; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + xattr.max_entries = max_entries; + map_fd = bpf_create_map_xattr(&xattr); + CHECK(map_fd == -1, + "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); + + keys = malloc(max_entries * sizeof(int)); + values = malloc(max_entries * sizeof(int)); + visited = malloc(max_entries * sizeof(int)); + CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", + strerror(errno)); + + /* populate elements to the map */ + map_batch_update(map_fd, max_entries, keys, values); + + /* test 1: lookup in a loop with various steps. */ + total_success = 0; + for (step = 1; step < max_entries; step++) { + map_batch_update(map_fd, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * sizeof(*values)); + batch = 0; + total = 0; + /* iteratively lookup/delete elements with 'step' + * elements each. + */ + count = step; + nospace_err = false; + while (true) { + err = bpf_map_lookup_batch(map_fd, + total ? &batch : NULL, &batch, + keys + total, + values + total, + &count, &opts); + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + + } + + if (nospace_err == true) + continue; + + CHECK(total != max_entries, "lookup with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + map_batch_verify(visited, max_entries, keys, values); + + total_success++; + } + + CHECK(total_success == 0, "check total_success", + "unexpected failure\n"); + + printf("%s:PASS\n", __func__); + + free(keys); + free(values); + free(visited); +} diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c new file mode 100644 index 000000000000..976bf415fbdd --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <stdio.h> +#include <errno.h> +#include <string.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include <bpf_util.h> +#include <test_maps.h> + +static void map_batch_update(int map_fd, __u32 max_entries, int *keys, + void *values, bool is_pcpu) +{ + typedef BPF_DECLARE_PERCPU(int, value); + value *v = NULL; + int i, j, err; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + if (is_pcpu) + v = (value *)values; + + for (i = 0; i < max_entries; i++) { + keys[i] = i + 1; + if (is_pcpu) + for (j = 0; j < bpf_num_possible_cpus(); j++) + bpf_percpu(v[i], j) = i + 2 + j; + else + ((int *)values)[i] = i + 2; + } + + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); +} + +static void map_batch_verify(int *visited, __u32 max_entries, + int *keys, void *values, bool is_pcpu) +{ + typedef BPF_DECLARE_PERCPU(int, value); + value *v = NULL; + int i, j; + + if (is_pcpu) + v = (value *)values; + + memset(visited, 0, max_entries * sizeof(*visited)); + for (i = 0; i < max_entries; i++) { + + if (is_pcpu) { + for (j = 0; j < bpf_num_possible_cpus(); j++) { + CHECK(keys[i] + 1 + j != bpf_percpu(v[i], j), + "key/value checking", + "error: i %d j %d key %d value %d\n", + i, j, keys[i], bpf_percpu(v[i], j)); + } + } else { + CHECK(keys[i] + 1 != ((int *)values)[i], + "key/value checking", + "error: i %d key %d value %d\n", i, keys[i], + ((int *)values)[i]); + } + + visited[i] = 1; + + } + for (i = 0; i < max_entries; i++) { + CHECK(visited[i] != 1, "visited checking", + "error: keys array at index %d missing\n", i); + } +} + +void __test_map_lookup_and_delete_batch(bool is_pcpu) +{ + __u32 batch, count, total, total_success; + typedef BPF_DECLARE_PERCPU(int, value); + int map_fd, *keys, *visited, key; + const __u32 max_entries = 10; + value pcpu_values[max_entries]; + int err, step, value_size; + bool nospace_err; + void *values; + struct bpf_create_map_attr xattr = { + .name = "hash_map", + .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH : + BPF_MAP_TYPE_HASH, + .key_size = sizeof(int), + .value_size = sizeof(int), + }; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + xattr.max_entries = max_entries; + map_fd = bpf_create_map_xattr(&xattr); + CHECK(map_fd == -1, + "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); + + value_size = is_pcpu ? sizeof(value) : sizeof(int); + keys = malloc(max_entries * sizeof(int)); + if (is_pcpu) + values = pcpu_values; + else + values = malloc(max_entries * sizeof(int)); + visited = malloc(max_entries * sizeof(int)); + CHECK(!keys || !values || !visited, "malloc()", + "error:%s\n", strerror(errno)); + + /* test 1: lookup/delete an empty hash table, -ENOENT */ + count = max_entries; + err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys, + values, &count, &opts); + CHECK((err && errno != ENOENT), "empty map", + "error: %s\n", strerror(errno)); + + /* populate elements to the map */ + map_batch_update(map_fd, max_entries, keys, values, is_pcpu); + + /* test 2: lookup/delete with count = 0, success */ + count = 0; + err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys, + values, &count, &opts); + CHECK(err, "count = 0", "error: %s\n", strerror(errno)); + + /* test 3: lookup/delete with count = max_entries, success */ + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * value_size); + count = max_entries; + err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys, + values, &count, &opts); + CHECK((err && errno != ENOENT), "count = max_entries", + "error: %s\n", strerror(errno)); + CHECK(count != max_entries, "count = max_entries", + "count = %u, max_entries = %u\n", count, max_entries); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); + + /* bpf_map_get_next_key() should return -ENOENT for an empty map. */ + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno)); + + /* test 4: lookup/delete in a loop with various steps. */ + total_success = 0; + for (step = 1; step < max_entries; step++) { + map_batch_update(map_fd, max_entries, keys, values, is_pcpu); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * value_size); + total = 0; + /* iteratively lookup/delete elements with 'step' + * elements each + */ + count = step; + nospace_err = false; + while (true) { + err = bpf_map_lookup_batch(map_fd, + total ? &batch : NULL, + &batch, keys + total, + values + + total * value_size, + &count, &opts); + /* It is possible that we are failing due to buffer size + * not big enough. In such cases, let us just exit and + * go with large steps. Not that a buffer size with + * max_entries should always work. + */ + if (err && errno == ENOSPC) { + nospace_err = true; + break; + } + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + + } + if (nospace_err == true) + continue; + + CHECK(total != max_entries, "lookup with steps", + "total = %u, max_entries = %u\n", total, max_entries); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); + + total = 0; + count = step; + while (total < max_entries) { + if (max_entries - total < step) + count = max_entries - total; + err = bpf_map_delete_batch(map_fd, + keys + total, + &count, &opts); + CHECK((err && errno != ENOENT), "delete batch", + "error: %s\n", strerror(errno)); + total += count; + if (err) + break; + } + CHECK(total != max_entries, "delete with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + /* check map is empty, errono == ENOENT */ + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", + "error: %s\n", strerror(errno)); + + /* iteratively lookup/delete elements with 'step' + * elements each + */ + map_batch_update(map_fd, max_entries, keys, values, is_pcpu); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * value_size); + total = 0; + count = step; + nospace_err = false; + while (true) { + err = bpf_map_lookup_and_delete_batch(map_fd, + total ? &batch : NULL, + &batch, keys + total, + values + + total * value_size, + &count, &opts); + /* It is possible that we are failing due to buffer size + * not big enough. In such cases, let us just exit and + * go with large steps. Not that a buffer size with + * max_entries should always work. + */ + if (err && errno == ENOSPC) { + nospace_err = true; + break; + } + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + } + + if (nospace_err == true) + continue; + + CHECK(total != max_entries, "lookup/delete with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + map_batch_verify(visited, max_entries, keys, values, is_pcpu); + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", + strerror(errno)); + + total_success++; + } + + CHECK(total_success == 0, "check total_success", + "unexpected failure\n"); + free(keys); + free(visited); + if (!is_pcpu) + free(values); +} + +void htab_map_batch_ops(void) +{ + __test_map_lookup_and_delete_batch(false); + printf("test_%s:PASS\n", __func__); +} + +void htab_percpu_map_batch_ops(void) +{ + __test_map_lookup_and_delete_batch(true); + printf("test_%s:PASS\n", __func__); +} + +void test_htab_map_batch_ops(void) +{ + htab_map_batch_ops(); + htab_percpu_map_batch_ops(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c new file mode 100644 index 000000000000..8482bbc67eec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <linux/err.h> +#include <test_progs.h> +#include "bpf_dctcp.skel.h" +#include "bpf_cubic.skel.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +static const unsigned int total_bytes = 10 * 1024 * 1024; +static const struct timeval timeo_sec = { .tv_sec = 10 }; +static const size_t timeo_optlen = sizeof(timeo_sec); +static int stop, duration; + +static int settimeo(int fd) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, + timeo_optlen); + if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n", + errno)) + return -1; + + err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, + timeo_optlen); + if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n", + errno)) + return -1; + + return 0; +} + +static int settcpca(int fd, const char *tcp_ca) +{ + int err; + + err = setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, tcp_ca, strlen(tcp_ca)); + if (CHECK(err == -1, "setsockopt(fd, TCP_CONGESTION)", "errno:%d\n", + errno)) + return -1; + + return 0; +} + +static void *server(void *arg) +{ + int lfd = (int)(long)arg, err = 0, fd; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + + fd = accept(lfd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd)) { + err = -errno; + goto done; + } + + while (bytes < total_bytes && !READ_ONCE(stop)) { + nr_sent = send(fd, &batch, + min(total_bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + CHECK(bytes != total_bytes, "send", "%zd != %u nr_sent:%zd errno:%d\n", + bytes, total_bytes, nr_sent, errno); + +done: + if (fd != -1) + close(fd); + if (err) { + WRITE_ONCE(stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +static void do_test(const char *tcp_ca) +{ + struct sockaddr_in6 sa6 = {}; + ssize_t nr_recv = 0, bytes = 0; + int lfd = -1, fd = -1; + pthread_t srv_thread; + socklen_t addrlen = sizeof(sa6); + void *thread_ret; + char batch[1500]; + int err; + + WRITE_ONCE(stop, 0); + + lfd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(lfd == -1, "socket", "errno:%d\n", errno)) + return; + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(fd == -1, "socket", "errno:%d\n", errno)) { + close(lfd); + return; + } + + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || + settimeo(lfd) || settimeo(fd)) + goto done; + + /* bind, listen and start server thread to accept */ + sa6.sin6_family = AF_INET6; + sa6.sin6_addr = in6addr_loopback; + err = bind(lfd, (struct sockaddr *)&sa6, addrlen); + if (CHECK(err == -1, "bind", "errno:%d\n", errno)) + goto done; + err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); + if (CHECK(err == -1, "getsockname", "errno:%d\n", errno)) + goto done; + err = listen(lfd, 1); + if (CHECK(err == -1, "listen", "errno:%d\n", errno)) + goto done; + err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); + if (CHECK(err != 0, "pthread_create", "err:%d\n", err)) + goto done; + + /* connect to server */ + err = connect(fd, (struct sockaddr *)&sa6, addrlen); + if (CHECK(err == -1, "connect", "errno:%d\n", errno)) + goto wait_thread; + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(stop)) { + nr_recv = recv(fd, &batch, + min(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) + break; + bytes += nr_recv; + } + + CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n", + bytes, total_bytes, nr_recv, errno); + +wait_thread: + WRITE_ONCE(stop, 1); + pthread_join(srv_thread, &thread_ret); + CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld", + PTR_ERR(thread_ret)); +done: + close(lfd); + close(fd); +} + +static void test_cubic(void) +{ + struct bpf_cubic *cubic_skel; + struct bpf_link *link; + + cubic_skel = bpf_cubic__open_and_load(); + if (CHECK(!cubic_skel, "bpf_cubic__open_and_load", "failed\n")) + return; + + link = bpf_map__attach_struct_ops(cubic_skel->maps.cubic); + if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n", + PTR_ERR(link))) { + bpf_cubic__destroy(cubic_skel); + return; + } + + do_test("bpf_cubic"); + + bpf_link__destroy(link); + bpf_cubic__destroy(cubic_skel); +} + +static void test_dctcp(void) +{ + struct bpf_dctcp *dctcp_skel; + struct bpf_link *link; + + dctcp_skel = bpf_dctcp__open_and_load(); + if (CHECK(!dctcp_skel, "bpf_dctcp__open_and_load", "failed\n")) + return; + + link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n", + PTR_ERR(link))) { + bpf_dctcp__destroy(dctcp_skel); + return; + } + + do_test("bpf_dctcp"); + + bpf_link__destroy(link); + bpf_dctcp__destroy(dctcp_skel); +} + +void test_bpf_tcp_ca(void) +{ + if (test__start_subtest("dctcp")) + test_dctcp(); + if (test__start_subtest("cubic")) + test_cubic(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index 9486c13af6b2..e9f2f12ba06b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -48,6 +48,8 @@ void test_bpf_verif_scale(void) { "test_verif_scale2.o", BPF_PROG_TYPE_SCHED_CLS }, { "test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS }, + { "pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + /* full unroll by llvm */ { "pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "pyperf100.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, diff --git a/tools/testing/selftests/bpf/prog_tests/cpu_mask.c b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c index 1fa1bdbaffa9..f7c7e25232be 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpu_mask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include <bpf/btf.h> -#include "libbpf_internal.h" +#include "bpf/libbpf_internal.h" static int duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index b426bf2f97e4..db5c74d2ce6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -26,7 +26,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, link = calloc(sizeof(struct bpf_link *), prog_cnt); prog = calloc(sizeof(struct bpf_program *), prog_cnt); - result = malloc(prog_cnt * sizeof(u64)); + result = malloc((prog_cnt + 32 /* spare */) * sizeof(u64)); if (CHECK(!link || !prog || !result, "alloc_memory", "failed to alloc memory")) goto close_prog; @@ -98,6 +98,24 @@ static void test_target_yes_callees(void) "fexit/test_pkt_access", "fexit/test_pkt_access_subprog1", "fexit/test_pkt_access_subprog2", + "fexit/test_pkt_access_subprog3", + }; + test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", + "./test_pkt_access.o", + ARRAY_SIZE(prog_name), + prog_name); +} + +static void test_func_replace(void) +{ + const char *prog_name[] = { + "fexit/test_pkt_access", + "fexit/test_pkt_access_subprog1", + "fexit/test_pkt_access_subprog2", + "fexit/test_pkt_access_subprog3", + "freplace/get_skb_len", + "freplace/get_skb_ifindex", + "freplace/get_constant", }; test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", "./test_pkt_access.o", @@ -109,4 +127,5 @@ void test_fexit_bpf2bpf(void) { test_target_no_callees(); test_target_yes_callees(); + test_func_replace(); } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index cf6c87936c69..1450ea2dd4cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -4,7 +4,7 @@ #include <sched.h> #include <sys/socket.h> #include <test_progs.h> -#include "libbpf_internal.h" +#include "bpf/libbpf_internal.h" static void on_sample(void *ctx, int cpu, void *data, __u32 size) { diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b607112c64e7..504abb7bfb95 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include "test_send_signal_kern.skel.h" static volatile int sigusr1_received = 0; @@ -9,17 +10,15 @@ static void sigusr1_handler(int signum) } static void test_send_signal_common(struct perf_event_attr *attr, - int prog_type, + bool signal_thread, const char *test_name) { - int err = -1, pmu_fd, prog_fd, info_map_fd, status_map_fd; - const char *file = "./test_send_signal_kern.o"; - struct bpf_object *obj = NULL; + struct test_send_signal_kern *skel; int pipe_c2p[2], pipe_p2c[2]; - __u32 key = 0, duration = 0; + int err = -1, pmu_fd = -1; + __u32 duration = 0; char buf[256]; pid_t pid; - __u64 val; if (CHECK(pipe(pipe_c2p), test_name, "pipe pipe_c2p error: %s\n", strerror(errno))) @@ -73,45 +72,39 @@ static void test_send_signal_common(struct perf_event_attr *attr, close(pipe_c2p[1]); /* close write */ close(pipe_p2c[0]); /* close read */ - err = bpf_prog_load(file, prog_type, &obj, &prog_fd); - if (CHECK(err < 0, test_name, "bpf_prog_load error: %s\n", - strerror(errno))) - goto prog_load_failure; - - pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1, - -1 /* group id */, 0 /* flags */); - if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n", - strerror(errno))) { - err = -1; - goto close_prog; - } + skel = test_send_signal_kern__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n")) + goto skel_open_load_failure; - err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); - if (CHECK(err < 0, test_name, "ioctl perf_event_ioc_enable error: %s\n", - strerror(errno))) - goto disable_pmu; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); - if (CHECK(err < 0, test_name, "ioctl perf_event_ioc_set_bpf error: %s\n", - strerror(errno))) - goto disable_pmu; - - err = -1; - info_map_fd = bpf_object__find_map_fd_by_name(obj, "info_map"); - if (CHECK(info_map_fd < 0, test_name, "find map %s error\n", "info_map")) - goto disable_pmu; + if (!attr) { + err = test_send_signal_kern__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed\n")) { + err = -1; + goto destroy_skel; + } + } else { + pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1, + -1 /* group id */, 0 /* flags */); + if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n", + strerror(errno))) { + err = -1; + goto destroy_skel; + } - status_map_fd = bpf_object__find_map_fd_by_name(obj, "status_map"); - if (CHECK(status_map_fd < 0, test_name, "find map %s error\n", "status_map")) - goto disable_pmu; + skel->links.send_signal_perf = + bpf_program__attach_perf_event(skel->progs.send_signal_perf, pmu_fd); + if (CHECK(IS_ERR(skel->links.send_signal_perf), "attach_perf_event", + "err %ld\n", PTR_ERR(skel->links.send_signal_perf))) + goto disable_pmu; + } /* wait until child signal handler installed */ read(pipe_c2p[0], buf, 1); /* trigger the bpf send_signal */ - key = 0; - val = (((__u64)(SIGUSR1)) << 32) | pid; - bpf_map_update_elem(info_map_fd, &key, &val, 0); + skel->bss->pid = pid; + skel->bss->sig = SIGUSR1; + skel->bss->signal_thread = signal_thread; /* notify child that bpf program can send_signal now */ write(pipe_p2c[1], buf, 1); @@ -132,46 +125,20 @@ static void test_send_signal_common(struct perf_event_attr *attr, disable_pmu: close(pmu_fd); -close_prog: - bpf_object__close(obj); -prog_load_failure: +destroy_skel: + test_send_signal_kern__destroy(skel); +skel_open_load_failure: close(pipe_c2p[0]); close(pipe_p2c[1]); wait(NULL); } -static void test_send_signal_tracepoint(void) +static void test_send_signal_tracepoint(bool signal_thread) { - const char *id_path = "/sys/kernel/debug/tracing/events/syscalls/sys_enter_nanosleep/id"; - struct perf_event_attr attr = { - .type = PERF_TYPE_TRACEPOINT, - .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN, - .sample_period = 1, - .wakeup_events = 1, - }; - __u32 duration = 0; - int bytes, efd; - char buf[256]; - - efd = open(id_path, O_RDONLY, 0); - if (CHECK(efd < 0, "tracepoint", - "open syscalls/sys_enter_nanosleep/id failure: %s\n", - strerror(errno))) - return; - - bytes = read(efd, buf, sizeof(buf)); - close(efd); - if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "tracepoint", - "read syscalls/sys_enter_nanosleep/id failure: %s\n", - strerror(errno))) - return; - - attr.config = strtol(buf, NULL, 0); - - test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint"); + test_send_signal_common(NULL, signal_thread, "tracepoint"); } -static void test_send_signal_perf(void) +static void test_send_signal_perf(bool signal_thread) { struct perf_event_attr attr = { .sample_period = 1, @@ -179,15 +146,13 @@ static void test_send_signal_perf(void) .config = PERF_COUNT_SW_CPU_CLOCK, }; - test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, - "perf_sw_event"); + test_send_signal_common(&attr, signal_thread, "perf_sw_event"); } -static void test_send_signal_nmi(void) +static void test_send_signal_nmi(bool signal_thread) { struct perf_event_attr attr = { - .sample_freq = 50, - .freq = 1, + .sample_period = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; @@ -210,16 +175,21 @@ static void test_send_signal_nmi(void) close(pmu_fd); } - test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, - "perf_hw_event"); + test_send_signal_common(&attr, signal_thread, "perf_hw_event"); } void test_send_signal(void) { if (test__start_subtest("send_signal_tracepoint")) - test_send_signal_tracepoint(); + test_send_signal_tracepoint(false); if (test__start_subtest("send_signal_perf")) - test_send_signal_perf(); + test_send_signal_perf(false); if (test__start_subtest("send_signal_nmi")) - test_send_signal_nmi(); + test_send_signal_nmi(false); + if (test__start_subtest("send_signal_tracepoint_thread")) + test_send_signal_tracepoint(true); + if (test__start_subtest("send_signal_perf_thread")) + test_send_signal_perf(true); + if (test__start_subtest("send_signal_nmi_thread")) + test_send_signal_nmi(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 8974450a4bdb..f002e3090d92 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -49,8 +49,12 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (CHECK(pmu_fd < 0, "perf_event_open", - "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n", + if (pmu_fd < 0 && errno == ENOENT) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); + test__skip(); + goto cleanup; + } + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd, errno)) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c new file mode 100644 index 000000000000..25b068591e9a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> + +const char *err_str; +bool found; + +static int libbpf_debug_print(enum libbpf_print_level level, + const char *format, va_list args) +{ + char *log_buf; + + if (level != LIBBPF_WARN || + strcmp(format, "libbpf: \n%s\n")) { + vprintf(format, args); + return 0; + } + + log_buf = va_arg(args, char *); + if (!log_buf) + goto out; + if (strstr(log_buf, err_str) == 0) + found = true; +out: + printf(format, log_buf); + return 0; +} + +extern int extra_prog_load_log_flags; + +static int check_load(const char *file) +{ + struct bpf_prog_load_attr attr; + struct bpf_object *obj = NULL; + int err, prog_fd; + + memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); + attr.file = file; + attr.prog_type = BPF_PROG_TYPE_UNSPEC; + attr.log_level = extra_prog_load_log_flags; + attr.prog_flags = BPF_F_TEST_RND_HI32; + found = false; + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + bpf_object__close(obj); + return err; +} + +struct test_def { + const char *file; + const char *err_str; +}; + +void test_test_global_funcs(void) +{ + struct test_def tests[] = { + { "test_global_func1.o", "combined stack size of 4 calls is 544" }, + { "test_global_func2.o" }, + { "test_global_func3.o" , "the call stack of 8 frames" }, + { "test_global_func4.o" }, + { "test_global_func5.o" , "expected pointer to ctx, but got PTR" }, + { "test_global_func6.o" , "modified ctx ptr R2" }, + { "test_global_func7.o" , "foo() doesn't return scalar" }, + }; + libbpf_print_fn_t old_print_fn = NULL; + int err, i, duration = 0; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + const struct test_def *test = &tests[i]; + + if (!test__start_subtest(test->file)) + continue; + + err_str = test->err_str; + err = check_load(test->file); + CHECK_FAIL(!!err ^ !!err_str); + if (err_str) + CHECK(found, "", "expected string '%s'", err_str); + } + libbpf_set_print(old_print_fn); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index c32aa28bd93f..465b371a561d 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019 Facebook */ #define _GNU_SOURCE #include <sched.h> +#include <sys/prctl.h> #include <test_progs.h> #define MAX_CNT 100000 @@ -17,7 +18,7 @@ static __u64 time_get_ns(void) static int test_task_rename(const char *prog) { int i, fd, duration = 0, err; - char buf[] = "test\n"; + char buf[] = "test_overhead"; __u64 start_time; fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); @@ -66,6 +67,10 @@ void test_test_overhead(void) struct bpf_object *obj; struct bpf_link *link; int err, duration = 0; + char comm[16] = {}; + + if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L))) + return; obj = bpf_object__open_file("./test_overhead.o", NULL); if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) @@ -138,5 +143,6 @@ void test_test_overhead(void) test_run("fexit"); bpf_link__destroy(link); cleanup: + prctl(PR_SET_NAME, comm, 0L, 0L, 0L); bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c new file mode 100644 index 000000000000..6b56bdc73ebc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <net/if.h> +#include "test_xdp.skel.h" +#include "test_xdp_bpf2bpf.skel.h" + +void test_xdp_bpf2bpf(void) +{ + __u32 duration = 0, retval, size; + char buf[128]; + int err, pkt_fd, map_fd; + struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); + struct iptnl_info value4 = {.family = AF_INET}; + struct test_xdp *pkt_skel = NULL; + struct test_xdp_bpf2bpf *ftrace_skel = NULL; + struct vip key4 = {.protocol = 6, .family = AF_INET}; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + + /* Load XDP program to introspect */ + pkt_skel = test_xdp__open_and_load(); + if (CHECK(!pkt_skel, "pkt_skel_load", "test_xdp skeleton failed\n")) + return; + + pkt_fd = bpf_program__fd(pkt_skel->progs._xdp_tx_iptunnel); + + map_fd = bpf_map__fd(pkt_skel->maps.vip2tnl); + bpf_map_update_elem(map_fd, &key4, &value4, 0); + + /* Load trace program */ + opts.attach_prog_fd = pkt_fd, + ftrace_skel = test_xdp_bpf2bpf__open_opts(&opts); + if (CHECK(!ftrace_skel, "__open", "ftrace skeleton failed\n")) + goto out; + + err = test_xdp_bpf2bpf__load(ftrace_skel); + if (CHECK(err, "__load", "ftrace skeleton failed\n")) + goto out; + + err = test_xdp_bpf2bpf__attach(ftrace_skel); + if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) + goto out; + + /* Run test program */ + err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + + if (CHECK(err || retval != XDP_TX || size != 74 || + iph->protocol != IPPROTO_IPIP, "ipv4", + "err %d errno %d retval %d size %d\n", + err, errno, retval, size)) + goto out; + + /* Verify test results */ + if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), + "result", "fentry failed err %llu\n", + ftrace_skel->bss->test_result_fentry)) + goto out; + + CHECK(ftrace_skel->bss->test_result_fexit != XDP_TX, "result", + "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); + +out: + test_xdp__destroy(pkt_skel); + test_xdp_bpf2bpf__destroy(ftrace_skel); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c new file mode 100644 index 000000000000..7897c8f4d363 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* WARNING: This implemenation is not necessarily the same + * as the tcp_cubic.c. The purpose is mainly for testing + * the kernel BPF logic. + * + * Highlights: + * 1. CONFIG_HZ .kconfig map is used. + * 2. In bictcp_update(), calculation is changed to use usec + * resolution (i.e. USEC_PER_JIFFY) instead of using jiffies. + * Thus, usecs_to_jiffies() is not used in the bpf_cubic.c. + * 3. In bitctcp_update() [under tcp_friendliness], the original + * "while (ca->ack_cnt > delta)" loop is changed to the equivalent + * "ca->ack_cnt / delta" operation. + */ + +#include <linux/bpf.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +/* Two methods of hybrid slow start */ +#define HYSTART_ACK_TRAIN 0x1 +#define HYSTART_DELAY 0x2 + +/* Number of delay samples for detecting the increase of delay */ +#define HYSTART_MIN_SAMPLES 8 +#define HYSTART_DELAY_MIN (4000U) /* 4ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ +#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) + +static int fast_convergence = 1; +static const int beta = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh; +static const int bic_scale = 41; +static int tcp_friendliness = 1; + +static int hystart = 1; +static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY; +static int hystart_low_window = 16; +static int hystart_ack_delta_us = 2000; + +static const __u32 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ +static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 + / (BICTCP_BETA_SCALE - beta); +/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + +/* 1/c * 2^2*bictcp_HZ * srtt, 2^40 */ +static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) + / (bic_scale * 10); + +/* BIC TCP Parameters */ +struct bictcp { + __u32 cnt; /* increase cwnd by 1 after ACKs */ + __u32 last_max_cwnd; /* last maximum snd_cwnd */ + __u32 last_cwnd; /* the last snd_cwnd */ + __u32 last_time; /* time when updated last_cwnd */ + __u32 bic_origin_point;/* origin point of bic function */ + __u32 bic_K; /* time to origin point + from the beginning of the current epoch */ + __u32 delay_min; /* min delay (usec) */ + __u32 epoch_start; /* beginning of an epoch */ + __u32 ack_cnt; /* number of acks */ + __u32 tcp_cwnd; /* estimated tcp cwnd */ + __u16 unused; + __u8 sample_cnt; /* number of samples to decide curr_rtt */ + __u8 found; /* the exit point is found? */ + __u32 round_start; /* beginning of each round */ + __u32 end_seq; /* end_seq of the round */ + __u32 last_ack; /* last time when the ACK spacing is close */ + __u32 curr_rtt; /* the minimum rtt of current round */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; + ca->found = 0; +} + +extern unsigned long CONFIG_HZ __kconfig; +#define HZ CONFIG_HZ +#define USEC_PER_MSEC 1000UL +#define USEC_PER_SEC 1000000UL +#define USEC_PER_JIFFY (USEC_PER_SEC / HZ) + +static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +#define div64_ul div64_u64 + +#define BITS_PER_U64 (sizeof(__u64) * 8) +static __always_inline int fls64(__u64 x) +{ + int num = BITS_PER_U64 - 1; + + if (x == 0) + return 0; + + if (!(x & (~0ull << (BITS_PER_U64-32)))) { + num -= 32; + x <<= 32; + } + if (!(x & (~0ull << (BITS_PER_U64-16)))) { + num -= 16; + x <<= 16; + } + if (!(x & (~0ull << (BITS_PER_U64-8)))) { + num -= 8; + x <<= 8; + } + if (!(x & (~0ull << (BITS_PER_U64-4)))) { + num -= 4; + x <<= 4; + } + if (!(x & (~0ull << (BITS_PER_U64-2)))) { + num -= 2; + x <<= 2; + } + if (!(x & (~0ull << (BITS_PER_U64-1)))) + num -= 1; + + return num + 1; +} + +static __always_inline __u32 bictcp_clock_us(const struct sock *sk) +{ + return tcp_sk(sk)->tcp_mstamp; +} + +static __always_inline void bictcp_hystart_reset(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->round_start = ca->last_ack = bictcp_clock_us(sk); + ca->end_seq = tp->snd_nxt; + ca->curr_rtt = ~0U; + ca->sample_cnt = 0; +} + +/* "struct_ops/" prefix is not a requirement + * It will be recognized as BPF_PROG_TYPE_STRUCT_OPS + * as long as it is used in one of the func ptr + * under SEC(".struct_ops"). + */ +SEC("struct_ops/bictcp_init") +void BPF_PROG(bictcp_init, struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + bictcp_reset(ca); + + if (hystart) + bictcp_hystart_reset(sk); + + if (!hystart && initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* No prefix in SEC will also work. + * The remaining tcp-cubic functions have an easier way. + */ +SEC("no-sec-prefix-bictcp_cwnd_event") +void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + struct bictcp *ca = inet_csk_ca(sk); + __u32 now = tcp_jiffies32; + __s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) { + ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } + return; + } +} + +/* + * cbrt(x) MSB values for x MSB values in [0..63]. + * Precomputed then refined by hand - Willy Tarreau + * + * For x in [0..63], + * v = cbrt(x << 18) - 1 + * cbrt(x) = (v[x] + 10) >> 6 + */ +static const __u8 v[] = { + /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, + /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, + /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, + /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, + /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, + /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, + /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, + /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, +}; + +/* calculate the cubic root of x using a table lookup followed by one + * Newton-Raphson iteration. + * Avg err ~= 0.195% + */ +static __always_inline __u32 cubic_root(__u64 a) +{ + __u32 x, b, shift; + + if (a < 64) { + /* a in [0..63] */ + return ((__u32)v[(__u32)a] + 35) >> 6; + } + + b = fls64(a); + b = ((b * 84) >> 8) - 1; + shift = (a >> (b * 3)); + + /* it is needed for verifier's bound check on v */ + if (shift >= 64) + return 0; + + x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6; + + /* + * Newton-Raphson iteration + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1))); + x = ((x * 341) >> 10); + return x; +} + +/* + * Compute congestion window to use. + */ +static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, + __u32 acked) +{ + __u32 delta, bic_target, max_cnt; + __u64 offs, t; + + ca->ack_cnt += acked; /* count the number of ACKed packets */ + + if (ca->last_cwnd == cwnd && + (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) + return; + + /* The CUBIC function can update ca->cnt at most once per jiffy. + * On all cwnd reduction events, ca->epoch_start is set to 0, + * which will force a recalculation of ca->cnt. + */ + if (ca->epoch_start && tcp_jiffies32 == ca->last_time) + goto tcp_friendliness; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_jiffies32; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_jiffies32; /* record beginning */ + ca->ack_cnt = acked; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY; + t += ca->delay_min; + /* change the unit from usec to bictcp_HZ */ + t <<= BICTCP_HZ; + t /= USEC_PER_SEC; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. + */ + if (ca->last_max_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + +tcp_friendliness: + /* TCP Friendly */ + if (tcp_friendliness) { + __u32 scale = beta_scale; + __u32 n; + + /* update tcp cwnd */ + delta = (cwnd * scale) >> 3; + if (ca->ack_cnt > delta && delta) { + n = ca->ack_cnt / delta; + ca->ack_cnt -= n * delta; + ca->tcp_cwnd += n; + } + + if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + /* The maximum rate of cwnd increase CUBIC allows is 1 packet per + * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. + */ + ca->cnt = max(ca->cnt, 2U); +} + +/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ +void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk)) + return; + + if (tcp_in_slow_start(tp)) { + if (hystart && after(ack, ca->end_seq)) + bictcp_hystart_reset(sk); + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + bictcp_update(ca, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, ca->cnt, acked); +} + +__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) +{ + if (new_state == TCP_CA_Loss) { + bictcp_reset(inet_csk_ca(sk)); + bictcp_hystart_reset(sk); + } +} + +#define GSO_MAX_SIZE 65536 + +/* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, since during + * slow start we begin with small TSO packets and ca->delay_min would + * not account for long aggregation delay when TSO packets get bigger. + * Ideally even with a very small RTT we would like to have at least one + * TSO packet being sent and received by GRO, and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at this point. + * We cap the cushion to 1ms. + */ +static __always_inline __u32 hystart_ack_delay(struct sock *sk) +{ + unsigned long rate; + + rate = sk->sk_pacing_rate; + if (!rate) + return 0; + return min((__u64)USEC_PER_MSEC, + div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); +} + +static __always_inline void hystart_update(struct sock *sk, __u32 delay) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + __u32 threshold; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + __u32 now = bictcp_clock_us(sk); + + /* first detection parameter - ack-train detection */ + if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) { + ca->last_ack = now; + + threshold = ca->delay_min + hystart_ack_delay(sk); + + /* Hystart ack train triggers if we get ack past + * ca->delay_min/2. + * Pacing might have delayed packets up to RTT/2 + * during slow start. + */ + if (sk->sk_pacing_status == SK_PACING_NONE) + threshold >>= 1; + + if ((__s32)(now - ca->round_start) > threshold) { + ca->found = 1; + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + /* obtain the minimum delay of more than sampling packets */ + if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; + + ca->sample_cnt++; + } else { + if (ca->curr_rtt > ca->delay_min + + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { + ca->found = 1; + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, + const struct ack_sample *sample) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + __u32 delay; + + /* Some calls are for duplicates without timetamps */ + if (sample->rtt_us < 0) + return; + + /* Discard delay samples right after fast recovery */ + if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ) + return; + + delay = sample->rtt_us; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; + + /* hystart triggers when cwnd is larger than some threshold */ + if (!ca->found && tcp_in_slow_start(tp) && hystart && + tp->snd_cwnd >= hystart_low_window) + hystart_update(sk, delay); +} + +__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd, tp->prior_cwnd); +} + +SEC(".struct_ops") +struct tcp_congestion_ops cubic = { + .init = (void *)bictcp_init, + .ssthresh = (void *)bictcp_recalc_ssthresh, + .cong_avoid = (void *)bictcp_cong_avoid, + .set_state = (void *)bictcp_state, + .undo_cwnd = (void *)tcp_reno_undo_cwnd, + .cwnd_event = (void *)bictcp_cwnd_event, + .pkts_acked = (void *)bictcp_acked, + .name = "bpf_cubic", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c new file mode 100644 index 000000000000..b631fb5032d2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +/* WARNING: This implemenation is not necessarily the same + * as the tcp_dctcp.c. The purpose is mainly for testing + * the kernel BPF logic. + */ + +#include <linux/bpf.h> +#include <linux/types.h> +#include <bpf/bpf_helpers.h> +#include "bpf_trace_helpers.h" +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + __u32 old_delivered; + __u32 old_delivered_ce; + __u32 prior_rcv_nxt; + __u32 dctcp_alpha; + __u32 next_seq; + __u32 ce_state; + __u32 loss_cwnd; +}; + +static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ +static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; + +static __always_inline void dctcp_reset(const struct tcp_sock *tp, + struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->old_delivered = tp->delivered; + ca->old_delivered_ce = tp->delivered_ce; +} + +SEC("struct_ops/dctcp_init") +void BPF_PROG(dctcp_init, struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + ca->loss_cwnd = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); +} + +SEC("struct_ops/dctcp_ssthresh") +__u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->loss_cwnd = tp->snd_cwnd; + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +SEC("struct_ops/dctcp_update_alpha") +void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + __u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; + __u32 alpha = ca->dctcp_alpha; + + /* alpha = (1 - g) * alpha + g * F */ + + alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); + if (delivered_ce) { + __u32 delivered = tp->delivered - ca->old_delivered; + + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 M packets. + */ + delivered_ce <<= (10 - dctcp_shift_g); + delivered_ce /= max(1U, delivered); + + alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); + } + ca->dctcp_alpha = alpha; + dctcp_reset(tp, ca); + } +} + +static __always_inline void dctcp_react_to_loss(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->loss_cwnd = tp->snd_cwnd; + tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); +} + +SEC("struct_ops/dctcp_state") +void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) +{ + if (new_state == TCP_CA_Recovery && + new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state)) + dctcp_react_to_loss(sk); + /* We handle RTO in dctcp_cwnd_event to ensure that we perform only + * one loss-adjustment per RTT. + */ +} + +static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (ce_state == 1) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + else + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ +static __always_inline +void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + __u32 *prior_rcv_nxt, __u32 *ce_state) +{ + __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; + + if (*ce_state != new_ce_state) { + /* CE state has changed, force an immediate ACK to + * reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + dctcp_ece_ack_cwr(sk, *ce_state); + bpf_tcp_send_ack(sk, *prior_rcv_nxt); + } + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } + *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; + *ce_state = new_ce_state; + dctcp_ece_ack_cwr(sk, new_ce_state); +} + +SEC("struct_ops/dctcp_cwnd_event") +void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_ECN_IS_CE: + case CA_EVENT_ECN_NO_CE: + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); + break; + case CA_EVENT_LOSS: + dctcp_react_to_loss(sk); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +SEC("struct_ops/dctcp_cwnd_undo") +__u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + +SEC("struct_ops/tcp_reno_cong_avoid") +void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_is_cwnd_limited(sk)) + return; + + /* In "safe" area, increase. */ + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + /* In dangerous area, increase slowly. */ + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); +} + +SEC(".struct_ops") +struct tcp_congestion_ops dctcp_nouse = { + .init = (void *)dctcp_init, + .set_state = (void *)dctcp_state, + .flags = TCP_CONG_NEEDS_ECN, + .name = "bpf_dctcp_nouse", +}; + +SEC(".struct_ops") +struct tcp_congestion_ops dctcp = { + .init = (void *)dctcp_init, + .in_ack_event = (void *)dctcp_update_alpha, + .cwnd_event = (void *)dctcp_cwnd_event, + .ssthresh = (void *)dctcp_ssthresh, + .cong_avoid = (void *)tcp_reno_cong_avoid, + .undo_cwnd = (void *)dctcp_cwnd_undo, + .set_state = (void *)dctcp_state, + .flags = TCP_CONG_NEEDS_ECN, + .name = "bpf_dctcp", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 040a44206f29..9941f0ba471e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -16,8 +16,8 @@ #include <sys/socket.h> #include <linux/if_tunnel.h> #include <linux/mpls.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; #define PROG(F) SEC(#F) int bpf_func_##F diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 1fd244d35ba9..75085119c5bb 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -9,8 +9,8 @@ #include <linux/in6.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c index 26397ab7b3c7..506d0f81a375 100644 --- a/tools/testing/selftests/bpf/progs/connect6_prog.c +++ b/tools/testing/selftests/bpf/progs/connect6_prog.c @@ -9,8 +9,8 @@ #include <linux/in6.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define SRC_REWRITE_IP6_0 0 #define SRC_REWRITE_IP6_1 0 diff --git a/tools/testing/selftests/bpf/progs/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c index ce41a3475f27..8924e06bdef0 100644 --- a/tools/testing/selftests/bpf/progs/dev_cgroup.c +++ b/tools/testing/selftests/bpf/progs/dev_cgroup.c @@ -7,7 +7,7 @@ #include <linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("cgroup/dev") int bpf_prog1(struct bpf_cgroup_dev_ctx *ctx) diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 615f7c6bca77..38d3a82144ca 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -1,43 +1,46 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; __u64 test1_result = 0; -BPF_TRACE_1("fentry/bpf_fentry_test1", test1, int, a) +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) { test1_result = a == 1; return 0; } __u64 test2_result = 0; -BPF_TRACE_2("fentry/bpf_fentry_test2", test2, int, a, __u64, b) +SEC("fentry/bpf_fentry_test2") +int BPF_PROG(test2, int a, __u64 b) { test2_result = a == 2 && b == 3; return 0; } __u64 test3_result = 0; -BPF_TRACE_3("fentry/bpf_fentry_test3", test3, char, a, int, b, __u64, c) +SEC("fentry/bpf_fentry_test3") +int BPF_PROG(test3, char a, int b, __u64 c) { test3_result = a == 4 && b == 5 && c == 6; return 0; } __u64 test4_result = 0; -BPF_TRACE_4("fentry/bpf_fentry_test4", test4, - void *, a, char, b, int, c, __u64, d) +SEC("fentry/bpf_fentry_test4") +int BPF_PROG(test4, void *a, char b, int c, __u64 d) { test4_result = a == (void *)7 && b == 8 && c == 9 && d == 10; return 0; } __u64 test5_result = 0; -BPF_TRACE_5("fentry/bpf_fentry_test5", test5, - __u64, a, void *, b, short, c, int, d, __u64, e) +SEC("fentry/bpf_fentry_test5") +int BPF_PROG(test5, __u64 a, void *b, short c, int d, __u64 e) { test5_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && e == 15; @@ -45,8 +48,8 @@ BPF_TRACE_5("fentry/bpf_fentry_test5", test5, } __u64 test6_result = 0; -BPF_TRACE_6("fentry/bpf_fentry_test6", test6, - __u64, a, void *, b, short, c, int, d, void *, e, __u64, f) +SEC("fentry/bpf_fentry_test6") +int BPF_PROG(test6, __u64 a, void *b, short c, int d, void * e, __u64 f) { test6_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && e == (void *)20 && f == 21; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index 2d211ee98a1c..c329fccf9842 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#include <linux/stddef.h> +#include <linux/ipv6.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "bpf_trace_helpers.h" struct sk_buff { @@ -9,8 +12,8 @@ struct sk_buff { }; __u64 test_result = 0; -BPF_TRACE_2("fexit/test_pkt_access", test_main, - struct sk_buff *, skb, int, ret) +SEC("fexit/test_pkt_access") +int BPF_PROG(test_main, struct sk_buff *skb, int ret) { int len; @@ -24,8 +27,8 @@ BPF_TRACE_2("fexit/test_pkt_access", test_main, } __u64 test_result_subprog1 = 0; -BPF_TRACE_2("fexit/test_pkt_access_subprog1", test_subprog1, - struct sk_buff *, skb, int, ret) +SEC("fexit/test_pkt_access_subprog1") +int BPF_PROG(test_subprog1, struct sk_buff *skb, int ret) { int len; @@ -79,4 +82,73 @@ int test_subprog2(struct args_subprog2 *ctx) test_result_subprog2 = 1; return 0; } + +__u64 test_result_subprog3 = 0; +SEC("fexit/test_pkt_access_subprog3") +int BPF_PROG(test_subprog3, int val, struct sk_buff *skb, int ret) +{ + int len; + + __builtin_preserve_access_index(({ + len = skb->len; + })); + if (len != 74 || ret != 74 * val || val != 3) + return 0; + test_result_subprog3 = 1; + return 0; +} + +__u64 test_get_skb_len = 0; +SEC("freplace/get_skb_len") +int new_get_skb_len(struct __sk_buff *skb) +{ + int len = skb->len; + + if (len != 74) + return 0; + test_get_skb_len = 1; + return 74; /* original get_skb_len() returns skb->len */ +} + +__u64 test_get_skb_ifindex = 0; +SEC("freplace/get_skb_ifindex") +int new_get_skb_ifindex(int val, struct __sk_buff *skb, int var) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ipv6hdr ip6, *ip6p; + int ifindex = skb->ifindex; + __u32 eth_proto; + __u32 nh_off; + + /* check that BPF extension can read packet via direct packet access */ + if (data + 14 + sizeof(ip6) > data_end) + return 0; + ip6p = data + 14; + + if (ip6p->nexthdr != 6 || ip6p->payload_len != __bpf_constant_htons(123)) + return 0; + + /* check that legacy packet access helper works too */ + if (bpf_skb_load_bytes(skb, 14, &ip6, sizeof(ip6)) < 0) + return 0; + ip6p = &ip6; + if (ip6p->nexthdr != 6 || ip6p->payload_len != __bpf_constant_htons(123)) + return 0; + + if (ifindex != 1 || val != 3 || var != 1) + return 0; + test_get_skb_ifindex = 1; + return 3; /* original get_skb_ifindex() returns val * ifindex * var */ +} + +volatile __u64 test_get_constant = 0; +SEC("freplace/get_constant") +int new_get_constant(long val) +{ + if (val != 123) + return 0; + test_get_constant = 1; + return test_get_constant; /* original get_constant() returns val - 122 */ +} char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c index ebc0ab7f0f5c..92f3fa47cf40 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_trace_helpers.h" struct sk_buff { @@ -9,8 +9,9 @@ struct sk_buff { }; __u64 test_result = 0; -BPF_TRACE_2("fexit/test_pkt_md_access", test_main2, - struct sk_buff *, skb, int, ret) + +SEC("fexit/test_pkt_md_access") +int BPF_PROG(test_main2, struct sk_buff *skb, int ret) { int len; diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 86db0d60fb6e..348109b9ea07 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -1,45 +1,47 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; __u64 test1_result = 0; -BPF_TRACE_2("fexit/bpf_fentry_test1", test1, int, a, int, ret) +SEC("fexit/bpf_fentry_test1") +int BPF_PROG(test1, int a, int ret) { test1_result = a == 1 && ret == 2; return 0; } __u64 test2_result = 0; -BPF_TRACE_3("fexit/bpf_fentry_test2", test2, int, a, __u64, b, int, ret) +SEC("fexit/bpf_fentry_test2") +int BPF_PROG(test2, int a, __u64 b, int ret) { test2_result = a == 2 && b == 3 && ret == 5; return 0; } __u64 test3_result = 0; -BPF_TRACE_4("fexit/bpf_fentry_test3", test3, char, a, int, b, __u64, c, int, ret) +SEC("fexit/bpf_fentry_test3") +int BPF_PROG(test3, char a, int b, __u64 c, int ret) { test3_result = a == 4 && b == 5 && c == 6 && ret == 15; return 0; } __u64 test4_result = 0; -BPF_TRACE_5("fexit/bpf_fentry_test4", test4, - void *, a, char, b, int, c, __u64, d, int, ret) +SEC("fexit/bpf_fentry_test4") +int BPF_PROG(test4, void *a, char b, int c, __u64 d, int ret) { - test4_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34; return 0; } __u64 test5_result = 0; -BPF_TRACE_6("fexit/bpf_fentry_test5", test5, - __u64, a, void *, b, short, c, int, d, __u64, e, int, ret) +SEC("fexit/bpf_fentry_test5") +int BPF_PROG(test5, __u64 a, void *b, short c, int d, __u64 e, int ret) { test5_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && e == 15 && ret == 65; @@ -47,9 +49,8 @@ BPF_TRACE_6("fexit/bpf_fentry_test5", test5, } __u64 test6_result = 0; -BPF_TRACE_7("fexit/bpf_fentry_test6", test6, - __u64, a, void *, b, short, c, int, d, void *, e, __u64, f, - int, ret) +SEC("fexit/bpf_fentry_test6") +int BPF_PROG(test6, __u64 a, void *b, short c, int d, void *e, __u64 f, int ret) { test6_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && e == (void *)20 && f == 21 && ret == 111; diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c index 16c54ade6888..6b42db2fe391 100644 --- a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c @@ -2,7 +2,7 @@ // Copyright (c) 2018 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c index 974d6f3bb319..8f48a909f079 100644 --- a/tools/testing/selftests/bpf/progs/kfree_skb.c +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c @@ -2,8 +2,8 @@ // Copyright (c) 2019 Facebook #include <linux/bpf.h> #include <stdbool.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "bpf_trace_helpers.h" char _license[] SEC("license") = "GPL"; @@ -57,8 +57,8 @@ struct meta { /* TRACE_EVENT(kfree_skb, * TP_PROTO(struct sk_buff *skb, void *location), */ -BPF_TRACE_2("tp_btf/kfree_skb", trace_kfree_skb, - struct sk_buff *, skb, void *, location) +SEC("tp_btf/kfree_skb") +int BPF_PROG(trace_kfree_skb, struct sk_buff *skb, void *location) { struct net_device *dev; struct callback_head *ptr; @@ -114,9 +114,9 @@ static volatile struct { bool fexit_test_ok; } result; -BPF_TRACE_3("fentry/eth_type_trans", fentry_eth_type_trans, - struct sk_buff *, skb, struct net_device *, dev, - unsigned short, protocol) +SEC("fentry/eth_type_trans") +int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb, struct net_device *dev, + unsigned short protocol) { int len, ifindex; @@ -132,9 +132,9 @@ BPF_TRACE_3("fentry/eth_type_trans", fentry_eth_type_trans, return 0; } -BPF_TRACE_3("fexit/eth_type_trans", fexit_eth_type_trans, - struct sk_buff *, skb, struct net_device *, dev, - unsigned short, protocol) +SEC("fexit/eth_type_trans") +int BPF_PROG(fexit_eth_type_trans, struct sk_buff *skb, struct net_device *dev, + unsigned short protocol) { int len, ifindex; diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index 40ac722a9da5..50e66772c046 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -6,8 +6,8 @@ #include <stddef.h> #include <stdbool.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index bb80f29aa7f7..947bb7e988c2 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -6,8 +6,8 @@ #include <stddef.h> #include <stdbool.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 2b9165a7afe1..76e93b31c14b 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -6,8 +6,8 @@ #include <stddef.h> #include <stdbool.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/loop4.c b/tools/testing/selftests/bpf/progs/loop4.c index 650859022771..b35337926d66 100644 --- a/tools/testing/selftests/bpf/progs/loop4.c +++ b/tools/testing/selftests/bpf/progs/loop4.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/loop5.c b/tools/testing/selftests/bpf/progs/loop5.c index 28d1d668f07c..913791923fa3 100644 --- a/tools/testing/selftests/bpf/progs/loop5.c +++ b/tools/testing/selftests/bpf/progs/loop5.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define barrier() __asm__ __volatile__("": : :"memory") char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c index 38a997852cad..d071adf178bd 100644 --- a/tools/testing/selftests/bpf/progs/netcnt_prog.c +++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c @@ -2,7 +2,7 @@ #include <linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "netcnt_common.h" #define MAX_BPS (3 * 1024 * 1024) diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index 71d383cc9b85..cc615b82b56e 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -6,7 +6,7 @@ #include <stddef.h> #include <stdbool.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define FUNCTION_NAME_LEN 64 #define FILE_NAME_LEN 128 @@ -154,7 +154,12 @@ struct { __uint(value_size, sizeof(long long) * 127); } stackmap SEC(".maps"); -static __always_inline int __on_event(struct pt_regs *ctx) +#ifdef GLOBAL_FUNC +__attribute__((noinline)) +#else +static __always_inline +#endif +int __on_event(struct bpf_raw_tracepoint_args *ctx) { uint64_t pid_tgid = bpf_get_current_pid_tgid(); pid_t pid = (pid_t)(pid_tgid >> 32); @@ -254,7 +259,7 @@ static __always_inline int __on_event(struct pt_regs *ctx) } SEC("raw_tracepoint/kfree_skb") -int on_event(struct pt_regs* ctx) +int on_event(struct bpf_raw_tracepoint_args* ctx) { int i, ret = 0; ret |= __on_event(ctx); diff --git a/tools/testing/selftests/bpf/progs/pyperf_global.c b/tools/testing/selftests/bpf/progs/pyperf_global.c new file mode 100644 index 000000000000..079e78a7562b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf_global.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define STACK_MAX_LEN 50 +#define GLOBAL_FUNC +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/bpf/progs/sample_map_ret0.c index 0756303676ac..1612a32007b6 100644 --- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c +++ b/tools/testing/selftests/bpf/progs/sample_map_ret0.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct bpf_map_def SEC("maps") htab = { .type = BPF_MAP_TYPE_HASH, diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index a91536b1c47e..092d9da536f3 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -5,8 +5,8 @@ #include <linux/bpf.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define SRC1_IP4 0xAC100001U /* 172.16.0.1 */ #define SRC2_IP4 0x00000000U diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index a68062820410..255a432bc163 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -5,8 +5,8 @@ #include <linux/bpf.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define SRC_REWRITE_IP6_0 0 #define SRC_REWRITE_IP6_1 0 diff --git a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c index e4440fdd94cb..0cb5656a22b0 100644 --- a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c +++ b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c @@ -4,8 +4,8 @@ #include <linux/bpf.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> struct socket_cookie { __u64 cookie_key; diff --git a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c index 9390e0244259..a5c6d5903b22 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c @@ -1,6 +1,6 @@ #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c index e80484d98a1a..fdb4bf4408fa 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c @@ -1,7 +1,7 @@ #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c index 433e23918a62..4797dc985064 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c @@ -1,6 +1,6 @@ #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c index dede0fcd6102..c6d428a8d785 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; __u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/sockopt_multi.c b/tools/testing/selftests/bpf/progs/sockopt_multi.c index 4afd2595c08e..9d8c212dde9f 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_multi.c +++ b/tools/testing/selftests/bpf/progs/sockopt_multi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <netinet/in.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; __u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 1bafbb944e37..d5a5eeb5fb52 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -3,7 +3,7 @@ #include <netinet/in.h> #include <netinet/tcp.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; __u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 4bf16e0a1b0e..ad61b722a9de 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -8,7 +8,7 @@ #include <linux/ptrace.h> #include <linux/sched.h> #include <linux/types.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> typedef uint32_t pid_t; struct task_struct {}; diff --git a/tools/testing/selftests/bpf/progs/tailcall1.c b/tools/testing/selftests/bpf/progs/tailcall1.c index 63531e1a9fa4..1f407e65ae52 100644 --- a/tools/testing/selftests/bpf/progs/tailcall1.c +++ b/tools/testing/selftests/bpf/progs/tailcall1.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/tailcall2.c b/tools/testing/selftests/bpf/progs/tailcall2.c index 21c85c477210..a093e739cf0e 100644 --- a/tools/testing/selftests/bpf/progs/tailcall2.c +++ b/tools/testing/selftests/bpf/progs/tailcall2.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/tailcall3.c b/tools/testing/selftests/bpf/progs/tailcall3.c index 1ecae198b8c1..cabda877cf0a 100644 --- a/tools/testing/selftests/bpf/progs/tailcall3.c +++ b/tools/testing/selftests/bpf/progs/tailcall3.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/tailcall4.c b/tools/testing/selftests/bpf/progs/tailcall4.c index 499388758119..f82075b47d7d 100644 --- a/tools/testing/selftests/bpf/progs/tailcall4.c +++ b/tools/testing/selftests/bpf/progs/tailcall4.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/tailcall5.c b/tools/testing/selftests/bpf/progs/tailcall5.c index 49c64eb53f19..ce5450744fd4 100644 --- a/tools/testing/selftests/bpf/progs/tailcall5.c +++ b/tools/testing/selftests/bpf/progs/tailcall5.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 2cf813a06b83..0cb3204ddb18 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; __u32 _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_adjust_tail.c index 4cd5e860c903..b7fc85769bdc 100644 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ b/tools/testing/selftests/bpf/progs/test_adjust_tail.c @@ -7,7 +7,7 @@ */ #include <linux/bpf.h> #include <linux/if_ether.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 221b69700625..dd8fae6660ab 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -3,7 +3,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int kprobe_res = 0; int kretprobe_res = 0; diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c index 62ad7e22105e..88b0566da13d 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_haskv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_haskv.c @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2018 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c index fb8d91a1dbe0..a924e53c8e9d 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2018 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "bpf_legacy.h" int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_btf_nokv.c b/tools/testing/selftests/bpf/progs/test_btf_nokv.c index 3f4422044759..983aedd1c072 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_nokv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_nokv.c @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2018 Facebook */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c index 9bfc91d9d004..3ac3603ad53d 100644 --- a/tools/testing/selftests/bpf/progs/test_core_extern.c +++ b/tools/testing/selftests/bpf/progs/test_core_extern.c @@ -5,7 +5,7 @@ #include <stdbool.h> #include <linux/ptrace.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* non-existing BPF helper, to test dead code elimination */ static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c index 053b86f6b53f..51b3f79df523 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c index edc0f7c9e56d..56aec20212b5 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c index 6c20e433558b..ab1e647aeb31 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c index 1b7f0ae49cfb..7e45e2bdf6cd 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c index b5dbeef540fd..525acc2f841b 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c index c78ab6d28a14..6b5290739806 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index 270de441b60a..aba928fd60d3 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c index 292a5c4ee76a..d5756dbdef82 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c index 0b28bfacc8fd..8b533db4a7a5 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c index 39279bf0c9db..2b4b6d49c677 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c index ea57973cdd19..2a8975678aa6 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c index d1eb59d4ea64..ca61a5183b88 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index 9e091124d3bd..d7fb6cfc7891 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -3,8 +3,8 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" -#include "bpf_core_read.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c index 6a4a8f57f174..29817a703984 100644 --- a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c +++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* Permit pretty deep stack traces */ #define MAX_STACK_RAWTP 100 diff --git a/tools/testing/selftests/bpf/progs/test_global_data.c b/tools/testing/selftests/bpf/progs/test_global_data.c index 32a6073acb99..dd7a4d3dbc0d 100644 --- a/tools/testing/selftests/bpf/progs/test_global_data.c +++ b/tools/testing/selftests/bpf/progs/test_global_data.c @@ -5,7 +5,7 @@ #include <linux/pkt_cls.h> #include <string.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/test_global_func1.c b/tools/testing/selftests/bpf/progs/test_global_func1.c new file mode 100644 index 000000000000..880260f6d536 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func1.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#ifndef MAX_STACK +#define MAX_STACK (512 - 3 * 32 + 8) +#endif + +static __attribute__ ((noinline)) +int f0(int var, struct __sk_buff *skb) +{ + return skb->len; +} + +__attribute__ ((noinline)) +int f1(struct __sk_buff *skb) +{ + volatile char buf[MAX_STACK] = {}; + + return f0(0, skb) + skb->len; +} + +int f3(int, struct __sk_buff *skb, int); + +__attribute__ ((noinline)) +int f2(int val, struct __sk_buff *skb) +{ + return f1(skb) + f3(val, skb, 1); +} + +__attribute__ ((noinline)) +int f3(int val, struct __sk_buff *skb, int var) +{ + volatile char buf[MAX_STACK] = {}; + + return skb->ifindex * val * var; +} + +SEC("classifier/test") +int test_cls(struct __sk_buff *skb) +{ + return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func2.c b/tools/testing/selftests/bpf/progs/test_global_func2.c new file mode 100644 index 000000000000..2c18d82923a2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func2.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#define MAX_STACK (512 - 3 * 32) +#include "test_global_func1.c" diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c new file mode 100644 index 000000000000..86f0ecb304fc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func3.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__attribute__ ((noinline)) +int f1(struct __sk_buff *skb) +{ + return skb->len; +} + +__attribute__ ((noinline)) +int f2(int val, struct __sk_buff *skb) +{ + return f1(skb) + val; +} + +__attribute__ ((noinline)) +int f3(int val, struct __sk_buff *skb, int var) +{ + return f2(var, skb) + val; +} + +__attribute__ ((noinline)) +int f4(struct __sk_buff *skb) +{ + return f3(1, skb, 2); +} + +__attribute__ ((noinline)) +int f5(struct __sk_buff *skb) +{ + return f4(skb); +} + +__attribute__ ((noinline)) +int f6(struct __sk_buff *skb) +{ + return f5(skb); +} + +__attribute__ ((noinline)) +int f7(struct __sk_buff *skb) +{ + return f6(skb); +} + +#ifndef NO_FN8 +__attribute__ ((noinline)) +int f8(struct __sk_buff *skb) +{ + return f7(skb); +} +#endif + +SEC("classifier/test") +int test_cls(struct __sk_buff *skb) +{ +#ifndef NO_FN8 + return f8(skb); +#else + return f7(skb); +#endif +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func4.c b/tools/testing/selftests/bpf/progs/test_global_func4.c new file mode 100644 index 000000000000..610f75edf276 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func4.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#define NO_FN8 +#include "test_global_func3.c" diff --git a/tools/testing/selftests/bpf/progs/test_global_func5.c b/tools/testing/selftests/bpf/progs/test_global_func5.c new file mode 100644 index 000000000000..260c25b827ef --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func5.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__attribute__ ((noinline)) +int f1(struct __sk_buff *skb) +{ + return skb->len; +} + +int f3(int, struct __sk_buff *skb); + +__attribute__ ((noinline)) +int f2(int val, struct __sk_buff *skb) +{ + return f1(skb) + f3(val, (void *)&val); /* type mismatch */ +} + +__attribute__ ((noinline)) +int f3(int val, struct __sk_buff *skb) +{ + return skb->ifindex * val; +} + +SEC("classifier/test") +int test_cls(struct __sk_buff *skb) +{ + return f1(skb) + f2(2, skb) + f3(3, skb); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func6.c b/tools/testing/selftests/bpf/progs/test_global_func6.c new file mode 100644 index 000000000000..69e19c64e10b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func6.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__attribute__ ((noinline)) +int f1(struct __sk_buff *skb) +{ + return skb->len; +} + +int f3(int, struct __sk_buff *skb); + +__attribute__ ((noinline)) +int f2(int val, struct __sk_buff *skb) +{ + return f1(skb) + f3(val, skb + 1); /* type mismatch */ +} + +__attribute__ ((noinline)) +int f3(int val, struct __sk_buff *skb) +{ + return skb->ifindex * val; +} + +SEC("classifier/test") +int test_cls(struct __sk_buff *skb) +{ + return f1(skb) + f2(2, skb) + f3(3, skb); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func7.c b/tools/testing/selftests/bpf/progs/test_global_func7.c new file mode 100644 index 000000000000..309b3f6136bd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func7.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__attribute__ ((noinline)) +void foo(struct __sk_buff *skb) +{ + skb->tc_index = 0; +} + +SEC("classifier/test") +int test_cls(struct __sk_buff *skb) +{ + foo(skb); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_l4lb.c b/tools/testing/selftests/bpf/progs/test_l4lb.c index 1d652ee8e73d..33493911d87a 100644 --- a/tools/testing/selftests/bpf/progs/test_l4lb.c +++ b/tools/testing/selftests/bpf/progs/test_l4lb.c @@ -17,9 +17,9 @@ #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "test_iptunnel_common.h" -#include "bpf_endian.h" +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c index 2e4efe70b1e5..28351936a438 100644 --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c @@ -13,9 +13,9 @@ #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "test_iptunnel_common.h" -#include "bpf_endian.h" +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c index 4147130cc3b7..7a6620671a83 100644 --- a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c +++ b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c @@ -5,7 +5,7 @@ #include <linux/bpf.h> #include <linux/lirc.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("lirc_mode2") int bpf_decoder(unsigned int *sample) diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c index c957d6dfe6d7..d6cb986e7533 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c @@ -4,8 +4,8 @@ #include <linux/bpf.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> struct grehdr { __be16 flags; diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c index 41a3ebcd593d..48ff2b2ad5e7 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c @@ -3,8 +3,8 @@ #include <errno.h> #include <linux/seg6_local.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> /* Packet parsing state machine helpers. */ #define cursor_advance(_cursor, _len) \ diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map.c b/tools/testing/selftests/bpf/progs/test_map_in_map.c index 113226115365..1cfeb940cf9f 100644 --- a/tools/testing/selftests/bpf/progs/test_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/test_map_in_map.c @@ -3,7 +3,7 @@ #include <stddef.h> #include <linux/bpf.h> #include <linux/types.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); diff --git a/tools/testing/selftests/bpf/progs/test_map_lock.c b/tools/testing/selftests/bpf/progs/test_map_lock.c index bb7ce35f691b..b5c07ae7b68f 100644 --- a/tools/testing/selftests/bpf/progs/test_map_lock.c +++ b/tools/testing/selftests/bpf/progs/test_map_lock.c @@ -2,7 +2,7 @@ // Copyright (c) 2019 Facebook #include <linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define VAR_NUM 16 diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index e808791b7047..6239596cd14e 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -3,7 +3,7 @@ #include <linux/bpf.h> #include <stdint.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c index 3d30c02bdae9..98b9de2fafd0 100644 --- a/tools/testing/selftests/bpf/progs/test_obj_id.c +++ b/tools/testing/selftests/bpf/progs/test_obj_id.c @@ -4,7 +4,7 @@ #include <stddef.h> #include <linux/bpf.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* It is a dumb bpf program such that it must have no * issue to be loaded since testing the verifier is diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index 96c0124a04ba..bfe9fbcb9684 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -1,39 +1,45 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#include <stdbool.h> +#include <stddef.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <linux/ptrace.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include "bpf_trace_helpers.h" +struct task_struct; + SEC("kprobe/__set_task_comm") -int prog1(struct pt_regs *ctx) +int BPF_KPROBE(prog1, struct task_struct *tsk, const char *buf, bool exec) { - return 0; + return !tsk; } SEC("kretprobe/__set_task_comm") -int prog2(struct pt_regs *ctx) +int BPF_KRETPROBE(prog2, + struct task_struct *tsk, const char *buf, bool exec, + int ret) { - return 0; + return !PT_REGS_PARM1(ctx) && ret; } SEC("raw_tp/task_rename") int prog3(struct bpf_raw_tracepoint_args *ctx) { - return 0; + return !ctx->args[0]; } -struct task_struct; -BPF_TRACE_3("fentry/__set_task_comm", prog4, - struct task_struct *, tsk, const char *, buf, __u8, exec) +SEC("fentry/__set_task_comm") +int BPF_PROG(prog4, struct task_struct *tsk, const char *buf, bool exec) { - return 0; + return !tsk; } -BPF_TRACE_3("fexit/__set_task_comm", prog5, - struct task_struct *, tsk, const char *, buf, __u8, exec) +SEC("fexit/__set_task_comm") +int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec) { - return 0; + return !tsk; } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c index 07c09ca5546a..ebfcc9f50c35 100644 --- a/tools/testing/selftests/bpf/progs/test_perf_buffer.c +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c @@ -3,7 +3,8 @@ #include <linux/ptrace.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include "bpf_trace_helpers.h" struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); @@ -12,7 +13,7 @@ struct { } perf_buf_map SEC(".maps"); SEC("kprobe/sys_nanosleep") -int handle_sys_nanosleep_entry(struct pt_regs *ctx) +int BPF_KPROBE(handle_sys_nanosleep_entry) { int cpu = bpf_get_smp_processor_id(); diff --git a/tools/testing/selftests/bpf/progs/test_pinning.c b/tools/testing/selftests/bpf/progs/test_pinning.c index f20e7e00373f..4ef2630292b2 100644 --- a/tools/testing/selftests/bpf/progs/test_pinning.c +++ b/tools/testing/selftests/bpf/progs/test_pinning.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_pinning_invalid.c b/tools/testing/selftests/bpf/progs/test_pinning_invalid.c index 51b38abe7ba1..5412e0c732c7 100644 --- a/tools/testing/selftests/bpf/progs/test_pinning_invalid.c +++ b/tools/testing/selftests/bpf/progs/test_pinning_invalid.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_pkt_access.c b/tools/testing/selftests/bpf/progs/test_pkt_access.c index 3a7b4b607ed3..e72eba4a93d2 100644 --- a/tools/testing/selftests/bpf/progs/test_pkt_access.c +++ b/tools/testing/selftests/bpf/progs/test_pkt_access.c @@ -11,8 +11,8 @@ #include <linux/in.h> #include <linux/tcp.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define barrier() __asm__ __volatile__("": : :"memory") int _version SEC("version") = 1; @@ -47,6 +47,38 @@ int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb) return skb->len * val; } +#define MAX_STACK (512 - 2 * 32) + +__attribute__ ((noinline)) +int get_skb_len(struct __sk_buff *skb) +{ + volatile char buf[MAX_STACK] = {}; + + return skb->len; +} + +__attribute__ ((noinline)) +int get_constant(long val) +{ + return val - 122; +} + +int get_skb_ifindex(int, struct __sk_buff *skb, int); + +__attribute__ ((noinline)) +int test_pkt_access_subprog3(int val, struct __sk_buff *skb) +{ + return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); +} + +__attribute__ ((noinline)) +int get_skb_ifindex(int val, struct __sk_buff *skb, int var) +{ + volatile char buf[MAX_STACK] = {}; + + return skb->ifindex * val * var; +} + SEC("classifier/test_pkt_access") int test_pkt_access(struct __sk_buff *skb) { @@ -82,6 +114,8 @@ int test_pkt_access(struct __sk_buff *skb) return TC_ACT_SHOT; if (test_pkt_access_subprog2(2, skb) != skb->len * 2) return TC_ACT_SHOT; + if (test_pkt_access_subprog3(3, skb) != skb->len * 3 * skb->ifindex) + return TC_ACT_SHOT; if (tcp) { if (((void *)(tcp) + 20) > data_end || proto != 6) return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/test_pkt_md_access.c b/tools/testing/selftests/bpf/progs/test_pkt_md_access.c index 1db2623021ad..610c74ea9f64 100644 --- a/tools/testing/selftests/bpf/progs/test_pkt_md_access.c +++ b/tools/testing/selftests/bpf/progs/test_pkt_md_access.c @@ -5,7 +5,7 @@ #include <string.h> #include <linux/bpf.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_probe_user.c b/tools/testing/selftests/bpf/progs/test_probe_user.c index 1871e2ece0c4..d556b1572cc6 100644 --- a/tools/testing/selftests/bpf/progs/test_probe_user.c +++ b/tools/testing/selftests/bpf/progs/test_probe_user.c @@ -5,13 +5,14 @@ #include <netinet/in.h> -#include "bpf_helpers.h" -#include "bpf_tracing.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_trace_helpers.h" static struct sockaddr_in old; SEC("kprobe/__sys_connect") -int handle_sys_connect(struct pt_regs *ctx) +int BPF_KPROBE(handle_sys_connect) { void *ptr = (void *)PT_REGS_PARM2(ctx); struct sockaddr_in new; diff --git a/tools/testing/selftests/bpf/progs/test_queue_stack_map.h b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h index 0e014d3b2b36..4dd9806ad73b 100644 --- a/tools/testing/selftests/bpf/progs/test_queue_stack_map.h +++ b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h @@ -6,7 +6,7 @@ #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c index 52d94e8b214d..ecbeea2df259 100644 --- a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c +++ b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c @@ -3,7 +3,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> static volatile const struct { unsigned a[4]; diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c index 69880c1e7700..a7278f064368 100644 --- a/tools/testing/selftests/bpf/progs/test_seg6_loop.c +++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c @@ -3,8 +3,8 @@ #include <errno.h> #include <linux/seg6_local.h> #include <linux/bpf.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> /* Packet parsing state machine helpers. */ #define cursor_advance(_cursor, _len) \ diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c index b1f09f5bb1cf..d69a1f2bbbfd 100644 --- a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c +++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c @@ -11,8 +11,8 @@ #include <linux/types.h> #include <linux/if_ether.h> -#include "bpf_endian.h" -#include "bpf_helpers.h" +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> #include "test_select_reuseport_common.h" int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c index 0e6be01157e6..1acc91e87bfc 100644 --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c @@ -2,46 +2,39 @@ // Copyright (c) 2019 Facebook #include <linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u64); -} info_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u64); -} status_map SEC(".maps"); - -SEC("send_signal_demo") -int bpf_send_signal_test(void *ctx) +#include <bpf/bpf_helpers.h> + +__u32 sig = 0, pid = 0, status = 0, signal_thread = 0; + +static __always_inline int bpf_send_signal_test(void *ctx) { - __u64 *info_val, *status_val; - __u32 key = 0, pid, sig; int ret; - status_val = bpf_map_lookup_elem(&status_map, &key); - if (!status_val || *status_val != 0) - return 0; - - info_val = bpf_map_lookup_elem(&info_map, &key); - if (!info_val || *info_val == 0) + if (status != 0 || sig == 0 || pid == 0) return 0; - sig = *info_val >> 32; - pid = *info_val & 0xffffFFFF; - if ((bpf_get_current_pid_tgid() >> 32) == pid) { - ret = bpf_send_signal(sig); + if (signal_thread) + ret = bpf_send_signal_thread(sig); + else + ret = bpf_send_signal(sig); if (ret == 0) - *status_val = 1; + status = 1; } return 0; } + +SEC("tracepoint/syscalls/sys_enter_nanosleep") +int send_signal_tp(void *ctx) +{ + return bpf_send_signal_test(ctx); +} + +SEC("perf_event") +int send_signal_perf(void *ctx) +{ + return bpf_send_signal_test(ctx); +} + char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index cb49ccb707d1..d2b38fa6a5b0 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -12,8 +12,8 @@ #include <linux/pkt_cls.h> #include <linux/tcp.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c index 68cf9829f5a7..552f2090665c 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c @@ -6,7 +6,7 @@ #include <string.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define NUM_CGROUP_LEVELS 4 diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index e18da87fe84f..202de3938494 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index 4f69aac5635f..de03a90f78ca 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -3,7 +3,7 @@ #include <stdbool.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct s { int a; diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields_kern.c b/tools/testing/selftests/bpf/progs/test_sock_fields_kern.c index a47b003623ef..9bcaa37f476a 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields_kern.c @@ -5,8 +5,8 @@ #include <netinet/in.h> #include <stdbool.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> enum bpf_addr_array_idx { ADDR_SRV_IDX, diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock.c b/tools/testing/selftests/bpf/progs/test_spin_lock.c index a43b999c8da2..0d31a3b3505f 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock.c @@ -2,7 +2,7 @@ // Copyright (c) 2019 Facebook #include <linux/bpf.h> #include <linux/version.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct hmap_elem { volatile int cnt; diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c index f5638e26865d..0cf0134631b4 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c @@ -2,7 +2,7 @@ // Copyright (c) 2018 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #ifndef PERF_MAX_STACK_DEPTH #define PERF_MAX_STACK_DEPTH 127 diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c index 3b7e1dca8829..00ed48672620 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c @@ -2,7 +2,7 @@ // Copyright (c) 2018 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #ifndef PERF_MAX_STACK_DEPTH #define PERF_MAX_STACK_DEPTH 127 diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c index d22e438198cf..458b0d69133e 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c @@ -7,7 +7,7 @@ #include <linux/stddef.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c index cb201cbe11e7..b2e6f9b0894d 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c @@ -7,7 +7,7 @@ #include <linux/stddef.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 5cbbff416998..2d0b0b82a78a 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -7,7 +7,7 @@ #include <linux/stddef.h> #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* Max supported length of a string with unsigned long in base 10 (pow2 - 1). */ #define MAX_ULONG_STR_LEN 0xF diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c index 0961415ba477..bf28814bfde5 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_edt.c +++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c @@ -7,8 +7,8 @@ #include <linux/ip.h> #include <linux/pkt_cls.h> #include <linux/tcp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> /* the maximum delay we are willing to add (drop packets beyond that) */ #define TIME_HORIZON_NS (2000 * 1000 * 1000) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 74370e7e286d..37bce7a7c394 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -17,8 +17,8 @@ #include <linux/pkt_cls.h> #include <linux/types.h> -#include "bpf_endian.h" -#include "bpf_helpers.h" +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> static const int cfg_port = 8000; diff --git a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c index d8803dfa8d32..47cbe2eeae43 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c @@ -13,8 +13,8 @@ #include <sys/socket.h> #include <linux/tcp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> struct bpf_map_def SEC("maps") results = { .type = BPF_MAP_TYPE_ARRAY, diff --git a/tools/testing/selftests/bpf/progs/test_tcp_estats.c b/tools/testing/selftests/bpf/progs/test_tcp_estats.c index 87b7d934ce73..adc83a54c352 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_estats.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_estats.c @@ -36,7 +36,7 @@ #include <linux/ipv6.h> #include <linux/version.h> #include <sys/socket.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define _(P) ({typeof(P) val = 0; bpf_probe_read_kernel(&val, sizeof(val), &P); val;}) #define TCP_ESTATS_MAGIC 0xBAADBEEF diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index 7fa4595d2b66..1f1966e86e9f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -10,8 +10,8 @@ #include <linux/types.h> #include <linux/socket.h> #include <linux/tcp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "test_tcpbpf.h" struct { diff --git a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c index 08346e7765d5..ac63410bb541 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c @@ -10,8 +10,8 @@ #include <linux/types.h> #include <linux/socket.h> #include <linux/tcp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "test_tcpnotify.h" struct { diff --git a/tools/testing/selftests/bpf/progs/test_tracepoint.c b/tools/testing/selftests/bpf/progs/test_tracepoint.c index 04bf084517e0..4b825ee122cf 100644 --- a/tools/testing/selftests/bpf/progs/test_tracepoint.c +++ b/tools/testing/selftests/bpf/progs/test_tracepoint.c @@ -2,7 +2,7 @@ // Copyright (c) 2017 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ struct sched_switch_args { diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 504df69c83df..f48dbfe24ddc 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -19,8 +19,8 @@ #include <linux/socket.h> #include <linux/pkt_cls.h> #include <linux/erspan.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define ERROR(ret) do {\ char fmt[] = "ERROR line:%d ret:%d\n";\ diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale1.c b/tools/testing/selftests/bpf/progs/test_verif_scale1.c index f3236ce35f31..d38153dab3dd 100644 --- a/tools/testing/selftests/bpf/progs/test_verif_scale1.c +++ b/tools/testing/selftests/bpf/progs/test_verif_scale1.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define ATTR __attribute__((noinline)) #include "test_jhash.h" diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c index 9897150ed516..f024154c7be7 100644 --- a/tools/testing/selftests/bpf/progs/test_verif_scale2.c +++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define ATTR __always_inline #include "test_jhash.h" diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale3.c b/tools/testing/selftests/bpf/progs/test_verif_scale3.c index 1848da04ea41..9beb5bf80373 100644 --- a/tools/testing/selftests/bpf/progs/test_verif_scale3.c +++ b/tools/testing/selftests/bpf/progs/test_verif_scale3.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define ATTR __attribute__((noinline)) #include "test_jhash.h" diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c index 0941c655b07b..31f9bce37491 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp.c +++ b/tools/testing/selftests/bpf/progs/test_xdp.c @@ -16,8 +16,8 @@ #include <linux/tcp.h> #include <linux/pkt_cls.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "test_iptunnel_common.h" int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c new file mode 100644 index 000000000000..cb8a04ab7a78 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_trace_helpers.h" + +struct net_device { + /* Structure does not need to contain all entries, + * as "preserve_access_index" will use BTF to fix this... + */ + int ifindex; +} __attribute__((preserve_access_index)); + +struct xdp_rxq_info { + /* Structure does not need to contain all entries, + * as "preserve_access_index" will use BTF to fix this... + */ + struct net_device *dev; + __u32 queue_index; +} __attribute__((preserve_access_index)); + +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + unsigned long handle; + struct xdp_rxq_info *rxq; +} __attribute__((preserve_access_index)); + +__u64 test_result_fentry = 0; +SEC("fentry/_xdp_tx_iptunnel") +int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) +{ + test_result_fentry = xdp->rxq->dev->ifindex; + return 0; +} + +__u64 test_result_fexit = 0; +SEC("fexit/_xdp_tx_iptunnel") +int BPF_PROG(trace_on_exit, struct xdp_buff *xdp, int ret) +{ + test_result_fexit = ret; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_loop.c b/tools/testing/selftests/bpf/progs/test_xdp_loop.c index 97175f73c3fe..fcabcda30ba3 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_loop.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_loop.c @@ -12,8 +12,8 @@ #include <linux/tcp.h> #include <linux/pkt_cls.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "test_iptunnel_common.h" int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index 8d0182650653..a7c4a7d49fe6 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -2,7 +2,7 @@ #include <linux/if_ether.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define __round_mask(x, y) ((__typeof__(x))((y) - 1)) #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index e88d7b9d65ab..8beecec166d9 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -13,8 +13,8 @@ #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> static __u32 rol32(__u32 word, unsigned int shift) { @@ -86,7 +86,7 @@ u32 jhash(const void *key, u32 length, u32 initval) return c; } -static __attribute__ ((noinline)) +__attribute__ ((noinline)) u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; @@ -96,7 +96,7 @@ u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) return c; } -static __attribute__ ((noinline)) +__attribute__ ((noinline)) u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); diff --git a/tools/testing/selftests/bpf/progs/test_xdp_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_redirect.c index ef9e704be140..a5337cd9400b 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_redirect.c @@ -10,7 +10,7 @@ * General Public License for more details. */ #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c index 365a7d2d9f5c..134768f6b788 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c @@ -22,8 +22,8 @@ #include <linux/in.h> #include <linux/pkt_cls.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> /* linux/if_vlan.h have not exposed this as UAPI, thus mirror some here * diff --git a/tools/testing/selftests/bpf/progs/xdp_dummy.c b/tools/testing/selftests/bpf/progs/xdp_dummy.c index 43b0ef1001ed..ea25e8881992 100644 --- a/tools/testing/selftests/bpf/progs/xdp_dummy.c +++ b/tools/testing/selftests/bpf/progs/xdp_dummy.c @@ -2,7 +2,7 @@ #define KBUILD_MODNAME "xdp_dummy" #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("xdp_dummy") int xdp_dummy_prog(struct xdp_md *ctx) diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c index 1c5f298d7196..d037262c8937 100644 --- a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c +++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_DEVMAP); diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c index 57912e7c94b0..94e6c2b281cb 100644 --- a/tools/testing/selftests/bpf/progs/xdp_tx.c +++ b/tools/testing/selftests/bpf/progs/xdp_tx.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> SEC("tx") int xdp_tx(struct xdp_md *xdp) diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c index 112a2857f4e2..6b9ca40bd1f4 100644 --- a/tools/testing/selftests/bpf/progs/xdping_kern.c +++ b/tools/testing/selftests/bpf/progs/xdping_kern.c @@ -12,8 +12,8 @@ #include <linux/if_vlan.h> #include <linux/ip.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #include "xdping.h" diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 3d617e806054..93040ca83e60 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -4148,10 +4148,6 @@ static int do_test_file(unsigned int test_num) if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj))) return PTR_ERR(obj); - err = bpf_object__btf_fd(obj); - if (CHECK(err == -1, "bpf_object__btf_fd: -1")) - goto done; - prog = bpf_program__next(NULL, obj); if (CHECK(!prog, "Cannot find bpf_prog")) { err = -1; diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index 6fe23a10d48a..a8d2e9a87fbf 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #include <iostream> -#include "libbpf.h" -#include "bpf.h" -#include "btf.h" +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> #include "test_core_extern.skel.h" /* do nothing, just make sure we can link successfully */ diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/test_hashmap.c index b64094c981e3..c490e012c23f 100644 --- a/tools/testing/selftests/bpf/test_hashmap.c +++ b/tools/testing/selftests/bpf/test_hashmap.c @@ -8,7 +8,7 @@ #include <stdio.h> #include <errno.h> #include <linux/err.h> -#include "hashmap.h" +#include "bpf/hashmap.h" #define CHECK(condition, format...) ({ \ int __ret = !!(condition); \ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 7fa7d08a8104..bab1e6f1d8f1 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -8,7 +8,7 @@ #include <string.h> /* defined in test_progs.h */ -struct test_env env; +struct test_env env = {}; struct prog_test_def { const char *test_name; @@ -29,10 +29,19 @@ struct prog_test_def { static bool should_run(struct test_selector *sel, int num, const char *name) { - if (sel->name && sel->name[0] && !strstr(name, sel->name)) - return false; + int i; + + for (i = 0; i < sel->blacklist.cnt; i++) { + if (strstr(name, sel->blacklist.strs[i])) + return false; + } - if (!sel->num_set) + for (i = 0; i < sel->whitelist.cnt; i++) { + if (strstr(name, sel->whitelist.strs[i])) + return true; + } + + if (!sel->whitelist.cnt && !sel->num_set) return true; return num < sel->num_set_len && sel->num_set[num]; @@ -334,6 +343,7 @@ const char argp_program_doc[] = "BPF selftests test runner"; enum ARG_KEYS { ARG_TEST_NUM = 'n', ARG_TEST_NAME = 't', + ARG_TEST_NAME_BLACKLIST = 'b', ARG_VERIFIER_STATS = 's', ARG_VERBOSE = 'v', }; @@ -341,8 +351,10 @@ enum ARG_KEYS { static const struct argp_option opts[] = { { "num", ARG_TEST_NUM, "NUM", 0, "Run test number NUM only " }, - { "name", ARG_TEST_NAME, "NAME", 0, - "Run tests with names containing NAME" }, + { "name", ARG_TEST_NAME, "NAMES", 0, + "Run tests with names containing any string from NAMES list" }, + { "name-blacklist", ARG_TEST_NAME_BLACKLIST, "NAMES", 0, + "Don't run tests with names containing any string from NAMES list" }, { "verifier-stats", ARG_VERIFIER_STATS, NULL, 0, "Output verifier statistics", }, { "verbose", ARG_VERBOSE, "LEVEL", OPTION_ARG_OPTIONAL, @@ -359,6 +371,41 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } +static int parse_str_list(const char *s, struct str_set *set) +{ + char *input, *state = NULL, *next, **tmp, **strs = NULL; + int cnt = 0; + + input = strdup(s); + if (!input) + return -ENOMEM; + + set->cnt = 0; + set->strs = NULL; + + while ((next = strtok_r(state ? NULL : input, ",", &state))) { + tmp = realloc(strs, sizeof(*strs) * (cnt + 1)); + if (!tmp) + goto err; + strs = tmp; + + strs[cnt] = strdup(next); + if (!strs[cnt]) + goto err; + + cnt++; + } + + set->cnt = cnt; + set->strs = (const char **)strs; + free(input); + return 0; +err: + free(strs); + free(input); + return -ENOMEM; +} + int parse_num_list(const char *s, struct test_selector *sel) { int i, set_len = 0, num, start = 0, end = -1; @@ -449,12 +496,24 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (subtest_str) { *subtest_str = '\0'; - env->subtest_selector.name = strdup(subtest_str + 1); - if (!env->subtest_selector.name) + if (parse_str_list(subtest_str + 1, + &env->subtest_selector.whitelist)) + return -ENOMEM; + } + if (parse_str_list(arg, &env->test_selector.whitelist)) + return -ENOMEM; + break; + } + case ARG_TEST_NAME_BLACKLIST: { + char *subtest_str = strchr(arg, '/'); + + if (subtest_str) { + *subtest_str = '\0'; + if (parse_str_list(subtest_str + 1, + &env->subtest_selector.blacklist)) return -ENOMEM; } - env->test_selector.name = strdup(arg); - if (!env->test_selector.name) + if (parse_str_list(arg, &env->test_selector.blacklist)) return -ENOMEM; break; } @@ -617,7 +676,11 @@ int main(int argc, char **argv) printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); + free(env.test_selector.blacklist.strs); + free(env.test_selector.whitelist.strs); free(env.test_selector.num_set); + free(env.subtest_selector.blacklist.strs); + free(env.subtest_selector.whitelist.strs); free(env.subtest_selector.num_set); return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index de1fdaa4e7b4..bcfa9ef23fda 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -35,7 +35,7 @@ typedef __u16 __sum16; #include "test_iptunnel_common.h" #include "bpf_util.h" -#include "bpf_endian.h" +#include <bpf/bpf_endian.h> #include "trace_helpers.h" #include "flow_dissector_load.h" @@ -46,8 +46,14 @@ enum verbosity { VERBOSE_SUPER, }; +struct str_set { + const char **strs; + int cnt; +}; + struct test_selector { - const char *name; + struct str_set whitelist; + struct str_set blacklist; bool *num_set; int num_set_len; }; diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 0e6652733462..52bf14955797 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -13,7 +13,7 @@ #include <bpf/bpf.h> #include "cgroup_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_endian.h> #include "bpf_rlimit.h" #include "bpf_util.h" diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h index d008b41b7d8d..9b4d3a68a91a 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/test_sockmap_kern.h @@ -12,8 +12,8 @@ #include <linux/tcp.h> #include <linux/pkt_cls.h> #include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> /* Sockmap sample program connects a client and a backend together * using cgroups. diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c index 40bd93a6e7ae..d196e2a4a6e0 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/test_sysctl.c @@ -13,7 +13,7 @@ #include <bpf/bpf.h> #include <bpf/libbpf.h> -#include "bpf_endian.h" +#include <bpf/bpf_endian.h> #include "bpf_rlimit.h" #include "bpf_util.h" #include "cgroup_helpers.h" diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index aa4dcfe18050..0383c9b8adc1 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -2,7 +2,7 @@ #ifndef __TRACE_HELPER_H #define __TRACE_HELPER_H -#include <libbpf.h> +#include <bpf/libbpf.h> struct ksym { long addr; diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index b4efb023ae51..d88d8e47d11b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -50,6 +50,8 @@ ALL_TESTS=" ipv6_mc_dip_reserved_scope_test ipv6_mc_dip_interface_local_scope_test blackhole_route_test + irif_disabled_test + erif_disabled_test " NUM_NETIFS=4 @@ -553,6 +555,116 @@ blackhole_route_test() __blackhole_route_test "6" "2001:db8:2::/120" "ipv6" $h2_ipv6 "icmpv6" } +irif_disabled_test() +{ + local trap_name="irif_disabled" + local group_name="l3_drops" + local t0_packets t0_bytes + local t1_packets t1_bytes + local mz_pid + + RET=0 + + ping_check $trap_name + + devlink_trap_action_set $trap_name "trap" + + # When RIF of a physical port ("Sub-port RIF") is destroyed, we first + # block the STP of the {Port, VLAN} so packets cannot get into the RIF. + # Using bridge enables us to see this trap because when bridge is + # destroyed, there is a small time window that packets can go into the + # RIF, while it is disabled. + ip link add dev br0 type bridge + ip link set dev $rp1 master br0 + ip address flush dev $rp1 + __addr_add_del br0 add 192.0.2.2/24 + ip li set dev br0 up + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + t0_bytes=$(devlink_trap_rx_bytes_get $trap_name) + + # Generate packets to h2 through br0 RIF that will be removed later + $MZ $h1 -t udp "sp=54321,dp=12345" -c 0 -p 100 -a own -b $rp1mac \ + -B $h2_ipv4 -q & + mz_pid=$! + + # Wait before removing br0 RIF to allow packets to go into the bridge. + sleep 1 + + # Flushing address will dismantle the RIF + ip address flush dev br0 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + t1_bytes=$(devlink_trap_rx_bytes_get $trap_name) + + if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then + check_err 1 "Trap stats idle when packets should be trapped" + fi + + log_test "Ingress RIF disabled" + + kill $mz_pid && wait $mz_pid &> /dev/null + ip link set dev $rp1 nomaster + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + ip link del dev br0 type bridge + devlink_trap_action_set $trap_name "drop" +} + +erif_disabled_test() +{ + local trap_name="erif_disabled" + local group_name="l3_drops" + local t0_packets t0_bytes + local t1_packets t1_bytes + local mz_pid + + RET=0 + + ping_check $trap_name + + devlink_trap_action_set $trap_name "trap" + ip link add dev br0 type bridge + ip add flush dev $rp1 + ip link set dev $rp1 master br0 + __addr_add_del br0 add 192.0.2.2/24 + ip link set dev br0 up + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + t0_bytes=$(devlink_trap_rx_bytes_get $trap_name) + + rp2mac=$(mac_get $rp2) + + # Generate packets that should go out through br0 RIF that will be + # removed later + $MZ $h2 -t udp "sp=54321,dp=12345" -c 0 -p 100 -a own -b $rp2mac \ + -B 192.0.2.1 -q & + mz_pid=$! + + sleep 5 + # In order to see this trap we need a route that points to disabled RIF. + # When ipv6 address is flushed, there is a delay and the routes are + # deleted before the RIF and we cannot get state that we have route + # to disabled RIF. + # Delete IPv6 address first and then check this trap with flushing IPv4. + ip -6 add flush dev br0 + ip -4 add flush dev br0 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + t1_bytes=$(devlink_trap_rx_bytes_get $trap_name) + + if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then + check_err 1 "Trap stats idle when packets should be trapped" + fi + + log_test "Egress RIF disabled" + + kill $mz_pid && wait $mz_pid &> /dev/null + ip link set dev $rp1 nomaster + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + ip link del dev br0 type bridge + devlink_trap_action_set $trap_name "drop" +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh new file mode 100755 index 000000000000..039629bb92a3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh @@ -0,0 +1,265 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap tunnel exceptions functionality over mlxsw. +# Check all exception traps to make sure they are triggered under the right +# conditions. + +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.1/28 | | +# +-------------------|-----+ +# | +# +-------------------|-----+ +# | SW1 | | +# | $swp1 + | +# | 192.0.2.2/28 | +# | | +# | + g1a (gre) | +# | loc=192.0.2.65 | +# | rem=192.0.2.66 | +# | tos=inherit | +# | | +# | + $rp1 | +# | | 198.51.100.1/28 | +# +--|----------------------+ +# | +# +--|----------------------+ +# | | VRF2 | +# | + $rp2 | +# | 198.51.100.2/28 | +# +-------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + decap_error_test +" + +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +vrf2_create() +{ + simple_if_init $rp2 198.51.100.2/28 +} + +vrf2_destroy() +{ + simple_if_fini $rp2 198.51.100.2/28 +} + +switch_create() +{ + __addr_add_del $swp1 add 192.0.2.2/28 + tc qdisc add dev $swp1 clsact + ip link set dev $swp1 up + + tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit + __addr_add_del g1 add 192.0.2.65/32 + ip link set dev g1 up + + __addr_add_del $rp1 add 198.51.100.1/28 + ip link set dev $rp1 up +} + +switch_destroy() +{ + ip link set dev $rp1 down + __addr_add_del $rp1 del 198.51.100.1/28 + + ip link set dev g1 down + __addr_add_del g1 del 192.0.2.65/32 + tunnel_destroy g1 + + ip link set dev $swp1 down + tc qdisc del dev $swp1 clsact + __addr_add_del $swp1 del 192.0.2.2/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + forwarding_enable + vrf_prepare + h1_create + switch_create + vrf2_create +} + +cleanup() +{ + pre_cleanup + + vrf2_destroy + switch_destroy + h1_destroy + vrf_cleanup + forwarding_restore +} + +ecn_payload_get() +{ + p=$(: + )"0"$( : GRE flags + )"0:00:"$( : Reserved + version + )"08:00:"$( : ETH protocol type + )"4"$( : IP version + )"5:"$( : IHL + )"00:"$( : IP TOS + )"00:14:"$( : IP total length + )"00:00:"$( : IP identification + )"20:00:"$( : IP flags + frag off + )"30:"$( : IP TTL + )"01:"$( : IP proto + )"E7:E6:"$( : IP header csum + )"C0:00:01:01:"$( : IP saddr : 192.0.1.1 + )"C0:00:02:01:"$( : IP daddr : 192.0.2.1 + ) + echo $p +} + +ecn_decap_test() +{ + local trap_name="decap_error" + local group_name="tunnel_drops" + local desc=$1; shift + local ecn_desc=$1; shift + local outer_tos=$1; shift + local mz_pid + + RET=0 + + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower src_ip 192.0.1.1 dst_ip 192.0.2.1 action pass + + rp1_mac=$(mac_get $rp1) + rp2_mac=$(mac_get $rp2) + payload=$(ecn_payload_get) + + ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -a $rp2_mac -b $rp1_mac \ + -A 192.0.2.66 -B 192.0.2.65 -t ip \ + len=48,tos=$outer_tos,proto=47,p=$payload -q & + + mz_pid=$! + + devlink_trap_exception_test $trap_name $group_name + + tc_check_packets "dev $swp1 egress" 101 0 + check_err $? "Packets were not dropped" + + log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" + + kill $mz_pid && wait $mz_pid &> /dev/null + tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower +} + +ipip_payload_get() +{ + local flags=$1; shift + local key=$1; shift + + p=$(: + )"$flags"$( : GRE flags + )"0:00:"$( : Reserved + version + )"08:00:"$( : ETH protocol type + )"$key"$( : Key + )"4"$( : IP version + )"5:"$( : IHL + )"00:"$( : IP TOS + )"00:14:"$( : IP total length + )"00:00:"$( : IP identification + )"20:00:"$( : IP flags + frag off + )"30:"$( : IP TTL + )"01:"$( : IP proto + )"E7:E6:"$( : IP header csum + )"C0:00:01:01:"$( : IP saddr : 192.0.1.1 + )"C0:00:02:01:"$( : IP daddr : 192.0.2.1 + ) + echo $p +} + +no_matching_tunnel_test() +{ + local trap_name="decap_error" + local group_name="tunnel_drops" + local desc=$1; shift + local sip=$1; shift + local mz_pid + + RET=0 + + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower src_ip 192.0.1.1 dst_ip 192.0.2.1 action pass + + rp1_mac=$(mac_get $rp1) + rp2_mac=$(mac_get $rp2) + payload=$(ipip_payload_get "$@") + + ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -a $rp2_mac -b $rp1_mac \ + -A $sip -B 192.0.2.65 -t ip len=48,proto=47,p=$payload -q & + mz_pid=$! + + devlink_trap_exception_test $trap_name $group_name + + tc_check_packets "dev $swp1 egress" 101 0 + check_err $? "Packets were not dropped" + + log_test "$desc" + + kill $mz_pid && wait $mz_pid &> /dev/null + tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower +} + +decap_error_test() +{ + # Correct source IP - the remote address + local sip=192.0.2.66 + + ecn_decap_test "Decap error" "ECT(1)" 01 + ecn_decap_test "Decap error" "ECT(0)" 02 + ecn_decap_test "Decap error" "CE" 03 + + no_matching_tunnel_test "Decap error: Source IP check failed" \ + 192.0.2.68 "0" + no_matching_tunnel_test \ + "Decap error: Key exists but was not expected" $sip "2" ":E9:" + + # Destroy the tunnel and create new one with key + __addr_add_del g1 del 192.0.2.65/32 + tunnel_destroy g1 + + tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit key 233 + __addr_add_del g1 add 192.0.2.65/32 + + no_matching_tunnel_test \ + "Decap error: Key does not exist but was expected" $sip "0" + no_matching_tunnel_test \ + "Decap error: Packet has a wrong key field" $sip "2" "E8:" +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh new file mode 100755 index 000000000000..fd19161dd4ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -0,0 +1,330 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap tunnel drops and exceptions functionality over mlxsw. +# Check all traps to make sure they are triggered under the right +# conditions. + +# +--------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|---------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | +--|--------------------------------------------------------------------+ | +# | | + $swp1 BR1 (802.1d) | | +# | | | | +# | | + vx1 (vxlan) | | +# | | local 192.0.2.17 | | +# | | id 1000 dstport $VXPORT | | +# | +-----------------------------------------------------------------------+ | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +----|----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | VRF2 | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | +# +-------------------------------------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + decap_error_test + overlay_smac_is_mc_test +" + +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +: ${VXPORT:=4789} +export VXPORT + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +switch_create() +{ + ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + tc qdisc add dev $swp1 clsact + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + + ip link add name vx1 type vxlan id 1000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx1 master br1 + ip link set dev vx1 up + + ip address add dev $rp1 192.0.2.17/28 + ip link set dev $rp1 up +} + +switch_destroy() +{ + ip link set dev $rp1 down + ip address del dev $rp1 192.0.2.17/28 + + ip link set dev vx1 down + ip link set dev vx1 nomaster + ip link del dev vx1 + + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + tc qdisc del dev $swp1 clsact + + ip link set dev br1 down + ip link del dev br1 +} + +vrf2_create() +{ + simple_if_init $rp2 192.0.2.18/28 +} + +vrf2_destroy() +{ + simple_if_fini $rp2 192.0.2.18/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable + h1_create + switch_create + vrf2_create +} + +cleanup() +{ + pre_cleanup + + vrf2_destroy + switch_destroy + h1_destroy + forwarding_restore + vrf_cleanup +} + +ecn_payload_get() +{ + dest_mac=$(mac_get $h1) + p=$(: + )"08:"$( : VXLAN flags + )"00:00:00:"$( : VXLAN reserved + )"00:03:e8:"$( : VXLAN VNI : 1000 + )"00:"$( : VXLAN reserved + )"$dest_mac:"$( : ETH daddr + )"00:00:00:00:00:00:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:14:"$( : IP total length + )"00:00:"$( : IP identification + )"20:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"00:"$( : IP proto + )"D6:E5:"$( : IP header csum + )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 + )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + ) + echo $p +} + +ecn_decap_test() +{ + local trap_name="decap_error" + local group_name="tunnel_drops" + local desc=$1; shift + local ecn_desc=$1; shift + local outer_tos=$1; shift + local mz_pid + + RET=0 + + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower src_ip 192.0.2.3 dst_ip 192.0.2.1 action pass + + rp1_mac=$(mac_get $rp1) + payload=$(ecn_payload_get) + + ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac -B 192.0.2.17 \ + -t udp sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q & + mz_pid=$! + + devlink_trap_exception_test $trap_name $group_name + + tc_check_packets "dev $swp1 egress" 101 0 + check_err $? "Packets were not dropped" + + log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" + + kill $mz_pid && wait $mz_pid &> /dev/null + tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower +} + +reserved_bits_payload_get() +{ + dest_mac=$(mac_get $h1) + p=$(: + )"08:"$( : VXLAN flags + )"01:00:00:"$( : VXLAN reserved + )"00:03:e8:"$( : VXLAN VNI : 1000 + )"00:"$( : VXLAN reserved + )"$dest_mac:"$( : ETH daddr + )"00:00:00:00:00:00:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:14:"$( : IP total length + )"00:00:"$( : IP identification + )"20:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"00:"$( : IP proto + )"00:00:"$( : IP header csum + )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 + )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + ) + echo $p +} + +short_payload_get() +{ + dest_mac=$(mac_get $h1) + p=$(: + )"08:"$( : VXLAN flags + )"01:00:00:"$( : VXLAN reserved + )"00:03:e8:"$( : VXLAN VNI : 1000 + )"00:"$( : VXLAN reserved + ) + echo $p +} + +corrupted_packet_test() +{ + local trap_name="decap_error" + local group_name="tunnel_drops" + local desc=$1; shift + local payload_get=$1; shift + local mz_pid + + RET=0 + + # In case of too short packet, there is no any inner packet, + # so the matching will always succeed + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower skip_hw src_ip 192.0.2.3 dst_ip 192.0.2.1 action pass + + rp1_mac=$(mac_get $rp1) + payload=$($payload_get) + ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac \ + -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & + mz_pid=$! + + devlink_trap_exception_test $trap_name $group_name + + tc_check_packets "dev $swp1 egress" 101 0 + check_err $? "Packets were not dropped" + + log_test "$desc" + + kill $mz_pid && wait $mz_pid &> /dev/null + tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower +} + +decap_error_test() +{ + ecn_decap_test "Decap error" "ECT(1)" 01 + ecn_decap_test "Decap error" "ECT(0)" 02 + ecn_decap_test "Decap error" "CE" 03 + + corrupted_packet_test "Decap error: Reserved bits in use" \ + "reserved_bits_payload_get" + corrupted_packet_test "Decap error: No L2 header" "short_payload_get" +} + +mc_smac_payload_get() +{ + dest_mac=$(mac_get $h1) + source_mac=01:02:03:04:05:06 + p=$(: + )"08:"$( : VXLAN flags + )"00:00:00:"$( : VXLAN reserved + )"00:03:e8:"$( : VXLAN VNI : 1000 + )"00:"$( : VXLAN reserved + )"$dest_mac:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:14:"$( : IP total length + )"00:00:"$( : IP identification + )"20:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"00:"$( : IP proto + )"00:00:"$( : IP header csum + )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 + )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + ) + echo $p +} + +overlay_smac_is_mc_test() +{ + local trap_name="overlay_smac_is_mc" + local group_name="tunnel_drops" + local mz_pid + + RET=0 + + # The matching will be checked on devlink_trap_drop_test() + # and the filter will be removed on devlink_trap_drop_cleanup() + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower src_mac 01:02:03:04:05:06 action pass + + rp1_mac=$(mac_get $rp1) + payload=$(mc_smac_payload_get) + + ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac \ + -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & + mz_pid=$! + + devlink_trap_drop_test $trap_name $group_name $swp1 + + log_test "Overlay source MAC is multicast" + + devlink_trap_drop_cleanup $mz_pid $swp1 "ip" +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/fib.sh b/tools/testing/selftests/drivers/net/mlxsw/fib.sh new file mode 100755 index 000000000000..45115f81c2b1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/fib.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the FIB offload API on top of mlxsw. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + ipv4_identical_routes + ipv4_tos + ipv4_metric + ipv4_replace + ipv4_delete + ipv4_plen + ipv4_replay + ipv4_flush + ipv6_add + ipv6_metric + ipv6_append_single + ipv6_replace_single + ipv6_metric_multipath + ipv6_append_multipath + ipv6_replace_multipath + ipv6_append_multipath_to_single + ipv6_delete_single + ipv6_delete_multipath + ipv6_replay_single + ipv6_replay_multipath +" +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh +source $lib_dir/fib_offload_lib.sh + +ipv4_identical_routes() +{ + fib_ipv4_identical_routes_test "testns1" +} + +ipv4_tos() +{ + fib_ipv4_tos_test "testns1" +} + +ipv4_metric() +{ + fib_ipv4_metric_test "testns1" +} + +ipv4_replace() +{ + fib_ipv4_replace_test "testns1" +} + +ipv4_delete() +{ + fib_ipv4_delete_test "testns1" +} + +ipv4_plen() +{ + fib_ipv4_plen_test "testns1" +} + +ipv4_replay_metric() +{ + fib_ipv4_replay_metric_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay_tos() +{ + fib_ipv4_replay_tos_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay_plen() +{ + fib_ipv4_replay_plen_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay() +{ + ipv4_replay_metric + ipv4_replay_tos + ipv4_replay_plen +} + +ipv4_flush() +{ + fib_ipv4_flush_test "testns1" +} + +ipv6_add() +{ + fib_ipv6_add_test "testns1" +} + +ipv6_metric() +{ + fib_ipv6_metric_test "testns1" +} + +ipv6_append_single() +{ + fib_ipv6_append_single_test "testns1" +} + +ipv6_replace_single() +{ + fib_ipv6_replace_single_test "testns1" +} + +ipv6_metric_multipath() +{ + fib_ipv6_metric_multipath_test "testns1" +} + +ipv6_append_multipath() +{ + fib_ipv6_append_multipath_test "testns1" +} + +ipv6_replace_multipath() +{ + fib_ipv6_replace_multipath_test "testns1" +} + +ipv6_append_multipath_to_single() +{ + fib_ipv6_append_multipath_to_single_test "testns1" +} + +ipv6_delete_single() +{ + fib_ipv6_delete_single_test "testns1" +} + +ipv6_delete_multipath() +{ + fib_ipv6_delete_multipath_test "testns1" +} + +ipv6_replay_single() +{ + fib_ipv6_replay_single_test "testns1" "$DEVLINK_DEV" +} + +ipv6_replay_multipath() +{ + fib_ipv6_replay_multipath_test "testns1" "$DEVLINK_DEV" +} + +setup_prepare() +{ + ip netns add testns1 + if [ $? -ne 0 ]; then + echo "Failed to add netns \"testns1\"" + exit 1 + fi + + devlink dev reload $DEVLINK_DEV netns testns1 + if [ $? -ne 0 ]; then + echo "Failed to reload into netns \"testns1\"" + exit 1 + fi +} + +cleanup() +{ + pre_cleanup + devlink -N testns1 dev reload $DEVLINK_DEV netns $$ + ip netns del testns1 +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh index a5937069ac16..faa51012cdac 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh @@ -1,29 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -humanize() -{ - local speed=$1; shift - - for unit in bps Kbps Mbps Gbps; do - if (($(echo "$speed < 1024" | bc))); then - break - fi - - speed=$(echo "scale=1; $speed / 1024" | bc) - done - - echo "$speed${unit}" -} - -rate() -{ - local t0=$1; shift - local t1=$1; shift - local interval=$1; shift - - echo $((8 * (t1 - t0) / interval)) -} - check_rate() { local rate=$1; shift diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index 47315fe48d5a..24dd8ed48580 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -232,7 +232,7 @@ test_mc_aware() stop_traffic local ucth1=${uc_rate[1]} - start_traffic $h1 own bc bc + start_traffic $h1 192.0.2.65 bc bc local d0=$(date +%s) local t0=$(ethtool_stats_get $h3 rx_octets_prio_0) @@ -254,7 +254,11 @@ test_mc_aware() ret = 100 * ($ucth1 - $ucth2) / $ucth1 if (ret > 0) { ret } else { 0 } ") - check_err $(bc <<< "$deg > 25") + + # Minimum shaper of 200Mbps on MC TCs should cause about 20% of + # degradation on 1Gbps link. + check_err $(bc <<< "$deg < 15") "Minimum shaper not in effect" + check_err $(bc <<< "$deg > 25") "MC traffic degrades UC performance too much" local interval=$((d1 - d0)) local mc_ir=$(rate $u0 $u1 $interval) diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh new file mode 100755 index 000000000000..c6ce0b448bf3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source qos_lib.sh +bail_on_lldpad + +lib_dir=$(dirname $0)/../../../net/forwarding +TCFLAGS=skip_sw +source $lib_dir/sch_tbf_ets.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh new file mode 100755 index 000000000000..8d245f331619 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source qos_lib.sh +bail_on_lldpad + +lib_dir=$(dirname $0)/../../../net/forwarding +TCFLAGS=skip_sw +source $lib_dir/sch_tbf_prio.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh new file mode 100755 index 000000000000..013886061f15 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source qos_lib.sh +bail_on_lldpad + +lib_dir=$(dirname $0)/../../../net/forwarding +TCFLAGS=skip_sw +source $lib_dir/sch_tbf_root.sh diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib.sh b/tools/testing/selftests/drivers/net/netdevsim/fib.sh new file mode 100755 index 000000000000..2f87c3be76a9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/fib.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the FIB offload API. It makes use of netdevsim +# which registers a listener to the FIB notification chain. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + ipv4_identical_routes + ipv4_tos + ipv4_metric + ipv4_replace + ipv4_delete + ipv4_plen + ipv4_replay + ipv4_flush + ipv4_error_path + ipv6_add + ipv6_metric + ipv6_append_single + ipv6_replace_single + ipv6_metric_multipath + ipv6_append_multipath + ipv6_replace_multipath + ipv6_append_multipath_to_single + ipv6_delete_single + ipv6_delete_multipath + ipv6_replay_single + ipv6_replay_multipath + ipv6_error_path +" +NETDEVSIM_PATH=/sys/bus/netdevsim/ +DEV_ADDR=1337 +DEV=netdevsim${DEV_ADDR} +DEVLINK_DEV=netdevsim/${DEV} +SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh +source $lib_dir/fib_offload_lib.sh + +ipv4_identical_routes() +{ + fib_ipv4_identical_routes_test "testns1" +} + +ipv4_tos() +{ + fib_ipv4_tos_test "testns1" +} + +ipv4_metric() +{ + fib_ipv4_metric_test "testns1" +} + +ipv4_replace() +{ + fib_ipv4_replace_test "testns1" +} + +ipv4_delete() +{ + fib_ipv4_delete_test "testns1" +} + +ipv4_plen() +{ + fib_ipv4_plen_test "testns1" +} + +ipv4_replay_metric() +{ + fib_ipv4_replay_metric_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay_tos() +{ + fib_ipv4_replay_tos_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay_plen() +{ + fib_ipv4_replay_plen_test "testns1" "$DEVLINK_DEV" +} + +ipv4_replay() +{ + ipv4_replay_metric + ipv4_replay_tos + ipv4_replay_plen +} + +ipv4_flush() +{ + fib_ipv4_flush_test "testns1" +} + +ipv4_error_path_add() +{ + local lsb + + RET=0 + + ip -n testns1 link add name dummy1 type dummy + ip -n testns1 link set dev dummy1 up + + devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 10 + devlink -N testns1 dev reload $DEVLINK_DEV + + for lsb in $(seq 1 20); do + ip -n testns1 route add 192.0.2.${lsb}/32 dev dummy1 \ + &> /dev/null + done + + log_test "IPv4 error path - add" + + ip -n testns1 link del dev dummy1 +} + +ipv4_error_path_replay() +{ + local lsb + + RET=0 + + ip -n testns1 link add name dummy1 type dummy + ip -n testns1 link set dev dummy1 up + + devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 100 + devlink -N testns1 dev reload $DEVLINK_DEV + + for lsb in $(seq 1 20); do + ip -n testns1 route add 192.0.2.${lsb}/32 dev dummy1 + done + + devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 10 + devlink -N testns1 dev reload $DEVLINK_DEV &> /dev/null + + log_test "IPv4 error path - replay" + + ip -n testns1 link del dev dummy1 + + # Successfully reload after deleting all the routes. + devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 100 + devlink -N testns1 dev reload $DEVLINK_DEV +} + +ipv4_error_path() +{ + # Test the different error paths of the notifiers by limiting the size + # of the "IPv4/fib" resource. + ipv4_error_path_add + ipv4_error_path_replay +} + +ipv6_add() +{ + fib_ipv6_add_test "testns1" +} + +ipv6_metric() +{ + fib_ipv6_metric_test "testns1" +} + +ipv6_append_single() +{ + fib_ipv6_append_single_test "testns1" +} + +ipv6_replace_single() +{ + fib_ipv6_replace_single_test "testns1" +} + +ipv6_metric_multipath() +{ + fib_ipv6_metric_multipath_test "testns1" +} + +ipv6_append_multipath() +{ + fib_ipv6_append_multipath_test "testns1" +} + +ipv6_replace_multipath() +{ + fib_ipv6_replace_multipath_test "testns1" +} + +ipv6_append_multipath_to_single() +{ + fib_ipv6_append_multipath_to_single_test "testns1" +} + +ipv6_delete_single() +{ + fib_ipv6_delete_single_test "testns1" +} + +ipv6_delete_multipath() +{ + fib_ipv6_delete_multipath_test "testns1" +} + +ipv6_replay_single() +{ + fib_ipv6_replay_single_test "testns1" "$DEVLINK_DEV" +} + +ipv6_replay_multipath() +{ + fib_ipv6_replay_multipath_test "testns1" "$DEVLINK_DEV" +} + +ipv6_error_path_add_single() +{ + local lsb + + RET=0 + + ip -n testns1 link add name dummy1 type dummy + ip -n testns1 link set dev dummy1 up + + devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10 + devlink -N testns1 dev reload $DEVLINK_DEV + + for lsb in $(seq 1 20); do + ip -n testns1 route add 2001:db8:1::${lsb}/128 dev dummy1 \ + &> /dev/null + done + + log_test "IPv6 error path - add single" + + ip -n testns1 link del dev dummy1 +} + +ipv6_error_path_add_multipath() +{ + local lsb + + RET=0 + + for i in $(seq 1 2); do + ip -n testns1 link add name dummy$i type dummy + ip -n testns1 link set dev dummy$i up + ip -n testns1 address add 2001:db8:$i::1/64 dev dummy$i + done + + devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10 + devlink -N testns1 dev reload $DEVLINK_DEV + + for lsb in $(seq 1 20); do + ip -n testns1 route add 2001:db8:10::${lsb}/128 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 &> /dev/null + done + + log_test "IPv6 error path - add multipath" + + for i in $(seq 1 2); do + ip -n testns1 link del dev dummy$i + done +} + +ipv6_error_path_replay() +{ + local lsb + + RET=0 + + ip -n testns1 link add name dummy1 type dummy + ip -n testns1 link set dev dummy1 up + + devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 100 + devlink -N testns1 dev reload $DEVLINK_DEV + + for lsb in $(seq 1 20); do + ip -n testns1 route add 2001:db8:1::${lsb}/128 dev dummy1 + done + + devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10 + devlink -N testns1 dev reload $DEVLINK_DEV &> /dev/null + + log_test "IPv6 error path - replay" + + ip -n testns1 link del dev dummy1 + + # Successfully reload after deleting all the routes. + devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 100 + devlink -N testns1 dev reload $DEVLINK_DEV +} + +ipv6_error_path() +{ + # Test the different error paths of the notifiers by limiting the size + # of the "IPv6/fib" resource. + ipv6_error_path_add_single + ipv6_error_path_add_multipath + ipv6_error_path_replay +} + +setup_prepare() +{ + local netdev + + modprobe netdevsim &> /dev/null + + echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device + while [ ! -d $SYSFS_NET_DIR ] ; do :; done + + ip netns add testns1 + if [ $? -ne 0 ]; then + echo "Failed to add netns \"testns1\"" + exit 1 + fi + + devlink dev reload $DEVLINK_DEV netns testns1 + if [ $? -ne 0 ]; then + echo "Failed to reload into netns \"testns1\"" + exit 1 + fi +} + +cleanup() +{ + pre_cleanup + ip netns del testns1 + echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device + modprobe -r netdevsim &> /dev/null +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 15cb8ab68c7c..fb5c55dd6df8 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -1165,8 +1165,8 @@ ipv4_tcp_vrf() for a in ${NSA_IP} ${VRF_IP} do log_start - show_hint "Should fail 'No route to host' since client is not bound to VRF" - run_cmd nettest -s -2 ${VRF} & + show_hint "Should fail 'Connection refused' since client is not bound to VRF" + run_cmd nettest -s -d ${VRF} & sleep 1 run_cmd nettest -r ${a} log_test_addr ${a} $? 1 "Global server, local connection" @@ -2663,8 +2663,8 @@ ipv6_tcp_vrf() for a in ${NSA_IP6} ${VRF_IP6} do log_start - show_hint "Fails 'No route to host' since client is not in VRF" - run_cmd nettest -6 -s -2 ${VRF} & + show_hint "Fails 'Connection refused' since client is not in VRF" + run_cmd nettest -6 -s -d ${VRF} & sleep 1 run_cmd nettest -6 -r ${a} log_test_addr ${a} $? 1 "Global server, local connection" diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh new file mode 100644 index 000000000000..66496659bea7 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh @@ -0,0 +1,873 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Various helpers and tests to verify FIB offload. + +__fib_trap_check() +{ + local ns=$1; shift + local family=$1; shift + local route=$1; shift + local should_fail=$1; shift + local ret + + ip -n $ns -j -p -$family route show $route \ + | jq -e '.[]["flags"] | contains(["trap"])' &> /dev/null + ret=$? + if [[ $should_fail == "true" ]]; then + if [[ $ret -ne 0 ]]; then + return 0 + else + return 1 + fi + fi + + return $ret +} + +fib_trap_check() +{ + local ns=$1; shift + local family=$1; shift + local route=$1; shift + local should_fail=$1; shift + + busywait 5000 __fib_trap_check $ns $family "$route" $should_fail +} + +fib4_trap_check() +{ + local ns=$1; shift + local route=$1; shift + local should_fail=$1; shift + + fib_trap_check $ns 4 "$route" $should_fail +} + +fib6_trap_check() +{ + local ns=$1; shift + local route=$1; shift + local should_fail=$1; shift + + fib_trap_check $ns 6 "$route" $should_fail +} + +fib_ipv4_identical_routes_test() +{ + local ns=$1; shift + local i + + RET=0 + + for i in $(seq 1 3); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + done + + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route append 192.0.2.0/24 dev dummy2 tos 0 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy2 tos 0 metric 1024" true + check_err $? "Appended route in hardware when should not" + + ip -n $ns route prepend 192.0.2.0/24 dev dummy3 tos 0 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy3 tos 0 metric 1024" false + check_err $? "Prepended route not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true + check_err $? "Route was not replaced in hardware by prepended one" + + log_test "IPv4 identical routes" + + for i in $(seq 1 3); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv4_tos_test() +{ + local ns=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 2 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 2 metric 1024" false + check_err $? "Highest TOS route not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true + check_err $? "Lowest TOS route still in hardware when should not" + + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1 metric 1024" true + check_err $? "Middle TOS route in hardware when should not" + + log_test "IPv4 routes with TOS" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_metric_test() +{ + local ns=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1022 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1022" false + check_err $? "Lowest metric route not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" true + check_err $? "Highest metric route still in hardware when should not" + + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1023 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1023" true + check_err $? "Middle metric route in hardware when should not" + + log_test "IPv4 routes with metric" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_replace_test() +{ + local ns=$1; shift + local i + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + done + + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false + check_err $? "Replacement route not in hardware when should" + + # Add a route with an higher metric and make sure that replacing it + # does not affect the lower metric one. + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1025 + ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1025 + + fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1025" true + check_err $? "Highest metric route in hardware when should not" + + log_test "IPv4 route replace" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv4_delete_test() +{ + local ns=$1; shift + local metric + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + # Insert multiple routes with the same prefix and length and varying + # metrics. Make sure that throughout delete operations the lowest + # metric route is the one in hardware. + for metric in $(seq 1024 1026); do + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric + done + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1024 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false + check_err $? "Lowest metric route not in hardware when should" + + ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1026 + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false + check_err $? "Sole route not in hardware when should" + + log_test "IPv4 route delete" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_plen_test() +{ + local ns=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + # Add two routes with the same key and different prefix length and + # make sure both are in hardware. It can be verfied that both are + # sharing the same leaf by checking the /proc/net/fib_trie + ip -n $ns route add 192.0.2.0/24 dev dummy1 + ip -n $ns route add 192.0.2.0/25 dev dummy1 + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1" false + check_err $? "/24 not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/25 dev dummy1" false + check_err $? "/25 not in hardware when should" + + log_test "IPv4 routes with different prefix length" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_replay_metric_test() +{ + local ns=$1; shift + local devlink_dev=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024 + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1025 + + devlink -N $ns dev reload $devlink_dev + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" true + check_err $? "Highest metric route in hardware when should not" + + log_test "IPv4 routes replay - metric" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_replay_tos_test() +{ + local ns=$1; shift + local devlink_dev=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 + ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1 + + devlink -N $ns dev reload $devlink_dev + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1" false + check_err $? "Highest TOS route not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0" true + check_err $? "Lowest TOS route in hardware when should not" + + log_test "IPv4 routes replay - TOS" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_replay_plen_test() +{ + local ns=$1; shift + local devlink_dev=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 192.0.2.0/24 dev dummy1 + ip -n $ns route add 192.0.2.0/25 dev dummy1 + + devlink -N $ns dev reload $devlink_dev + + fib4_trap_check $ns "192.0.2.0/24 dev dummy1" false + check_err $? "/24 not in hardware when should" + + fib4_trap_check $ns "192.0.2.0/25 dev dummy1" false + check_err $? "/25 not in hardware when should" + + log_test "IPv4 routes replay - prefix length" + + ip -n $ns link del dev dummy1 +} + +fib_ipv4_flush_test() +{ + local ns=$1; shift + local metric + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + # Exercise the routes flushing code paths by inserting various + # prefix routes on a netdev and then deleting it. + for metric in $(seq 1 20); do + ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric + done + + ip -n $ns link del dev dummy1 + + log_test "IPv4 routes flushing" +} + +fib_ipv6_add_test() +{ + local ns=$1; shift + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + done + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route append 2001:db8:1::/64 dev dummy2 metric 1024 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" true + check_err $? "Route in hardware when should not" + + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware after appending route" + + log_test "IPv6 single route add" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_metric_test() +{ + local ns=$1; shift + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1022 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1022" false + check_err $? "Lowest metric route not in hardware when should" + + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" true + check_err $? "Highest metric route still in hardware when should not" + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1023 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1023" true + check_err $? "Middle metric route in hardware when should not" + + log_test "IPv6 routes with metric" + + ip -n $ns link del dev dummy1 +} + +fib_ipv6_append_single_test() +{ + local ns=$1; shift + + # When an IPv6 multipath route is added without the 'nexthop' keyword, + # different code paths are taken compared to when the keyword is used. + # This test tries to verify the former. + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1024 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1024 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware after appending" + + ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1025 + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Route in hardware when should not" + + ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1025 + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Route in hardware when should not after appending" + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + + log_test "IPv6 append single route without 'nexthop' keyword" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_replace_single_test() +{ + local ns=$1; shift + local i + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + done + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1024 + fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false + check_err $? "Replacement route not in hardware when should" + + # Add a route with an higher metric and make sure that replacing it + # does not affect the lower metric one. + ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1025 + ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1025 + + fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1025" true + check_err $? "Highest metric route in hardware when should not" + + log_test "IPv6 single route replace" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_metric_multipath_test() +{ + local ns=$1; shift + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route add 2001:db8:10::/64 metric 1022 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + fib6_trap_check $ns "2001:db8:10::/64 metric 1022" false + check_err $? "Lowest metric route not in hardware when should" + + ip -n $ns route add 2001:db8:10::/64 metric 1023 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" true + check_err $? "Highest metric route still in hardware when should not" + + fib6_trap_check $ns "2001:db8:10::/64 metric 1023" true + check_err $? "Middle metric route in hardware when should not" + + log_test "IPv6 multipath routes with metric" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_append_multipath_test() +{ + local ns=$1; shift + + RET=0 + + for i in $(seq 1 3); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route append 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:2::2 dev dummy2 \ + nexthop via 2001:db8:3::2 dev dummy3 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware after appending" + + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Route in hardware when should not" + + ip -n $ns route append 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:2::2 dev dummy2 \ + nexthop via 2001:db8:3::2 dev dummy3 + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Route in hardware when should not after appending" + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + + log_test "IPv6 append multipath route with 'nexthop' keyword" + + for i in $(seq 1 3); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_replace_multipath_test() +{ + local ns=$1; shift + local i + + RET=0 + + for i in $(seq 1 3); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route replace 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:3::2 dev dummy3 + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Replacement route not in hardware when should" + + # Add a route with an higher metric and make sure that replacing it + # does not affect the lower metric one. + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route replace 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:3::2 dev dummy3 + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Lowest metric route not in hardware when should" + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Highest metric route in hardware when should not" + + log_test "IPv6 multipath route replace" + + for i in $(seq 1 3); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_append_multipath_to_single_test() +{ + local ns=$1; shift + + # Test that when the first route in the leaf is not a multipath route + # and we try to append a multipath route with the same metric to it, it + # is not notified. + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024 + fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware when should" + + ip -n $ns route append 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:2::2 dev dummy2 + fib6_trap_check $ns "2001:db8:10::/64 dev dummy2 metric 1024" true + check_err $? "Route in hardware when should not" + + fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware after append" + + log_test "IPv6 append multipath route to non-multipath route" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_delete_single_test() +{ + local ns=$1; shift + + # Test various deletion scenarios, where only a single route is + # deleted from the FIB node. + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + # Test deletion of a single route when it is the only route in the FIB + # node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024 + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024 + + log_test "IPv6 delete sole single route" + + # Test that deletion of last route does not affect the first one. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024 + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025 + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025 + + fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false + check_err $? "Route not in hardware after deleting higher metric route" + + log_test "IPv6 delete single route not in hardware" + + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024 + + # Test that first route is replaced by next single route in the FIB + # node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024 + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025 + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024 + + fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1025" false + check_err $? "Route not in hardware after deleting lowest metric route" + + log_test "IPv6 delete single route - replaced by single" + + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025 + + # Test that first route is replaced by next multipath route in the FIB + # node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024 + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024 + + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" false + check_err $? "Route not in hardware after deleting lowest metric route" + + log_test "IPv6 delete single route - replaced by multipath" + + ip -n $ns route del 2001:db8:10::/64 metric 1025 + + # Test deletion of a single nexthop from a multipath route. + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route del 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware after deleting a single nexthop" + + log_test "IPv6 delete single nexthop" + + ip -n $ns route del 2001:db8:10::/64 metric 1024 + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_delete_multipath_test() +{ + local ns=$1; shift + + # Test various deletion scenarios, where an entire multipath route is + # deleted from the FIB node. + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + # Test deletion of a multipath route when it is the only route in the + # FIB node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route del 2001:db8:10::/64 metric 1024 + + log_test "IPv6 delete sole multipath route" + + # Test that deletion of last route does not affect the first one. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route del 2001:db8:10::/64 metric 1025 + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "Route not in hardware after deleting higher metric route" + + log_test "IPv6 delete multipath route not in hardware" + + ip -n $ns route del 2001:db8:10::/64 metric 1024 + + # Test that first route is replaced by next single route in the FIB + # node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025 + ip -n $ns route del 2001:db8:10::/64 metric 1024 + + fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1025" false + check_err $? "Route not in hardware after deleting lowest metric route" + + log_test "IPv6 delete multipath route - replaced by single" + + ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025 + + # Test that first route is replaced by next multipath route in the FIB + # node. + RET=0 + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route del 2001:db8:10::/64 metric 1024 + + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" false + check_err $? "Route not in hardware after deleting lowest metric route" + + log_test "IPv6 delete multipath route - replaced by multipath" + + ip -n $ns route del 2001:db8:10::/64 metric 1025 + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_replay_single_test() +{ + local ns=$1; shift + local devlink_dev=$1; shift + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + done + + ip -n $ns route add 2001:db8:1::/64 dev dummy1 + ip -n $ns route append 2001:db8:1::/64 dev dummy2 + + devlink -N $ns dev reload $devlink_dev + + fib6_trap_check $ns "2001:db8:1::/64 dev dummy1" false + check_err $? "First route not in hardware when should" + + fib6_trap_check $ns "2001:db8:1::/64 dev dummy2" true + check_err $? "Second route in hardware when should not" + + log_test "IPv6 routes replay - single route" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} + +fib_ipv6_replay_multipath_test() +{ + local ns=$1; shift + local devlink_dev=$1; shift + + RET=0 + + for i in $(seq 1 2); do + ip -n $ns link add name dummy$i type dummy + ip -n $ns link set dev dummy$i up + ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i + done + + ip -n $ns route add 2001:db8:10::/64 metric 1024 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + ip -n $ns route add 2001:db8:10::/64 metric 1025 \ + nexthop via 2001:db8:1::2 dev dummy1 \ + nexthop via 2001:db8:2::2 dev dummy2 + + devlink -N $ns dev reload $devlink_dev + + fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false + check_err $? "First route not in hardware when should" + + fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true + check_err $? "Second route in hardware when should not" + + log_test "IPv6 routes replay - multipath route" + + for i in $(seq 1 2); do + ip -n $ns link del dev dummy$i + done +} diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 8dc5fac98cbc..2f5da414aaa7 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -248,6 +248,24 @@ busywait() done } +until_counter_is() +{ + local value=$1; shift + local current=$("$@") + + echo $((current)) + ((current >= value)) +} + +busywait_for_counter() +{ + local timeout=$1; shift + local delta=$1; shift + + local base=$("$@") + busywait "$timeout" until_counter_is $((base + delta)) "$@" +} + setup_wait_dev() { local dev=$1; shift @@ -575,9 +593,10 @@ tc_rule_stats_get() local dev=$1; shift local pref=$1; shift local dir=$1; shift + local selector=${1:-.packets}; shift tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \ - | jq '.[1].options.actions[].stats.packets' + | jq ".[1].options.actions[].stats$selector" } ethtool_stats_get() @@ -588,6 +607,30 @@ ethtool_stats_get() ethtool -S $dev | grep "^ *$stat:" | head -n 1 | cut -d: -f2 } +humanize() +{ + local speed=$1; shift + + for unit in bps Kbps Mbps Gbps; do + if (($(echo "$speed < 1024" | bc))); then + break + fi + + speed=$(echo "scale=1; $speed / 1024" | bc) + done + + echo "$speed${unit}" +} + +rate() +{ + local t0=$1; shift + local t1=$1; shift + local interval=$1; shift + + echo $((8 * (t1 - t0) / interval)) +} + mac_get() { local if_name=$1 diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/net/forwarding/loopback.sh index 6e4626ae71b0..8f4057310b5b 100755 --- a/tools/testing/selftests/net/forwarding/loopback.sh +++ b/tools/testing/selftests/net/forwarding/loopback.sh @@ -1,6 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ALL_TESTS="loopback_test" NUM_NETIFS=2 source tc_common.sh @@ -72,6 +75,11 @@ setup_prepare() h1_create h2_create + + if ethtool -k $h1 | grep loopback | grep -q fixed; then + log_test "SKIP: dev $h1 does not support loopback feature" + exit $ksft_skip + fi } cleanup() diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh index a75cb51cc5bd..057f91b05098 100755 --- a/tools/testing/selftests/net/forwarding/router.sh +++ b/tools/testing/selftests/net/forwarding/router.sh @@ -1,9 +1,23 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6" +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + sip_in_class_e + mc_mac_mismatch + ipv4_sip_equal_dip + ipv6_sip_equal_dip + ipv4_dip_link_local +" + NUM_NETIFS=4 source lib.sh +source tc_common.sh + +require_command $MCD +require_command $MC_CLI +table_name=selftests h1_create() { @@ -64,6 +78,8 @@ router_create() ip link set dev $rp1 up ip link set dev $rp2 up + tc qdisc add dev $rp2 clsact + ip address add 192.0.2.1/24 dev $rp1 ip address add 2001:db8:1::1/64 dev $rp1 @@ -79,10 +95,31 @@ router_destroy() ip address del 2001:db8:1::1/64 dev $rp1 ip address del 192.0.2.1/24 dev $rp1 + tc qdisc del dev $rp2 clsact + ip link set dev $rp2 down ip link set dev $rp1 down } +start_mcd() +{ + SMCROUTEDIR="$(mktemp -d)" + + for ((i = 1; i <= $NUM_NETIFS; ++i)); do + echo "phyint ${NETIFS[p$i]} enable" >> \ + $SMCROUTEDIR/$table_name.conf + done + + $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \ + -P $SMCROUTEDIR/$table_name.pid +} + +kill_mcd() +{ + pkill $MCD + rm -rf $SMCROUTEDIR +} + setup_prepare() { h1=${NETIFS[p1]} @@ -91,6 +128,10 @@ setup_prepare() rp2=${NETIFS[p3]} h2=${NETIFS[p4]} + rp1mac=$(mac_get $rp1) + + start_mcd + vrf_prepare h1_create @@ -113,6 +154,8 @@ cleanup() h1_destroy vrf_cleanup + + kill_mcd } ping_ipv4() @@ -125,6 +168,150 @@ ping_ipv6() ping6_test $h1 2001:db8:2::2 } +sip_in_class_e() +{ + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.$rp1.rp_filter 0 + + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \ + flower src_ip 240.0.0.1 ip_proto udp action pass + + $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \ + -A 240.0.0.1 -b $rp1mac -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "Source IP in class E" + + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf.$rp1.rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + +create_mcast_sg() +{ + local if_name=$1; shift + local s_addr=$1; shift + local mcast=$1; shift + local dest_ifs=${@} + + $MC_CLI -I $table_name add $if_name $s_addr $mcast $dest_ifs +} + +delete_mcast_sg() +{ + local if_name=$1; shift + local s_addr=$1; shift + local mcast=$1; shift + local dest_ifs=${@} + + $MC_CLI -I $table_name remove $if_name $s_addr $mcast $dest_ifs +} + +__mc_mac_mismatch() +{ + local desc=$1; shift + local proto=$1; shift + local sip=$1; shift + local dip=$1; shift + local flags=${1:-""}; shift + local dmac=01:02:03:04:05:06 + + RET=0 + + tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \ + flower dst_ip $dip action pass + + create_mcast_sg $rp1 $sip $dip $rp2 + + $MZ $flags $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $dmac \ + -B $dip -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "Multicast MAC mismatch: $desc" + + delete_mcast_sg $rp1 $sip $dip $rp2 + tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower +} + +mc_mac_mismatch() +{ + __mc_mac_mismatch "IPv4" "ip" 192.0.2.2 225.1.2.3 + __mc_mac_mismatch "IPv6" "ipv6" 2001:db8:1::2 ff0e::3 "-6" +} + +ipv4_sip_equal_dip() +{ + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.$rp1.rp_filter 0 + + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \ + flower src_ip 198.51.100.2 action pass + + $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \ + -A 198.51.100.2 -b $rp1mac -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "Source IP is equal to destination IP: IPv4" + + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf.$rp1.rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + +ipv6_sip_equal_dip() +{ + RET=0 + + tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \ + flower src_ip 2001:db8:2::2 action pass + + $MZ -6 $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \ + -A 2001:db8:2::2 -b $rp1mac -B 2001:db8:2::2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "Source IP is equal to destination IP: IPv6" + + tc filter del dev $rp2 egress protocol ipv6 pref 1 handle 101 flower +} + +ipv4_dip_link_local() +{ + local dip=169.254.1.1 + + RET=0 + + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \ + flower dst_ip $dip action pass + + ip neigh add 169.254.1.1 lladdr 00:11:22:33:44:55 dev $rp2 + ip route add 169.254.1.0/24 dev $rp2 + + $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $rp1mac -B $dip -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "IPv4 destination IP is link-local" + + ip route del 169.254.1.0/24 dev $rp2 + ip neigh del 169.254.1.1 lladdr 00:11:22:33:44:55 dev $rp2 + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh new file mode 100644 index 000000000000..d1f26cb7cd73 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -0,0 +1,233 @@ +# SPDX-License-Identifier: GPL-2.0 + +# This test sends a stream of traffic from H1 through a switch, to H2. On the +# egress port from the switch ($swp2), a shaper is installed. The test verifies +# that the rates on the port match the configured shaper. +# +# In order to test per-class shaping, $swp2 actually contains TBF under PRIO or +# ETS, with two different configurations. Traffic is prioritized using 802.1p. +# +# +-------------------------------------------+ +# | H1 | +# | + $h1.10 $h1.11 + | +# | | 192.0.2.1/28 192.0.2.17/28 | | +# | | | | +# | \______________ _____________/ | +# | \ / | +# | + $h1 | +# +---------------------|---------------------+ +# | +# +---------------------|---------------------+ +# | SW + $swp1 | +# | _______________/ \_______________ | +# | / \ | +# | +-|--------------+ +--------------|-+ | +# | | + $swp1.10 | | $swp1.11 + | | +# | | | | | | +# | | BR10 | | BR11 | | +# | | | | | | +# | | + $swp2.10 | | $swp2.11 + | | +# | +-|--------------+ +--------------|-+ | +# | \_______________ ______________/ | +# | \ / | +# | + $swp2 | +# +---------------------|---------------------+ +# | +# +---------------------|---------------------+ +# | H2 + $h2 | +# | ______________/ \______________ | +# | / \ | +# | | | | +# | + $h2.10 $h2.11 + | +# | 192.0.2.2/28 192.0.2.18/28 | +# +-------------------------------------------+ + +NUM_NETIFS=4 +CHECK_TC="yes" +source $lib_dir/lib.sh + +ipaddr() +{ + local host=$1; shift + local vlan=$1; shift + + echo 192.0.2.$((16 * (vlan - 10) + host)) +} + +host_create() +{ + local dev=$1; shift + local host=$1; shift + + simple_if_init $dev + mtu_set $dev 10000 + + vlan_create $dev 10 v$dev $(ipaddr $host 10)/28 + ip link set dev $dev.10 type vlan egress 0:0 + + vlan_create $dev 11 v$dev $(ipaddr $host 11)/28 + ip link set dev $dev.11 type vlan egress 0:1 +} + +host_destroy() +{ + local dev=$1; shift + + vlan_destroy $dev 11 + vlan_destroy $dev 10 + mtu_restore $dev + simple_if_fini $dev +} + +h1_create() +{ + host_create $h1 1 +} + +h1_destroy() +{ + host_destroy $h1 +} + +h2_create() +{ + host_create $h2 2 + + tc qdisc add dev $h2 clsact + tc filter add dev $h2 ingress pref 1010 prot 802.1q \ + flower $TCFLAGS vlan_id 10 action pass + tc filter add dev $h2 ingress pref 1011 prot 802.1q \ + flower $TCFLAGS vlan_id 11 action pass +} + +h2_destroy() +{ + tc qdisc del dev $h2 clsact + host_destroy $h2 +} + +switch_create() +{ + local intf + local vlan + + ip link add dev br10 type bridge + ip link add dev br11 type bridge + + for intf in $swp1 $swp2; do + ip link set dev $intf up + mtu_set $intf 10000 + + for vlan in 10 11; do + vlan_create $intf $vlan + ip link set dev $intf.$vlan master br$vlan + ip link set dev $intf.$vlan up + done + done + + for vlan in 10 11; do + ip link set dev $swp1.$vlan type vlan ingress 0:0 1:1 + done + + ip link set dev br10 up + ip link set dev br11 up +} + +switch_destroy() +{ + local intf + local vlan + + # A test may have been interrupted mid-run, with Qdisc installed. Delete + # it here. + tc qdisc del dev $swp2 root 2>/dev/null + + ip link set dev br11 down + ip link set dev br10 down + + for intf in $swp2 $swp1; do + for vlan in 11 10; do + ip link set dev $intf.$vlan down + ip link set dev $intf.$vlan nomaster + vlan_destroy $intf $vlan + done + + mtu_restore $intf + ip link set dev $intf down + done + + ip link del dev br11 + ip link del dev br10 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + swp4=${NETIFS[p7]} + swp5=${NETIFS[p8]} + + h2_mac=$(mac_get $h2) + + vrf_prepare + + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.10 $(ipaddr 2 10) " vlan 10" + ping_test $h1.11 $(ipaddr 2 11) " vlan 11" +} + +tbf_get_counter() +{ + local vlan=$1; shift + + tc_rule_stats_get $h2 10$vlan ingress .bytes +} + +do_tbf_test() +{ + local vlan=$1; shift + local mbit=$1; shift + + start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac + sleep 5 # Wait for the burst to dwindle + + local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan) + sleep 10 + local t3=$(tbf_get_counter $vlan) + stop_traffic + + RET=0 + + # Note: TBF uses 10^6 Mbits, not 2^20 ones. + local er=$((mbit * 1000 * 1000)) + local nr=$(rate $t2 $t3 10) + local nr_pct=$((100 * (nr - er) / er)) + ((-5 <= nr_pct && nr_pct <= 5)) + check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." + + log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" +} diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh new file mode 100755 index 000000000000..84fb6cab88e4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +QDISC="ets strict" +: ${lib_dir:=.} +source $lib_dir/sch_tbf_etsprio.sh diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh new file mode 100644 index 000000000000..8bd85da1905a --- /dev/null +++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + tbf_test +" +source $lib_dir/sch_tbf_core.sh + +tbf_test_one() +{ + local bs=$1; shift + + tc qdisc replace dev $swp2 parent 10:3 handle 103: tbf \ + rate 400Mbit burst $bs limit 1M + tc qdisc replace dev $swp2 parent 10:2 handle 102: tbf \ + rate 800Mbit burst $bs limit 1M + + do_tbf_test 10 400 $bs + do_tbf_test 11 800 $bs +} + +tbf_test() +{ + # This test is used for both ETS and PRIO. Even though we only need two + # bands, PRIO demands a minimum of three. + tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0 + tbf_test_one 128K + tc qdisc del dev $swp2 root +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh new file mode 100755 index 000000000000..9c8cb1cb9ba4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +QDISC="prio bands" +: ${lib_dir:=.} +source $lib_dir/sch_tbf_etsprio.sh diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh new file mode 100755 index 000000000000..72aa21ba88c7 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + tbf_test +" +: ${lib_dir:=.} +source $lib_dir/sch_tbf_core.sh + +tbf_test_one() +{ + local bs=$1; shift + + tc qdisc replace dev $swp2 root handle 108: tbf \ + rate 400Mbit burst $bs limit 1M + do_tbf_test 10 400 $bs +} + +tbf_test() +{ + tbf_test_one 128K + tc qdisc del dev $swp2 root +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore new file mode 100644 index 000000000000..d72f07642738 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/.gitignore @@ -0,0 +1,2 @@ +mptcp_connect +*.pcap diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile new file mode 100644 index 000000000000..93de52016dde --- /dev/null +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +top_srcdir = ../../../../.. + +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g + +TEST_PROGS := mptcp_connect.sh + +TEST_GEN_FILES = mptcp_connect + +EXTRA_CLEAN := *.pcap + +include ../../lib.mk diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config new file mode 100644 index 000000000000..2499824d9e1c --- /dev/null +++ b/tools/testing/selftests/net/mptcp/config @@ -0,0 +1,4 @@ +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y +CONFIG_VETH=y +CONFIG_NET_SCH_NETEM=m diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c new file mode 100644 index 000000000000..a3dccd816ae4 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -0,0 +1,832 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include <errno.h> +#include <limits.h> +#include <fcntl.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/poll.h> +#include <sys/sendfile.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/mman.h> + +#include <netdb.h> +#include <netinet/in.h> + +#include <linux/tcp.h> + +extern int optind; + +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif +#ifndef TCP_ULP +#define TCP_ULP 31 +#endif + +static bool listen_mode; +static int poll_timeout; + +enum cfg_mode { + CFG_MODE_POLL, + CFG_MODE_MMAP, + CFG_MODE_SENDFILE, +}; + +static enum cfg_mode cfg_mode = CFG_MODE_POLL; +static const char *cfg_host; +static const char *cfg_port = "12000"; +static int cfg_sock_proto = IPPROTO_MPTCP; +static bool tcpulp_audit; +static int pf = AF_INET; +static int cfg_sndbuf; + +static void die_usage(void) +{ + fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] -m mode]" + "[ -l ] [ -t timeout ] connect_address\n"); + exit(1); +} + +static const char *getxinfo_strerr(int err) +{ + if (err == EAI_SYSTEM) + return strerror(errno); + + return gai_strerror(err); +} + +static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen, + char *host, socklen_t hostlen, + char *serv, socklen_t servlen) +{ + int flags = NI_NUMERICHOST | NI_NUMERICSERV; + int err = getnameinfo(addr, addrlen, host, hostlen, serv, servlen, + flags); + + if (err) { + const char *errstr = getxinfo_strerr(err); + + fprintf(stderr, "Fatal: getnameinfo: %s\n", errstr); + exit(1); + } +} + +static void xgetaddrinfo(const char *node, const char *service, + const struct addrinfo *hints, + struct addrinfo **res) +{ + int err = getaddrinfo(node, service, hints, res); + + if (err) { + const char *errstr = getxinfo_strerr(err); + + fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n", + node ? node : "", service ? service : "", errstr); + exit(1); + } +} + +static void set_sndbuf(int fd, unsigned int size) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)); + if (err) { + perror("set SO_SNDBUF"); + exit(1); + } +} + +static int sock_listen_mptcp(const char * const listenaddr, + const char * const port) +{ + int sock; + struct addrinfo hints = { + .ai_protocol = IPPROTO_TCP, + .ai_socktype = SOCK_STREAM, + .ai_flags = AI_PASSIVE | AI_NUMERICHOST + }; + + hints.ai_family = pf; + + struct addrinfo *a, *addr; + int one = 1; + + xgetaddrinfo(listenaddr, port, &hints, &addr); + hints.ai_family = pf; + + for (a = addr; a; a = a->ai_next) { + sock = socket(a->ai_family, a->ai_socktype, cfg_sock_proto); + if (sock < 0) + continue; + + if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, + sizeof(one))) + perror("setsockopt"); + + if (bind(sock, a->ai_addr, a->ai_addrlen) == 0) + break; /* success */ + + perror("bind"); + close(sock); + sock = -1; + } + + freeaddrinfo(addr); + + if (sock < 0) { + fprintf(stderr, "Could not create listen socket\n"); + return sock; + } + + if (listen(sock, 20)) { + perror("listen"); + close(sock); + return -1; + } + + return sock; +} + +static bool sock_test_tcpulp(const char * const remoteaddr, + const char * const port) +{ + struct addrinfo hints = { + .ai_protocol = IPPROTO_TCP, + .ai_socktype = SOCK_STREAM, + }; + struct addrinfo *a, *addr; + int sock = -1, ret = 0; + bool test_pass = false; + + hints.ai_family = AF_INET; + + xgetaddrinfo(remoteaddr, port, &hints, &addr); + for (a = addr; a; a = a->ai_next) { + sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP); + if (sock < 0) { + perror("socket"); + continue; + } + ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp", + sizeof("mptcp")); + if (ret == -1 && errno == EOPNOTSUPP) + test_pass = true; + close(sock); + + if (test_pass) + break; + if (!ret) + fprintf(stderr, + "setsockopt(TCP_ULP) returned 0\n"); + else + perror("setsockopt(TCP_ULP)"); + } + return test_pass; +} + +static int sock_connect_mptcp(const char * const remoteaddr, + const char * const port, int proto) +{ + struct addrinfo hints = { + .ai_protocol = IPPROTO_TCP, + .ai_socktype = SOCK_STREAM, + }; + struct addrinfo *a, *addr; + int sock = -1; + + hints.ai_family = pf; + + xgetaddrinfo(remoteaddr, port, &hints, &addr); + for (a = addr; a; a = a->ai_next) { + sock = socket(a->ai_family, a->ai_socktype, proto); + if (sock < 0) { + perror("socket"); + continue; + } + + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) + break; /* success */ + + perror("connect()"); + close(sock); + sock = -1; + } + + freeaddrinfo(addr); + return sock; +} + +static size_t do_rnd_write(const int fd, char *buf, const size_t len) +{ + unsigned int do_w; + ssize_t bw; + + do_w = rand() & 0xffff; + if (do_w == 0 || do_w > len) + do_w = len; + + bw = write(fd, buf, do_w); + if (bw < 0) + perror("write"); + + return bw; +} + +static size_t do_write(const int fd, char *buf, const size_t len) +{ + size_t offset = 0; + + while (offset < len) { + size_t written; + ssize_t bw; + + bw = write(fd, buf + offset, len - offset); + if (bw < 0) { + perror("write"); + return 0; + } + + written = (size_t)bw; + offset += written; + } + + return offset; +} + +static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) +{ + size_t cap = rand(); + + cap &= 0xffff; + + if (cap == 0) + cap = 1; + else if (cap > len) + cap = len; + + return read(fd, buf, cap); +} + +static void set_nonblock(int fd) +{ + int flags = fcntl(fd, F_GETFL); + + if (flags == -1) + return; + + fcntl(fd, F_SETFL, flags | O_NONBLOCK); +} + +static int copyfd_io_poll(int infd, int peerfd, int outfd) +{ + struct pollfd fds = { + .fd = peerfd, + .events = POLLIN | POLLOUT, + }; + unsigned int woff = 0, wlen = 0; + char wbuf[8192]; + + set_nonblock(peerfd); + + for (;;) { + char rbuf[8192]; + ssize_t len; + + if (fds.events == 0) + break; + + switch (poll(&fds, 1, poll_timeout)) { + case -1: + if (errno == EINTR) + continue; + perror("poll"); + return 1; + case 0: + fprintf(stderr, "%s: poll timed out (events: " + "POLLIN %u, POLLOUT %u)\n", __func__, + fds.events & POLLIN, fds.events & POLLOUT); + return 2; + } + + if (fds.revents & POLLIN) { + len = do_rnd_read(peerfd, rbuf, sizeof(rbuf)); + if (len == 0) { + /* no more data to receive: + * peer has closed its write side + */ + fds.events &= ~POLLIN; + + if ((fds.events & POLLOUT) == 0) + /* and nothing more to send */ + break; + + /* Else, still have data to transmit */ + } else if (len < 0) { + perror("read"); + return 3; + } + + do_write(outfd, rbuf, len); + } + + if (fds.revents & POLLOUT) { + if (wlen == 0) { + woff = 0; + wlen = read(infd, wbuf, sizeof(wbuf)); + } + + if (wlen > 0) { + ssize_t bw; + + bw = do_rnd_write(peerfd, wbuf + woff, wlen); + if (bw < 0) + return 111; + + woff += bw; + wlen -= bw; + } else if (wlen == 0) { + /* We have no more data to send. */ + fds.events &= ~POLLOUT; + + if ((fds.events & POLLIN) == 0) + /* ... and peer also closed already */ + break; + + /* ... but we still receive. + * Close our write side. + */ + shutdown(peerfd, SHUT_WR); + } else { + if (errno == EINTR) + continue; + perror("read"); + return 4; + } + } + + if (fds.revents & (POLLERR | POLLNVAL)) { + fprintf(stderr, "Unexpected revents: " + "POLLERR/POLLNVAL(%x)\n", fds.revents); + return 5; + } + } + + close(peerfd); + return 0; +} + +static int do_recvfile(int infd, int outfd) +{ + ssize_t r; + + do { + char buf[16384]; + + r = do_rnd_read(infd, buf, sizeof(buf)); + if (r > 0) { + if (write(outfd, buf, r) != r) + break; + } else if (r < 0) { + perror("read"); + } + } while (r > 0); + + return (int)r; +} + +static int do_mmap(int infd, int outfd, unsigned int size) +{ + char *inbuf = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0); + ssize_t ret = 0, off = 0; + size_t rem; + + if (inbuf == MAP_FAILED) { + perror("mmap"); + return 1; + } + + rem = size; + + while (rem > 0) { + ret = write(outfd, inbuf + off, rem); + + if (ret < 0) { + perror("write"); + break; + } + + off += ret; + rem -= ret; + } + + munmap(inbuf, size); + return rem; +} + +static int get_infd_size(int fd) +{ + struct stat sb; + ssize_t count; + int err; + + err = fstat(fd, &sb); + if (err < 0) { + perror("fstat"); + return -1; + } + + if ((sb.st_mode & S_IFMT) != S_IFREG) { + fprintf(stderr, "%s: stdin is not a regular file\n", __func__); + return -2; + } + + count = sb.st_size; + if (count > INT_MAX) { + fprintf(stderr, "File too large: %zu\n", count); + return -3; + } + + return (int)count; +} + +static int do_sendfile(int infd, int outfd, unsigned int count) +{ + while (count > 0) { + ssize_t r; + + r = sendfile(outfd, infd, NULL, count); + if (r < 0) { + perror("sendfile"); + return 3; + } + + count -= r; + } + + return 0; +} + +static int copyfd_io_mmap(int infd, int peerfd, int outfd, + unsigned int size) +{ + int err; + + if (listen_mode) { + err = do_recvfile(peerfd, outfd); + if (err) + return err; + + err = do_mmap(infd, peerfd, size); + } else { + err = do_mmap(infd, peerfd, size); + if (err) + return err; + + shutdown(peerfd, SHUT_WR); + + err = do_recvfile(peerfd, outfd); + } + + return err; +} + +static int copyfd_io_sendfile(int infd, int peerfd, int outfd, + unsigned int size) +{ + int err; + + if (listen_mode) { + err = do_recvfile(peerfd, outfd); + if (err) + return err; + + err = do_sendfile(infd, peerfd, size); + } else { + err = do_sendfile(infd, peerfd, size); + if (err) + return err; + err = do_recvfile(peerfd, outfd); + } + + return err; +} + +static int copyfd_io(int infd, int peerfd, int outfd) +{ + int file_size; + + switch (cfg_mode) { + case CFG_MODE_POLL: + return copyfd_io_poll(infd, peerfd, outfd); + case CFG_MODE_MMAP: + file_size = get_infd_size(infd); + if (file_size < 0) + return file_size; + return copyfd_io_mmap(infd, peerfd, outfd, file_size); + case CFG_MODE_SENDFILE: + file_size = get_infd_size(infd); + if (file_size < 0) + return file_size; + return copyfd_io_sendfile(infd, peerfd, outfd, file_size); + } + + fprintf(stderr, "Invalid mode %d\n", cfg_mode); + + die_usage(); + return 1; +} + +static void check_sockaddr(int pf, struct sockaddr_storage *ss, + socklen_t salen) +{ + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + socklen_t wanted_size = 0; + + switch (pf) { + case AF_INET: + wanted_size = sizeof(*sin); + sin = (void *)ss; + if (!sin->sin_port) + fprintf(stderr, "accept: something wrong: ip connection from port 0"); + break; + case AF_INET6: + wanted_size = sizeof(*sin6); + sin6 = (void *)ss; + if (!sin6->sin6_port) + fprintf(stderr, "accept: something wrong: ipv6 connection from port 0"); + break; + default: + fprintf(stderr, "accept: Unknown pf %d, salen %u\n", pf, salen); + return; + } + + if (salen != wanted_size) + fprintf(stderr, "accept: size mismatch, got %d expected %d\n", + (int)salen, wanted_size); + + if (ss->ss_family != pf) + fprintf(stderr, "accept: pf mismatch, expect %d, ss_family is %d\n", + (int)ss->ss_family, pf); +} + +static void check_getpeername(int fd, struct sockaddr_storage *ss, socklen_t salen) +{ + struct sockaddr_storage peerss; + socklen_t peersalen = sizeof(peerss); + + if (getpeername(fd, (struct sockaddr *)&peerss, &peersalen) < 0) { + perror("getpeername"); + return; + } + + if (peersalen != salen) { + fprintf(stderr, "%s: %d vs %d\n", __func__, peersalen, salen); + return; + } + + if (memcmp(ss, &peerss, peersalen)) { + char a[INET6_ADDRSTRLEN]; + char b[INET6_ADDRSTRLEN]; + char c[INET6_ADDRSTRLEN]; + char d[INET6_ADDRSTRLEN]; + + xgetnameinfo((struct sockaddr *)ss, salen, + a, sizeof(a), b, sizeof(b)); + + xgetnameinfo((struct sockaddr *)&peerss, peersalen, + c, sizeof(c), d, sizeof(d)); + + fprintf(stderr, "%s: memcmp failure: accept %s vs peername %s, %s vs %s salen %d vs %d\n", + __func__, a, c, b, d, peersalen, salen); + } +} + +static void check_getpeername_connect(int fd) +{ + struct sockaddr_storage ss; + socklen_t salen = sizeof(ss); + char a[INET6_ADDRSTRLEN]; + char b[INET6_ADDRSTRLEN]; + + if (getpeername(fd, (struct sockaddr *)&ss, &salen) < 0) { + perror("getpeername"); + return; + } + + xgetnameinfo((struct sockaddr *)&ss, salen, + a, sizeof(a), b, sizeof(b)); + + if (strcmp(cfg_host, a) || strcmp(cfg_port, b)) + fprintf(stderr, "%s: %s vs %s, %s vs %s\n", __func__, + cfg_host, a, cfg_port, b); +} + +int main_loop_s(int listensock) +{ + struct sockaddr_storage ss; + struct pollfd polls; + socklen_t salen; + int remotesock; + + polls.fd = listensock; + polls.events = POLLIN; + + switch (poll(&polls, 1, poll_timeout)) { + case -1: + perror("poll"); + return 1; + case 0: + fprintf(stderr, "%s: timed out\n", __func__); + close(listensock); + return 2; + } + + salen = sizeof(ss); + remotesock = accept(listensock, (struct sockaddr *)&ss, &salen); + if (remotesock >= 0) { + check_sockaddr(pf, &ss, salen); + check_getpeername(remotesock, &ss, salen); + + return copyfd_io(0, remotesock, 1); + } + + perror("accept"); + + return 1; +} + +static void init_rng(void) +{ + int fd = open("/dev/urandom", O_RDONLY); + unsigned int foo; + + if (fd > 0) { + int ret = read(fd, &foo, sizeof(foo)); + + if (ret < 0) + srand(fd + foo); + close(fd); + } + + srand(foo); +} + +int main_loop(void) +{ + int fd; + + /* listener is ready. */ + fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto); + if (fd < 0) + return 2; + + check_getpeername_connect(fd); + + if (cfg_sndbuf) + set_sndbuf(fd, cfg_sndbuf); + + return copyfd_io(0, fd, 1); +} + +int parse_proto(const char *proto) +{ + if (!strcasecmp(proto, "MPTCP")) + return IPPROTO_MPTCP; + if (!strcasecmp(proto, "TCP")) + return IPPROTO_TCP; + + fprintf(stderr, "Unknown protocol: %s\n.", proto); + die_usage(); + + /* silence compiler warning */ + return 0; +} + +int parse_mode(const char *mode) +{ + if (!strcasecmp(mode, "poll")) + return CFG_MODE_POLL; + if (!strcasecmp(mode, "mmap")) + return CFG_MODE_MMAP; + if (!strcasecmp(mode, "sendfile")) + return CFG_MODE_SENDFILE; + + fprintf(stderr, "Unknown test mode: %s\n", mode); + fprintf(stderr, "Supported modes are:\n"); + fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n"); + fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n"); + fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n"); + + die_usage(); + + /* silence compiler warning */ + return 0; +} + +int parse_sndbuf(const char *size) +{ + unsigned long s; + + errno = 0; + + s = strtoul(size, NULL, 0); + + if (errno) { + fprintf(stderr, "Invalid sndbuf size %s (%s)\n", + size, strerror(errno)); + die_usage(); + } + + if (s > INT_MAX) { + fprintf(stderr, "Invalid sndbuf size %s (%s)\n", + size, strerror(ERANGE)); + die_usage(); + } + + cfg_sndbuf = s; + + return 0; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "6lp:s:hut:m:b:")) != -1) { + switch (c) { + case 'l': + listen_mode = true; + break; + case 'p': + cfg_port = optarg; + break; + case 's': + cfg_sock_proto = parse_proto(optarg); + break; + case 'h': + die_usage(); + break; + case 'u': + tcpulp_audit = true; + break; + case '6': + pf = AF_INET6; + break; + case 't': + poll_timeout = atoi(optarg) * 1000; + if (poll_timeout <= 0) + poll_timeout = -1; + break; + case 'm': + cfg_mode = parse_mode(optarg); + break; + case 'b': + cfg_sndbuf = parse_sndbuf(optarg); + break; + } + } + + if (optind + 1 != argc) + die_usage(); + cfg_host = argv[optind]; + + if (strchr(cfg_host, ':')) + pf = AF_INET6; +} + +int main(int argc, char *argv[]) +{ + init_rng(); + + parse_opts(argc, argv); + + if (tcpulp_audit) + return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1; + + if (listen_mode) { + int fd = sock_listen_mptcp(cfg_host, cfg_port); + + if (fd < 0) + return 1; + + if (cfg_sndbuf) + set_sndbuf(fd, cfg_sndbuf); + + return main_loop_s(fd); + } + + return main_loop(); +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh new file mode 100755 index 000000000000..d573a0feb98d --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -0,0 +1,595 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +time_start=$(date +%s) + +optstring="b:d:e:l:r:h4cm:" +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +capture=false +timeout=30 +ipv6=true +ethtool_random_on=true +tc_delay="$((RANDOM%400))" +tc_loss=$((RANDOM%101)) +tc_reorder="" +testmode="" +sndbuf=0 +options_log=true + +if [ $tc_loss -eq 100 ];then + tc_loss=1% +elif [ $tc_loss -ge 10 ]; then + tc_loss=0.$tc_loss% +elif [ $tc_loss -ge 1 ]; then + tc_loss=0.0$tc_loss% +else + tc_loss="" +fi + +usage() { + echo "Usage: $0 [ -a ]" + echo -e "\t-d: tc/netem delay in milliseconds, e.g. \"-d 10\" (default random)" + echo -e "\t-l: tc/netem loss percentage, e.g. \"-l 0.02\" (default random)" + echo -e "\t-r: tc/netem reorder mode, e.g. \"-r 25% 50% gap 5\", use "-r 0" to disable reordering (default random)" + echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)" + echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)" + echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" + echo -e "\t-b: set sndbuf value (default: use kernel default)" + echo -e "\t-m: test mode (poll, sendfile; default: poll)" +} + +while getopts "$optstring" option;do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "d") + if [ $OPTARG -ge 0 ];then + tc_delay="$OPTARG" + else + echo "-d requires numeric argument, got \"$OPTARG\"" 1>&2 + exit 1 + fi + ;; + "e") + ethtool_args="$ethtool_args $OPTARG off" + ethtool_random_on=false + ;; + "l") + tc_loss="$OPTARG" + ;; + "r") + tc_reorder="$OPTARG" + ;; + "4") + ipv6=false + ;; + "c") + capture=true + ;; + "b") + if [ $OPTARG -ge 0 ];then + sndbuf="$OPTARG" + else + echo "-s requires numeric argument, got \"$OPTARG\"" 1>&2 + exit 1 + fi + ;; + "m") + testmode="$OPTARG" + ;; + "?") + usage $0 + exit 1 + ;; + esac +done + +sec=$(date +%s) +rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) +ns1="ns1-$rndh" +ns2="ns2-$rndh" +ns3="ns3-$rndh" +ns4="ns4-$rndh" + +TEST_COUNT=0 + +cleanup() +{ + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" + rm -f "$capout" + + local netns + for netns in "$ns1" "$ns2" "$ns3" "$ns4";do + ip netns del $netns + done +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +capout=$(mktemp) +trap cleanup EXIT + +for i in "$ns1" "$ns2" "$ns3" "$ns4";do + ip netns add $i || exit $ksft_skip + ip -net $i link set lo up +done + +# "$ns1" ns2 ns3 ns4 +# ns1eth2 ns2eth1 ns2eth3 ns3eth2 ns3eth4 ns4eth3 +# - drop 1% -> reorder 25% +# <- TSO off - + +ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth1 netns "$ns2" +ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth2 netns "$ns3" +ip link add ns3eth4 netns "$ns3" type veth peer name ns4eth3 netns "$ns4" + +ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth2 +ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth2 nodad + +ip -net "$ns1" link set ns1eth2 up +ip -net "$ns1" route add default via 10.0.1.2 +ip -net "$ns1" route add default via dead:beef:1::2 + +ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 +ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad +ip -net "$ns2" link set ns2eth1 up + +ip -net "$ns2" addr add 10.0.2.1/24 dev ns2eth3 +ip -net "$ns2" addr add dead:beef:2::1/64 dev ns2eth3 nodad +ip -net "$ns2" link set ns2eth3 up +ip -net "$ns2" route add default via 10.0.2.2 +ip -net "$ns2" route add default via dead:beef:2::2 +ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1 + +ip -net "$ns3" addr add 10.0.2.2/24 dev ns3eth2 +ip -net "$ns3" addr add dead:beef:2::2/64 dev ns3eth2 nodad +ip -net "$ns3" link set ns3eth2 up + +ip -net "$ns3" addr add 10.0.3.2/24 dev ns3eth4 +ip -net "$ns3" addr add dead:beef:3::2/64 dev ns3eth4 nodad +ip -net "$ns3" link set ns3eth4 up +ip -net "$ns3" route add default via 10.0.2.1 +ip -net "$ns3" route add default via dead:beef:2::1 +ip netns exec "$ns3" sysctl -q net.ipv4.ip_forward=1 +ip netns exec "$ns3" sysctl -q net.ipv6.conf.all.forwarding=1 + +ip -net "$ns4" addr add 10.0.3.1/24 dev ns4eth3 +ip -net "$ns4" addr add dead:beef:3::1/64 dev ns4eth3 nodad +ip -net "$ns4" link set ns4eth3 up +ip -net "$ns4" route add default via 10.0.3.2 +ip -net "$ns4" route add default via dead:beef:3::2 + +set_ethtool_flags() { + local ns="$1" + local dev="$2" + local flags="$3" + + ip netns exec $ns ethtool -K $dev $flags 2>/dev/null + [ $? -eq 0 ] && echo "INFO: set $ns dev $dev: ethtool -K $flags" +} + +set_random_ethtool_flags() { + local flags="" + local r=$RANDOM + + local pick1=$((r & 1)) + local pick2=$((r & 2)) + local pick3=$((r & 4)) + + [ $pick1 -ne 0 ] && flags="tso off" + [ $pick2 -ne 0 ] && flags="$flags gso off" + [ $pick3 -ne 0 ] && flags="$flags gro off" + + [ -z "$flags" ] && return + + set_ethtool_flags "$1" "$2" "$flags" +} + +if $ethtool_random_on;then + set_random_ethtool_flags "$ns3" ns3eth2 + set_random_ethtool_flags "$ns4" ns4eth3 +else + set_ethtool_flags "$ns3" ns3eth2 "$ethtool_args" + set_ethtool_flags "$ns4" ns4eth3 "$ethtool_args" +fi + +print_file_err() +{ + ls -l "$1" 1>&2 + echo "Trailing bytes are: " + tail -c 27 "$1" +} + +check_transfer() +{ + local in=$1 + local out=$2 + local what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + + return 1 + fi + + return 0 +} + +check_mptcp_disabled() +{ + local disabled_ns + disabled_ns="ns_disabled-$sech-$(mktemp -u XXXXXX)" + ip netns add ${disabled_ns} || exit $ksft_skip + + # net.mptcp.enabled should be enabled by default + if [ "$(ip netns exec ${disabled_ns} sysctl net.mptcp.enabled | awk '{ print $3 }')" -ne 1 ]; then + echo -e "net.mptcp.enabled sysctl is not 1 by default\t\t[ FAIL ]" + ret=1 + return 1 + fi + ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 + + local err=0 + LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + grep -q "^socket: Protocol not available$" && err=1 + ip netns delete ${disabled_ns} + + if [ ${err} -eq 0 ]; then + echo -e "New MPTCP socket cannot be blocked via sysctl\t\t[ FAIL ]" + ret=1 + return 1 + fi + + echo -e "New MPTCP socket can be blocked via sysctl\t\t[ OK ]" + return 0 +} + +check_mptcp_ulp_setsockopt() +{ + local t retval + t="ns_ulp-$sech-$(mktemp -u XXXXXX)" + + ip netns add ${t} || exit $ksft_skip + if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then + printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n" + retval=1 + ret=$retval + else + printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n" + retval=0 + fi + ip netns del ${t} + return $retval +} + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_ping() +{ + local listener_ns="$1" + local connector_ns="$2" + local connect_addr="$3" + local ping_args="-q -c 1" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + ip netns exec ${connector_ns} ping ${ping_args} $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + + return 1 + fi + + return 0 +} + +# $1: ns, $2: port +wait_local_port_listen() +{ + local listener_ns="${1}" + local port="${2}" + + local port_hex i + + port_hex="$(printf "%04X" "${port}")" + for i in $(seq 10); do + ip netns exec "${listener_ns}" cat /proc/net/tcp* | \ + awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" && + break + sleep 0.1 + done +} + +do_transfer() +{ + local listener_ns="$1" + local connector_ns="$2" + local cl_proto="$3" + local srv_proto="$4" + local connect_addr="$5" + local local_addr="$6" + local extra_args="" + + local port + port=$((10000+$TEST_COUNT)) + TEST_COUNT=$((TEST_COUNT+1)) + + if [ "$sndbuf" -gt 0 ]; then + extra_args="$extra_args -b $sndbuf" + fi + + if [ -n "$testmode" ]; then + extra_args="$extra_args -m $testmode" + fi + + if [ -n "$extra_args" ] && $options_log; then + options_log=false + echo "INFO: extra options: $extra_args" + fi + + :> "$cout" + :> "$sout" + :> "$capout" + + local addr_port + addr_port=$(printf "%s:%d" ${connect_addr} ${port}) + printf "%.3s %-5s -> %.3s (%-20s) %-5s\t" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto} + + if $capture; then + local capuser + if [ -z $SUDO_USER ] ; then + capuser="" + else + capuser="-Z $SUDO_USER" + fi + + local capfile="${listener_ns}-${connector_ns}-${cl_proto}-${srv_proto}-${connect_addr}.pcap" + + ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & + local cappid=$! + + sleep 1 + fi + + ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" & + local spid=$! + + wait_local_port_listen "${listener_ns}" "${port}" + + local start + start=$(date +%s%3N) + ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" & + local cpid=$! + + wait $cpid + local retc=$? + wait $spid + local rets=$? + + local stop + stop=$(date +%s%3N) + + if $capture; then + sleep 1 + kill $cappid + fi + + local duration + duration=$((stop-start)) + duration=$(printf "(duration %05sms)" $duration) + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo "$duration [ FAIL ] client exit code $retc, server $rets" 1>&2 + echo "\nnetns ${listener_ns} socket stat for $port:" 1>&2 + ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + echo "\nnetns ${connector_ns} socket stat for $port:" 1>&2 + ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + + cat "$capout" + return 1 + fi + + check_transfer $sin $cout "file received by client" + retc=$? + check_transfer $cin $sout "file received by server" + rets=$? + + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + echo "$duration [ OK ]" + cat "$capout" + return 0 + fi + + cat "$capout" + return 1 +} + +make_file() +{ + local name=$1 + local who=$2 + + local SIZE TSIZE + SIZE=$((RANDOM % (1024 * 8))) + TSIZE=$((SIZE * 1024)) + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + + SIZE=$((RANDOM % 1024)) + SIZE=$((SIZE + 128)) + TSIZE=$((TSIZE + SIZE)) + dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $TSIZE) containing data sent by $who" +} + +run_tests_lo() +{ + local listener_ns="$1" + local connector_ns="$2" + local connect_addr="$3" + local loopback="$4" + local lret=0 + + # skip if test programs are running inside same netns for subsequent runs. + if [ $loopback -eq 0 ] && [ ${listener_ns} = ${connector_ns} ]; then + return 0 + fi + + # skip if we don't want v6 + if ! $ipv6 && is_v6 "${connect_addr}"; then + return 0 + fi + + local local_addr + if is_v6 "${connect_addr}"; then + local_addr="::" + else + local_addr="0.0.0.0" + fi + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr} + lret=$? + if [ $lret -ne 0 ]; then + ret=$lret + return 1 + fi + + # don't bother testing fallback tcp except for loopback case. + if [ ${listener_ns} != ${connector_ns} ]; then + return 0 + fi + + do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr} + lret=$? + if [ $lret -ne 0 ]; then + ret=$lret + return 1 + fi + + do_transfer ${listener_ns} ${connector_ns} TCP MPTCP ${connect_addr} ${local_addr} + lret=$? + if [ $lret -ne 0 ]; then + ret=$lret + return 1 + fi + + return 0 +} + +run_tests() +{ + run_tests_lo $1 $2 $3 0 +} + +make_file "$cin" "client" +make_file "$sin" "server" + +check_mptcp_disabled + +check_mptcp_ulp_setsockopt + +echo "INFO: validating network environment with pings" +for sender in "$ns1" "$ns2" "$ns3" "$ns4";do + do_ping "$ns1" $sender 10.0.1.1 + do_ping "$ns1" $sender dead:beef:1::1 + + do_ping "$ns2" $sender 10.0.1.2 + do_ping "$ns2" $sender dead:beef:1::2 + do_ping "$ns2" $sender 10.0.2.1 + do_ping "$ns2" $sender dead:beef:2::1 + + do_ping "$ns3" $sender 10.0.2.2 + do_ping "$ns3" $sender dead:beef:2::2 + do_ping "$ns3" $sender 10.0.3.2 + do_ping "$ns3" $sender dead:beef:3::2 + + do_ping "$ns4" $sender 10.0.3.1 + do_ping "$ns4" $sender dead:beef:3::1 +done + +[ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss +echo -n "INFO: Using loss of $tc_loss " +test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms " + +if [ -z "${tc_reorder}" ]; then + reorder1=$((RANDOM%10)) + reorder1=$((100 - reorder1)) + reorder2=$((RANDOM%100)) + + if [ $tc_delay -gt 0 ] && [ $reorder1 -lt 100 ] && [ $reorder2 -gt 0 ]; then + tc_reorder="reorder ${reorder1}% ${reorder2}%" + echo -n "$tc_reorder " + fi +elif [ "$tc_reorder" = "0" ];then + tc_reorder="" +elif [ "$tc_delay" -gt 0 ];then + # reordering requires some delay + tc_reorder="reorder $tc_reorder" + echo -n "$tc_reorder " +fi + +echo "on ns3eth4" + +tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${tc_delay}ms $tc_reorder + +for sender in $ns1 $ns2 $ns3 $ns4;do + run_tests_lo "$ns1" "$sender" 10.0.1.1 1 + if [ $ret -ne 0 ] ;then + echo "FAIL: Could not even run loopback test" 1>&2 + exit $ret + fi + run_tests_lo "$ns1" $sender dead:beef:1::1 1 + if [ $ret -ne 0 ] ;then + echo "FAIL: Could not even run loopback v6 test" 2>&1 + exit $ret + fi + + run_tests "$ns2" $sender 10.0.1.2 + run_tests "$ns2" $sender dead:beef:1::2 + run_tests "$ns2" $sender 10.0.2.1 + run_tests "$ns2" $sender dead:beef:2::1 + + run_tests "$ns3" $sender 10.0.2.2 + run_tests "$ns3" $sender dead:beef:2::2 + run_tests "$ns3" $sender 10.0.3.2 + run_tests "$ns3" $sender dead:beef:3::2 + + run_tests "$ns4" $sender 10.0.3.1 + run_tests "$ns4" $sender dead:beef:3::1 +done + +time_end=$(date +%s) +time_run=$((time_end-time_start)) + +echo "Time: ${time_run} seconds" + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/settings b/tools/testing/selftests/net/mptcp/settings new file mode 100644 index 000000000000..026384c189c9 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/settings @@ -0,0 +1 @@ +timeout=450 diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index de1032b5ddea..08194aa44006 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -2,6 +2,7 @@ # Makefile for netfilter selftests TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh + conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ + nft_concat_range.sh include ../lib.mk diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh new file mode 100755 index 000000000000..aca21dde102a --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_concat_range.sh @@ -0,0 +1,1481 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# nft_concat_range.sh - Tests for sets with concatenation of ranged fields +# +# Copyright (c) 2019 Red Hat GmbH +# +# Author: Stefano Brivio <sbrivio@redhat.com> +# +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 +# ^ Configuration and templates sourced with eval, counters reused in subshells + +KSELFTEST_SKIP=4 + +# Available test groups: +# - correctness: check that packets match given entries, and only those +# - concurrency: attempt races between insertion, deletion and lookup +# - timeout: check that packets match entries until they expire +# - performance: estimate matching rate, compare with rbtree and hash baselines +TESTS="correctness concurrency timeout" +[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" + +# Set types, defined by TYPE_ variables below +TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto + net_port_net net_mac net_mac_icmp net6_mac_icmp net6_port_net6_port + net_port_mac_proto_net" + +# List of possible paths to pktgen script from kernel tree for performance tests +PKTGEN_SCRIPT_PATHS=" + ../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + pktgen/pktgen_bench_xmit_mode_netif_receive.sh" + +# Definition of set types: +# display display text for test report +# type_spec nftables set type specifier +# chain_spec nftables type specifier for rules mapping to set +# dst call sequence of format_*() functions for destination fields +# src call sequence of format_*() functions for source fields +# start initial integer used to generate addresses and ports +# count count of entries to generate and match +# src_delta number summed to destination generator for source fields +# tools list of tools for correctness and timeout tests, any can be used +# proto L4 protocol of test packets +# +# race_repeat race attempts per thread, 0 disables concurrency test for type +# flood_tools list of tools for concurrency tests, any can be used +# flood_proto L4 protocol of test packets for concurrency tests +# flood_spec nftables type specifier for concurrency tests +# +# perf_duration duration of single pktgen injection test +# perf_spec nftables type specifier for performance tests +# perf_dst format_*() functions for destination fields in performance test +# perf_src format_*() functions for source fields in performance test +# perf_entries number of set entries for performance test +# perf_proto L3 protocol of test packets +TYPE_net_port=" +display net,port +type_spec ipv4_addr . inet_service +chain_spec ip daddr . udp dport +dst addr4 port +src +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec ip daddr . udp dport + +perf_duration 5 +perf_spec ip daddr . udp dport +perf_dst addr4 port +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_port_net=" +display port,net +type_spec inet_service . ipv4_addr +chain_spec udp dport . ip daddr +dst port addr4 +src +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec udp dport . ip daddr + +perf_duration 5 +perf_spec udp dport . ip daddr +perf_dst port addr4 +perf_src +perf_entries 100 +perf_proto ipv4 +" + +TYPE_net6_port=" +display net6,port +type_spec ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport +dst addr6 port +src +start 10 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . udp dport + +perf_duration 5 +perf_spec ip6 daddr . udp dport +perf_dst addr6 port +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_port_proto=" +display port,proto +type_spec inet_service . inet_proto +chain_spec udp dport . meta l4proto +dst port proto +src +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec udp dport . meta l4proto +perf_dst port proto +perf_src +perf_entries 30000 +perf_proto ipv4 +" + +TYPE_net6_port_mac=" +display net6,port,mac +type_spec ipv6_addr . inet_service . ether_addr +chain_spec ip6 daddr . udp dport . ether saddr +dst addr6 port +src mac +start 10 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . ether daddr +perf_dst addr6 port mac +perf_src +perf_entries 10 +perf_proto ipv6 +" + +TYPE_net6_port_mac_proto=" +display net6,port,mac,proto +type_spec ipv6_addr . inet_service . ether_addr . inet_proto +chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto +dst addr6 port +src mac proto +start 10 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto +perf_dst addr6 port mac proto +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_net_port_net=" +display net,port,net +type_spec ipv4_addr . inet_service . ipv4_addr +chain_spec ip daddr . udp dport . ip saddr +dst addr4 port +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . udp dport . ip saddr + +perf_duration 0 +" + +TYPE_net6_port_net6_port=" +display net6,port,net6,port +type_spec ipv6_addr . inet_service . ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport +dst addr6 port +src addr6 port +start 10 +count 5 +src_delta 2000 +tools sendip nc +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport + +perf_duration 0 +" + +TYPE_net_port_mac_proto_net=" +display net,port,mac,proto,net +type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr +dst addr4 port +src mac proto addr4 +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_mac=" +display net,mac +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec ip daddr . ether daddr +perf_dst addr4 mac +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_net_mac_icmp=" +display net,mac - ICMP +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools ping +proto icmp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net6_mac_icmp=" +display net6,mac - ICMPv6 +type_spec ipv6_addr . ether_addr +chain_spec ip6 daddr . ether saddr +dst addr6 +src mac +start 10 +count 50 +src_delta 2000 +tools ping +proto icmp6 + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_port_proto_net=" +display net,port,proto,net +type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . meta l4proto . ip saddr +dst addr4 port proto +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip nc +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . tcp dport . meta l4proto . ip saddr + +perf_duration 0 +" + +# Set template for all tests, types and rules are filled in depending on test +set_template=' +flush ruleset + +table inet filter { + counter test { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval,timeout + } + + chain input { + type filter hook prerouting priority 0; policy accept; + ${chain_spec} @test counter name \"test\" + } +} + +table netdev perf { + counter test { + packets 0 bytes 0 + } + + counter match { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval + } + + set norange { + type ${type_spec} + } + + set noconcat { + type ${type_spec%% *} + flags interval + } + + chain test { + type filter hook ingress device veth_a priority 0; + } +} +' + +err_buf= +info_buf= + +# Append string to error buffer +err() { + err_buf="${err_buf}${1} +" +} + +# Append string to information buffer +info() { + info_buf="${info_buf}${1} +" +} + +# Flush error buffer to stdout +err_flush() { + printf "%s" "${err_buf}" + err_buf= +} + +# Flush information buffer to stdout +info_flush() { + printf "%s" "${info_buf}" + info_buf= +} + +# Setup veth pair: this namespace receives traffic, B generates it +setup_veth() { + ip netns add B + ip link add veth_a type veth peer name veth_b || return 1 + + ip link set veth_a up + ip link set veth_b netns B + + ip -n B link set veth_b up + + ip addr add dev veth_a 10.0.0.1 + ip route add default dev veth_a + + ip -6 addr add fe80::1/64 dev veth_a nodad + ip -6 addr add 2001:db8::1/64 dev veth_a nodad + ip -6 route add default dev veth_a + + ip -n B route add default dev veth_b + + ip -6 -n B addr add fe80::2/64 dev veth_b nodad + ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad + ip -6 -n B route add default dev veth_b + + B() { + ip netns exec B "$@" >/dev/null 2>&1 + } + + sleep 2 +} + +# Fill in set template and initialise set +setup_set() { + eval "echo \"${set_template}\"" | nft -f - +} + +# Check that at least one of the needed tools is available +check_tools() { + __tools= + for tool in ${tools}; do + if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ + ! nc -u -w0 1.1.1.1 1 2>/dev/null; then + # Some GNU netcat builds might not support IPv6 + __tools="${__tools} netcat-openbsd" + continue + fi + __tools="${__tools} ${tool}" + + command -v "${tool}" >/dev/null && return 0 + done + err "need one of:${__tools}, skipping" && return 1 +} + +# Set up function to send ICMP packets +setup_send_icmp() { + send_icmp() { + B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1 + } +} + +# Set up function to send ICMPv6 packets +setup_send_icmp6() { + if command -v ping6 >/dev/null; then + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping6 -q -c1 -W1 "${dst_addr6}" + } + else + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping -q -6 -c1 -W1 "${dst_addr6}" + } + fi +} + +# Set up function to send single UDP packets on IPv4 +setup_send_udp() { + if command -v sendip >/dev/null; then + send_udp() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}" + + # shellcheck disable=SC2086 # sendip needs split options + B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \ + ${dst_port} "${dst_addr4}" + + src_port= + dst_port= + src_addr4= + } + elif command -v nc >/dev/null; then + if nc -u -w0 1.1.1.1 1 2>/dev/null; then + # OpenBSD netcat + nc_opt="-w0" + else + # GNU netcat + nc_opt="-q0" + fi + + send_udp() { + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}" dev veth_b + __src_addr4="-s ${src_addr4}" + fi + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + [ -n "${src_port}" ] && src_port="-p ${src_port}" + + echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ + "${src_port}" "${dst_addr4}" "${dst_port}" + + src_addr4= + src_port= + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp() { + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + B ip route add default dev veth_b + fi + + B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}" + + if [ -n "${src_addr4}" ]; then + B ip addr del "${src_addr4}/16" dev veth_b + fi + src_addr4= + } + else + return 1 + fi +} + +# Set up function to send single UDP packets on IPv6 +setup_send_udp6() { + if command -v sendip >/dev/null; then + send_udp6() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + if [ -n "${src_addr6}" ]; then + src_addr6="-6s ${src_addr6}" + else + src_addr6="-6s 2001:db8::2" + fi + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \ + ${dst_port} "${dst_addr6}" + + src_port= + dst_port= + src_addr6= + } + elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then + # GNU netcat might not work with IPv6, try next tool + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + else + src_addr6="2001:db8::2" + fi + [ -n "${src_port}" ] && src_port="-p ${src_port}" + + # shellcheck disable=SC2086 # this needs split options + echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ + ${dst_addr6} ${dst_port} + + src_addr6= + src_port= + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ip addr add "${src_addr6}" dev veth_b nodad + B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}" + ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null + } + else + return 1 + fi +} + +# Set up function to send TCP traffic on IPv4 +setup_flood_tcp() { + if command -v iperf3 >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b 2>/dev/null + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ + ${src_addr4} -l16 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ + -l20 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t TCP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send TCP traffic on IPv6 +setup_flood_tcp6() { + if command -v iperf3 >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr6}" ${dst_port} \ + ${src_port} ${src_addr6} -l16 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_addr6="${src_addr6}:${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr6}" -V ${dst_port} \ + ${src_addr6} -l1 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + else + src_addr6="2001:db8::2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -6 ${dst_port} -L "${dst_addr6}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -6 -H "${dst_addr6}" ${dst_port} \ + -L "${src_addr6}" -l 1000 -t TCP_STREAM + + src_addr6= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send UDP traffic on IPv4 +setup_flood_udp() { + if command -v iperf3 >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ + ${dst_port} ${src_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ + ${dst_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t UDP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Find pktgen script and set up function to start pktgen injection +setup_perf() { + for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do + command -v "${pktgen_script_path}" >/dev/null && break + done + [ "${pktgen_script_path}" = "__notfound" ] && return 1 + + perf_ipv4() { + ${pktgen_script_path} -s80 \ + -i veth_a -d "${dst_addr4}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! + } + perf_ipv6() { + IP6=6 ${pktgen_script_path} -s100 \ + -i veth_a -d "${dst_addr6}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! + } +} + +# Clean up before each test +cleanup() { + nft reset counter inet filter test >/dev/null 2>&1 + nft flush ruleset >/dev/null 2>&1 + ip link del dummy0 2>/dev/null + ip route del default 2>/dev/null + ip -6 route del default 2>/dev/null + ip netns del B 2>/dev/null + ip link del veth_a 2>/dev/null + timeout= + killall iperf3 2>/dev/null + killall iperf 2>/dev/null + killall netperf 2>/dev/null + killall netserver 2>/dev/null + rm -f ${tmp} + sleep 2 +} + +# Entry point for setup functions +setup() { + if [ "$(id -u)" -ne 0 ]; then + echo " need to run as root" + exit ${KSELFTEST_SKIP} + fi + + cleanup + check_tools || return 1 + for arg do + if ! eval setup_"${arg}"; then + err " ${arg} not supported" + return 1 + fi + done +} + +# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it +format_addr4() { + a=$((${1} + 16777216 * 10 + 5)) + printf "%i.%i.%i.%i" \ + "$((a / 16777216))" "$((a % 16777216 / 65536))" \ + "$((a % 65536 / 256))" "$((a % 256))" +} + +# Format integer into IPv6 address, summing 2001:db8:: to it +format_addr6() { + printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))" +} + +# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it +format_mac() { + printf "00:01:%02x:%02x:%02x:%02x" \ + "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \ + "$((${1} % 65536 / 256))" "$((${1} % 256))" +} + +# Format integer into port, avoid 0 port +format_port() { + printf "%i" "$((${1} % 65534 + 1))" +} + +# Drop suffixed '6' from L4 protocol, if any +format_proto() { + printf "%s" "${proto}" | tr -d 6 +} + +# Format destination and source fields into nft concatenated type +format() { + __start= + __end= + __expr="{ " + + for f in ${dst}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + for f in ${src}; do + __expr="${__expr} . " + __start="$(eval format_"${f}" "${srcstart}")" + __end="$(eval format_"${f}" "${srcend}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + + if [ -n "${timeout}" ]; then + echo "${__expr} timeout ${timeout}s }" + else + echo "${__expr} }" + fi +} + +# Format destination and source fields into nft type, start element only +format_norange() { + __expr="{ " + + for f in ${dst}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __expr="${__expr}$(eval format_"${f}" "${start}")" + done + for f in ${src}; do + __expr="${__expr} . $(eval format_"${f}" "${start}")" + done + + echo "${__expr} }" +} + +# Format first destination field into nft type +format_noconcat() { + for f in ${dst}; do + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + echo "{ ${__start} }" + else + echo "{ ${__start}-${__end} }" + fi + return + done +} + +# Add single entry to 'test' set in 'inet filter' table +add() { + if ! nft add element inet filter test "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi +} + +# Format and output entries for sets in 'netdev perf' table +add_perf() { + if [ "${1}" = "test" ]; then + echo "add element netdev perf test $(format)" + elif [ "${1}" = "norange" ]; then + echo "add element netdev perf norange $(format_norange)" + elif [ "${1}" = "noconcat" ]; then + echo "add element netdev perf noconcat $(format_noconcat)" + fi +} + +# Add single entry to 'norange' set in 'netdev perf' table +add_perf_norange() { + if ! nft add element netdev perf norange "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi +} + +# Add single entry to 'noconcat' set in 'netdev perf' table +add_perf_noconcat() { + if ! nft add element netdev perf noconcat "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi +} + +# Delete single entry from set +del() { + if ! nft delete element inet filter test "${1}"; then + err "Failed to delete ${1} given ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi +} + +# Return packet count from 'test' counter in 'inet filter' table +count_packets() { + found=0 + for token in $(nft list counter inet filter test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Return packet count from 'test' counter in 'netdev perf' table +count_perf_packets() { + found=0 + for token in $(nft list counter netdev perf test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Set MAC addresses, send traffic according to specifier +flood() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval flood_\$proto +} + +# Set MAC addresses, start pktgen injection +perf() { + dst_mac="$(format_mac "${1}")" + ip link set veth_a address "${dst_mac}" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval perf_\$perf_proto +} + +# Set MAC addresses, send single packet, check that it matches, reset counter +send_match() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "1" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should have matched ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi + nft reset counter inet filter test >/dev/null +} + +# Set MAC addresses, send single packet, check that it doesn't match +send_nomatch() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "0" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should not have matched ruleset:" + err "$(nft list ruleset -a)" + return 1 + fi +} + +# Correctness test template: +# - add ranged element, check that packets match it +# - check that packets outside range don't match it +# - remove some elements, check that packets don't match anymore +test_correctness() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 + + # Delete elements now and then + if [ $((i % 3)) -eq 0 ]; then + del "$(format)" || return 1 + for j in $(seq ${start} \ + $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) \ + || return 1 + done + fi + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Concurrency test template: +# - add all the elements +# - start a thread for each physical thread that: +# - adds all the elements +# - flushes the set +# - adds all the elements +# - flushes the entire ruleset +# - adds the set back +# - adds all the elements +# - delete all the elements +test_concurrency() { + proto=${flood_proto} + tools=${flood_tools} + chain_spec=${flood_spec} + setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} + + range_size=1 + cstart=${start} + flood_pids= + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!" + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + sleep 10 + + pids= + for c in $(seq 1 "$(nproc)"); do ( + for r in $(seq 1 "${race_repeat}"); do + range_size=1 + + # $start needs to be local to this subshell + # shellcheck disable=SC2030 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush inet filter test 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset + setup set 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + del "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + done + ) & pids="${pids} $!" + done + + # shellcheck disable=SC2046,SC2086 # word splitting wanted here + wait $(for pid in ${pids}; do echo ${pid}; done) + # shellcheck disable=SC2046,SC2086 + kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + # shellcheck disable=SC2046,SC2086 + wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + + return 0 +} + +# Timeout test template: +# - add all the elements with 3s timeout while checking that packets match +# - wait 3s after the last insertion, check that packets don't match any entry +test_timeout() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + + timeout=3 + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + sleep 3 + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Performance test template: +# - add concatenated ranged entries +# - add non-ranged concatenated entries (for hash set matching rate baseline) +# - add ranged entries with first field only (for rbhash baseline) +# - start pktgen injection directly on device rx path of this namespace +# - measure drop only rate, hash and rbtree baselines, then matching rate +test_performance() { + chain_spec=${perf_spec} + dst="${perf_dst}" + src="${perf_src}" + setup veth perf set || return ${KSELFTEST_SKIP} + + first=${start} + range_size=1 + for set in test norange noconcat; do + start=${first} + for i in $(seq ${start} $((start + perf_entries))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + elif [ ${start} -eq ${end} ]; then + end=$((start + 1)) + fi + + add_perf ${set} + + start=$((end + range_size)) + done > "${tmp}" + nft -f "${tmp}" + done + + perf $((end - 1)) ${srcstart} + + sleep 2 + + nft add rule netdev perf test counter name \"test\" drop + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline (drop from netdev hook): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @norange \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline hash (non-ranged entries): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec%%. *} @noconcat \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline rbtree (match on first field only): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @test \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + p5="$(printf %5s "${perf_entries}")" + info " set with ${p5} full, ranged entries: ${pps}pps" + kill "${perf_pid}" +} + +# Run everything in a separate network namespace +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } +tmp="$(mktemp)" +trap cleanup EXIT + +# Entry point for test runs +passed=0 +for name in ${TESTS}; do + printf "TEST: %s\n" "${name}" + for type in ${TYPES}; do + eval desc=\$TYPE_"${type}" + IFS=' +' + for __line in ${desc}; do + # shellcheck disable=SC2086 + eval ${__line%% *}=\"${__line##* }\"; + done + IFS=' +' + + if [ "${name}" = "concurrency" ] && \ + [ "${race_repeat}" = "0" ]; then + continue + fi + if [ "${name}" = "performance" ] && \ + [ "${perf_duration}" = "0" ]; then + continue + fi + + printf " %-60s " "${display}" + eval test_"${name}" + ret=$? + + if [ $ret -eq 0 ]; then + printf "[ OK ]\n" + info_flush + passed=$((passed + 1)) + elif [ $ret -eq 1 ]; then + printf "[FAIL]\n" + err_flush + exit 1 + elif [ $ret -eq ${KSELFTEST_SKIP} ]; then + printf "[SKIP]\n" + err_flush + fi + done +done + +[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 6944b898bb53..ee1b727ede04 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3158,7 +3158,18 @@ TEST(user_notification_basic) EXPECT_GT(poll(&pollfd, 1, -1), 0); EXPECT_EQ(pollfd.revents, POLLIN); - EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + /* Test that we can't pass garbage to the kernel. */ + memset(&req, 0, sizeof(req)); + req.pid = -1; + errno = 0; + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + if (ret) { + req.pid = 0; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + } pollfd.fd = listener; pollfd.events = POLLIN | POLLOUT; @@ -3278,6 +3289,7 @@ TEST(user_notification_signal) close(sk_pair[1]); + memset(&req, 0, sizeof(req)); EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); EXPECT_EQ(kill(pid, SIGUSR1), 0); @@ -3296,6 +3308,7 @@ TEST(user_notification_signal) EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); EXPECT_EQ(errno, ENOENT); + memset(&req, 0, sizeof(req)); EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); resp.id = req.id; diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index e7310d9390f7..d5c85c7494f2 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -37,7 +37,7 @@ n2() { pretty 2 "$*"; maybe_exec ip netns exec $netns2 "$@"; } ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } -sleep() { read -t "$1" -N 0 || true; } +sleep() { read -t "$1" -N 1 || true; } waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; } waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } @@ -294,12 +294,9 @@ ip1 -6 rule add table main suppress_prefixlength 0 ip1 -4 route add default dev wg0 table 51820 ip1 -4 rule add not fwmark 51820 table 51820 ip1 -4 rule add table main suppress_prefixlength 0 -# suppress_prefixlength only got added in 3.12, and we want to support 3.10+. -if [[ $(ip1 -4 rule show all) == *suppress_prefixlength* ]]; then - # Flood the pings instead of sending just one, to trigger routing table reference counting bugs. - n1 ping -W 1 -c 100 -f 192.168.99.7 - n1 ping -W 1 -c 100 -f abab::1111 -fi +# Flood the pings instead of sending just one, to trigger routing table reference counting bugs. +n1 ping -W 1 -c 100 -f 192.168.99.7 +n1 ping -W 1 -c 100 -f abab::1111 n0 iptables -t nat -F ip0 link del vethrc diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 6d51bf78eeff..f10aa3590adc 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -5,6 +5,7 @@ PWD := $(shell pwd) CHOST := $(shell gcc -dumpmachine) +HOST_ARCH := $(firstword $(subst -, ,$(CHOST))) ifneq (,$(ARCH)) CBUILD := $(subst -gcc,,$(lastword $(subst /, ,$(firstword $(wildcard $(foreach bindir,$(subst :, ,$(PATH)),$(bindir)/$(ARCH)-*-gcc)))))) ifeq (,$(CBUILD)) @@ -37,19 +38,19 @@ endef define file_download = $(DISTFILES_PATH)/$(1): mkdir -p $(DISTFILES_PATH) - flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -t inf --retry-on-http-error=404 -O $$@.tmp $(2)$(1) || rm -f $$@.tmp' + flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp' if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi endef -$(eval $(call tar_download,MUSL,musl,1.1.20,.tar.gz,https://www.musl-libc.org/releases/,44be8771d0e6c6b5f82dd15662eb2957c9a3173a19a8b49966ac0542bbd40d61)) +$(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3)) $(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81)) -$(eval $(call tar_download,IPERF,iperf,3.1.7,.tar.gz,http://downloads.es.net/pub/iperf/,a4ef73406fe92250602b8da2ae89ec53211f805df97a1d1d629db5a14043734f)) +$(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) -$(eval $(call tar_download,IPROUTE2,iproute2,5.1.0,.tar.gz,https://www.kernel.org/pub/linux/utils/net/iproute2/,9b43707d6075ecdca14803ca8ce0c8553848c49fa1586d12fd508d66577243f2)) -$(eval $(call tar_download,IPTABLES,iptables,1.6.1,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,0fc2d7bd5d7be11311726466789d4c65fb4c8e096c9182b56ce97440864f0cf5)) -$(eval $(call tar_download,NMAP,nmap,7.60,.tar.bz2,https://nmap.org/dist/,a8796ecc4fa6c38aad6139d9515dc8113023a82e9d787e5a5fb5fa1b05516f21)) -$(eval $(call tar_download,IPUTILS,iputils,s20161105,.tar.gz,https://github.com/iputils/iputils/archive/s20161105.tar.gz/#,f813092f03d17294fd23544b129b95cdb87fe19f7970a51908a6b88509acad8a)) -$(eval $(call tar_download,WIREGUARD_TOOLS,WireGuard,0.0.20191212,.tar.xz,https://git.zx2c4.com/WireGuard/snapshot/,b0d718380f7a8822b2f12d75e462fa4eafa3a77871002981f367cd4fe2a1b071)) +$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) +$(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c)) +$(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa)) +$(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) +$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20191226,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,aa8af0fdc9872d369d8c890a84dbc2a2466b55795dccd5b47721b2d97644b04f)) KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug) rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) @@ -59,23 +60,21 @@ export CFLAGS ?= -O3 -pipe export LDFLAGS ?= export CPPFLAGS := -I$(BUILD_PATH)/include -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) CROSS_COMPILE_FLAG := --host=$(CHOST) -NOPIE_GCC := gcc -fno-PIE CFLAGS += -march=native STRIP := strip else $(info Cross compilation: building for $(CBUILD) using $(CHOST)) CROSS_COMPILE_FLAG := --build=$(CBUILD) --host=$(CHOST) export CROSS_COMPILE=$(CBUILD)- -NOPIE_GCC := $(CBUILD)-gcc -fno-PIE STRIP := $(CBUILD)-strip endif ifeq ($(ARCH),aarch64) QEMU_ARCH := aarch64 KERNEL_ARCH := arm64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else QEMU_MACHINE := -cpu cortex-a53 -machine virt @@ -85,7 +84,7 @@ else ifeq ($(ARCH),aarch64_be) QEMU_ARCH := aarch64 KERNEL_ARCH := arm64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else QEMU_MACHINE := -cpu cortex-a53 -machine virt @@ -95,7 +94,7 @@ else ifeq ($(ARCH),arm) QEMU_ARCH := arm KERNEL_ARCH := arm KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else QEMU_MACHINE := -cpu cortex-a15 -machine virt @@ -105,7 +104,7 @@ else ifeq ($(ARCH),armeb) QEMU_ARCH := arm KERNEL_ARCH := arm KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm else QEMU_MACHINE := -cpu cortex-a15 -machine virt @@ -116,7 +115,7 @@ else ifeq ($(ARCH),x86_64) QEMU_ARCH := x86_64 KERNEL_ARCH := x86_64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine q35,accel=kvm else QEMU_MACHINE := -cpu Skylake-Server -machine q35 @@ -126,7 +125,7 @@ else ifeq ($(ARCH),i686) QEMU_ARCH := i386 KERNEL_ARCH := x86 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage -ifeq ($(subst i686,x86_64,$(CBUILD)),$(CHOST)) +ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) QEMU_MACHINE := -cpu host -machine q35,accel=kvm else QEMU_MACHINE := -cpu coreduo -machine q35 @@ -136,7 +135,7 @@ else ifeq ($(ARCH),mips64) QEMU_ARCH := mips64 KERNEL_ARCH := mips KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine malta,accel=kvm CFLAGS += -EB else @@ -147,7 +146,7 @@ else ifeq ($(ARCH),mips64el) QEMU_ARCH := mips64el KERNEL_ARCH := mips KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine malta,accel=kvm CFLAGS += -EL else @@ -158,7 +157,7 @@ else ifeq ($(ARCH),mips) QEMU_ARCH := mips KERNEL_ARCH := mips KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine malta,accel=kvm CFLAGS += -EB else @@ -169,7 +168,7 @@ else ifeq ($(ARCH),mipsel) QEMU_ARCH := mipsel KERNEL_ARCH := mips KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host -machine malta,accel=kvm CFLAGS += -EL else @@ -180,7 +179,7 @@ else ifeq ($(ARCH),powerpc64le) QEMU_ARCH := ppc64 KERNEL_ARCH := powerpc KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host,accel=kvm -machine pseries else QEMU_MACHINE := -machine pseries @@ -190,7 +189,7 @@ else ifeq ($(ARCH),powerpc) QEMU_ARCH := ppc KERNEL_ARCH := powerpc KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/powerpc/boot/uImage -ifeq ($(CHOST),$(CBUILD)) +ifeq ($(HOST_ARCH),$(ARCH)) QEMU_MACHINE := -cpu host,accel=kvm -machine ppce500 else QEMU_MACHINE := -machine ppce500 @@ -200,10 +199,11 @@ else ifeq ($(ARCH),m68k) QEMU_ARCH := m68k KERNEL_ARCH := m68k KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux -ifeq ($(CHOST),$(CBUILD)) -QEMU_MACHINE := -cpu host,accel=kvm -machine q800 +KERNEL_CMDLINE := $(shell sed -n 's/CONFIG_CMDLINE=\(.*\)/\1/p' arch/m68k.config) +ifeq ($(HOST_ARCH),$(ARCH)) +QEMU_MACHINE := -cpu host,accel=kvm -machine q800 -smp 1 -append $(KERNEL_CMDLINE) else -QEMU_MACHINE := -machine q800 +QEMU_MACHINE := -machine q800 -smp 1 -append $(KERNEL_CMDLINE) endif else $(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64le, powerpc, m68k) @@ -238,14 +238,14 @@ $(BUILD_PATH)/init-cpio-spec.txt: echo "nod /dev/console 644 0 0 c 5 1" >> $@ echo "dir /bin 755 0 0" >> $@ echo "file /bin/iperf3 $(IPERF_PATH)/src/iperf3 755 0 0" >> $@ - echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/tools/wg 755 0 0" >> $@ + echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/wg 755 0 0" >> $@ echo "file /bin/bash $(BASH_PATH)/bash 755 0 0" >> $@ echo "file /bin/ip $(IPROUTE2_PATH)/ip/ip 755 0 0" >> $@ echo "file /bin/ss $(IPROUTE2_PATH)/misc/ss 755 0 0" >> $@ echo "file /bin/ping $(IPUTILS_PATH)/ping 755 0 0" >> $@ echo "file /bin/ncat $(NMAP_PATH)/ncat/ncat 755 0 0" >> $@ - echo "file /bin/xtables-multi $(IPTABLES_PATH)/iptables/xtables-multi 755 0 0" >> $@ - echo "slink /bin/iptables xtables-multi 777 0 0" >> $@ + echo "file /bin/xtables-legacy-multi $(IPTABLES_PATH)/iptables/xtables-legacy-multi 755 0 0" >> $@ + echo "slink /bin/iptables xtables-legacy-multi 777 0 0" >> $@ echo "slink /bin/ping6 ping 777 0 0" >> $@ echo "dir /lib 755 0 0" >> $@ echo "file /lib/libc.so $(MUSL_PATH)/lib/libc.so 755 0 0" >> $@ @@ -260,8 +260,8 @@ $(KERNEL_BUILD_PATH)/.config: kernel.config arch/$(ARCH).config cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config $(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,) -$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/tools/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES) - $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)" +$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES) + $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) $(BUILD_PATH)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install @@ -280,7 +280,7 @@ $(BUILD_PATH)/include/.installed: $(MUSL_PATH)/lib/libc.so $(MUSL_CC): $(MUSL_PATH)/lib/libc.so sh $(MUSL_PATH)/tools/musl-gcc.specs.sh $(BUILD_PATH)/include $(MUSL_PATH)/lib /lib/ld-linux.so.1 > $(BUILD_PATH)/musl-gcc.specs - printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" -fno-stack-protector -no-pie "$$@"\n' > $(BUILD_PATH)/musl-gcc + printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" "$$@"\n' > $(BUILD_PATH)/musl-gcc chmod +x $(BUILD_PATH)/musl-gcc $(IPERF_PATH)/.installed: $(IPERF_TAR) @@ -291,7 +291,7 @@ $(IPERF_PATH)/.installed: $(IPERF_TAR) touch $@ $(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS) - cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared + cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --with-openssl=no $(MAKE) -C $(IPERF_PATH) $(STRIP) -s $@ @@ -308,8 +308,8 @@ $(WIREGUARD_TOOLS_PATH)/.installed: $(WIREGUARD_TOOLS_TAR) flock -s $<.lock tar -C $(BUILD_PATH) -xf $< touch $@ -$(WIREGUARD_TOOLS_PATH)/src/tools/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) - LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src/tools LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg +$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) + LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg $(STRIP) -s $@ $(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS) @@ -323,7 +323,8 @@ $(IPUTILS_PATH)/.installed: $(IPUTILS_TAR) touch $@ $(IPUTILS_PATH)/ping: | $(IPUTILS_PATH)/.installed $(USERSPACE_DEPS) - $(MAKE) -C $(IPUTILS_PATH) USE_CAP=no USE_IDN=no USE_NETTLE=no USE_CRYPTO=no ping + sed -i /atexit/d $(IPUTILS_PATH)/ping.c + cd $(IPUTILS_PATH) && $(CC) $(CFLAGS) -std=c99 -o $@ ping.c ping_common.c ping6_common.c iputils_common.c -D_GNU_SOURCE -D'IPUTILS_VERSION(f)=f' -lresolv $(LDFLAGS) $(STRIP) -s $@ $(BASH_PATH)/.installed: $(BASH_TAR) @@ -357,7 +358,7 @@ $(IPTABLES_PATH)/.installed: $(IPTABLES_TAR) sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure touch $@ -$(IPTABLES_PATH)/iptables/xtables-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) +$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include $(MAKE) -C $(IPTABLES_PATH) $(STRIP) -s $@ @@ -368,8 +369,9 @@ $(NMAP_PATH)/.installed: $(NMAP_TAR) touch $@ $(NMAP_PATH)/ncat/ncat: | $(NMAP_PATH)/.installed $(USERSPACE_DEPS) - cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux - $(MAKE) -C $(NMAP_PATH) build-ncat + cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux --without-libssh + $(MAKE) -C $(NMAP_PATH)/libpcap + $(MAKE) -C $(NMAP_PATH)/ncat $(STRIP) -s $@ clean: @@ -379,7 +381,7 @@ distclean: clean rm -rf $(DISTFILES_PATH) menuconfig: $(KERNEL_BUILD_PATH)/.config - $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)" menuconfig + $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) menuconfig .PHONY: qemu build clean distclean menuconfig .DELETE_ON_ERROR: diff --git a/tools/testing/selftests/wireguard/qemu/arch/m68k.config b/tools/testing/selftests/wireguard/qemu/arch/m68k.config index 5381ea10896c..62a15bdb877e 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/m68k.config +++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config @@ -1,9 +1,9 @@ CONFIG_MMU=y +CONFIG_M68KCLASSIC=y CONFIG_M68040=y CONFIG_MAC=y CONFIG_SERIAL_PMACZILOG=y CONFIG_SERIAL_PMACZILOG_TTYS=y CONFIG_SERIAL_PMACZILOG_CONSOLE=y -CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c index 51e5ddedee88..90bc9813cadc 100644 --- a/tools/testing/selftests/wireguard/qemu/init.c +++ b/tools/testing/selftests/wireguard/qemu/init.c @@ -21,6 +21,7 @@ #include <sys/reboot.h> #include <sys/utsname.h> #include <sys/sendfile.h> +#include <sys/sysmacros.h> #include <linux/random.h> #include <linux/version.h> diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 9cca30206014..af9323a0b6e0 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -39,6 +39,7 @@ CONFIG_PRINTK=y CONFIG_KALLSYMS=y CONFIG_BUG=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +CONFIG_JUMP_LABEL=y CONFIG_EMBEDDED=n CONFIG_BASE_FULL=y CONFIG_FUTEX=y @@ -55,6 +56,7 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_HZ_PERIODIC=n CONFIG_HIGH_RES_TIMERS=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_ARCH_RANDOM=y CONFIG_FILE_LOCKING=y CONFIG_POSIX_TIMERS=y diff --git a/usr/gen_initramfs_list.sh b/usr/gen_initramfs_list.sh index 0aad760fcd8c..2bbac73e6477 100755 --- a/usr/gen_initramfs_list.sh +++ b/usr/gen_initramfs_list.sh @@ -128,7 +128,7 @@ parse() { str="${ftype} ${name} ${location} ${str}" ;; "nod") - local dev=`LC_ALL=C ls -l "${location}"` + local dev="`LC_ALL=C ls -l "${location}"`" local maj=`field 5 ${dev}` local min=`field 6 ${dev}` maj=${maj%,} |