author     Jens Axboe <axboe@kernel.dk>    2025-03-07 09:07:11 -0700
committer  Jens Axboe <axboe@kernel.dk>    2025-03-07 09:07:11 -0700
commit     78b6f6e9bf3960c5ee3368415a11babb754b9a19
tree       eadb164bc986399e128f241c97598f889e8d0679
parent     94765d71a0833c5d8c4f010f505f17285238bbca
parent     89baa22d75278b69d3a30f86c3f47ac3a3a659e9
Merge branch 'for-6.15/io_uring-rx-zc' into for-6.15/io_uring-reg-vec
* for-6.15/io_uring-rx-zc: (80 commits)
io_uring/zcrx: add selftest case for recvzc with read limit
io_uring/zcrx: add a read limit to recvzc requests
io_uring: add missing IORING_MAP_OFF_ZCRX_REGION in io_uring_mmap
io_uring: Rename KConfig to Kconfig
io_uring/zcrx: fix leaks on failed registration
io_uring/zcrx: recheck ifq on shutdown
io_uring/zcrx: add selftest
net: add documentation for io_uring zcrx
io_uring/zcrx: add copy fallback
io_uring/zcrx: throttle receive requests
io_uring/zcrx: set pp memory provider for an rx queue
io_uring/zcrx: add io_recvzc request
io_uring/zcrx: dma-map area for the device
io_uring/zcrx: implement zerocopy receive pp memory provider
io_uring/zcrx: grab a net device
io_uring/zcrx: add io_zcrx_area
io_uring/zcrx: add interface queue and refill queue
net: add helpers for setting a memory provider on an rx queue
net: page_pool: add memory provider helpers
net: prepare for non devmem TCP memory providers
...
107 files changed, 3554 insertions, 3357 deletions
diff --git a/Documentation/arch/s390/driver-model.rst b/Documentation/arch/s390/driver-model.rst
index ad4bc2dbea43..ad18f129fb0b 100644
--- a/Documentation/arch/s390/driver-model.rst
+++ b/Documentation/arch/s390/driver-model.rst
@@ -244,7 +244,7 @@ information about the interrupt from the irb parameter.
 --------------------
 
 The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
-devices, like lcs or ctc.
+devices, like qeth or ctc.
 
 The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
 this attributes creates a ccwgroup device consisting of these ccw devices (if
diff --git a/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml b/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml
index 9bcbacb6640d..55d6a8379025 100644
--- a/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml
+++ b/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml
@@ -44,6 +44,9 @@ properties:
   phy-mode:
     enum:
       - rgmii
+      - rgmii-id
+      - rgmii-rxid
+      - rgmii-txid
       - rmii
 
   phy-handle: true
diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml
index 9660ffb1ed6a..44f2226160ca 100644
--- a/Documentation/netlink/genetlink-c.yaml
+++ b/Documentation/netlink/genetlink-c.yaml
@@ -14,9 +14,10 @@ $defs:
     pattern: ^[0-9A-Za-z_-]+( - 1)?$
     minimum: 0
   len-or-limit:
-    # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
+    # literal int, const name, or limit based on fixed-width type
+    # e.g. u8-min, u16-max, etc.
     type: [ string, integer ]
-    pattern: ^[su](8|16|32|64)-(min|max)$
+    pattern: ^[0-9A-Za-z_-]+$
     minimum: 0
 
 # Schema for specs
diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml
index 16380e12cabe..ed64acf1bef7 100644
--- a/Documentation/netlink/genetlink-legacy.yaml
+++ b/Documentation/netlink/genetlink-legacy.yaml
@@ -14,9 +14,10 @@ $defs:
     pattern: ^[0-9A-Za-z_-]+( - 1)?$
     minimum: 0
   len-or-limit:
-    # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
+    # literal int, const name, or limit based on fixed-width type
+    # e.g. u8-min, u16-max, etc.
     type: [ string, integer ]
-    pattern: ^[su](8|16|32|64)-(min|max)$
+    pattern: ^[0-9A-Za-z_-]+$
     minimum: 0
 
 # Schema for specs
diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml
index b036227b46f1..e43e50dba2e4 100644
--- a/Documentation/netlink/genetlink.yaml
+++ b/Documentation/netlink/genetlink.yaml
@@ -14,9 +14,10 @@ $defs:
     pattern: ^[0-9A-Za-z_-]+( - 1)?$
     minimum: 0
   len-or-limit:
-    # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
+    # literal int, const name, or limit based on fixed-width type
+    # e.g. u8-min, u16-max, etc.
     type: [ string, integer ]
-    pattern: ^[su](8|16|32|64)-(min|max)$
+    pattern: ^[0-9A-Za-z_-]+$
     minimum: 0
 
 # Schema for specs
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index cbb544bd6c84..288923e965ae 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -115,6 +115,9 @@ attribute-sets:
         type: u64
         enum: xsk-flags
   -
+    name: io-uring-provider-info
+    attributes: []
+  -
     name: page-pool
     attributes:
       -
@@ -171,6 +174,11 @@ attribute-sets:
         name: dmabuf
         doc: ID of the dmabuf this page-pool is attached to.
         type: u32
+      -
+        name: io-uring
+        doc: io-uring memory provider information.
+        type: nest
+        nested-attributes: io-uring-provider-info
   -
     name: page-pool-info
     subset-of: page-pool
@@ -296,6 +304,11 @@ attribute-sets:
         name: dmabuf
         doc: ID of the dmabuf attached to this queue, if any.
         type: u32
+      -
+        name: io-uring
+        doc: io_uring memory provider information.
+        type: nest
+        nested-attributes: io-uring-provider-info
 
   -
     name: qstats
@@ -572,6 +585,7 @@ operations:
             - inflight-mem
             - detach-time
             - dmabuf
+            - io-uring
       dump:
         reply: *pp-reply
       config-cond: page-pool
@@ -637,6 +651,7 @@ operations:
            - napi-id
            - ifindex
            - dmabuf
+           - io-uring
       dump:
         request:
           attributes:
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 058193ed2eeb..c64133d309bf 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -63,6 +63,7 @@ Contents:
    gtp
    ila
    ioam6-sysctl
+   iou-zcrx
    ip_dynaddr
    ipsec
    ip-sysctl
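The new Documentation/networking/iou-zcrx.rst added by the next hunk walks through the zero copy Rx API in separate fragments. As a single point of reference, here is a minimal end-to-end sketch assembled from those fragments. It is a sketch only: error handling, socket setup, and the out-of-band ethtool configuration are omitted; ``eth0``, queue 1, and the area size are placeholder assumptions; and it presumes a liburing build with zcrx support providing ``io_uring_register_ifq()`` and the ``io_uring_zcrx_*`` types.

    /* Sketch of the zcrx flow per Documentation/networking/iou-zcrx.rst.
     * The io_uring instance is assumed to have been created with
     * IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
     * IORING_SETUP_CQE32, and fd is a connected socket whose flow was
     * steered to HW Rx queue 1.
     */
    #include <net/if.h>
    #include <sys/mman.h>
    #include <liburing.h>

    #define AREA_SIZE (16UL * 1024 * 1024) /* assumed payload area size */
    #define PAGE_SZ   4096                 /* stands in for PAGE_SIZE */

    static void zcrx_flow(struct io_uring *ring, int fd)
    {
        size_t rq_entries = 4096;
        size_t ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe) + PAGE_SZ;

        /* Refill ring size, rounded up to a page boundary. */
        ring_size = (ring_size + PAGE_SZ - 1) & ~((size_t)PAGE_SZ - 1);

        /* Payload area and shared refill ring: plain anonymous memory. */
        void *area_ptr = mmap(NULL, AREA_SIZE, PROT_READ | PROT_WRITE,
                              MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
        void *ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                              MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);

        struct io_uring_zcrx_area_reg area_reg = {
            .addr = (__u64)(unsigned long)area_ptr,
            .len = AREA_SIZE,
        };
        struct io_uring_region_desc region_reg = {
            .user_addr = (__u64)(unsigned long)ring_ptr,
            .size = ring_size,
            .flags = IORING_MEM_REGION_TYPE_USER,
        };
        struct io_uring_zcrx_ifq_reg reg = {
            .if_idx = if_nametoindex("eth0"),
            .if_rxq = 1, /* HW queue the desired flow is steered into */
            .rq_entries = rq_entries,
            .area_ptr = (__u64)(unsigned long)&area_reg,
            .region_ptr = (__u64)(unsigned long)&region_reg,
        };
        io_uring_register_ifq(ring, &reg);

        /* Map the refill ring fields the kernel filled in at registration. */
        struct io_uring_zcrx_rq rq = {
            .khead = (unsigned *)((char *)ring_ptr + reg.offsets.head),
            .ktail = (unsigned *)((char *)ring_ptr + reg.offsets.tail),
            .rqes = (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes),
            .rq_tail = 0,
            .ring_ptr = ring_ptr,
        };

        /* One multishot zero copy receive; payloads land in area_ptr. */
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0);
        sqe->ioprio |= IORING_RECV_MULTISHOT;
        io_uring_submit_and_wait(ring, 1);

        /* Consume completions, then recycle each buffer to the kernel. */
        struct io_uring_cqe *cqe;
        unsigned head, count = 0;

        io_uring_for_each_cqe(ring, head, cqe) {
            struct io_uring_zcrx_cqe *rcqe = (void *)(cqe + 1);
            unsigned long mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1;
            unsigned char *data = (unsigned char *)area_ptr + (rcqe->off & mask);

            (void)data; /* process cqe->res bytes at data */

            struct io_uring_zcrx_rqe *rqe =
                &rq.rqes[rq.rq_tail & (rq_entries - 1)];
            rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_reg.rq_area_token;
            rqe->len = cqe->res;
            IO_URING_WRITE_ONCE(*rq.ktail, ++rq.rq_tail);
            count++;
        }
        io_uring_cq_advance(ring, count);
    }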
diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst
new file mode 100644
index 000000000000..0127319b30bb
--- /dev/null
+++ b/Documentation/networking/iou-zcrx.rst
@@ -0,0 +1,202 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+io_uring zero copy Rx
+=====================
+
+Introduction
+============
+
+io_uring zero copy Rx (ZC Rx) is a feature that removes kernel-to-user copy on
+the network receive path, allowing packet data to be received directly into
+userspace memory. This feature is different from TCP_ZEROCOPY_RECEIVE in that
+there are no strict alignment requirements and no need to mmap()/munmap().
+Compared to kernel bypass solutions such as DPDK, the packet headers are
+processed by the kernel TCP stack as normal.
+
+NIC HW Requirements
+===================
+
+Several NIC HW features are required for io_uring ZC Rx to work. For now, the
+kernel API does not configure the NIC, so this must be done by the user.
+
+Header/data split
+-----------------
+
+Required to split packets at the L4 boundary into a header and a payload.
+Headers are received into kernel memory as normal and processed by the TCP
+stack as normal. Payloads are received into userspace memory directly.
+
+Flow steering
+-------------
+
+Specific HW Rx queues are configured for this feature, but modern NICs
+typically distribute flows across all HW Rx queues. Flow steering is required
+to ensure that only desired flows are directed towards HW queues that are
+configured for io_uring ZC Rx.
+
+RSS
+---
+
+In addition to flow steering above, RSS is required to steer all other non-zero
+copy flows away from queues that are configured for io_uring ZC Rx.
+
+Usage
+=====
+
+Setup NIC
+---------
+
+Must be done out of band for now.
+
+Ensure there are at least two queues::
+
+  ethtool -L eth0 combined 2
+
+Enable header/data split::
+
+  ethtool -G eth0 tcp-data-split on
+
+Carve out half of the HW Rx queues for zero copy using RSS::
+
+  ethtool -X eth0 equal 1
+
+Set up flow steering, bearing in mind that queues are 0-indexed::
+
+  ethtool -N eth0 flow-type tcp6 ... action 1
+
+Setup io_uring
+--------------
+
+This section describes the low level io_uring kernel API. Please refer to
+liburing documentation for how to use the higher level API.
+
+Create an io_uring instance with the following required setup flags::
+
+  IORING_SETUP_SINGLE_ISSUER
+  IORING_SETUP_DEFER_TASKRUN
+  IORING_SETUP_CQE32
+
+Create memory area
+------------------
+
+Allocate userspace memory area for receiving zero copy data::
+
+  void *area_ptr = mmap(NULL, area_size,
+                        PROT_READ | PROT_WRITE,
+                        MAP_ANONYMOUS | MAP_PRIVATE,
+                        0, 0);
+
+Create refill ring
+------------------
+
+Allocate memory for a shared ringbuf used for returning consumed buffers::
+
+  void *ring_ptr = mmap(NULL, ring_size,
+                        PROT_READ | PROT_WRITE,
+                        MAP_ANONYMOUS | MAP_PRIVATE,
+                        0, 0);
+
+This refill ring consists of some space for the header, followed by an array of
+``struct io_uring_zcrx_rqe``::
+
+  size_t rq_entries = 4096;
+  size_t ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe) + PAGE_SIZE;
+  /* align to page size */
+  ring_size = (ring_size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+
+Register ZC Rx
+--------------
+
+Fill in registration structs::
+
+  struct io_uring_zcrx_area_reg area_reg = {
+    .addr = (__u64)(unsigned long)area_ptr,
+    .len = area_size,
+    .flags = 0,
+  };
+
+  struct io_uring_region_desc region_reg = {
+    .user_addr = (__u64)(unsigned long)ring_ptr,
+    .size = ring_size,
+    .flags = IORING_MEM_REGION_TYPE_USER,
+  };
+
+  struct io_uring_zcrx_ifq_reg reg = {
+    .if_idx = if_nametoindex("eth0"),
+    /* this is the HW queue with desired flow steered into it */
+    .if_rxq = 1,
+    .rq_entries = rq_entries,
+    .area_ptr = (__u64)(unsigned long)&area_reg,
+    .region_ptr = (__u64)(unsigned long)&region_reg,
+  };
+
+Register with kernel::
+
+  io_uring_register_ifq(ring, &reg);
+
+Map refill ring
+---------------
+
+The kernel fills in fields for the refill ring in the registration ``struct
+io_uring_zcrx_ifq_reg``. Map it into userspace::
+
+  struct io_uring_zcrx_rq refill_ring;
+
+  refill_ring.khead = (unsigned *)((char *)ring_ptr + reg.offsets.head);
+  refill_ring.ktail = (unsigned *)((char *)ring_ptr + reg.offsets.tail);
+  refill_ring.rqes =
+    (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes);
+  refill_ring.rq_tail = 0;
+  refill_ring.ring_ptr = ring_ptr;
+
+Receiving data
+--------------
+
+Prepare a zero copy recv request::
+
+  struct io_uring_sqe *sqe;
+
+  sqe = io_uring_get_sqe(ring);
+  io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0);
+  sqe->ioprio |= IORING_RECV_MULTISHOT;
+
+Now, submit and wait::
+
+  io_uring_submit_and_wait(ring, 1);
+
+Finally, process completions::
+
+  struct io_uring_cqe *cqe;
+  unsigned int count = 0;
+  unsigned int head;
+
+  io_uring_for_each_cqe(ring, head, cqe) {
+    struct io_uring_zcrx_cqe *rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
+
+    unsigned long mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1;
+    unsigned char *data = area_ptr + (rcqe->off & mask);
+    /* do something with the data */
+
+    count++;
+  }
+  io_uring_cq_advance(ring, count);
+
+Recycling buffers
+-----------------
+
+Return buffers back to the kernel to be used again::
+
+  struct io_uring_zcrx_rqe *rqe;
+  unsigned mask = refill_ring.ring_entries - 1;
+  rqe = &refill_ring.rqes[refill_ring.rq_tail & mask];
+
+  unsigned long area_offset = rcqe->off & ~IORING_ZCRX_AREA_MASK;
+  rqe->off = area_offset | area_reg.rq_area_token;
+  rqe->len = cqe->res;
+  IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail);
+
+Testing
+=======
+
+See ``tools/testing/selftests/drivers/net/hw/iou-zcrx.c``
diff --git a/Kconfig b/Kconfig
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,5 @@ source "lib/Kconfig"
 source "lib/Kconfig.debug"
 
 source "Documentation/Kconfig"
+
+source "io_uring/Kconfig"
diff --git a/arch/s390/include/asm/irq.h
b/arch/s390/include/asm/irq.h index d9e705f4a697..bde6a496df5f 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -54,7 +54,6 @@ enum interruption_class { IRQIO_C70, IRQIO_TAP, IRQIO_VMR, - IRQIO_LCS, IRQIO_CTC, IRQIO_ADM, IRQIO_CSC, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index ef7be599e1f7..7ca157ffab30 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -84,7 +84,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, - {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..f6d0628a36d9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -432,9 +432,6 @@ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs) struct bonding *bond; struct slave *slave; - if (!bond_dev) - return NULL; - bond = netdev_priv(bond_dev); if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) return NULL; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h index f5901f8e3907..f6b990b7f5b4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h @@ -226,7 +226,6 @@ struct __packed offload_info { struct offload_port_info ports; struct offload_ka_info kas; struct offload_rr_info rrs; - u8 buf[]; }; struct __packed hw_atl_utils_fw_rpc { diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c index 6b6cb73482d7..1753bb87dfbd 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c @@ -1433,22 +1433,6 @@ int octeon_wait_for_ddr_init(struct octeon_device *oct, u32 *timeout) } EXPORT_SYMBOL_GPL(octeon_wait_for_ddr_init); -/* Get the octeon id assigned to the octeon device passed as argument. - * This function is exported to other modules. - * @param dev - octeon device pointer passed as a void *. - * @return octeon device id - */ -int lio_get_device_id(void *dev) -{ - struct octeon_device *octeon_dev = (struct octeon_device *)dev; - u32 i; - - for (i = 0; i < MAX_OCTEON_DEVICES; i++) - if (octeon_device[i] == octeon_dev) - return octeon_dev->octeon_id; - return -1; -} - void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq) { u64 instr_cnt; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h index d26364c2ac81..19344b21f8fb 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h @@ -705,13 +705,6 @@ octeon_get_dispatch(struct octeon_device *octeon_dev, u16 opcode, */ struct octeon_device *lio_get_device(u32 octeon_id); -/** Get the octeon id assigned to the octeon device passed as argument. - * This function is exported to other modules. - * @param dev - octeon device pointer passed as a void *. - * @return octeon device id - */ -int lio_get_device_id(void *dev); - /** Read windowed register. 
* @param oct - pointer to the Octeon device. * @param addr - Address of the register to read. diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index c7c2c15a1815..95e6f015a6af 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1211,9 +1211,6 @@ struct adapter { struct timer_list flower_stats_timer; struct work_struct flower_stats_work; - /* Ethtool Dump */ - struct ethtool_dump eth_dump; - /* HMA */ struct hma_data hma; @@ -1233,6 +1230,10 @@ struct adapter { /* Ethtool n-tuple */ struct cxgb4_ethtool_filter *ethtool_filters; + + /* Ethtool Dump */ + /* Must be last - ends in a flex-array member. */ + struct ethtool_dump eth_dump; }; /* Support for "sched-class" command to allow a TX Scheduling Class to be diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index b330020dc0d6..598df63518c5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -526,28 +526,6 @@ out: return res; } -u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count) -{ - struct mlx4_zone_entry *zone; - int res = 0; - - spin_lock(&zones->lock); - - zone = __mlx4_find_zone_by_uid(zones, uid); - - if (NULL == zone) { - res = -1; - goto out; - } - - __mlx4_free_from_zone(zone, obj, count); - -out: - spin_unlock(&zones->lock); - - return res; -} - u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count) { struct mlx4_zone_entry *zone; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index d7d856d1758a..b213094ea30f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1478,12 +1478,6 @@ void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc); u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count, int align, u32 skip_mask, u32 *puid); -/* Free <count> objects, start from <obj> of the uid <uid> from zone_allocator - * <zones>. - */ -u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, - u32 uid, u32 obj, u32 count); - /* If <zones> was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of * specifying the uid when freeing an object, zone allocator could figure it by * itself. Other parameters are similar to mlx4_zone_free. 
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 4e43f4a7d246..e3d0b13c1610 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -147,26 +147,6 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, return err; } -int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx) -{ - struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; - struct mlx4_mac_table *table = &info->mac_table; - int i; - - for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { - if (!table->refs[i]) - continue; - - if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { - *idx = i; - return 0; - } - } - - return -ENOENT; -} -EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); - static bool mlx4_need_mf_bond(struct mlx4_dev *dev) { int i, num_eth_ports = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index 5f6a0605e4ae..f62fbfb67a1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -296,11 +296,16 @@ enum mlx5e_fec_supported_link_mode { MLX5E_FEC_SUPPORTED_LINK_MODE_200G_2X, MLX5E_FEC_SUPPORTED_LINK_MODE_400G_4X, MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X, + MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X, + MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X, + MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X, + MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X, MLX5E_MAX_FEC_SUPPORTED_LINK_MODE, }; #define MLX5E_FEC_FIRST_50G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X #define MLX5E_FEC_FIRST_100G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X +#define MLX5E_FEC_FIRST_200G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X #define MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, policy, write, link) \ do { \ @@ -320,8 +325,10 @@ static bool mlx5e_is_fec_supported_link_mode(struct mlx5_core_dev *dev, return link_mode < MLX5E_FEC_FIRST_50G_PER_LANE_MODE || (link_mode < MLX5E_FEC_FIRST_100G_PER_LANE_MODE && MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm)) || - (link_mode >= MLX5E_FEC_FIRST_100G_PER_LANE_MODE && - MLX5_CAP_PCAM_FEATURE(dev, fec_100G_per_lane_in_pplm)); + (link_mode < MLX5E_FEC_FIRST_200G_PER_LANE_MODE && + MLX5_CAP_PCAM_FEATURE(dev, fec_100G_per_lane_in_pplm)) || + (link_mode >= MLX5E_FEC_FIRST_200G_PER_LANE_MODE && + MLX5_CAP_PCAM_FEATURE(dev, fec_200G_per_lane_in_pplm)); } /* get/set FEC admin field for a given speed */ @@ -368,6 +375,18 @@ static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write, case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X: MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 800g_8x); break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 200g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 800g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 1600g_8x); + break; default: return -EINVAL; } @@ -421,6 +440,18 @@ static int mlx5e_get_fec_cap_field(u32 *pplm, u16 *fec_cap, case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X: *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 800g_8x); break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 200g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X: 
+ *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 400g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 800g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 1600g_8x); + break; default: return -EINVAL; } @@ -494,6 +525,26 @@ out: return 0; } +static u16 mlx5e_remap_fec_conf_mode(enum mlx5e_fec_supported_link_mode link_mode, + u16 conf_fec) +{ + /* RS fec in ethtool is originally mapped to MLX5E_FEC_RS_528_514. + * For link modes up to 25G per lane, the value is kept. + * For 50G or 100G per lane, it's remapped to MLX5E_FEC_RS_544_514. + * For 200G per lane, remapped to MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD. + */ + if (conf_fec != BIT(MLX5E_FEC_RS_528_514)) + return conf_fec; + + if (link_mode >= MLX5E_FEC_FIRST_200G_PER_LANE_MODE) + return BIT(MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD); + + if (link_mode >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE) + return BIT(MLX5E_FEC_RS_544_514); + + return conf_fec; +} + int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy) { bool fec_50g_per_lane = MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm); @@ -530,14 +581,7 @@ int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy) if (!mlx5e_is_fec_supported_link_mode(dev, i)) break; - /* RS fec in ethtool is mapped to MLX5E_FEC_RS_528_514 - * to link modes up to 25G per lane and to - * MLX5E_FEC_RS_544_514 in the new link modes based on - * 50G or 100G per lane - */ - if (conf_fec == (1 << MLX5E_FEC_RS_528_514) && - i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE) - conf_fec = (1 << MLX5E_FEC_RS_544_514); + conf_fec = mlx5e_remap_fec_conf_mode(i, conf_fec); mlx5e_get_fec_cap_field(out, &fec_caps, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h index d1da225f35da..fa2283dd383b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h @@ -61,6 +61,7 @@ enum { MLX5E_FEC_NOFEC, MLX5E_FEC_FIRECODE, MLX5E_FEC_RS_528_514, + MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD = 4, MLX5E_FEC_RS_544_514 = 7, MLX5E_FEC_LLRS_272_257_1 = 9, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index afd654583b6b..131ed97ca997 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -326,7 +326,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, int node; sq->pdev = c->pdev; - sq->clock = &mdev->clock; + sq->clock = mdev->clock; sq->mkey_be = c->mkey_be; sq->netdev = c->netdev; sq->priv = c->priv; @@ -696,7 +696,7 @@ static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, rq->pdev = c->pdev; rq->netdev = priv->netdev; rq->priv = priv; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->tstamp = &priv->tstamp; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index d6c12d0ea55b..2e528b2c34d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -73,11 +73,6 @@ struct mlx5e_tc_act { bool is_terminating_action; }; -struct mlx5e_tc_flow_action { - unsigned int num_entries; - struct flow_action_entry **entries; -}; - extern struct mlx5e_tc_act mlx5e_tc_act_drop; extern struct mlx5e_tc_act 
mlx5e_tc_act_trap; extern struct mlx5e_tc_act mlx5e_tc_act_accept; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index 53ca16cb9c41..140606fcd23b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -46,7 +46,7 @@ static void mlx5e_init_trap_rq(struct mlx5e_trap *t, struct mlx5e_params *params rq->pdev = t->pdev; rq->netdev = priv->netdev; rq->priv = priv; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->tstamp = &priv->tstamp; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 94b291662087..3cc4d55613bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -289,9 +289,9 @@ static u64 mlx5e_xsk_fill_timestamp(void *_priv) ts = get_cqe_ts(priv->cqe); if (mlx5_is_real_time_rq(priv->cq->mdev) || mlx5_is_real_time_sq(priv->cq->mdev)) - return mlx5_real_time_cyc2time(&priv->cq->mdev->clock, ts); + return mlx5_real_time_cyc2time(priv->cq->mdev->clock, ts); - return mlx5_timecounter_cyc2time(&priv->cq->mdev->clock, ts); + return mlx5_timecounter_cyc2time(priv->cq->mdev->clock, ts); } static void mlx5e_xsk_request_checksum(u16 csum_start, u16 csum_offset, void *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 9240cfe25d10..d743e823362a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -72,7 +72,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, rq->netdev = c->netdev; rq->priv = c->priv; rq->tstamp = c->tstamp; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; rq->channel = c; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index cae39198b4db..f9113cb13a0c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -237,6 +237,27 @@ void mlx5e_build_ptys2ethtool_map(void) ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT, ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT, ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_200GAUI_1_200GBASE_CR1_KR1, ext, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_400GAUI_2_400GBASE_CR2_KR2, ext, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_800GAUI_4_800GBASE_CR4_KR4, ext, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT); } static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev, @@ -931,6 +952,7 @@ static const u32 
pplm_fec_2_ethtool[] = { [MLX5E_FEC_RS_528_514] = ETHTOOL_FEC_RS, [MLX5E_FEC_RS_544_514] = ETHTOOL_FEC_RS, [MLX5E_FEC_LLRS_272_257_1] = ETHTOOL_FEC_LLRS, + [MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD] = ETHTOOL_FEC_RS, }; static u32 pplm2ethtool_fec(u_long fec_mode, unsigned long size) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a814b63ed97e..2fdc86432ac0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -737,7 +737,7 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param rq->netdev = c->netdev; rq->priv = c->priv; rq->tstamp = c->tstamp; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; rq->channel = c; @@ -1614,7 +1614,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, int err; sq->pdev = c->pdev; - sq->clock = &mdev->clock; + sq->clock = mdev->clock; sq->mkey_be = c->mkey_be; sq->netdev = c->netdev; sq->mdev = c->mdev; @@ -3816,8 +3816,11 @@ static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, /* MQPRIO is another toplevel qdisc that can't be attached * simultaneously with the offloaded HTB. */ - if (WARN_ON(mlx5e_selq_is_htb_enabled(&priv->selq))) - return -EINVAL; + if (mlx5e_selq_is_htb_enabled(&priv->selq)) { + NL_SET_ERR_MSG_MOD(mqprio->extack, + "MQPRIO cannot be configured when HTB offload is enabled."); + return -EOPNOTSUPP; + } switch (mqprio->mode) { case TC_MQPRIO_MODE_DCB: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index bde79cac33a9..d832a12ffec0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -97,7 +97,7 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, mlx5_del_flow_rules(lag_definer->rules[idx]); } j = ldev->buckets; - }; + } goto destroy_fg; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index d61a1a9297c9..65a94e46edcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -43,6 +43,8 @@ #include <linux/cpufeature.h> #endif /* CONFIG_X86 */ +#define MLX5_RT_CLOCK_IDENTITY_SIZE MLX5_FLD_SZ_BYTES(mrtcq_reg, rt_clock_identity) + enum { MLX5_PIN_MODE_IN = 0x0, MLX5_PIN_MODE_OUT = 0x1, @@ -77,6 +79,56 @@ enum { MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX = 200000, }; +struct mlx5_clock_dev_state { + struct mlx5_core_dev *mdev; + struct mlx5_devcom_comp_dev *compdev; + struct mlx5_nb pps_nb; + struct work_struct out_work; +}; + +struct mlx5_clock_priv { + struct mlx5_clock clock; + struct mlx5_core_dev *mdev; + struct mutex lock; /* protect mdev and used in PTP callbacks */ + struct mlx5_core_dev *event_mdev; +}; + +static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock) +{ + return container_of(clock, struct mlx5_clock_priv, clock); +} + +static void mlx5_clock_lockdep_assert(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + lockdep_assert(lockdep_is_held(&clock_priv(clock)->lock)); +} + +static struct mlx5_core_dev *mlx5_clock_mdev_get(struct mlx5_clock *clock) +{ + mlx5_clock_lockdep_assert(clock); + + return clock_priv(clock)->mdev; +} + +static void mlx5_clock_lock(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + mutex_lock(&clock_priv(clock)->lock); +} + +static void 
mlx5_clock_unlock(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + mutex_unlock(&clock_priv(clock)->lock); +} + static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) { return (mlx5_is_real_time_rq(mdev) || mlx5_is_real_time_sq(mdev)); @@ -94,6 +146,22 @@ static bool mlx5_modify_mtutc_allowed(struct mlx5_core_dev *mdev) return MLX5_CAP_MCAM_FEATURE(mdev, ptpcyc2realtime_modify); } +static int mlx5_clock_identity_get(struct mlx5_core_dev *mdev, + u8 identify[MLX5_RT_CLOCK_IDENTITY_SIZE]) +{ + u32 out[MLX5_ST_SZ_DW(mrtcq_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mrtcq_reg)] = {}; + int err; + + err = mlx5_core_access_reg(mdev, in, sizeof(in), + out, sizeof(out), MLX5_REG_MRTCQ, 0, 0); + if (!err) + memcpy(identify, MLX5_ADDR_OF(mrtcq_reg, out, rt_clock_identity), + MLX5_RT_CLOCK_IDENTITY_SIZE); + + return err; +} + static u32 mlx5_ptp_shift_constant(u32 dev_freq_khz) { /* Optimal shift constant leads to corrections above just 1 scaled ppm. @@ -119,21 +187,30 @@ static u32 mlx5_ptp_shift_constant(u32 dev_freq_khz) ilog2((U32_MAX / NSEC_PER_MSEC) * dev_freq_khz)); } +static s32 mlx5_clock_getmaxphase(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_MCAM_FEATURE(mdev, mtutc_time_adjustment_extended_range) ? + MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX : + MLX5_MTUTC_OPERATION_ADJUST_TIME_MAX; +} + static s32 mlx5_ptp_getmaxphase(struct ptp_clock_info *ptp) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; + s32 ret; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + ret = mlx5_clock_getmaxphase(mdev); + mlx5_clock_unlock(clock); - return MLX5_CAP_MCAM_FEATURE(mdev, mtutc_time_adjustment_extended_range) ? - MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX : - MLX5_MTUTC_OPERATION_ADJUST_TIME_MAX; + return ret; } static bool mlx5_is_mtutc_time_adj_cap(struct mlx5_core_dev *mdev, s64 delta) { - s64 max = mlx5_ptp_getmaxphase(&mdev->clock.ptp_info); + s64 max = mlx5_clock_getmaxphase(mdev); if (delta < -max || delta > max) return false; @@ -209,7 +286,7 @@ static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, if (real_time_mode) *device_time = ns_to_ktime(REAL_TIME_TO_NS(device >> 32, device & U32_MAX)); else - *device_time = mlx5_timecounter_cyc2time(&mdev->clock, device); + *device_time = mlx5_timecounter_cyc2time(mdev->clock, device); return 0; } @@ -220,16 +297,23 @@ static int mlx5_ptp_getcrosststamp(struct ptp_clock_info *ptp, struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct system_time_snapshot history_begin = {0}; struct mlx5_core_dev *mdev; + int err; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); - if (!mlx5_is_ptm_source_time_available(mdev)) - return -EBUSY; + if (!mlx5_is_ptm_source_time_available(mdev)) { + err = -EBUSY; + goto unlock; + } ktime_get_snapshot(&history_begin); - return get_device_system_crosststamp(mlx5_mtctr_syncdevicetime, mdev, - &history_begin, cts); + err = get_device_system_crosststamp(mlx5_mtctr_syncdevicetime, mdev, + &history_begin, cts); +unlock: + mlx5_clock_unlock(clock); + return err; } #endif /* CONFIG_X86 */ @@ -263,8 +347,7 @@ static u64 read_internal_timer(const struct cyclecounter *cc) { struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles); struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer); - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, 
- clock); + struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); return mlx5_read_time(mdev, NULL, false) & cc->mask; } @@ -272,7 +355,7 @@ static u64 read_internal_timer(const struct cyclecounter *cc) static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) { struct mlx5_ib_clock_info *clock_info = mdev->clock_info; - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_timer *timer; u32 sign; @@ -295,12 +378,10 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) static void mlx5_pps_out(struct work_struct *work) { - struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, - out_work); - struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, - pps_info); - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, - clock); + struct mlx5_clock_dev_state *clock_state = container_of(work, struct mlx5_clock_dev_state, + out_work); + struct mlx5_core_dev *mdev = clock_state->mdev; + struct mlx5_clock *clock = mdev->clock; u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; unsigned long flags; int i; @@ -330,7 +411,8 @@ static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) unsigned long flags; clock = container_of(ptp_info, struct mlx5_clock, ptp_info); - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); timer = &clock->timer; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -342,6 +424,7 @@ static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) write_sequnlock_irqrestore(&clock->lock, flags); out: + mlx5_clock_unlock(clock); return timer->overflow_period; } @@ -361,15 +444,12 @@ static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, return mlx5_set_mtutc(mdev, in, sizeof(in)); } -static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) +static int mlx5_clock_settime(struct mlx5_core_dev *mdev, struct mlx5_clock *clock, + const struct timespec64 *ts) { - struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_timer *timer = &clock->timer; - struct mlx5_core_dev *mdev; unsigned long flags; - mdev = container_of(clock, struct mlx5_core_dev, clock); - if (mlx5_modify_mtutc_allowed(mdev)) { int err = mlx5_ptp_settime_real_time(mdev, ts); @@ -385,6 +465,20 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 return 0; } +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev; + int err; + + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + err = mlx5_clock_settime(mdev, clock, ts); + mlx5_clock_unlock(clock); + + return err; +} + static struct timespec64 mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, struct ptp_system_timestamp *sts) @@ -404,7 +498,8 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct mlx5_core_dev *mdev; u64 cycles, ns; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_real_time_mode(mdev)) { *ts = mlx5_ptp_gettimex_real_time(mdev, sts); goto out; @@ -414,6 +509,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, ns = mlx5_timecounter_cyc2time(clock, cycles); *ts = ns_to_timespec64(ns); out: + mlx5_clock_unlock(clock); return 0; } @@ -444,14 +540,16 @@ static int 
mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err = 0; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { - int err = mlx5_ptp_adjtime_real_time(mdev, delta); + err = mlx5_ptp_adjtime_real_time(mdev, delta); if (err) - return err; + goto unlock; } write_seqlock_irqsave(&clock->lock, flags); @@ -459,17 +557,23 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); - return 0; +unlock: + mlx5_clock_unlock(clock); + return err; } static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; + int err; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + err = mlx5_ptp_adjtime_real_time(mdev, delta); + mlx5_clock_unlock(clock); - return mlx5_ptp_adjtime_real_time(mdev, delta); + return err; } static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) @@ -498,15 +602,17 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err = 0; u32 mult; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { - int err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); + err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); if (err) - return err; + goto unlock; } mult = (u32)adjust_by_scaled_ppm(timer->nominal_c_mult, scaled_ppm); @@ -518,7 +624,9 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) write_sequnlock_irqrestore(&clock->lock, flags); ptp_schedule_worker(clock->ptp, timer->overflow_period); - return 0; +unlock: + mlx5_clock_unlock(clock); + return err; } static int mlx5_extts_configure(struct ptp_clock_info *ptp, @@ -527,18 +635,14 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = - container_of(clock, struct mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + struct mlx5_core_dev *mdev; u32 field_select = 0; u8 pin_mode = 0; u8 pattern = 0; int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(mdev)) - return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | PTP_RISING_EDGE | @@ -569,6 +673,14 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, field_select = MLX5_MTPPS_FS_ENABLE; } + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + + if (!MLX5_PPS_CAP(mdev)) { + err = -EOPNOTSUPP; + goto unlock; + } + MLX5_SET(mtpps_reg, in, pin, pin); MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); MLX5_SET(mtpps_reg, in, pattern, pattern); @@ -577,15 +689,23 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) - return err; + goto unlock; + + err = mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); + if (err) + goto unlock; + + clock->pps_info.pin_armed[pin] = on; + clock_priv(clock)->event_mdev = mdev; - return mlx5_set_mtppse(mdev, pin, 0, - MLX5_EVENT_MODE_REPETETIVE & on); 
+unlock: + mlx5_clock_unlock(clock); + return err; } static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; u64 cycles_now, cycles_delta; u64 nsec_now, nsec_delta; struct mlx5_timer *timer; @@ -644,7 +764,7 @@ static int mlx5_perout_conf_out_pulse_duration(struct mlx5_core_dev *mdev, struct ptp_clock_request *rq, u32 *out_pulse_duration_ns) { - struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct mlx5_pps *pps_info = &mdev->clock->pps_info; u32 out_pulse_duration; struct timespec64 ts; @@ -677,7 +797,7 @@ static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clo u32 *field_select, u32 *out_pulse_duration_ns, u64 *period, u64 *time_stamp) { - struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct mlx5_pps *pps_info = &mdev->clock->pps_info; struct ptp_clock_time *time = &rq->perout.start; struct timespec64 ts; @@ -712,26 +832,18 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = - container_of(clock, struct mlx5_core_dev, clock); - bool rt_mode = mlx5_real_time_mode(mdev); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 out_pulse_duration_ns = 0; + struct mlx5_core_dev *mdev; u32 field_select = 0; u64 npps_period = 0; u64 time_stamp = 0; u8 pin_mode = 0; u8 pattern = 0; + bool rt_mode; int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(mdev)) - return -EOPNOTSUPP; - - /* Reject requests with unsupported flags */ - if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) - return -EOPNOTSUPP; - if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; @@ -740,14 +852,29 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (pin < 0) return -EBUSY; - if (on) { - bool rt_mode = mlx5_real_time_mode(mdev); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + rt_mode = mlx5_real_time_mode(mdev); + + if (!MLX5_PPS_CAP(mdev)) { + err = -EOPNOTSUPP; + goto unlock; + } + + /* Reject requests with unsupported flags */ + if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) { + err = -EOPNOTSUPP; + goto unlock; + } + if (on) { pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; - if (rt_mode && rq->perout.start.sec > U32_MAX) - return -EINVAL; + if (rt_mode && rq->perout.start.sec > U32_MAX) { + err = -EINVAL; + goto unlock; + } field_select |= MLX5_MTPPS_FS_PIN_MODE | MLX5_MTPPS_FS_PATTERN | @@ -760,7 +887,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, else err = perout_conf_1pps(mdev, rq, &time_stamp, rt_mode); if (err) - return err; + goto unlock; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -773,13 +900,16 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, out_pulse_duration_ns, out_pulse_duration_ns); err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) - return err; + goto unlock; if (rt_mode) - return 0; + goto unlock; + + err = mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); - return mlx5_set_mtppse(mdev, pin, 0, - MLX5_EVENT_MODE_REPETETIVE & on); +unlock: + mlx5_clock_unlock(clock); + return err; } static int mlx5_pps_configure(struct ptp_clock_info *ptp, @@ -866,10 +996,8 @@ static int mlx5_query_mtpps_pin_mode(struct mlx5_core_dev *mdev, u8 pin, mtpps_size, MLX5_REG_MTPPS, 0, 0); } -static int mlx5_get_pps_pin_mode(struct mlx5_clock *clock, u8 pin) +static int mlx5_get_pps_pin_mode(struct mlx5_core_dev *mdev, u8 pin) { - struct 
mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); - u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {}; u8 mode; int err; @@ -888,8 +1016,9 @@ static int mlx5_get_pps_pin_mode(struct mlx5_clock *clock, u8 pin) return PTP_PF_NONE; } -static void mlx5_init_pin_config(struct mlx5_clock *clock) +static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) { + struct mlx5_clock *clock = mdev->clock; int i; if (!clock->ptp_info.n_pins) @@ -910,15 +1039,15 @@ static void mlx5_init_pin_config(struct mlx5_clock *clock) sizeof(clock->ptp_info.pin_config[i].name), "mlx5_pps%d", i); clock->ptp_info.pin_config[i].index = i; - clock->ptp_info.pin_config[i].func = mlx5_get_pps_pin_mode(clock, i); + clock->ptp_info.pin_config[i].func = mlx5_get_pps_pin_mode(mdev, i); clock->ptp_info.pin_config[i].chan = 0; } } static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + struct mlx5_clock *clock = mdev->clock; mlx5_query_mtpps(mdev, out, sizeof(out)); @@ -968,16 +1097,16 @@ static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, static int mlx5_pps_event(struct notifier_block *nb, unsigned long type, void *data) { - struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb); + struct mlx5_clock_dev_state *clock_state = mlx5_nb_cof(nb, struct mlx5_clock_dev_state, + pps_nb); + struct mlx5_core_dev *mdev = clock_state->mdev; + struct mlx5_clock *clock = mdev->clock; struct ptp_clock_event ptp_event; struct mlx5_eqe *eqe = data; int pin = eqe->data.pps.pin; - struct mlx5_core_dev *mdev; unsigned long flags; u64 ns; - mdev = container_of(clock, struct mlx5_core_dev, clock); - switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: ptp_event.index = pin; @@ -997,11 +1126,15 @@ static int mlx5_pps_event(struct notifier_block *nb, ptp_clock_event(clock->ptp, &ptp_event); break; case PTP_PF_PEROUT: + if (clock->shared) { + mlx5_core_warn(mdev, " Received unexpected PPS out event\n"); + break; + } ns = perout_conf_next_event_timer(mdev, clock); write_seqlock_irqsave(&clock->lock, flags); clock->pps_info.start[pin] = ns; write_sequnlock_irqrestore(&clock->lock, flags); - schedule_work(&clock->pps_info.out_work); + schedule_work(&clock_state->out_work); break; default: mlx5_core_err(mdev, " Unhandled clock PPS event, func %d\n", @@ -1013,7 +1146,7 @@ static int mlx5_pps_event(struct notifier_block *nb, static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_timer *timer = &clock->timer; u32 dev_freq; @@ -1029,10 +1162,10 @@ static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) ktime_to_ns(ktime_get_real())); } -static void mlx5_init_overflow_period(struct mlx5_clock *clock) +static void mlx5_init_overflow_period(struct mlx5_core_dev *mdev) { - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = mdev->clock; struct mlx5_timer *timer = &clock->timer; u64 overflow_cycles; u64 frac = 0; @@ -1065,7 +1198,7 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock) static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_ib_clock_info *info; struct mlx5_timer *timer; @@ -1088,7 +1221,7 @@ static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) static void 
mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; u32 out[MLX5_ST_SZ_DW(mtutc_reg)] = {}; u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; u8 log_max_freq_adjustment = 0; @@ -1107,7 +1240,7 @@ static void mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev) static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; /* Configure the PHC */ clock->ptp_info = mlx5_ptp_clock_info; @@ -1123,38 +1256,30 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) mlx5_timecounter_init(mdev); mlx5_init_clock_info(mdev); - mlx5_init_overflow_period(clock); + mlx5_init_overflow_period(mdev); if (mlx5_real_time_mode(mdev)) { struct timespec64 ts; ktime_get_real_ts64(&ts); - mlx5_ptp_settime(&clock->ptp_info, &ts); + mlx5_clock_settime(mdev, clock, &ts); } } static void mlx5_init_pps(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_PPS_CAP(mdev)) return; mlx5_get_pps_caps(mdev); - mlx5_init_pin_config(clock); + mlx5_init_pin_config(mdev); } -void mlx5_init_clock(struct mlx5_core_dev *mdev) +static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; - - if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { - mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); - return; - } + struct mlx5_clock *clock = mdev->clock; seqlock_init(&clock->lock); - INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); /* Initialize the device clock */ mlx5_init_timer_clock(mdev); @@ -1163,35 +1288,27 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) mlx5_init_pps(mdev); clock->ptp = ptp_clock_register(&clock->ptp_info, - &mdev->pdev->dev); + clock->shared ? NULL : &mdev->pdev->dev); if (IS_ERR(clock->ptp)) { - mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + mlx5_core_warn(mdev, "%sptp_clock_register failed %ld\n", + clock->shared ? 
"shared clock " : "", PTR_ERR(clock->ptp)); clock->ptp = NULL; } - MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); - mlx5_eq_notifier_register(mdev, &clock->pps_nb); - if (clock->ptp) ptp_schedule_worker(clock->ptp, 0); } -void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +static void mlx5_destroy_clock_dev(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; - if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) - return; - - mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); if (clock->ptp) { ptp_clock_unregister(clock->ptp); clock->ptp = NULL; } - cancel_work_sync(&clock->pps_info.out_work); - if (mdev->clock_info) { free_page((unsigned long)mdev->clock_info); mdev->clock_info = NULL; @@ -1199,3 +1316,248 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) kfree(clock->ptp_info.pin_config); } + +static void mlx5_clock_free(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock_priv *cpriv = clock_priv(mdev->clock); + + mlx5_destroy_clock_dev(mdev); + mutex_destroy(&cpriv->lock); + kfree(cpriv); + mdev->clock = NULL; +} + +static int mlx5_clock_alloc(struct mlx5_core_dev *mdev, bool shared) +{ + struct mlx5_clock_priv *cpriv; + struct mlx5_clock *clock; + + cpriv = kzalloc(sizeof(*cpriv), GFP_KERNEL); + if (!cpriv) + return -ENOMEM; + + mutex_init(&cpriv->lock); + cpriv->mdev = mdev; + clock = &cpriv->clock; + clock->shared = shared; + mdev->clock = clock; + mlx5_clock_lock(clock); + mlx5_init_clock_dev(mdev); + mlx5_clock_unlock(clock); + + if (!clock->shared) + return 0; + + if (!clock->ptp) { + mlx5_core_warn(mdev, "failed to create ptp dev shared by multiple functions"); + mlx5_clock_free(mdev); + return -EINVAL; + } + + return 0; +} + +static void mlx5_shared_clock_register(struct mlx5_core_dev *mdev, u64 key) +{ + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_devcom_comp_dev *pos; + + mdev->clock_state->compdev = mlx5_devcom_register_component(mdev->priv.devc, + MLX5_DEVCOM_SHARED_CLOCK, + key, NULL, mdev); + if (IS_ERR(mdev->clock_state->compdev)) + return; + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock) { + next = peer_dev; + break; + } + } + + if (next) { + mdev->clock = next->clock; + /* clock info is shared among all the functions using the same clock */ + mdev->clock_info = next->clock_info; + } else { + mlx5_clock_alloc(mdev, true); + } + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + + if (!mdev->clock) { + mlx5_devcom_unregister_component(mdev->clock_state->compdev); + mdev->clock_state->compdev = NULL; + } +} + +static void mlx5_shared_clock_unregister(struct mlx5_core_dev *mdev) +{ + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_clock *clock = mdev->clock; + struct mlx5_devcom_comp_dev *pos; + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock && peer_dev != mdev) { + next = peer_dev; + break; + } + } + + if (next) { + struct mlx5_clock_priv *cpriv = clock_priv(clock); + + mlx5_clock_lock(clock); + if (mdev == cpriv->mdev) + cpriv->mdev = next; + mlx5_clock_unlock(clock); + } else { + mlx5_clock_free(mdev); + } + + mdev->clock = NULL; + mdev->clock_info = NULL; + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + + mlx5_devcom_unregister_component(mdev->clock_state->compdev); +} + +static void mlx5_clock_arm_pps_in_event(struct mlx5_clock *clock, + struct 
mlx5_core_dev *new_mdev, + struct mlx5_core_dev *old_mdev) +{ + struct ptp_clock_info *ptp_info = &clock->ptp_info; + struct mlx5_clock_priv *cpriv = clock_priv(clock); + int i; + + for (i = 0; i < ptp_info->n_pins; i++) { + if (ptp_info->pin_config[i].func != PTP_PF_EXTTS || + !clock->pps_info.pin_armed[i]) + continue; + + if (new_mdev) { + mlx5_set_mtppse(new_mdev, i, 0, MLX5_EVENT_MODE_REPETETIVE); + cpriv->event_mdev = new_mdev; + } else { + cpriv->event_mdev = NULL; + } + + if (old_mdev) + mlx5_set_mtppse(old_mdev, i, 0, MLX5_EVENT_MODE_DISABLE); + } +} + +void mlx5_clock_load(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = mdev->clock; + struct mlx5_clock_priv *cpriv; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + INIT_WORK(&mdev->clock_state->out_work, mlx5_pps_out); + MLX5_NB_INIT(&mdev->clock_state->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &mdev->clock_state->pps_nb); + + if (!clock->shared) { + mlx5_clock_arm_pps_in_event(clock, mdev, NULL); + return; + } + + cpriv = clock_priv(clock); + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_clock_lock(clock); + if (mdev == cpriv->mdev && mdev != cpriv->event_mdev) + mlx5_clock_arm_pps_in_event(clock, mdev, cpriv->event_mdev); + mlx5_clock_unlock(clock); + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); +} + +void mlx5_clock_unload(struct mlx5_core_dev *mdev) +{ + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_clock *clock = mdev->clock; + struct mlx5_devcom_comp_dev *pos; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + if (!clock->shared) { + mlx5_clock_arm_pps_in_event(clock, NULL, mdev); + goto out; + } + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock && peer_dev != mdev) { + next = peer_dev; + break; + } + } + + mlx5_clock_lock(clock); + if (mdev == clock_priv(clock)->event_mdev) + mlx5_clock_arm_pps_in_event(clock, next, mdev); + mlx5_clock_unlock(clock); + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + +out: + mlx5_eq_notifier_unregister(mdev, &mdev->clock_state->pps_nb); + cancel_work_sync(&mdev->clock_state->out_work); +} + +static struct mlx5_clock null_clock; + +int mlx5_init_clock(struct mlx5_core_dev *mdev) +{ + u8 identity[MLX5_RT_CLOCK_IDENTITY_SIZE]; + struct mlx5_clock_dev_state *clock_state; + u64 key; + int err; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { + mdev->clock = &null_clock; + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + return 0; + } + + clock_state = kzalloc(sizeof(*clock_state), GFP_KERNEL); + if (!clock_state) + return -ENOMEM; + clock_state->mdev = mdev; + mdev->clock_state = clock_state; + + if (MLX5_CAP_MCAM_REG3(mdev, mrtcq) && mlx5_real_time_mode(mdev)) { + if (mlx5_clock_identity_get(mdev, identity)) { + mlx5_core_warn(mdev, "failed to get rt clock identity, create ptp dev per function\n"); + } else { + memcpy(&key, &identity, sizeof(key)); + mlx5_shared_clock_register(mdev, key); + } + } + + if (!mdev->clock) { + err = mlx5_clock_alloc(mdev, false); + if (err) { + kfree(clock_state); + mdev->clock_state = NULL; + return err; + } + } + + return 0; +} + +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +{ + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + if (mdev->clock->shared) + mlx5_shared_clock_unregister(mdev); + else + mlx5_clock_free(mdev); + kfree(mdev->clock_state); + mdev->clock_state = NULL; +} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index bd95b9f8d143..c18a652c0faa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -33,6 +33,35 @@ #ifndef __LIB_CLOCK_H__ #define __LIB_CLOCK_H__ +#include <linux/ptp_clock_kernel.h> + +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + u64 start[MAX_PIN_NUM]; + u8 enabled; + u64 min_npps_period; + u64 min_out_pulse_duration_ns; + bool pin_armed[MAX_PIN_NUM]; +}; + +struct mlx5_timer { + struct cyclecounter cycles; + struct timecounter tc; + u32 nominal_c_mult; + unsigned long overflow_period; +}; + +struct mlx5_clock { + seqlock_t lock; + struct hwtstamp_config hwtstamp_config; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; + struct mlx5_timer timer; + bool shared; +}; + static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev) { u8 rq_ts_format_cap = MLX5_CAP_GEN(mdev, rq_ts_format); @@ -54,12 +83,14 @@ static inline bool mlx5_is_real_time_sq(struct mlx5_core_dev *mdev) typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64); #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) -void mlx5_init_clock(struct mlx5_core_dev *mdev); +int mlx5_init_clock(struct mlx5_core_dev *mdev); void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); +void mlx5_clock_load(struct mlx5_core_dev *mdev); +void mlx5_clock_unload(struct mlx5_core_dev *mdev); static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { - return mdev->clock.ptp ? ptp_clock_index(mdev->clock.ptp) : -1; + return mdev->clock->ptp ? ptp_clock_index(mdev->clock->ptp) : -1; } static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, @@ -87,8 +118,10 @@ static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, return ns_to_ktime(time); } #else -static inline void mlx5_init_clock(struct mlx5_core_dev *mdev) {} +static inline int mlx5_init_clock(struct mlx5_core_dev *mdev) { return 0; } static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {} +static inline void mlx5_clock_load(struct mlx5_core_dev *mdev) {} +static inline void mlx5_clock_unload(struct mlx5_core_dev *mdev) {} static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { return -1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h index d58032dd0df7..c79699b94a02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h @@ -11,6 +11,7 @@ enum mlx5_devcom_component { MLX5_DEVCOM_MPV, MLX5_DEVCOM_HCA_PORTS, MLX5_DEVCOM_SD_GROUP, + MLX5_DEVCOM_SHARED_CLOCK, MLX5_DEVCOM_NUM_COMPONENTS, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ec956c4bcebd..710633d5fdbe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1038,7 +1038,11 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_init_reserved_gids(dev); - mlx5_init_clock(dev); + err = mlx5_init_clock(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize hardware clock\n"); + goto err_tables_cleanup; + } dev->vxlan = mlx5_vxlan_create(dev); dev->geneve = mlx5_geneve_create(dev); @@ -1046,7 +1050,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) err = mlx5_init_rl_table(dev); if (err) { mlx5_core_err(dev, "Failed to init rate limiting\n"); - 
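With mdev->clock demoted to a pointer, several mlx5_core_dev functions can share one mlx5_clock while the per-clock bookkeeping stays private to clock.c. The clock_priv() calls in the .c hunks point at the usual container_of() accessor; the layout below is a plausible reconstruction for illustration only (the real struct mlx5_clock_priv is not in this header and may differ):

#include <linux/container_of.h>
#include <linux/mutex.h>

struct mlx5_clock_priv {
        struct mlx5_clock clock;          /* the shared, public part */
        struct mlx5_core_dev *mdev;       /* function currently driving the clock */
        struct mlx5_core_dev *event_mdev; /* function armed for PPS-in events */
        struct mutex lock;                /* taken via mlx5_clock_lock() */
};

static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock)
{
        return container_of(clock, struct mlx5_clock_priv, clock);
}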
goto err_tables_cleanup; + goto err_clock_cleanup; } err = mlx5_mpfs_init(dev); @@ -1123,10 +1127,11 @@ err_mpfs_cleanup: mlx5_mpfs_cleanup(dev); err_rl_cleanup: mlx5_cleanup_rl_table(dev); -err_tables_cleanup: +err_clock_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); +err_tables_cleanup: mlx5_cleanup_reserved_gids(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_fw_reset_cleanup(dev); @@ -1359,6 +1364,8 @@ static int mlx5_load(struct mlx5_core_dev *dev) goto err_eq_table; } + mlx5_clock_load(dev); + err = mlx5_fw_tracer_init(dev->tracer); if (err) { mlx5_core_err(dev, "Failed to init FW tracer %d\n", err); @@ -1442,6 +1449,7 @@ err_fpga_start: mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_reset_events_stop(dev); mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_clock_unload(dev); mlx5_eq_table_destroy(dev); err_eq_table: mlx5_irq_table_destroy(dev); @@ -1468,6 +1476,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_reset_events_stop(dev); mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_clock_unload(dev); mlx5_eq_table_destroy(dev); mlx5_irq_table_destroy(dev); mlx5_pagealloc_stop(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 50931584132b..3995df064101 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1105,6 +1105,9 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = { [MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000, [MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000, [MLX5E_800GAUI_8_800GBASE_CR8_KR8] = 800000, + [MLX5E_200GAUI_1_200GBASE_CR1_KR1] = 200000, + [MLX5E_400GAUI_2_400GBASE_CR2_KR2] = 400000, + [MLX5E_800GAUI_4_800GBASE_CR4_KR4] = 800000, }; int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c index 60cb4527588a..65740bb68b09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c @@ -516,30 +516,6 @@ def_xa_destroy: return NULL; } -/* Assure synchronization of the device steering tables with updates made by SW - * insertion. 
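The main.c hunk above is the routine cost of making an init step failable: the old err_tables_cleanup target is split in two, so failures after clock init also unwind the clock while failures before it skip that cleanup. The idiom in isolation, with placeholder init_*/cleanup_* steps:

struct ctx;
int init_a(struct ctx *c), init_b(struct ctx *c), init_c(struct ctx *c);
void cleanup_a(struct ctx *c), cleanup_b(struct ctx *c);

static int init_all(struct ctx *c)
{
        int err;

        err = init_a(c);
        if (err)
                return err;

        err = init_b(c);        /* the step that newly can fail */
        if (err)
                goto err_a;

        err = init_c(c);
        if (err)
                goto err_b;     /* later failures must also unwind b */

        return 0;

err_b:
        cleanup_b(c);
err_a:
        cleanup_a(c);
        return err;
}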
- */ -int mlx5dr_domain_sync(struct mlx5dr_domain *dmn, u32 flags) -{ - int ret = 0; - - if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_SW) { - mlx5dr_domain_lock(dmn); - ret = mlx5dr_send_ring_force_drain(dmn); - mlx5dr_domain_unlock(dmn); - if (ret) { - mlx5dr_err(dmn, "Force drain failed flags: %d, ret: %d\n", - flags, ret); - return ret; - } - } - - if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_HW) - ret = mlx5dr_cmd_sync_steering(dmn->mdev); - - return ret; -} - int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) { if (WARN_ON_ONCE(refcount_read(&dmn->refcount) > 1)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index f57c84e5128b..4fd4e8483382 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1331,36 +1331,3 @@ void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, kfree(send_ring->sync_buff); kfree(send_ring); } - -int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn) -{ - struct mlx5dr_send_ring *send_ring = dmn->send_ring; - struct postsend_info send_info = {}; - u8 data[DR_STE_SIZE]; - int num_of_sends_req; - int ret; - int i; - - /* Sending this amount of requests makes sure we will get drain */ - num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2; - - /* Send fake requests forcing the last to be signaled */ - send_info.write.addr = (uintptr_t)data; - send_info.write.length = DR_STE_SIZE; - send_info.write.lkey = 0; - /* Using the sync_mr in order to write/read */ - send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr; - send_info.rkey = send_ring->sync_mr->mkey; - - for (i = 0; i < num_of_sends_req; i++) { - ret = dr_postsend_icm_data(dmn, &send_info); - if (ret) - return ret; - } - - spin_lock(&send_ring->lock); - ret = dr_handle_pending_wc(dmn, send_ring); - spin_unlock(&send_ring->lock); - - return ret; -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h index 7618c6147f86..cc328292bf84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h @@ -1473,7 +1473,6 @@ struct mlx5dr_send_ring { int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn); void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, struct mlx5dr_send_ring *send_ring); -int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn); int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste, u8 *data, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h index 0bb3724c10c2..fc8a2169d1a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h @@ -45,8 +45,6 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type); int mlx5dr_domain_destroy(struct mlx5dr_domain *domain); -int mlx5dr_domain_sync(struct mlx5dr_domain *domain, u32 flags); - void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn, u16 peer_vhca_id); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index b10f80fc651b..fa7082ee5183 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -754,9 +754,6 @@ void 
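For context on the deletion: mlx5dr_send_ring_force_drain() drained the software-steering send ring by flooding it with dummy signaled writes, so that reaping the final completion proved all previously queued work had been consumed; with mlx5dr_domain_sync() gone, no caller remains. The drain-by-dummy-post shape in the abstract; struct ring and both helpers here are hypothetical:

struct ring;
int post_dummy_signaled_write(struct ring *r);
int reap_completions(struct ring *r);

/* Post enough harmless signaled requests that the last completion to
 * arrive implies everything queued before them has executed. */
static int drain_by_dummy_posts(struct ring *r, int nr_posts)
{
        int i, err;

        for (i = 0; i < nr_posts; i++) {
                err = post_dummy_signaled_write(r);
                if (err)
                        return err;
        }

        return reap_completions(r);
}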
mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp, struct net_device *dev); -bool mlxsw_sp_rif_exists(struct mlxsw_sp *mlxsw_sp, - const struct net_device *dev); -u16 mlxsw_sp_rif_vid(struct mlxsw_sp *mlxsw_sp, const struct net_device *dev); u16 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp); int mlxsw_sp_router_nve_promote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id, enum mlxsw_sp_l3proto ul_proto, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 7d6d859cef3f..464821dd492d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -8184,41 +8184,6 @@ mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp, return NULL; } -bool mlxsw_sp_rif_exists(struct mlxsw_sp *mlxsw_sp, - const struct net_device *dev) -{ - struct mlxsw_sp_rif *rif; - - mutex_lock(&mlxsw_sp->router->lock); - rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); - mutex_unlock(&mlxsw_sp->router->lock); - - return rif; -} - -u16 mlxsw_sp_rif_vid(struct mlxsw_sp *mlxsw_sp, const struct net_device *dev) -{ - struct mlxsw_sp_rif *rif; - u16 vid = 0; - - mutex_lock(&mlxsw_sp->router->lock); - rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); - if (!rif) - goto out; - - /* We only return the VID for VLAN RIFs. Otherwise we return an - * invalid value (0). - */ - if (rif->ops->type != MLXSW_SP_RIF_TYPE_VLAN) - goto out; - - vid = mlxsw_sp_fid_8021q_vid(rif->fid); - -out: - mutex_unlock(&mlxsw_sp->router->lock); - return vid; -} - static int mlxsw_sp_router_rif_disable(struct mlxsw_sp *mlxsw_sp, u16 rif) { char ritr_pl[MLXSW_REG_RITR_LEN]; @@ -8417,19 +8382,6 @@ u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *lb_rif) return lb_rif->common.rif_index; } -u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) -{ - struct net_device *dev = mlxsw_sp_rif_dev(&lb_rif->common); - u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(dev); - struct mlxsw_sp_vr *ul_vr; - - ul_vr = mlxsw_sp_vr_get(lb_rif->common.mlxsw_sp, ul_tb_id, NULL); - if (WARN_ON(IS_ERR(ul_vr))) - return 0; - - return ul_vr->id; -} - u16 mlxsw_sp_ipip_lb_ul_rif_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) { return lb_rif->ul_rif_id; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index 0432c7cc6b07..313efab5c324 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -90,7 +90,6 @@ struct mlxsw_sp_ipip_entry; struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, u16 rif_index); u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); -u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); u16 mlxsw_sp_ipip_lb_ul_rif_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif); u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev); int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c b/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c index 59951b5abdb7..ac80981f67c0 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c @@ -10,6 +10,40 @@ static struct dentry *fbnic_dbg_root; +static void fbnic_dbg_desc_break(struct seq_file *s, int i) +{ + while (i--) + seq_putc(s, '-'); + + seq_putc(s, 
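The fbnic debugfs code that starts here leans on the kernel's one-macro seq_file boilerplate: write a *_show(struct seq_file *, void *) routine, let DEFINE_SHOW_ATTRIBUTE() generate the file_operations, and pass the private pointer through debugfs_create_file(). A minimal standalone version:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *s, void *v)
{
        void *priv = s->private;        /* whatever was handed to debugfs */

        seq_printf(s, "private=%p\n", priv);
        return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo);            /* generates demo_fops */

/* at init time, mirroring fbnic_dbg_fbd_init():
 *      debugfs_create_file("demo", 0400, parent, priv, &demo_fops);
 */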
'\n'); +} + +static int fbnic_dbg_mac_addr_show(struct seq_file *s, void *v) +{ + struct fbnic_dev *fbd = s->private; + char hdr[80]; + int i; + + /* Generate Header */ + snprintf(hdr, sizeof(hdr), "%3s %s %-17s %s\n", + "Idx", "S", "TCAM Bitmap", "Addr/Mask"); + seq_puts(s, hdr); + fbnic_dbg_desc_break(s, strnlen(hdr, sizeof(hdr))); + + for (i = 0; i < FBNIC_RPC_TCAM_MACDA_NUM_ENTRIES; i++) { + struct fbnic_mac_addr *mac_addr = &fbd->mac_addr[i]; + + seq_printf(s, "%02d %d %64pb %pm\n", + i, mac_addr->state, mac_addr->act_tcam, + mac_addr->value.addr8); + seq_printf(s, " %pm\n", + mac_addr->mask.addr8); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(fbnic_dbg_mac_addr); + static int fbnic_dbg_pcie_stats_show(struct seq_file *s, void *v) { struct fbnic_dev *fbd = s->private; @@ -48,6 +82,8 @@ void fbnic_dbg_fbd_init(struct fbnic_dev *fbd) fbd->dbg_fbd = debugfs_create_dir(name, fbnic_dbg_root); debugfs_create_file("pcie_stats", 0400, fbd->dbg_fbd, fbd, &fbnic_dbg_pcie_stats_fops); + debugfs_create_file("mac_addr", 0400, fbd->dbg_fbd, fbd, + &fbnic_dbg_mac_addr_fops); } void fbnic_dbg_fbd_exit(struct fbnic_dev *fbd) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 7a96b6ee773f..1db57c42333e 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -628,6 +628,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) fbnic_rss_key_fill(fbn->rss_key); fbnic_rss_init_en_mask(fbn); + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->features |= NETIF_F_RXHASH | NETIF_F_SG | diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index fa167b1aa019..5222a035fd19 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3033,7 +3033,7 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn, u16 length; int rc; - /* Valiate PF can send such a request */ + /* Validate PF can send such a request */ if (!vf->vport_instance) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, @@ -3312,7 +3312,7 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn, goto out; } - /* Determine if the unicast filtering is acceptible by PF */ + /* Determine if the unicast filtering is acceptable by PF */ if ((p_bulletin->valid_bitmap & BIT(VLAN_ADDR_FORCED)) && (params.type == QED_FILTER_VLAN || params.type == QED_FILTER_MAC_VLAN)) { @@ -3729,7 +3729,7 @@ qed_iov_execute_vf_flr_cleanup(struct qed_hwfn *p_hwfn, rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, p_vf); if (rc) { - DP_ERR(p_hwfn, "Failed to re-enable VF[%d] acces\n", + DP_ERR(p_hwfn, "Failed to re-enable VF[%d] access\n", vfid); return rc; } @@ -4480,7 +4480,7 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled) struct qed_ptt *ptt = qed_ptt_acquire(hwfn); /* Failure to acquire the ptt in 100g creates an odd error - * where the first engine has already relased IOV. + * where the first engine has already released IOV. */ if (!ptt) { DP_ERR(hwfn, "Failed to acquire ptt\n"); diff --git a/drivers/net/ethernet/realtek/Kconfig b/drivers/net/ethernet/realtek/Kconfig index 8a8ea51c639e..fe136f61586f 100644 --- a/drivers/net/ethernet/realtek/Kconfig +++ b/drivers/net/ethernet/realtek/Kconfig @@ -114,7 +114,8 @@ config R8169 will be called r8169. This is recommended. 
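Setting IFF_UNICAST_FLT above tells the core that the NIC filters secondary unicast addresses itself (fbnic now has the MACDA TCAM plumbing for that); without the flag, installing extra unicast addresses pushes the device into promiscuous mode. The driver-side contract is roughly the sketch below, where program_ucast() stands in for the hardware hook:

#include <linux/netdevice.h>

static void program_ucast(struct net_device *dev, const unsigned char *addr)
{
        /* hypothetical: write addr into the device's unicast filter */
}

static void demo_set_rx_mode(struct net_device *dev)
{
        struct netdev_hw_addr *ha;

        netdev_for_each_uc_addr(ha, dev)
                program_ucast(dev, ha->addr);
}

/* probe time:
 *      dev->priv_flags |= IFF_UNICAST_FLT;
 *      dev->netdev_ops = &demo_ops;    with .ndo_set_rx_mode = demo_set_rx_mode
 */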
config R8169_LEDS - def_bool R8169 && LEDS_TRIGGER_NETDEV + bool "Support for controlling the NIC LEDs" + depends on R8169 && LEDS_TRIGGER_NETDEV depends on !(R8169=y && LEDS_CLASS=m) help Optional support for controlling the NIC LED's with the netdev diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5a5eba49c651..7306c8e323d7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5222,6 +5222,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->priv = tp; new_bus->parent = &pdev->dev; new_bus->irq[0] = PHY_MAC_INTERRUPT; + new_bus->phy_mask = GENMASK(31, 1); snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x-%x", pci_domain_nr(pdev->bus), pci_dev_id(pdev)); diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 6bf3ec985f3d..f181f05cb429 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -13,7 +13,7 @@ */ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 103, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 121, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -169,6 +169,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 800000, FULL, 800000baseDR8_2_Full ), PHY_SETTING( 800000, FULL, 800000baseSR8_Full ), PHY_SETTING( 800000, FULL, 800000baseVR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseCR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseKR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR4_2_Full ), + PHY_SETTING( 800000, FULL, 800000baseSR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseVR4_Full ), /* 400G */ PHY_SETTING( 400000, FULL, 400000baseCR8_Full ), PHY_SETTING( 400000, FULL, 400000baseKR8_Full ), @@ -180,6 +186,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 400000, FULL, 400000baseLR4_ER4_FR4_Full ), PHY_SETTING( 400000, FULL, 400000baseDR4_Full ), PHY_SETTING( 400000, FULL, 400000baseSR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseCR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseKR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR2_2_Full ), + PHY_SETTING( 400000, FULL, 400000baseSR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseVR2_Full ), /* 200G */ PHY_SETTING( 200000, FULL, 200000baseCR4_Full ), PHY_SETTING( 200000, FULL, 200000baseKR4_Full ), @@ -191,6 +203,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 200000, FULL, 200000baseLR2_ER2_FR2_Full ), PHY_SETTING( 200000, FULL, 200000baseDR2_Full ), PHY_SETTING( 200000, FULL, 200000baseSR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseCR_Full ), + PHY_SETTING( 200000, FULL, 200000baseKR_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR_2_Full ), + PHY_SETTING( 200000, FULL, 200000baseSR_Full ), + PHY_SETTING( 200000, FULL, 200000baseVR_Full ), /* 100G */ PHY_SETTING( 100000, FULL, 100000baseCR4_Full ), PHY_SETTING( 100000, FULL, 100000baseKR4_Full ), diff --git a/drivers/net/phy/realtek/Kconfig b/drivers/net/phy/realtek/Kconfig index 31935f147d87..b05c2a1e9024 100644 --- a/drivers/net/phy/realtek/Kconfig +++ b/drivers/net/phy/realtek/Kconfig @@ -4,8 +4,12 @@ config REALTEK_PHY help Currently supports RTL821x/RTL822x and fast ethernet PHYs +if REALTEK_PHY + config 
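The new_bus->phy_mask assignment above relies on mii_bus semantics: a set bit in phy_mask makes mdiobus_register() skip probing that address, so GENMASK(31, 1) restricts the scan to address 0, the only address the r8169 internal PHY can occupy:

#include <linux/bits.h>
#include <linux/phy.h>

static void demo_restrict_bus(struct mii_bus *bus)
{
        /* bit N set => MDIO address N is not probed during the bus scan */
        bus->phy_mask = GENMASK(31, 1);         /* probe only address 0 */
}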
REALTEK_PHY_HWMON - def_bool REALTEK_PHY && HWMON - depends on !(REALTEK_PHY=y && HWMON=m) + bool "HWMON support for Realtek PHYs" + depends on HWMON && !(REALTEK_PHY=y && HWMON=m) help Optional hwmon support for the temperature sensor + +endif # REALTEK_PHY diff --git a/drivers/net/phy/realtek/realtek_main.c b/drivers/net/phy/realtek/realtek_main.c index 572a933636b0..210fefac44d4 100644 --- a/drivers/net/phy/realtek/realtek_main.c +++ b/drivers/net/phy/realtek/realtek_main.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/delay.h> #include <linux/clk.h> +#include <linux/string_choices.h> #include "realtek.h" @@ -422,11 +423,11 @@ static int rtl8211f_config_init(struct phy_device *phydev) } else if (ret) { dev_dbg(dev, "%s 2ns TX delay (and changing the value from pin-strapping RXD1 or the bootloader)\n", - val_txdly ? "Enabling" : "Disabling"); + str_enable_disable(val_txdly)); } else { dev_dbg(dev, "2ns TX delay was already %s (by pin-strapping RXD1 or bootloader configuration)\n", - val_txdly ? "enabled" : "disabled"); + str_enabled_disabled(val_txdly)); } ret = phy_modify_paged_changed(phydev, 0xd08, 0x15, RTL8211F_RX_DELAY, @@ -437,11 +438,11 @@ static int rtl8211f_config_init(struct phy_device *phydev) } else if (ret) { dev_dbg(dev, "%s 2ns RX delay (and changing the value from pin-strapping RXD0 or the bootloader)\n", - val_rxdly ? "Enabling" : "Disabling"); + str_enable_disable(val_rxdly)); } else { dev_dbg(dev, "2ns RX delay was already %s (by pin-strapping RXD0 or bootloader configuration)\n", - val_rxdly ? "enabled" : "disabled"); + str_enabled_disabled(val_rxdly)); } if (priv->has_phycr2) { diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 92516189e792..03175419bf28 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -227,9 +227,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, be32_to_cpu(fdb->vni))) goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -434,8 +434,12 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); - if (f && f->used != jiffies) - f->used = jiffies; + if (f) { + unsigned long now = jiffies; + + if (READ_ONCE(f->used) != now) + WRITE_ONCE(f->used, now); + } return f; } @@ -1009,12 +1013,10 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; - f->updated = jiffies; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; - f->updated = jiffies; notify = 1; } } @@ -1048,12 +1050,13 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } if (ndm_flags & NTF_USE) - f->used = jiffies; + WRITE_ONCE(f->updated, jiffies); if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); + WRITE_ONCE(f->updated, jiffies); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) @@ -1292,7 +1295,7 @@ int __vxlan_fdb_delete(struct vxlan_dev *vxlan, struct vxlan_fdb *f; int err = -ENOENT; - f = vxlan_find_mac(vxlan, addr, src_vni); + f = __vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; @@ -1459,9 +1462,13 @@ static enum skb_drop_reason 
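The realtek conversions above use <linux/string_choices.h>, which centralizes these tiny ternaries so the strings stay consistent and greppable. The two helpers involved are equivalent to:

#include <linux/types.h>

static inline const char *str_enable_disable(bool v)
{
        return v ? "enable" : "disable";
}

static inline const char *str_enabled_disabled(bool v)
{
        return v ? "enabled" : "disabled";
}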
vxlan_snoop(struct net_device *dev, ifindex = src_ifindex; #endif - f = vxlan_find_mac(vxlan, src_mac, vni); + f = __vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); + unsigned long now = jiffies; + + if (READ_ONCE(f->updated) != now) + WRITE_ONCE(f->updated, now); if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) @@ -1481,7 +1488,6 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; - f->updated = jiffies; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); @@ -2852,7 +2858,7 @@ static void vxlan_cleanup(struct timer_list *t) if (f->flags & NTF_EXT_LEARNED) continue; - timeout = f->used + vxlan->cfg.age_interval * HZ; + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", @@ -4768,7 +4774,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; @@ -4824,7 +4830,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index c61e6427384c..9eb9e3c49f81 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -2,15 +2,6 @@ menu "S/390 network device drivers" depends on NETDEVICES && S390 -config LCS - def_tristate m - prompt "Lan Channel Station Interface" - depends on CCW && NETDEVICES && ETHERNET - help - Select this option if you want to use LCS networking on IBM System z. - To compile as a module, choose M. The module name is lcs. - If you do not use LCS, choose N. - config CTCM def_tristate m prompt "CTC and MPC SNA device support" @@ -98,7 +89,7 @@ config QETH_OSX config CCWGROUP tristate - default (LCS || CTCM || QETH || SMC) + default (CTCM || QETH || SMC) config ISM tristate "Support for ISM vPCI Adapter" diff --git a/drivers/s390/net/Makefile b/drivers/s390/net/Makefile index bc55ec316adb..b5aaba290127 100644 --- a/drivers/s390/net/Makefile +++ b/drivers/s390/net/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_CTCM) += ctcm.o fsm.o obj-$(CONFIG_NETIUCV) += netiucv.o fsm.o obj-$(CONFIG_SMSGIUCV) += smsgiucv.o obj-$(CONFIG_SMSGIUCV_EVENT) += smsgiucv_app.o -obj-$(CONFIG_LCS) += lcs.o qeth-y += qeth_core_sys.o qeth_core_main.o qeth_core_mpc.o qeth_ethtool.o obj-$(CONFIG_QETH) += qeth.o qeth_l2-y += qeth_l2_main.o qeth_l2_sys.o diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c deleted file mode 100644 index 88db8378325a..000000000000 --- a/drivers/s390/net/lcs.c +++ /dev/null @@ -1,2385 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Linux for S/390 LAN channel station device driver - * - * Copyright IBM Corp. 
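The vxlan hunks read and write the FDB 'used'/'updated' stamps without holding the hash lock, so every access is paired through READ_ONCE()/WRITE_ONCE(), and the compare-before-store keeps the hot path from dirtying the entry's cacheline on every packet. The recurring touch pattern, factored out:

#include <linux/compiler.h>
#include <linux/jiffies.h>

/* Refresh a timestamp that other CPUs read locklessly. */
static inline void stamp_touch(unsigned long *stamp)
{
        unsigned long now = jiffies;

        if (READ_ONCE(*stamp) != now)
                WRITE_ONCE(*stamp, now);
}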
1999, 2009 - * Author(s): Original Code written by - * DJ Barrow <djbarrow@de.ibm.com,barrow_dj@yahoo.com> - * Rewritten by - * Frank Pavlic <fpavlic@de.ibm.com> and - * Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#define KMSG_COMPONENT "lcs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/module.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/inetdevice.h> -#include <linux/in.h> -#include <linux/igmp.h> -#include <linux/delay.h> -#include <linux/kthread.h> -#include <linux/slab.h> -#include <net/arp.h> -#include <net/ip.h> - -#include <asm/debug.h> -#include <asm/idals.h> -#include <asm/timex.h> -#include <linux/device.h> -#include <asm/ccwgroup.h> - -#include "lcs.h" - - -/* - * initialization string for output - */ - -static char version[] __initdata = "LCS driver"; - -/* - * the root device for lcs group devices - */ -static struct device *lcs_root_dev; - -/* - * Some prototypes. - */ -static void lcs_tasklet(unsigned long); -static void lcs_start_kernel_thread(struct work_struct *); -static void lcs_get_frames_cb(struct lcs_channel *, struct lcs_buffer *); -#ifdef CONFIG_IP_MULTICAST -static int lcs_send_delipm(struct lcs_card *, struct lcs_ipm_list *); -#endif /* CONFIG_IP_MULTICAST */ -static int lcs_recovery(void *ptr); - -/* - * Debug Facility Stuff - */ -static char debug_buffer[255]; -static debug_info_t *lcs_dbf_setup; -static debug_info_t *lcs_dbf_trace; - -/* - * LCS Debug Facility functions - */ -static void -lcs_unregister_debug_facility(void) -{ - debug_unregister(lcs_dbf_setup); - debug_unregister(lcs_dbf_trace); -} - -static int -lcs_register_debug_facility(void) -{ - lcs_dbf_setup = debug_register("lcs_setup", 2, 1, 8); - lcs_dbf_trace = debug_register("lcs_trace", 4, 1, 8); - if (lcs_dbf_setup == NULL || lcs_dbf_trace == NULL) { - pr_err("Not enough memory for debug facility.\n"); - lcs_unregister_debug_facility(); - return -ENOMEM; - } - debug_register_view(lcs_dbf_setup, &debug_hex_ascii_view); - debug_set_level(lcs_dbf_setup, 2); - debug_register_view(lcs_dbf_trace, &debug_hex_ascii_view); - debug_set_level(lcs_dbf_trace, 2); - return 0; -} - -/* - * Allocate io buffers. - */ -static int -lcs_alloc_channel(struct lcs_channel *channel) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ichalloc"); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - /* alloc memory fo iobuffer */ - channel->iob[cnt].data = - kzalloc(LCS_IOBUFFERSIZE, GFP_DMA | GFP_KERNEL); - if (channel->iob[cnt].data == NULL) - break; - channel->iob[cnt].state = LCS_BUF_STATE_EMPTY; - } - if (cnt < LCS_NUM_BUFFS) { - /* Not all io buffers could be allocated. */ - LCS_DBF_TEXT(2, setup, "echalloc"); - while (cnt-- > 0) - kfree(channel->iob[cnt].data); - return -ENOMEM; - } - return 0; -} - -/* - * Free io buffers. - */ -static void -lcs_free_channel(struct lcs_channel *channel) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ichfree"); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - kfree(channel->iob[cnt].data); - channel->iob[cnt].data = NULL; - } -} - -/* - * Cleanup channel. - */ -static void -lcs_cleanup_channel(struct lcs_channel *channel) -{ - LCS_DBF_TEXT(3, setup, "cleanch"); - /* Kill write channel tasklets. */ - tasklet_kill(&channel->irq_tasklet); - /* Free channel buffers. */ - lcs_free_channel(channel); -} - -/* - * LCS free memory for card and channels. 
- */ -static void -lcs_free_card(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, setup, "remcard"); - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - kfree(card); -} - -/* - * LCS alloc memory for card and channels - */ -static struct lcs_card * -lcs_alloc_card(void) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, setup, "alloclcs"); - - card = kzalloc(sizeof(struct lcs_card), GFP_KERNEL | GFP_DMA); - if (card == NULL) - return NULL; - card->lan_type = LCS_FRAME_TYPE_AUTO; - card->pkt_seq = 0; - card->lancmd_timeout = LCS_LANCMD_TIMEOUT_DEFAULT; - /* Allocate io buffers for the read channel. */ - rc = lcs_alloc_channel(&card->read); - if (rc){ - LCS_DBF_TEXT(2, setup, "iccwerr"); - lcs_free_card(card); - return NULL; - } - /* Allocate io buffers for the write channel. */ - rc = lcs_alloc_channel(&card->write); - if (rc) { - LCS_DBF_TEXT(2, setup, "iccwerr"); - lcs_cleanup_channel(&card->read); - lcs_free_card(card); - return NULL; - } - -#ifdef CONFIG_IP_MULTICAST - INIT_LIST_HEAD(&card->ipm_list); -#endif - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - return card; -} - -/* - * Setup read channel. - */ -static void -lcs_setup_read_ccws(struct lcs_card *card) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ireadccw"); - /* Setup read ccws. */ - memset(card->read.ccws, 0, sizeof (struct ccw1) * (LCS_NUM_BUFFS + 1)); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - card->read.ccws[cnt].cmd_code = LCS_CCW_READ; - card->read.ccws[cnt].count = LCS_IOBUFFERSIZE; - card->read.ccws[cnt].flags = - CCW_FLAG_CC | CCW_FLAG_SLI | CCW_FLAG_PCI; - /* - * Note: we have allocated the buffer with GFP_DMA, so - * we do not need to do set_normalized_cda. - */ - card->read.ccws[cnt].cda = - virt_to_dma32(card->read.iob[cnt].data); - ((struct lcs_header *) - card->read.iob[cnt].data)->offset = LCS_ILLEGAL_OFFSET; - card->read.iob[cnt].callback = lcs_get_frames_cb; - card->read.iob[cnt].state = LCS_BUF_STATE_READY; - card->read.iob[cnt].count = LCS_IOBUFFERSIZE; - } - card->read.ccws[0].flags &= ~CCW_FLAG_PCI; - card->read.ccws[LCS_NUM_BUFFS - 1].flags &= ~CCW_FLAG_PCI; - card->read.ccws[LCS_NUM_BUFFS - 1].flags |= CCW_FLAG_SUSPEND; - /* Last ccw is a tic (transfer in channel). */ - card->read.ccws[LCS_NUM_BUFFS].cmd_code = LCS_CCW_TRANSFER; - card->read.ccws[LCS_NUM_BUFFS].cda = virt_to_dma32(card->read.ccws); - /* Setg initial state of the read channel. */ - card->read.state = LCS_CH_STATE_INIT; - - card->read.io_idx = 0; - card->read.buf_idx = 0; -} - -static void -lcs_setup_read(struct lcs_card *card) -{ - LCS_DBF_TEXT(3, setup, "initread"); - - lcs_setup_read_ccws(card); - /* Initialize read channel tasklet. */ - card->read.irq_tasklet.data = (unsigned long) &card->read; - card->read.irq_tasklet.func = lcs_tasklet; - /* Initialize waitqueue. */ - init_waitqueue_head(&card->read.wait_q); -} - -/* - * Setup write channel. - */ -static void -lcs_setup_write_ccws(struct lcs_card *card) -{ - int cnt; - - LCS_DBF_TEXT(3, setup, "iwritccw"); - /* Setup write ccws. */ - memset(card->write.ccws, 0, sizeof(struct ccw1) * (LCS_NUM_BUFFS + 1)); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - card->write.ccws[cnt].cmd_code = LCS_CCW_WRITE; - card->write.ccws[cnt].count = 0; - card->write.ccws[cnt].flags = - CCW_FLAG_SUSPEND | CCW_FLAG_CC | CCW_FLAG_SLI; - /* - * Note: we have allocated the buffer with GFP_DMA, so - * we do not need to do set_normalized_cda. - */ - card->write.ccws[cnt].cda = - virt_to_dma32(card->write.iob[cnt].data); - } - /* Last ccw is a tic (transfer in channel). 
*/ - card->write.ccws[LCS_NUM_BUFFS].cmd_code = LCS_CCW_TRANSFER; - card->write.ccws[LCS_NUM_BUFFS].cda = virt_to_dma32(card->write.ccws); - /* Set initial state of the write channel. */ - card->read.state = LCS_CH_STATE_INIT; - - card->write.io_idx = 0; - card->write.buf_idx = 0; -} - -static void -lcs_setup_write(struct lcs_card *card) -{ - LCS_DBF_TEXT(3, setup, "initwrit"); - - lcs_setup_write_ccws(card); - /* Initialize write channel tasklet. */ - card->write.irq_tasklet.data = (unsigned long) &card->write; - card->write.irq_tasklet.func = lcs_tasklet; - /* Initialize waitqueue. */ - init_waitqueue_head(&card->write.wait_q); -} - -static void -lcs_set_allowed_threads(struct lcs_card *card, unsigned long threads) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - card->thread_allowed_mask = threads; - spin_unlock_irqrestore(&card->mask_lock, flags); - wake_up(&card->wait_q); -} -static int lcs_threads_running(struct lcs_card *card, unsigned long threads) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - rc = (card->thread_running_mask & threads); - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -static int -lcs_wait_for_threads(struct lcs_card *card, unsigned long threads) -{ - return wait_event_interruptible(card->wait_q, - lcs_threads_running(card, threads) == 0); -} - -static int lcs_set_thread_start_bit(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - if ( !(card->thread_allowed_mask & thread) || - (card->thread_start_mask & thread) ) { - spin_unlock_irqrestore(&card->mask_lock, flags); - return -EPERM; - } - card->thread_start_mask |= thread; - spin_unlock_irqrestore(&card->mask_lock, flags); - return 0; -} - -static void -lcs_clear_thread_running_bit(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - card->thread_running_mask &= ~thread; - spin_unlock_irqrestore(&card->mask_lock, flags); - wake_up(&card->wait_q); -} - -static int __lcs_do_run_thread(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - if (card->thread_start_mask & thread){ - if ((card->thread_allowed_mask & thread) && - !(card->thread_running_mask & thread)){ - rc = 1; - card->thread_start_mask &= ~thread; - card->thread_running_mask |= thread; - } else - rc = -EPERM; - } - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -static int -lcs_do_run_thread(struct lcs_card *card, unsigned long thread) -{ - int rc = 0; - wait_event(card->wait_q, - (rc = __lcs_do_run_thread(card, thread)) >= 0); - return rc; -} - -static int -lcs_do_start_thread(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - LCS_DBF_TEXT_(4, trace, " %02x%02x%02x", - (u8) card->thread_start_mask, - (u8) card->thread_allowed_mask, - (u8) card->thread_running_mask); - rc = (card->thread_start_mask & thread); - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -/* - * Initialize channels,card and state machines. - */ -static void -lcs_setup_card(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, setup, "initcard"); - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - - lcs_setup_read(card); - lcs_setup_write(card); - /* Set cards initial state. 
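The mask-juggling helpers above formed a small three-state gate for the driver's kernel threads: 'allowed' is policy, 'start' is a pending request, 'running' is occupancy, all under one spinlock. Condensed to its core (a sketch, not the deleted code verbatim):

#include <linux/spinlock.h>

struct thread_gate {
        spinlock_t lock;
        unsigned long allowed, start, running;
};

/* True if the caller may run 'thread' now; mirrors the deleted
 * __lcs_do_run_thread() handshake. */
static bool thread_gate_claim(struct thread_gate *g, unsigned long thread)
{
        bool ok = false;

        spin_lock(&g->lock);
        if ((g->start & thread) && (g->allowed & thread) &&
            !(g->running & thread)) {
                g->start &= ~thread;
                g->running |= thread;
                ok = true;
        }
        spin_unlock(&g->lock);
        return ok;
}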
*/ - card->state = DEV_STATE_DOWN; - card->tx_buffer = NULL; - card->tx_emitted = 0; - - init_waitqueue_head(&card->wait_q); - spin_lock_init(&card->lock); - spin_lock_init(&card->ipm_lock); - spin_lock_init(&card->mask_lock); -#ifdef CONFIG_IP_MULTICAST - INIT_LIST_HEAD(&card->ipm_list); -#endif - INIT_LIST_HEAD(&card->lancmd_waiters); -} - -static void lcs_clear_multicast_list(struct lcs_card *card) -{ -#ifdef CONFIG_IP_MULTICAST - struct lcs_ipm_list *ipm; - unsigned long flags; - - /* Free multicast list. */ - LCS_DBF_TEXT(3, setup, "clmclist"); - spin_lock_irqsave(&card->ipm_lock, flags); - while (!list_empty(&card->ipm_list)){ - ipm = list_entry(card->ipm_list.next, - struct lcs_ipm_list, list); - list_del(&ipm->list); - if (ipm->ipm_state != LCS_IPM_STATE_SET_REQUIRED){ - spin_unlock_irqrestore(&card->ipm_lock, flags); - lcs_send_delipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - } - kfree(ipm); - } - spin_unlock_irqrestore(&card->ipm_lock, flags); -#endif -} - -/* - * Cleanup channels,card and state machines. - */ -static void -lcs_cleanup_card(struct lcs_card *card) -{ - - LCS_DBF_TEXT(3, setup, "cleancrd"); - LCS_DBF_HEX(2,setup,&card,sizeof(void*)); - - if (card->dev != NULL) - free_netdev(card->dev); - /* Cleanup channels. */ - lcs_cleanup_channel(&card->write); - lcs_cleanup_channel(&card->read); -} - -/* - * Start channel. - */ -static int -lcs_start_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - LCS_DBF_TEXT_(4, trace,"ssch%s", dev_name(&channel->ccwdev->dev)); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_start(channel->ccwdev, - channel->ccws + channel->io_idx, 0, 0, - DOIO_DENY_PREFETCH | DOIO_ALLOW_SUSPEND); - if (rc == 0) - channel->state = LCS_CH_STATE_RUNNING; - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4,trace,"essh%s", - dev_name(&channel->ccwdev->dev)); - dev_err(&channel->ccwdev->dev, - "Starting an LCS device resulted in an error," - " rc=%d!\n", rc); - } - return rc; -} - -static int -lcs_clear_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4,trace,"clearch"); - LCS_DBF_TEXT_(4, trace, "%s", dev_name(&channel->ccwdev->dev)); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_clear(channel->ccwdev, 0); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ecsc%s", - dev_name(&channel->ccwdev->dev)); - return rc; - } - wait_event(channel->wait_q, (channel->state == LCS_CH_STATE_CLEARED)); - channel->state = LCS_CH_STATE_STOPPED; - return rc; -} - - -/* - * Stop channel. - */ -static int -lcs_stop_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - if (channel->state == LCS_CH_STATE_STOPPED) - return 0; - LCS_DBF_TEXT(4,trace,"haltsch"); - LCS_DBF_TEXT_(4, trace, "%s", dev_name(&channel->ccwdev->dev)); - channel->state = LCS_CH_STATE_INIT; - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_halt(channel->ccwdev, 0); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ehsc%s", - dev_name(&channel->ccwdev->dev)); - return rc; - } - /* Asynchronous halt initialted. Wait for its completion. 
*/ - wait_event(channel->wait_q, (channel->state == LCS_CH_STATE_HALTED)); - lcs_clear_channel(channel); - return 0; -} - -/* - * start read and write channel - */ -static int -lcs_start_channels(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(2, trace, "chstart"); - /* start read channel */ - rc = lcs_start_channel(&card->read); - if (rc) - return rc; - /* start write channel */ - rc = lcs_start_channel(&card->write); - if (rc) - lcs_stop_channel(&card->read); - return rc; -} - -/* - * stop read and write channel - */ -static int -lcs_stop_channels(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, trace, "chhalt"); - lcs_stop_channel(&card->read); - lcs_stop_channel(&card->write); - return 0; -} - -/* - * Get empty buffer. - */ -static struct lcs_buffer * -__lcs_get_buffer(struct lcs_channel *channel) -{ - int index; - - LCS_DBF_TEXT(5, trace, "_getbuff"); - index = channel->io_idx; - do { - if (channel->iob[index].state == LCS_BUF_STATE_EMPTY) { - channel->iob[index].state = LCS_BUF_STATE_LOCKED; - return channel->iob + index; - } - index = (index + 1) & (LCS_NUM_BUFFS - 1); - } while (index != channel->io_idx); - return NULL; -} - -static struct lcs_buffer * -lcs_get_buffer(struct lcs_channel *channel) -{ - struct lcs_buffer *buffer; - unsigned long flags; - - LCS_DBF_TEXT(5, trace, "getbuff"); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer = __lcs_get_buffer(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - return buffer; -} - -/* - * Resume channel program if the channel is suspended. - */ -static int -__lcs_resume_channel(struct lcs_channel *channel) -{ - int rc; - - if (channel->state != LCS_CH_STATE_SUSPENDED) - return 0; - if (channel->ccws[channel->io_idx].flags & CCW_FLAG_SUSPEND) - return 0; - LCS_DBF_TEXT_(5, trace, "rsch%s", dev_name(&channel->ccwdev->dev)); - rc = ccw_device_resume(channel->ccwdev); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ersc%s", - dev_name(&channel->ccwdev->dev)); - dev_err(&channel->ccwdev->dev, - "Sending data from the LCS device to the LAN failed" - " with rc=%d\n",rc); - } else - channel->state = LCS_CH_STATE_RUNNING; - return rc; - -} - -/* - * Make a buffer ready for processing. - */ -static void __lcs_ready_buffer_bits(struct lcs_channel *channel, int index) -{ - int prev, next; - - LCS_DBF_TEXT(5, trace, "rdybits"); - prev = (index - 1) & (LCS_NUM_BUFFS - 1); - next = (index + 1) & (LCS_NUM_BUFFS - 1); - /* Check if we may clear the suspend bit of this buffer. */ - if (channel->ccws[next].flags & CCW_FLAG_SUSPEND) { - /* Check if we have to set the PCI bit. */ - if (!(channel->ccws[prev].flags & CCW_FLAG_SUSPEND)) - /* Suspend bit of the previous buffer is not set. */ - channel->ccws[index].flags |= CCW_FLAG_PCI; - /* Suspend bit of the next buffer is set. */ - channel->ccws[index].flags &= ~CCW_FLAG_SUSPEND; - } -} - -static int -lcs_ready_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - unsigned long flags; - int index, rc; - - LCS_DBF_TEXT(5, trace, "rdybuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_LOCKED && - buffer->state != LCS_BUF_STATE_PROCESSED); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer->state = LCS_BUF_STATE_READY; - index = buffer - channel->iob; - /* Set length. */ - channel->ccws[index].count = buffer->count; - /* Check relevant PCI/suspend bits. 
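__lcs_get_buffer() and the ready/processed helpers below all advance ring indices with "& (LCS_NUM_BUFFS - 1)", the standard power-of-two modulo; it is why the buffer count had to stay a power of two. Generic form:

#define RING_SIZE 32    /* must be a power of two */

static inline unsigned int ring_next(unsigned int idx)
{
        return (idx + 1) & (RING_SIZE - 1);     /* cheap "% RING_SIZE" */
}

static inline unsigned int ring_prev(unsigned int idx)
{
        return (idx - 1) & (RING_SIZE - 1);     /* wraps correctly at 0 */
}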
*/ - __lcs_ready_buffer_bits(channel, index); - rc = __lcs_resume_channel(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - return rc; -} - -/* - * Mark the buffer as processed. Take care of the suspend bit - * of the previous buffer. This function is called from - * interrupt context, so the lock must not be taken. - */ -static int -__lcs_processed_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - int index, prev, next; - - LCS_DBF_TEXT(5, trace, "prcsbuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_READY); - buffer->state = LCS_BUF_STATE_PROCESSED; - index = buffer - channel->iob; - prev = (index - 1) & (LCS_NUM_BUFFS - 1); - next = (index + 1) & (LCS_NUM_BUFFS - 1); - /* Set the suspend bit and clear the PCI bit of this buffer. */ - channel->ccws[index].flags |= CCW_FLAG_SUSPEND; - channel->ccws[index].flags &= ~CCW_FLAG_PCI; - /* Check the suspend bit of the previous buffer. */ - if (channel->iob[prev].state == LCS_BUF_STATE_READY) { - /* - * Previous buffer is in state ready. It might have - * happened in lcs_ready_buffer that the suspend bit - * has not been cleared to avoid an endless loop. - * Do it now. - */ - __lcs_ready_buffer_bits(channel, prev); - } - /* Clear PCI bit of next buffer. */ - channel->ccws[next].flags &= ~CCW_FLAG_PCI; - return __lcs_resume_channel(channel); -} - -/* - * Put a processed buffer back to state empty. - */ -static void -lcs_release_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - unsigned long flags; - - LCS_DBF_TEXT(5, trace, "relbuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_LOCKED && - buffer->state != LCS_BUF_STATE_PROCESSED); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer->state = LCS_BUF_STATE_EMPTY; - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); -} - -/* - * Get buffer for a lan command. - */ -static struct lcs_buffer * -lcs_get_lancmd(struct lcs_card *card, int count) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(4, trace, "getlncmd"); - /* Get buffer and wait if none is available. */ - wait_event(card->write.wait_q, - ((buffer = lcs_get_buffer(&card->write)) != NULL)); - count += sizeof(struct lcs_header); - *(__u16 *)(buffer->data + count) = 0; - buffer->count = count + sizeof(__u16); - buffer->callback = lcs_release_buffer; - cmd = (struct lcs_cmd *) buffer->data; - cmd->offset = count; - cmd->type = LCS_FRAME_TYPE_CONTROL; - cmd->slot = 0; - return buffer; -} - - -static void -lcs_get_reply(struct lcs_reply *reply) -{ - refcount_inc(&reply->refcnt); -} - -static void -lcs_put_reply(struct lcs_reply *reply) -{ - if (refcount_dec_and_test(&reply->refcnt)) - kfree(reply); -} - -static struct lcs_reply * -lcs_alloc_reply(struct lcs_cmd *cmd) -{ - struct lcs_reply *reply; - - LCS_DBF_TEXT(4, trace, "getreply"); - - reply = kzalloc(sizeof(struct lcs_reply), GFP_ATOMIC); - if (!reply) - return NULL; - refcount_set(&reply->refcnt, 1); - reply->sequence_no = cmd->sequence_no; - reply->received = 0; - reply->rc = 0; - init_waitqueue_head(&reply->wait_q); - - return reply; -} - -/* - * Notifier function for lancmd replies. Called from read irq. 
- */ -static void -lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd) -{ - struct list_head *l, *n; - struct lcs_reply *reply; - - LCS_DBF_TEXT(4, trace, "notiwait"); - spin_lock(&card->lock); - list_for_each_safe(l, n, &card->lancmd_waiters) { - reply = list_entry(l, struct lcs_reply, list); - if (reply->sequence_no == cmd->sequence_no) { - lcs_get_reply(reply); - list_del_init(&reply->list); - if (reply->callback != NULL) - reply->callback(card, cmd); - reply->received = 1; - reply->rc = cmd->return_code; - wake_up(&reply->wait_q); - lcs_put_reply(reply); - break; - } - } - spin_unlock(&card->lock); -} - -/* - * Emit buffer of a lan command. - */ -static void -lcs_lancmd_timeout(struct timer_list *t) -{ - struct lcs_reply *reply = from_timer(reply, t, timer); - struct lcs_reply *list_reply, *r; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "timeout"); - spin_lock_irqsave(&reply->card->lock, flags); - list_for_each_entry_safe(list_reply, r, - &reply->card->lancmd_waiters,list) { - if (reply == list_reply) { - lcs_get_reply(reply); - list_del_init(&reply->list); - spin_unlock_irqrestore(&reply->card->lock, flags); - reply->received = 1; - reply->rc = -ETIME; - wake_up(&reply->wait_q); - lcs_put_reply(reply); - return; - } - } - spin_unlock_irqrestore(&reply->card->lock, flags); -} - -static int -lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, - void (*reply_callback)(struct lcs_card *, struct lcs_cmd *)) -{ - struct lcs_reply *reply; - struct lcs_cmd *cmd; - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4, trace, "sendcmd"); - cmd = (struct lcs_cmd *) buffer->data; - cmd->return_code = 0; - cmd->sequence_no = card->sequence_no++; - reply = lcs_alloc_reply(cmd); - if (!reply) - return -ENOMEM; - reply->callback = reply_callback; - reply->card = card; - spin_lock_irqsave(&card->lock, flags); - list_add_tail(&reply->list, &card->lancmd_waiters); - spin_unlock_irqrestore(&card->lock, flags); - - buffer->callback = lcs_release_buffer; - rc = lcs_ready_buffer(&card->write, buffer); - if (rc) - return rc; - timer_setup(&reply->timer, lcs_lancmd_timeout, 0); - mod_timer(&reply->timer, jiffies + HZ * card->lancmd_timeout); - wait_event(reply->wait_q, reply->received); - del_timer_sync(&reply->timer); - LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc); - rc = reply->rc; - lcs_put_reply(reply); - return rc ? 
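The lancmd machinery here is correlate-by-sequence-number: the sender parks a refcounted waiter on card->lancmd_waiters, the read-IRQ path matches the reply by sequence number and wakes it, and a timer delivers -ETIME if the card never answers. The IRQ-side matching, simplified (refcounting and the timer omitted):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

struct cmd_waiter {
        struct list_head list;
        u32 seq;
        int rc;
        bool done;
        wait_queue_head_t wq;
};

static void complete_waiter(struct list_head *waiters, spinlock_t *lock,
                            u32 seq, int rc)
{
        struct cmd_waiter *w;

        spin_lock(lock);
        list_for_each_entry(w, waiters, list) {
                if (w->seq != seq)
                        continue;
                list_del_init(&w->list);
                w->rc = rc;
                w->done = true;
                wake_up(&w->wq);        /* sender: wait_event(w->wq, w->done) */
                break;
        }
        spin_unlock(lock);
}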
-EIO : 0; -} - -/* - * LCS startup command - */ -static int -lcs_send_startup(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "startup"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STARTUP; - cmd->initiator = initiator; - cmd->cmd.lcs_startup.buff_size = LCS_IOBUFFERSIZE; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * LCS shutdown command - */ -static int -lcs_send_shutdown(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "shutdown"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_SHUTDOWN; - cmd->initiator = LCS_INITIATOR_TCPIP; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * LCS lanstat command - */ -static void -__lcs_lanstat_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "statcb"); - memcpy(card->mac, cmd->cmd.lcs_lanstat_cmd.mac_addr, LCS_MAC_LENGTH); -} - -static int -lcs_send_lanstat(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2,trace, "cmdstat"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - /* Setup lanstat command. */ - cmd->cmd_code = LCS_CMD_LANSTAT; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, __lcs_lanstat_cb); -} - -/* - * send stoplan command - */ -static int -lcs_send_stoplan(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdstpln"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STOPLAN; - cmd->initiator = initiator; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * send startlan command - */ -static void -__lcs_send_startlan_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "srtlancb"); - card->lan_type = cmd->cmd.lcs_std_cmd.lan_type; - card->portno = cmd->cmd.lcs_std_cmd.portno; -} - -static int -lcs_send_startlan(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdstaln"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STARTLAN; - cmd->initiator = initiator; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, __lcs_send_startlan_cb); -} - -#ifdef CONFIG_IP_MULTICAST -/* - * send setipm command (Multicast) - */ -static int -lcs_send_setipm(struct lcs_card *card,struct lcs_ipm_list *ipm_list) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdsetim"); - buffer = lcs_get_lancmd(card, LCS_MULTICAST_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_SETIPM; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - memcpy(cmd->cmd.lcs_qipassist.lcs_ipass_ctlmsg.ip_mac_pair, - &ipm_list->ipm, sizeof (struct 
lcs_ip_mac_pair)); - LCS_DBF_TEXT_(2, trace, "%x",ipm_list->ipm.ip_addr); - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * send delipm command (Multicast) - */ -static int -lcs_send_delipm(struct lcs_card *card,struct lcs_ipm_list *ipm_list) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmddelim"); - buffer = lcs_get_lancmd(card, LCS_MULTICAST_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_DELIPM; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - memcpy(cmd->cmd.lcs_qipassist.lcs_ipass_ctlmsg.ip_mac_pair, - &ipm_list->ipm, sizeof (struct lcs_ip_mac_pair)); - LCS_DBF_TEXT_(2, trace, "%x",ipm_list->ipm.ip_addr); - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * check if multicast is supported by LCS - */ -static void -__lcs_check_multicast_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "chkmccb"); - card->ip_assists_supported = - cmd->cmd.lcs_qipassist.ip_assists_supported; - card->ip_assists_enabled = - cmd->cmd.lcs_qipassist.ip_assists_enabled; -} - -static int -lcs_check_multicast_support(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - int rc; - - LCS_DBF_TEXT(2, trace, "cmdqipa"); - /* Send query ipassist. */ - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_QIPASSIST; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - rc = lcs_send_lancmd(card, buffer, __lcs_check_multicast_cb); - if (rc != 0) { - pr_err("Query IPAssist failed. Assuming unsupported!\n"); - return -EOPNOTSUPP; - } - if (card->ip_assists_supported & LCS_IPASS_MULTICAST_SUPPORT) - return 0; - return -EOPNOTSUPP; -} - -/* - * set or del multicast address on LCS card - */ -static void -lcs_fix_multicast_list(struct lcs_card *card) -{ - struct list_head failed_list; - struct lcs_ipm_list *ipm, *tmp; - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4,trace, "fixipm"); - INIT_LIST_HEAD(&failed_list); - spin_lock_irqsave(&card->ipm_lock, flags); -list_modified: - list_for_each_entry_safe(ipm, tmp, &card->ipm_list, list){ - switch (ipm->ipm_state) { - case LCS_IPM_STATE_SET_REQUIRED: - /* del from ipm_list so no one else can tamper with - * this entry */ - list_del_init(&ipm->list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - rc = lcs_send_setipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - if (rc) { - pr_info("Adding multicast address failed." 
- " Table possibly full!\n"); - /* store ipm in failed list -> will be added - * to ipm_list again, so a retry will be done - * during the next call of this function */ - list_add_tail(&ipm->list, &failed_list); - } else { - ipm->ipm_state = LCS_IPM_STATE_ON_CARD; - /* re-insert into ipm_list */ - list_add_tail(&ipm->list, &card->ipm_list); - } - goto list_modified; - case LCS_IPM_STATE_DEL_REQUIRED: - list_del(&ipm->list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - lcs_send_delipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - kfree(ipm); - goto list_modified; - case LCS_IPM_STATE_ON_CARD: - break; - } - } - /* re-insert all entries from the failed_list into ipm_list */ - list_for_each_entry_safe(ipm, tmp, &failed_list, list) - list_move_tail(&ipm->list, &card->ipm_list); - - spin_unlock_irqrestore(&card->ipm_lock, flags); -} - -/* - * get mac address for the relevant Multicast address - */ -static void -lcs_get_mac_for_ipm(__be32 ipm, char *mac, struct net_device *dev) -{ - LCS_DBF_TEXT(4,trace, "getmac"); - ip_eth_mc_map(ipm, mac); -} - -/* - * function called by net device to handle multicast address relevant things - */ -static void lcs_remove_mc_addresses(struct lcs_card *card, - struct in_device *in4_dev) -{ - struct ip_mc_list *im4; - struct list_head *l; - struct lcs_ipm_list *ipm; - unsigned long flags; - char buf[MAX_ADDR_LEN]; - - LCS_DBF_TEXT(4, trace, "remmclst"); - spin_lock_irqsave(&card->ipm_lock, flags); - list_for_each(l, &card->ipm_list) { - ipm = list_entry(l, struct lcs_ipm_list, list); - for (im4 = rcu_dereference(in4_dev->mc_list); - im4 != NULL; im4 = rcu_dereference(im4->next_rcu)) { - lcs_get_mac_for_ipm(im4->multiaddr, buf, card->dev); - if ( (ipm->ipm.ip_addr == im4->multiaddr) && - (memcmp(buf, &ipm->ipm.mac_addr, - LCS_MAC_LENGTH) == 0) ) - break; - } - if (im4 == NULL) - ipm->ipm_state = LCS_IPM_STATE_DEL_REQUIRED; - } - spin_unlock_irqrestore(&card->ipm_lock, flags); -} - -static struct lcs_ipm_list *lcs_check_addr_entry(struct lcs_card *card, - struct ip_mc_list *im4, - char *buf) -{ - struct lcs_ipm_list *tmp, *ipm = NULL; - struct list_head *l; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "chkmcent"); - spin_lock_irqsave(&card->ipm_lock, flags); - list_for_each(l, &card->ipm_list) { - tmp = list_entry(l, struct lcs_ipm_list, list); - if ( (tmp->ipm.ip_addr == im4->multiaddr) && - (memcmp(buf, &tmp->ipm.mac_addr, - LCS_MAC_LENGTH) == 0) ) { - ipm = tmp; - break; - } - } - spin_unlock_irqrestore(&card->ipm_lock, flags); - return ipm; -} - -static void lcs_set_mc_addresses(struct lcs_card *card, - struct in_device *in4_dev) -{ - - struct ip_mc_list *im4; - struct lcs_ipm_list *ipm; - char buf[MAX_ADDR_LEN]; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "setmclst"); - for (im4 = rcu_dereference(in4_dev->mc_list); im4 != NULL; - im4 = rcu_dereference(im4->next_rcu)) { - lcs_get_mac_for_ipm(im4->multiaddr, buf, card->dev); - ipm = lcs_check_addr_entry(card, im4, buf); - if (ipm != NULL) - continue; /* Address already in list. 
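The "goto list_modified" above is the restart-the-walk idiom: lcs_send_setipm()/lcs_send_delipm() can sleep, so ipm_lock is dropped mid-iteration, and any list iterator held across that window is stale. A generic rendering (re-insertion and freeing of entries elided):

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
        struct list_head node;
        bool needs_work;
};

static void process_all(struct list_head *head, spinlock_t *lock)
{
        struct item *e, *tmp;

restart:
        spin_lock(lock);
        list_for_each_entry_safe(e, tmp, head, node) {
                if (!e->needs_work)
                        continue;
                e->needs_work = false;
                list_del_init(&e->node);
                spin_unlock(lock);
                /* sleeping work on e goes here; the list may mutate */
                goto restart;   /* iterator is stale once the lock was dropped */
        }
        spin_unlock(lock);
}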
*/ - ipm = kzalloc(sizeof(struct lcs_ipm_list), GFP_ATOMIC); - if (ipm == NULL) { - pr_info("Not enough memory to add" - " new multicast entry!\n"); - break; - } - memcpy(&ipm->ipm.mac_addr, buf, LCS_MAC_LENGTH); - ipm->ipm.ip_addr = im4->multiaddr; - ipm->ipm_state = LCS_IPM_STATE_SET_REQUIRED; - spin_lock_irqsave(&card->ipm_lock, flags); - LCS_DBF_HEX(2,trace,&ipm->ipm.ip_addr,4); - list_add(&ipm->list, &card->ipm_list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - } -} - -static int -lcs_register_mc_addresses(void *data) -{ - struct lcs_card *card; - struct in_device *in4_dev; - - card = (struct lcs_card *) data; - - if (!lcs_do_run_thread(card, LCS_SET_MC_THREAD)) - return 0; - LCS_DBF_TEXT(4, trace, "regmulti"); - - in4_dev = in_dev_get(card->dev); - if (in4_dev == NULL) - goto out; - rcu_read_lock(); - lcs_remove_mc_addresses(card,in4_dev); - lcs_set_mc_addresses(card, in4_dev); - rcu_read_unlock(); - in_dev_put(in4_dev); - - netif_carrier_off(card->dev); - netif_tx_disable(card->dev); - wait_event(card->write.wait_q, - (card->write.state != LCS_CH_STATE_RUNNING)); - lcs_fix_multicast_list(card); - if (card->state == DEV_STATE_UP) { - netif_carrier_on(card->dev); - netif_wake_queue(card->dev); - } -out: - lcs_clear_thread_running_bit(card, LCS_SET_MC_THREAD); - return 0; -} -#endif /* CONFIG_IP_MULTICAST */ - -/* - * function called by net device to - * handle multicast address relevant things - */ -static void -lcs_set_multicast_list(struct net_device *dev) -{ -#ifdef CONFIG_IP_MULTICAST - struct lcs_card *card; - - LCS_DBF_TEXT(4, trace, "setmulti"); - card = (struct lcs_card *) dev->ml_priv; - - if (!lcs_set_thread_start_bit(card, LCS_SET_MC_THREAD)) - schedule_work(&card->kernel_thread_starter); -#endif /* CONFIG_IP_MULTICAST */ -} - -static long -lcs_check_irb_error(struct ccw_device *cdev, struct irb *irb) -{ - if (!IS_ERR(irb)) - return 0; - - switch (PTR_ERR(irb)) { - case -EIO: - dev_warn(&cdev->dev, - "An I/O-error occurred on the LCS device\n"); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT_(2, trace, " rc%d", -EIO); - break; - case -ETIMEDOUT: - dev_warn(&cdev->dev, - "A command timed out on the LCS device\n"); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT_(2, trace, " rc%d", -ETIMEDOUT); - break; - default: - dev_warn(&cdev->dev, - "An error occurred on the LCS device, rc=%ld\n", - PTR_ERR(irb)); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT(2, trace, " rc???"); - } - return PTR_ERR(irb); -} - -static int -lcs_get_problem(struct ccw_device *cdev, struct irb *irb) -{ - int dstat, cstat; - char *sense; - - sense = (char *) irb->ecw; - cstat = irb->scsw.cmd.cstat; - dstat = irb->scsw.cmd.dstat; - - if (cstat & (SCHN_STAT_CHN_CTRL_CHK | SCHN_STAT_INTF_CTRL_CHK | - SCHN_STAT_CHN_DATA_CHK | SCHN_STAT_CHAIN_CHECK | - SCHN_STAT_PROT_CHECK | SCHN_STAT_PROG_CHECK)) { - LCS_DBF_TEXT(2, trace, "CGENCHK"); - return 1; - } - if (dstat & DEV_STAT_UNIT_CHECK) { - if (sense[LCS_SENSE_BYTE_1] & - LCS_SENSE_RESETTING_EVENT) { - LCS_DBF_TEXT(2, trace, "REVIND"); - return 1; - } - if (sense[LCS_SENSE_BYTE_0] & - LCS_SENSE_CMD_REJECT) { - LCS_DBF_TEXT(2, trace, "CMDREJ"); - return 0; - } - if ((!sense[LCS_SENSE_BYTE_0]) && - (!sense[LCS_SENSE_BYTE_1]) && - (!sense[LCS_SENSE_BYTE_2]) && - (!sense[LCS_SENSE_BYTE_3])) { - LCS_DBF_TEXT(2, trace, "ZEROSEN"); - return 0; - } - LCS_DBF_TEXT(2, trace, "DGENCHK"); - return 1; - } - return 0; -} - -static void -lcs_schedule_recovery(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, trace, "startrec"); - if 
(!lcs_set_thread_start_bit(card, LCS_RECOVERY_THREAD)) - schedule_work(&card->kernel_thread_starter); -} - -/* - * IRQ Handler for LCS channels - */ -static void -lcs_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb) -{ - struct lcs_card *card; - struct lcs_channel *channel; - int rc, index; - int cstat, dstat; - - if (lcs_check_irb_error(cdev, irb)) - return; - - card = CARD_FROM_DEV(cdev); - if (card->read.ccwdev == cdev) - channel = &card->read; - else - channel = &card->write; - - cstat = irb->scsw.cmd.cstat; - dstat = irb->scsw.cmd.dstat; - LCS_DBF_TEXT_(5, trace, "Rint%s", dev_name(&cdev->dev)); - LCS_DBF_TEXT_(5, trace, "%4x%4x", irb->scsw.cmd.cstat, - irb->scsw.cmd.dstat); - LCS_DBF_TEXT_(5, trace, "%4x%4x", irb->scsw.cmd.fctl, - irb->scsw.cmd.actl); - - /* Check for channel and device errors presented */ - rc = lcs_get_problem(cdev, irb); - if (rc || (dstat & DEV_STAT_UNIT_EXCEP)) { - dev_warn(&cdev->dev, - "The LCS device stopped because of an error," - " dstat=0x%X, cstat=0x%X \n", - dstat, cstat); - if (rc) { - channel->state = LCS_CH_STATE_ERROR; - } - } - if (channel->state == LCS_CH_STATE_ERROR) { - lcs_schedule_recovery(card); - wake_up(&card->wait_q); - return; - } - /* How far in the ccw chain have we processed? */ - if ((channel->state != LCS_CH_STATE_INIT) && - (irb->scsw.cmd.fctl & SCSW_FCTL_START_FUNC) && - (irb->scsw.cmd.cpa != 0)) { - index = (struct ccw1 *)dma32_to_virt(irb->scsw.cmd.cpa) - - channel->ccws; - if ((irb->scsw.cmd.actl & SCSW_ACTL_SUSPENDED) || - (irb->scsw.cmd.cstat & SCHN_STAT_PCI)) - /* Bloody io subsystem tells us lies about cpa... */ - index = (index - 1) & (LCS_NUM_BUFFS - 1); - while (channel->io_idx != index) { - __lcs_processed_buffer(channel, - channel->iob + channel->io_idx); - channel->io_idx = - (channel->io_idx + 1) & (LCS_NUM_BUFFS - 1); - } - } - - if ((irb->scsw.cmd.dstat & DEV_STAT_DEV_END) || - (irb->scsw.cmd.dstat & DEV_STAT_CHN_END) || - (irb->scsw.cmd.dstat & DEV_STAT_UNIT_CHECK)) - /* Mark channel as stopped. */ - channel->state = LCS_CH_STATE_STOPPED; - else if (irb->scsw.cmd.actl & SCSW_ACTL_SUSPENDED) - /* CCW execution stopped on a suspend bit. */ - channel->state = LCS_CH_STATE_SUSPENDED; - if (irb->scsw.cmd.fctl & SCSW_FCTL_HALT_FUNC) { - if (irb->scsw.cmd.cc != 0) { - ccw_device_halt(channel->ccwdev, 0); - return; - } - /* The channel has been stopped by halt_IO. */ - channel->state = LCS_CH_STATE_HALTED; - } - if (irb->scsw.cmd.fctl & SCSW_FCTL_CLEAR_FUNC) - channel->state = LCS_CH_STATE_CLEARED; - /* Do the rest in the tasklet. */ - tasklet_schedule(&channel->irq_tasklet); -} - -/* - * Tasklet for IRQ handler - */ -static void -lcs_tasklet(unsigned long data) -{ - unsigned long flags; - struct lcs_channel *channel; - struct lcs_buffer *iob; - int buf_idx; - - channel = (struct lcs_channel *) data; - LCS_DBF_TEXT_(5, trace, "tlet%s", dev_name(&channel->ccwdev->dev)); - - /* Check for processed buffers. */ - iob = channel->iob; - buf_idx = channel->buf_idx; - while (iob[buf_idx].state == LCS_BUF_STATE_PROCESSED) { - /* Do the callback thing. 
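	   For write buffers this is lcs_txbuffer_cb(), set in
	   __lcs_start_xmit(); read buffers use lcs_get_frames_cb().
	   Either way the callback runs here in tasklet context, not in
	   the IRQ handler itself.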
*/ - if (iob[buf_idx].callback != NULL) - iob[buf_idx].callback(channel, iob + buf_idx); - buf_idx = (buf_idx + 1) & (LCS_NUM_BUFFS - 1); - } - channel->buf_idx = buf_idx; - - if (channel->state == LCS_CH_STATE_STOPPED) - lcs_start_channel(channel); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - if (channel->state == LCS_CH_STATE_SUSPENDED && - channel->iob[channel->io_idx].state == LCS_BUF_STATE_READY) - __lcs_resume_channel(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - - /* Something happened on the channel. Wake up waiters. */ - wake_up(&channel->wait_q); -} - -/* - * Finish current tx buffer and make it ready for transmit. - */ -static void -__lcs_emit_txbuffer(struct lcs_card *card) -{ - LCS_DBF_TEXT(5, trace, "emittx"); - *(__u16 *)(card->tx_buffer->data + card->tx_buffer->count) = 0; - card->tx_buffer->count += 2; - lcs_ready_buffer(&card->write, card->tx_buffer); - card->tx_buffer = NULL; - card->tx_emitted++; -} - -/* - * Callback for finished tx buffers. - */ -static void -lcs_txbuffer_cb(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(5, trace, "txbuffcb"); - /* Put buffer back to pool. */ - lcs_release_buffer(channel, buffer); - card = container_of(channel, struct lcs_card, write); - if (netif_queue_stopped(card->dev) && netif_carrier_ok(card->dev)) - netif_wake_queue(card->dev); - spin_lock(&card->lock); - card->tx_emitted--; - if (card->tx_emitted <= 0 && card->tx_buffer != NULL) - /* - * Last running tx buffer has finished. Submit partially - * filled current buffer. - */ - __lcs_emit_txbuffer(card); - spin_unlock(&card->lock); -} - -/* - * Packet transmit function called by network stack - */ -static netdev_tx_t __lcs_start_xmit(struct lcs_card *card, struct sk_buff *skb, - struct net_device *dev) -{ - struct lcs_header *header; - int rc = NETDEV_TX_OK; - - LCS_DBF_TEXT(5, trace, "hardxmit"); - if (skb == NULL) { - card->stats.tx_dropped++; - card->stats.tx_errors++; - return NETDEV_TX_OK; - } - if (card->state != DEV_STATE_UP) { - dev_kfree_skb(skb); - card->stats.tx_dropped++; - card->stats.tx_errors++; - card->stats.tx_carrier_errors++; - return NETDEV_TX_OK; - } - if (skb->protocol == htons(ETH_P_IPV6)) { - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - netif_stop_queue(card->dev); - spin_lock(&card->lock); - if (card->tx_buffer != NULL && - card->tx_buffer->count + sizeof(struct lcs_header) + - skb->len + sizeof(u16) > LCS_IOBUFFERSIZE) - /* skb too big for current tx buffer. */ - __lcs_emit_txbuffer(card); - if (card->tx_buffer == NULL) { - /* Get new tx buffer */ - card->tx_buffer = lcs_get_buffer(&card->write); - if (card->tx_buffer == NULL) { - card->stats.tx_dropped++; - rc = NETDEV_TX_BUSY; - goto out; - } - card->tx_buffer->callback = lcs_txbuffer_cb; - card->tx_buffer->count = 0; - } - header = (struct lcs_header *) - (card->tx_buffer->data + card->tx_buffer->count); - card->tx_buffer->count += skb->len + sizeof(struct lcs_header); - header->offset = card->tx_buffer->count; - header->type = card->lan_type; - header->slot = card->portno; - skb_copy_from_linear_data(skb, header + 1, skb->len); - spin_unlock(&card->lock); - card->stats.tx_bytes += skb->len; - card->stats.tx_packets++; - dev_kfree_skb(skb); - netif_wake_queue(card->dev); - spin_lock(&card->lock); - if (card->tx_emitted <= 0 && card->tx_buffer != NULL) - /* If this is the first tx buffer emit it immediately. 
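	   Otherwise leave it to lcs_txbuffer_cb(), which emits the
	   partially filled buffer once the last in-flight transmission
	   finishes.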
*/ - __lcs_emit_txbuffer(card); -out: - spin_unlock(&card->lock); - return rc; -} - -static netdev_tx_t lcs_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(5, trace, "pktxmit"); - card = (struct lcs_card *) dev->ml_priv; - rc = __lcs_start_xmit(card, skb, dev); - return rc; -} - -/* - * send startlan and lanstat command to make LCS device ready - */ -static int -lcs_startlan_auto(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(2, trace, "strtauto"); - card->lan_type = LCS_FRAME_TYPE_ENET; - rc = lcs_send_startlan(card, LCS_INITIATOR_TCPIP); - if (rc == 0) - return 0; - - return -EIO; -} - -static int -lcs_startlan(struct lcs_card *card) -{ - int rc, i; - - LCS_DBF_TEXT(2, trace, "startlan"); - rc = 0; - if (card->portno != LCS_INVALID_PORT_NO) { - if (card->lan_type == LCS_FRAME_TYPE_AUTO) - rc = lcs_startlan_auto(card); - else - rc = lcs_send_startlan(card, LCS_INITIATOR_TCPIP); - } else { - for (i = 0; i <= 16; i++) { - card->portno = i; - if (card->lan_type != LCS_FRAME_TYPE_AUTO) - rc = lcs_send_startlan(card, - LCS_INITIATOR_TCPIP); - else - /* autodetecting lan type */ - rc = lcs_startlan_auto(card); - if (rc == 0) - break; - } - } - if (rc == 0) - return lcs_send_lanstat(card); - return rc; -} - -/* - * LCS detect function - * setup channels and make them I/O ready - */ -static int -lcs_detect(struct lcs_card *card) -{ - int rc = 0; - - LCS_DBF_TEXT(2, setup, "lcsdetct"); - /* start/reset card */ - if (card->dev) - netif_stop_queue(card->dev); - rc = lcs_stop_channels(card); - if (rc == 0) { - rc = lcs_start_channels(card); - if (rc == 0) { - rc = lcs_send_startup(card, LCS_INITIATOR_TCPIP); - if (rc == 0) - rc = lcs_startlan(card); - } - } - if (rc == 0) { - card->state = DEV_STATE_UP; - } else { - card->state = DEV_STATE_DOWN; - card->write.state = LCS_CH_STATE_INIT; - card->read.state = LCS_CH_STATE_INIT; - } - return rc; -} - -/* - * LCS Stop card - */ -static int -lcs_stopcard(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(3, setup, "stopcard"); - - if (card->read.state != LCS_CH_STATE_STOPPED && - card->write.state != LCS_CH_STATE_STOPPED && - card->read.state != LCS_CH_STATE_ERROR && - card->write.state != LCS_CH_STATE_ERROR && - card->state == DEV_STATE_UP) { - lcs_clear_multicast_list(card); - rc = lcs_send_stoplan(card,LCS_INITIATOR_TCPIP); - rc = lcs_send_shutdown(card); - } - rc = lcs_stop_channels(card); - card->state = DEV_STATE_DOWN; - - return rc; -} - -/* - * Kernel Thread helper functions for LGW initiated commands - */ -static void -lcs_start_kernel_thread(struct work_struct *work) -{ - struct lcs_card *card = container_of(work, struct lcs_card, kernel_thread_starter); - LCS_DBF_TEXT(5, trace, "krnthrd"); - if (lcs_do_start_thread(card, LCS_RECOVERY_THREAD)) - kthread_run(lcs_recovery, card, "lcs_recover"); -#ifdef CONFIG_IP_MULTICAST - if (lcs_do_start_thread(card, LCS_SET_MC_THREAD)) - kthread_run(lcs_register_mc_addresses, card, "regipm"); -#endif -} - -/* - * Process control frames. 
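 * Frames initiated by the LAN gateway (LCS_INITIATOR_LGW) either
 * trigger recovery or take the carrier down; anything else is a
 * reply to one of our own commands and is handed to the waiters via
 * lcs_notify_lancmd_waiters().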
- */ -static void -lcs_get_control(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(5, trace, "getctrl"); - if (cmd->initiator == LCS_INITIATOR_LGW) { - switch(cmd->cmd_code) { - case LCS_CMD_STARTUP: - case LCS_CMD_STARTLAN: - lcs_schedule_recovery(card); - break; - case LCS_CMD_STOPLAN: - if (card->dev) { - pr_warn("Stoplan for %s initiated by LGW\n", - card->dev->name); - netif_carrier_off(card->dev); - } - break; - default: - LCS_DBF_TEXT(5, trace, "noLGWcmd"); - break; - } - } else - lcs_notify_lancmd_waiters(card, cmd); -} - -/* - * Unpack network packet. - */ -static void -lcs_get_skb(struct lcs_card *card, char *skb_data, unsigned int skb_len) -{ - struct sk_buff *skb; - - LCS_DBF_TEXT(5, trace, "getskb"); - if (card->dev == NULL || - card->state != DEV_STATE_UP) - /* The card isn't up. Ignore the packet. */ - return; - - skb = dev_alloc_skb(skb_len); - if (skb == NULL) { - dev_err(&card->dev->dev, - " Allocating a socket buffer to interface %s failed\n", - card->dev->name); - card->stats.rx_dropped++; - return; - } - skb_put_data(skb, skb_data, skb_len); - skb->protocol = card->lan_type_trans(skb, card->dev); - card->stats.rx_bytes += skb_len; - card->stats.rx_packets++; - if (skb->protocol == htons(ETH_P_802_2)) - *((__u32 *)skb->cb) = ++card->pkt_seq; - netif_rx(skb); -} - -/* - * LCS main routine to get packets and lancmd replies from the buffers - */ -static void -lcs_get_frames_cb(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - struct lcs_card *card; - struct lcs_header *lcs_hdr; - __u16 offset; - - LCS_DBF_TEXT(5, trace, "lcsgtpkt"); - lcs_hdr = (struct lcs_header *) buffer->data; - if (lcs_hdr->offset == LCS_ILLEGAL_OFFSET) { - LCS_DBF_TEXT(4, trace, "-eiogpkt"); - return; - } - card = container_of(channel, struct lcs_card, read); - offset = 0; - while (lcs_hdr->offset != 0) { - if (lcs_hdr->offset <= 0 || - lcs_hdr->offset > LCS_IOBUFFERSIZE || - lcs_hdr->offset < offset) { - /* Offset invalid. */ - card->stats.rx_length_errors++; - card->stats.rx_errors++; - return; - } - if (lcs_hdr->type == LCS_FRAME_TYPE_CONTROL) - lcs_get_control(card, (struct lcs_cmd *) lcs_hdr); - else if (lcs_hdr->type == LCS_FRAME_TYPE_ENET) - lcs_get_skb(card, (char *)(lcs_hdr + 1), - lcs_hdr->offset - offset - - sizeof(struct lcs_header)); - else - dev_info_once(&card->dev->dev, - "Unknown frame type %d\n", - lcs_hdr->type); - offset = lcs_hdr->offset; - lcs_hdr->offset = LCS_ILLEGAL_OFFSET; - lcs_hdr = (struct lcs_header *) (buffer->data + offset); - } - /* The buffer is now empty. Make it ready again. 
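	   lcs_ready_buffer() hands it back to the read channel so the
	   adapter can fill it with the next batch of frames.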
*/ - lcs_ready_buffer(&card->read, buffer); -} - -/* - * get network statistics for ifconfig and other user programs - */ -static struct net_device_stats * -lcs_getstats(struct net_device *dev) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(4, trace, "netstats"); - card = (struct lcs_card *) dev->ml_priv; - return &card->stats; -} - -/* - * stop lcs device - * This function will be called by user doing ifconfig xxx down - */ -static int -lcs_stop_device(struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, trace, "stopdev"); - card = (struct lcs_card *) dev->ml_priv; - netif_carrier_off(dev); - netif_tx_disable(dev); - dev->flags &= ~IFF_UP; - wait_event(card->write.wait_q, - (card->write.state != LCS_CH_STATE_RUNNING)); - rc = lcs_stopcard(card); - if (rc) - dev_err(&card->dev->dev, - " Shutting down the LCS device failed\n"); - return rc; -} - -/* - * start lcs device and make it runnable - * This function will be called by user doing ifconfig xxx up - */ -static int -lcs_open_device(struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, trace, "opendev"); - card = (struct lcs_card *) dev->ml_priv; - /* initialize statistics */ - rc = lcs_detect(card); - if (rc) { - pr_err("Error in opening device!\n"); - - } else { - dev->flags |= IFF_UP; - netif_carrier_on(dev); - netif_wake_queue(dev); - card->state = DEV_STATE_UP; - } - return rc; -} - -/* - * show function for portno called by cat or similar things - */ -static ssize_t -lcs_portno_show (struct device *dev, struct device_attribute *attr, char *buf) -{ - struct lcs_card *card; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - return sysfs_emit(buf, "%d\n", card->portno); -} - -/* - * store the value which is piped to file portno - */ -static ssize_t -lcs_portno_store (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - struct lcs_card *card; - int rc; - s16 value; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - rc = kstrtos16(buf, 0, &value); - if (rc) - return -EINVAL; - /* TODO: sanity checks */ - card->portno = value; - if (card->dev) - card->dev->dev_port = card->portno; - - return count; - -} - -static DEVICE_ATTR(portno, 0644, lcs_portno_show, lcs_portno_store); - -static const char *lcs_type[] = { - "not a channel", - "2216 parallel", - "2216 channel", - "OSA LCS card", - "unknown channel type", - "unsupported channel type", -}; - -static ssize_t -lcs_type_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct ccwgroup_device *cgdev; - - cgdev = to_ccwgroupdev(dev); - if (!cgdev) - return -ENODEV; - - return sysfs_emit(buf, "%s\n", - lcs_type[cgdev->cdev[0]->id.driver_info]); -} - -static DEVICE_ATTR(type, 0444, lcs_type_show, NULL); - -static ssize_t -lcs_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct lcs_card *card; - - card = dev_get_drvdata(dev); - - return card ? 
sysfs_emit(buf, "%u\n", card->lancmd_timeout) : 0; -} - -static ssize_t -lcs_timeout_store (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - struct lcs_card *card; - unsigned int value; - int rc; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - rc = kstrtouint(buf, 0, &value); - if (rc) - return -EINVAL; - /* TODO: sanity checks */ - card->lancmd_timeout = value; - - return count; - -} - -static DEVICE_ATTR(lancmd_timeout, 0644, lcs_timeout_show, lcs_timeout_store); - -static ssize_t -lcs_dev_recover_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct lcs_card *card = dev_get_drvdata(dev); - char *tmp; - int i; - - if (!card) - return -EINVAL; - if (card->state != DEV_STATE_UP) - return -EPERM; - i = simple_strtoul(buf, &tmp, 16); - if (i == 1) - lcs_schedule_recovery(card); - return count; -} - -static DEVICE_ATTR(recover, 0200, NULL, lcs_dev_recover_store); - -static struct attribute * lcs_attrs[] = { - &dev_attr_portno.attr, - &dev_attr_type.attr, - &dev_attr_lancmd_timeout.attr, - &dev_attr_recover.attr, - NULL, -}; -static struct attribute_group lcs_attr_group = { - .attrs = lcs_attrs, -}; -static const struct attribute_group *lcs_attr_groups[] = { - &lcs_attr_group, - NULL, -}; -static const struct device_type lcs_devtype = { - .name = "lcs", - .groups = lcs_attr_groups, -}; - -/* - * lcs_probe_device is called on establishing a new ccwgroup_device. - */ -static int -lcs_probe_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - if (!get_device(&ccwgdev->dev)) - return -ENODEV; - - LCS_DBF_TEXT(2, setup, "add_dev"); - card = lcs_alloc_card(); - if (!card) { - LCS_DBF_TEXT_(2, setup, " rc%d", -ENOMEM); - put_device(&ccwgdev->dev); - return -ENOMEM; - } - dev_set_drvdata(&ccwgdev->dev, card); - ccwgdev->cdev[0]->handler = lcs_irq; - ccwgdev->cdev[1]->handler = lcs_irq; - card->gdev = ccwgdev; - INIT_WORK(&card->kernel_thread_starter, lcs_start_kernel_thread); - card->thread_start_mask = 0; - card->thread_allowed_mask = 0; - card->thread_running_mask = 0; - ccwgdev->dev.type = &lcs_devtype; - - return 0; -} - -static int -lcs_register_netdev(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(2, setup, "regnetdv"); - card = dev_get_drvdata(&ccwgdev->dev); - if (card->dev->reg_state != NETREG_UNINITIALIZED) - return 0; - SET_NETDEV_DEV(card->dev, &ccwgdev->dev); - return register_netdev(card->dev); -} - -/* - * lcs_new_device will be called by setting the group device online. 
- */ -static const struct net_device_ops lcs_netdev_ops = { - .ndo_open = lcs_open_device, - .ndo_stop = lcs_stop_device, - .ndo_get_stats = lcs_getstats, - .ndo_start_xmit = lcs_start_xmit, -}; - -static const struct net_device_ops lcs_mc_netdev_ops = { - .ndo_open = lcs_open_device, - .ndo_stop = lcs_stop_device, - .ndo_get_stats = lcs_getstats, - .ndo_start_xmit = lcs_start_xmit, - .ndo_set_rx_mode = lcs_set_multicast_list, -}; - -static int -lcs_new_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - struct net_device *dev=NULL; - enum lcs_dev_states recover_state; - int rc; - - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return -ENODEV; - - LCS_DBF_TEXT(2, setup, "newdev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - card->read.ccwdev = ccwgdev->cdev[0]; - card->write.ccwdev = ccwgdev->cdev[1]; - - recover_state = card->state; - rc = ccw_device_set_online(card->read.ccwdev); - if (rc) - goto out_err; - rc = ccw_device_set_online(card->write.ccwdev); - if (rc) - goto out_werr; - - LCS_DBF_TEXT(3, setup, "lcsnewdv"); - - lcs_setup_card(card); - rc = lcs_detect(card); - if (rc) { - LCS_DBF_TEXT(2, setup, "dtctfail"); - dev_err(&ccwgdev->dev, - "Detecting a network adapter for LCS devices" - " failed with rc=%d (0x%x)\n", rc, rc); - lcs_stopcard(card); - goto out; - } - if (card->dev) { - LCS_DBF_TEXT(2, setup, "samedev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - goto netdev_out; - } - switch (card->lan_type) { - case LCS_FRAME_TYPE_ENET: - card->lan_type_trans = eth_type_trans; - dev = alloc_etherdev(0); - break; - default: - LCS_DBF_TEXT(3, setup, "errinit"); - pr_err(" Initialization failed\n"); - goto out; - } - if (!dev) - goto out; - card->dev = dev; - card->dev->ml_priv = card; - card->dev->netdev_ops = &lcs_netdev_ops; - card->dev->dev_port = card->portno; - eth_hw_addr_set(card->dev, card->mac); -#ifdef CONFIG_IP_MULTICAST - if (!lcs_check_multicast_support(card)) - card->dev->netdev_ops = &lcs_mc_netdev_ops; -#endif -netdev_out: - lcs_set_allowed_threads(card,0xffffffff); - if (recover_state == DEV_STATE_RECOVER) { - lcs_set_multicast_list(card->dev); - card->dev->flags |= IFF_UP; - netif_carrier_on(card->dev); - netif_wake_queue(card->dev); - card->state = DEV_STATE_UP; - } else { - lcs_stopcard(card); - } - - if (lcs_register_netdev(ccwgdev) != 0) - goto out; - - /* Print out supported assists: IPv6 */ - pr_info("LCS device %s %s IPv6 support\n", card->dev->name, - (card->ip_assists_supported & LCS_IPASS_IPV6_SUPPORT) ? - "with" : "without"); - /* Print out supported assist: Multicast */ - pr_info("LCS device %s %s Multicast support\n", card->dev->name, - (card->ip_assists_supported & LCS_IPASS_MULTICAST_SUPPORT) ? - "with" : "without"); - return 0; -out: - - ccw_device_set_offline(card->write.ccwdev); -out_werr: - ccw_device_set_offline(card->read.ccwdev); -out_err: - return -ENODEV; -} - -/* - * lcs_shutdown_device, called when setting the group device offline. 
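 * With recovery_mode == 0 the helper threads are stopped first; the
 * recovery thread itself calls this with recovery_mode == 1 to skip
 * that step.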
- */ -static int -__lcs_shutdown_device(struct ccwgroup_device *ccwgdev, int recovery_mode) -{ - struct lcs_card *card; - enum lcs_dev_states recover_state; - int ret = 0, ret2 = 0, ret3 = 0; - - LCS_DBF_TEXT(3, setup, "shtdndev"); - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return -ENODEV; - if (recovery_mode == 0) { - lcs_set_allowed_threads(card, 0); - if (lcs_wait_for_threads(card, LCS_SET_MC_THREAD)) - return -ERESTARTSYS; - } - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - recover_state = card->state; - - ret = lcs_stop_device(card->dev); - ret2 = ccw_device_set_offline(card->read.ccwdev); - ret3 = ccw_device_set_offline(card->write.ccwdev); - if (!ret) - ret = (ret2) ? ret2 : ret3; - if (ret) - LCS_DBF_TEXT_(3, setup, "1err:%d", ret); - if (recover_state == DEV_STATE_UP) { - card->state = DEV_STATE_RECOVER; - } - return 0; -} - -static int -lcs_shutdown_device(struct ccwgroup_device *ccwgdev) -{ - return __lcs_shutdown_device(ccwgdev, 0); -} - -/* - * drive lcs recovery after startup and startlan initiated by Lan Gateway - */ -static int -lcs_recovery(void *ptr) -{ - struct lcs_card *card; - struct ccwgroup_device *gdev; - int rc; - - card = (struct lcs_card *) ptr; - - LCS_DBF_TEXT(4, trace, "recover1"); - if (!lcs_do_run_thread(card, LCS_RECOVERY_THREAD)) - return 0; - LCS_DBF_TEXT(4, trace, "recover2"); - gdev = card->gdev; - dev_warn(&gdev->dev, - "A recovery process has been started for the LCS device\n"); - rc = __lcs_shutdown_device(gdev, 1); - rc = lcs_new_device(gdev); - if (!rc) - pr_info("Device %s successfully recovered!\n", - card->dev->name); - else - pr_info("Device %s could not be recovered!\n", - card->dev->name); - lcs_clear_thread_running_bit(card, LCS_RECOVERY_THREAD); - return 0; -} - -/* - * lcs_remove_device, free buffers and card - */ -static void -lcs_remove_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return; - - LCS_DBF_TEXT(3, setup, "remdev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - if (ccwgdev->state == CCWGROUP_ONLINE) { - lcs_shutdown_device(ccwgdev); - } - if (card->dev) - unregister_netdev(card->dev); - lcs_cleanup_card(card); - lcs_free_card(card); - dev_set_drvdata(&ccwgdev->dev, NULL); - put_device(&ccwgdev->dev); -} - -static struct ccw_device_id lcs_ids[] = { - {CCW_DEVICE(0x3088, 0x08), .driver_info = lcs_channel_type_parallel}, - {CCW_DEVICE(0x3088, 0x1f), .driver_info = lcs_channel_type_2216}, - {CCW_DEVICE(0x3088, 0x60), .driver_info = lcs_channel_type_osa2}, - {}, -}; -MODULE_DEVICE_TABLE(ccw, lcs_ids); - -static struct ccw_driver lcs_ccw_driver = { - .driver = { - .owner = THIS_MODULE, - .name = "lcs", - }, - .ids = lcs_ids, - .probe = ccwgroup_probe_ccwdev, - .remove = ccwgroup_remove_ccwdev, - .int_class = IRQIO_LCS, -}; - -/* - * LCS ccwgroup driver registration - */ -static struct ccwgroup_driver lcs_group_driver = { - .driver = { - .owner = THIS_MODULE, - .name = "lcs", - }, - .ccw_driver = &lcs_ccw_driver, - .setup = lcs_probe_device, - .remove = lcs_remove_device, - .set_online = lcs_new_device, - .set_offline = lcs_shutdown_device, -}; - -static ssize_t group_store(struct device_driver *ddrv, const char *buf, - size_t count) -{ - int err; - err = ccwgroup_create_dev(lcs_root_dev, &lcs_group_driver, 2, buf); - return err ? 
err : count; -} -static DRIVER_ATTR_WO(group); - -static struct attribute *lcs_drv_attrs[] = { - &driver_attr_group.attr, - NULL, -}; -static struct attribute_group lcs_drv_attr_group = { - .attrs = lcs_drv_attrs, -}; -static const struct attribute_group *lcs_drv_attr_groups[] = { - &lcs_drv_attr_group, - NULL, -}; - -/* - * LCS Module/Kernel initialization function - */ -static int -__init lcs_init_module(void) -{ - int rc; - - pr_info("Loading %s\n", version); - rc = lcs_register_debug_facility(); - LCS_DBF_TEXT(0, setup, "lcsinit"); - if (rc) - goto out_err; - lcs_root_dev = root_device_register("lcs"); - rc = PTR_ERR_OR_ZERO(lcs_root_dev); - if (rc) - goto register_err; - rc = ccw_driver_register(&lcs_ccw_driver); - if (rc) - goto ccw_err; - lcs_group_driver.driver.groups = lcs_drv_attr_groups; - rc = ccwgroup_driver_register(&lcs_group_driver); - if (rc) - goto ccwgroup_err; - return 0; - -ccwgroup_err: - ccw_driver_unregister(&lcs_ccw_driver); -ccw_err: - root_device_unregister(lcs_root_dev); -register_err: - lcs_unregister_debug_facility(); -out_err: - pr_err("Initializing the lcs device driver failed\n"); - return rc; -} - - -/* - * LCS module cleanup function - */ -static void -__exit lcs_cleanup_module(void) -{ - pr_info("Terminating lcs module.\n"); - LCS_DBF_TEXT(0, trace, "cleanup"); - ccwgroup_driver_unregister(&lcs_group_driver); - ccw_driver_unregister(&lcs_ccw_driver); - root_device_unregister(lcs_root_dev); - lcs_unregister_debug_facility(); -} - -module_init(lcs_init_module); -module_exit(lcs_cleanup_module); - -MODULE_AUTHOR("Frank Pavlic <fpavlic@de.ibm.com>"); -MODULE_DESCRIPTION("S/390 LAN channel station device driver"); -MODULE_LICENSE("GPL"); - diff --git a/drivers/s390/net/lcs.h b/drivers/s390/net/lcs.h deleted file mode 100644 index a2699b70b050..000000000000 --- a/drivers/s390/net/lcs.h +++ /dev/null @@ -1,342 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/*lcs.h*/ - -#include <linux/interrupt.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/refcount.h> -#include <asm/ccwdev.h> - -#define LCS_DBF_TEXT(level, name, text) \ - do { \ - debug_text_event(lcs_dbf_##name, level, text); \ - } while (0) - -#define LCS_DBF_HEX(level,name,addr,len) \ -do { \ - debug_event(lcs_dbf_##name,level,(void*)(addr),len); \ -} while (0) - -#define LCS_DBF_TEXT_(level,name,text...) \ - do { \ - if (debug_level_enabled(lcs_dbf_##name, level)) { \ - scnprintf(debug_buffer, sizeof(debug_buffer), text); \ - debug_text_event(lcs_dbf_##name, level, debug_buffer); \ - } \ - } while (0) - -/** - * sysfs related stuff - */ -#define CARD_FROM_DEV(cdev) \ - (struct lcs_card *) dev_get_drvdata( \ - &((struct ccwgroup_device *)dev_get_drvdata(&cdev->dev))->dev); - -/** - * Enum for classifying detected devices. 
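 * The values serve as indices into the lcs_type[] name array and are
 * stored in the driver_info field of the lcs_ids device table.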
- */ -enum lcs_channel_types { - /* Device is not a channel */ - lcs_channel_type_none, - - /* Device is a 2216 channel */ - lcs_channel_type_parallel, - - /* Device is a 2216 channel */ - lcs_channel_type_2216, - - /* Device is a OSA2 card */ - lcs_channel_type_osa2 -}; - -/** - * CCW commands used in this driver - */ -#define LCS_CCW_WRITE 0x01 -#define LCS_CCW_READ 0x02 -#define LCS_CCW_TRANSFER 0x08 - -/** - * LCS device status primitives - */ -#define LCS_CMD_STARTLAN 0x01 -#define LCS_CMD_STOPLAN 0x02 -#define LCS_CMD_LANSTAT 0x04 -#define LCS_CMD_STARTUP 0x07 -#define LCS_CMD_SHUTDOWN 0x08 -#define LCS_CMD_QIPASSIST 0xb2 -#define LCS_CMD_SETIPM 0xb4 -#define LCS_CMD_DELIPM 0xb5 - -#define LCS_INITIATOR_TCPIP 0x00 -#define LCS_INITIATOR_LGW 0x01 -#define LCS_STD_CMD_SIZE 16 -#define LCS_MULTICAST_CMD_SIZE 404 - -/** - * LCS IPASSIST MASKS,only used when multicast is switched on - */ -/* Not supported by LCS */ -#define LCS_IPASS_ARP_PROCESSING 0x0001 -#define LCS_IPASS_IN_CHECKSUM_SUPPORT 0x0002 -#define LCS_IPASS_OUT_CHECKSUM_SUPPORT 0x0004 -#define LCS_IPASS_IP_FRAG_REASSEMBLY 0x0008 -#define LCS_IPASS_IP_FILTERING 0x0010 -/* Supported by lcs 3172 */ -#define LCS_IPASS_IPV6_SUPPORT 0x0020 -#define LCS_IPASS_MULTICAST_SUPPORT 0x0040 - -/** - * LCS sense byte definitions - */ -#define LCS_SENSE_BYTE_0 0 -#define LCS_SENSE_BYTE_1 1 -#define LCS_SENSE_BYTE_2 2 -#define LCS_SENSE_BYTE_3 3 -#define LCS_SENSE_INTERFACE_DISCONNECT 0x01 -#define LCS_SENSE_EQUIPMENT_CHECK 0x10 -#define LCS_SENSE_BUS_OUT_CHECK 0x20 -#define LCS_SENSE_INTERVENTION_REQUIRED 0x40 -#define LCS_SENSE_CMD_REJECT 0x80 -#define LCS_SENSE_RESETTING_EVENT 0x80 -#define LCS_SENSE_DEVICE_ONLINE 0x20 - -/** - * LCS packet type definitions - */ -#define LCS_FRAME_TYPE_CONTROL 0 -#define LCS_FRAME_TYPE_ENET 1 -#define LCS_FRAME_TYPE_TR 2 -#define LCS_FRAME_TYPE_FDDI 7 -#define LCS_FRAME_TYPE_AUTO -1 - -/** - * some more definitions,we will sort them later - */ -#define LCS_ILLEGAL_OFFSET 0xffff -#define LCS_IOBUFFERSIZE 0x5000 -#define LCS_NUM_BUFFS 32 /* needs to be power of 2 */ -#define LCS_MAC_LENGTH 6 -#define LCS_INVALID_PORT_NO -1 -#define LCS_LANCMD_TIMEOUT_DEFAULT 5 - -/** - * Multicast state - */ -#define LCS_IPM_STATE_SET_REQUIRED 0 -#define LCS_IPM_STATE_DEL_REQUIRED 1 -#define LCS_IPM_STATE_ON_CARD 2 - -/** - * LCS IP Assist declarations - * seems to be only used for multicast - */ -#define LCS_IPASS_ARP_PROCESSING 0x0001 -#define LCS_IPASS_INBOUND_CSUM_SUPP 0x0002 -#define LCS_IPASS_OUTBOUND_CSUM_SUPP 0x0004 -#define LCS_IPASS_IP_FRAG_REASSEMBLY 0x0008 -#define LCS_IPASS_IP_FILTERING 0x0010 -#define LCS_IPASS_IPV6_SUPPORT 0x0020 -#define LCS_IPASS_MULTICAST_SUPPORT 0x0040 - -/** - * LCS Buffer states - */ -enum lcs_buffer_states { - LCS_BUF_STATE_EMPTY, /* buffer is empty */ - LCS_BUF_STATE_LOCKED, /* buffer is locked, don't touch */ - LCS_BUF_STATE_READY, /* buffer is ready for read/write */ - LCS_BUF_STATE_PROCESSED, -}; - -/** - * LCS Channel State Machine declarations - */ -enum lcs_channel_states { - LCS_CH_STATE_INIT, - LCS_CH_STATE_HALTED, - LCS_CH_STATE_STOPPED, - LCS_CH_STATE_RUNNING, - LCS_CH_STATE_SUSPENDED, - LCS_CH_STATE_CLEARED, - LCS_CH_STATE_ERROR, -}; - -/** - * LCS device state machine - */ -enum lcs_dev_states { - DEV_STATE_DOWN, - DEV_STATE_UP, - DEV_STATE_RECOVER, -}; - -enum lcs_threads { - LCS_SET_MC_THREAD = 1, - LCS_RECOVERY_THREAD = 2, -}; - -/** - * LCS struct declarations - */ -struct lcs_header { - __u16 offset; - __u8 type; - __u8 slot; -} __attribute__ ((packed)); - -struct 
lcs_ip_mac_pair { - __be32 ip_addr; - __u8 mac_addr[LCS_MAC_LENGTH]; - __u8 reserved[2]; -} __attribute__ ((packed)); - -struct lcs_ipm_list { - struct list_head list; - struct lcs_ip_mac_pair ipm; - __u8 ipm_state; -}; - -struct lcs_cmd { - __u16 offset; - __u8 type; - __u8 slot; - __u8 cmd_code; - __u8 initiator; - __u16 sequence_no; - __u16 return_code; - union { - struct { - __u8 lan_type; - __u8 portno; - __u16 parameter_count; - __u8 operator_flags[3]; - __u8 reserved[3]; - } lcs_std_cmd; - struct { - __u16 unused1; - __u16 buff_size; - __u8 unused2[6]; - } lcs_startup; - struct { - __u8 lan_type; - __u8 portno; - __u8 unused[10]; - __u8 mac_addr[LCS_MAC_LENGTH]; - __u32 num_packets_deblocked; - __u32 num_packets_blocked; - __u32 num_packets_tx_on_lan; - __u32 num_tx_errors_detected; - __u32 num_tx_packets_disgarded; - __u32 num_packets_rx_from_lan; - __u32 num_rx_errors_detected; - __u32 num_rx_discarded_nobuffs_avail; - __u32 num_rx_packets_too_large; - } lcs_lanstat_cmd; -#ifdef CONFIG_IP_MULTICAST - struct { - __u8 lan_type; - __u8 portno; - __u16 num_ip_pairs; - __u16 ip_assists_supported; - __u16 ip_assists_enabled; - __u16 version; - struct { - struct lcs_ip_mac_pair - ip_mac_pair[32]; - __u32 response_data; - } lcs_ipass_ctlmsg __attribute ((packed)); - } lcs_qipassist __attribute__ ((packed)); -#endif /*CONFIG_IP_MULTICAST */ - } cmd __attribute__ ((packed)); -} __attribute__ ((packed)); - -/** - * Forward declarations. - */ -struct lcs_card; -struct lcs_channel; - -/** - * Definition of an lcs buffer. - */ -struct lcs_buffer { - enum lcs_buffer_states state; - void *data; - int count; - /* Callback for completion notification. */ - void (*callback)(struct lcs_channel *, struct lcs_buffer *); -}; - -struct lcs_reply { - struct list_head list; - __u16 sequence_no; - refcount_t refcnt; - /* Callback for completion notification. 
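	   Supplied by the caller of lcs_send_lancmd(), e.g.
	   __lcs_check_multicast_cb() for the query-ipassist command.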
*/ - void (*callback)(struct lcs_card *, struct lcs_cmd *); - wait_queue_head_t wait_q; - struct lcs_card *card; - struct timer_list timer; - int received; - int rc; -}; - -/** - * Definition of an lcs channel - */ -struct lcs_channel { - enum lcs_channel_states state; - struct ccw_device *ccwdev; - struct ccw1 ccws[LCS_NUM_BUFFS + 1]; - wait_queue_head_t wait_q; - struct tasklet_struct irq_tasklet; - struct lcs_buffer iob[LCS_NUM_BUFFS]; - int io_idx; - int buf_idx; -}; - - -/** - * definition of the lcs card - */ -struct lcs_card { - spinlock_t lock; - spinlock_t ipm_lock; - enum lcs_dev_states state; - struct net_device *dev; - struct net_device_stats stats; - __be16 (*lan_type_trans)(struct sk_buff *skb, - struct net_device *dev); - struct ccwgroup_device *gdev; - struct lcs_channel read; - struct lcs_channel write; - struct lcs_buffer *tx_buffer; - int tx_emitted; - struct list_head lancmd_waiters; - int lancmd_timeout; - - struct work_struct kernel_thread_starter; - spinlock_t mask_lock; - unsigned long thread_start_mask; - unsigned long thread_running_mask; - unsigned long thread_allowed_mask; - wait_queue_head_t wait_q; - -#ifdef CONFIG_IP_MULTICAST - struct list_head ipm_list; -#endif - __u8 mac[LCS_MAC_LENGTH]; - __u16 ip_assists_supported; - __u16 ip_assists_enabled; - __s8 lan_type; - __u32 pkt_seq; - __u16 sequence_no; - __s16 portno; - /* Some info copied from probeinfo */ - u8 device_forced; - u8 max_port_no; - u8 hint_port_no; - s16 port_protocol_no; -} __attribute__ ((aligned(8))); - diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 72aac84dca93..35fc241c4672 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,6 +40,8 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; +struct io_zcrx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -384,6 +386,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; + struct io_zcrx_ifq *ifq; + u32 pers_next; struct xarray personalities; @@ -436,6 +440,8 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; + /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ + struct io_mapped_region zcrx_region; }; /* diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 27f42f713c89..87edb7a8173b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1415,7 +1415,6 @@ int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, bool *vlan_offload_disabled); void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, struct _rule_hw *eth_header); -int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af86097641b0..46bd7550adf8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -54,7 +54,6 @@ #include <linux/mlx5/doorbell.h> #include <linux/mlx5/eq.h> #include <linux/timecounter.h> -#include <linux/ptp_clock_kernel.h> #include <net/devlink.h> #define MLX5_ADEV_NAME "mlx5_core" @@ -679,33 +678,8 @@ struct mlx5_rsvd_gids { struct ida ida; }; -#define MAX_PIN_NUM 8 
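The mlx5 hunk here hides the clock state behind an opaque pointer so
that linux/mlx5/driver.h no longer pulls in ptp_clock_kernel.h. A
minimal sketch of the pattern, with hypothetical names except for
struct mlx5_clock:

	/* in the public header: forward declaration only */
	struct mlx5_clock;

	struct mlx5_core_dev_sketch {
		struct mlx5_clock *clock;	/* defined privately in clock.c */
	};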
-struct mlx5_pps { - u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; - u64 start[MAX_PIN_NUM]; - u8 enabled; - u64 min_npps_period; - u64 min_out_pulse_duration_ns; -}; - -struct mlx5_timer { - struct cyclecounter cycles; - struct timecounter tc; - u32 nominal_c_mult; - unsigned long overflow_period; -}; - -struct mlx5_clock { - struct mlx5_nb pps_nb; - seqlock_t lock; - struct hwtstamp_config hwtstamp_config; - struct ptp_clock *ptp; - struct ptp_clock_info ptp_info; - struct mlx5_pps pps_info; - struct mlx5_timer timer; -}; - +struct mlx5_clock; +struct mlx5_clock_dev_state; struct mlx5_dm; struct mlx5_fw_tracer; struct mlx5_vxlan; @@ -789,7 +763,8 @@ struct mlx5_core_dev { #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; #endif - struct mlx5_clock clock; + struct mlx5_clock *clock; + struct mlx5_clock_dev_state *clock_state; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; struct mlx5_rsc_dump *rsc_dump; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e68d42b8ce65..fd625e0dd869 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -115,9 +115,12 @@ enum mlx5e_ext_link_mode { MLX5E_100GAUI_1_100GBASE_CR_KR = 11, MLX5E_200GAUI_4_200GBASE_CR4_KR4 = 12, MLX5E_200GAUI_2_200GBASE_CR2_KR2 = 13, + MLX5E_200GAUI_1_200GBASE_CR1_KR1 = 14, MLX5E_400GAUI_8_400GBASE_CR8 = 15, MLX5E_400GAUI_4_400GBASE_CR4_KR4 = 16, + MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, + MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, MLX5E_EXT_LINK_MODES_NUMBER, }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ab550a89b9bf..9a387d456592 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -658,6 +658,7 @@ struct netdev_queue { struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; + const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4bc2ee0b10b0..ccaaf4c7d5f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -43,6 +43,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_interruptible(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 596836abf7bf..af40842f229d 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -16,6 +16,7 @@ struct netdev_rx_queue { struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; diff --git a/include/net/netmem.h b/include/net/netmem.h index 1b58faa4f20f..c61d5b21e7b4 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -24,11 +24,20 @@ struct net_iov { unsigned long __unused_padding; unsigned long pp_magic; struct page_pool *pp; - struct dmabuf_genpool_chunk_owner *owner; + struct net_iov_area *owner; unsigned long dma_addr; atomic_long_t pp_ref_count; }; +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. 
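	   For the io_uring zcrx provider the area is pinned user memory
	   rather than a dma-buf, but the owner bookkeeping is shared:
	   io_zcrx_area embeds a net_iov_area (see io_uring/zcrx.c below).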
*/ + unsigned long base_virtual; +}; + /* These fields in struct page are used by the page_pool and net stack: * * struct { @@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + /* netmem */ /** diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..b3e665897767 --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include <net/netmem.h> +#include <net/page_pool/types.h> + +struct netdev_rx_queue; +struct sk_buff; + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); +}; + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. 
+ */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7f405672b089..36eb57d73abc 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -152,8 +152,11 @@ struct page_pool_stats { */ #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) +struct memory_provider_ops; + struct pp_memory_provider_params { void *mp_priv; + const struct memory_provider_ops *mp_ops; }; struct page_pool { @@ -216,6 +219,7 @@ struct page_pool { struct ptr_ring ring; void *mp_priv; + const struct memory_provider_ops *mp_ops; #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9b18c4cfe56f..2feba0929a8a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2059,6 +2059,24 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, ETHTOOL_LINK_MODE_10baseT1BRR_Full_BIT = 102, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT = 103, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT = 104, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT = 105, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT = 106, + ETHTOOL_LINK_MODE_200000baseSR_Full_BIT = 107, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT = 108, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT = 109, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT = 110, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT = 111, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT = 112, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT = 113, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT = 114, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT = 115, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT = 116, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT = 117, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT = 118, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT = 119, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT = 120, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 050fa8eb2e8f..1e02e94bc26d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -87,6 +87,7 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + __u32 zcrx_ifq_idx; __u32 optlen; struct { __u16 addr_len; @@ -278,6 +279,7 @@ enum io_uring_op { IORING_OP_FTRUNCATE, IORING_OP_BIND, IORING_OP_LISTEN, + IORING_OP_RECV_ZC, /* this goes last, obviously */ IORING_OP_LAST, @@ -639,7 +641,8 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, - /* 32 reserved for zc rx */ + /* register a netdev hw rx queue for zerocopy */ + IORING_REGISTER_ZCRX_IFQ = 32, /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, @@ -956,6 +959,55 @@ enum io_uring_socket_op { SOCKET_URING_OP_SETSOCKOPT, }; +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 
rq_area_token; + __u32 flags; + __u32 __resv1; + __u64 __resv2[2]; +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u64 __resv[4]; +}; + #ifdef __cplusplus } #endif diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -87,6 +87,11 @@ enum { }; enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + +enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, NETDEV_A_PAGE_POOL_NAPI_ID, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/io_uring/Kconfig b/io_uring/Kconfig new file mode 100644 index 000000000000..9e2a4beba1ef --- /dev/null +++ b/io_uring/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL diff --git a/io_uring/Makefile b/io_uring/Makefile index d695b60dba4f..98e48339d84d 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o +obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58003fa6b327..fa342be39158 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -2729,6 +2730,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); @@ -2888,6 +2890,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (ctx->ifq) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zcrx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d5c9b7a6911d..daf0e3b740ee 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -190,6 +190,16 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret return io_get_cqe_overflow(ctx, ret, false); } +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->cq_extra++; + ctx->submit_state.cq_flush = true; + return 
io_get_cqe(ctx, cqe_ret); +} + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 361134544427..76fcc79656b0 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -271,6 +271,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, return io_pbuf_get_region(ctx, bgid); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; + case IORING_MAP_OFF_ZCRX_REGION: + return &ctx->zcrx_region; } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index c898dcba2b4e..dad0aa5b1b45 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -2,6 +2,7 @@ #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL +#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/net.c b/io_uring/net.c index 6d13d378358b..905d1ee01201 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -16,6 +16,7 @@ #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -88,6 +89,14 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + unsigned msg_flags; + u16 flags; + u32 len; + struct io_zcrx_ifq *ifq; +}; + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -1200,6 +1209,81 @@ out_free: return ret; } +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || + sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + if (ifq_idx != 0) + return -EINVAL; + zc->ifq = req->ctx->ifq; + if (!zc->ifq) + return -EINVAL; + zc->len = READ_ONCE(sqe->len); + zc->flags = READ_ONCE(sqe->ioprio); + zc->msg_flags = READ_ONCE(sqe->msg_flags); + if (zc->msg_flags) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data completions are posted as aux CQEs. 
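	   Each such CQE carries a struct io_uring_zcrx_cqe in its
	   extended payload; the bits of its off field from
	   IORING_ZCRX_AREA_SHIFT upwards hold the area id. A rough
	   userspace decode, where area_base is the application's mapping
	   of the registered area:

		data_off = zcqe->off & ~IORING_ZCRX_AREA_MASK;
		data     = area_base + data_off;

	   The buffer is handed back by writing rq_area_token | data_off
	   into a refill ring entry's off field.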
*/ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + unsigned int len; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + len = zc->len; + ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, + issue_flags, &zc->len); + if (len && zc->len == 0) { + io_req_set_res(req, 0, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret == IOU_REQUEUE) + return IOU_REQUEUE; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; +} + void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 306fd9c48b44..9511262c513e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -37,6 +37,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -516,6 +517,18 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -745,6 +758,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/register.c b/io_uring/register.c index 9a4d2fbce4ae..cc23a4c205cd 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -30,6 +30,7 @@ #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx_ifq(ctx, arg); + break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5dd1e0827559..28783f1dde00 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,7 +80,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f10a1252b3e9..284e300e63fb 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -76,6 +76,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int 
io_buffer_validate(struct iovec *iov); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 100644 index 000000000000..9c95b5b6ec4e --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/dma-map-ops.h> +#include <linux/mm.h> +#include <linux/nospec.h> +#include <linux/io_uring.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff_ref.h> + +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/netlink.h> +#include <net/netdev_rx_queue.h> +#include <net/tcp.h> +#include <net/rps.h> + +#include <trace/events/page_pool.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" +#include "rsrc.h" + +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + for (i = 0; i < nr_mapped; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + dma_addr_t dma; + + dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + net_mp_niov_set_dma_addr(niov, 0); + } +} + +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + if (area->is_mapped) + __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + int i; + + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + dma_addr_t dma; + + dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (dma_mapping_error(ifq->dev, dma)) + break; + if (net_mp_niov_set_dma_addr(niov, dma)) { + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + break; + } + } + + if (i != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, i); + return -EINVAL; + } + + area->is_mapped = true; + return 0; +} + +static void io_zcrx_sync_for_device(const struct page_pool *pool, + struct net_iov *niov) +{ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr; + + if (!dma_dev_need_sync(pool->p.dev)) + return; + + dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + PAGE_SIZE, pool->p.dma_dir); +#endif +} + +#define IO_RQ_MAX_ENTRIES 32768 + +#define IO_SKBS_PER_CALL_LIMIT 20 + +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; + unsigned nr_skbs; +}; + +static const struct memory_provider_ops io_uring_pp_zc_ops; + +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline atomic_t *io_get_user_counter(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return &area->user_refs[net_iov_idx(niov)]; +} + +static bool io_zcrx_put_niov_uref(struct net_iov *niov) +{ + atomic_t *uref = io_get_user_counter(niov); + + if (unlikely(!atomic_read(uref))) + return false; + atomic_dec(uref); + return true; +} + +static void io_zcrx_get_niov_uref(struct 
net_iov *niov) +{ + atomic_inc(io_get_user_counter(niov)); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->pages[net_iov_idx(niov)]; +} + +static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd) +{ + size_t off, size; + void *ptr; + int ret; + + off = sizeof(struct io_uring); + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, + IORING_MAP_OFF_ZCRX_REGION); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + return 0; +} + +static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + ifq->rq_ring = NULL; + ifq->rqes = NULL; +} + +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + io_zcrx_unmap_area(area->ifq, area); + + kvfree(area->freelist); + kvfree(area->nia.niovs); + kvfree(area->user_refs); + if (area->pages) { + unpin_user_pages(area->pages, area->nia.num_niovs); + kvfree(area->pages); + } + kfree(area); +} + +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + int i, ret, nr_pages; + struct iovec iov; + + if (area_reg->flags || area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + iov.iov_base = u64_to_user_ptr(area_reg->addr); + iov.iov_len = area_reg->len; + ret = io_buffer_validate(&iov); + if (ret) + return ret; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(area->pages)) { + ret = PTR_ERR(area->pages); + area->pages = NULL; + goto err; + } + area->nia.num_niovs = nr_pages; + + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + for (i = 0; i < nr_pages; i++) + area->freelist[i] = i; + + area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->user_refs) + goto err; + + for (i = 0; i < nr_pages; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + + niov->owner = &area->nia; + area->freelist[i] = i; + atomic_set(&area->user_refs[i], 0); + } + + area->free_count = nr_pages; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + +static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + ifq->ctx = ctx; + spin_lock_init(&ifq->lock); + spin_lock_init(&ifq->rq_lock); + return ifq; +} + +static void io_zcrx_drop_netdev(struct 
io_zcrx_ifq *ifq) +{ + spin_lock(&ifq->lock); + if (ifq->netdev) { + netdev_put(ifq->netdev, &ifq->netdev_tracker); + ifq->netdev = NULL; + } + spin_unlock(&ifq->lock); +} + +static void io_close_queue(struct io_zcrx_ifq *ifq) +{ + struct net_device *netdev; + netdevice_tracker netdev_tracker; + struct pp_memory_provider_params p = { + .mp_ops = &io_uring_pp_zc_ops, + .mp_priv = ifq, + }; + + if (ifq->if_rxq == -1) + return; + + spin_lock(&ifq->lock); + netdev = ifq->netdev; + netdev_tracker = ifq->netdev_tracker; + ifq->netdev = NULL; + spin_unlock(&ifq->lock); + + if (netdev) { + net_mp_close_rxq(netdev, ifq->if_rxq, &p); + netdev_put(netdev, &netdev_tracker); + } + ifq->if_rxq = -1; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_close_queue(ifq); + io_zcrx_drop_netdev(ifq); + + if (ifq->area) + io_zcrx_free_area(ifq->area); + if (ifq->dev) + put_device(ifq->dev); + + io_free_rbuf_ring(ifq); + kfree(ifq); +} + +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct pp_memory_provider_params mp_param = {}; + struct io_uring_zcrx_area_reg area; + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + int ret; + + /* + * 1. Interface queue allocation. + * 2. It can observe data destined for sockets of other tasks. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (ctx->ifq) + return -EBUSY; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + ret = io_allocate_rbuf_ring(ifq, ®, &rd); + if (ret) + goto err; + + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + + ifq->rq_entries = reg.rq_entries; + + ret = -ENODEV; + ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, + &ifq->netdev_tracker, GFP_KERNEL); + if (!ifq->netdev) + goto err; + + ifq->dev = ifq->netdev->dev.parent; + ret = -EOPNOTSUPP; + if (!ifq->dev) + goto err; + get_device(ifq->dev); + + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + goto err; + + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); + if (ret) + goto err; + ifq->if_rxq = reg.if_rxq; + + reg.offsets.rqes = sizeof(struct io_uring); + reg.offsets.head = offsetof(struct io_uring, head); + reg.offsets.tail = offsetof(struct io_uring, tail); + + if (copy_to_user(arg, ®, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || + copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } + ctx->ifq = ifq; + return 0; +err: + io_zcrx_ifq_free(ifq); + return ret; +} + +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq = ctx->ifq; + + lockdep_assert_held(&ctx->uring_lock); + + if (!ifq) + 
return; + + ctx->ifq = NULL; + io_zcrx_ifq_free(ifq); +} + +static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; +} + +static void io_zcrx_return_niov_freelist(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + spin_lock_bh(&area->freelist_lock); + area->freelist[area->free_count++] = net_iov_idx(niov); + spin_unlock_bh(&area->freelist_lock); +} + +static void io_zcrx_return_niov(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + if (!niov->pp) { + /* copy fallback allocated niovs */ + io_zcrx_return_niov_freelist(niov); + return; + } + page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); +} + +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + int i; + + if (!area) + return; + + /* Reclaim back all buffers given to the user space. */ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int nr; + + if (!atomic_read(io_get_user_counter(niov))) + continue; + nr = atomic_xchg(io_get_user_counter(niov), 0); + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) + io_zcrx_return_niov(niov); + } +} + +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); + + if (!ctx->ifq) + return; + io_zcrx_scrub(ctx->ifq); + io_close_queue(ctx->ifq); +} + +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +{ + u32 entries; + + entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; + return min(entries, ifq->rq_entries); +} + +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, + unsigned mask) +{ + unsigned int idx = ifq->cached_rq_head++ & mask; + + return &ifq->rqes[idx]; +} + +static void io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq) +{ + unsigned int mask = ifq->rq_entries - 1; + unsigned int entries; + netmem_ref netmem; + + spin_lock_bh(&ifq->rq_lock); + + entries = io_zcrx_rqring_entries(ifq); + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); + if (unlikely(!entries)) { + spin_unlock_bh(&ifq->rq_lock); + return; + } + + do { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_zcrx_area *area; + struct net_iov *niov; + unsigned niov_idx, area_idx; + + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; + + if (unlikely(rqe->__pad || area_idx)) + continue; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + continue; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + niov = &area->nia.niovs[niov_idx]; + if (!io_zcrx_put_niov_uref(niov)) + continue; + + netmem = net_iov_to_netmem(niov); + if (page_pool_unref_netmem(netmem, 1) != 0) + continue; + + if (unlikely(niov->pp != pp)) { + io_zcrx_return_niov(niov); + continue; + } + + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } while (--entries); + + smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); + spin_unlock_bh(&ifq->rq_lock); +} + +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + + spin_lock_bh(&area->freelist_lock); + while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + struct net_iov *niov = 
__io_zcrx_get_free_niov(area); + netmem_ref netmem = net_iov_to_netmem(niov); + + net_mp_niov_set_page_pool(pp, niov); + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } + spin_unlock_bh(&area->freelist_lock); +} + +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + + /* pp should already be ensuring that */ + if (unlikely(pp->alloc.count)) + goto out_return; + + io_zcrx_ring_refill(pp, ifq); + if (likely(pp->alloc.count)) + goto out_return; + + io_zcrx_refill_slow(pp, ifq); + if (!pp->alloc.count) + return 0; +out_return: + return pp->alloc.cache[--pp->alloc.count]; +} + +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) +{ + struct net_iov *niov; + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + niov = netmem_to_net_iov(netmem); + net_mp_niov_clear_page_pool(niov); + io_zcrx_return_niov_freelist(niov); + return false; +} + +static int io_pp_zc_init(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + + if (WARN_ON_ONCE(!ifq)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) + return -EINVAL; + if (WARN_ON_ONCE(!pp->dma_map)) + return -EOPNOTSUPP; + if (pp->p.order != 0) + return -EOPNOTSUPP; + if (pp->p.dma_dir != DMA_FROM_DEVICE) + return -EOPNOTSUPP; + + percpu_ref_get(&ifq->ctx->refs); + return 0; +} + +static void io_pp_zc_destroy(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_area *area = ifq->area; + + if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) + return; + percpu_ref_put(&ifq->ctx->refs); +} + +static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + struct nlattr *nest; + int type; + + type = rxq ? 
NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; + nest = nla_nest_start(rsp, type); + if (!nest) + return -EMSGSIZE; + nla_nest_end(rsp, nest); + + return 0; +} + +static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) +{ + struct pp_memory_provider_params *p = &rxq->mp_params; + struct io_zcrx_ifq *ifq = mp_priv; + + io_zcrx_drop_netdev(ifq); + p->mp_ops = NULL; + p->mp_priv = NULL; +} + +static const struct memory_provider_ops io_uring_pp_zc_ops = { + .alloc_netmems = io_pp_zc_alloc_netmems, + .release_netmem = io_pp_zc_release_netmem, + .init = io_pp_zc_init, + .destroy = io_pp_zc_destroy, + .nl_fill = io_pp_nl_fill, + .uninstall = io_pp_uninstall, +}; + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) +{ + struct net_iov *niov = NULL; + + spin_lock_bh(&area->freelist_lock); + if (area->free_count) + niov = __io_zcrx_get_free_niov(area); + spin_unlock_bh(&area->freelist_lock); + + if (niov) + page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); + return niov; +} + +static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + void *src_base, struct page *src_page, + unsigned int src_offset, size_t len) +{ + struct io_zcrx_area *area = ifq->area; + size_t copied = 0; + int ret = 0; + + while (len) { + size_t copy_size = min_t(size_t, PAGE_SIZE, len); + const int dst_off = 0; + struct net_iov *niov; + struct page *dst_page; + void *dst_addr; + + niov = io_zcrx_alloc_fallback(area); + if (!niov) { + ret = -ENOMEM; + break; + } + + dst_page = io_zcrx_iov_page(niov); + dst_addr = kmap_local_page(dst_page); + if (src_page) + src_base = kmap_local_page(src_page); + + memcpy(dst_addr, src_base + src_offset, copy_size); + + if (src_page) + kunmap_local(src_base); + kunmap_local(dst_addr); + + if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { + io_zcrx_return_niov(niov); + ret = -ENOSPC; + break; + } + + io_zcrx_get_niov_uref(niov); + src_offset += copy_size; + len -= copy_size; + copied += copy_size; + } + + return copied ? copied : ret; +} + +static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct page *page = skb_frag_page(frag); + u32 p_off, p_len, t, copied = 0; + int ret = 0; + + off += skb_frag_off(frag); + + skb_frag_foreach_page(frag, off, len, + page, p_off, p_len, t) { + ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len); + if (ret < 0) + return copied ? 
copied : ret; + copied += ret; + } + return copied; +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + + if (unlikely(!skb_frag_is_net_iov(frag))) + return io_zcrx_copy_frag(req, ifq, frag, off, len); + + niov = netmem_to_net_iov(frag->netmem); + if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + niov->pp->mp_priv != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) + return -ENOSPC; + + /* + * Prevent it from being recycled while user is accessing it. + * It has to be done before grabbing a user reference. + */ + page_pool_ref_netmem(net_iov_to_netmem(niov)); + io_zcrx_get_niov_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off = offset; + int i, copy, end, off; + int ret = 0; + + len = min_t(size_t, len, desc->count); + if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) + return -EAGAIN; + + if (unlikely(offset < skb_headlen(skb))) { + ssize_t copied; + size_t to_copy; + + to_copy = min_t(size_t, skb_headlen(skb) - offset, len); + copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL, + offset, to_copy); + if (copied < 0) { + ret = copied; + goto out; + } + offset += copied; + len -= copied; + if (!len) + goto out; + if (offset != skb_headlen(skb)) + goto out; + } + + start = skb_headlen(skb); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + desc->count -= (offset - start_off); + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned issue_flags, unsigned int *outlen) +{ + unsigned int len = *outlen; + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = len ? 
len : UINT_MAX, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (len && ret > 0) + *outlen = len - ret; + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && + (issue_flags & IO_URING_F_MULTISHOT)) { + ret = IOU_REQUEUE; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. */ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..706cc7300780 --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include <linux/io_uring_types.h> +#include <linux/socket.h> +#include <net/page_pool/types.h> +#include <net/net_trackers.h> + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + atomic_t *user_refs; + + bool is_mapped; + u16 area_id; + struct page **pages; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; +}; + +struct io_zcrx_ifq { + struct io_ring_ctx *ctx; + struct io_zcrx_area *area; + + struct io_uring *rq_ring; + struct io_uring_zcrx_rqe *rqes; + u32 rq_entries; + u32 cached_rq_head; + spinlock_t rq_lock; + + u32 if_rxq; + struct device *dev; + struct net_device *netdev; + netdevice_tracker netdev_tracker; + spinlock_t lock; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len); +#else +static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + return -EOPNOTSUPP; +} +#endif + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + +#endif diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1a52a0bca086..7e1ad229e133 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1040,7 +1040,7 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, /* host join */ if (!port) { - if (mp->host_joined) { + if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { 
NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } diff --git a/net/core/dev.c b/net/core/dev.c index 30da277c5a6f..4349aeef0a4a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -159,6 +159,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/rps.h> #include <linux/phy_link_topology.h> @@ -6184,16 +6185,18 @@ EXPORT_SYMBOL(netif_receive_skb_list); static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } @@ -6201,14 +6204,16 @@ static void flush_backlog(struct work_struct *work) local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -7153,6 +7158,9 @@ void __netif_napi_del_locked(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; + /* Make sure NAPI is disabled (or was never enabled). 
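+	 * Both napi_disable() and the initial netif_napi_add() leave
+	 * NAPI_STATE_SCHED set, while an enabled NAPI only holds the bit
+	 * while it is actually scheduled, so deleting a NAPI that was
+	 * enabled but never disabled trips the warning below.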
*/ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + if (napi->config) { napi->index = -1; napi->config = NULL; @@ -11820,6 +11828,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11874,7 +11895,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); + dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index 3bba3f018df0..7c6e0b5b6acb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <trace/events/page_pool.h> #include "devmem.h" @@ -27,20 +28,28 @@ /* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->pp->mp_ops == &dmabuf_devmem_ops; +} + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) { struct dmabuf_genpool_chunk_owner *owner = chunk->owner; - kvfree(owner->niovs); + kvfree(owner->area.niovs); kfree(owner); } static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct dmabuf_genpool_chunk_owner *owner; + owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } @@ -83,7 +92,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) offset = dma_addr - owner->base_dma_addr; index = offset / PAGE_SIZE; - niov = &owner->niovs[index]; + niov = &owner->area.niovs[index]; niov->pp_magic = 0; niov->pp = NULL; @@ -94,7 +103,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) void net_devmem_free_dmabuf(struct net_iov *niov) { - struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov); if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, @@ -117,6 +126,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(rxq->mp_params.mp_priv != binding); rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; rxq_idx = get_netdev_rx_queue_index(rxq); @@ -152,7 +162,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { + if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); return -EEXIST; } @@ -170,6 +180,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return err; rxq->mp_params.mp_priv = binding; + 
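/* The queue counts as provider-bound once ->mp_ops is set (this is what
+ * the -EEXIST check above now tests), so assign it before the queue
+ * restart below creates a page pool against this provider.
+ */
+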
rxq->mp_params.mp_ops = &dmabuf_devmem_ops; err = netdev_rx_queue_restart(dev, rxq_idx); if (err) @@ -179,6 +190,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, err_xa_erase: rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; xa_erase(&binding->bound_rxqs, xa_idx); return err; @@ -261,9 +273,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->base_virtual = virtual; + owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len / PAGE_SIZE; owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -275,17 +287,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->niovs = kvmalloc_array(owner->num_niovs, - sizeof(*owner->niovs), - GFP_KERNEL); - if (!owner->niovs) { + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; } - for (i = 0; i < owner->num_niovs; i++) { - niov = &owner->niovs[i]; - niov->owner = owner; + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); } @@ -313,26 +325,6 @@ err_put_dmabuf: return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) -{ - struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; - - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } - } -} - /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -398,3 +390,36 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. */ return false; } + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482..7fc158d52729 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -10,6 +10,8 @@ #ifndef _NET_DEVMEM_H #define _NET_DEVMEM_H +#include <net/netmem.h> + struct netlink_ext_ack; struct net_devmem_dmabuf_binding { @@ -51,17 +53,11 @@ struct net_devmem_dmabuf_binding { * allocations from this chunk. 
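 *
 * The provider-agnostic part now lives in the embedded struct net_iov_area,
 * so shared helpers such as net_iov_owner() and net_iov_idx() keep working,
 * while devmem-only code recovers the owner via container_of(), e.g. (see
 * net_devmem_iov_to_chunk_owner() below):
 *
 *	owner = container_of(net_iov_owner(niov),
 *			     struct dmabuf_genpool_chunk_owner, area);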
*/ struct dmabuf_genpool_chunk_owner { - /* Offset into the dma-buf where this chunk starts. */ - unsigned long base_virtual; + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; /* dma_addr of the start of the chunk. */ dma_addr_t base_dma_addr; - - /* Array of net_iovs for this chunk. */ - struct net_iov *niovs; - size_t num_niovs; - - struct net_devmem_dmabuf_binding *binding; }; void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); @@ -72,38 +68,34 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * -net_iov_owner(const struct net_iov *niov) +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) { - return niov->owner; + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); } -static inline unsigned int net_iov_idx(const struct net_iov *niov) +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) { - return niov - net_iov_owner(niov)->niovs; + return net_devmem_iov_to_chunk_owner(niov)->binding; } -static inline struct net_devmem_dmabuf_binding * -net_iov_binding(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { - return net_iov_owner(niov)->binding; + return net_devmem_iov_binding(niov)->id; } static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct net_iov_area *owner = net_iov_owner(niov); return owner->base_virtual + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline u32 net_iov_binding_id(const struct net_iov *niov) -{ - return net_iov_owner(niov)->binding->id; -} - static inline void net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { @@ -123,6 +115,8 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_iov(struct net_iov *niov); + #else struct net_devmem_dmabuf_binding; @@ -152,10 +146,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -171,10 +161,15 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) return 0; } -static inline u32 net_iov_binding_id(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bd0251bd74a1..d8dd686b5287 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -832,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -1713,11 +1711,6 @@ void neigh_parms_release(struct neigh_table 
*tbl, struct neigh_parms *parms) } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 07cb99b114bd..3fe2c521e574 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -42,6 +42,87 @@ static inline int dev_isalive(const struct net_device *dev) return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follows: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performance issues, as syscalls were restarted in + * loops whenever there was contention on the rtnl lock, with huge slowdowns in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference on the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check that the device dismantle hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started and that it cannot + * start before we exit the locking section, as we hold the rtnl lock. 
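+	 * A zero return therefore means "rtnl held, device alive"; the
+	 * _show/_store handlers below all follow the pattern (sketch):
+	 *
+	 *	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+	 *	if (ret)
+	 *		return ret;
+	 *	...
+	 *	rtnl_unlock();
+	 *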
+ * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; +} + /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, @@ -95,14 +176,14 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; - if (dev_isalive(netdev)) { - ret = (*set)(netdev, new); - if (ret == 0) - ret = len; - } rtnl_unlock(); err: return ret; @@ -220,7 +301,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -234,8 +315,9 @@ static ssize_t carrier_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, @@ -245,8 +327,8 @@ static ssize_t carrier_show(struct device *dev, ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - rtnl_unlock(); + rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); @@ -258,13 +340,14 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -284,13 +367,14 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. 
*/ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -490,16 +574,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -551,24 +634,23 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid; ssize_t ret = -EINVAL; /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. + * early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -580,24 +662,23 @@ static ssize_t phys_port_name_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -608,26 +689,25 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid = { }; ssize_t ret = -EINVAL; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works + * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. 
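 * (With recurse true, dev_get_port_parent_id() may also satisfy the request
 * through lower devices, so the early -EOPNOTSUPP check here would be too
 * strict in that case.)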
*/ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -1108,7 +1188,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1131,6 +1210,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and + * the re-addition of the same queue are pending (waiting on rtnl_lock), + * it might happen that the re-addition executes before the read, + * so the initial removal never happens (the queue's kobj refcount + * won't drop enough because of the pending read). In such a rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1142,20 +1237,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1200,12 +1302,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1244,9 +1348,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, 
struct netdev_queue_attribute, attr) @@ -1263,7 +1369,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1277,7 +1383,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1285,7 +1391,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1303,18 +1410,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = get_netdev_queue_index(queue); @@ -1341,24 +1448,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. 
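+	 * The later check runs under the rtnl lock and stays authoritative;
+	 * this unlocked peek merely spares the sysfs_rtnl_lock() dance for
+	 * drivers that lack ->ndo_set_tx_maxrate entirely.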
*/ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1367,18 +1475,21 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1422,16 +1533,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1450,15 +1562,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } -static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1484,13 +1598,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); -static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } -static ssize_t bql_set_stall_max(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; @@ -1499,7 +1615,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); -static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1509,8 +1626,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t 
bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1521,13 +1638,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1613,19 +1733,21 @@ out_no_maps: return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1636,18 +1758,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1671,9 +1796,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1687,26 +1813,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? 
xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1730,9 +1864,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); @@ -1792,7 +1927,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1811,6 +1945,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and + * the re-addition of the same queue are pending (waiting on rtnl_lock), + * it might happen that the re-addition executes before the read, + * making the initial removal never happen (the queue's kobj refcount + * won't drop enough because of the pending read). In such a rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ @@ -1822,15 +1972,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1885,6 +2042,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 715f85c6b62e..5b459b4fef46 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> +#include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" @@ -368,7 +369,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -385,15 +386,15 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi &&
nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; - break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index db82786fa0c4..db46880f37cc 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -3,6 +3,7 @@ #include <linux/netdevice.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> #include "page_pool_priv.h" @@ -80,3 +81,71 @@ err_free_new_mem: return err; } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (ifq_idx >= dev->real_num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + + rxq = __netif_get_rx_queue(dev, ifq_idx); + if (rxq->mp_params.mp_ops) + return -EEXIST; + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, ifq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + rtnl_lock(); + ret = __net_mp_open_rxq(dev, ifq_idx, p); + rtnl_unlock(); + return ret; +} + +static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). 
+ */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + rtnl_lock(); + __net_mp_close_rxq(dev, ifq_idx, old_p); + rtnl_unlock(); +} diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f5e908c9e7ad..686bd4a117d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -13,6 +13,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -285,13 +286,19 @@ static int page_pool_init(struct page_pool *pool, rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { + if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; - err = mp_dmabuf_devmem_init(pool); + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -587,8 +594,8 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; @@ -679,8 +686,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -1048,8 +1055,8 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } @@ -1190,3 +1197,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. 
+ */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 6677e0c2e256..9d8a3d8597fa 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -8,9 +8,9 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> #include <net/sock.h> -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -216,7 +216,6 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; unsigned int napi_id; void *hdr; @@ -249,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); @@ -356,7 +355,7 @@ void page_pool_unlist(struct page_pool *pool) int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d1e559fce918..abe1a461ea67 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -80,6 +80,11 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index b97374b508f6..ac8b6107863e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -214,6 +214,24 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(10, T1S, Half), __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_NAME(10, T1BRR, Full), + __DEFINE_LINK_MODE_NAME(200000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR_2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR, Full), + __DEFINE_LINK_MODE_NAME(200000, VR, Full), + __DEFINE_LINK_MODE_NAME(400000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2_2, Full), + __DEFINE_LINK_MODE_NAME(400000, SR2, Full), + __DEFINE_LINK_MODE_NAME(400000, VR2, Full), + __DEFINE_LINK_MODE_NAME(800000, CR4, Full), + __DEFINE_LINK_MODE_NAME(800000, KR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR4, Full), + __DEFINE_LINK_MODE_NAME(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -222,8 +240,11 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_CR4 4 #define __LINK_MODE_LANES_CR8 8 #define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR_2 1 #define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR2_2 2 #define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR4_2 4 #define __LINK_MODE_LANES_DR8 8 #define __LINK_MODE_LANES_KR 1 #define 
__LINK_MODE_LANES_KR2 2 @@ -252,6 +273,9 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_T1L 1 #define __LINK_MODE_LANES_T1S 1 #define __LINK_MODE_LANES_T1S_P2MP 1 +#define __LINK_MODE_LANES_VR 1 +#define __LINK_MODE_LANES_VR2 2 +#define __LINK_MODE_LANES_VR4 4 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 @@ -379,6 +403,24 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ed1b6b44faf8..c9f11a046c26 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57df7c1d2faa..c2e24da633d4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2493,6 +2493,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset; @@ -2511,7 +2516,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; - dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; diff --git a/tools/include/uapi/linux/netdev.h 
b/tools/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -87,6 +87,11 @@ enum { }; enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + +enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, NETDEV_A_PAGE_POOL_NAPI_ID, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 0712b5e82eb7..d027a07c1e2c 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -17,9 +17,11 @@ get_hdr_inc=-D$(1) -include $(UAPI_PATH)/linux/$(2) CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_dpll:=$(call get_hdr_inc,_LINUX_DPLL_H,dpll.h) CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ - $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h) +CFLAGS_net_shaper:=$(call get_hdr_inc,_LINUX_NET_SHAPER_H,net_shaper.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index c2eabc90dce8..b22082fd660e 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -100,7 +100,7 @@ class Type(SpecAttr): if isinstance(value, int): return value if value in self.family.consts: - raise Exception("Resolving family constants not implemented, yet") + return self.family.consts[value]["value"] return limit_to_number(value) def get_limit_str(self, limit, default=None, suffix=''): @@ -110,6 +110,9 @@ class Type(SpecAttr): if isinstance(value, int): return str(value) + suffix if value in self.family.consts: + const = self.family.consts[value] + if const.get('header'): + return c_upper(value) return c_upper(f"{self.family['name']}-{value}") return c_upper(value) @@ -2549,6 +2552,9 @@ def render_uapi(family, cw): defines = [] for const in family['definitions']: + if const.get('header'): + continue + if const['type'] != 'const': cw.writes_defines(defines) defines = [] diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 137470bdee0c..c7f1c443f2af 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -7,6 +7,7 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ TEST_PROGS := \ netcons_basic.sh \ + netcons_fragmented_msg.sh \ netcons_overflow.sh \ ping.py \ queues.py \ diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore index e9fe6ede681a..6942bf575497 100644 --- a/tools/testing/selftests/drivers/net/hw/.gitignore +++ 
b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -1 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +iou-zcrx ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 21ba64ce1e34..7efc47c89463 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT +TEST_GEN_FILES = iou-zcrx + TEST_PROGS = \ csum.py \ devlink_port_split.py \ @@ -10,6 +12,7 @@ TEST_PROGS = \ ethtool_rmon.sh \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + iou-zcrx.py \ loopback.sh \ nic_link_layer.py \ nic_performance.py \ @@ -38,3 +41,5 @@ include ../../../lib.mk # YNL build YNL_GENS := ethtool netdev include ../../../net/ynl.mk + +$(OUTPUT)/iou-zcrx: LDLIBS += -luring diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c new file mode 100644 index 000000000000..c26b4180eddd --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <assert.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <limits.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <linux/errqueue.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> + +#include <liburing.h> + +#define PAGE_SIZE (4096) +#define AREA_SIZE (8192 * PAGE_SIZE) +#define SEND_SIZE (512 * 4096) +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? 
_a : _b; \ + }) +#define min_t(t, a, b) \ + ({ \ + t _ta = (a); \ + t _tb = (b); \ + min(_ta, _tb); \ + }) + +#define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1)) + +static int cfg_server; +static int cfg_client; +static int cfg_port = 8000; +static int cfg_payload_len; +static const char *cfg_ifname; +static int cfg_queue_id = -1; +static bool cfg_oneshot; +static int cfg_oneshot_recvs; +static int cfg_send_size = SEND_SIZE; +static struct sockaddr_in6 cfg_addr; + +static char payload[SEND_SIZE] __attribute__((aligned(PAGE_SIZE))); +static void *area_ptr; +static void *ring_ptr; +static size_t ring_size; +static struct io_uring_zcrx_rq rq_ring; +static unsigned long area_token; +static int connfd; +static bool stop; +static size_t received; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) +{ + int ret; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + + ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr); + if (ret != 1) { + /* fallback to plain IPv4 */ + ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]); + if (ret != 1) + return -1; + + /* add ::ffff prefix */ + sin6->sin6_addr.s6_addr32[0] = 0; + sin6->sin6_addr.s6_addr32[1] = 0; + sin6->sin6_addr.s6_addr16[4] = 0; + sin6->sin6_addr.s6_addr16[5] = 0xffff; + } + + return 0; +} + +static inline size_t get_refill_ring_size(unsigned int rq_entries) +{ + ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe); + /* add space for the header (head/tail/etc.) */ + ring_size += PAGE_SIZE; + return ALIGN_UP(ring_size, 4096); +} + +static void setup_zcrx(struct io_uring *ring) +{ + unsigned int ifindex; + unsigned int rq_entries = 4096; + int ret; + + ifindex = if_nametoindex(cfg_ifname); + if (!ifindex) + error(1, 0, "bad interface name: %s", cfg_ifname); + + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + + ring_size = get_refill_ring_size(rq_entries); + ring_ptr = mmap(NULL, + ring_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (ring_ptr == MAP_FAILED) + error(1, 0, "mmap(): refill ring"); + + struct io_uring_region_desc region_reg = { + .size = ring_size, + .user_addr = (__u64)(unsigned long)ring_ptr, + .flags = IORING_MEM_REGION_TYPE_USER, + }; + + struct io_uring_zcrx_area_reg area_reg = { + .addr = (__u64)(unsigned long)area_ptr, + .len = AREA_SIZE, + .flags = 0, + }; + + struct io_uring_zcrx_ifq_reg reg = { + .if_idx = ifindex, + .if_rxq = cfg_queue_id, + .rq_entries = rq_entries, + .area_ptr = (__u64)(unsigned long)&area_reg, + .region_ptr = (__u64)(unsigned long)&region_reg, + }; + + ret = io_uring_register_ifq(ring, &reg); + if (ret) + error(1, 0, "io_uring_register_ifq(): %d", ret); + + rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); + rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); + rq_ring.rqes = (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); + rq_ring.rq_tail = 0; + rq_ring.ring_entries = reg.rq_entries; + + area_token = area_reg.rq_area_token; +} + +static void add_accept(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_accept(sqe, sockfd, NULL, NULL, 0); + sqe->user_data = 1; +} + +static void add_recvzc(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe
*sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, 0, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + sqe->user_data = 2; +} + +static void add_recvzc_oneshot(struct io_uring *ring, int sockfd, size_t len) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, len, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + sqe->user_data = 2; +} + +static void process_accept(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + if (cqe->res < 0) + error(1, 0, "accept()"); + if (connfd) + error(1, 0, "Unexpected second connection"); + + connfd = cqe->res; + if (cfg_oneshot) + add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + else + add_recvzc(ring, connfd); +} + +static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + unsigned rq_mask = rq_ring.ring_entries - 1; + struct io_uring_zcrx_cqe *rcqe; + struct io_uring_zcrx_rqe *rqe; + struct io_uring_sqe *sqe; + uint64_t mask; + char *data; + ssize_t n; + int i; + + if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs == 0) { + stop = true; + return; + } + + if (cqe->res < 0) + error(1, 0, "recvzc(): %d", cqe->res); + + if (cfg_oneshot) { + if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs) { + add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + cfg_oneshot_recvs--; + } + } else if (!(cqe->flags & IORING_CQE_F_MORE)) { + add_recvzc(ring, connfd); + } + + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + + n = cqe->res; + mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1; + data = (char *)area_ptr + (rcqe->off & mask); + + for (i = 0; i < n; i++) { + if (*(data + i) != payload[(received + i)]) + error(1, 0, "payload mismatch at %d", i); + } + received += n; + + rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)]; + rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token; + rqe->len = cqe->res; + io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail); +} + +static void server_loop(struct io_uring *ring) +{ + struct io_uring_cqe *cqe; + unsigned int count = 0; + unsigned int head; + int i, ret; + + io_uring_submit_and_wait(ring, 1); + + io_uring_for_each_cqe(ring, head, cqe) { + if (cqe->user_data == 1) + process_accept(ring, cqe); + else if (cqe->user_data == 2) + process_recvzc(ring, cqe); + else + error(1, 0, "unknown cqe"); + count++; + } + io_uring_cq_advance(ring, count); +} + +static void run_server(void) +{ + unsigned int flags = 0; + struct io_uring ring; + int fd, enable, ret; + uint64_t tstop; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + error(1, 0, "socket()"); + + enable = 1; + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + if (ret < 0) + error(1, 0, "setsockopt(SO_REUSEADDR)"); + + ret = bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, 0, "bind()"); + + if (listen(fd, 1024) < 0) + error(1, 0, "listen()"); + + flags |= IORING_SETUP_COOP_TASKRUN; + flags |= IORING_SETUP_SINGLE_ISSUER; + flags |= IORING_SETUP_DEFER_TASKRUN; + flags |= IORING_SETUP_SUBMIT_ALL; + flags |= IORING_SETUP_CQE32; + + io_uring_queue_init(512, &ring, flags); + + setup_zcrx(&ring); + + add_accept(&ring, fd); + + tstop = gettimeofday_ms() + 5000; + while (!stop && gettimeofday_ms() < tstop) + server_loop(&ring); + + if (!stop) + error(1, 0, "test failed\n"); +} + +static void run_client(void) +{ + ssize_t to_send = cfg_send_size; + ssize_t sent = 0; + ssize_t chunk, res; + int fd; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + error(1, 0,
"socket()"); + + if (connect(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr))) + error(1, 0, "connect()"); + + while (to_send) { + void *src = &payload[sent]; + + chunk = min_t(ssize_t, cfg_payload_len, to_send); + res = send(fd, src, chunk, 0); + if (res < 0) + error(1, 0, "send(): %d", sent); + sent += res; + to_send -= res; + } + + close(fd); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s (-4|-6) (-s|-c) -h<server_ip> -p<port> " + "-l<payload_size> -i<ifname> -q<rxq_id>", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + const int max_payload_len = sizeof(payload) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) - + 40 /* max tcp options */; + struct sockaddr_in6 *addr6 = (void *) &cfg_addr; + char *addr = NULL; + int ret; + int c; + + if (argc <= 1) + usage(argv[0]); + cfg_payload_len = max_payload_len; + + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + switch (c) { + case 's': + if (cfg_client) + error(1, 0, "Pass one of -s or -c"); + cfg_server = 1; + break; + case 'c': + if (cfg_server) + error(1, 0, "Pass one of -s or -c"); + cfg_client = 1; + break; + case 'h': + addr = optarg; + break; + case 'p': + cfg_port = strtoul(optarg, NULL, 0); + break; + case 'l': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'q': + cfg_queue_id = strtoul(optarg, NULL, 0); + break; + case 'o': { + cfg_oneshot = true; + cfg_oneshot_recvs = strtoul(optarg, NULL, 0); + break; + } + case 'z': + cfg_send_size = strtoul(optarg, NULL, 0); + break; + } + } + + if (cfg_server && addr) + error(1, 0, "Receiver cannot have -h specified"); + + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + addr6->sin6_addr = in6addr_any; + if (addr) { + ret = parse_address(addr, cfg_port, addr6); + if (ret) + error(1, 0, "receiver address parse error: %s", addr); + } + + if (cfg_payload_len > max_payload_len) + error(1, 0, "-l: payload exceeds max (%d)", max_payload_len); +} + +int main(int argc, char **argv) +{ + const char *cfg_test = argv[argc - 1]; + int i; + + parse_opts(argc, argv); + + for (i = 0; i < SEND_SIZE; i++) + payload[i] = 'a' + (i % 26); + + if (cfg_server) + run_server(); + else if (cfg_client) + run_client(); + + return 0; +} diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py new file mode 100755 index 000000000000..d301d9b356f7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import re +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, ethtool, wait_port_listen + + +def _get_rx_ring_entries(cfg): + output = ethtool(f"-g {cfg.ifname}", host=cfg.remote).stdout + values = re.findall(r'RX:\s+(\d+)', output) + return int(values[1]) + + +def _get_combined_channels(cfg): + output = ethtool(f"-l {cfg.ifname}", host=cfg.remote).stdout + values = re.findall(r'Combined:\s+(\d+)', output) + return int(values[1]) + + +def _set_flow_rule(cfg, chan): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def test_zcrx(cfg) -> None: + cfg.require_v6() + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + 
rx_ring = _get_rx_ring_entries(cfg) + + try: + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 12840" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + finally: + ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} default", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + + +def test_zcrx_oneshot(cfg) -> None: + cfg.require_v6() + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + rx_ring = _get_rx_ring_entries(cfg) + + try: + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 4096 -z 16384" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + finally: + ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} default", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 3acaba41ac7b..0c262b123fdd 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -110,6 +110,13 @@ function create_dynamic_target() { echo 1 > "${NETCONS_PATH}"/enabled } +# Do not append the release to the header of the message +function disable_release_append() { + echo 0 > "${NETCONS_PATH}"/enabled + echo 0 > "${NETCONS_PATH}"/release + echo 1 > "${NETCONS_PATH}"/enabled +} + function cleanup() { local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh new file mode 100755 index 000000000000..4a71e01a230c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Test netconsole's message fragmentation functionality. +# +# When a message exceeds the maximum packet size, netconsole splits it into +# multiple fragments for transmission. 
This test verifies: +# - Correct fragmentation of large messages +# - Proper reassembly of fragments at the receiver +# - Preservation of userdata across fragments +# - Behavior with and without kernel release version appending +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be saved to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# set userdata to a long value. In this case, it is "1-2-3-4...60-" +USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) + +# Convert the header string into a regexp, so we can remove +# the second header as well. +# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If +# release is appended, you might find something like: +# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" +function header_to_regex() { + # header is everything before ; + local HEADER="${1}" + REGEX=$(echo "${HEADER}" | cut -d'=' -f1) + echo "${REGEX}=[0-9]*\/[0-9]*;" +} + +# We have two headers in the message. Remove both of them to extract +# the full message. +function extract_msg() { + local MSGFILE="${1}" + # Extract the header, which is the very first thing that arrives in the + # first line. + HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) + HEADER_REGEX=$(header_to_regex "${HEADER}") + + # Remove the two headers from the received message + # This will return the message without any header, similar to what + # was sent. + sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" +} + +# Validate the message, which has two messages glued together. +# Unwrap them to make sure all the characters were transmitted. +# File will look like the following: +# 13,468,514729715,-,ncfrag=0/1135;<message> +# key=<part of key>-13,468,514729715,-,ncfrag=967/1135;<rest of the key> +function validate_fragmented_result() { + # Discard the netconsole headers, and assemble the full message + RCVMSG=$(extract_msg "${1}") + + # check for the main message + if ! echo "${RCVMSG}" | grep -q "${MSG}"; then + echo "Message body doesn't match." >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + + # check userdata + if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then + echo "message userdata doesn't match" >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + # test passed. hooray +} + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Set userdata "key" with the "value" value +set_user_data + + +# TEST 1: Send message and userdata. They will fragment +# ======= +MSG=$(printf -- 'MSG%.3s=' {1..150}) + +# Listen for netconsole port inside the namespace and destination interface
listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port.
+wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +# Check if the message was not corrupted +validate_fragmented_result "${OUTPUT_FILE}" + +# TEST 2: Test with smaller message, and without release appended +# ======= +MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) +# Let's disable release and test again. +disable_release_append + +listen_port_and_save_to "${OUTPUT_FILE}" & +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +echo "${MSG}: ${TARGET}" > /dev/kmsg +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +validate_fragmented_result "${OUTPUT_FILE}" +exit "${ksft_pass}" diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index d9d587454d20..8c1597ebc2d3 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -149,7 +149,7 @@ cfg_test_host_common() check_err $? "Failed to add $name host entry" bridge mdb replace dev br0 port br0 grp $grp $state vid 10 &> /dev/null - check_fail $? "Managed to replace $name host entry" + check_err $? "Failed to replace $name host entry" bridge mdb del dev br0 port br0 grp $grp $state vid 10 bridge mdb get dev br0 grp $grp vid 10 &> /dev/null diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 3f9d50f1ef9e..180c5eca556f 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -740,6 +740,8 @@ test_learning() vxlan_flood_test $mac $dst 0 10 0 + # The entry should age out when it only forwards traffic + $MZ $h1 -c 50 -d 1sec -p 64 -b $mac -B $dst -t icmp -q & sleep 60 bridge fdb show brport vx1 | grep $mac | grep -q self diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk index 12e7cae251be..e907c2751956 100644 --- a/tools/testing/selftests/net/ynl.mk +++ b/tools/testing/selftests/net/ynl.mk @@ -27,7 +27,8 @@ $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig: $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig $(Q)rm -f $(top_srcdir)/tools/net/ynl/libynl.a - $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl GENS="$(YNL_GENS)" libynl.a + $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl \ + GENS="$(YNL_GENS)" RSTS="" libynl.a $(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a EXTRA_CLEAN += \
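
The queue sysfs conversion in net/core/net-sysfs.c above follows one pattern throughout: every show/store handler gains kobj and attr parameters so it can take the rtnl lock through sysfs_rtnl_lock() instead of spinning on rtnl_trylock()/restart_syscall(). A minimal sketch of the resulting handler shape, assuming only what the call sites in this diff show (sysfs_rtnl_lock() takes the kobject, the attribute, and the net_device, returning 0 with rtnl held or a negative errno to propagate); the attribute itself is hypothetical:

static ssize_t example_show(struct kobject *kobj, struct attribute *attr,
			    struct netdev_queue *queue, char *buf)
{
	int ret;

	/* Sleep on the rtnl mutex instead of trylock+restart_syscall();
	 * a nonzero return is passed straight back to the reader.
	 */
	ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
	if (ret)
		return ret;

	/* ... read queue state that requires the rtnl lock here ... */
	ret = sysfs_emit(buf, "%u\n", 0);

	rtnl_unlock();
	return ret;
}

Note how tx_maxrate_store and the xps handlers above also defer any freeing (free_cpumask_var(), bitmap_free()) until after the helper fails, mirroring what the old trylock error path did.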
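
The page_pool changes above route every provider action through rxq->mp_params.mp_ops (init, alloc_netmems, release_netmem, destroy, nl_fill), installed per RX queue with net_mp_open_rxq() and torn down with net_mp_close_rxq(). A sketch of how a provider skeleton would plug into these hooks; the callback signatures are inferred from the call sites in this diff, all my_* names are hypothetical, and note that page_pool_init() insists the ops table live in kernel rodata (the is_kernel_rodata() check), hence the static const:

static int my_mp_init(struct page_pool *pool)
{
	/* pool->mp_priv carries whatever was put in mp_params.mp_priv */
	return 0;
}

static void my_mp_destroy(struct page_pool *pool)
{
}

static netmem_ref my_mp_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
	/* hand out provider-owned buffers instead of freshly allocated
	 * pages; net_mp_niov_set_page_pool() associates a niov with the
	 * pool on this path
	 */
	return 0;
}

static bool my_mp_release_netmem(struct page_pool *pool, netmem_ref netmem)
{
	/* the return value tells page_pool whether it should still put
	 * the netmem itself; return false if the provider reclaimed it
	 */
	return true;
}

static int my_mp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	/* describe the provider in netlink queue/page-pool dumps */
	return 0;
}

/* must be rodata: page_pool_init() rejects writable ops tables */
static const struct memory_provider_ops my_mp_ops = {
	.init		= my_mp_init,
	.destroy	= my_mp_destroy,
	.alloc_netmems	= my_mp_alloc_netmems,
	.release_netmem	= my_mp_release_netmem,
	.nl_fill	= my_mp_nl_fill,
};

static int my_mp_attach(struct net_device *dev, unsigned int rxq_idx, void *priv)
{
	struct pp_memory_provider_params p = {
		.mp_ops	 = &my_mp_ops,
		.mp_priv = priv,
	};

	/* takes rtnl, fails with -EEXIST if the queue already has a
	 * provider, and restarts the queue so the new page_pool picks
	 * the ops up in page_pool_init()
	 */
	return net_mp_open_rxq(dev, rxq_idx, &p);
}

Teardown mirrors this: net_mp_close_rxq(dev, rxq_idx, &p) verifies the same ops/priv pair is still installed before clearing it and restarting the queue.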