diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5/main.c')
| -rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 5741 |
1 files changed, 2171 insertions, 3570 deletions
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 94fe253d4956..40284bbb45d6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1,33 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. 
*/ #include <linux/debugfs.h> @@ -39,9 +13,7 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/bitmap.h> -#if defined(CONFIG_X86) -#include <asm/pat.h> -#endif +#include <linux/log2.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> @@ -52,35 +24,42 @@ #include <linux/mlx5/port.h> #include <linux/mlx5/vport.h> #include <linux/mlx5/fs.h> +#include <linux/mlx5/eswitch.h> +#include <linux/mlx5/driver.h> #include <linux/list.h> #include <rdma/ib_smi.h> -#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include <rdma/lag.h> #include <linux/in.h> #include <linux/etherdevice.h> #include "mlx5_ib.h" #include "ib_rep.h" #include "cmd.h" +#include "devx.h" +#include "dm.h" +#include "fs.h" #include "srq.h" -#include <linux/mlx5/fs_helpers.h> -#include <linux/mlx5/accel.h> +#include "qp.h" +#include "wr.h" +#include "restrack.h" +#include "counters.h" +#include "umr.h" #include <rdma/uverbs_std_types.h> +#include <rdma/uverbs_ioctl.h> #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_ucaps.h> +#include "macsec.h" +#include "data_direct.h" +#include "dmah.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> -#define DRIVER_NAME "mlx5_ib" -#define DRIVER_VERSION "5.0-0" - MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); -MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver"); MODULE_LICENSE("Dual BSD/GPL"); -static char mlx5_version[] = - DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" - DRIVER_VERSION "\n"; - struct mlx5_ib_event_work { struct work_struct work; union { @@ -104,12 +83,6 @@ static LIST_HEAD(mlx5_ib_dev_list); */ static DEFINE_MUTEX(mlx5_ib_multiport_mutex); -/* We can't use an array for xlt_emergency_page because dma_map_single - * doesn't work on kernel modules memory - */ -static unsigned long xlt_emergency_page; -static 
struct mutex xlt_emergency_page_mutex; - struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) { struct mlx5_ib_dev *dev; @@ -134,7 +107,7 @@ mlx5_port_type_cap_to_rdma_ll(int port_type_cap) } static enum rdma_link_layer -mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +mlx5_ib_port_link_layer(struct ib_device *device, u32 port_num) { struct mlx5_ib_dev *dev = to_mdev(device); int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); @@ -143,7 +116,7 @@ mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) } static int get_port_state(struct ib_device *ibdev, - u8 port_num, + u32 port_num, enum ib_port_state *state) { struct ib_port_attr attr; @@ -156,12 +129,85 @@ static int get_port_state(struct ib_device *ibdev, return ret; } +static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + u32 *port_num) +{ + struct net_device *rep_ndev; + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (!port->rep) + continue; + + if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) { + *port_num = i + 1; + return &port->roce; + } + + if (upper && port->rep->vport == MLX5_VPORT_UPLINK) + continue; + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); + *port_num = i + 1; + return &port->roce; + } + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static 
struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } + } + + return NULL; +} + static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); - u8 port_num = roce->native_port_num; + u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -172,50 +218,70 @@ static int mlx5_netdev_event(struct notifier_block *this, switch (event) { case NETDEV_REGISTER: - write_lock(&roce->netdev_lock); - if (ibdev->rep) { - struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch; - struct net_device *rep_ndev; - - rep_ndev = mlx5_ib_get_rep_netdev(esw, - ibdev->rep->vport); - if (rep_ndev == ndev) - roce->netdev = ndev; - } else if (ndev->dev.parent == &mdev->pdev->dev) { - roce->netdev = ndev; - } - write_unlock(&roce->netdev_lock); + /* Should already be registered during the load */ + if (ibdev->is_rep) + break; + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + + if (ndev->dev.parent == mdev->device) + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + /* In case of reps, ib device goes away before the netdevs */ + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct 
net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) && + !mlx5_core_mp_enabled(mdev)) + return NOTIFY_DONE; + + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { + struct net_device *lag_ndev; + + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } - if ((upper == ndev || (!upper && ndev == roce->netdev)) - && ibdev->ib_active) { + if (ibdev->is_rep) + roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); + if (!roce) + return NOTIFY_DONE; + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -224,7 +290,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -235,42 +301,16 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u8 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - 
return NULL; - - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) - goto out; - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->roce[port_num - 1].netdev_lock); - ndev = ibdev->roce[port_num - 1].netdev; - if (ndev) - dev_hold(ndev); - read_unlock(&ibdev->roce[port_num - 1].netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, - u8 ib_port_num, - u8 *native_port_num) + u32 ib_port_num, + u32 *native_port_num) { enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, ib_port_num); @@ -278,6 +318,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_port *port; + if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + if (native_port_num) + *native_port_num = smi_to_native_portnum(ibdev, + ib_port_num); + return ibdev->mdev; + + } + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) { if (native_port_num) @@ -289,9 +337,6 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, *native_port_num = 1; port = &ibdev->port[ib_port_num - 1]; - if (!port) - return NULL; - spin_lock(&port->mp.mpi_lock); mpi = ibdev->port[ib_port_num - 1].mp.mpi; if (mpi && !mpi->unaffiliate) { @@ -307,7 +352,7 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, return mdev; } -void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num) +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 port_num) { enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, port_num); @@ -331,8 +376,8 @@ out: spin_unlock(&port->mp.mpi_lock); } -static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, - u8 *active_width) +static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, + u16 *active_speed, u8 *active_width) { switch 
(eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): @@ -389,17 +434,116 @@ static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, return 0; } -static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, +static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5E_PROT_MASK(MLX5E_SGMII_100M): + case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5E_PROT_MASK(MLX5E_5GBASE_R): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_DDR; + break; + case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1): + 
*active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_XDR; + break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_XDR; + break; + case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_XDR; + break; + case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_XDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed, + u8 *active_width, bool ext) +{ + return ext ? + translate_eth_ext_proto_oper(eth_proto_oper, active_speed, + active_width) : + translate_eth_legacy_proto_oper(eth_proto_oper, active_speed, + active_width); +} + +static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; struct mlx5_core_dev *mdev; struct net_device *ndev, *upper; enum ib_mtu ndev_ib_mtu; bool put_mdev = true; - u16 qkey_viol_cntr; u32 eth_prot_oper; - u8 mdev_port_num; + u32 mdev_port_num; + bool ext; int err; mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); @@ -415,41 +559,50 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out. 
+ * Use native port in case of reps */ - err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, - mdev_port_num); + if (dev->is_rep) + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, + 1, 0); + else + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, + mdev_port_num, 0); if (err) goto out; + ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); + eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); props->active_width = IB_WIDTH_4X; props->active_speed = IB_SPEED_QDR; translate_eth_proto_oper(eth_prot_oper, &props->active_speed, - &props->active_width); + &props->active_width, ext); - props->port_cap_flags |= IB_PORT_CM_SUP; - props->ip_gids = true; + if (!dev->is_rep && dev->mdev->roce.roce_en) { + u16 qkey_viol_cntr; - props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, - roce_address_table_size); + props->port_cap_flags |= IB_PORT_CM_SUP; + props->ip_gids = true; + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + } props->max_mtu = IB_MTU_4096; props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); props->pkey_tbl_len = 1; props->state = IB_PORT_DOWN; - props->phys_state = 3; - - mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); - props->qkey_viol_cntr = qkey_viol_cntr; + props->phys_state = IB_PORT_PHYS_STATE_DISABLED; /* If this is a stub query for an unaffiliated port stop here */ if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { @@ -462,7 +615,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, if (netif_running(ndev) && netif_carrier_ok(ndev)) { props->state = IB_PORT_ACTIVE; - 
props->phys_state = 5; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } ndev_ib_mtu = iboe_get_mtu(ndev->mtu); @@ -476,34 +629,31 @@ out: return err; } -static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, - unsigned int index, const union ib_gid *gid, - const struct ib_gid_attr *attr) +int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr) { - enum ib_gid_type gid_type = IB_GID_TYPE_IB; + enum ib_gid_type gid_type; + u16 vlan_id = 0xffff; u8 roce_version = 0; u8 roce_l3_type = 0; - bool vlan = false; u8 mac[ETH_ALEN]; - u16 vlan_id = 0; + int ret; + gid_type = attr->gid_type; if (gid) { - gid_type = attr->gid_type; - ether_addr_copy(mac, attr->ndev->dev_addr); - - if (is_vlan_dev(attr->ndev)) { - vlan = true; - vlan_id = vlan_dev_vlan_id(attr->ndev); - } + ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]); + if (ret) + return ret; } switch (gid_type) { - case IB_GID_TYPE_IB: + case IB_GID_TYPE_ROCE: roce_version = MLX5_ROCE_VERSION_1; break; case IB_GID_TYPE_ROCE_UDP_ENCAP: roce_version = MLX5_ROCE_VERSION_2; - if (ipv6_addr_v4mapped((void *)gid)) + if (gid && ipv6_addr_v4mapped((void *)gid)) roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4; else roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6; @@ -514,13 +664,20 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, } return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, - roce_l3_type, gid->raw, mac, vlan, - vlan_id, port_num); + roce_l3_type, gid->raw, mac, + vlan_id < VLAN_CFI_MASK, vlan_id, + port_num); } static int mlx5_ib_add_gid(const struct ib_gid_attr *attr, __always_unused void **context) { + int ret; + + ret = mlx5r_add_gid_macsec_operations(attr); + if (ret) + return ret; + return set_roce_addr(to_mdev(attr->device), attr->port_num, attr->index, &attr->gid, attr); } @@ -528,12 +685,19 @@ static int mlx5_ib_add_gid(const struct ib_gid_attr *attr, static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, 
__always_unused void **context) { - return set_roce_addr(to_mdev(attr->device), attr->port_num, - attr->index, NULL, NULL); + int ret; + + ret = set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, attr); + if (ret) + return ret; + + mlx5r_del_gid_macsec_operations(attr); + return 0; } -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, - const struct ib_gid_attr *attr) +__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr) { if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; @@ -596,21 +760,6 @@ static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, get_atomic_caps(dev, atomic_size_qp, props); } -static void get_atomic_caps_dc(struct mlx5_ib_dev *dev, - struct ib_device_attr *props) -{ - u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); - - get_atomic_caps(dev, atomic_size_qp, props); -} - -bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev) -{ - struct ib_device_attr props = {}; - - get_atomic_caps_dc(dev, &props); - return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false; -} static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -697,7 +846,7 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, break; case MLX5_VPORT_ACCESS_METHOD_NIC: - err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, false, &tmp); break; default: @@ -728,10 +877,67 @@ static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) MLX5_REG_NODE_DESC, 0, 0); } +static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev, + struct mlx5_ib_query_device_resp *resp) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + u16 vport = mlx5_eswitch_manager_vport(mdev); + + resp->reg_c0.value = mlx5_eswitch_get_vport_metadata_for_match(esw, + vport); + resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask(); +} + +/* + * Calculate maximum SQ overhead across all QP types. 
+ * Other QP types (REG_UMR, UC, RC, UD/SMI/GSI, XRC_TGT) + * have smaller overhead than the types calculated below, + * so they are implicitly included. + */ +static u32 mlx5_ib_calc_max_sq_overhead(void) +{ + u32 max_overhead_xrc, overhead_ud_lso, a, b; + + /* XRC_INI */ + max_overhead_xrc = sizeof(struct mlx5_wqe_xrc_seg); + max_overhead_xrc += sizeof(struct mlx5_wqe_ctrl_seg); + a = sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + b = sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / MLX5_IB_UMR_OCTOWORD; + max_overhead_xrc += max(a, b); + + /* UD with LSO */ + overhead_ud_lso = sizeof(struct mlx5_wqe_ctrl_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_pad); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_datagram_seg); + + return max(max_overhead_xrc, overhead_ud_lso); +} + +static u32 mlx5_ib_calc_max_qp_wr(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 max_wqe_bb_units = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + u32 max_wqe_size; + /* max QP overhead + 1 SGE, no inline, no special features */ + max_wqe_size = mlx5_ib_calc_max_sq_overhead() + + sizeof(struct mlx5_wqe_data_seg); + + max_wqe_size = roundup_pow_of_two(max_wqe_size); + + max_wqe_size = ALIGN(max_wqe_size, MLX5_SEND_WQE_BB); + + return (max_wqe_bb_units * MLX5_SEND_WQE_BB) / max_wqe_size; +} + static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { + size_t uhw_outlen = (uhw) ? 
uhw->outlen : 0; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; @@ -745,12 +951,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); - if (uhw->outlen && uhw->outlen < resp_len) + if (uhw_outlen && uhw_outlen < resp_len) return -EINVAL; - else - resp.response_length = resp_len; - if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + resp.response_length = resp_len; + + if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); @@ -759,9 +965,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (err) return err; - err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); - if (err) - return err; + props->max_pkeys = dev->pkey_table_len; err = mlx5_query_vendor_id(ibdev, &props->vendor_id); if (err) @@ -788,11 +992,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); /* We support 'Gappy' memory registration too */ - props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + props->kernel_cap_flags |= IBK_SG_GAPS_REG; } - props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + /* IB_WR_REG_MR always requires changing the entity size with UMR */ + if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { - props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + props->kernel_cap_flags |= IBK_INTEGRITY_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | @@ -801,7 +1007,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_GUARD_T10DIF_CSUM; } if (MLX5_CAP_GEN(mdev, block_lb_mc)) - props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->kernel_cap_flags |= 
IBK_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { if (MLX5_CAP_ETH(mdev, csum_cap)) { @@ -814,7 +1020,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->raw_packet_caps |= IB_RAW_PACKET_CAP_CVLAN_STRIPPING; - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; @@ -824,7 +1030,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = @@ -837,22 +1043,18 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP | MLX5_RX_HASH_INNER; - if (mlx5_accel_ipsec_device_caps(dev->mdev) & - MLX5_ACCEL_IPSEC_CAP_DEVICE) - resp.rss_caps.rx_hash_fields_mask |= - MLX5_RX_HASH_IPSEC_SPI; resp.response_length += sizeof(resp.rss_caps); } } else { - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) resp.response_length += sizeof(resp.tso_caps); - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) resp.response_length += sizeof(resp.rss_caps); } if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - props->device_cap_flags |= IB_DEVICE_UD_TSO; + props->kernel_cap_flags |= IBK_UD_TSO; } if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && @@ -889,7 +1091,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); - props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->max_qp_wr = 
mlx5_ib_calc_max_qp_wr(dev); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); @@ -912,28 +1114,45 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + props->max_pi_fast_reg_page_list_len = + props->max_fast_reg_page_list_len / 2; + props->max_sgl_rd = + MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance); get_atomic_caps_qp(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->max_ah = INT_MAX; props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (MLX5_CAP_GEN(mdev, pg)) - props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; - props->odp_caps = dev->odp_caps; -#endif - - if (MLX5_CAP_GEN(mdev, cd)) - props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) + props->kernel_cap_flags |= IBK_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; + if (!uhw) { + /* ODP for kernel QPs is not implemented for receive + * WQEs and SRQ WQEs + */ + props->odp_caps.per_transport_caps.rc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.uc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.ud_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.xrc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + } 
+ } - if (!mlx5_core_is_pf(mdev)) - props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + if (mlx5_core_is_vf(mdev)) + props->kernel_cap_flags |= IBK_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET && raw_support) { @@ -947,15 +1166,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->tm_caps.flags = IB_TM_CAP_RC; props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } + if (MLX5_CAP_GEN(mdev, tag_matching) && + MLX5_CAP_GEN(mdev, rndv_offload_rc)) { + props->tm_caps.flags = IB_TM_CAP_RNDV_RC; + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + } + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { props->cq_caps.max_cq_moderation_count = MLX5_MAX_CQ_COUNT; @@ -963,7 +1186,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_MAX_CQ_PERIOD; } - if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { + if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.cqe_comp_caps); if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { @@ -981,7 +1204,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen && raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { @@ -999,8 +1222,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.packet_pacing_caps); } - if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw->outlen)) { + if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <= + uhw_outlen) { if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) 
resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; @@ -1013,7 +1236,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } - if (field_avail(typeof(resp), flags, uhw->outlen)) { + if (offsetofend(typeof(resp), flags) <= uhw_outlen) { resp.response_length += sizeof(resp.flags); if (MLX5_CAP_GEN(mdev, cqe_compression_128)) @@ -1025,10 +1248,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, qp_packet_based)) resp.flags |= MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; + + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + + if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) && + (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc))) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP; } - if (field_avail(typeof(resp), sw_parsing_caps, - uhw->outlen)) { + if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.sw_parsing_caps); if (MLX5_CAP_ETH(mdev, swp)) { resp.sw_parsing_caps.sw_parsing_offloads |= @@ -1048,7 +1280,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen && raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { @@ -1056,8 +1288,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; resp.striding_rq_caps.max_single_stride_log_num_of_bytes = MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES; - resp.striding_rq_caps.min_single_wqe_log_num_of_strides = - MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range)) + resp.striding_rq_caps + 
.min_single_wqe_log_num_of_strides = + MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + else + resp.striding_rq_caps + .min_single_wqe_log_num_of_strides = + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; resp.striding_rq_caps.max_single_wqe_log_num_of_strides = MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES; resp.striding_rq_caps.supported_qpts = @@ -1065,8 +1303,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), tunnel_offloads_caps, - uhw->outlen)) { + if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.tunnel_offloads_caps); if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) resp.tunnel_offloads_caps |= @@ -1077,17 +1314,38 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_GRE; - if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & - MLX5_FLEX_PROTO_CW_MPLS_GRE) + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; - if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & - MLX5_FLEX_PROTO_CW_MPLS_UDP) + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } - if (uhw->outlen) { + if (offsetofend(typeof(resp), dci_streams_caps) <= uhw_outlen) { + resp.response_length += sizeof(resp.dci_streams_caps); + + resp.dci_streams_caps.max_log_num_concurent = + MLX5_CAP_GEN(mdev, log_max_dci_stream_channels); + + resp.dci_streams_caps.max_log_num_errored = + MLX5_CAP_GEN(mdev, log_max_dci_errored_streams); + } + + if (offsetofend(typeof(resp), reserved) <= uhw_outlen) + resp.response_length += sizeof(resp.reserved); + + if (offsetofend(typeof(resp), reg_c0) <= uhw_outlen) { + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + resp.response_length += sizeof(resp.reg_c0); + + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS && + 
mlx5_eswitch_vport_match_metadata_enabled(esw)) + fill_esw_mgr_reg_c0(mdev, &resp); + } + + if (uhw_outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) @@ -1097,32 +1355,24 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, return 0; } -enum mlx5_ib_width { - MLX5_IB_WIDTH_1X = 1 << 0, - MLX5_IB_WIDTH_2X = 1 << 1, - MLX5_IB_WIDTH_4X = 1 << 2, - MLX5_IB_WIDTH_8X = 1 << 3, - MLX5_IB_WIDTH_12X = 1 << 4 -}; - -static void translate_active_width(struct ib_device *ibdev, u8 active_width, - u8 *ib_width) +static void translate_active_width(struct ib_device *ibdev, u16 active_width, + u8 *ib_width) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - if (active_width & MLX5_IB_WIDTH_1X) + if (active_width & MLX5_PTYS_WIDTH_1X) *ib_width = IB_WIDTH_1X; - else if (active_width & MLX5_IB_WIDTH_2X) + else if (active_width & MLX5_PTYS_WIDTH_2X) *ib_width = IB_WIDTH_2X; - else if (active_width & MLX5_IB_WIDTH_4X) + else if (active_width & MLX5_PTYS_WIDTH_4X) *ib_width = IB_WIDTH_4X; - else if (active_width & MLX5_IB_WIDTH_8X) + else if (active_width & MLX5_PTYS_WIDTH_8X) *ib_width = IB_WIDTH_8X; - else if (active_width & MLX5_IB_WIDTH_12X) + else if (active_width & MLX5_PTYS_WIDTH_12X) *ib_width = IB_WIDTH_12X; else { mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n", - (int)active_width); + active_width); *ib_width = IB_WIDTH_4X; } @@ -1190,17 +1440,17 @@ static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, return 0; } -static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, +static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *rep; + u8 vl_hw_cap, plane_index = 0; u16 max_mtu; u16 oper_mtu; int err; - u8 ib_link_width_oper; - u8 vl_hw_cap; + u16 ib_link_width_oper; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (!rep) { @@ -1210,6 +1460,11 @@ 
static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, /* props being zeroed by the caller, avoid zeroing it here */ + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) { + plane_index = port; + port = smi_to_native_portnum(dev, port); + } + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) goto out; @@ -1220,7 +1475,14 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, props->sm_sl = rep->sm_sl; props->state = rep->vport_state; props->phys_state = rep->port_physical_state; - props->port_cap_flags = rep->cap_mask1; + + props->port_cap_flags = rep->cap_mask1; + if (dev->num_plane) { + props->port_cap_flags |= IB_PORT_SM_DISABLED; + props->port_cap_flags &= ~IB_PORT_SM; + } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + props->port_cap_flags &= ~IB_PORT_CM_SUP; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); @@ -1232,16 +1494,13 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP) props->port_cap_flags2 = rep->cap_mask2; - err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port); + err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, + &props->active_speed, port, plane_index); if (err) goto out; translate_active_width(ibdev, ib_link_width_oper, &props->active_width); - err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port); - if (err) - goto out; - mlx5_query_port_max_mtu(mdev, &max_mtu, port); props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu); @@ -1261,7 +1520,7 @@ out: return err; } -int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, +int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { unsigned int count; @@ -1306,23 +1565,23 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, return ret; } -static int 
mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port, +static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { - int ret; - - /* Only link layer == ethernet is valid for representors */ - ret = mlx5_query_port_roce(ibdev, port, props); - if (ret || !props) - return ret; - - /* We don't support GIDS */ - props->gid_tbl_len = 0; + return mlx5_query_port_roce(ibdev, port, props); +} - return ret; +static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + /* Default special Pkey for representor device port as per the + * IB specification 1.3 section 10.9.1.2. + */ + *pkey = 0xffff; + return 0; } -static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, +static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, union ib_gid *gid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); @@ -1341,13 +1600,13 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, } -static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev; bool put_mdev = true; - u8 mdev_port_num; + u32 mdev_port_num; int err; mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); @@ -1368,7 +1627,7 @@ static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, return err; } -static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) { switch (mlx5_get_vport_access_method(ibdev)) { @@ -1412,12 +1671,12 @@ static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, return err; } -static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, +static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u32 port_num, u32 mask, u32 value) { struct mlx5_hca_vport_context 
ctx = {}; struct mlx5_core_dev *mdev; - u8 mdev_port_num; + u32 mdev_port_num; int err; mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); @@ -1446,7 +1705,7 @@ out: return err; } -static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, +static int mlx5_ib_modify_port(struct ib_device *ibdev, u32 port, int mask, struct ib_port_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); @@ -1547,7 +1806,8 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte bfregi = &context->bfregi; for (i = 0; i < bfregi->num_static_sys_pages; i++) { - err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); + err = mlx5_cmd_uar_alloc(dev->mdev, &bfregi->sys_pages[i], + context->devx_uid); if (err) goto error; @@ -1561,7 +1821,8 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte error: for (--i; i >= 0; i--) - if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) + if (mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i], + context->devx_uid)) mlx5_ib_warn(dev, "failed to free uar %d\n", i); return err; @@ -1577,13 +1838,49 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, for (i = 0; i < bfregi->num_sys_pages; i++) if (i < bfregi->num_static_sys_pages || bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) - mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); + mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i], + context->devx_uid); +} + +static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) +{ + int err; + + err = mlx5_nic_vport_update_local_lb(master, true); + if (err) + return err; + + err = mlx5_nic_vport_update_local_lb(slave, true); + if (err) + goto out; + + lb_state->force_enable = true; + return 0; + +out: + mlx5_nic_vport_update_local_lb(master, false); + return err; +} + +static void mlx5_ib_disable_lb_mp(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + struct 
mlx5_ib_lb_state *lb_state) +{ + mlx5_nic_vport_update_local_lb(slave, false); + mlx5_nic_vport_update_local_lb(master, false); + + lb_state->force_enable = false; } int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { int err = 0; + if (dev->lb.force_enable) + return 0; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td++; @@ -1605,6 +1902,9 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { + if (dev->lb.force_enable) + return; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td--; @@ -1658,89 +1958,158 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, mlx5_ib_disable_lb(dev, true, false); } -static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int set_ucontext_resp(struct ib_ucontext *uctx, + struct mlx5_ib_alloc_ucontext_resp *resp) +{ + struct ib_device *ibdev = uctx->device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_ucontext *context = to_mucontext(uctx); + struct mlx5_bfreg_info *bfregi = &context->bfregi; + + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + resp->dump_fill_mkey = dev->mkeys.dump_fill_mkey; + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; + } + + resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (mlx5_wc_support_get(dev->mdev)) + resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, + log_bf_reg_size); + resp->cache_line_size = cache_line_size(); + resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp->cqe_version = context->cqe_version; + resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
+ MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_CAP_GEN(dev->mdev, + num_of_uars_per_page) : 1; + resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 : + bfregi->total_num_bfregs - bfregi->num_dyn_bfregs; + resp->num_ports = dev->num_ports; + resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; + + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline); + resp->eth_min_inline++; + } + + if (dev->mdev->clock_info) + resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. + */ + if (PAGE_SIZE <= 4096) { + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp->hca_core_clock_offset = + offsetof(struct mlx5_init_seg, + internal_timer_h) % PAGE_SIZE; + } + + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE; + + if (rt_supported(MLX5_CAP_GEN(dev->mdev, sq_ts_format)) && + rt_supported(MLX5_CAP_GEN(dev->mdev, rq_ts_format)) && + rt_supported(MLX5_CAP_ROCE(dev->mdev, qp_ts_format))) + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_REAL_TIME_TS; + + resp->num_dyn_bfregs = bfregi->num_dyn_bfregs; + + if (MLX5_CAP_GEN(dev->mdev, drain_sigerr)) + resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_SQD2RTS; + + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_MKEY_UPDATE_TAG; + + return 0; +} + +static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps) { + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) || + UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + +static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) +{ + 
struct ib_device *ibdev = uctx->device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; - struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_ucontext *context; + struct mlx5_ib_ucontext *context = to_mucontext(uctx); struct mlx5_bfreg_info *bfregi; int ver; int err; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); - u32 dump_fill_mkey; bool lib_uar_4k; + bool lib_uar_dyn; if (!dev->ib_active) - return ERR_PTR(-EAGAIN); + return -EAGAIN; if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (udata->inlen >= min_req_v2) ver = 2; else - return ERR_PTR(-EINVAL); + return -EINVAL; err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) - return ERR_PTR(err); + return err; if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; req.total_num_bfregs = ALIGN(req.total_num_bfregs, MLX5_NON_FP_BFREGS_PER_UAR); if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) - return ERR_PTR(-EINVAL); - - resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); - if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) - resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); - resp.cache_line_size = cache_line_size(); - resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); - resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); - resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); - resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); - resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); - resp.cqe_version = min_t(__u8, - (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), - req.max_cqe_version); - resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
- MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; - resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? - MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; - resp.response_length = min(offsetof(typeof(resp), response_length) + - sizeof(resp.response_length), udata->outlen); - - if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) { - if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS)) - resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM; - if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA) - resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA; - if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) - resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING; - if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN) - resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN; - /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ - } + return -EINVAL; - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { + err = mlx5_ib_devx_create(dev, true, uctx->enabled_caps); + if (err < 0) + goto out_ctx; + context->devx_uid = err; + + if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) { + err = mlx5_cmd_add_privileged_uid(dev->mdev, + context->devx_uid); + if (err) + goto out_devx; + } + } lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR; bfregi = &context->bfregi; + if (lib_uar_dyn) { + bfregi->lib_uar_dyn = lib_uar_dyn; + goto uar_done; + } + /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) - goto out_ctx; + goto out_ucap; mutex_init(&bfregi->lock); 
bfregi->lib_uar_4k = lib_uar_4k; @@ -1748,7 +2117,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; - goto out_ctx; + goto out_ucap; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, @@ -1763,118 +2132,45 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_sys_pages; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; -#endif - - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { - err = mlx5_ib_devx_create(dev, true); - if (err < 0) - goto out_uars; - context->devx_uid = err; - } - +uar_done: err = mlx5_ib_alloc_transport_domain(dev, &context->tdn, context->devx_uid); if (err) - goto out_devx; - - if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { - err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey); - if (err) - goto out_mdev; - } + goto out_uars; INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - resp.tot_bfregs = req.total_num_bfregs; - resp.num_ports = dev->num_ports; - - if (field_avail(typeof(resp), cqe_version, udata->outlen)) - resp.response_length += sizeof(resp.cqe_version); - - if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { - resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | - MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; - resp.response_length += sizeof(resp.cmds_supp_uhw); - } - - if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { - if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { - mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); - resp.eth_min_inline++; - } - resp.response_length += sizeof(resp.eth_min_inline); - } - - if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { - if (mdev->clock_info) - resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); - resp.response_length += sizeof(resp.clock_info_versions); - } - - /* - * We don't want to expose information from the PCI bar 
that is located - * after 4096 bytes, so if the arch only supports larger pages, let's - * pretend we don't support reading the HCA's core clock. This is also - * forced by mmap function. - */ - if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { - if (PAGE_SIZE <= 4096) { - resp.comp_mask |= - MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; - resp.hca_core_clock_offset = - offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; - } - resp.response_length += sizeof(resp.hca_core_clock_offset); - } - - if (field_avail(typeof(resp), log_uar_size, udata->outlen)) - resp.response_length += sizeof(resp.log_uar_size); - - if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) - resp.response_length += sizeof(resp.num_uars_per_page); - - if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { - resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; - resp.response_length += sizeof(resp.num_dyn_bfregs); - } + context->cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); - if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) { - if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { - resp.dump_fill_mkey = dump_fill_mkey; - resp.comp_mask |= - MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; - } - resp.response_length += sizeof(resp.dump_fill_mkey); - } + err = set_ucontext_resp(uctx, &resp); + if (err) + goto out_mdev; + resp.response_length = min(udata->outlen, sizeof(resp)); err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_mdev; bfregi->ver = ver; bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; - context->cqe_version = resp.cqe_version; context->lib_caps = req.lib_caps; print_lib_caps(dev, context->lib_caps); - if (dev->lag_active) { - u8 port = mlx5_core_native_port_num(dev->mdev); + if (mlx5_ib_lag_should_assign_affinity(dev)) { + u32 port = mlx5_core_native_port_num(dev->mdev) - 1; atomic_set(&context->tx_port_affinity, atomic_add_return( - 1, 
&dev->roce[port].tx_port_affinity)); + 1, &dev->port[port].roce.tx_port_affinity)); } - return &context->ibucontext; + return 0; out_mdev: mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); -out_devx: - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) - mlx5_ib_devx_destroy(dev, context->devx_uid); out_uars: deallocate_uars(dev, context); @@ -1885,37 +2181,61 @@ out_sys_pages: out_count: kfree(bfregi->count); +out_ucap: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX && + uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid); + +out_devx: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) + mlx5_ib_devx_destroy(dev, context->devx_uid); + out_ctx: - kfree(context); + return err; +} - return ERR_PTR(err); +static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_alloc_ucontext_resp uctx_resp = {}; + int ret; + + ret = set_ucontext_resp(ibcontext, &uctx_resp); + if (ret) + return ret; + + uctx_resp.response_length = + min_t(size_t, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX), + sizeof(uctx_resp)); + + ret = uverbs_copy_to_struct_or_zero(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + &uctx_resp, + sizeof(uctx_resp)); + return ret; } -static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* All umem's must be destroyed before destroying the ucontext. 
*/ - mutex_lock(&ibcontext->per_mm_list_lock); - WARN_ON(!list_empty(&ibcontext->per_mm_list)); - mutex_unlock(&ibcontext->per_mm_list_lock); -#endif - bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); - if (context->devx_uid) - mlx5_ib_devx_destroy(dev, context->devx_uid); - deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); - kfree(context); - return 0; + if (context->devx_uid) { + if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, + context->devx_uid); + mlx5_ib_devx_destroy(dev, context->devx_uid); + } } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, @@ -1925,7 +2245,18 @@ static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; - return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; + return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; +} + +static u64 uar_index2paddress(struct mlx5_ib_dev *dev, + int uar_idx) +{ + unsigned int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
+ MLX5_UARS_IN_PAGE : 1; + + return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE); } static int get_command(unsigned long offset) @@ -1966,7 +2297,7 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) case MLX5_IB_MMAP_DEVICE_MEM: return "Device Memory"; default: - return NULL; + return "Unknown"; } } @@ -1974,20 +2305,51 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + if ((vma->vm_end - vma->vm_start != PAGE_SIZE) || + !(vma->vm_flags & VM_SHARED)) return -EINVAL; if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) return -EOPNOTSUPP; - if (vma->vm_flags & VM_WRITE) + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; + vm_flags_clear(vma, VM_MAYWRITE); - if (!dev->mdev->clock_info_page) + if (!dev->mdev->clock_info) return -EOPNOTSUPP; - return rdma_user_mmap_page(&context->ibucontext, vma, - dev->mdev->clock_info_page, PAGE_SIZE); + return vm_insert_page(vma, vma->vm_start, + virt_to_page(dev->mdev->clock_info)); +} + +static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) +{ + struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); + struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device); + struct mlx5_var_table *var_table = &dev->var_table; + struct mlx5_ib_ucontext *context = to_mucontext(entry->ucontext); + + switch (mentry->mmap_flag) { + case MLX5_IB_MMAP_TYPE_MEMIC: + case MLX5_IB_MMAP_TYPE_MEMIC_OP: + mlx5_ib_dm_mmap_free(dev, mentry); + break; + case MLX5_IB_MMAP_TYPE_VAR: + mutex_lock(&var_table->bitmap_lock); + clear_bit(mentry->page_idx, var_table->bitmap); + mutex_unlock(&var_table->bitmap_lock); + kfree(mentry); + break; + case MLX5_IB_MMAP_TYPE_UAR_WC: + case MLX5_IB_MMAP_TYPE_UAR_NC: + mlx5_cmd_uar_dealloc(dev->mdev, mentry->page_idx, + context->devx_uid); + kfree(mentry); + break; + default: + WARN_ON(true); + } } static int uar_mmap(struct mlx5_ib_dev *dev, 
enum mlx5_ib_mmap_cmd cmd, @@ -2005,6 +2367,9 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : bfregi->num_static_sys_pages; + if (bfregi->lib_uar_dyn) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2022,14 +2387,6 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: -/* Some architectures don't support WC memory */ -#if defined(CONFIG_X86) - if (!pat_enabled()) - return -EPERM; -#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) - return -EPERM; -#endif - /* fall through */ case MLX5_IB_MMAP_REGULAR_PAGE: /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ prot = pgprot_writecombine(vma->vm_page_prot); @@ -2065,7 +2422,8 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, bfregi->count[bfreg_dyn_idx]++; mutex_unlock(&bfregi->lock); - err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, + context->devx_uid); if (err) { mlx5_ib_warn(dev, "UAR alloc failed\n"); goto free_bfreg; @@ -2078,7 +2436,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, - prot); + prot, NULL); if (err) { mlx5_ib_err(dev, "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n", @@ -2094,7 +2452,7 @@ err: if (!dyn_uar) return err; - mlx5_cmd_free_uar(dev->mdev, idx); + mlx5_cmd_uar_dealloc(dev->mdev, idx, context->devx_uid); free_bfreg: mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); @@ -2102,25 +2460,55 @@ free_bfreg: return err; } -static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma) { - struct mlx5_ib_ucontext *mctx = to_mucontext(context); - 
struct mlx5_ib_dev *dev = to_mdev(context->device); - u16 page_idx = get_extended_index(vma->vm_pgoff); - size_t map_size = vma->vm_end - vma->vm_start; - u32 npages = map_size >> PAGE_SHIFT; + unsigned long idx; + u8 command; + + command = get_command(vma->vm_pgoff); + idx = get_extended_index(vma->vm_pgoff); + + return (command << 16 | idx); +} + +static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct ib_ucontext *ucontext) +{ + struct mlx5_user_mmap_entry *mentry; + struct rdma_user_mmap_entry *entry; + unsigned long pgoff; + pgprot_t prot; phys_addr_t pfn; + int ret; - if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) != - page_idx + npages) + pgoff = mlx5_vma_to_pgoff(vma); + entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff); + if (!entry) return -EINVAL; - pfn = ((pci_resource_start(dev->mdev->pdev, 0) + - MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> - PAGE_SHIFT) + - page_idx; - return rdma_user_mmap_io(context, vma, pfn, map_size, - pgprot_writecombine(vma->vm_page_prot)); + mentry = to_mmmap(entry); + pfn = (mentry->address >> PAGE_SHIFT); + if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR || + mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC) + prot = pgprot_noncached(vma->vm_page_prot); + else + prot = pgprot_writecombine(vma->vm_page_prot); + ret = rdma_user_mmap_io(ucontext, vma, pfn, + entry->npages * PAGE_SIZE, + prot, + entry); + rdma_user_mmap_entry_put(&mentry->rdma_entry); + return ret; +} + +static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry) +{ + u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF; + u64 index = entry->rdma_entry.start_pgoff & 0xFFFF; + + return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) | + (index & 0xFF)) << PAGE_SHIFT; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) @@ -2133,9 +2521,12 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm command = 
get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + if (!mlx5_wc_support_get(dev->mdev)) + return -EPERM; + fallthrough; case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - case MLX5_IB_MMAP_ALLOC_WC: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: @@ -2147,1907 +2538,67 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) return -EOPNOTSUPP; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - break; + return rdma_user_mmap_io(&context->ibucontext, vma, pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + NULL); case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); - case MLX5_IB_MMAP_DEVICE_MEM: - return dm_mmap(ibcontext, vma); - default: - return -EINVAL; + return mlx5_ib_mmap_offset(dev, vma, ibcontext); } return 0; } -struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_dm_alloc_attr *attr, - struct uverbs_attr_bundle *attrs) -{ - u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE); - struct mlx5_memic *memic = &to_mdev(ibdev)->memic; - phys_addr_t memic_addr; - struct mlx5_ib_dm *dm; - u64 start_offset; - u32 page_idx; - int err; - - dm = kzalloc(sizeof(*dm), GFP_KERNEL); - if (!dm) - return ERR_PTR(-ENOMEM); - - mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n", - attr->length, act_size, attr->alignment); - - err = mlx5_cmd_alloc_memic(memic, &memic_addr, - act_size, attr->alignment); - if (err) - goto err_free; - - 
start_offset = memic_addr & ~PAGE_MASK; - page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) - - MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> - PAGE_SHIFT; - - err = uverbs_copy_to(attrs, - MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, - &start_offset, sizeof(start_offset)); - if (err) - goto err_dealloc; - - err = uverbs_copy_to(attrs, - MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, - &page_idx, sizeof(page_idx)); - if (err) - goto err_dealloc; - - bitmap_set(to_mucontext(context)->dm_pages, page_idx, - DIV_ROUND_UP(act_size, PAGE_SIZE)); - - dm->dev_addr = memic_addr; - - return &dm->ibdm; - -err_dealloc: - mlx5_cmd_dealloc_memic(memic, memic_addr, - act_size); -err_free: - kfree(dm); - return ERR_PTR(err); -} - -int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) -{ - struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic; - struct mlx5_ib_dm *dm = to_mdm(ibdm); - u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE); - u32 page_idx; - int ret; - - ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size); - if (ret) - return ret; - - page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) - - MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> - PAGE_SHIFT; - bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages, - page_idx, - DIV_ROUND_UP(act_size, PAGE_SIZE)); - - kfree(dm); - - return 0; -} - -static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { + struct mlx5_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; struct mlx5_ib_alloc_pd_resp resp; - struct mlx5_ib_pd *pd; int err; u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; - u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; u16 uid = 0; + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); - pd = kmalloc(sizeof(*pd), 
GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - - uid = context ? to_mucontext(context)->devx_uid : 0; + uid = context ? context->devx_uid : 0; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); MLX5_SET(alloc_pd_in, in, uid, uid); - err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), - out, sizeof(out)); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out); + if (err) + return err; pd->pdn = MLX5_GET(alloc_pd_out, out, pd); pd->uid = uid; - if (context) { + if (udata) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } -static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); - mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); - kfree(mpd); - - return 0; -} - -enum { - MATCH_CRITERIA_ENABLE_OUTER_BIT, - MATCH_CRITERIA_ENABLE_MISC_BIT, - MATCH_CRITERIA_ENABLE_INNER_BIT, - MATCH_CRITERIA_ENABLE_MISC2_BIT -}; - -#define HEADER_IS_ZERO(match_criteria, headers) \ - !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ - 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ - -static u8 get_match_criteria_enable(u32 *match_criteria) -{ - u8 match_criteria_enable; - - match_criteria_enable = - (!HEADER_IS_ZERO(match_criteria, outer_headers)) << - MATCH_CRITERIA_ENABLE_OUTER_BIT; - match_criteria_enable |= - (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << - MATCH_CRITERIA_ENABLE_MISC_BIT; - match_criteria_enable |= - (!HEADER_IS_ZERO(match_criteria, inner_headers)) << - MATCH_CRITERIA_ENABLE_INNER_BIT; - match_criteria_enable |= - (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << - MATCH_CRITERIA_ENABLE_MISC2_BIT; - - return 
match_criteria_enable; -} - -static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) -{ - MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); - MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); -} - -static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val, - bool inner) -{ - if (inner) { - MLX5_SET(fte_match_set_misc, - misc_c, inner_ipv6_flow_label, mask); - MLX5_SET(fte_match_set_misc, - misc_v, inner_ipv6_flow_label, val); - } else { - MLX5_SET(fte_match_set_misc, - misc_c, outer_ipv6_flow_label, mask); - MLX5_SET(fte_match_set_misc, - misc_v, outer_ipv6_flow_label, val); - } -} - -static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) -{ - MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); - MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); - MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); - MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); -} - -static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) -{ - if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) && - !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL)) - return -EOPNOTSUPP; - - if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) && - !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP)) - return -EOPNOTSUPP; - - if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) && - !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS)) - return -EOPNOTSUPP; - - if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) && - !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL)) - return -EOPNOTSUPP; - - return 0; -} - -#define LAST_ETH_FIELD vlan_tag -#define LAST_IB_FIELD sl -#define LAST_IPV4_FIELD tos -#define LAST_IPV6_FIELD traffic_class -#define LAST_TCP_UDP_FIELD src_port -#define LAST_TUNNEL_FIELD tunnel_id -#define LAST_FLOW_TAG_FIELD tag_id -#define LAST_DROP_FIELD size -#define LAST_COUNTERS_FIELD counters - -/* Field is the last supported field */ -#define FIELDS_NOT_SUPPORTED(filter, field)\ - memchr_inv((void 
*)&filter.field +\ - sizeof(filter.field), 0,\ - sizeof(filter) -\ - offsetof(typeof(filter), field) -\ - sizeof(filter.field)) - -int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, - bool is_egress, - struct mlx5_flow_act *action) -{ - - switch (maction->ib_action.type) { - case IB_FLOW_ACTION_ESP: - if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | - MLX5_FLOW_CONTEXT_ACTION_DECRYPT)) - return -EINVAL; - /* Currently only AES_GCM keymat is supported by the driver */ - action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; - action->action |= is_egress ? - MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : - MLX5_FLOW_CONTEXT_ACTION_DECRYPT; - return 0; - case IB_FLOW_ACTION_UNSPECIFIED: - if (maction->flow_action_raw.sub_type == - MLX5_IB_FLOW_ACTION_MODIFY_HEADER) { - if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) - return -EINVAL; - action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; - action->modify_id = maction->flow_action_raw.action_id; - return 0; - } - if (maction->flow_action_raw.sub_type == - MLX5_IB_FLOW_ACTION_DECAP) { - if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) - return -EINVAL; - action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; - return 0; - } - if (maction->flow_action_raw.sub_type == - MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) { - if (action->action & - MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) - return -EINVAL; - action->action |= - MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; - action->reformat_id = - maction->flow_action_raw.action_id; - return 0; - } - /* fall through */ - default: - return -EOPNOTSUPP; - } -} - -static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, - u32 *match_v, const union ib_flow_spec *ib_spec, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_act *action, u32 prev_type) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, - misc_parameters); - void *misc_params2_c = 
MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters_2); - void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v, - misc_parameters_2); - void *headers_c; - void *headers_v; - int match_ipv; - int ret; - - if (ib_spec->type & IB_FLOW_SPEC_INNER) { - headers_c = MLX5_ADDR_OF(fte_match_param, match_c, - inner_headers); - headers_v = MLX5_ADDR_OF(fte_match_param, match_v, - inner_headers); - match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.inner_ip_version); - } else { - headers_c = MLX5_ADDR_OF(fte_match_param, match_c, - outer_headers); - headers_v = MLX5_ADDR_OF(fte_match_param, match_v, - outer_headers); - match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.outer_ip_version); - } - - switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { - case IB_FLOW_SPEC_ETH: - if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) - return -EOPNOTSUPP; - - ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - dmac_47_16), - ib_spec->eth.mask.dst_mac); - ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - dmac_47_16), - ib_spec->eth.val.dst_mac); - - ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - smac_47_16), - ib_spec->eth.mask.src_mac); - ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - smac_47_16), - ib_spec->eth.val.src_mac); - - if (ib_spec->eth.mask.vlan_tag) { - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - cvlan_tag, 1); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - cvlan_tag, 1); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - first_vid, ntohs(ib_spec->eth.val.vlan_tag)); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - first_cfi, - ntohs(ib_spec->eth.mask.vlan_tag) >> 12); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - first_cfi, - ntohs(ib_spec->eth.val.vlan_tag) >> 12); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - first_prio, - ntohs(ib_spec->eth.mask.vlan_tag) >> 13); - 
MLX5_SET(fte_match_set_lyr_2_4, headers_v, - first_prio, - ntohs(ib_spec->eth.val.vlan_tag) >> 13); - } - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ethertype, ntohs(ib_spec->eth.mask.ether_type)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ethertype, ntohs(ib_spec->eth.val.ether_type)); - break; - case IB_FLOW_SPEC_IPV4: - if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) - return -EOPNOTSUPP; - - if (match_ipv) { - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ip_version, 0xf); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ip_version, MLX5_FS_IPV4_VERSION); - } else { - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ethertype, 0xffff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ethertype, ETH_P_IP); - } - - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv4_layout.ipv4), - &ib_spec->ipv4.mask.src_ip, - sizeof(ib_spec->ipv4.mask.src_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv4_layout.ipv4), - &ib_spec->ipv4.val.src_ip, - sizeof(ib_spec->ipv4.val.src_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4), - &ib_spec->ipv4.mask.dst_ip, - sizeof(ib_spec->ipv4.mask.dst_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4), - &ib_spec->ipv4.val.dst_ip, - sizeof(ib_spec->ipv4.val.dst_ip)); - - set_tos(headers_c, headers_v, - ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); - - set_proto(headers_c, headers_v, - ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); - break; - case IB_FLOW_SPEC_IPV6: - if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) - return -EOPNOTSUPP; - - if (match_ipv) { - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ip_version, 0xf); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ip_version, MLX5_FS_IPV6_VERSION); - } else { - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ethertype, 0xffff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ethertype, ETH_P_IPV6); 
- } - - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &ib_spec->ipv6.mask.src_ip, - sizeof(ib_spec->ipv6.mask.src_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - &ib_spec->ipv6.val.src_ip, - sizeof(ib_spec->ipv6.val.src_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &ib_spec->ipv6.mask.dst_ip, - sizeof(ib_spec->ipv6.mask.dst_ip)); - memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - &ib_spec->ipv6.val.dst_ip, - sizeof(ib_spec->ipv6.val.dst_ip)); - - set_tos(headers_c, headers_v, - ib_spec->ipv6.mask.traffic_class, - ib_spec->ipv6.val.traffic_class); - - set_proto(headers_c, headers_v, - ib_spec->ipv6.mask.next_hdr, - ib_spec->ipv6.val.next_hdr); - - set_flow_label(misc_params_c, misc_params_v, - ntohl(ib_spec->ipv6.mask.flow_label), - ntohl(ib_spec->ipv6.val.flow_label), - ib_spec->type & IB_FLOW_SPEC_INNER); - break; - case IB_FLOW_SPEC_ESP: - if (ib_spec->esp.mask.seq) - return -EOPNOTSUPP; - - MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, - ntohl(ib_spec->esp.mask.spi)); - MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, - ntohl(ib_spec->esp.val.spi)); - break; - case IB_FLOW_SPEC_TCP: - if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, - LAST_TCP_UDP_FIELD)) - return -EOPNOTSUPP; - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, - 0xff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, - IPPROTO_TCP); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport, - ntohs(ib_spec->tcp_udp.mask.src_port)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, - ntohs(ib_spec->tcp_udp.val.src_port)); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport, - ntohs(ib_spec->tcp_udp.mask.dst_port)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, - ntohs(ib_spec->tcp_udp.val.dst_port)); - break; - case IB_FLOW_SPEC_UDP: 
- if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, - LAST_TCP_UDP_FIELD)) - return -EOPNOTSUPP; - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, - 0xff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, - IPPROTO_UDP); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, - ntohs(ib_spec->tcp_udp.mask.src_port)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, - ntohs(ib_spec->tcp_udp.val.src_port)); - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, - ntohs(ib_spec->tcp_udp.mask.dst_port)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, - ntohs(ib_spec->tcp_udp.val.dst_port)); - break; - case IB_FLOW_SPEC_GRE: - if (ib_spec->gre.mask.c_ks_res0_ver) - return -EOPNOTSUPP; - - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, - 0xff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, - IPPROTO_GRE); - - MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, - ntohs(ib_spec->gre.mask.protocol)); - MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, - ntohs(ib_spec->gre.val.protocol)); - - memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, - gre_key.nvgre.hi), - &ib_spec->gre.mask.key, - sizeof(ib_spec->gre.mask.key)); - memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v, - gre_key.nvgre.hi), - &ib_spec->gre.val.key, - sizeof(ib_spec->gre.val.key)); - break; - case IB_FLOW_SPEC_MPLS: - switch (prev_type) { - case IB_FLOW_SPEC_UDP: - if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.outer_first_mpls_over_udp), - &ib_spec->mpls.mask.tag)) - return -EOPNOTSUPP; - - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, - outer_first_mpls_over_udp), - &ib_spec->mpls.val.tag, - sizeof(ib_spec->mpls.val.tag)); - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, - outer_first_mpls_over_udp), - &ib_spec->mpls.mask.tag, - sizeof(ib_spec->mpls.mask.tag)); - break; - case IB_FLOW_SPEC_GRE: - if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - 
ft_field_support.outer_first_mpls_over_gre), - &ib_spec->mpls.mask.tag)) - return -EOPNOTSUPP; - - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, - outer_first_mpls_over_gre), - &ib_spec->mpls.val.tag, - sizeof(ib_spec->mpls.val.tag)); - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, - outer_first_mpls_over_gre), - &ib_spec->mpls.mask.tag, - sizeof(ib_spec->mpls.mask.tag)); - break; - default: - if (ib_spec->type & IB_FLOW_SPEC_INNER) { - if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.inner_first_mpls), - &ib_spec->mpls.mask.tag)) - return -EOPNOTSUPP; - - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, - inner_first_mpls), - &ib_spec->mpls.val.tag, - sizeof(ib_spec->mpls.val.tag)); - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, - inner_first_mpls), - &ib_spec->mpls.mask.tag, - sizeof(ib_spec->mpls.mask.tag)); - } else { - if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.outer_first_mpls), - &ib_spec->mpls.mask.tag)) - return -EOPNOTSUPP; - - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, - outer_first_mpls), - &ib_spec->mpls.val.tag, - sizeof(ib_spec->mpls.val.tag)); - memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, - outer_first_mpls), - &ib_spec->mpls.mask.tag, - sizeof(ib_spec->mpls.mask.tag)); - } - } - break; - case IB_FLOW_SPEC_VXLAN_TUNNEL: - if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, - LAST_TUNNEL_FIELD)) - return -EOPNOTSUPP; - - MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni, - ntohl(ib_spec->tunnel.mask.tunnel_id)); - MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni, - ntohl(ib_spec->tunnel.val.tunnel_id)); - break; - case IB_FLOW_SPEC_ACTION_TAG: - if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag, - LAST_FLOW_TAG_FIELD)) - return -EOPNOTSUPP; - if (ib_spec->flow_tag.tag_id >= BIT(24)) - return -EINVAL; - - action->flow_tag = ib_spec->flow_tag.tag_id; - action->flags |= FLOW_ACT_HAS_TAG; - break; - case 
IB_FLOW_SPEC_ACTION_DROP: - if (FIELDS_NOT_SUPPORTED(ib_spec->drop, - LAST_DROP_FIELD)) - return -EOPNOTSUPP; - action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; - break; - case IB_FLOW_SPEC_ACTION_HANDLE: - ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act), - flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action); - if (ret) - return ret; - break; - case IB_FLOW_SPEC_ACTION_COUNT: - if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count, - LAST_COUNTERS_FIELD)) - return -EOPNOTSUPP; - - /* for now support only one counters spec per flow */ - if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) - return -EINVAL; - - action->counters = ib_spec->flow_count.counters; - action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; - break; - default: - return -EINVAL; - } - - return 0; -} - -/* If a flow could catch both multicast and unicast packets, - * it won't fall into the multicast flow steering table and this rule - * could steal other multicast packets. - */ -static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) -{ - union ib_flow_spec *flow_spec; - - if (ib_attr->type != IB_FLOW_ATTR_NORMAL || - ib_attr->num_of_specs < 1) - return false; - - flow_spec = (union ib_flow_spec *)(ib_attr + 1); - if (flow_spec->type == IB_FLOW_SPEC_IPV4) { - struct ib_flow_spec_ipv4 *ipv4_spec; - - ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec; - if (ipv4_is_multicast(ipv4_spec->val.dst_ip)) - return true; - - return false; - } - - if (flow_spec->type == IB_FLOW_SPEC_ETH) { - struct ib_flow_spec_eth *eth_spec; - - eth_spec = (struct ib_flow_spec_eth *)flow_spec; - return is_multicast_ether_addr(eth_spec->mask.dst_mac) && - is_multicast_ether_addr(eth_spec->val.dst_mac); - } - - return false; -} - -enum valid_spec { - VALID_SPEC_INVALID, - VALID_SPEC_VALID, - VALID_SPEC_NA, -}; - -static enum valid_spec -is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, - const struct mlx5_flow_spec *spec, - const struct mlx5_flow_act *flow_act, - bool egress) -{ - const u32 *match_c = 
spec->match_criteria; - bool is_crypto = - (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | - MLX5_FLOW_CONTEXT_ACTION_DECRYPT)); - bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c); - bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP; - - /* - * Currently only crypto is supported in egress, when regular egress - * rules would be supported, always return VALID_SPEC_NA. - */ - if (!is_crypto) - return VALID_SPEC_NA; - - return is_crypto && is_ipsec && - (!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ? - VALID_SPEC_VALID : VALID_SPEC_INVALID; -} - -static bool is_valid_spec(struct mlx5_core_dev *mdev, - const struct mlx5_flow_spec *spec, - const struct mlx5_flow_act *flow_act, - bool egress) -{ - /* We curretly only support ipsec egress flow */ - return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID; -} - -static bool is_valid_ethertype(struct mlx5_core_dev *mdev, - const struct ib_flow_attr *flow_attr, - bool check_inner) -{ - union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); - int match_ipv = check_inner ? - MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.inner_ip_version) : - MLX5_CAP_FLOWTABLE_NIC_RX(mdev, - ft_field_support.outer_ip_version); - int inner_bit = check_inner ? 
IB_FLOW_SPEC_INNER : 0; - bool ipv4_spec_valid, ipv6_spec_valid; - unsigned int ip_spec_type = 0; - bool has_ethertype = false; - unsigned int spec_index; - bool mask_valid = true; - u16 eth_type = 0; - bool type_valid; - - /* Validate that ethertype is correct */ - for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { - if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) && - ib_spec->eth.mask.ether_type) { - mask_valid = (ib_spec->eth.mask.ether_type == - htons(0xffff)); - has_ethertype = true; - eth_type = ntohs(ib_spec->eth.val.ether_type); - } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) || - (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) { - ip_spec_type = ib_spec->type; - } - ib_spec = (void *)ib_spec + ib_spec->size; - } - - type_valid = (!has_ethertype) || (!ip_spec_type); - if (!type_valid && mask_valid) { - ipv4_spec_valid = (eth_type == ETH_P_IP) && - (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit)); - ipv6_spec_valid = (eth_type == ETH_P_IPV6) && - (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit)); - - type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) || - (((eth_type == ETH_P_MPLS_UC) || - (eth_type == ETH_P_MPLS_MC)) && match_ipv); - } - - return type_valid; -} - -static bool is_valid_attr(struct mlx5_core_dev *mdev, - const struct ib_flow_attr *flow_attr) -{ - return is_valid_ethertype(mdev, flow_attr, false) && - is_valid_ethertype(mdev, flow_attr, true); -} - -static void put_flow_table(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *prio, bool ft_added) -{ - prio->refcount -= !!ft_added; - if (!prio->refcount) { - mlx5_destroy_flow_table(prio->flow_table); - prio->flow_table = NULL; - } -} - -static void counters_clear_description(struct ib_counters *counters) -{ - struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); - - mutex_lock(&mcounters->mcntrs_mutex); - kfree(mcounters->counters_data); - mcounters->counters_data = NULL; - mcounters->cntrs_max_index = 0; - 
mutex_unlock(&mcounters->mcntrs_mutex); -} - -static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) -{ - struct mlx5_ib_flow_handler *handler = container_of(flow_id, - struct mlx5_ib_flow_handler, - ibflow); - struct mlx5_ib_flow_handler *iter, *tmp; - struct mlx5_ib_dev *dev = handler->dev; - - mutex_lock(&dev->flow_db->lock); - - list_for_each_entry_safe(iter, tmp, &handler->list, list) { - mlx5_del_flow_rules(iter->rule); - put_flow_table(dev, iter->prio, true); - list_del(&iter->list); - kfree(iter); - } - - mlx5_del_flow_rules(handler->rule); - put_flow_table(dev, handler->prio, true); - if (handler->ibcounters && - atomic_read(&handler->ibcounters->usecnt) == 1) - counters_clear_description(handler->ibcounters); - - mutex_unlock(&dev->flow_db->lock); - if (handler->flow_matcher) - atomic_dec(&handler->flow_matcher->usecnt); - kfree(handler); - - return 0; -} - -static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) -{ - priority *= 2; - if (!dont_trap) - priority++; - return priority; -} - -enum flow_table_type { - MLX5_IB_FT_RX, - MLX5_IB_FT_TX -}; - -#define MLX5_FS_MAX_TYPES 6 -#define MLX5_FS_MAX_ENTRIES BIT(16) - -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, - struct mlx5_ib_flow_prio *prio, - int priority, - int num_entries, int num_groups, - u32 flags) -{ - struct mlx5_flow_table *ft; - - ft = mlx5_create_auto_grouped_flow_table(ns, priority, - num_entries, - num_groups, - 0, flags); - if (IS_ERR(ft)) - return ERR_CAST(ft); - - prio->flow_table = ft; - prio->refcount = 0; - return prio; -} - -static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, - struct ib_flow_attr *flow_attr, - enum flow_table_type ft_type) -{ - bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; - struct mlx5_flow_namespace *ns = NULL; - struct mlx5_ib_flow_prio *prio; - struct mlx5_flow_table *ft; - int max_table_size; - int num_entries; - int num_groups; - u32 flags = 0; - int priority; - - 
max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - log_max_ft_size)); - if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - enum mlx5_flow_namespace_type fn_type; - - if (flow_is_multicast_only(flow_attr) && - !dont_trap) - priority = MLX5_IB_FLOW_MCAST_PRIO; - else - priority = ib_prio_to_core_prio(flow_attr->priority, - dont_trap); - if (ft_type == MLX5_IB_FT_RX) { - fn_type = MLX5_FLOW_NAMESPACE_BYPASS; - prio = &dev->flow_db->prios[priority]; - if (!dev->rep && - MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; - if (!dev->rep && - MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - reformat_l3_tunnel_to_l2)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - } else { - max_table_size = - BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, - log_max_ft_size)); - fn_type = MLX5_FLOW_NAMESPACE_EGRESS; - prio = &dev->flow_db->egress_prios[priority]; - if (!dev->rep && - MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - } - ns = mlx5_get_flow_namespace(dev->mdev, fn_type); - num_entries = MLX5_FS_MAX_ENTRIES; - num_groups = MLX5_FS_MAX_TYPES; - } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || - flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { - ns = mlx5_get_flow_namespace(dev->mdev, - MLX5_FLOW_NAMESPACE_LEFTOVERS); - build_leftovers_ft_param(&priority, - &num_entries, - &num_groups); - prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; - } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { - if (!MLX5_CAP_FLOWTABLE(dev->mdev, - allow_sniffer_and_nic_rx_shared_tir)) - return ERR_PTR(-ENOTSUPP); - - ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? 
- MLX5_FLOW_NAMESPACE_SNIFFER_RX : - MLX5_FLOW_NAMESPACE_SNIFFER_TX); - - prio = &dev->flow_db->sniffer[ft_type]; - priority = 0; - num_entries = 1; - num_groups = 1; - } - - if (!ns) - return ERR_PTR(-ENOTSUPP); - - if (num_entries > max_table_size) - return ERR_PTR(-ENOMEM); - - ft = prio->flow_table; - if (!ft) - return _get_prio(ns, prio, priority, num_entries, num_groups, - flags); - - return prio; -} - -static void set_underlay_qp(struct mlx5_ib_dev *dev, - struct mlx5_flow_spec *spec, - u32 underlay_qpn) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, - spec->match_criteria, - misc_parameters); - void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, - misc_parameters); - - if (underlay_qpn && - MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - ft_field_support.bth_dst_qp)) { - MLX5_SET(fte_match_set_misc, - misc_params_v, bth_dst_qp, underlay_qpn); - MLX5_SET(fte_match_set_misc, - misc_params_c, bth_dst_qp, 0xffffff); - } -} - -static int read_flow_counters(struct ib_device *ibdev, - struct mlx5_read_counters_attr *read_attr) -{ - struct mlx5_fc *fc = read_attr->hw_cntrs_hndl; - struct mlx5_ib_dev *dev = to_mdev(ibdev); - - return mlx5_fc_query(dev->mdev, fc, - &read_attr->out[IB_COUNTER_PACKETS], - &read_attr->out[IB_COUNTER_BYTES]); -} - -/* flow counters currently expose two counters packets and bytes */ -#define FLOW_COUNTERS_NUM 2 -static int counters_set_description(struct ib_counters *counters, - enum mlx5_ib_counters_type counters_type, - struct mlx5_ib_flow_counters_desc *desc_data, - u32 ncounters) -{ - struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); - u32 cntrs_max_index = 0; - int i; - - if (counters_type != MLX5_IB_COUNTERS_FLOW) - return -EINVAL; - - /* init the fields for the object */ - mcounters->type = counters_type; - mcounters->read_counters = read_flow_counters; - mcounters->counters_num = FLOW_COUNTERS_NUM; - mcounters->ncounters = ncounters; - /* each counter entry have both description and index pair */ - 
for (i = 0; i < ncounters; i++) { - if (desc_data[i].description > IB_COUNTER_BYTES) - return -EINVAL; - - if (cntrs_max_index <= desc_data[i].index) - cntrs_max_index = desc_data[i].index + 1; - } - - mutex_lock(&mcounters->mcntrs_mutex); - mcounters->counters_data = desc_data; - mcounters->cntrs_max_index = cntrs_max_index; - mutex_unlock(&mcounters->mcntrs_mutex); - - return 0; -} - -#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2)) -static int flow_counters_set_data(struct ib_counters *ibcounters, - struct mlx5_ib_create_flow *ucmd) -{ - struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters); - struct mlx5_ib_flow_counters_data *cntrs_data = NULL; - struct mlx5_ib_flow_counters_desc *desc_data = NULL; - bool hw_hndl = false; - int ret = 0; - - if (ucmd && ucmd->ncounters_data != 0) { - cntrs_data = ucmd->data; - if (cntrs_data->ncounters > MAX_COUNTERS_NUM) - return -EINVAL; - - desc_data = kcalloc(cntrs_data->ncounters, - sizeof(*desc_data), - GFP_KERNEL); - if (!desc_data) - return -ENOMEM; - - if (copy_from_user(desc_data, - u64_to_user_ptr(cntrs_data->counters_data), - sizeof(*desc_data) * cntrs_data->ncounters)) { - ret = -EFAULT; - goto free; - } - } - - if (!mcounters->hw_cntrs_hndl) { - mcounters->hw_cntrs_hndl = mlx5_fc_create( - to_mdev(ibcounters->device)->mdev, false); - if (IS_ERR(mcounters->hw_cntrs_hndl)) { - ret = PTR_ERR(mcounters->hw_cntrs_hndl); - goto free; - } - hw_hndl = true; - } - - if (desc_data) { - /* counters already bound to at least one flow */ - if (mcounters->cntrs_max_index) { - ret = -EINVAL; - goto free_hndl; - } - - ret = counters_set_description(ibcounters, - MLX5_IB_COUNTERS_FLOW, - desc_data, - cntrs_data->ncounters); - if (ret) - goto free_hndl; - - } else if (!mcounters->cntrs_max_index) { - /* counters not bound yet, must have udata passed */ - ret = -EINVAL; - goto free_hndl; - } - - return 0; - -free_hndl: - if (hw_hndl) { - mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev, - mcounters->hw_cntrs_hndl); 
- mcounters->hw_cntrs_hndl = NULL; - } -free: - kfree(desc_data); - return ret; -} - -static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst, - u32 underlay_qpn, - struct mlx5_ib_create_flow *ucmd) -{ - struct mlx5_flow_table *ft = ft_prio->flow_table; - struct mlx5_ib_flow_handler *handler; - struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - struct mlx5_flow_spec *spec; - struct mlx5_flow_destination dest_arr[2] = {}; - struct mlx5_flow_destination *rule_dst = dest_arr; - const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); - unsigned int spec_index; - u32 prev_type = 0; - int err = 0; - int dest_num = 0; - bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; - - if (!is_valid_attr(dev->mdev, flow_attr)) - return ERR_PTR(-EINVAL); - - if (dev->rep && is_egress) - return ERR_PTR(-EINVAL); - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - handler = kzalloc(sizeof(*handler), GFP_KERNEL); - if (!handler || !spec) { - err = -ENOMEM; - goto free; - } - - INIT_LIST_HEAD(&handler->list); - if (dst) { - memcpy(&dest_arr[0], dst, sizeof(*dst)); - dest_num++; - } - - for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { - err = parse_flow_attr(dev->mdev, spec->match_criteria, - spec->match_value, - ib_flow, flow_attr, &flow_act, - prev_type); - if (err < 0) - goto free; - - prev_type = ((union ib_flow_spec *)ib_flow)->type; - ib_flow += ((union ib_flow_spec *)ib_flow)->size; - } - - if (!flow_is_multicast_only(flow_attr)) - set_underlay_qp(dev, spec, underlay_qpn); - - if (dev->rep) { - void *misc; - - misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, - misc_parameters); - MLX5_SET(fte_match_set_misc, misc, source_port, - dev->rep->vport); - misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, - misc_parameters); - MLX5_SET_TO_ONES(fte_match_set_misc, misc, 
source_port); - } - - spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); - - if (is_egress && - !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) { - err = -EINVAL; - goto free; - } - - if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { - struct mlx5_ib_mcounters *mcounters; - - err = flow_counters_set_data(flow_act.counters, ucmd); - if (err) - goto free; - - mcounters = to_mcounters(flow_act.counters); - handler->ibcounters = flow_act.counters; - dest_arr[dest_num].type = - MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest_arr[dest_num].counter_id = - mlx5_fc_id(mcounters->hw_cntrs_hndl); - dest_num++; - } - - if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { - if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) { - rule_dst = NULL; - dest_num = 0; - } - } else { - if (is_egress) - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; - else - flow_act.action |= - dest_num ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : - MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; - } - - if ((flow_act.flags & FLOW_ACT_HAS_TAG) && - (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || - flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { - mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", - flow_act.flow_tag, flow_attr->type); - err = -EINVAL; - goto free; - } - handler->rule = mlx5_add_flow_rules(ft, spec, - &flow_act, - rule_dst, dest_num); - - if (IS_ERR(handler->rule)) { - err = PTR_ERR(handler->rule); - goto free; - } - - ft_prio->refcount++; - handler->prio = ft_prio; - handler->dev = dev; - - ft_prio->flow_table = ft; -free: - if (err && handler) { - if (handler->ibcounters && - atomic_read(&handler->ibcounters->usecnt) == 1) - counters_clear_description(handler->ibcounters); - kfree(handler); - } - kvfree(spec); - return err ? 
ERR_PTR(err) : handler; -} - -static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) -{ - return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); -} - -static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) -{ - struct mlx5_ib_flow_handler *handler_dst = NULL; - struct mlx5_ib_flow_handler *handler = NULL; - - handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); - if (!IS_ERR(handler)) { - handler_dst = create_flow_rule(dev, ft_prio, - flow_attr, dst); - if (IS_ERR(handler_dst)) { - mlx5_del_flow_rules(handler->rule); - ft_prio->refcount--; - kfree(handler); - handler = handler_dst; - } else { - list_add(&handler_dst->list, &handler->list); - } - } - - return handler; -} -enum { - LEFTOVERS_MC, - LEFTOVERS_UC, -}; - -static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) -{ - struct mlx5_ib_flow_handler *handler_ucast = NULL; - struct mlx5_ib_flow_handler *handler = NULL; - - static struct { - struct ib_flow_attr flow_attr; - struct ib_flow_spec_eth eth_flow; - } leftovers_specs[] = { - [LEFTOVERS_MC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {0x1} } - } - }, - [LEFTOVERS_UC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {} } - } - } - }; - - handler = create_flow_rule(dev, ft_prio, - 
&leftovers_specs[LEFTOVERS_MC].flow_attr, - dst); - if (!IS_ERR(handler) && - flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { - handler_ucast = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_UC].flow_attr, - dst); - if (IS_ERR(handler_ucast)) { - mlx5_del_flow_rules(handler->rule); - ft_prio->refcount--; - kfree(handler); - handler = handler_ucast; - } else { - list_add(&handler_ucast->list, &handler->list); - } - } - - return handler; -} - -static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_rx, - struct mlx5_ib_flow_prio *ft_tx, - struct mlx5_flow_destination *dst) -{ - struct mlx5_ib_flow_handler *handler_rx; - struct mlx5_ib_flow_handler *handler_tx; - int err; - static const struct ib_flow_attr flow_attr = { - .num_of_specs = 0, - .size = sizeof(flow_attr) - }; - - handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); - if (IS_ERR(handler_rx)) { - err = PTR_ERR(handler_rx); - goto err; - } - - handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); - if (IS_ERR(handler_tx)) { - err = PTR_ERR(handler_tx); - goto err_tx; - } - - list_add(&handler_tx->list, &handler_rx->list); - - return handler_rx; - -err_tx: - mlx5_del_flow_rules(handler_rx->rule); - ft_rx->refcount--; - kfree(handler_rx); -err: - return ERR_PTR(err); -} - -static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, - struct ib_flow_attr *flow_attr, - int domain, - struct ib_udata *udata) -{ - struct mlx5_ib_dev *dev = to_mdev(qp->device); - struct mlx5_ib_qp *mqp = to_mqp(qp); - struct mlx5_ib_flow_handler *handler = NULL; - struct mlx5_flow_destination *dst = NULL; - struct mlx5_ib_flow_prio *ft_prio_tx = NULL; - struct mlx5_ib_flow_prio *ft_prio; - bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; - struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; - size_t min_ucmd_sz, required_ucmd_sz; - int err; - int underlay_qpn; - - if (udata && udata->inlen) { - min_ucmd_sz = offsetof(typeof(ucmd_hdr), 
reserved) + - sizeof(ucmd_hdr.reserved); - if (udata->inlen < min_ucmd_sz) - return ERR_PTR(-EOPNOTSUPP); - - err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); - if (err) - return ERR_PTR(err); - - /* currently supports only one counters data */ - if (ucmd_hdr.ncounters_data > 1) - return ERR_PTR(-EINVAL); - - required_ucmd_sz = min_ucmd_sz + - sizeof(struct mlx5_ib_flow_counters_data) * - ucmd_hdr.ncounters_data; - if (udata->inlen > required_ucmd_sz && - !ib_is_udata_cleared(udata, required_ucmd_sz, - udata->inlen - required_ucmd_sz)) - return ERR_PTR(-EOPNOTSUPP); - - ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL); - if (!ucmd) - return ERR_PTR(-ENOMEM); - - err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); - if (err) - goto free_ucmd; - } - - if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) { - err = -ENOMEM; - goto free_ucmd; - } - - if (domain != IB_FLOW_DOMAIN_USER || - flow_attr->port > dev->num_ports || - (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | - IB_FLOW_ATTR_FLAGS_EGRESS))) { - err = -EINVAL; - goto free_ucmd; - } - - if (is_egress && - (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || - flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { - err = -EINVAL; - goto free_ucmd; - } - - dst = kzalloc(sizeof(*dst), GFP_KERNEL); - if (!dst) { - err = -ENOMEM; - goto free_ucmd; - } - - mutex_lock(&dev->flow_db->lock); - - ft_prio = get_flow_table(dev, flow_attr, - is_egress ? 
MLX5_IB_FT_TX : MLX5_IB_FT_RX); - if (IS_ERR(ft_prio)) { - err = PTR_ERR(ft_prio); - goto unlock; - } - if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { - ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); - if (IS_ERR(ft_prio_tx)) { - err = PTR_ERR(ft_prio_tx); - ft_prio_tx = NULL; - goto destroy_ft; - } - } - - if (is_egress) { - dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; - } else { - dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; - if (mqp->flags & MLX5_IB_QP_RSS) - dst->tir_num = mqp->rss_qp.tirn; - else - dst->tir_num = mqp->raw_packet_qp.rq.tirn; - } - - if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { - handler = create_dont_trap_rule(dev, ft_prio, - flow_attr, dst); - } else { - underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? - mqp->underlay_qpn : 0; - handler = _create_flow_rule(dev, ft_prio, flow_attr, - dst, underlay_qpn, ucmd); - } - } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || - flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { - handler = create_leftovers_rule(dev, ft_prio, flow_attr, - dst); - } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { - handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); - } else { - err = -EINVAL; - goto destroy_ft; - } - - if (IS_ERR(handler)) { - err = PTR_ERR(handler); - handler = NULL; - goto destroy_ft; - } - - mutex_unlock(&dev->flow_db->lock); - kfree(dst); - kfree(ucmd); - - return &handler->ibflow; - -destroy_ft: - put_flow_table(dev, ft_prio, false); - if (ft_prio_tx) - put_flow_table(dev, ft_prio_tx, false); -unlock: - mutex_unlock(&dev->flow_db->lock); - kfree(dst); -free_ucmd: - kfree(ucmd); - return ERR_PTR(err); -} - -static struct mlx5_ib_flow_prio * -_get_flow_table(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_matcher *fs_matcher, - bool mcast) -{ - struct mlx5_flow_namespace *ns = NULL; - struct mlx5_ib_flow_prio *prio; - int max_table_size; - u32 flags = 0; - int priority; - - if (fs_matcher->ns_type == 
MLX5_FLOW_NAMESPACE_BYPASS) { - max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; - if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - reformat_l3_tunnel_to_l2)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - } else { /* Can only be MLX5_FLOW_NAMESPACE_EGRESS */ - max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, - log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) - flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - } - - if (max_table_size < MLX5_FS_MAX_ENTRIES) - return ERR_PTR(-ENOMEM); - - if (mcast) - priority = MLX5_IB_FLOW_MCAST_PRIO; - else - priority = ib_prio_to_core_prio(fs_matcher->priority, false); - - ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); - if (!ns) - return ERR_PTR(-ENOTSUPP); - - if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) - prio = &dev->flow_db->prios[priority]; - else - prio = &dev->flow_db->egress_prios[priority]; - - if (prio->flow_table) - return prio; - - return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES, - MLX5_FS_MAX_TYPES, flags); -} - -static struct mlx5_ib_flow_handler * -_create_raw_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - struct mlx5_flow_destination *dst, - struct mlx5_ib_flow_matcher *fs_matcher, - struct mlx5_flow_act *flow_act, - void *cmd_in, int inlen, - int dst_num) -{ - struct mlx5_ib_flow_handler *handler; - struct mlx5_flow_spec *spec; - struct mlx5_flow_table *ft = ft_prio->flow_table; - int err = 0; - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - handler = kzalloc(sizeof(*handler), GFP_KERNEL); - if (!handler || !spec) { - err = -ENOMEM; - goto free; - } - - INIT_LIST_HEAD(&handler->list); - - memcpy(spec->match_value, cmd_in, inlen); - memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params, - fs_matcher->mask_len); - spec->match_criteria_enable = fs_matcher->match_criteria_enable; - - 
handler->rule = mlx5_add_flow_rules(ft, spec, - flow_act, dst, dst_num); - - if (IS_ERR(handler->rule)) { - err = PTR_ERR(handler->rule); - goto free; - } - - ft_prio->refcount++; - handler->prio = ft_prio; - handler->dev = dev; - ft_prio->flow_table = ft; - -free: - if (err) - kfree(handler); - kvfree(spec); - return err ? ERR_PTR(err) : handler; -} - -static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, - void *match_v) -{ - void *match_c; - void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4; - void *dmac, *dmac_mask; - void *ipv4, *ipv4_mask; - - if (!(fs_matcher->match_criteria_enable & - (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT))) - return false; - - match_c = fs_matcher->matcher_mask.match_params; - match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v, - outer_headers); - match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c, - outer_headers); - - dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, - dmac_47_16); - dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, - dmac_47_16); - - if (is_multicast_ether_addr(dmac) && - is_multicast_ether_addr(dmac_mask)) - return true; - - ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4); - - ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4); - - if (ipv4_is_multicast(*(__be32 *)(ipv4)) && - ipv4_is_multicast(*(__be32 *)(ipv4_mask))) - return true; - - return false; -} - -struct mlx5_ib_flow_handler * -mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_matcher *fs_matcher, - struct mlx5_flow_act *flow_act, - u32 counter_id, - void *cmd_in, int inlen, int dest_id, - int dest_type) -{ - struct mlx5_flow_destination *dst; - struct mlx5_ib_flow_prio *ft_prio; - struct mlx5_ib_flow_handler *handler; - int dst_num = 0; - bool mcast; - int err; - - if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL) - return ERR_PTR(-EOPNOTSUPP); - - if 
(fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO) - return ERR_PTR(-ENOMEM); - - dst = kzalloc(sizeof(*dst) * 2, GFP_KERNEL); - if (!dst) - return ERR_PTR(-ENOMEM); - - mcast = raw_fs_is_multicast(fs_matcher, cmd_in); - mutex_lock(&dev->flow_db->lock); - - ft_prio = _get_flow_table(dev, fs_matcher, mcast); - if (IS_ERR(ft_prio)) { - err = PTR_ERR(ft_prio); - goto unlock; - } - - if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { - dst[dst_num].type = dest_type; - dst[dst_num].tir_num = dest_id; - flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { - dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; - dst[dst_num].ft_num = dest_id; - flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else { - dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT; - flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; - } - - dst_num++; - - if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { - dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst[dst_num].counter_id = counter_id; - dst_num++; - } - - handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, - cmd_in, inlen, dst_num); - - if (IS_ERR(handler)) { - err = PTR_ERR(handler); - goto destroy_ft; - } - - mutex_unlock(&dev->flow_db->lock); - atomic_inc(&fs_matcher->usecnt); - handler->flow_matcher = fs_matcher; - - kfree(dst); - - return handler; - -destroy_ft: - put_flow_table(dev, ft_prio, false); -unlock: - mutex_unlock(&dev->flow_db->lock); - kfree(dst); - - return ERR_PTR(err); -} - -static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) -{ - u32 flags = 0; - - if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA) - flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA; - - return flags; -} - -#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA -static struct ib_flow_action * -mlx5_ib_create_flow_action_esp(struct ib_device *device, - const 
struct ib_flow_action_attrs_esp *attr, - struct uverbs_attr_bundle *attrs) -{ - struct mlx5_ib_dev *mdev = to_mdev(device); - struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm; - struct mlx5_accel_esp_xfrm_attrs accel_attrs = {}; - struct mlx5_ib_flow_action *action; - u64 action_flags; - u64 flags; - int err = 0; - - err = uverbs_get_flags64( - &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, - ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1)); - if (err) - return ERR_PTR(err); - - flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); - - /* We current only support a subset of the standard features. Only a - * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn - * (with overlap). Full offload mode isn't supported. - */ - if (!attr->keymat || attr->replay || attr->encap || - attr->spi || attr->seq || attr->tfc_pad || - attr->hard_limit_pkts || - (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | - IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT))) - return ERR_PTR(-EOPNOTSUPP); - - if (attr->keymat->protocol != - IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM) - return ERR_PTR(-EOPNOTSUPP); - - aes_gcm = &attr->keymat->keymat.aes_gcm; - - if (aes_gcm->icv_len != 16 || - aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) - return ERR_PTR(-EOPNOTSUPP); - - action = kmalloc(sizeof(*action), GFP_KERNEL); - if (!action) - return ERR_PTR(-ENOMEM); - - action->esp_aes_gcm.ib_flags = attr->flags; - memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key, - sizeof(accel_attrs.keymat.aes_gcm.aes_key)); - accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8; - memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt, - sizeof(accel_attrs.keymat.aes_gcm.salt)); - memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv, - sizeof(accel_attrs.keymat.aes_gcm.seq_iv)); - accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8; - accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ; - 
accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; - - accel_attrs.esn = attr->esn; - if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) - accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; - if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) - accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; - - if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT) - accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT; - - action->esp_aes_gcm.ctx = - mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags); - if (IS_ERR(action->esp_aes_gcm.ctx)) { - err = PTR_ERR(action->esp_aes_gcm.ctx); - goto err_parse; - } - - action->esp_aes_gcm.ib_flags = attr->flags; - - return &action->ib_action; - -err_parse: - kfree(action); - return ERR_PTR(err); -} - -static int -mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action, - const struct ib_flow_action_attrs_esp *attr, - struct uverbs_attr_bundle *attrs) -{ - struct mlx5_ib_flow_action *maction = to_mflow_act(action); - struct mlx5_accel_esp_xfrm_attrs accel_attrs; - int err = 0; - - if (attr->keymat || attr->replay || attr->encap || - attr->spi || attr->seq || attr->tfc_pad || - attr->hard_limit_pkts || - (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | - IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS | - IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))) - return -EOPNOTSUPP; - - /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can - * be modified. 
- */ - if (!(maction->esp_aes_gcm.ib_flags & - IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) && - attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | - IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)) - return -EINVAL; - - memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs, - sizeof(accel_attrs)); - - accel_attrs.esn = attr->esn; - if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) - accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; - else - accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; - - err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx, - &accel_attrs); - if (err) - return err; - - maction->esp_aes_gcm.ib_flags &= - ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; - maction->esp_aes_gcm.ib_flags |= - attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; - - return 0; -} - -static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) -{ - struct mlx5_ib_flow_action *maction = to_mflow_act(action); - - switch (action->type) { - case IB_FLOW_ACTION_ESP: - /* - * We only support aes_gcm by now, so we implicitly know this is - * the underline crypto. - */ - mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); - break; - case IB_FLOW_ACTION_UNSPECIFIED: - mlx5_ib_destroy_flow_action_raw(maction); - break; - default: - WARN_ON(true); - break; - } - - kfree(maction); - return 0; + return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) @@ -4060,7 +2611,7 @@ static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) uid = ibqp->pd ? 
to_mpd(ibqp->pd)->uid : 0; - if (mqp->flags & MLX5_IB_QP_UNDERLAY) { + if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) { mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); return -EOPNOTSUPP; } @@ -4106,9 +2657,9 @@ static ssize_t fw_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); - return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); + return sysfs_emit(buf, "%d\n", dev->mdev->priv.fw_pages); } static DEVICE_ATTR_RO(fw_pages); @@ -4116,9 +2667,9 @@ static ssize_t reg_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); - return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); + return sysfs_emit(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static DEVICE_ATTR_RO(reg_pages); @@ -4126,8 +2677,9 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "MT%d\n", dev->mdev->pdev->device); } static DEVICE_ATTR_RO(hca_type); @@ -4135,8 +2687,9 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%x\n", dev->mdev->rev_id); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%x\n", dev->mdev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -4144,9 +2697,10 @@ static ssize_t board_id_show(struct device *device, struct 
device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, - dev->mdev->board_id); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); } static DEVICE_ATTR_RO(board_id); @@ -4169,9 +2723,14 @@ static void pkey_change_handler(struct work_struct *work) container_of(work, struct mlx5_ib_port_resources, pkey_change_work); - mutex_lock(&ports->devr->mutex); + if (!ports->gsi) + /* + * We got this event before device was fully configured + * and MAD registration code wasn't called/finished yet. + */ + return; + mlx5_ib_gsi_pkey_change(ports->gsi); - mutex_unlock(&ports->devr->mutex); } static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) @@ -4228,7 +2787,7 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) * lock/unlock above locks Now need to arm all involved CQs. 
*/ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { - mcq->comp(mcq); + mcq->comp(mcq, NULL); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } @@ -4243,8 +2802,7 @@ static void delay_drop_handler(struct work_struct *work) atomic_inc(&delay_drop->events_cnt); mutex_lock(&delay_drop->lock); - err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, - delay_drop->timeout); + err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout); if (err) { mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", delay_drop->timeout); @@ -4256,9 +2814,13 @@ static void delay_drop_handler(struct work_struct *work) static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, struct ib_event *ibev) { + u32 port = (eqe->data.port.port >> 4) & 0xf; + switch (eqe->sub_type) { case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: - schedule_work(&ibdev->delay_drop.delay_drop_work); + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + schedule_work(&ibdev->delay_drop.delay_drop_work); break; default: /* do nothing */ return; @@ -4268,7 +2830,7 @@ static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, struct ib_event *ibev) { - u8 port = (eqe->data.port.port >> 4) & 0xf; + u32 port = (eqe->data.port.port >> 4) & 0xf; ibev->element.port_num = port; @@ -4339,7 +2901,7 @@ static void mlx5_ib_handle_event(struct work_struct *_work) break; case MLX5_EVENT_TYPE_GENERAL_EVENT: handle_general_event(ibdev, work->param, &ibev); - /* fall through */ + fallthrough; default: goto out; } @@ -4399,370 +2961,304 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + + *num_plane = 0; + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, 
multiplane)) + return 0; + + err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); + if (err) + return err; + + *num_plane = vport_ctx.num_plane; + return 0; +} + static int set_has_smi_cap(struct mlx5_ib_dev *dev) { struct mlx5_hca_vport_context vport_ctx; int err; int port; + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + return 0; + for (port = 1; port <= dev->num_ports; port++) { - dev->mdev->port_caps[port - 1].has_smi = false; - if (MLX5_CAP_GEN(dev->mdev, port_type) == - MLX5_CAP_PORT_TYPE_IB) { - if (MLX5_CAP_GEN(dev->mdev, ib_virt)) { - err = mlx5_query_hca_vport_context(dev->mdev, 0, - port, 0, - &vport_ctx); - if (err) { - mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", - port, err); - return err; - } - dev->mdev->port_caps[port - 1].has_smi = - vport_ctx.has_smi; - } else { - dev->mdev->port_caps[port - 1].has_smi = true; - } + if (dev->num_plane) { + dev->port_caps[port - 1].has_smi = false; + continue; + } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) || + dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + dev->port_caps[port - 1].has_smi = true; + continue; + } + + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; } + dev->port_caps[port - 1].has_smi = vport_ctx.has_smi; } + return 0; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { - int port; + unsigned int port; - for (port = 1; port <= dev->num_ports; port++) + rdma_for_each_port (&dev->ib_dev, port) mlx5_query_ext_port_caps(dev, port); } -static int get_port_caps(struct mlx5_ib_dev *dev, u8 port) +static u8 mlx5_get_umr_fence(u8 umr_fence_cap) { - struct ib_device_attr *dprops = NULL; - struct ib_port_attr *pprops = NULL; - int err = -ENOMEM; - struct ib_udata uhw = {.inlen = 0, .outlen = 0}; + switch (umr_fence_cap) { + case MLX5_CAP_UMR_FENCE_NONE: + return MLX5_FENCE_MODE_NONE; + case MLX5_CAP_UMR_FENCE_SMALL: 
+ return MLX5_FENCE_MODE_INITIATOR_SMALL; + default: + return MLX5_FENCE_MODE_STRONG_ORDERING; + } +} - pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); - if (!pprops) - goto out; +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + struct ib_device *ibdev; + struct ib_pd *pd; + struct ib_cq *cq; + int ret = 0; - dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); - if (!dprops) - goto out; - err = set_has_smi_cap(dev); - if (err) - goto out; + /* + * devr->c0 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->c0) + return 0; - err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); - if (err) { - mlx5_ib_warn(dev, "query_device failed %d\n", err); - goto out; - } + mutex_lock(&devr->cq_lock); + if (devr->c0) + goto unlock; - memset(pprops, 0, sizeof(*pprops)); - err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); - if (err) { - mlx5_ib_warn(dev, "query_port %d failed %d\n", - port, err); - goto out; + ibdev = &dev->ib_dev; + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%pe\n", + pd); + goto unlock; } - dev->mdev->port_caps[port - 1].pkey_table_len = - dprops->max_pkeys; - dev->mdev->port_caps[port - 1].gid_table_len = - pprops->gid_tbl_len; - mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n", - port, dprops->max_pkeys, pprops->gid_tbl_len); + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%pe\n", + cq); + ib_dealloc_pd(pd); + goto unlock; + } -out: - kfree(pprops); - kfree(dprops); + devr->p0 = pd; + devr->c0 = cq; - return err; +unlock: + mutex_unlock(&devr->cq_lock); + return ret; } -static void destroy_umrc_res(struct mlx5_ib_dev *dev) +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) { - 
int err; + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_srq_init_attr attr; + struct ib_srq *s0, *s1; + int ret = 0; - err = mlx5_mr_cache_cleanup(dev); - if (err) - mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + /* + * devr->s1 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->s1) + return 0; - if (dev->umrc.qp) - mlx5_ib_destroy_qp(dev->umrc.qp); - if (dev->umrc.cq) - ib_free_cq(dev->umrc.cq); - if (dev->umrc.pd) - ib_dealloc_pd(dev->umrc.pd); -} + mutex_lock(&devr->srq_lock); + if (devr->s1) + goto unlock; -enum { - MAX_UMR_WR = 128, -}; + ret = mlx5_ib_dev_res_cq_init(dev); + if (ret) + goto unlock; -static int create_umr_res(struct mlx5_ib_dev *dev) -{ - struct ib_qp_init_attr *init_attr = NULL; - struct ib_qp_attr *attr = NULL; - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_qp *qp; - int ret; + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.cq = devr->c0; - attr = kzalloc(sizeof(*attr), GFP_KERNEL); - init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); - if (!attr || !init_attr) { - ret = -ENOMEM; - goto error_0; + s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s0)) { + ret = PTR_ERR(s0); + mlx5_ib_err(dev, + "Couldn't create SRQ 0 for res init, err=%pe\n", + s0); + goto unlock; } - pd = ib_alloc_pd(&dev->ib_dev, 0); - if (IS_ERR(pd)) { - mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); - ret = PTR_ERR(pd); - goto error_0; - } + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; - cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); - if (IS_ERR(cq)) { - mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); - ret = PTR_ERR(cq); - goto error_2; + s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s1)) { + ret = PTR_ERR(s1); + mlx5_ib_err(dev, + "Couldn't create SRQ 1 for res init, err=%pe\n", + s1); + 
ib_destroy_srq(s0); } - init_attr->send_cq = cq; - init_attr->recv_cq = cq; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; - init_attr->cap.max_send_wr = MAX_UMR_WR; - init_attr->cap.max_send_sge = 1; - init_attr->qp_type = MLX5_IB_QPT_REG_UMR; - init_attr->port_num = 1; - qp = mlx5_ib_create_qp(pd, init_attr, NULL); - if (IS_ERR(qp)) { - mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); - ret = PTR_ERR(qp); - goto error_3; - } - qp->device = &dev->ib_dev; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = MLX5_IB_QPT_REG_UMR; - qp->send_cq = init_attr->send_cq; - qp->recv_cq = init_attr->recv_cq; - - attr->qp_state = IB_QPS_INIT; - attr->port_num = 1; - ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | - IB_QP_PORT, NULL); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto error_4; - } + devr->s0 = s0; + devr->s1 = s1; - memset(attr, 0, sizeof(*attr)); - attr->qp_state = IB_QPS_RTR; - attr->path_mtu = IB_MTU_256; +unlock: + mutex_unlock(&devr->srq_lock); + return ret; +} - ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); - goto error_4; - } +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int ret; - memset(attr, 0, sizeof(*attr)); - attr->qp_state = IB_QPS_RTS; - ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); - goto error_4; - } + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; - dev->umrc.qp = qp; - dev->umrc.cq = cq; - dev->umrc.pd = pd; + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + return ret; - sema_init(&dev->umrc.sem, MAX_UMR_WR); - ret = mlx5_mr_cache_init(dev); + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); if (ret) { - mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); - goto error_4; + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + return ret; } - 
kfree(attr); - kfree(init_attr); + mutex_init(&devr->cq_lock); + mutex_init(&devr->srq_lock); return 0; - -error_4: - mlx5_ib_destroy_qp(qp); - dev->umrc.qp = NULL; - -error_3: - ib_free_cq(cq); - dev->umrc.cq = NULL; - -error_2: - ib_dealloc_pd(pd); - dev->umrc.pd = NULL; - -error_0: - kfree(attr); - kfree(init_attr); - return ret; } -static u8 mlx5_get_umr_fence(u8 umr_fence_cap) +static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) { - switch (umr_fence_cap) { - case MLX5_CAP_UMR_FENCE_NONE: - return MLX5_FENCE_MODE_NONE; - case MLX5_CAP_UMR_FENCE_SMALL: - return MLX5_FENCE_MODE_INITIATOR_SMALL; - default: - return MLX5_FENCE_MODE_STRONG_ORDERING; + struct mlx5_ib_resources *devr = &dev->devr; + + /* After s0/s1 init, they are not unset during the device lifetime. */ + if (devr->s1) { + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); } + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + /* After p0/c0 init, they are not unset during the device lifetime. 
*/ + if (devr->c0) { + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); + } + mutex_destroy(&devr->cq_lock); + mutex_destroy(&devr->srq_lock); } -static int create_dev_resources(struct mlx5_ib_resources *devr) +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) { - struct ib_srq_init_attr attr; - struct mlx5_ib_dev *dev; - struct ib_cq_init_attr cq_attr = {.cqe = 1}; - int port; - int ret = 0; - - dev = container_of(devr, struct mlx5_ib_dev, devr); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + bool ro_supp = false; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; - mutex_init(&devr->mutex); + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; - devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); - if (IS_ERR(devr->p0)) { - ret = PTR_ERR(devr->p0); - goto error0; - } - devr->p0->device = &dev->ib_dev; - devr->p0->uobject = NULL; - atomic_set(&devr->p0->usecnt, 0); - - devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); - goto error1; - } - devr->c0->device = &dev->ib_dev; - devr->c0->uobject = NULL; - devr->c0->comp_handler = NULL; - devr->c0->event_handler = NULL; - devr->c0->cq_context = NULL; - atomic_set(&devr->c0->usecnt, 0); - - devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); - if (IS_ERR(devr->x0)) { - ret = PTR_ERR(devr->x0); - goto error2; - } - devr->x0->device = &dev->ib_dev; - devr->x0->inode = NULL; - atomic_set(&devr->x0->usecnt, 0); - mutex_init(&devr->x0->tgt_qp_mutex); - INIT_LIST_HEAD(&devr->x0->tgt_qp_list); - - devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); - if (IS_ERR(devr->x1)) { - ret = PTR_ERR(devr->x1); - goto error3; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; } - devr->x1->device = &dev->ib_dev; - devr->x1->inode = NULL; - atomic_set(&devr->x1->usecnt, 0); - mutex_init(&devr->x1->tgt_qp_mutex); - 
INIT_LIST_HEAD(&devr->x1->tgt_qp_list); - memset(&attr, 0, sizeof(attr)); - attr.attr.max_sge = 1; - attr.attr.max_wr = 1; - attr.srq_type = IB_SRQT_XRC; - attr.ext.cq = devr->c0; - attr.ext.xrc.xrcd = devr->x0; + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + if (err) + goto err_mkey; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; - devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); - if (IS_ERR(devr->s0)) { - ret = PTR_ERR(devr->s0); - goto error4; + /* create another mkey with RO support */ + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) { + MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); + ro_supp = true; } - devr->s0->device = &dev->ib_dev; - devr->s0->pd = devr->p0; - devr->s0->uobject = NULL; - devr->s0->event_handler = NULL; - devr->s0->srq_context = NULL; - devr->s0->srq_type = IB_SRQT_XRC; - devr->s0->ext.xrc.xrcd = devr->x0; - devr->s0->ext.cq = devr->c0; - atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); - atomic_inc(&devr->s0->ext.cq->usecnt); - atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s0->usecnt, 0); - memset(&attr, 0, sizeof(attr)); - attr.attr.max_sge = 1; - attr.attr.max_wr = 1; - attr.srq_type = IB_SRQT_BASIC; - devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); - if (IS_ERR(devr->s1)) { - ret = PTR_ERR(devr->s1); - goto error5; + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) { + MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); + ro_supp = true; } - devr->s1->device = &dev->ib_dev; - devr->s1->pd = devr->p0; - devr->s1->uobject = NULL; - devr->s1->event_handler = NULL; - devr->s1->srq_context = NULL; - 
devr->s1->srq_type = IB_SRQT_BASIC; - devr->s1->ext.cq = devr->c0; - atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s1->usecnt, 0); - - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - devr->ports[port].devr = devr; + + if (ro_supp) { + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + /* RO is defined as best effort */ + if (!err) { + dev->ddr.mkey_ro = mkey; + dev->ddr.mkey_ro_valid = true; + } } + kvfree(in); return 0; -error5: - mlx5_ib_destroy_srq(devr->s0); -error4: - mlx5_ib_dealloc_xrcd(devr->x1); -error3: - mlx5_ib_dealloc_xrcd(devr->x0); -error2: - mlx5_ib_destroy_cq(devr->c0); -error1: - mlx5_ib_dealloc_pd(devr->p0); -error0: - return ret; +err_mkey: + kvfree(in); +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; } -static void destroy_dev_resources(struct mlx5_ib_resources *devr) +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) { - struct mlx5_ib_dev *dev = - container_of(devr, struct mlx5_ib_dev, devr); - int port; - mlx5_ib_destroy_srq(devr->s1); - mlx5_ib_destroy_srq(devr->s0); - mlx5_ib_dealloc_xrcd(devr->x0); - mlx5_ib_dealloc_xrcd(devr->x1); - mlx5_ib_destroy_cq(devr->c0); - mlx5_ib_dealloc_pd(devr->p0); + if (dev->ddr.mkey_ro_valid) + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey_ro); - /* Make sure no change P_Key work items are still executing */ - for (port = 0; port < dev->num_ports; ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work); + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } static u32 get_core_cap_flags(struct ib_device *ibdev, @@ -4778,6 +3274,13 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, if (rep->grh_required) ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + if (dev->num_plane) + return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD | + RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA | + RDMA_CORE_CAP_AF_IB; + else if (ibdev->type == 
RDMA_DEVICE_TYPE_SMI) + return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI; + if (ll == IB_LINK_LAYER_INFINIBAND) return ret | RDMA_CORE_PORT_IBA_IB; @@ -4799,7 +3302,7 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, return ret; } -static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, +static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; @@ -4813,6 +3316,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, return err; if (ll == IB_LINK_LAYER_INFINIBAND) { + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + port_num = smi_to_native_portnum(dev, port_num); + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, &rep); if (err) @@ -4822,13 +3328,12 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep); - if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) - immutable->max_mad_size = IB_MGMT_MAD_SIZE; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } -static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num, +static int mlx5_port_rep_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; @@ -4856,6 +3361,67 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_device *ibdev = &dev->ib_dev; + struct net_device *old_ndev = NULL; + struct mlx5_ib_port *port; + struct net_device *ndev; + u32 portnum = 0; + int ret = 0; + int i; + + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + 
if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + old_ndev = ib_device_get_netdev(ibdev, portnum + 1); + ret = ib_device_set_netdev(ibdev, ndev, portnum + 1); + if (ret) + goto out; + + if (old_ndev) + roce_del_all_netdev_gids(ibdev, portnum + 1, + old_ndev); + rdma_roce_rescan_port(ibdev, portnum + 1); + } + break; + default: + return NOTIFY_DONE; + } + +out: + dev_put(old_ndev); + return notifier_from_errno(ret); +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -4864,7 +3430,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) struct mlx5_flow_table *ft; int err; - if (!ns || !mlx5_lag_is_roce(mdev)) + if (!ns || !mlx5_lag_is_active(mdev)) return 0; err = mlx5_cmd_create_vport_lag(mdev); @@ -4877,7 +3443,9 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; + dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; return 0; @@ -4893,6 +3461,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -4900,33 +3469,70 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) } } -static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +static void 
mlx5_netdev_notifier_register(struct mlx5_roce *roce, + struct net_device *netdev) { int err; - dev->roce[port_num].nb.notifier_call = mlx5_netdev_event; - err = register_netdevice_notifier(&dev->roce[port_num].nb); - if (err) { - dev->roce[port_num].nb.notifier_call = NULL; - return err; - } + if (roce->tracking_netdev) + return; + roce->tracking_netdev = netdev; + roce->nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn); + WARN_ON(err); +} - return 0; +static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce) +{ + if (!roce->tracking_netdev) + return; + unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb, + &roce->nn); + roce->tracking_netdev = NULL; } -static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +static int mlx5e_mdev_notifier_event(struct notifier_block *nb, + unsigned long event, void *data) { - if (dev->roce[port_num].nb.notifier_call) { - unregister_netdevice_notifier(&dev->roce[port_num].nb); - dev->roce[port_num].nb.notifier_call = NULL; + struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb); + struct net_device *netdev = data; + + switch (event) { + case MLX5_DRIVER_EVENT_UPLINK_NETDEV: + if (netdev) + mlx5_netdev_notifier_register(roce, netdev); + else + mlx5_netdev_notifier_unregister(roce); + break; + default: + return NOTIFY_DONE; } + + return NOTIFY_OK; +} + +static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event; + mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb); + mlx5_core_uplink_netdev_event_replay(dev->mdev); +} + +static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb); + mlx5_netdev_notifier_unregister(roce); } static int 
mlx5_enable_eth(struct mlx5_ib_dev *dev) { int err; - if (MLX5_CAP_GEN(dev->mdev, roce)) { + if (!dev->is_rep && dev->profile != &raw_eth_profile) { err = mlx5_nic_vport_enable_roce(dev->mdev); if (err) return err; @@ -4939,7 +3545,7 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev) return 0; err_disable_roce: - if (MLX5_CAP_GEN(dev->mdev, roce)) + if (!dev->is_rep && dev->profile != &raw_eth_profile) mlx5_nic_vport_disable_roce(dev->mdev); return err; @@ -4948,337 +3554,11 @@ err_disable_roce: static void mlx5_disable_eth(struct mlx5_ib_dev *dev) { mlx5_eth_lag_cleanup(dev); - if (MLX5_CAP_GEN(dev->mdev, roce)) + if (!dev->is_rep && dev->profile != &raw_eth_profile) mlx5_nic_vport_disable_roce(dev->mdev); } -struct mlx5_ib_counter { - const char *name; - size_t offset; -}; - -#define INIT_Q_COUNTER(_name) \ - { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} - -static const struct mlx5_ib_counter basic_q_cnts[] = { - INIT_Q_COUNTER(rx_write_requests), - INIT_Q_COUNTER(rx_read_requests), - INIT_Q_COUNTER(rx_atomic_requests), - INIT_Q_COUNTER(out_of_buffer), -}; - -static const struct mlx5_ib_counter out_of_seq_q_cnts[] = { - INIT_Q_COUNTER(out_of_sequence), -}; - -static const struct mlx5_ib_counter retrans_q_cnts[] = { - INIT_Q_COUNTER(duplicate_request), - INIT_Q_COUNTER(rnr_nak_retry_err), - INIT_Q_COUNTER(packet_seq_err), - INIT_Q_COUNTER(implied_nak_seq_err), - INIT_Q_COUNTER(local_ack_timeout_err), -}; - -#define INIT_CONG_COUNTER(_name) \ - { .name = #_name, .offset = \ - MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)} - -static const struct mlx5_ib_counter cong_cnts[] = { - INIT_CONG_COUNTER(rp_cnp_ignored), - INIT_CONG_COUNTER(rp_cnp_handled), - INIT_CONG_COUNTER(np_ecn_marked_roce_packets), - INIT_CONG_COUNTER(np_cnp_sent), -}; - -static const struct mlx5_ib_counter extended_err_cnts[] = { - INIT_Q_COUNTER(resp_local_length_error), - INIT_Q_COUNTER(resp_cqe_error), - INIT_Q_COUNTER(req_cqe_error), - 
INIT_Q_COUNTER(req_remote_invalid_request), - INIT_Q_COUNTER(req_remote_access_errors), - INIT_Q_COUNTER(resp_remote_access_errors), - INIT_Q_COUNTER(resp_cqe_flush_error), - INIT_Q_COUNTER(req_cqe_flush_error), -}; - -#define INIT_EXT_PPCNT_COUNTER(_name) \ - { .name = #_name, .offset = \ - MLX5_BYTE_OFF(ppcnt_reg, \ - counter_set.eth_extended_cntrs_grp_data_layout._name##_high)} - -static const struct mlx5_ib_counter ext_ppcnt_cnts[] = { - INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated), -}; - -static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) -{ - int i; - - for (i = 0; i < dev->num_ports; i++) { - if (dev->port[i].cnts.set_id_valid) - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); - kfree(dev->port[i].cnts.names); - kfree(dev->port[i].cnts.offsets); - } -} - -static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, - struct mlx5_ib_counters *cnts) -{ - u32 num_counters; - - num_counters = ARRAY_SIZE(basic_q_cnts); - - if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) - num_counters += ARRAY_SIZE(out_of_seq_q_cnts); - - if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) - num_counters += ARRAY_SIZE(retrans_q_cnts); - - if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) - num_counters += ARRAY_SIZE(extended_err_cnts); - - cnts->num_q_counters = num_counters; - - if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); - num_counters += ARRAY_SIZE(cong_cnts); - } - if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { - cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts); - num_counters += ARRAY_SIZE(ext_ppcnt_cnts); - } - cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL); - if (!cnts->names) - return -ENOMEM; - - cnts->offsets = kcalloc(num_counters, - sizeof(cnts->offsets), GFP_KERNEL); - if (!cnts->offsets) - goto err_names; - - return 0; - -err_names: - kfree(cnts->names); - cnts->names = NULL; - return -ENOMEM; -} - -static void 
mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, - const char **names, - size_t *offsets) -{ - int i; - int j = 0; - - for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { - names[j] = basic_q_cnts[i].name; - offsets[j] = basic_q_cnts[i].offset; - } - - if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { - for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { - names[j] = out_of_seq_q_cnts[i].name; - offsets[j] = out_of_seq_q_cnts[i].offset; - } - } - - if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { - for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { - names[j] = retrans_q_cnts[i].name; - offsets[j] = retrans_q_cnts[i].offset; - } - } - - if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { - for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { - names[j] = extended_err_cnts[i].name; - offsets[j] = extended_err_cnts[i].offset; - } - } - - if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { - names[j] = cong_cnts[i].name; - offsets[j] = cong_cnts[i].offset; - } - } - - if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { - for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) { - names[j] = ext_ppcnt_cnts[i].name; - offsets[j] = ext_ppcnt_cnts[i].offset; - } - } -} - -static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) -{ - int err = 0; - int i; - bool is_shared; - - is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; - - for (i = 0; i < dev->num_ports; i++) { - err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); - if (err) - goto err_alloc; - - mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, - dev->port[i].cnts.offsets); - - err = mlx5_cmd_alloc_q_counter(dev->mdev, - &dev->port[i].cnts.set_id, - is_shared ? 
- MLX5_SHARED_RESOURCE_UID : 0); - if (err) { - mlx5_ib_warn(dev, - "couldn't allocate queue counter for port %d, err %d\n", - i + 1, err); - goto err_alloc; - } - dev->port[i].cnts.set_id_valid = true; - } - - return 0; - -err_alloc: - mlx5_ib_dealloc_counters(dev); - return err; -} - -static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, - u8 port_num) -{ - struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_port *port = &dev->port[port_num - 1]; - - /* We support only per port stats */ - if (port_num == 0) - return NULL; - - return rdma_alloc_hw_stats_struct(port->cnts.names, - port->cnts.num_q_counters + - port->cnts.num_cong_counters + - port->cnts.num_ext_ppcnt_counters, - RDMA_HW_STATS_DEFAULT_LIFESPAN); -} - -static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, - struct mlx5_ib_port *port, - struct rdma_hw_stats *stats) -{ - int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); - void *out; - __be32 val; - int ret, i; - - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - - ret = mlx5_core_query_q_counter(mdev, - port->cnts.set_id, 0, - out, outlen); - if (ret) - goto free; - - for (i = 0; i < port->cnts.num_q_counters; i++) { - val = *(__be32 *)(out + port->cnts.offsets[i]); - stats->value[i] = (u64)be32_to_cpu(val); - } - -free: - kvfree(out); - return ret; -} - -static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, - struct mlx5_ib_port *port, - struct rdma_hw_stats *stats) -{ - int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters; - int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); - int ret, i; - void *out; - - out = kvzalloc(sz, GFP_KERNEL); - if (!out) - return -ENOMEM; - - ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out); - if (ret) - goto free; - - for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) { - stats->value[i + offset] = - be64_to_cpup((__be64 *)(out + - port->cnts.offsets[i + offset])); - } - -free: - kvfree(out); - return ret; -} - -static int 
mlx5_ib_get_hw_stats(struct ib_device *ibdev, - struct rdma_hw_stats *stats, - u8 port_num, int index) -{ - struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_port *port = &dev->port[port_num - 1]; - struct mlx5_core_dev *mdev; - int ret, num_counters; - u8 mdev_port_num; - - if (!stats) - return -EINVAL; - - num_counters = port->cnts.num_q_counters + - port->cnts.num_cong_counters + - port->cnts.num_ext_ppcnt_counters; - - /* q_counters are per IB device, query the master mdev */ - ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); - if (ret) - return ret; - - if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { - ret = mlx5_ib_query_ext_ppcnt_counters(dev, port, stats); - if (ret) - return ret; - } - - if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - mdev = mlx5_ib_get_native_port_mdev(dev, port_num, - &mdev_port_num); - if (!mdev) { - /* If port is not affiliated yet, its in down state - * which doesn't have any counters yet, so it would be - * zero. So no need to read from the HCA. 
- */ - goto done; - } - ret = mlx5_lag_query_cong_counters(dev->mdev, - stats->value + - port->cnts.num_q_counters, - port->cnts.num_cong_counters, - port->cnts.offsets + - port->cnts.num_q_counters); - - mlx5_ib_put_native_port_mdev(dev, port_num); - if (ret) - return ret; - } - -done: - return num_counters; -} - -static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, +static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params) { @@ -5288,24 +3568,6 @@ static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params); } -static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev) -{ - if (!dev->delay_drop.dbg) - return; - debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs); - kfree(dev->delay_drop.dbg); - dev->delay_drop.dbg = NULL; -} - -static void cancel_delay_drop(struct mlx5_ib_dev *dev) -{ - if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) - return; - - cancel_work_sync(&dev->delay_drop.delay_drop_work); - delay_drop_debugfs_cleanup(dev); -} - static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { @@ -5345,81 +3607,26 @@ static const struct file_operations fops_delay_drop_timeout = { .read = delay_drop_timeout_read, }; -static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) -{ - struct mlx5_ib_dbg_delay_drop *dbg; - - if (!mlx5_debugfs_root) - return 0; - - dbg = kzalloc(sizeof(*dbg), GFP_KERNEL); - if (!dbg) - return -ENOMEM; - - dev->delay_drop.dbg = dbg; - - dbg->dir_debugfs = - debugfs_create_dir("delay_drop", - dev->mdev->priv.dbg_root); - if (!dbg->dir_debugfs) - goto out_debugfs; - - dbg->events_cnt_debugfs = - debugfs_create_atomic_t("num_timeout_events", 0400, - dbg->dir_debugfs, - &dev->delay_drop.events_cnt); - if (!dbg->events_cnt_debugfs) - goto out_debugfs; - - dbg->rqs_cnt_debugfs = - 
debugfs_create_atomic_t("num_rqs", 0400, - dbg->dir_debugfs, - &dev->delay_drop.rqs_cnt); - if (!dbg->rqs_cnt_debugfs) - goto out_debugfs; - - dbg->timeout_debugfs = - debugfs_create_file("timeout", 0600, - dbg->dir_debugfs, - &dev->delay_drop, - &fops_delay_drop_timeout); - if (!dbg->timeout_debugfs) - goto out_debugfs; - - return 0; - -out_debugfs: - delay_drop_debugfs_cleanup(dev); - return -ENOMEM; -} - -static void init_delay_drop(struct mlx5_ib_dev *dev) -{ - if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) - return; - - mutex_init(&dev->delay_drop.lock); - dev->delay_drop.dev = dev; - dev->delay_drop.activate = false; - dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; - INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); - atomic_set(&dev->delay_drop.rqs_cnt, 0); - atomic_set(&dev->delay_drop.events_cnt, 0); - - if (delay_drop_debugfs_init(dev)) - mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n"); -} - -/* The mlx5_ib_multiport_mutex should be held when calling this function */ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi) { - u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; struct mlx5_ib_port *port = &ibdev->port[port_num]; int comps; int err; int i; + lockdep_assert_held(&mlx5_ib_multiport_mutex); + + mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); + + mlx5_core_mp_event_replay(ibdev->mdev, + MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, + NULL); + mlx5_core_mp_event_replay(mpi->mdev, + MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, + NULL); + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); spin_lock(&port->mp.mpi_lock); @@ -5428,14 +3635,13 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, return; } - if (mpi->mdev_events.notifier_call) - mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events); - mpi->mdev_events.notifier_call = NULL; - mpi->ibdev = NULL; 
spin_unlock(&port->mp.mpi_lock); - mlx5_remove_netdev_notifier(ibdev, port_num); + if (mpi->mdev_events.notifier_call) + mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events); + mpi->mdev_events.notifier_call = NULL; + mlx5_mdev_netdev_untrack(ibdev, port_num); spin_lock(&port->mp.mpi_lock); comps = mpi->mdev_refcnt; @@ -5453,13 +3659,11 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, port->mp.mpi = NULL; - list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); - spin_unlock(&port->mp.mpi_lock); err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); - mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1); + mlx5_ib_dbg(ibdev, "unaffiliated port %u\n", port_num + 1); /* Log an error, still needed to cleanup the pointers and add * it back to the list. */ @@ -5467,19 +3671,21 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", port_num + 1); - ibdev->roce[port_num].last_port_state = IB_PORT_DOWN; + ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN; } -/* The mlx5_ib_multiport_mutex should be held when calling this function */ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi) { - u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + u64 key; int err; + lockdep_assert_held(&mlx5_ib_multiport_mutex); + spin_lock(&ibdev->port[port_num].mp.mpi_lock); if (ibdev->port[port_num].mp.mpi) { - mlx5_ib_dbg(ibdev, "port %d already affiliated.\n", + mlx5_ib_dbg(ibdev, "port %u already affiliated.\n", port_num + 1); spin_unlock(&ibdev->port[port_num].mp.mpi_lock); return false; @@ -5494,21 +3700,22 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, if (err) goto unbind; - err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev)); - if (err) - goto unbind; - - err = mlx5_add_netdev_notifier(ibdev, port_num); - if (err) { - mlx5_ib_err(ibdev, 
"failed adding netdev notifier for port %u\n", - port_num + 1); - goto unbind; - } + mlx5_mdev_netdev_track(ibdev, port_num); mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port; mlx5_notifier_register(mpi->mdev, &mpi->mdev_events); - err = mlx5_ib_init_cong_debugfs(ibdev, port_num); + mlx5_ib_init_cong_debugfs(ibdev, port_num); + + key = mpi->mdev->priv.adev_idx; + mlx5_core_mp_event_replay(mpi->mdev, + MLX5_DRIVER_EVENT_AFFILIATION_DONE, + &key); + mlx5_core_mp_event_replay(ibdev->mdev, + MLX5_DRIVER_EVENT_AFFILIATION_DONE, + &key); + + err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); if (err) goto unbind; @@ -5519,14 +3726,49 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + INIT_LIST_HEAD(&dev->data_direct_mr_list); + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { - int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, port_num + 1); struct mlx5_ib_multiport_info *mpi; int err; - int i; + u32 i; if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) return 0; @@ -5565,22 +3807,22 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) 
list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } if (bound) { - dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n"); + dev_dbg(mpi->mdev->device, + "removing port from unaffiliated list.\n"); mlx5_ib_dbg(dev, "port %d bound\n", i + 1); list_del(&mpi->list); break; } } - if (!bound) { - get_port_caps(dev, i + 1); + if (!bound) mlx5_ib_dbg(dev, "no free port found for port %d\n", i + 1); - } } list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); @@ -5590,10 +3832,10 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) { - int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, port_num + 1); - int i; + u32 i; if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) return; @@ -5606,8 +3848,12 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) kfree(dev->port[i].mp.mpi); dev->port[i].mp.mpi = NULL; } else { - mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1); - mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi); + mlx5_ib_dbg(dev, "unbinding port_num: %u\n", + i + 1); + list_add_tail(&dev->port[i].mp.mpi->list, + &mlx5_ib_unaffiliated_port_list); + mlx5_ib_unbind_slave_port(dev, + dev->port[i].mp.mpi); } } } @@ -5619,262 +3865,432 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } -ADD_UVERBS_ATTRIBUTES_SIMPLE( - mlx5_ib_dm, - UVERBS_OBJECT_DM, - UVERBS_METHOD_DM_ALLOC, - UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, - UVERBS_ATTR_TYPE(u64), - 
UA_MANDATORY), - UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, - UVERBS_ATTR_TYPE(u16), - UA_MANDATORY)); - -ADD_UVERBS_ATTRIBUTES_SIMPLE( - mlx5_ib_flow_action, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, - UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, - enum mlx5_ib_uapi_flow_action_flags)); +static int mmap_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_user_mmap_entry *obj = uobject->object; -static const struct uapi_definition mlx5_ib_defs[] = { -#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) - UAPI_DEF_CHAIN(mlx5_ib_devx_defs), - UAPI_DEF_CHAIN(mlx5_ib_flow_defs), -#endif + rdma_user_mmap_entry_remove(&obj->rdma_entry); + return 0; +} - UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, - &mlx5_ib_flow_action), - UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm), - {} -}; +static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c, + struct mlx5_user_mmap_entry *entry, + size_t length) +{ + return rdma_user_mmap_entry_insert_range( + &c->ibucontext, &entry->rdma_entry, length, + (MLX5_IB_MMAP_OFFSET_START << 16), + ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1)); +} -static int mlx5_ib_read_counters(struct ib_counters *counters, - struct ib_counters_read_attr *read_attr, - struct uverbs_attr_bundle *attrs) +static struct mlx5_user_mmap_entry * +alloc_var_entry(struct mlx5_ib_ucontext *c) { - struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); - struct mlx5_read_counters_attr mread_attr = {}; - struct mlx5_ib_flow_counters_desc *desc; - int ret, i; + struct mlx5_user_mmap_entry *entry; + struct mlx5_var_table *var_table; + u32 page_idx; + int err; - mutex_lock(&mcounters->mcntrs_mutex); - if (mcounters->cntrs_max_index > read_attr->ncounters) { - ret = -EINVAL; - goto err_bound; - } + var_table = &to_mdev(c->ibucontext.device)->var_table; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + 
return ERR_PTR(-ENOMEM); - mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64), - GFP_KERNEL); - if (!mread_attr.out) { - ret = -ENOMEM; - goto err_bound; + mutex_lock(&var_table->bitmap_lock); + page_idx = find_first_zero_bit(var_table->bitmap, + var_table->num_var_hw_entries); + if (page_idx >= var_table->num_var_hw_entries) { + err = -ENOSPC; + mutex_unlock(&var_table->bitmap_lock); + goto end; } - mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl; - mread_attr.flags = read_attr->flags; - ret = mcounters->read_counters(counters->device, &mread_attr); - if (ret) - goto err_read; + set_bit(page_idx, var_table->bitmap); + mutex_unlock(&var_table->bitmap_lock); - /* do the pass over the counters data array to assign according to the - * descriptions and indexing pairs - */ - desc = mcounters->counters_data; - for (i = 0; i < mcounters->ncounters; i++) - read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description]; - -err_read: - kfree(mread_attr.out); -err_bound: - mutex_unlock(&mcounters->mcntrs_mutex); - return ret; + entry->address = var_table->hw_start_addr + + (page_idx * var_table->stride_size); + entry->page_idx = page_idx; + entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR; + + err = mlx5_rdma_user_mmap_entry_insert(c, entry, + var_table->stride_size); + if (err) + goto err_insert; + + return entry; + +err_insert: + mutex_lock(&var_table->bitmap_lock); + clear_bit(page_idx, var_table->bitmap); + mutex_unlock(&var_table->bitmap_lock); +end: + kfree(entry); + return ERR_PTR(err); } -static int mlx5_ib_destroy_counters(struct ib_counters *counters) +static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) { - struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); + struct mlx5_ib_ucontext *c; + struct mlx5_user_mmap_entry *entry; + u64 mmap_offset; + u32 length; + int err; - 
counters_clear_description(counters); - if (mcounters->hw_cntrs_hndl) - mlx5_fc_destroy(to_mdev(counters->device)->mdev, - mcounters->hw_cntrs_hndl); + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); - kfree(mcounters); + entry = alloc_var_entry(c); + if (IS_ERR(entry)) + return PTR_ERR(entry); - return 0; + mmap_offset = mlx5_entry_to_mmap_offset(entry); + length = entry->rdma_entry.npages * PAGE_SIZE; + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + &entry->page_idx, sizeof(entry->page_idx)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + &length, sizeof(length)); + return err; } -static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, - struct uverbs_attr_bundle *attrs) +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_VAR_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_VAR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_VAR_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_VAR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR, + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY)); + +static bool var_is_supported(struct ib_device *device) { - struct 
mlx5_ib_mcounters *mcounters; + struct mlx5_ib_dev *dev = to_mdev(device); - mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL); - if (!mcounters) + return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q); +} + +static struct mlx5_user_mmap_entry * +alloc_uar_entry(struct mlx5_ib_ucontext *c, + enum mlx5_ib_uapi_uar_alloc_type alloc_type) +{ + struct mlx5_user_mmap_entry *entry; + struct mlx5_ib_dev *dev; + u32 uar_index; + int err; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) return ERR_PTR(-ENOMEM); - mutex_init(&mcounters->mcntrs_mutex); + dev = to_mdev(c->ibucontext.device); + err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, c->devx_uid); + if (err) + goto end; + + entry->page_idx = uar_index; + entry->address = uar_index2paddress(dev, uar_index); + if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC; + else + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC; - return &mcounters->ibcntrs; + err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE); + if (err) + goto err_insert; + + return entry; + +err_insert: + mlx5_cmd_uar_dealloc(dev->mdev, uar_index, c->devx_uid); +end: + kfree(entry); + return ERR_PTR(err); } -void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) { + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + enum mlx5_ib_uapi_uar_alloc_type alloc_type; + struct mlx5_ib_ucontext *c; + struct mlx5_user_mmap_entry *entry; + u64 mmap_offset; + u32 length; + int err; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + err = uverbs_get_const(&alloc_type, attrs, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE); + if (err) + return err; + + if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF && + alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) + return -EOPNOTSUPP; + + if 
(!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) && + alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + return -EOPNOTSUPP; + + entry = alloc_uar_entry(c, alloc_type); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + mmap_offset = mlx5_entry_to_mmap_offset(entry); + length = entry->rdma_entry.npages * PAGE_SIZE; + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + &entry->page_idx, sizeof(entry->page_idx)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + &length, sizeof(length)); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_UAR_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE, + enum mlx5_ib_uapi_uar_alloc_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_UAR_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR, + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_query_context, + UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + UVERBS_ATTR_STRUCT(struct 
mlx5_ib_alloc_ucontext_resp, + dump_fill_mkey), + UA_MANDATORY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + +static const struct uapi_definition mlx5_ib_defs[] = { + UAPI_DEF_CHAIN(mlx5_ib_devx_defs), + UAPI_DEF_CHAIN(mlx5_ib_flow_defs), + UAPI_DEF_CHAIN(mlx5_ib_qos_defs), + UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), + UAPI_DEF_CHAIN(mlx5_ib_dm_defs), + UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), + + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, + UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), + {} +}; + +static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - cleanup_srcu_struct(&dev->mr_srcu); - drain_workqueue(dev->advise_mr_wq); - destroy_workqueue(dev->advise_mr_wq); -#endif - kfree(dev->port); + WARN_ON(!xa_empty(&dev->odp_mkeys)); + mutex_destroy(&dev->cap_mask_mutex); + WARN_ON(!xa_empty(&dev->sig_mrs)); + WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); + mlx5r_macsec_dealloc_gids(dev); } -int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - int err; - int i; + int err, i; - dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), - GFP_KERNEL); - if (!dev->port) - return -ENOMEM; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; + dev->ib_dev.dev.parent = mdev->device; + dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; for (i = 0; i < dev->num_ports; i++) { 
spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->roce[i].netdev_lock); + dev->port[i].roce.dev = dev; + dev->port[i].roce.native_port_num = i + 1; + dev->port[i].roce.last_port_state = IB_PORT_DOWN; } + err = mlx5r_cmd_query_special_mkeys(dev); + if (err) + return err; + + err = mlx5r_macsec_init_gids_and_devlist(dev); + if (err) + return err; + err = mlx5_ib_init_multiport_master(dev); if (err) - goto err_free_port; + goto err; - if (!mlx5_core_mp_enabled(mdev)) { - for (i = 1; i <= dev->num_ports; i++) { - err = get_port_caps(dev, i); - if (err) - break; - } - } else { - err = get_port_caps(dev, mlx5_core_native_port_num(mdev)); - } + err = set_has_smi_cap(dev); + if (err) + goto err_mp; + + err = mlx5_query_max_pkeys(&dev->ib_dev, &dev->pkey_table_len); if (err) goto err_mp; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - dev->ib_dev.owner = THIS_MODULE; - dev->ib_dev.node_type = RDMA_NODE_IB_CA; - dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; - dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev); - dev->ib_dev.dev.parent = &mdev->pdev->dev; + dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + xa_init(&dev->odp_mkeys); + xa_init(&dev->sig_mrs); + atomic_set(&dev->mkey_var, 0); - spin_lock_init(&dev->memic.memic_lock); - dev->memic.dev = mdev; - -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0); - if (!dev->advise_mr_wq) { - err = -ENOMEM; - goto err_mp; - } - - err = init_srcu_struct(&dev->mr_srcu); - if (err) { - destroy_workqueue(dev->advise_mr_wq); + spin_lock_init(&dev->dm.lock); + dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) goto err_mp; - } -#endif return 0; err_mp: mlx5_ib_cleanup_multiport_master(dev); - -err_free_port: 
- kfree(dev->port); - - return -ENOMEM; -} - -static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev) -{ - dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); - - if (!dev->flow_db) - return -ENOMEM; - - mutex_init(&dev->flow_db->lock); - - return 0; -} - -int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev) -{ - struct mlx5_ib_dev *nic_dev; - - nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch); - - if (!nic_dev) - return -EINVAL; - - dev->flow_db = nic_dev->flow_db; - - return 0; +err: + mlx5r_macsec_dealloc_gids(dev); + return err; } -static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) -{ - kfree(dev->flow_db); -} +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev); static const struct ib_device_ops mlx5_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MLX5, + .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, + .add_gid = mlx5_ib_add_gid, + .add_sub_dev = mlx5_ib_add_sub_dev, .alloc_mr = mlx5_ib_alloc_mr, + .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, .alloc_ucontext = mlx5_ib_alloc_ucontext, .attach_mcast = mlx5_ib_mcg_attach, .check_mr_status = mlx5_ib_check_mr_status, .create_ah = mlx5_ib_create_ah, - .create_counters = mlx5_ib_create_counters, .create_cq = mlx5_ib_create_cq, - .create_flow = mlx5_ib_create_flow, .create_qp = mlx5_ib_create_qp, .create_srq = mlx5_ib_create_srq, + .create_user_ah = mlx5_ib_create_ah, .dealloc_pd = mlx5_ib_dealloc_pd, .dealloc_ucontext = mlx5_ib_dealloc_ucontext, .del_gid = mlx5_ib_del_gid, + .del_sub_dev = mlx5_ib_del_sub_dev, .dereg_mr = mlx5_ib_dereg_mr, .destroy_ah = mlx5_ib_destroy_ah, - .destroy_counters = mlx5_ib_destroy_counters, .destroy_cq = mlx5_ib_destroy_cq, - .destroy_flow = mlx5_ib_destroy_flow, - .destroy_flow_action = mlx5_ib_destroy_flow_action, .destroy_qp = mlx5_ib_destroy_qp, 
.destroy_srq = mlx5_ib_destroy_srq, .detach_mcast = mlx5_ib_mcg_detach, .disassociate_ucontext = mlx5_ib_disassociate_ucontext, .drain_rq = mlx5_ib_drain_rq, .drain_sq = mlx5_ib_drain_sq, + .device_group = &mlx5_attr_group, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, .map_mr_sg = mlx5_ib_map_mr_sg, + .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, .mmap = mlx5_ib_mmap, + .mmap_free = mlx5_ib_mmap_free, .modify_cq = mlx5_ib_modify_cq, .modify_device = mlx5_ib_modify_device, .modify_port = mlx5_ib_modify_port, .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, + .pre_destroy_cq = mlx5_ib_pre_destroy_cq, .poll_cq = mlx5_ib_poll_cq, - .post_recv = mlx5_ib_post_recv, - .post_send = mlx5_ib_post_send, + .post_destroy_cq = mlx5_ib_post_destroy_cq, + .post_recv = mlx5_ib_post_recv_nodrain, + .post_send = mlx5_ib_post_send_nodrain, .post_srq_recv = mlx5_ib_post_srq_recv, .process_mad = mlx5_ib_process_mad, .query_ah = mlx5_ib_query_ah, @@ -5883,16 +4299,22 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_pkey = mlx5_ib_query_pkey, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, - .read_counters = mlx5_ib_read_counters, + .query_ucontext = mlx5_ib_query_ucontext, .reg_user_mr = mlx5_ib_reg_user_mr, + .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf, .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, -}; - -static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = { - .create_flow_action_esp = mlx5_ib_create_flow_action_esp, - .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp, + .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup, + + INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_dmah, mlx5_ib_dmah, ibdmah), + INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, 
mlx5_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), }; static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = { @@ -5901,6 +4323,7 @@ static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = { static const struct ib_device_ops mlx5_ib_dev_sriov_ops = { .get_vf_config = mlx5_ib_get_vf_config, + .get_vf_guid = mlx5_ib_get_vf_guid, .get_vf_stats = mlx5_ib_get_vf_stats, .set_vf_guid = mlx5_ib_set_vf_guid, .set_vf_link_state = mlx5_ib_set_vf_link_state, @@ -5909,60 +4332,90 @@ static const struct ib_device_ops mlx5_ib_dev_sriov_ops = { static const struct ib_device_ops mlx5_ib_dev_mw_ops = { .alloc_mw = mlx5_ib_alloc_mw, .dealloc_mw = mlx5_ib_dealloc_mw, + + INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw), }; static const struct ib_device_ops mlx5_ib_dev_xrc_ops = { .alloc_xrcd = mlx5_ib_alloc_xrcd, .dealloc_xrcd = mlx5_ib_dealloc_xrcd, -}; -static const struct ib_device_ops mlx5_ib_dev_dm_ops = { - .alloc_dm = mlx5_ib_alloc_dm, - .dealloc_dm = mlx5_ib_dealloc_dm, - .reg_dm_mr = mlx5_ib_reg_dm_mr, + INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd), }; -int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - int err; + struct mlx5_var_table *var_table = &dev->var_table; + u8 log_doorbell_bar_size; + u8 log_doorbell_stride; + u64 bar_size; + + log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev, + log_doorbell_bar_size); + log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev, + log_doorbell_stride); + var_table->hw_start_addr = dev->mdev->bar_addr + + MLX5_CAP64_DEV_VDPA_EMULATION(mdev, + doorbell_bar_offset); + bar_size = (1ULL << log_doorbell_bar_size) * 4096; + var_table->stride_size = 1ULL << log_doorbell_stride; + var_table->num_var_hw_entries = div_u64(bar_size, + var_table->stride_size); + mutex_init(&var_table->bitmap_lock); + var_table->bitmap = 
bitmap_zalloc(var_table->num_var_hw_entries, + GFP_KERNEL); + return (var_table->bitmap) ? 0 : -ENOMEM; +} + +static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); - dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; - dev->ib_dev.uverbs_cmd_mask = - (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | - (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | - (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_CREATE_AH) | - (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | - (1ull << IB_USER_VERBS_CMD_REG_MR) | - (1ull << IB_USER_VERBS_CMD_REREG_MR) | - (1ull << IB_USER_VERBS_CMD_DEREG_MR) | - (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | - (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_CMD_QUERY_QP) | - (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | - (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | - (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | - (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | - (1ull << IB_USER_VERBS_CMD_OPEN_QP); - dev->ib_dev.uverbs_ex_cmd_mask = - (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ) | - (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + +static int 
mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev) +{ + int ret; + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + if (ret) + goto remove_local; + } + + return 0; + +remove_local: + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + return ret; +} + +static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) + mlx5_ib_cleanup_ucaps(dev); + + bitmap_free(dev->var_table.bitmap); +} + +static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int err; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && IS_ENABLED(CONFIG_MLX5_CORE_IPOIB)) @@ -5974,27 +4427,20 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); - if (MLX5_CAP_GEN(mdev, imaicl)) { - dev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + if (MLX5_CAP_GEN(mdev, imaicl)) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops); - } - if (MLX5_CAP_GEN(mdev, xrc)) { - dev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | - (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + if (MLX5_CAP_GEN(mdev, xrc)) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops); - } - if (MLX5_CAP_DEV_MEM(mdev, memic)) + if (MLX5_CAP_DEV_MEM(mdev, memic) || + MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops); - if (mlx5_accel_ipsec_device_caps(dev->mdev) & - MLX5_ACCEL_IPSEC_CAP_DEVICE) - ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops); - 
dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; + if (mdev->st) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dmah_ops); + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) @@ -6009,6 +4455,22 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) mutex_init(&dev->lb.mutex); + if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) { + err = mlx5_ib_init_var_table(dev); + if (err) + return err; + } + + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { + err = mlx5_ib_init_ucaps(dev); + if (err) + return err; + } + + dev->ib_dev.use_cq_dim = true; + return 0; } @@ -6026,9 +4488,10 @@ static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev) static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = { .get_port_immutable = mlx5_port_rep_immutable, .query_port = mlx5_ib_rep_query_port, + .query_pkey = mlx5_ib_rep_query_pkey, }; -int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev) +static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev) { ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops); return 0; @@ -6039,76 +4502,30 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, -}; - -static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev) -{ - u8 port_num; - int i; - - for (i = 0; i < dev->num_ports; i++) { - dev->roce[i].dev = dev; - dev->roce[i].native_port_num = i + 1; - dev->roce[i].last_port_state = IB_PORT_DOWN; - } - - dev->ib_dev.uverbs_ex_cmd_mask |= - (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | - (1ull << 
IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); - ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops); - - port_num = mlx5_core_native_port_num(dev->mdev) - 1; - return mlx5_add_netdev_notifier(dev, port_num); -} - -static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev) -{ - u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1; - - mlx5_remove_netdev_notifier(dev, port_num); -} - -int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev) -{ - struct mlx5_core_dev *mdev = dev->mdev; - enum rdma_link_layer ll; - int port_type_cap; - int err = 0; - - port_type_cap = MLX5_CAP_GEN(mdev, port_type); - ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); - - if (ll == IB_LINK_LAYER_ETHERNET) - err = mlx5_ib_stage_common_roce_init(dev); - - return err; -} - -void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_ib_stage_common_roce_cleanup(dev); -} + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, + ib_rwq_ind_tbl), +}; -static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; + u32 port_num = 0; int err; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { - err = mlx5_ib_stage_common_roce_init(dev); - if (err) - return err; + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops); + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + + /* Register only for native ports */ + mlx5_mdev_netdev_track(dev, port_num); err = mlx5_enable_eth(dev); if (err) @@ -6117,74 +4534,33 @@ static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) return 0; cleanup: - mlx5_ib_stage_common_roce_cleanup(dev); - + mlx5_mdev_netdev_untrack(dev, port_num); return err; } -static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) +static void 
mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; + u32 port_num; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { mlx5_disable_eth(dev); - mlx5_ib_stage_common_roce_cleanup(dev); - } -} - -int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev) -{ - return create_dev_resources(&dev->devr); -} - -void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) -{ - destroy_dev_resources(&dev->devr); -} - -static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) -{ - mlx5_ib_internal_fill_odp_caps(dev); - - return mlx5_ib_odp_init_one(dev); -} - -void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_ib_odp_cleanup_one(dev); -} - -static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = { - .alloc_hw_stats = mlx5_ib_alloc_hw_stats, - .get_hw_stats = mlx5_ib_get_hw_stats, -}; - -int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) -{ - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { - ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops); - return mlx5_ib_alloc_counters(dev); + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + mlx5_mdev_netdev_untrack(dev, port_num); } - - return 0; -} - -void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) -{ - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_counters(dev); } static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) { - return mlx5_ib_init_cong_debugfs(dev, - mlx5_core_native_port_num(dev->mdev) - 1); + mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); + return 0; } static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) @@ -6193,18 +4569,7 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) mlx5_core_native_port_num(dev->mdev) - 1); } -static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) -{ - dev->mdev->priv.uar = 
mlx5_get_uars_page(dev->mdev); - return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); -} - -static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -} - -int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) { int err; @@ -6214,102 +4579,170 @@ int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); return err; } -void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) { mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); } -int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; - rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group); - if (!mlx5_lag_is_roce(dev->mdev)) + if (dev->sub_dev_name) { + name = dev->sub_dev_name; + ib_mark_name_assigned_by_user(&dev->ib_dev); + } else if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else name = "mlx5_bond_%d"; - return ib_register_device(&dev->ib_dev, name, NULL); + return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev); } -void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) +static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { - destroy_umrc_res(dev); + mlx5_mkey_cache_cleanup(dev); + mlx5r_umr_resource_cleanup(dev); + mlx5r_umr_cleanup(dev); } -void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) { ib_unregister_device(&dev->ib_dev); } -int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) +static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) { - return create_umr_res(dev); + int ret; + + ret = 
mlx5r_umr_init(dev); + if (ret) + return ret; + + ret = mlx5_mkey_cache_init(dev); + if (ret) + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + return ret; } static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) { - init_delay_drop(dev); + struct dentry *root; + + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return 0; + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); + atomic_set(&dev->delay_drop.rqs_cnt, 0); + atomic_set(&dev->delay_drop.events_cnt, 0); + + if (!mlx5_debugfs_root) + return 0; + + root = debugfs_create_dir("delay_drop", mlx5_debugfs_get_dev_root(dev->mdev)); + dev->delay_drop.dir_debugfs = root; + + debugfs_create_atomic_t("num_timeout_events", 0400, root, + &dev->delay_drop.events_cnt); + debugfs_create_atomic_t("num_rqs", 0400, root, + &dev->delay_drop.rqs_cnt); + debugfs_create_file("timeout", 0600, root, &dev->delay_drop, + &fops_delay_drop_timeout); return 0; } static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) { - cancel_delay_drop(dev); + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); + if (!dev->delay_drop.dir_debugfs) + return; + + debugfs_remove_recursive(dev->delay_drop.dir_debugfs); + dev->delay_drop.dir_debugfs = NULL; } static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events); + + mlx5r_macsec_event_register(dev); + return 0; } static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev 
*dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } -static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev) +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) { - int uid; - - uid = mlx5_ib_devx_create(dev, false); - if (uid > 0) - dev->devx_whitelist_uid = uid; - - return 0; + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); } -static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev) + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) { - if (dev->devx_whitelist_uid) - mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); + mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); } void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) { + dev->ib_active = false; + /* Number of stages to cleanup */ while (stage) { stage--; if (profile->stage[stage].cleanup) profile->stage[stage].cleanup(dev); } + + kfree(dev->port); + ib_dealloc_device(&dev->ib_dev); } -void *__mlx5_ib_add(struct mlx5_ib_dev *dev, - const struct mlx5_ib_profile *profile) +int __mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile) { int err; int i; + dev->profile = profile; + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { err = profile->stage[i].init(dev); @@ -6318,54 +4751,53 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, } } - dev->profile = profile; dev->ib_active = true; - - return dev; + return 0; err_out: - __mlx5_ib_remove(dev, profile, i); - - return NULL; + /* Clean up stages which were initialized */ + while (i) { + i--; + if (profile->stage[i].cleanup) 
+ profile->stage[i].cleanup(dev); + } + return -ENOMEM; } static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_INIT, mlx5_ib_stage_init_init, mlx5_ib_stage_init_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, - mlx5_ib_stage_flow_db_init, - mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FS, + mlx5_ib_fs_init, + mlx5_ib_fs_cleanup), STAGE_CREATE(MLX5_IB_STAGE_CAPS, mlx5_ib_stage_caps_init, - NULL), + mlx5_ib_stage_caps_cleanup), STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, mlx5_ib_stage_non_default_cb, NULL), STAGE_CREATE(MLX5_IB_STAGE_ROCE, - mlx5_ib_stage_roce_init, - mlx5_ib_stage_roce_cleanup), + mlx5_ib_roce_init, + mlx5_ib_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, - mlx5_ib_stage_dev_res_init, - mlx5_ib_stage_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, - mlx5_ib_stage_odp_init, - mlx5_ib_stage_odp_cleanup), + mlx5_ib_odp_init_one, + mlx5_ib_odp_cleanup_one), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, - mlx5_ib_stage_counters_init, - mlx5_ib_stage_counters_cleanup), + mlx5_ib_counters_init, + mlx5_ib_counters_cleanup), STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -6373,66 +4805,172 @@ static const struct mlx5_ib_profile pf_profile = { NULL, mlx5_ib_stage_pre_ib_reg_umr_cleanup), STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, - mlx5_ib_stage_devx_init, - mlx5_ib_stage_devx_cleanup), + mlx5_ib_devx_init, + mlx5_ib_devx_cleanup), 
STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, mlx5_ib_stage_delay_drop_init, mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, + mlx5_ib_restrack_init, + NULL), }; -static const struct mlx5_ib_profile nic_rep_profile = { +const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_INIT, mlx5_ib_stage_init_init, mlx5_ib_stage_init_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, - mlx5_ib_stage_flow_db_init, - mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FS, + mlx5_ib_fs_init, + mlx5_ib_fs_cleanup), STAGE_CREATE(MLX5_IB_STAGE_CAPS, mlx5_ib_stage_caps_init, - NULL), + mlx5_ib_stage_caps_cleanup), STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, - mlx5_ib_stage_rep_non_default_cb, + mlx5_ib_stage_raw_eth_non_default_cb, NULL), STAGE_CREATE(MLX5_IB_STAGE_ROCE, - mlx5_ib_stage_rep_roce_init, - mlx5_ib_stage_rep_roce_cleanup), + mlx5_ib_roce_init, + mlx5_ib_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, - mlx5_ib_stage_dev_res_init, - mlx5_ib_stage_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, - mlx5_ib_stage_counters_init, - mlx5_ib_stage_counters_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), + mlx5_ib_counters_init, + mlx5_ib_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + 
mlx5_ib_stage_cong_debugfs_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, NULL, mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, + mlx5_ib_devx_init, + mlx5_ib_devx_cleanup), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, + mlx5_ib_restrack_init, + NULL), +}; + +static const struct mlx5_ib_profile plane_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), }; -static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev) +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) { + struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane; + enum rdma_link_layer ll; + int ret; + + if (mparent->smi_dev) + return ERR_PTR(-EEXIST); + + ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev, + 
port_type)); + if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane || + ll != IB_LINK_LAYER_INFINIBAND || + !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) + return ERR_PTR(-EOPNOTSUPP); + + mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mparent->mdev)); + if (!mplane) + return ERR_PTR(-ENOMEM); + + mplane->port = kcalloc(mparent->num_plane * mparent->num_ports, + sizeof(*mplane->port), GFP_KERNEL); + if (!mplane->port) { + ret = -ENOMEM; + goto fail_kcalloc; + } + + mplane->ib_dev.type = type; + mplane->mdev = mparent->mdev; + mplane->num_ports = mparent->num_plane; + mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; + + ret = __mlx5_ib_add(mplane, &plane_profile); + if (ret) + goto fail_ib_add; + + mparent->smi_dev = mplane; + return &mplane->ib_dev; + +fail_ib_add: + kfree(mplane->port); +fail_kcalloc: + ib_dealloc_device(&mplane->ib_dev); + return ERR_PTR(ret); +} + +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev) +{ + struct mlx5_ib_dev *mdev = to_mdev(sub_dev); + + to_mdev(sub_dev->parent)->smi_dev = NULL; + __mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX); +} + +static int mlx5r_mp_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = idev->mdev; struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_dev *dev; bool bound = false; @@ -6440,140 +4978,203 @@ static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev) mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); if (!mpi) - return NULL; + return -ENOMEM; mpi->mdev = mdev; - err = mlx5_query_nic_vport_system_image_guid(mdev, &mpi->sys_image_guid); if (err) { kfree(mpi); - return NULL; + return err; } mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + 
mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { rdma_roce_rescan_device(&dev->ib_dev); + mpi->ibdev->ib_active = true; break; } } if (!bound) { list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); - dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); + dev_dbg(mdev->device, + "no suitable IB device found to bind to, added to unaffiliated list.\n"); } mutex_unlock(&mlx5_ib_multiport_mutex); - return mpi; + auxiliary_set_drvdata(adev, mpi); + return 0; +} + +static void mlx5r_mp_remove(struct auxiliary_device *adev) +{ + struct mlx5_ib_multiport_info *mpi; + + mpi = auxiliary_get_drvdata(adev); + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + else + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + kfree(mpi); } -static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +static int mlx5r_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) { + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = idev->mdev; + const struct mlx5_ib_profile *profile; + int port_type_cap, num_ports, ret; enum rdma_link_layer ll; struct mlx5_ib_dev *dev; - int port_type_cap; - - printk_once(KERN_INFO "%s", mlx5_version); port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); - if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) - return mlx5_ib_add_slave_port(mdev); - - dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); + dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mdev)); if (!dev) - return NULL; + return -ENOMEM; - dev->mdev = mdev; - dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports), - MLX5_CAP_GEN(mdev, num_vhca_ports)); - - if (MLX5_ESWITCH_MANAGER(mdev) && - 
mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) { - dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0); - dev->profile = &nic_rep_profile; - mlx5_ib_register_vport_reps(dev); - return dev; + if (ll == IB_LINK_LAYER_INFINIBAND) { + ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane); + if (ret) + goto fail; } - return __mlx5_ib_add(dev, &pf_profile); -} - -static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) -{ - struct mlx5_ib_multiport_info *mpi; - struct mlx5_ib_dev *dev; - - if (mlx5_core_is_mp_slave(mdev)) { - mpi = context; - mutex_lock(&mlx5_ib_multiport_mutex); - if (mpi->ibdev) - mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); - list_del(&mpi->list); - mutex_unlock(&mlx5_ib_multiport_mutex); - return; + dev->port = kcalloc(num_ports, sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) { + ret = -ENOMEM; + goto fail; } - dev = context; - if (dev->profile == &nic_rep_profile) - mlx5_ib_unregister_vport_reps(dev); + dev->mdev = mdev; + dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; + + if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) + profile = &raw_eth_profile; else - __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); + profile = &pf_profile; - ib_dealloc_device((struct ib_device *)dev); -} + ret = __mlx5_ib_add(dev, profile); + if (ret) + goto fail_ib_add; -static struct mlx5_interface mlx5_ib_interface = { - .add = mlx5_ib_add, - .remove = mlx5_ib_remove, - .protocol = MLX5_INTERFACE_PROTOCOL_IB, -}; + auxiliary_set_drvdata(adev, dev); + return 0; -unsigned long mlx5_ib_get_xlt_emergency_page(void) -{ - mutex_lock(&xlt_emergency_page_mutex); - return xlt_emergency_page; +fail_ib_add: + kfree(dev->port); +fail: + ib_dealloc_device(&dev->ib_dev); + return ret; } -void mlx5_ib_put_xlt_emergency_page(void) +static void mlx5r_remove(struct auxiliary_device *adev) { - mutex_unlock(&xlt_emergency_page_mutex); + struct mlx5_ib_dev *dev; + + dev = auxiliary_get_drvdata(adev); + 
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } +static const struct auxiliary_device_id mlx5r_mp_id_table[] = { + { .name = MLX5_ADEV_NAME ".multiport", }, + {}, +}; + +static const struct auxiliary_device_id mlx5r_id_table[] = { + { .name = MLX5_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mlx5r_mp_id_table); +MODULE_DEVICE_TABLE(auxiliary, mlx5r_id_table); + +static struct auxiliary_driver mlx5r_mp_driver = { + .name = "multiport", + .probe = mlx5r_mp_probe, + .remove = mlx5r_mp_remove, + .id_table = mlx5r_mp_id_table, +}; + +static struct auxiliary_driver mlx5r_driver = { + .name = "rdma", + .probe = mlx5r_probe, + .remove = mlx5r_remove, + .id_table = mlx5r_id_table, +}; + static int __init mlx5_ib_init(void) { - int err; + int ret; - xlt_emergency_page = __get_free_page(GFP_KERNEL); + xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL); if (!xlt_emergency_page) return -ENOMEM; - mutex_init(&xlt_emergency_page_mutex); - mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); if (!mlx5_ib_event_wq) { - free_page(xlt_emergency_page); + free_page((unsigned long)xlt_emergency_page); return -ENOMEM; } + ret = mlx5_ib_qp_event_init(); + if (ret) + goto qp_event_err; + mlx5_ib_odp_init(); + ret = mlx5r_rep_init(); + if (ret) + goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; + ret = auxiliary_driver_register(&mlx5r_mp_driver); + if (ret) + goto mp_err; + ret = auxiliary_driver_register(&mlx5r_driver); + if (ret) + goto drv_err; - err = mlx5_register_interface(&mlx5_ib_interface); + return 0; - return err; +drv_err: + auxiliary_driver_unregister(&mlx5r_mp_driver); +mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: + mlx5r_rep_cleanup(); +rep_err: + mlx5_ib_qp_event_cleanup(); +qp_event_err: + destroy_workqueue(mlx5_ib_event_wq); + free_page((unsigned long)xlt_emergency_page); + return ret; } static void __exit mlx5_ib_cleanup(void) { - 
mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_data_direct_driver_unregister(); + auxiliary_driver_unregister(&mlx5r_driver); + auxiliary_driver_unregister(&mlx5r_mp_driver); + mlx5r_rep_cleanup(); + + mlx5_ib_qp_event_cleanup(); destroy_workqueue(mlx5_ib_event_wq); - mutex_destroy(&xlt_emergency_page_mutex); - free_page(xlt_emergency_page); + free_page((unsigned long)xlt_emergency_page); } module_init(mlx5_ib_init); |
