Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--  drivers/infiniband/core/Makefile                        |    4
-rw-r--r--  drivers/infiniband/core/addr.c                          |   96
-rw-r--r--  drivers/infiniband/core/agent.c                         |   35
-rw-r--r--  drivers/infiniband/core/cache.c                         |  108
-rw-r--r--  drivers/infiniband/core/cm.c                            |  440
-rw-r--r--  drivers/infiniband/core/cm_trace.h                      |    4
-rw-r--r--  drivers/infiniband/core/cma.c                           | 1003
-rw-r--r--  drivers/infiniband/core/cma_configfs.c                  |    5
-rw-r--r--  drivers/infiniband/core/cma_priv.h                      |   17
-rw-r--r--  drivers/infiniband/core/cma_trace.h                     |    8
-rw-r--r--  drivers/infiniband/core/core_priv.h                     |   50
-rw-r--r--  drivers/infiniband/core/counters.c                      |   94
-rw-r--r--  drivers/infiniband/core/cq.c                            |   13
-rw-r--r--  drivers/infiniband/core/device.c                        |  456
-rw-r--r--  drivers/infiniband/core/iwcm.c                          |   78
-rw-r--r--  drivers/infiniband/core/iwpm_msg.c                      |   34
-rw-r--r--  drivers/infiniband/core/iwpm_util.c                     |   82
-rw-r--r--  drivers/infiniband/core/iwpm_util.h                     |   19
-rw-r--r--  drivers/infiniband/core/lag.c                           |   11
-rw-r--r--  drivers/infiniband/core/mad.c                           |  523
-rw-r--r--  drivers/infiniband/core/mad_priv.h                      |   76
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c                      |   43
-rw-r--r--  drivers/infiniband/core/netlink.c                       |    3
-rw-r--r--  drivers/infiniband/core/nldev.c                         |  775
-rw-r--r--  drivers/infiniband/core/rdma_core.c                     |   43
-rw-r--r--  drivers/infiniband/core/rdma_core.h                     |    1
-rw-r--r--  drivers/infiniband/core/restrack.c                      |   71
-rw-r--r--  drivers/infiniband/core/roce_gid_mgmt.c                 |   35
-rw-r--r--  drivers/infiniband/core/rw.c                            |   78
-rw-r--r--  drivers/infiniband/core/sa_query.c                      |  558
-rw-r--r--  drivers/infiniband/core/sysfs.c                         |  107
-rw-r--r--  drivers/infiniband/core/ucaps.c                         |  267
-rw-r--r--  drivers/infiniband/core/ucma.c                          |  193
-rw-r--r--  drivers/infiniband/core/ud_header.c                     |   83
-rw-r--r--  drivers/infiniband/core/umem.c                          |  113
-rw-r--r--  drivers/infiniband/core/umem_dmabuf.c                   |  127
-rw-r--r--  drivers/infiniband/core/umem_odp.c                      |  292
-rw-r--r--  drivers/infiniband/core/user_mad.c                      |   83
-rw-r--r--  drivers/infiniband/core/uverbs.h                        |   29
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c                    |  272
-rw-r--r--  drivers/infiniband/core/uverbs_ioctl.c                  |   86
-rw-r--r--  drivers/infiniband/core/uverbs_main.c                   |  101
-rw-r--r--  drivers/infiniband/core/uverbs_marshall.c               |   44
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_counters.c     |    2
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_cq.c           |   88
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_device.c       |    7
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_dmah.c         |  145
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_flow_action.c  |  383
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_mr.c           |  177
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_qp.c           |   33
-rw-r--r--  drivers/infiniband/core/uverbs_uapi.c                   |    4
-rw-r--r--  drivers/infiniband/core/verbs.c                         |  527
52 files changed, 5182 insertions, 2744 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 8ab4eea5a0a5..f483e0c12444 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -33,12 +33,14 @@ ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_std_types_cq.o \
+ uverbs_std_types_dmah.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \
uverbs_uapi.o uverbs_std_types_device.o \
uverbs_std_types_async_fd.o \
uverbs_std_types_srq.o \
uverbs_std_types_wq.o \
- uverbs_std_types_qp.o
+ uverbs_std_types_qp.o \
+ ucaps.o
ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 65e3e7df8a4b..61596cda2b65 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -37,7 +37,6 @@
#include <linux/inetdevice.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
-#include <linux/module.h>
#include <net/arp.h>
#include <net/neighbour.h>
#include <net/route.h>
@@ -349,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst,
static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
{
- struct rtable *rt;
- struct rt6_info *rt6;
-
- if (family == AF_INET) {
- rt = container_of(dst, struct rtable, dst);
- return rt->rt_uses_gateway;
- }
+ if (family == AF_INET)
+ return dst_rtable(dst)->rt_uses_gateway;
- rt6 = container_of(dst, struct rt6_info, dst);
- return rt6->rt6i_flags & RTF_GATEWAY;
+ return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY;
}
static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
@@ -453,63 +446,41 @@ static int addr6_resolve(struct sockaddr *src_sock,
}
#endif
+static bool is_dst_local(const struct dst_entry *dst)
+{
+ if (dst->ops->family == AF_INET)
+ return !!(dst_rtable(dst)->rt_type & RTN_LOCAL);
+ else if (dst->ops->family == AF_INET6)
+ return !!(dst_rt6_info(dst)->rt6i_flags & RTF_LOCAL);
+ else
+ return false;
+}
+
static int addr_resolve_neigh(const struct dst_entry *dst,
const struct sockaddr *dst_in,
struct rdma_dev_addr *addr,
- unsigned int ndev_flags,
u32 seq)
{
- int ret = 0;
-
- if (ndev_flags & IFF_LOOPBACK) {
+ if (is_dst_local(dst)) {
+ /* When the destination is a local entry, the source and destination
+ * are the same. Skip the neighbour lookup.
+ */
memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- } else {
- if (!(ndev_flags & IFF_NOARP)) {
- /* If the device doesn't do ARP internally */
- ret = fetch_ha(dst, addr, dst_in, seq);
- }
+ return 0;
}
- return ret;
-}
-
-static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
- const struct sockaddr *dst_in,
- const struct dst_entry *dst,
- const struct net_device *ndev)
-{
- int ret = 0;
-
- if (dst->dev->flags & IFF_LOOPBACK)
- ret = rdma_translate_ip(dst_in, dev_addr);
- else
- rdma_copy_src_l2_addr(dev_addr, dst->dev);
-
- /*
- * If there's a gateway and type of device not ARPHRD_INFINIBAND,
- * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
- * network type accordingly.
- */
- if (has_gateway(dst, dst_in->sa_family) &&
- ndev->type != ARPHRD_INFINIBAND)
- dev_addr->network = dst_in->sa_family == AF_INET ?
- RDMA_NETWORK_IPV4 :
- RDMA_NETWORK_IPV6;
- else
- dev_addr->network = RDMA_NETWORK_IB;
- return ret;
+ return fetch_ha(dst, addr, dst_in, seq);
}
static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
- unsigned int *ndev_flags,
const struct sockaddr *dst_in,
const struct dst_entry *dst)
{
struct net_device *ndev = READ_ONCE(dst->dev);
- *ndev_flags = ndev->flags;
/* A physical device must be the RDMA device to use */
- if (ndev->flags & IFF_LOOPBACK) {
+ if (is_dst_local(dst)) {
+ int ret;
/*
* RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
* loopback IP address. So if route is resolved to loopback
@@ -519,9 +490,27 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
if (IS_ERR(ndev))
return -ENODEV;
+ ret = rdma_translate_ip(dst_in, dev_addr);
+ if (ret)
+ return ret;
+ } else {
+ rdma_copy_src_l2_addr(dev_addr, dst->dev);
}
- return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+ /*
+ * If there's a gateway and the device type is not ARPHRD_INFINIBAND,
+ * we're definitely in RoCE v2 (as RoCE v1 isn't routable); set the
+ * network type accordingly.
+ */
+ if (has_gateway(dst, dst_in->sa_family) &&
+ ndev->type != ARPHRD_INFINIBAND)
+ dev_addr->network = dst_in->sa_family == AF_INET ?
+ RDMA_NETWORK_IPV4 :
+ RDMA_NETWORK_IPV6;
+ else
+ dev_addr->network = RDMA_NETWORK_IB;
+
+ return 0;
}
static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
@@ -558,7 +547,6 @@ static int addr_resolve(struct sockaddr *src_in,
u32 seq)
{
struct dst_entry *dst = NULL;
- unsigned int ndev_flags = 0;
struct rtable *rt = NULL;
int ret;
@@ -595,7 +583,7 @@ static int addr_resolve(struct sockaddr *src_in,
rcu_read_unlock();
goto done;
}
- ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+ ret = rdma_set_src_addr_rcu(addr, dst_in, dst);
rcu_read_unlock();
/*
@@ -603,7 +591,7 @@ static int addr_resolve(struct sockaddr *src_in,
* only if src addr translation didn't fail.
*/
if (!ret && resolve_neigh)
- ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
+ ret = addr_resolve_neigh(dst, dst_in, addr, seq);
if (src_in->sa_family == AF_INET)
ip_rt_put(rt);
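For readers unfamiliar with the new helpers: dst_rtable() and dst_rt6_info() wrap the container_of() casts that the removed lines open-coded. A minimal sketch of the IPv4 side, assuming the upstream helper keeps the shape visible in the old code above:

static inline struct rtable *dst_rtable_sketch(const struct dst_entry *dst)
{
        /* struct rtable embeds its dst_entry as the 'dst' member, so
         * container_of() recovers the enclosing route entry. */
        return container_of(dst, struct rtable, dst);
}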
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index f82b4260de42..25a060a28301 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -59,7 +59,16 @@ __ib_get_agent_port(const struct ib_device *device, int port_num)
struct ib_agent_port_private *entry;
list_for_each_entry(entry, &ib_agent_port_list, port_list) {
- if (entry->agent[1]->device == device &&
+ /* Need to check both agent[0] and agent[1], as an agent port
+ * may only have one of them
+ */
+ if (entry->agent[0] &&
+ entry->agent[0]->device == device &&
+ entry->agent[0]->port_num == port_num)
+ return entry;
+
+ if (entry->agent[1] &&
+ entry->agent[1]->device == device &&
entry->agent[1]->port_num == port_num)
return entry;
}
@@ -101,8 +110,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *
agent = port_priv->agent[qpn];
ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
if (IS_ERR(ah)) {
- dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
- PTR_ERR(ah));
+ dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah);
return;
}
@@ -172,14 +180,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
}
}
- /* Obtain send only MAD agent for GSI QP */
- port_priv->agent[1] = ib_register_mad_agent(device, port_num,
- IB_QPT_GSI, NULL, 0,
- &agent_send_handler,
- NULL, NULL, 0);
- if (IS_ERR(port_priv->agent[1])) {
- ret = PTR_ERR(port_priv->agent[1]);
- goto error3;
+ if (rdma_cap_ib_cm(device, port_num)) {
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
}
spin_lock_irqsave(&ib_agent_port_list_lock, flags);
@@ -212,7 +222,8 @@ int ib_agent_port_close(struct ib_device *device, int port_num)
list_del(&port_priv->port_list);
spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
- ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[1])
+ ib_unregister_mad_agent(port_priv->agent[1]);
if (port_priv->agent[0])
ib_unregister_mad_agent(port_priv->agent[0]);
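The error print above switches to the %pe printk specifier, which renders an ERR_PTR symbolically (e.g. as -ENOMEM) and makes the explicit PTR_ERR() cast unnecessary. A minimal sketch of the pattern, lifted from the hunk above:

ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
if (IS_ERR(ah)) {
        /* %pe decodes the error pointer; no PTR_ERR() argument needed */
        dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah);
        return;
}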
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index c9e9fc81447e..81cf3c902e81 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -33,7 +33,7 @@
* SOFTWARE.
*/
-#include <linux/module.h>
+#include <linux/if_vlan.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
@@ -46,7 +46,7 @@
struct ib_pkey_cache {
int table_len;
- u16 table[];
+ u16 table[] __counted_by(table_len);
};
struct ib_update_work {
@@ -400,6 +400,9 @@ static void del_gid(struct ib_device *ib_dev, u32 port,
table->data_vec[ix] = NULL;
write_unlock_irq(&table->rwlock);
+ if (rdma_cap_roce_gid_table(ib_dev, port))
+ ib_dev->ops.del_gid(&entry->attr, &entry->context);
+
ndev_storage = entry->ndev_storage;
if (ndev_storage) {
entry->ndev_storage = NULL;
@@ -407,9 +410,6 @@ static void del_gid(struct ib_device *ib_dev, u32 port,
call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
}
- if (rdma_cap_roce_gid_table(ib_dev, port))
- ib_dev->ops.del_gid(&entry->attr, &entry->context);
-
put_gid_entry_locked(entry);
}
@@ -582,8 +582,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
out_unlock:
mutex_unlock(&table->lock);
if (ret)
- pr_warn("%s: unable to add gid %pI6 error=%d\n",
- __func__, gid->raw, ret);
+ pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
return ret;
}
@@ -794,7 +794,6 @@ err_free_table:
static void release_gid_table(struct ib_device *device,
struct ib_gid_table *table)
{
- bool leak = false;
int i;
if (!table)
@@ -803,15 +802,12 @@ static void release_gid_table(struct ib_device *device,
for (i = 0; i < table->sz; i++) {
if (is_gid_entry_free(table->data_vec[i]))
continue;
- if (kref_read(&table->data_vec[i]->kref) > 1) {
- dev_err(&device->dev,
- "GID entry ref leak for index %d ref=%u\n", i,
- kref_read(&table->data_vec[i]->kref));
- leak = true;
- }
+
+ WARN_ONCE(true,
+ "GID entry ref leak for dev %s index %d ref=%u\n",
+ dev_name(&device->dev), i,
+ kref_read(&table->data_vec[i]->kref));
}
- if (leak)
- return;
mutex_destroy(&table->lock);
kfree(table->data_vec);
@@ -955,7 +951,7 @@ int rdma_query_gid(struct ib_device *device, u32 port_num,
{
struct ib_gid_table *table;
unsigned long flags;
- int res = -EINVAL;
+ int res;
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
@@ -963,9 +959,15 @@ int rdma_query_gid(struct ib_device *device, u32 port_num,
table = rdma_gid_table(device, port_num);
read_lock_irqsave(&table->rwlock, flags);
- if (index < 0 || index >= table->sz ||
- !is_gid_entry_valid(table->data_vec[index]))
+ if (index < 0 || index >= table->sz) {
+ res = -EINVAL;
+ goto done;
+ }
+
+ if (!is_gid_entry_valid(table->data_vec[index])) {
+ res = -ENOENT;
goto done;
+ }
memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
res = 0;
@@ -1125,41 +1127,6 @@ err:
}
EXPORT_SYMBOL(ib_find_cached_pkey);
-int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num,
- u16 pkey, u16 *index)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int i;
- int ret = -ENOENT;
-
- if (!rdma_is_port_valid(device, port_num))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache_lock, flags);
-
- cache = device->port_data[port_num].cache.pkey;
- if (!cache) {
- ret = -EINVAL;
- goto err;
- }
-
- *index = -1;
-
- for (i = 0; i < cache->table_len; ++i)
- if (cache->table[i] == pkey) {
- *index = i;
- ret = 0;
- break;
- }
-
-err:
- read_unlock_irqrestore(&device->cache_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_find_exact_cached_pkey);
-
int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
{
unsigned long flags;
@@ -1416,7 +1383,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
*vlan_id = vlan_dev_vlan_id(ndev);
} else {
/* If the netdev is upper device and if it's lower
- * device is vlan device, consider vlan id of the
+ * device is vlan device, consider vlan id of
* the lower vlan device for this gid entry.
*/
netdev_walk_all_lower_dev_rcu(attr->ndev,
@@ -1429,7 +1396,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
EXPORT_SYMBOL(rdma_read_gid_l2_fields);
static int config_non_roce_gid_cache(struct ib_device *device,
- u32 port, int gid_tbl_len)
+ u32 port, struct ib_port_attr *tprops)
{
struct ib_gid_attr gid_attr = {};
struct ib_gid_table *table;
@@ -1441,7 +1408,7 @@ static int config_non_roce_gid_cache(struct ib_device *device,
table = rdma_gid_table(device, port);
mutex_lock(&table->lock);
- for (i = 0; i < gid_tbl_len; ++i) {
+ for (i = 0; i < tprops->gid_tbl_len; ++i) {
if (!device->ops.query_gid)
continue;
ret = device->ops.query_gid(device, port, i, &gid_attr.gid);
@@ -1451,7 +1418,20 @@ static int config_non_roce_gid_cache(struct ib_device *device,
i);
goto err;
}
+
+ if (rdma_protocol_iwarp(device, port)) {
+ struct net_device *ndev;
+
+ ndev = ib_device_get_netdev(device, port);
+ if (!ndev)
+ continue;
+ RCU_INIT_POINTER(gid_attr.ndev, ndev);
+ dev_put(ndev);
+ }
+
gid_attr.index = i;
+ tprops->subnet_prefix =
+ be64_to_cpu(gid_attr.gid.global.subnet_prefix);
add_modify_gid(table, &gid_attr);
}
err:
@@ -1484,7 +1464,7 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
if (!rdma_protocol_roce(device, port) && update_gids) {
ret = config_non_roce_gid_cache(device, port,
- tprops->gid_tbl_len);
+ tprops);
if (ret)
goto err;
}
@@ -1521,6 +1501,12 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
device->port_data[port].cache.pkey = pkey_cache;
}
device->port_data[port].cache.lmc = tprops->lmc;
+
+ if (device->port_data[port].cache.port_state != IB_PORT_NOP &&
+ device->port_data[port].cache.port_state != tprops->state)
+ ibdev_info(device, "Port: %d Link %s\n", port,
+ ib_port_state_to_str(tprops->state));
+
device->port_data[port].cache.port_state = tprops->state;
device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
@@ -1619,16 +1605,16 @@ int ib_cache_setup_one(struct ib_device *device)
u32 p;
int err;
- rwlock_init(&device->cache_lock);
-
err = gid_table_setup_one(device);
if (err)
return err;
rdma_for_each_port (device, p) {
err = ib_cache_update(device, p, true, true, true);
- if (err)
+ if (err) {
+ gid_table_cleanup_one(device);
return err;
+ }
}
return 0;
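The __counted_by() annotation added to struct ib_pkey_cache ties the flexible array to its length field, letting fortified memory helpers and UBSAN bounds-check accesses at runtime. A minimal sketch of the contract, assuming the usual struct_size() allocation pattern (the length field must be set before the array is touched):

struct ib_pkey_cache {
        int table_len;
        u16 table[] __counted_by(table_len);
};

struct ib_pkey_cache *cache;
int len = 16;        /* hypothetical table length */

cache = kmalloc(struct_size(cache, table, len), GFP_KERNEL);
if (!cache)
        return -ENOMEM;
cache->table_len = len;        /* set the bound before touching table[] */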
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index c903b74f46a4..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -34,6 +34,9 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
+#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
+#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
+
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
[IB_CM_REJ_NO_EEC] = "no EEC",
@@ -92,8 +95,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
struct cm_work *work);
static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_rep_param *param);
-static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
- const void *private_data, u8 private_data_len);
+static void cm_issue_dreq(struct cm_id_private *cm_id_priv);
static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
void *private_data, u8 private_data_len);
static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
@@ -158,6 +160,7 @@ struct cm_counter_attribute {
struct cm_port {
struct cm_device *cm_dev;
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
u32 port_num;
atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT];
};
@@ -165,7 +168,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@@ -175,6 +178,7 @@ struct cm_device {
struct cm_av {
struct cm_port *port;
struct rdma_ah_attr ah_attr;
+ u16 dlid_datapath;
u16 pkey_index;
u8 timeout;
};
@@ -238,7 +242,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
- u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@@ -271,7 +274,8 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
complete(&cm_id_priv->comp);
}
-static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+static struct ib_mad_send_buf *
+cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent)
{
struct ib_mad_agent *mad_agent;
struct ib_mad_send_buf *m;
@@ -282,8 +286,9 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
- mad_agent = cm_id_priv->av.port->mad_agent;
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ mad_agent = rep_agent ? cm_id_priv->av.port->rep_agent :
+ cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
goto out;
@@ -305,42 +310,54 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
goto out;
}
- /* Timeout set by caller if response is expected. */
m->ah = ah;
- m->retries = cm_id_priv->max_cm_retries;
-
- refcount_inc(&cm_id_priv->refcount);
- m->context[0] = cm_id_priv;
out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
-static void cm_free_msg(struct ib_mad_send_buf *msg)
+static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
{
- struct cm_id_private *cm_id_priv = msg->context[0];
+ return cm_alloc_msg_agent(cm_id_priv, false);
+}
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
if (msg->ah)
rdma_destroy_ah(msg->ah, 0);
- cm_deref_id(cm_id_priv);
ib_free_send_mad(msg);
}
static struct ib_mad_send_buf *
-cm_alloc_priv_msg(struct cm_id_private *cm_id_priv)
+cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state,
+ bool rep_agent)
{
struct ib_mad_send_buf *msg;
lockdep_assert_held(&cm_id_priv->lock);
- msg = cm_alloc_msg(cm_id_priv);
+ msg = cm_alloc_msg_agent(cm_id_priv, rep_agent);
if (IS_ERR(msg))
return msg;
+
cm_id_priv->msg = msg;
+ refcount_inc(&cm_id_priv->refcount);
+ msg->context[0] = cm_id_priv;
+ msg->context[1] = (void *) (unsigned long) state;
+
+ msg->retries = cm_id_priv->max_cm_retries;
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+
return msg;
}
+static struct ib_mad_send_buf *
+cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
+{
+ return cm_alloc_priv_msg_rep(cm_id_priv, state, false);
+}
+
static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
{
struct cm_id_private *cm_id_priv = msg->context[0];
@@ -356,13 +373,20 @@ static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
ib_free_send_mad(msg);
}
-static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
- struct ib_mad_recv_wc *mad_recv_wc)
+static struct ib_mad_send_buf *
+cm_alloc_response_msg_no_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ bool direct_retry)
{
- return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
- 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC,
- IB_MGMT_BASE_VERSION);
+ struct ib_mad_send_buf *m;
+
+ m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+ if (!IS_ERR(m))
+ m->context[0] = direct_retry ? CM_DIRECT_RETRY_CTX : NULL;
+
+ return m;
}
static int cm_create_response_msg_ah(struct cm_port *port,
@@ -382,12 +406,13 @@ static int cm_create_response_msg_ah(struct cm_port *port,
static int cm_alloc_response_msg(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc,
+ bool direct_retry,
struct ib_mad_send_buf **msg)
{
struct ib_mad_send_buf *m;
int ret;
- m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
+ m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry);
if (IS_ERR(m))
return PTR_ERR(m);
@@ -401,13 +426,6 @@ static int cm_alloc_response_msg(struct cm_port *port,
return 0;
}
-static void cm_free_response_msg(struct ib_mad_send_buf *msg)
-{
- if (msg->ah)
- rdma_destroy_ah(msg->ah, 0);
- ib_free_send_mad(msg);
-}
-
static void *cm_copy_private_data(const void *private_data, u8 private_data_len)
{
void *data;
@@ -617,7 +635,6 @@ static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
struct rb_node *parent = NULL;
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
- __be64 service_mask = cm_id_priv->id.service_mask;
unsigned long flags;
spin_lock_irqsave(&cm.lock, flags);
@@ -625,9 +642,16 @@ static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
- if ((cur_cm_id_priv->id.service_mask & service_id) ==
- (service_mask & cur_cm_id_priv->id.service_id) &&
- (cm_id_priv->id.device == cur_cm_id_priv->id.device)) {
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else {
/*
* Sharing an ib_cm_id with different handlers is not
* supported
@@ -643,17 +667,6 @@ static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
spin_unlock_irqrestore(&cm.lock, flags);
return cur_cm_id_priv;
}
-
- if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
- link = &(*link)->rb_left;
- else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
- link = &(*link)->rb_right;
- else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
- link = &(*link)->rb_left;
- else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
- link = &(*link)->rb_right;
- else
- link = &(*link)->rb_right;
}
cm_id_priv->listen_sharecount++;
rb_link_node(&cm_id_priv->service_node, parent, link);
@@ -670,12 +683,7 @@ static struct cm_id_private *cm_find_listen(struct ib_device *device,
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
- if ((cm_id_priv->id.service_mask & service_id) ==
- cm_id_priv->id.service_id &&
- (cm_id_priv->id.device == device)) {
- refcount_inc(&cm_id_priv->refcount);
- return cm_id_priv;
- }
+
if (device < cm_id_priv->id.device)
node = node->rb_left;
else if (device > cm_id_priv->id.device)
@@ -684,8 +692,10 @@ static struct cm_id_private *cm_find_listen(struct ib_device *device,
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
- else
- node = node->rb_right;
+ else {
+ refcount_inc(&cm_id_priv->refcount);
+ return cm_id_priv;
+ }
}
return NULL;
}
@@ -1032,13 +1042,27 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
}
}
+static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id,
+ enum ib_cm_state old_state)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ pr_err_ratelimited("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__,
+ cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount));
+}
+
static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
+ enum ib_cm_state old_state;
+ unsigned long timeout;
struct cm_work *work;
+ int ret;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irq(&cm_id_priv->lock);
+ old_state = cm_id->state;
retest:
switch (cm_id->state) {
case IB_CM_LISTEN:
@@ -1102,7 +1126,8 @@ retest:
cm_id->state = IB_CM_IDLE;
break;
}
- cm_send_dreq_locked(cm_id_priv, NULL, 0);
+ cm_issue_dreq(cm_id_priv);
+ cm_enter_timewait(cm_id_priv);
goto retest;
case IB_CM_DREQ_SENT:
ib_cancel_mad(cm_id_priv->msg);
@@ -1142,7 +1167,13 @@ retest:
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
- wait_for_completion(&cm_id_priv->comp);
+ timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4);
+ do {
+ ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout);
+ if (!ret) /* timeout happened */
+ cm_destroy_id_wait_timeout(cm_id, old_state);
+ } while (!ret);
+
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
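The bounded wait above sizes each slice as the whole MAD retry budget plus 25% slack, so a stuck cm_id is reported periodically instead of hanging silently. A worked example with hypothetical values (CMA_MAX_CM_RETRIES is 15 per cma.c below; timeout_ms = 4000 is assumed):

/* (15 * 4000 * 5) / 4 = 75000 ms, i.e. the timeout warning fires at
 * most once per ~75 seconds while the destroy path keeps waiting. */
unsigned long timeout = msecs_to_jiffies((15 * 4000 * 5) / 4);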
@@ -1158,22 +1189,17 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id,
- __be64 service_mask)
+static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id)
{
- service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
- service_id &= service_mask;
if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
(service_id != IB_CM_ASSIGN_SERVICE_ID))
return -EINVAL;
- if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++);
- cm_id_priv->id.service_mask = ~cpu_to_be64(0);
- } else {
+ else
cm_id_priv->id.service_id = service_id;
- cm_id_priv->id.service_mask = service_mask;
- }
+
return 0;
}
@@ -1185,12 +1211,8 @@ static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id,
* and service ID resolution requests. The service ID should be specified
* network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
* assign a service ID to the caller.
- * @service_mask: Mask applied to service ID used to listen across a
- * range of service IDs. If set to 0, the service ID is matched
- * exactly. This parameter is ignored if %service_id is set to
- * IB_CM_ASSIGN_SERVICE_ID.
*/
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id)
{
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
@@ -1203,7 +1225,7 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
goto out;
}
- ret = cm_init_listen(cm_id_priv, service_id, service_mask);
+ ret = cm_init_listen(cm_id_priv, service_id);
if (ret)
goto out;
@@ -1251,9 +1273,11 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
if (IS_ERR(cm_id_priv))
return ERR_CAST(cm_id_priv);
- err = cm_init_listen(cm_id_priv, service_id, 0);
- if (err)
+ err = cm_init_listen(cm_id_priv, service_id);
+ if (err) {
+ ib_destroy_cm_id(&cm_id_priv->id);
return ERR_PTR(err);
+ }
spin_lock_irq(&cm_id_priv->lock);
listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler);
@@ -1287,10 +1311,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@@ -1319,6 +1343,7 @@ static void cm_format_req(struct cm_req_msg *req_msg,
struct sa_path_rec *pri_path = param->primary_path;
struct sa_path_rec *alt_path = param->alternate_path;
bool pri_ext = false;
+ __be16 lid;
if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA)
pri_ext = opa_is_extended_lid(pri_path->opa.dlid,
@@ -1378,9 +1403,16 @@ static void cm_format_req(struct cm_req_msg *req_msg,
htons(ntohl(sa_path_get_dlid(
pri_path)))));
} else {
+
+ if (param->primary_path_inbound) {
+ lid = param->primary_path_inbound->ib.dlid;
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(lid));
+ } else
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(IB_LID_PERMISSIVE));
+
/* Work-around until there's a way to obtain remote LID info */
- IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
- be16_to_cpu(IB_LID_PERMISSIVE));
IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
be16_to_cpu(IB_LID_PERMISSIVE));
}
@@ -1520,7 +1552,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
}
}
cm_id->service_id = param->service_id;
- cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = cm_convert_to_ms(
param->primary_path->packet_life_time) * 2 +
cm_convert_to_ms(
@@ -1536,10 +1567,14 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
cm_move_av_from_path(&cm_id_priv->av, &av);
+ if (param->primary_path_outbound)
+ cm_id_priv->av.dlid_datapath =
+ be16_to_cpu(param->primary_path_outbound->ib.dlid);
+
if (param->alternate_path)
cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
@@ -1548,8 +1583,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
req_msg = (struct cm_req_msg *)msg->mad;
cm_format_req(req_msg, cm_id_priv, param);
cm_id_priv->tid = req_msg->hdr.tid;
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT;
cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
@@ -1580,7 +1613,7 @@ static int cm_issue_rej(struct cm_port *port,
struct cm_rej_msg *rej_msg, *rcv_msg;
int ret;
- ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg);
if (ret)
return ret;
@@ -1606,7 +1639,7 @@ static int cm_issue_rej(struct cm_port *port,
IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
return ret;
}
@@ -1630,14 +1663,13 @@ static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num,
static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg,
struct sa_path_rec *primary_path,
- struct sa_path_rec *alt_path)
+ struct sa_path_rec *alt_path,
+ struct ib_wc *wc)
{
u32 lid;
if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) {
- sa_path_set_dlid(primary_path,
- IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID,
- req_msg));
+ sa_path_set_dlid(primary_path, wc->slid);
sa_path_set_slid(primary_path,
IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID,
req_msg));
@@ -1674,7 +1706,8 @@ static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg,
static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
struct sa_path_rec *primary_path,
- struct sa_path_rec *alt_path)
+ struct sa_path_rec *alt_path,
+ struct ib_wc *wc)
{
primary_path->dgid =
*IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg);
@@ -1732,7 +1765,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
if (sa_path_is_roce(alt_path))
alt_path->roce.route_resolved = false;
}
- cm_format_path_lid_from_req(req_msg, primary_path, alt_path);
+ cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc);
}
static u16 cm_get_bth_pkey(struct cm_work *work)
@@ -1853,7 +1886,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
- enum cm_msg_response msg_mraed, u8 service_timeout,
+ enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@@ -1862,7 +1895,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
- IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@@ -1933,7 +1966,7 @@ static void cm_dup_req_handler(struct cm_work *work,
}
spin_unlock_irq(&cm_id_priv->lock);
- ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
if (ret)
return;
@@ -1941,7 +1974,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@@ -1962,7 +1995,7 @@ static void cm_dup_req_handler(struct cm_work *work,
return;
unlock: spin_unlock_irq(&cm_id_priv->lock);
-free: cm_free_response_msg(msg);
+free: cm_free_msg(msg);
}
static struct cm_id_private *cm_match_req(struct cm_work *work,
@@ -2077,7 +2110,6 @@ static int cm_req_handler(struct cm_work *work)
cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg));
cm_id_priv->id.service_id =
cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
- cm_id_priv->id.service_mask = ~cpu_to_be64(0);
cm_id_priv->tid = req_msg->hdr.tid;
cm_id_priv->timeout_ms = cm_convert_to_ms(
IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg));
@@ -2146,7 +2178,7 @@ static int cm_req_handler(struct cm_work *work)
if (cm_req_has_alt_path(req_msg))
work->path[1].rec_type = work->path[0].rec_type;
cm_format_paths_from_req(req_msg, &work->path[0],
- &work->path[1]);
+ &work->path[1], work->mad_recv_wc->wc);
if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
sa_path_set_dmac(&work->path[0],
cm_id_priv->av.ah_attr.roce.dmac);
@@ -2171,6 +2203,10 @@ static int cm_req_handler(struct cm_work *work)
NULL, 0);
goto rejected;
}
+ if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB)
+ cm_id_priv->av.dlid_datapath =
+ IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg);
+
if (cm_req_has_alt_path(req_msg)) {
ret = cm_init_av_by_path(&work->path[1], NULL,
&cm_id_priv->alt_av);
@@ -2273,7 +2309,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
goto out;
}
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out;
@@ -2281,8 +2317,6 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
rep_msg = (struct cm_rep_msg *) msg->mad;
cm_format_rep(rep_msg, cm_id_priv, param);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
trace_icm_send_rep(cm_id);
ret = ib_post_send_mad(msg, NULL);
@@ -2423,7 +2457,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
atomic_long_inc(
&work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]);
- ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
if (ret)
goto deref;
@@ -2434,7 +2468,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@@ -2448,7 +2482,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
goto deref;
unlock: spin_unlock_irq(&cm_id_priv->lock);
-free: cm_free_response_msg(msg);
+free: cm_free_msg(msg);
deref: cm_deref_id(cm_id_priv);
}
@@ -2632,59 +2666,68 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
private_data_len);
}
-static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
- const void *private_data, u8 private_data_len)
+static void cm_issue_dreq(struct cm_id_private *cm_id_priv)
{
struct ib_mad_send_buf *msg;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return;
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0);
+
+ trace_icm_send_dreq(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
return -EINVAL;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
trace_icm_dreq_skipped(&cm_id_priv->id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto unlock;
}
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
ib_cancel_mad(cm_id_priv->msg);
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT);
if (IS_ERR(msg)) {
cm_enter_timewait(cm_id_priv);
- return PTR_ERR(msg);
+ ret = PTR_ERR(msg);
+ goto unlock;
}
cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
private_data, private_data_len);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
trace_icm_send_dreq(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_enter_timewait(cm_id_priv);
cm_free_priv_msg(msg);
- return ret;
+ goto unlock;
}
cm_id_priv->id.state = IB_CM_DREQ_SENT;
- return 0;
-}
-
-int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
- u8 private_data_len)
-{
- struct cm_id_private *cm_id_priv =
- container_of(cm_id, struct cm_id_private, id);
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&cm_id_priv->lock, flags);
- ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len);
+unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
@@ -2770,7 +2813,7 @@ static int cm_issue_drep(struct cm_port *port,
struct cm_drep_msg *drep_msg;
int ret;
- ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg);
if (ret)
return ret;
@@ -2788,7 +2831,7 @@ static int cm_issue_drep(struct cm_port *port,
IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
return ret;
}
@@ -2824,6 +2867,7 @@ static int cm_dreq_handler(struct cm_work *work)
switch (cm_id_priv->id.state) {
case IB_CM_REP_SENT:
case IB_CM_DREQ_SENT:
+ case IB_CM_MRA_REP_RCVD:
ib_cancel_mad(cm_id_priv->msg);
break;
case IB_CM_ESTABLISHED:
@@ -2831,12 +2875,11 @@ static int cm_dreq_handler(struct cm_work *work)
cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
ib_cancel_mad(cm_id_priv->msg);
break;
- case IB_CM_MRA_REP_RCVD:
- break;
case IB_CM_TIMEWAIT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_DREQ_COUNTER]);
- msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
+ true);
if (IS_ERR(msg))
goto unlock;
@@ -2847,7 +2890,7 @@ static int cm_dreq_handler(struct cm_work *work)
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
goto deref;
case IB_CM_DREQ_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
@@ -2913,6 +2956,8 @@ static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
(ari && ari_length > IB_CM_REJ_ARI_LENGTH))
return -EINVAL;
+ trace_icm_send_rej(&cm_id_priv->id, reason);
+
switch (state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
@@ -2943,7 +2988,6 @@ static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
return -EINVAL;
}
- trace_icm_send_rej(&cm_id_priv->id, reason);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_free_msg(msg);
@@ -3064,26 +3108,13 @@ out:
return -EINVAL;
}
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len)
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
- struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
- enum cm_msg_response msg_response;
- void *data;
unsigned long flags;
- int ret;
-
- if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
- return -EINVAL;
-
- data = cm_copy_private_data(private_data, private_data_len);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3092,58 +3123,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
- msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
- trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
- if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
- msg = cm_alloc_msg(cm_id_priv);
- if (IS_ERR(msg)) {
- ret = PTR_ERR(msg);
- goto error_unlock;
- }
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- msg_response, service_timeout,
- private_data, private_data_len);
- trace_icm_send_mra(cm_id);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error_free_msg;
- }
-
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
- cm_id_priv->service_timeout = service_timeout;
- cm_set_private_data(cm_id_priv, data, private_data_len);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return 0;
+ cm_set_private_data(cm_id_priv, NULL, 0);
-error_free_msg:
- cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- kfree(data);
return ret;
}
-EXPORT_SYMBOL(ib_send_cm_mra);
+EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@@ -3322,7 +3328,7 @@ static int cm_lap_handler(struct cm_work *work)
ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av);
if (ret) {
rdma_destroy_ah_attr(&ah_attr);
- return -EINVAL;
+ goto deref;
}
spin_lock_irq(&cm_id_priv->lock);
@@ -3340,20 +3346,20 @@ static int cm_lap_handler(struct cm_work *work)
case IB_CM_MRA_LAP_SENT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_LAP_COUNTER]);
- msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
+ true);
if (IS_ERR(msg))
goto unlock;
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
- cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
goto deref;
case IB_CM_LAP_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
@@ -3485,7 +3491,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
cm_move_av_from_path(&cm_id_priv->av, &av);
cm_id->service_id = param->service_id;
- cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = param->timeout_ms;
cm_id_priv->max_cm_retries = param->max_cm_retries;
if (cm_id->state != IB_CM_IDLE) {
@@ -3493,7 +3498,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
goto out_unlock;
}
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
@@ -3501,8 +3506,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv,
param);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT;
trace_icm_send_sidr_req(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
@@ -3560,7 +3563,6 @@ static int cm_sidr_req_handler(struct cm_work *work)
cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg));
cm_id_priv->id.service_id =
cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
- cm_id_priv->id.service_mask = ~cpu_to_be64(0);
cm_id_priv->tid = sidr_req_msg->hdr.tid;
wc = work->mad_recv_wc->wc;
@@ -3749,17 +3751,18 @@ out:
static void cm_process_send_error(struct cm_id_private *cm_id_priv,
struct ib_mad_send_buf *msg,
- enum ib_cm_state state,
enum ib_wc_status wc_status)
{
+ enum ib_cm_state state = (unsigned long) msg->context[1];
struct ib_cm_event cm_event = {};
int ret;
- /* Discard old sends or ones without a response. */
+ /* Discard old sends. */
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
cm_free_msg(msg);
+ cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@@ -3807,9 +3810,7 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_send_wc)
{
struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
- struct cm_id_private *cm_id_priv = msg->context[0];
- enum ib_cm_state state =
- (enum ib_cm_state)(unsigned long)msg->context[1];
+ struct cm_id_private *cm_id_priv;
struct cm_port *port;
u16 attr_index;
@@ -3817,13 +3818,12 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
attr_index = be16_to_cpu(((struct ib_mad_hdr *)
msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
- /*
- * If the send was in response to a received message (context[0] is not
- * set to a cm_id), and is not a REJ, then it is a send that was
- * manually retried.
- */
- if (!cm_id_priv && (attr_index != CM_REJ_COUNTER))
+ if (msg->context[0] == CM_DIRECT_RETRY_CTX) {
msg->retries = 1;
+ cm_id_priv = NULL;
+ } else {
+ cm_id_priv = msg->context[0];
+ }
atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]);
if (msg->retries)
@@ -3831,10 +3831,9 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
&port->counters[CM_XMIT_RETRIES][attr_index]);
if (cm_id_priv)
- cm_process_send_error(cm_id_priv, msg, state,
- mad_send_wc->status);
+ cm_process_send_error(cm_id_priv, msg, mad_send_wc->status);
else
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
}
static void cm_work_handler(struct work_struct *_work)
@@ -4097,9 +4096,18 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX | IB_QP_PORT;
qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
- if (cm_id_priv->responder_resources)
+ if (cm_id_priv->responder_resources) {
+ struct ib_device *ib_dev = cm_id_priv->id.device;
+ u64 support_flush = ib_dev->attrs.device_cap_flags &
+ (IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT);
+ u32 flushable = support_flush ?
+ (IB_ACCESS_FLUSH_GLOBAL |
+ IB_ACCESS_FLUSH_PERSISTENT) : 0;
+
qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_ATOMIC;
+ IB_ACCESS_REMOTE_ATOMIC |
+ flushable;
+ }
qp_attr->pkey_index = cm_id_priv->av.pkey_index;
if (cm_id_priv->av.port)
qp_attr->port_num = cm_id_priv->av.port->port_num;
@@ -4133,6 +4141,10 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
IB_QP_DEST_QPN | IB_QP_RQ_PSN;
qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) &&
+ cm_id_priv->av.dlid_datapath &&
+ (cm_id_priv->av.dlid_datapath != 0xffff))
+ qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath;
qp_attr->path_mtu = cm_id_priv->path_mtu;
qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
@@ -4342,7 +4354,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@@ -4382,9 +4394,22 @@ static int cm_add_one(struct ib_device *ib_device)
goto error2;
}
+ port->rep_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ NULL,
+ 0,
+ cm_send_handler,
+ NULL,
+ port,
+ 0);
+ if (IS_ERR(port->rep_agent)) {
+ ret = PTR_ERR(port->rep_agent);
+ goto error3;
+ }
+
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
- goto error3;
+ goto error4;
count++;
}
@@ -4399,6 +4424,8 @@ static int cm_add_one(struct ib_device *ib_device)
write_unlock_irqrestore(&cm.device_lock, flags);
return 0;
+error4:
+ ib_unregister_mad_agent(port->rep_agent);
error3:
ib_unregister_mad_agent(port->mad_agent);
error2:
@@ -4412,6 +4439,7 @@ error1:
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->rep_agent);
ib_unregister_mad_agent(port->mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
@@ -4441,12 +4469,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
rdma_for_each_port (ib_device, i) {
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
mad_agent = port->mad_agent;
+ rep_agent = port->rep_agent;
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
/*
* We flush the queue here after the going_down set, this
@@ -4458,10 +4488,12 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ port->rep_agent = NULL;
+ write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
+ ib_unregister_mad_agent(rep_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
}
@@ -4485,7 +4517,7 @@ static int __init ib_cm_init(void)
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
- cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
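A recurring theme in this file is the mad_agent_lock conversion from spinlock_t to rwlock_t: concurrent senders now share the read side, while only cm_remove_one() takes the write side to NULL the agent pointers. A minimal sketch of the read-side pattern (hypothetical helper, mirroring cm_alloc_msg_agent() above):

static struct ib_mad_send_buf *cm_sketch_alloc(struct cm_port *port,
                                               bool rep_agent)
{
        struct ib_mad_send_buf *m = ERR_PTR(-EINVAL);
        struct ib_mad_agent *agent;

        read_lock(&port->cm_dev->mad_agent_lock);
        agent = rep_agent ? port->rep_agent : port->mad_agent;
        if (agent)        /* NULLed under write_lock() in cm_remove_one() */
                m = ib_create_send_mad(agent, 1, 0, 0, IB_MGMT_MAD_HDR,
                                       IB_MGMT_MAD_DATA, GFP_ATOMIC,
                                       IB_MGMT_BASE_VERSION);
        read_unlock(&port->cm_dev->mad_agent_lock);
        return m;
}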
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
index e9d282679ef1..4a4987da69d4 100644
--- a/drivers/infiniband/core/cm_trace.h
+++ b/drivers/infiniband/core/cm_trace.h
@@ -16,7 +16,7 @@
#include <linux/tracepoint.h>
#include <rdma/ib_cm.h>
-#include <trace/events/rdma.h>
+#include <trace/misc/rdma.h>
/*
* enum ib_cm_state, from include/rdma/ib_cm.h
@@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
-DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 515a7e95a421..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -11,6 +11,7 @@
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/random.h>
+#include <linux/rbtree.h>
#include <linux/igmp.h>
#include <linux/xarray.h>
#include <linux/inetdevice.h>
@@ -20,6 +21,7 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netevent.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/ip_fib.h>
@@ -44,8 +46,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
-#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
-#define CMA_IBOE_PACKET_LIFETIME 18
+#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
static const char * const cma_events[] = {
@@ -67,8 +68,10 @@ static const char * const cma_events[] = {
[RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit",
};
-static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr,
- union ib_gid *mgid);
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
+ enum ib_gid_type gid_type);
+
+static void cma_netevent_work_handler(struct work_struct *_work);
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
@@ -142,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
-/**
- * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
- * @res: rdma resource tracking entry pointer
- */
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
-{
- struct rdma_id_private *id_priv =
- container_of(res, struct rdma_id_private, res);
-
- return &id_priv->id;
-}
-EXPORT_SYMBOL(rdma_res_to_id);
-
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@@ -168,6 +158,9 @@ static struct ib_sa_client sa_client;
static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
static DEFINE_MUTEX(lock);
+static struct rb_root id_table = RB_ROOT;
+/* Serialize operations on the id_table tree */
+static DEFINE_SPINLOCK(id_table_lock);
static struct workqueue_struct *cma_wq;
static unsigned int cma_pernet_id;
@@ -202,6 +195,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
}
}
+struct id_table_entry {
+ struct list_head id_list;
+ struct rb_node rb_node;
+};
+
struct cma_device {
struct list_head list;
struct ib_device *device;
@@ -420,11 +418,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
return hdr->ip_version >> 4;
}
-static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
{
hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
}
+static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *)&id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *)&id_priv->id.route.addr.dst_addr;
+}
+
static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
{
struct in_device *in_dev = NULL;
@@ -445,6 +453,124 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
return (in_dev) ? 0 : -ENODEV;
}
+static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
+ struct id_table_entry *entry_b)
+{
+ struct rdma_id_private *id_priv = list_first_entry(
+ &entry_b->id_list, struct rdma_id_private, id_list_entry);
+ int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if;
+ struct sockaddr *sb = cma_dst_addr(id_priv);
+
+ if (ifindex_a != ifindex_b)
+ return (ifindex_a > ifindex_b) ? 1 : -1;
+
+ if (sa->sa_family != sb->sa_family)
+ return sa->sa_family - sb->sa_family;
+
+ if (sa->sa_family == AF_INET &&
+ __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) {
+ return memcmp(&((struct sockaddr_in *)sa)->sin_addr,
+ &((struct sockaddr_in *)sb)->sin_addr,
+ sizeof(((struct sockaddr_in *)sa)->sin_addr));
+ }
+
+ if (sa->sa_family == AF_INET6 &&
+ __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) {
+ return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr,
+ &((struct sockaddr_in6 *)sb)->sin6_addr);
+ }
+
+ return -1;
+}
+
+static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
+{
+ struct rb_node **new, *parent = NULL;
+ struct id_table_entry *this, *node;
+ unsigned long flags;
+ int result;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ new = &id_table.rb_node;
+ while (*new) {
+ this = container_of(*new, struct id_table_entry, rb_node);
+ result = compare_netdev_and_ip(
+ node_id_priv->id.route.addr.dev_addr.bound_dev_if,
+ cma_dst_addr(node_id_priv), this);
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else {
+ list_add_tail(&node_id_priv->id_list_entry,
+ &this->id_list);
+ kfree(node);
+ goto unlock;
+ }
+ }
+
+ INIT_LIST_HEAD(&node->id_list);
+ list_add_tail(&node_id_priv->id_list_entry, &node->id_list);
+
+ rb_link_node(&node->rb_node, parent, new);
+ rb_insert_color(&node->rb_node, &id_table);
+
+unlock:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+ return 0;
+}
+
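Annotation: the insert walk above is the stock kernel rb-tree idiom with one twist: ids whose (ifindex, destination address) key compares equal are chained on the existing node's id_list instead of being rejected. A generic sketch of that pattern (hypothetical names, assuming <linux/rbtree.h> and <linux/list.h>), not part of the patch:

struct entry {
	struct rb_node rb;
	struct list_head bucket;	/* peers sharing an equal key */
};

static void bucket_insert(struct rb_root *root, struct entry *e,
			  int (*cmp)(struct entry *a, struct entry *b))
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	while (*new) {
		struct entry *this = rb_entry(*new, struct entry, rb);
		int r = cmp(e, this);

		parent = *new;
		if (r < 0)
			new = &(*new)->rb_left;
		else if (r > 0)
			new = &(*new)->rb_right;
		else {
			/* equal key: join the existing bucket, no new node */
			list_add_tail(&e->bucket, &this->bucket);
			return;
		}
	}
	INIT_LIST_HEAD(&e->bucket);
	rb_link_node(&e->rb, parent, new);
	rb_insert_color(&e->rb, root);
}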
+static struct id_table_entry *
+node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
+{
+ struct rb_node *node = root->rb_node;
+ struct id_table_entry *data;
+ int result;
+
+ while (node) {
+ data = container_of(node, struct id_table_entry, rb_node);
+ result = compare_netdev_and_ip(ifindex, sa, data);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
+
+ return NULL;
+}
+
+static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
+{
+ struct id_table_entry *data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ if (list_empty(&id_priv->id_list_entry))
+ goto out;
+
+ data = node_from_ndev_ip(&id_table,
+ id_priv->id.route.addr.dev_addr.bound_dev_if,
+ cma_dst_addr(id_priv));
+ if (!data)
+ goto out;
+
+ list_del_init(&id_priv->id_list_entry);
+ if (list_empty(&data->id_list)) {
+ rb_erase(&data->rb_node, &id_table);
+ kfree(data);
+ }
+out:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+}
+
static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
struct cma_device *cma_dev)
{
@@ -453,7 +579,7 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
id_priv->id.device = cma_dev->device;
id_priv->id.route.addr.dev_addr.transport =
rdma_node_get_transport(cma_dev->device->node_type);
- list_add_tail(&id_priv->list, &cma_dev->id_list);
+ list_add_tail(&id_priv->device_item, &cma_dev->id_list);
trace_cm_id_attach(id_priv, cma_dev->device);
}
@@ -470,7 +596,7 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv,
static void cma_release_dev(struct rdma_id_private *id_priv)
{
mutex_lock(&lock);
- list_del(&id_priv->list);
+ list_del_init(&id_priv->device_item);
cma_dev_put(id_priv->cma_dev);
id_priv->cma_dev = NULL;
id_priv->id.device = NULL;
@@ -481,37 +607,16 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
mutex_unlock(&lock);
}
-static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
-{
- return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
-}
-
-static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
-{
- return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
-}
-
static inline unsigned short cma_family(struct rdma_id_private *id_priv)
{
return id_priv->id.route.addr.src_addr.ss_family;
}
-static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+static int cma_set_default_qkey(struct rdma_id_private *id_priv)
{
struct ib_sa_mcmember_rec rec;
int ret = 0;
- if (id_priv->qkey) {
- if (qkey && id_priv->qkey != qkey)
- return -EINVAL;
- return 0;
- }
-
- if (qkey) {
- id_priv->qkey = qkey;
- return 0;
- }
-
switch (id_priv->id.ps) {
case RDMA_PS_UDP:
case RDMA_PS_IB:
@@ -531,6 +636,16 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
return ret;
}
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+{
+ if (!qkey ||
+ (id_priv->qkey && (id_priv->qkey != qkey)))
+ return -EINVAL;
+
+ id_priv->qkey = qkey;
+ return 0;
+}
+
static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
{
dev_addr->dev_type = ARPHRD_INFINIBAND;
@@ -559,31 +674,84 @@ cma_validate_port(struct ib_device *device, u32 port,
struct rdma_id_private *id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV);
int bound_if_index = dev_addr->bound_dev_if;
- const struct ib_gid_attr *sgid_attr;
int dev_type = dev_addr->dev_type;
struct net_device *ndev = NULL;
+ struct net_device *pdev = NULL;
if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
- return ERR_PTR(-ENODEV);
+ goto out;
if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
- return ERR_PTR(-ENODEV);
+ goto out;
if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
- return ERR_PTR(-ENODEV);
+ goto out;
+
+ /*
+ * For drivers that do not associate more than one net device with
+ * their gid tables, such as iWARP drivers, it is sufficient to
+ * return the first table entry.
+ *
+ * Other driver classes might be included in the future.
+ */
+ if (rdma_protocol_iwarp(device, port)) {
+ sgid_attr = rdma_get_gid_attr(device, port, 0);
+ if (IS_ERR(sgid_attr))
+ goto out;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(sgid_attr->ndev);
+ if (ndev->ifindex != bound_if_index) {
+ pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index);
+ if (pdev) {
+ if (is_vlan_dev(pdev)) {
+ pdev = vlan_dev_real_dev(pdev);
+ if (ndev->ifindex == pdev->ifindex)
+ bound_if_index = pdev->ifindex;
+ }
+ if (is_vlan_dev(ndev)) {
+ pdev = vlan_dev_real_dev(ndev);
+ if (bound_if_index == pdev->ifindex)
+ bound_if_index = ndev->ifindex;
+ }
+ }
+ }
+ if (!net_eq(dev_net(ndev), dev_addr->net) ||
+ ndev->ifindex != bound_if_index) {
+ rdma_put_gid_attr(sgid_attr);
+ sgid_attr = ERR_PTR(-ENODEV);
+ }
+ rcu_read_unlock();
+ goto out;
+ }
- if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
- ndev = dev_get_by_index(dev_addr->net, bound_if_index);
- if (!ndev)
- return ERR_PTR(-ENODEV);
+ /*
+ * An RXE device should work with both TUN and normal ethernet devices.
+ * Use driver_id to check whether a device is an RXE device.
+ * ARPHRD_NONE means a TUN device.
+ */
+ if (device->ops.driver_id == RDMA_DRIVER_RXE) {
+ if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER)
+ && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ }
} else {
- gid_type = IB_GID_TYPE_IB;
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
}
sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
- if (ndev)
- dev_put(ndev);
+ dev_put(ndev);
+out:
return sgid_attr;
}
@@ -766,6 +934,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
unsigned int p;
u16 pkey, index;
enum ib_port_state port_state;
+ int ret;
int i;
cma_dev = NULL;
@@ -784,9 +953,14 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
if (ib_get_cached_port_state(cur_dev->device, p, &port_state))
continue;
- for (i = 0; !rdma_query_gid(cur_dev->device,
- p, i, &gid);
- i++) {
+
+ for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len;
+ ++i) {
+ ret = rdma_query_gid(cur_dev->device, p, i,
+ &gid);
+ if (ret)
+ continue;
+
if (!memcmp(&gid, dgid, sizeof(gid))) {
cma_dev = cur_dev;
sgid = gid;
@@ -854,11 +1028,14 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
init_completion(&id_priv->comp);
refcount_set(&id_priv->refcount, 1);
mutex_init(&id_priv->handler_mutex);
+ INIT_LIST_HEAD(&id_priv->device_item);
+ INIT_LIST_HEAD(&id_priv->id_list_entry);
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
id_priv->id.route.addr.dev_addr.net = get_net(net);
id_priv->seq_num &= 0x00ffffff;
+ INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler);
rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
if (parent)
@@ -926,12 +1103,25 @@ static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
return ret;
}
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
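Annotation: cma_init_conn_qp() extends to connected QPs the convenience rdma_create_qp() already gave UD QPs, so the QP leaves the call in INIT rather than RESET. Roughly what the CM-backed rdma_init_qp_attr() resolves to for an RC QP, as a sketch only (the exact mask and values come from the CM state, not from this hunk):

struct ib_qp_attr qp_attr = {
	.qp_state        = IB_QPS_INIT,
	.pkey_index      = 0,			/* from the resolved path */
	.port_num        = id_priv->id.port_num,
	.qp_access_flags = 0,
};
int qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT |
		   IB_QP_ACCESS_FLAGS;

ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);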
int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
struct ib_qp_init_attr *qp_init_attr)
{
struct rdma_id_private *id_priv;
struct ib_qp *qp;
- int ret = 0;
+ int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (id->device != pd->device) {
@@ -948,6 +1138,8 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
if (id->qp_type == IB_QPT_UD)
ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
if (ret)
goto out_destroy;
@@ -1078,7 +1270,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
if (id_priv->id.qp_type == IB_QPT_UD) {
- ret = cma_set_qkey(id_priv, 0);
+ ret = cma_set_default_qkey(id_priv);
if (ret)
return ret;
@@ -1412,7 +1604,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev,
return false;
memset(&fl4, 0, sizeof(fl4));
- fl4.flowi4_iif = net_dev->ifindex;
+ fl4.flowi4_oif = net_dev->ifindex;
fl4.daddr = daddr;
fl4.saddr = saddr;
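Annotation: the oif/daddr/saddr tuple is consumed a few lines past this hunk; from memory of the surrounding function (a sketch, not part of the patch), the validation is a FIB lookup that must resolve onto the same netdevice:

	rcu_read_lock();
	err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
	ret = err == 0 && FIB_RES_DEV(res) == net_dev;
	rcu_read_unlock();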
@@ -1632,7 +1824,7 @@ static struct rdma_id_private *cma_find_listener(
return id_priv;
list_for_each_entry(id_priv_dev,
&id_priv->listen_list,
- listen_list) {
+ listen_item) {
if (id_priv_dev->id.device == cm_id->device &&
cma_match_net_dev(&id_priv_dev->id,
net_dev, req))
@@ -1697,8 +1889,8 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id,
}
if (!validate_net_dev(*net_dev,
- (struct sockaddr *)&req->listen_addr_storage,
- (struct sockaddr *)&req->src_addr_storage)) {
+ (struct sockaddr *)&req->src_addr_storage,
+ (struct sockaddr *)&req->listen_addr_storage)) {
id_priv = ERR_PTR(-EHOSTUNREACH);
goto err;
}
@@ -1731,28 +1923,36 @@ static void cma_cancel_route(struct rdma_id_private *id_priv)
}
}
-static void cma_cancel_listens(struct rdma_id_private *id_priv)
+static void _cma_cancel_listens(struct rdma_id_private *id_priv)
{
struct rdma_id_private *dev_id_priv;
+ lockdep_assert_held(&lock);
+
/*
* Remove from listen_any_list to prevent added devices from spawning
* additional listen requests.
*/
- mutex_lock(&lock);
- list_del(&id_priv->list);
+ list_del_init(&id_priv->listen_any_item);
while (!list_empty(&id_priv->listen_list)) {
- dev_id_priv = list_entry(id_priv->listen_list.next,
- struct rdma_id_private, listen_list);
+ dev_id_priv =
+ list_first_entry(&id_priv->listen_list,
+ struct rdma_id_private, listen_item);
/* sync with device removal to avoid duplicate destruction */
- list_del_init(&dev_id_priv->list);
- list_del(&dev_id_priv->listen_list);
+ list_del_init(&dev_id_priv->device_item);
+ list_del_init(&dev_id_priv->listen_item);
mutex_unlock(&lock);
rdma_destroy_id(&dev_id_priv->id);
mutex_lock(&lock);
}
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ mutex_lock(&lock);
+ _cma_cancel_listens(id_priv);
mutex_unlock(&lock);
}
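Annotation: splitting the function into a locked wrapper and a _prefixed body that asserts the lock is the usual kernel pattern for sharing code with a caller that already holds the lock (here the cma_listen_on_all() error path further down). The idiom in miniature, with hypothetical names:

static void _do_cancel(void)
{
	lockdep_assert_held(&lock);	/* caller owns `lock` */
	/* ... work that requires the lock ... */
}

static void do_cancel(void)
{
	mutex_lock(&lock);
	_do_cancel();
	mutex_unlock(&lock);
}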
@@ -1761,6 +1961,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
{
switch (state) {
case RDMA_CM_ADDR_QUERY:
+ /*
+ * We can avoid doing the rdma_addr_cancel() based on state,
+ * only RDMA_CM_ADDR_QUERY has a work that could still execute.
+ * Notice that the addr_handler work could still be exiting
+ * outside this state, however due to the interaction with the
+ * handler_mutex the work is guaranteed not to touch id_priv
+ * during exit.
+ */
rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
break;
case RDMA_CM_ROUTE_QUERY:
@@ -1795,6 +2003,8 @@ static void cma_release_port(struct rdma_id_private *id_priv)
static void destroy_mc(struct rdma_id_private *id_priv,
struct cma_multicast *mc)
{
+ bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
+
if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num))
ib_sa_free_multicast(mc->sa_mc);
@@ -1806,14 +2016,19 @@ static void destroy_mc(struct rdma_id_private *id_priv,
if (dev_addr->bound_dev_if)
ndev = dev_get_by_index(dev_addr->net,
dev_addr->bound_dev_if);
- if (ndev) {
+ if (ndev && !send_only) {
+ enum ib_gid_type gid_type;
union ib_gid mgid;
- cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr,
- &mgid);
+ gid_type = id_priv->cma_dev->default_gid_type
+ [id_priv->id.port_num -
+ rdma_start_port(
+ id_priv->cma_dev->device)];
+ cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid,
+ gid_type);
cma_igmp_send(ndev, &mgid, false);
- dev_put(ndev);
}
+ dev_put(ndev);
cancel_work_sync(&mc->iboe_join.work);
}
@@ -1838,6 +2053,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
cma_cancel_operation(id_priv, state);
rdma_restrack_del(&id_priv->res);
+ cma_remove_id_from_tree(id_priv);
if (id_priv->cma_dev) {
if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.ib)
@@ -1858,6 +2074,9 @@ static void _destroy_id(struct rdma_id_private *id_priv,
cma_id_put(id_priv->id.context);
kfree(id_priv->id.route.path_rec);
+ kfree(id_priv->id.route.path_rec_inbound);
+ kfree(id_priv->id.route.path_rec_outbound);
+ kfree(id_priv->id.route.service_recs);
put_net(id_priv->id.route.addr.dev_addr.net);
kfree(id_priv);
@@ -1982,8 +2201,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
- trace_cm_send_mra(id_priv);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(id_priv);
+ ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@@ -2073,14 +2292,14 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id,
goto err;
rt = &id->route;
- rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
- rt->path_rec = kmalloc_array(rt->num_paths, sizeof(*rt->path_rec),
- GFP_KERNEL);
+ rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = kmalloc_array(rt->num_pri_alt_paths,
+ sizeof(*rt->path_rec), GFP_KERNEL);
if (!rt->path_rec)
goto err;
rt->path_rec[0] = *path;
- if (rt->num_paths == 2)
+ if (rt->num_pri_alt_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
if (net_dev) {
@@ -2244,8 +2463,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
- trace_cm_send_mra(cm_id->context);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(cm_id->context);
+ ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@@ -2253,8 +2472,7 @@ err_unlock:
mutex_unlock(&listen_id->handler_mutex);
net_dev_put:
- if (net_dev)
- dev_put(net_dev);
+ dev_put(net_dev);
return ret;
}
@@ -2529,7 +2747,7 @@ static int cma_listen_on_dev(struct rdma_id_private *id_priv,
ret = rdma_listen(&dev_id_priv->id, id_priv->backlog);
if (ret)
goto err_listen;
- list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+ list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list);
return 0;
err_listen:
/* Caller must destroy this after releasing lock */
@@ -2545,13 +2763,13 @@ static int cma_listen_on_all(struct rdma_id_private *id_priv)
int ret;
mutex_lock(&lock);
- list_add_tail(&id_priv->list, &listen_any_list);
+ list_add_tail(&id_priv->listen_any_item, &listen_any_list);
list_for_each_entry(cma_dev, &dev_list, list) {
ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
if (ret) {
/* Prevent racing with cma_process_remove() */
if (to_destroy)
- list_del_init(&to_destroy->list);
+ list_del_init(&to_destroy->device_item);
goto err_listen;
}
}
@@ -2559,7 +2777,7 @@ static int cma_listen_on_all(struct rdma_id_private *id_priv)
return 0;
err_listen:
- list_del(&id_priv->list);
+ _cma_cancel_listens(id_priv);
mutex_unlock(&lock);
if (to_destroy)
rdma_destroy_id(&to_destroy->id);
@@ -2597,7 +2815,7 @@ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
{
struct rdma_id_private *id_priv;
- if (id->qp_type != IB_QPT_RC)
+ if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI)
return -EINVAL;
id_priv = container_of(id, struct rdma_id_private, id);
@@ -2649,26 +2867,76 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
}
EXPORT_SYMBOL(rdma_set_min_rnr_timer);
+static int route_set_path_rec_inbound(struct cma_work *work,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_route *route = &work->id->id.route;
+
+ if (!route->path_rec_inbound) {
+ route->path_rec_inbound =
+ kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
+ if (!route->path_rec_inbound)
+ return -ENOMEM;
+ }
+
+ *route->path_rec_inbound = *path_rec;
+ return 0;
+}
+
+static int route_set_path_rec_outbound(struct cma_work *work,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_route *route = &work->id->id.route;
+
+ if (!route->path_rec_outbound) {
+ route->path_rec_outbound =
+ kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL);
+ if (!route->path_rec_outbound)
+ return -ENOMEM;
+ }
+
+ *route->path_rec_outbound = *path_rec;
+ return 0;
+}
+
static void cma_query_handler(int status, struct sa_path_rec *path_rec,
- void *context)
+ unsigned int num_prs, void *context)
{
struct cma_work *work = context;
struct rdma_route *route;
+ int i;
route = &work->id->id.route;
- if (!status) {
- route->num_paths = 1;
- *route->path_rec = *path_rec;
- } else {
- work->old_state = RDMA_CM_ROUTE_QUERY;
- work->new_state = RDMA_CM_ADDR_RESOLVED;
- work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
- work->event.status = status;
- pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
- status);
+ if (status)
+ goto fail;
+
+ for (i = 0; i < num_prs; i++) {
+ if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
+ *route->path_rec = path_rec[i];
+ else if (path_rec[i].flags & IB_PATH_INBOUND)
+ status = route_set_path_rec_inbound(work, &path_rec[i]);
+ else if (path_rec[i].flags & IB_PATH_OUTBOUND)
+ status = route_set_path_rec_outbound(work,
+ &path_rec[i]);
+ else
+ status = -EINVAL;
+
+ if (status)
+ goto fail;
}
+ route->num_pri_alt_paths = 1;
+ queue_work(cma_wq, &work->work);
+ return;
+
+fail:
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
+ status);
queue_work(cma_wq, &work->work);
}
@@ -2913,7 +3181,7 @@ int rdma_set_ib_path(struct rdma_cm_id *id,
dev_put(ndev);
}
- id->route.num_paths = 1;
+ id->route.num_pri_alt_paths = 1;
return 0;
err_free:
@@ -3046,7 +3314,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
goto err1;
}
- route->num_paths = 1;
+ route->num_pri_alt_paths = 1;
ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
if (!ndev) {
@@ -3071,7 +3339,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
route->path_rec->traffic_class = tos;
route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
route->path_rec->rate_selector = IB_SA_EQ;
- route->path_rec->rate = iboe_get_rate(ndev);
+ route->path_rec->rate = IB_RATE_PORT_CURRENT;
dev_put(ndev);
route->path_rec->packet_life_time_selector = IB_SA_EQ;
/* In case ACK timeout is set, use this value to calculate
@@ -3106,7 +3374,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
err2:
kfree(route->path_rec);
route->path_rec = NULL;
- route->num_paths = 0;
+ route->num_pri_alt_paths = 0;
err1:
kfree(work);
return ret;
@@ -3115,17 +3383,28 @@ err1:
int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
{
struct rdma_id_private *id_priv;
+ enum rdma_cm_state state;
int ret;
+ if (!timeout_ms)
+ return -EINVAL;
+
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+ state = id_priv->state;
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_QUERY) &&
+ !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED,
+ RDMA_CM_ROUTE_QUERY))
return -EINVAL;
cma_id_get(id_priv);
if (rdma_cap_ib_sa(id->device, id->port_num))
ret = cma_resolve_ib_route(id_priv, timeout_ms);
- else if (rdma_protocol_roce(id->device, id->port_num))
+ else if (rdma_protocol_roce(id->device, id->port_num)) {
ret = cma_resolve_iboe_route(id_priv);
+ if (!ret)
+ cma_add_id_to_tree(id_priv);
+ }
else if (rdma_protocol_iwarp(id->device, id->port_num))
ret = cma_resolve_iw_route(id_priv);
else
@@ -3136,7 +3415,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
return 0;
err:
- cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state);
cma_id_put(id_priv);
return ret;
}
@@ -3319,98 +3598,6 @@ err:
return ret;
}
-static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
- const struct sockaddr *dst_addr)
-{
- if (!src_addr || !src_addr->sa_family) {
- src_addr = (struct sockaddr *) &id->route.addr.src_addr;
- src_addr->sa_family = dst_addr->sa_family;
- if (IS_ENABLED(CONFIG_IPV6) &&
- dst_addr->sa_family == AF_INET6) {
- struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
- struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
- src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
- if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
- } else if (dst_addr->sa_family == AF_IB) {
- ((struct sockaddr_ib *) src_addr)->sib_pkey =
- ((struct sockaddr_ib *) dst_addr)->sib_pkey;
- }
- }
- return rdma_bind_addr(id, src_addr);
-}
-
-/*
- * If required, resolve the source address for bind and leave the id_priv in
- * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior
- * calls made by ULP, a previously bound ID will not be re-bound and src_addr is
- * ignored.
- */
-static int resolve_prepare_src(struct rdma_id_private *id_priv,
- struct sockaddr *src_addr,
- const struct sockaddr *dst_addr)
-{
- int ret;
-
- memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
- if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) {
- /* For a well behaved ULP state will be RDMA_CM_IDLE */
- ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr);
- if (ret)
- goto err_dst;
- if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
- RDMA_CM_ADDR_QUERY))) {
- ret = -EINVAL;
- goto err_dst;
- }
- }
-
- if (cma_family(id_priv) != dst_addr->sa_family) {
- ret = -EINVAL;
- goto err_state;
- }
- return 0;
-
-err_state:
- cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
-err_dst:
- memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr));
- return ret;
-}
-
-int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
- const struct sockaddr *dst_addr, unsigned long timeout_ms)
-{
- struct rdma_id_private *id_priv =
- container_of(id, struct rdma_id_private, id);
- int ret;
-
- ret = resolve_prepare_src(id_priv, src_addr, dst_addr);
- if (ret)
- return ret;
-
- if (cma_any_addr(dst_addr)) {
- ret = cma_resolve_loopback(id_priv);
- } else {
- if (dst_addr->sa_family == AF_IB) {
- ret = cma_resolve_ib_addr(id_priv);
- } else {
- ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
- &id->route.addr.dev_addr,
- timeout_ms, addr_handler,
- false, id_priv);
- }
- }
- if (ret)
- goto err;
-
- return 0;
-err:
- cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
- return ret;
-}
-EXPORT_SYMBOL(rdma_resolve_addr);
-
int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
{
struct rdma_id_private *id_priv;
@@ -3562,7 +3749,7 @@ static int cma_alloc_any_port(enum rdma_ucm_port_space ps,
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rover = prandom_u32() % remaining + low;
+ rover = get_random_u32_inclusive(low, remaining + low - 1);
retry:
if (last_used_port != rover) {
struct rdma_bind_list *bind_list;
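Annotation: the new bounds are exactly the old ones: with remaining = (high - low) + 1, the upper bound remaining + low - 1 simplifies to high, so the draw is uniform over [low, high] and also avoids the modulo bias of prandom_u32() % remaining. Worked example for the default ip_local_port_range:

u32 low = 32768, high = 60999;
u32 remaining = (high - low) + 1;	/* 28232 */
/* remaining + low - 1 == 60999 == high */
u32 rover = get_random_u32_inclusive(low, remaining + low - 1);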
@@ -3753,9 +3940,13 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
int ret;
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) {
+ struct sockaddr_in any_in = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ };
+
/* For a well behaved ULP state will be RDMA_CM_IDLE */
- id->route.addr.src_addr.ss_family = AF_INET;
- ret = rdma_bind_addr(id, cma_src_addr(id_priv));
+ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in);
if (ret)
return ret;
if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
@@ -3809,27 +4000,26 @@ err:
}
EXPORT_SYMBOL(rdma_listen);
-int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+static int rdma_bind_addr_dst(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, const struct sockaddr *daddr)
{
- struct rdma_id_private *id_priv;
+ struct sockaddr *id_daddr;
int ret;
- struct sockaddr *daddr;
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
addr->sa_family != AF_IB)
return -EAFNOSUPPORT;
- id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
return -EINVAL;
- ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+ ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr);
if (ret)
goto err1;
memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
if (!cma_any_addr(addr)) {
- ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
+ ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr);
if (ret)
goto err1;
@@ -3849,8 +4039,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
}
#endif
}
- daddr = cma_dst_addr(id_priv);
- daddr->sa_family = addr->sa_family;
+ id_daddr = cma_dst_addr(id_priv);
+ if (daddr != id_daddr)
+ memcpy(id_daddr, daddr, rdma_addr_size(addr));
+ id_daddr->sa_family = addr->sa_family;
ret = cma_get_port(id_priv);
if (ret)
@@ -3866,6 +4058,129 @@ err1:
cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
return ret;
}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ struct sockaddr_storage zero_sock = {};
+
+ if (src_addr && src_addr->sa_family)
+ return rdma_bind_addr_dst(id_priv, src_addr, dst_addr);
+
+ /*
+ * When the src_addr is not specified, automatically supply an any addr
+ */
+ zero_sock.ss_family = dst_addr->sa_family;
+ if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *src_addr6 =
+ (struct sockaddr_in6 *)&zero_sock;
+ struct sockaddr_in6 *dst_addr6 =
+ (struct sockaddr_in6 *)dst_addr;
+
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if =
+ dst_addr6->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *)&zero_sock)->sib_pkey =
+ ((struct sockaddr_ib *)dst_addr)->sib_pkey;
+ }
+ return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr);
+}
+
+/*
+ * If required, resolve the source address for bind and leave the id_priv in
+ * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior
+ * calls made by the ULP; a previously bound ID will not be re-bound, and
+ * src_addr is ignored.
+ */
+static int resolve_prepare_src(struct rdma_id_private *id_priv,
+ struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr)
+{
+ int ret;
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) {
+ /* For a well behaved ULP state will be RDMA_CM_IDLE */
+ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr);
+ if (ret)
+ return ret;
+ if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
+ RDMA_CM_ADDR_QUERY)))
+ return -EINVAL;
+
+ } else {
+ memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+ }
+
+ if (cma_family(id_priv) != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err_state;
+ }
+ return 0;
+
+err_state:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr, unsigned long timeout_ms)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ ret = resolve_prepare_src(id_priv, src_addr, dst_addr);
+ if (ret)
+ return ret;
+
+ if (cma_any_addr(dst_addr)) {
+ ret = cma_resolve_loopback(id_priv);
+ } else {
+ if (dst_addr->sa_family == AF_IB) {
+ ret = cma_resolve_ib_addr(id_priv);
+ } else {
+ /*
+ * The FSM can return back to RDMA_CM_ADDR_BOUND after
+ * rdma_resolve_ip() is called, eg through the error
+ * path in addr_handler(). If this happens the existing
+ * request must be canceled before issuing a new one.
+ * Since canceling a request is a bit slow and this
+ * oddball path is rare, keep track once a request has
+ * been issued. The tracking flag ends up permanently set, since
+ * this is the only cancel point and it sits immediately before
+ * rdma_resolve_ip().
+ */
+ if (id_priv->used_resolve_ip)
+ rdma_addr_cancel(&id->route.addr.dev_addr);
+ else
+ id_priv->used_resolve_ip = 1;
+ ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
+ &id->route.addr.dev_addr,
+ timeout_ms, addr_handler,
+ false, id_priv);
+ }
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv));
+}
EXPORT_SYMBOL(rdma_bind_addr);
static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
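Annotation: a ULP-side caller of these exports is unchanged by the refactor. A minimal sketch of the flow (handler, context and destination are hypothetical; error handling trimmed):

struct rdma_cm_id *id;
int ret;

id = rdma_create_id(&init_net, my_event_handler, my_ctx,
		    RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
	return PTR_ERR(id);

/* NULL src: cma_bind_addr() supplies the any-address of dst's family */
ret = rdma_resolve_addr(id, NULL, (struct sockaddr *)&dst, 2000);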
@@ -3974,8 +4289,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
memset(&req, 0, sizeof req);
offset = cma_user_data_offset(id_priv);
- req.private_data_len = offset + conn_param->private_data_len;
- if (req.private_data_len < conn_param->private_data_len)
+ if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
return -EINVAL;
if (req.private_data_len) {
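Annotation: req.private_data_len is a u8, so the old a + b < b idiom worked but hid the intent; check_add_overflow() stores the sum in the destination and reports whether it wrapped the destination's type. Semantics in sketch form:

u8 total;

/* true iff offset + private_data_len does not fit in a u8 */
if (check_add_overflow(offset, conn_param->private_data_len, &total))
	return -EINVAL;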
@@ -4034,8 +4348,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
memset(&req, 0, sizeof req);
offset = cma_user_data_offset(id_priv);
- req.private_data_len = offset + conn_param->private_data_len;
- if (req.private_data_len < conn_param->private_data_len)
+ if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
return -EINVAL;
if (req.private_data_len) {
@@ -4066,7 +4379,9 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
}
req.primary_path = &route->path_rec[0];
- if (route->num_paths == 2)
+ req.primary_path_inbound = route->path_rec_inbound;
+ req.primary_path_outbound = route->path_rec_outbound;
+ if (route->num_pri_alt_paths == 2)
req.alternate_path = &route->path_rec[1];
req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
@@ -4160,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
container_of(id, struct rdma_id_private, id);
int ret;
+ lockdep_assert_held(&id_priv->handler_mutex);
+
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
@@ -4302,7 +4619,10 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
memset(&rep, 0, sizeof rep);
rep.status = status;
if (status == IB_SIDR_SUCCESS) {
- ret = cma_set_qkey(id_priv, qkey);
+ if (qkey)
+ ret = cma_set_qkey(id_priv, qkey);
+ else
+ ret = cma_set_default_qkey(id_priv);
if (ret)
return ret;
rep.qp_num = id_priv->qp_num;
@@ -4507,9 +4827,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv,
enum ib_gid_type gid_type;
struct net_device *ndev;
- if (!status)
- status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
- else
+ if (status)
pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n",
status);
@@ -4537,11 +4855,10 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv,
}
event->param.ud.qp_num = 0xFFFFFF;
- event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ event->param.ud.qkey = id_priv->qkey;
out:
- if (ndev)
- dev_put(ndev);
+ dev_put(ndev);
}
static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
@@ -4556,8 +4873,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING)
goto out;
- cma_make_mc_event(status, id_priv, multicast, &event, mc);
- ret = cma_cm_event_handler(id_priv, &event);
+ ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+ if (!ret) {
+ cma_make_mc_event(status, id_priv, multicast, &event, mc);
+ ret = cma_cm_event_handler(id_priv, &event);
+ }
rdma_destroy_ah_attr(&event.param.ud.ah_attr);
WARN_ON(ret);
@@ -4610,9 +4930,11 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
if (ret)
return ret;
- ret = cma_set_qkey(id_priv, 0);
- if (ret)
- return ret;
+ if (!id_priv->qkey) {
+ ret = cma_set_default_qkey(id_priv);
+ if (ret)
+ return ret;
+ }
cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
rec.qkey = cpu_to_be32(id_priv->qkey);
@@ -4675,7 +4997,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
int err = 0;
struct sockaddr *addr = (struct sockaddr *)&mc->addr;
struct net_device *ndev = NULL;
- struct ib_sa_multicast ib;
+ struct ib_sa_multicast ib = {};
enum ib_gid_type gid_type;
bool send_only;
@@ -4689,15 +5011,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type);
ib.rec.pkey = cpu_to_be16(0xffff);
- if (id_priv->id.ps == RDMA_PS_UDP)
- ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
-
if (dev_addr->bound_dev_if)
ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
if (!ndev)
return -ENODEV;
- ib.rec.rate = iboe_get_rate(ndev);
+ ib.rec.rate = IB_RATE_PORT_CURRENT;
ib.rec.hop_limit = 1;
ib.rec.mtu = iboe_get_mtu(ndev->mtu);
@@ -4717,6 +5036,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
if (err || !ib.rec.mtu)
return err ?: -EINVAL;
+ if (!id_priv->qkey)
+ cma_set_default_qkey(id_priv);
+
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
&ib.rec.port_gid);
INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler);
@@ -4742,6 +5064,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED))
return -EINVAL;
+ if (id_priv->id.qp_type != IB_QPT_UD)
+ return -EINVAL;
+
mc = kzalloc(sizeof(*mc), GFP_KERNEL);
if (!mc)
return -ENOMEM;
@@ -4838,7 +5163,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
mutex_lock(&lock);
list_for_each_entry(cma_dev, &dev_list, list)
- list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ list_for_each_entry(id_priv, &cma_dev->id_list, device_item) {
ret = cma_netdev_change(ndev, id_priv);
if (ret)
goto out;
@@ -4849,10 +5174,87 @@ out:
return ret;
}
+static void cma_netevent_work_handler(struct work_struct *_work)
+{
+ struct rdma_id_private *id_priv =
+ container_of(_work, struct rdma_id_private, id.net_work);
+ struct rdma_cm_event event = {};
+
+ mutex_lock(&id_priv->handler_mutex);
+
+ if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+ READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+ goto out_unlock;
+
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+
+ if (cma_cm_event_handler(id_priv, &event)) {
+ __acquire(&id_priv->handler_mutex);
+ id_priv->cm_id.ib = NULL;
+ cma_id_put(id_priv);
+ destroy_id_handler_unlock(id_priv);
+ return;
+ }
+
+out_unlock:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_id_put(id_priv);
+}
+
+static int cma_netevent_callback(struct notifier_block *self,
+ unsigned long event, void *ctx)
+{
+ struct id_table_entry *ips_node = NULL;
+ struct rdma_id_private *current_id;
+ struct neighbour *neigh = ctx;
+ unsigned long flags;
+
+ if (event != NETEVENT_NEIGH_UPDATE)
+ return NOTIFY_DONE;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ if (neigh->tbl->family == AF_INET6) {
+ struct sockaddr_in6 neigh_sock_6;
+
+ neigh_sock_6.sin6_family = AF_INET6;
+ neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
+ ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+ (struct sockaddr *)&neigh_sock_6);
+ } else if (neigh->tbl->family == AF_INET) {
+ struct sockaddr_in neigh_sock_4;
+
+ neigh_sock_4.sin_family = AF_INET;
+ neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
+ ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+ (struct sockaddr *)&neigh_sock_4);
+ } else
+ goto out;
+
+ if (!ips_node)
+ goto out;
+
+ list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
+ if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
+ neigh->ha, ETH_ALEN))
+ continue;
+ cma_id_get(current_id);
+ if (!queue_work(cma_wq, &current_id->id.net_work))
+ cma_id_put(current_id);
+ }
+out:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+ return NOTIFY_DONE;
+}
+
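Annotation: the reference counting is the delicate part of the netevent path: the callback takes one reference per id it queues, queue_work() returning false means the work was already pending so the extra reference is dropped on the spot, and the handler releases its reference on every exit path. The contract in sketch form:

cma_id_get(current_id);				/* ref owned by the work */
if (!queue_work(cma_wq, &current_id->id.net_work))
	cma_id_put(current_id);			/* already queued: undo */
/* cma_netevent_work_handler() then runs at most once per successful
 * queue_work() and drops the reference before returning.
 */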
static struct notifier_block cma_nb = {
.notifier_call = cma_netdev_callback
};
+static struct notifier_block cma_netevent_cb = {
+ .notifier_call = cma_netevent_callback
+};
+
static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
{
struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
@@ -4898,10 +5300,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
mutex_lock(&lock);
while (!list_empty(&cma_dev->id_list)) {
struct rdma_id_private *id_priv = list_first_entry(
- &cma_dev->id_list, struct rdma_id_private, list);
+ &cma_dev->id_list, struct rdma_id_private, device_item);
- list_del(&id_priv->listen_list);
- list_del_init(&id_priv->list);
+ list_del_init(&id_priv->listen_item);
+ list_del_init(&id_priv->device_item);
cma_id_get(id_priv);
mutex_unlock(&lock);
@@ -4978,7 +5380,7 @@ static int cma_add_one(struct ib_device *device)
mutex_lock(&lock);
list_add_tail(&cma_dev->list, &dev_list);
- list_for_each_entry(id_priv, &listen_any_list, list) {
+ list_for_each_entry(id_priv, &listen_any_list, listen_any_item) {
ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
if (ret)
goto free_listen;
@@ -5075,6 +5477,7 @@ static int __init cma_init(void)
ib_sa_register_client(&sa_client);
register_netdevice_notifier(&cma_nb);
+ register_netevent_notifier(&cma_netevent_cb);
ret = ib_register_client(&cma_client);
if (ret)
@@ -5089,6 +5492,7 @@ static int __init cma_init(void)
err_ib:
ib_unregister_client(&cma_client);
err:
+ unregister_netevent_notifier(&cma_netevent_cb);
unregister_netdevice_notifier(&cma_nb);
ib_sa_unregister_client(&sa_client);
unregister_pernet_subsys(&cma_pernet_operations);
@@ -5101,6 +5505,7 @@ static void __exit cma_cleanup(void)
{
cma_configfs_exit();
ib_unregister_client(&cma_client);
+ unregister_netevent_notifier(&cma_netevent_cb);
unregister_netdevice_notifier(&cma_nb);
ib_sa_unregister_client(&sa_client);
unregister_pernet_subsys(&cma_pernet_operations);
@@ -5109,3 +5514,129 @@ static void __exit cma_cleanup(void)
module_init(cma_init);
module_exit(cma_cleanup);
+
+static void cma_query_ib_service_handler(int status,
+ struct sa_service_rec *recs,
+ unsigned int num_recs, void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_id_private *id_priv = work->id;
+ struct sockaddr_ib *addr;
+
+ if (status)
+ goto fail;
+
+ if (!num_recs) {
+ status = -ENOENT;
+ goto fail;
+ }
+
+ if (id_priv->id.route.service_recs) {
+ status = -EALREADY;
+ goto fail;
+ }
+
+ id_priv->id.route.service_recs =
+ kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL);
+ if (!id_priv->id.route.service_recs) {
+ status = -ENOMEM;
+ goto fail;
+ }
+
+ id_priv->id.route.num_service_recs = num_recs;
+ memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs);
+
+ addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr;
+ addr->sib_family = AF_IB;
+ addr->sib_addr = *(struct ib_addr *)&recs->gid;
+ addr->sib_pkey = recs->pkey;
+ addr->sib_sid = recs->id;
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr,
+ (union ib_gid *)&addr->sib_addr);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr,
+ ntohs(addr->sib_pkey));
+
+ queue_work(cma_wq, &work->work);
+ return;
+
+fail:
+ work->old_state = RDMA_CM_ADDRINFO_QUERY;
+ work->new_state = RDMA_CM_ADDR_BOUND;
+ work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR;
+ work->event.status = status;
+ pr_debug_ratelimited(
+ "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n",
+ status);
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_resolve_ib_service(struct rdma_id_private *id_priv,
+ struct rdma_ucm_ib_service *ibs)
+{
+ struct sa_service_rec sr = {};
+ ib_sa_comp_mask mask = 0;
+ struct cma_work *work;
+
+ work = kzalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_id_get(id_priv);
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDRINFO_QUERY;
+ work->new_state = RDMA_CM_ADDRINFO_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED;
+
+ if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) {
+ sr.id = cpu_to_be64(ibs->service_id);
+ mask |= IB_SA_SERVICE_REC_SERVICE_ID;
+ }
+ if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) {
+ strscpy(sr.name, ibs->service_name, sizeof(sr.name));
+ mask |= IB_SA_SERVICE_REC_SERVICE_NAME;
+ }
+
+ id_priv->query_id = ib_sa_service_rec_get(&sa_client,
+ id_priv->id.device,
+ id_priv->id.port_num,
+ &sr, mask,
+ 2000, GFP_KERNEL,
+ cma_query_ib_service_handler,
+ work, &id_priv->query);
+
+ if (id_priv->query_id < 0) {
+ cma_id_put(id_priv);
+ kfree(work);
+ return id_priv->query_id;
+ }
+
+ return 0;
+}
+
+int rdma_resolve_ib_service(struct rdma_cm_id *id,
+ struct rdma_ucm_ib_service *ibs)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cma_dev ||
+ !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY))
+ return -EINVAL;
+
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_service(id_priv, ibs);
+ else
+ ret = -EOPNOTSUPP;
+
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ib_service);
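Annotation: a caller resolves either by 64-bit service ID or by service name, and completion arrives asynchronously as RDMA_CM_EVENT_ADDRINFO_RESOLVED. A hedged sketch (flag and field names as introduced above; the service name is hypothetical):

struct rdma_ucm_ib_service ibs = {
	.flags = RDMA_USER_CM_IB_SERVICE_FLAG_NAME,
};
int ret;

strscpy(ibs.service_name, "example-svc", sizeof(ibs.service_name));
ret = rdma_resolve_ib_service(id, &ibs);	/* id bound to an AF_IB addr */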
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
index 9ac16e0db761..f2fb2d8a6597 100644
--- a/drivers/infiniband/core/cma_configfs.c
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/module.h>
#include <linux/configfs.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -218,7 +217,7 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group,
return -ENOMEM;
for (i = 0; i < ports_num; i++) {
- char port_str[10];
+ char port_str[11];
ports[i].port_num = i + 1;
snprintf(port_str, sizeof(port_str), "%u", i + 1);
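Annotation: the resize is exact, not padding: the largest u32, 4294967295, prints as ten characters, and the NUL terminator makes eleven, so port_str[10] could truncate (and trips the compiler's snprintf truncation warning):

char buf[11];

snprintf(buf, sizeof(buf), "%u", 4294967295u);	/* "4294967295" + NUL */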
@@ -293,7 +292,7 @@ static struct config_group *make_cma_dev(struct config_group *group,
goto fail;
}
- strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+ strscpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
config_group_init_type_name(&cma_dev_group->ports_group, "ports",
&cma_ports_group_type);
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 5c463da99845..c604b601f4d9 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -47,7 +47,9 @@ enum rdma_cm_state {
RDMA_CM_ADDR_BOUND,
RDMA_CM_LISTEN,
RDMA_CM_DEVICE_REMOVAL,
- RDMA_CM_DESTROYING
+ RDMA_CM_DESTROYING,
+ RDMA_CM_ADDRINFO_QUERY,
+ RDMA_CM_ADDRINFO_RESOLVED
};
struct rdma_id_private {
@@ -55,8 +57,16 @@ struct rdma_id_private {
struct rdma_bind_list *bind_list;
struct hlist_node node;
- struct list_head list; /* listen_any_list or cma_device.list */
- struct list_head listen_list; /* per device listens */
+ union {
+ struct list_head device_item; /* On cma_device->id_list */
+ struct list_head listen_any_item; /* On listen_any_list */
+ };
+ union {
+ /* On rdma_id_private->listen_list */
+ struct list_head listen_item;
+ struct list_head listen_list;
+ };
+ struct list_head id_list_entry;
struct cma_device *cma_dev;
struct list_head mc_list;
@@ -91,6 +101,7 @@ struct rdma_id_private {
u8 afonly;
u8 timeout;
u8 min_rnr_timer;
+ u8 used_resolve_ip;
enum ib_gid_type gid_type;
/*
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index e45264267bcc..3456d5f3aa47 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -15,7 +15,7 @@
#define _TRACE_RDMA_CMA_H
#include <linux/tracepoint.h>
-#include <trace/events/rdma.h>
+#include <trace/misc/rdma.h>
DECLARE_EVENT_CLASS(cma_fsm_class,
@@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
-DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);
@@ -84,7 +84,7 @@ TRACE_EVENT(cm_id_attach,
sizeof(struct sockaddr_in6));
memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
sizeof(struct sockaddr_in6));
- __assign_str(devname, device->name);
+ __assign_str(devname);
),
TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s",
@@ -334,7 +334,7 @@ DECLARE_EVENT_CLASS(cma_client_class,
),
TP_fast_assign(
- __assign_str(name, device->name);
+ __assign_str(name);
),
TP_printk("device name=%s",
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 647cca4e0240..05102769a918 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -316,50 +316,15 @@ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index);
void nldev_init(void);
void nldev_exit(void);
-static inline struct ib_qp *
-_ib_create_qp(struct ib_device *dev, struct ib_pd *pd,
- struct ib_qp_init_attr *attr, struct ib_udata *udata,
- struct ib_uqp_object *uobj, const char *caller)
-{
- struct ib_qp *qp;
-
- if (!dev->ops.create_qp)
- return ERR_PTR(-EOPNOTSUPP);
-
- qp = dev->ops.create_qp(pd, attr, udata);
- if (IS_ERR(qp))
- return qp;
-
- qp->device = dev;
- qp->pd = pd;
- qp->uobject = uobj;
- qp->real_qp = qp;
-
- qp->qp_type = attr->qp_type;
- qp->rwq_ind_tbl = attr->rwq_ind_tbl;
- qp->send_cq = attr->send_cq;
- qp->recv_cq = attr->recv_cq;
- qp->srq = attr->srq;
- qp->rwq_ind_tbl = attr->rwq_ind_tbl;
- qp->event_handler = attr->event_handler;
- qp->port = attr->port_num;
-
- atomic_set(&qp->usecnt, 0);
- spin_lock_init(&qp->mr_lock);
- INIT_LIST_HEAD(&qp->rdma_mrs);
- INIT_LIST_HEAD(&qp->sig_mrs);
-
- rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP);
- WARN_ONCE(!udata && !caller, "Missing kernel QP owner");
- rdma_restrack_set_name(&qp->res, udata ? NULL : caller);
- rdma_restrack_add(&qp->res);
- return qp;
-}
+struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller);
+
+void ib_qp_usecnt_inc(struct ib_qp *qp);
+void ib_qp_usecnt_dec(struct ib_qp *qp);
struct rdma_dev_addr;
-int rdma_resolve_ip_route(struct sockaddr *src_addr,
- const struct sockaddr *dst_addr,
- struct rdma_dev_addr *addr);
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
const union ib_gid *dgid,
@@ -405,4 +370,5 @@ void rdma_umap_priv_init(struct rdma_umap_priv *priv,
void ib_cq_pool_cleanup(struct ib_device *dev);
+bool rdma_nl_get_privileged_qkey(void);
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index df9e6c5e4ddf..c3aa6d7fc66b 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -12,7 +12,8 @@
static int __counter_set_mode(struct rdma_port_counter *port_counter,
enum rdma_nl_counter_mode new_mode,
- enum rdma_nl_counter_mask new_mask)
+ enum rdma_nl_counter_mask new_mask,
+ bool bind_opcnt)
{
if (new_mode == RDMA_COUNTER_MODE_AUTO) {
if (new_mask & (~ALL_AUTO_MODE_MASKS))
@@ -23,6 +24,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
port_counter->mode.mode = new_mode;
port_counter->mode.mask = new_mask;
+ port_counter->mode.bind_opcnt = bind_opcnt;
return 0;
}
@@ -41,6 +43,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
*/
int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mask mask,
+ bool bind_opcnt,
struct netlink_ext_ack *extack)
{
struct rdma_port_counter *port_counter;
@@ -59,12 +62,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
RDMA_COUNTER_MODE_NONE;
if (port_counter->mode.mode == mode &&
- port_counter->mode.mask == mask) {
+ port_counter->mode.mask == mask &&
+ port_counter->mode.bind_opcnt == bind_opcnt) {
ret = 0;
goto out;
}
- ret = __counter_set_mode(port_counter, mode, mask);
+ ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt);
out:
mutex_unlock(&port_counter->lock);
@@ -89,7 +93,7 @@ static void auto_mode_init_counter(struct rdma_counter *counter,
}
static int __rdma_counter_bind_qp(struct rdma_counter *counter,
- struct ib_qp *qp)
+ struct ib_qp *qp, u32 port)
{
int ret;
@@ -100,15 +104,48 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter,
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_bind_qp(counter, qp);
+ ret = qp->device->ops.counter_bind_qp(counter, qp, port);
mutex_unlock(&counter->lock);
return ret;
}
+int rdma_counter_modify(struct ib_device *dev, u32 port,
+ unsigned int index, bool enable)
+{
+ struct rdma_hw_stats *stats;
+ int ret = 0;
+
+ if (!dev->ops.modify_hw_stat)
+ return -EOPNOTSUPP;
+
+ stats = ib_get_hw_stats_port(dev, port);
+ if (!stats || index >= stats->num_counters ||
+ !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL))
+ return -EINVAL;
+
+ mutex_lock(&stats->lock);
+
+ if (enable != test_bit(index, stats->is_disabled))
+ goto out;
+
+ ret = dev->ops.modify_hw_stat(dev, port, index, enable);
+ if (ret)
+ goto out;
+
+ if (enable)
+ clear_bit(index, stats->is_disabled);
+ else
+ set_bit(index, stats->is_disabled);
+out:
+ mutex_unlock(&stats->lock);
+ return ret;
+}
+
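Annotation: rdma_counter_modify() is the core entry point for toggling optional counters: it refuses counters the device did not mark IB_STAT_FLAG_OPTIONAL, and the is_disabled bitmap makes repeated toggles to the same state a no-op that never reaches the driver. Hypothetical caller sketch:

/* enable optional counter `index` on port 1, assuming the driver
 * implements ops.modify_hw_stat
 */
ret = rdma_counter_modify(dev, 1, index, true);
if (ret == -EOPNOTSUPP)
	pr_debug("device exposes no modifiable counters\n");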
static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
struct ib_qp *qp,
- enum rdma_nl_counter_mode mode)
+ enum rdma_nl_counter_mode mode,
+ bool bind_opcnt)
{
struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
@@ -117,13 +154,15 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
return NULL;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = rdma_zalloc_drv_obj(dev, rdma_counter);
if (!counter)
return NULL;
counter->device = dev;
counter->port = port;
+ dev->ops.counter_init(counter);
+
rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER);
counter->stats = dev->ops.counter_alloc_stats(counter);
if (!counter->stats)
@@ -134,7 +173,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
switch (mode) {
case RDMA_COUNTER_MODE_MANUAL:
ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL,
- 0);
+ 0, bind_opcnt);
if (ret) {
mutex_unlock(&port_counter->lock);
goto err_mode;
@@ -153,10 +192,11 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
mutex_unlock(&port_counter->lock);
counter->mode.mode = mode;
+ counter->mode.bind_opcnt = bind_opcnt;
kref_init(&counter->kref);
mutex_init(&counter->lock);
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_mode;
@@ -165,7 +205,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
return counter;
err_mode:
- kfree(counter->stats);
+ rdma_free_hw_stats_struct(counter->stats);
err_stats:
rdma_restrack_put(&counter->res);
kfree(counter);
@@ -181,12 +221,13 @@ static void rdma_counter_free(struct rdma_counter *counter)
port_counter->num_counters--;
if (!port_counter->num_counters &&
(port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
- __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0);
+ __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0,
+ false);
mutex_unlock(&port_counter->lock);
rdma_restrack_del(&counter->res);
- kfree(counter->stats);
+ rdma_free_hw_stats_struct(counter->stats);
kfree(counter);
}
@@ -206,7 +247,7 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
return match;
}
-static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -215,7 +256,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp)
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_unbind_qp(qp);
+ ret = qp->device->ops.counter_unbind_qp(qp, port);
mutex_unlock(&counter->lock);
return ret;
@@ -307,13 +348,14 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
counter = rdma_get_counter_auto_mode(qp, port);
if (counter) {
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret) {
kref_put(&counter->kref, counter_release);
return ret;
}
} else {
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO,
+ port_counter->mode.bind_opcnt);
if (!counter)
return -ENOMEM;
}
@@ -326,7 +368,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
* @force:
* true - Decrease the counter ref-count anyway (e.g., qp destroy)
*/
-int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -334,7 +376,7 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
if (!counter)
return -EINVAL;
- ret = __rdma_counter_unbind_qp(qp);
+ ret = __rdma_counter_unbind_qp(qp, port);
if (ret && !force)
return ret;
@@ -419,7 +461,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
return NULL;
qp = container_of(res, struct ib_qp, res);
- if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev))
goto err;
return qp;
@@ -481,7 +523,7 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
goto err_task;
}
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_task;
@@ -526,7 +568,7 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
goto err;
}
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true);
if (!counter) {
ret = -ENOMEM;
goto err;
@@ -572,7 +614,7 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
goto out;
}
- ret = rdma_counter_unbind_qp(qp, false);
+ ret = rdma_counter_unbind_qp(qp, port, false);
out:
rdma_restrack_put(&qp->res);
@@ -581,13 +623,15 @@ out:
int rdma_counter_get_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mode *mode,
- enum rdma_nl_counter_mask *mask)
+ enum rdma_nl_counter_mask *mask,
+ bool *opcnt)
{
struct rdma_port_counter *port_counter;
port_counter = &dev->port_data[port].port_counter;
*mode = port_counter->mode.mode;
*mask = port_counter->mode.mask;
+ *opcnt = port_counter->mode.bind_opcnt;
return 0;
}
@@ -618,7 +662,7 @@ void rdma_counter_init(struct ib_device *dev)
fail:
for (i = port; i >= rdma_start_port(dev); i--) {
port_counter = &dev->port_data[port].port_counter;
- kfree(port_counter->hstats);
+ rdma_free_hw_stats_struct(port_counter->hstats);
port_counter->hstats = NULL;
mutex_destroy(&port_counter->lock);
}
@@ -631,7 +675,7 @@ void rdma_counter_release(struct ib_device *dev)
rdma_for_each_port(dev, port) {
port_counter = &dev->port_data[port].port_counter;
- kfree(port_counter->hstats);
+ rdma_free_hw_stats_struct(port_counter->hstats);
mutex_destroy(&port_counter->lock);
}
}
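
The counter API now threads the bound port through bind/unbind and reports the per-port bind_opcnt setting. A minimal caller sketch of the extended rdma_counter_get_mode() (hedged: variable names and the surrounding context are assumed):

    /* Sketch only: query the counter mode together with the new opcnt
     * flag; "dev" and "port" are assumed to be a valid device/port pair.
     */
    enum rdma_nl_counter_mode mode;
    enum rdma_nl_counter_mask mask;
    bool opcnt;
    int err;

    err = rdma_counter_get_mode(dev, port, &mode, &mask, &opcnt);
    if (!err && mode == RDMA_COUNTER_MODE_AUTO)
            pr_debug("auto mode, mask 0x%x, opcnt %d\n", mask, opcnt);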
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 433b426729d4..584537c71545 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -2,7 +2,6 @@
/*
* Copyright (c) 2015 HGST, a Western Digital Company.
*/
-#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>
@@ -318,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any);
*/
void ib_free_cq(struct ib_cq *cq)
{
- int ret;
+ int ret = 0;
if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
return;
if (WARN_ON_ONCE(cq->cqe_used))
return;
+ if (cq->device->ops.pre_destroy_cq) {
+ ret = cq->device->ops.pre_destroy_cq(cq);
+ WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail");
+ }
+
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
break;
@@ -341,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq)
rdma_dim_destroy(cq);
trace_cq_free(cq);
- ret = cq->device->ops.destroy_cq(cq, NULL);
+ if (cq->device->ops.post_destroy_cq)
+ cq->device->ops.post_destroy_cq(cq);
+ else
+ ret = cq->device->ops.destroy_cq(cq, NULL);
WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
rdma_restrack_del(&cq->res);
kfree(cq->wc);
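
The CQ destroy path can now be split in two: pre_destroy_cq disables the CQ (and may fail), the polling context is torn down, and post_destroy_cq frees the object; drivers that do not implement the pair keep the single destroy_cq callback. A hedged sketch of how a driver might wire this up (the foo_* names and helpers are placeholders, not a real driver):

    /* Hypothetical driver stubs for the split destroy flow. */
    static int foo_pre_destroy_cq(struct ib_cq *cq)
    {
            /* stop the HW from generating completions; may report failure */
            return foo_hw_disable_cq(to_foo_cq(cq));        /* assumed helper */
    }

    static void foo_post_destroy_cq(struct ib_cq *cq)
    {
            /* polling has finished by now; release the HW object */
            foo_hw_free_cq(to_foo_cq(cq));                  /* assumed helper */
    }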
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index fa20b1824fb8..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -58,6 +58,7 @@ struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
+static struct workqueue_struct *ib_unreg_wq;
/*
* Each of the three rwsem locks (devices, clients, client_data) protects the
@@ -144,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
}
EXPORT_SYMBOL(rdma_dev_access_netns);
+/**
+ * rdma_dev_has_raw_cap() - Returns whether the specified rdma device has
+ * CAP_NET_RAW capability
+ *
+ * @dev: Pointer to the rdma device whose capability is to be checked
+ *
+ * Returns true if an rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false. When the rdma subsystem is in legacy shared
+ * network namespace mode, the default net namespace is considered.
+ */
+bool rdma_dev_has_raw_cap(const struct ib_device *dev)
+{
+ const struct net *net;
+
+ /* The network namespace is the resource whose user namespace is to
+ * be considered. When in shared mode, there is no reliable network
+ * namespace resource, so consider the default net namespace.
+ */
+ if (ib_devices_shared_netns)
+ net = &init_net;
+ else
+ net = read_pnet(&dev->coredev.rdma_net);
+
+ return ns_capable(net->user_ns, CAP_NET_RAW);
+}
+EXPORT_SYMBOL(rdma_dev_has_raw_cap);
+
/*
* xarray has this behavior where it won't iterate over NULL values stored in
* allocated arrays. So we need our own iterator to see all values stored in
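
rdma_dev_has_raw_cap() replaces per-task capable(CAP_NET_RAW) checks in paths that act on a device's behalf rather than the current process's; the counters.c hunk above converts one such caller. An illustrative (assumed) use:

    /* Sketch: gate raw-packet QP access on the user namespace that owns
     * the device, not on the capabilities of the calling task.
     */
    if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev))
            return ERR_PTR(-EACCES);        /* error choice is illustrative */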
@@ -208,23 +236,6 @@ static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
printk("%s(NULL ib_device): %pV", level, vaf);
}
-void ibdev_printk(const char *level, const struct ib_device *ibdev,
- const char *format, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, format);
-
- vaf.fmt = format;
- vaf.va = &args;
-
- __ibdev_printk(level, ibdev, &vaf);
-
- va_end(args);
-}
-EXPORT_SYMBOL(ibdev_printk);
-
#define define_ibdev_printk_level(func, level) \
void func(const struct ib_device *ibdev, const char *fmt, ...) \
{ \
@@ -421,7 +432,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
return ret;
}
- strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+ strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
ret = rename_compat_devs(ibdev);
downgrade_write(&devices_rwsem);
@@ -436,6 +447,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
client->rename(ibdev, client_data);
}
up_read(&ibdev->client_data_rwsem);
+ rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT);
up_read(&devices_rwsem);
return 0;
}
@@ -502,6 +514,7 @@ static void ib_device_release(struct device *device)
rcu_head);
}
+ mutex_destroy(&dev->subdev_lock);
mutex_destroy(&dev->unregistration_lock);
mutex_destroy(&dev->compat_devs_mutex);
@@ -510,7 +523,7 @@ static void ib_device_release(struct device *device)
kfree_rcu(dev, rcu_head);
}
-static int ib_device_uevent(struct device *device,
+static int ib_device_uevent(const struct device *device,
struct kobj_uevent_env *env)
{
if (add_uevent_var(env, "NAME=%s", dev_name(device)))
@@ -523,9 +536,9 @@ static int ib_device_uevent(struct device *device,
return 0;
}
-static const void *net_namespace(struct device *d)
+static const void *net_namespace(const struct device *d)
{
- struct ib_core_device *coredev =
+ const struct ib_core_device *coredev =
container_of(d, struct ib_core_device, dev);
return read_pnet(&coredev->rdma_net);
@@ -542,6 +555,8 @@ static struct class ib_class = {
static void rdma_init_coredev(struct ib_core_device *coredev,
struct ib_device *dev, struct net *net)
{
+ bool is_full_dev = &dev->coredev == coredev;
+
/* This BUILD_BUG_ON is intended to catch layout change
* of union of ib_core_device and device.
* dev must be the first element as ib_core and providers
@@ -553,6 +568,13 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
coredev->dev.class = &ib_class;
coredev->dev.groups = dev->groups;
+
+ /*
+ * Don't expose hw counters outside of the init namespace.
+ */
+ if (!is_full_dev && dev->hw_stats_attr_index)
+ coredev->dev.groups[dev->hw_stats_attr_index] = NULL;
+
device_initialize(&coredev->dev);
coredev->owner = dev;
INIT_LIST_HEAD(&coredev->port_list);
@@ -562,6 +584,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
/**
* _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
+ * @net: network namespace the device should be located in; the namespace
+ * must stay valid until ib_register_device() is completed.
*
* Low-level drivers should use ib_alloc_device() to allocate &struct
* ib_device. @size is the size of the structure to be allocated,
@@ -569,7 +593,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
* ib_dealloc_device() must be used to free structures allocated with
* ib_alloc_device().
*/
-struct ib_device *_ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size, struct net *net)
{
struct ib_device *device;
unsigned int i;
@@ -586,7 +610,15 @@ struct ib_device *_ib_alloc_device(size_t size)
return NULL;
}
- rdma_init_coredev(&device->coredev, device, &init_net);
+ /* ib_devices_shared_netns can't change while we have active namespaces
+ * in the system, which means either init_net is passed or the user has
+ * no idea what they are doing.
+ *
+ * To avoid breaking backward compatibility, when in shared mode,
+ * force to init the device in the init_net.
+ */
+ net = ib_devices_shared_netns ? &init_net : net;
+ rdma_init_coredev(&device->coredev, device, net);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->qp_open_list_lock);
@@ -607,6 +639,8 @@ struct ib_device *_ib_alloc_device(size_t size)
for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
INIT_LIST_HEAD(&device->cq_pools[i]);
+ rwlock_init(&device->cache_lock);
+
device->uverbs_cmd_mask =
BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) |
BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
@@ -638,6 +672,11 @@ struct ib_device *_ib_alloc_device(size_t size)
BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);
+
+ mutex_init(&device->subdev_lock);
+ INIT_LIST_HEAD(&device->subdev_list_head);
+ INIT_LIST_HEAD(&device->subdev_list);
+
return device;
}
EXPORT_SYMBOL(_ib_alloc_device);
@@ -801,7 +840,7 @@ static int alloc_port_data(struct ib_device *device)
* empty slots at the beginning.
*/
pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
- rdma_end_port(device) + 1),
+ size_add(rdma_end_port(device), 1)),
GFP_KERNEL);
if (!pdata_rcu)
return -ENOMEM;
@@ -1214,7 +1253,7 @@ static int assign_name(struct ib_device *device, const char *name)
ret = -ENFILE;
goto out;
}
- strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+ strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
&last_id, GFP_KERNEL);
@@ -1342,6 +1381,37 @@ static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}
+static void ib_device_notify_register(struct ib_device *device)
+{
+ struct net_device *netdev;
+ u32 port;
+ int ret;
+
+ down_read(&devices_rwsem);
+
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
+ if (ret)
+ goto out;
+
+ rdma_for_each_port(device, port) {
+ netdev = ib_device_get_netdev(device, port);
+ if (!netdev)
+ continue;
+
+ ret = rdma_nl_notify_event(device, port,
+ RDMA_NETDEV_ATTACH_EVENT);
+ dev_put(netdev);
+ if (ret)
+ goto out;
+ }
+
+out:
+ up_read(&devices_rwsem);
+}
+
/**
* ib_register_device - Register an IB device with IB core
* @device: Device to register
@@ -1438,8 +1508,9 @@ int ib_register_device(struct ib_device *device, const char *name,
return ret;
}
dev_set_uevent_suppress(&device->dev, false);
- /* Mark for userspace that device is ready */
- kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ib_device_notify_register(device);
+
ib_device_put(device);
return 0;
@@ -1458,9 +1529,21 @@ EXPORT_SYMBOL(ib_register_device);
/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
+ struct ib_device *sub, *tmp;
+
+ mutex_lock(&ib_dev->subdev_lock);
+ list_for_each_entry_safe_reverse(sub, tmp,
+ &ib_dev->subdev_list_head,
+ subdev_list) {
+ list_del(&sub->subdev_list);
+ ib_dev->ops.del_sub_dev(sub);
+ ib_device_put(ib_dev);
+ }
+ mutex_unlock(&ib_dev->subdev_lock);
+
/*
* We have a registration lock so that all the calls to unregister are
- * fully fenced, once any unregister returns the device is truely
+ * fully fenced, once any unregister returns the device is truly
* unregistered even if multiple callers are unregistering it at the
* same time. This also interacts with the registration flow and
* provides sane semantics if register and unregister are racing.
@@ -1470,6 +1553,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
goto out;
disable_device(ib_dev);
+ rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);
/* Expedite removing unregistered pointers from the hash table */
free_netdevs(ib_dev);
@@ -1600,7 +1684,7 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
WARN_ON(!refcount_read(&ib_dev->refcount));
WARN_ON(!ib_dev->ops.dealloc_driver);
get_device(&ib_dev->dev);
- if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+ if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);
@@ -1727,7 +1811,7 @@ static int assign_client_id(struct ib_client *client)
{
int ret;
- down_write(&clients_rwsem);
+ lockdep_assert_held(&clients_rwsem);
/*
* The add/remove callbacks must be called in FIFO/LIFO order. To
* achieve this we assign client_ids so they are sorted in
@@ -1736,14 +1820,11 @@ static int assign_client_id(struct ib_client *client)
client->client_id = highest_client_id;
ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
if (ret)
- goto out;
+ return ret;
highest_client_id++;
xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
-
-out:
- up_write(&clients_rwsem);
- return ret;
+ return 0;
}
static void remove_client_id(struct ib_client *client)
@@ -1773,25 +1854,35 @@ int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
unsigned long index;
+ bool need_unreg = false;
int ret;
refcount_set(&client->uses, 1);
init_completion(&client->uses_zero);
+
+ /*
+ * The devices_rwsem is held in write mode to ensure that a racing
+ * ib_register_device() sees a consistent view of clients and devices.
+ */
+ down_write(&devices_rwsem);
+ down_write(&clients_rwsem);
ret = assign_client_id(client);
if (ret)
- return ret;
+ goto out;
- down_read(&devices_rwsem);
+ need_unreg = true;
xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
ret = add_client_context(device, client);
- if (ret) {
- up_read(&devices_rwsem);
- ib_unregister_client(client);
- return ret;
- }
+ if (ret)
+ goto out;
}
- up_read(&devices_rwsem);
- return 0;
+ ret = 0;
+out:
+ up_write(&clients_rwsem);
+ up_write(&devices_rwsem);
+ if (need_unreg && ret)
+ ib_unregister_client(client);
+ return ret;
}
EXPORT_SYMBOL(ib_register_client);
@@ -2050,7 +2141,6 @@ static int __ib_query_port(struct ib_device *device,
u32 port_num,
struct ib_port_attr *port_attr)
{
- union ib_gid gid = {};
int err;
memset(port_attr, 0, sizeof(*port_attr));
@@ -2063,11 +2153,8 @@ static int __ib_query_port(struct ib_device *device,
IB_LINK_LAYER_INFINIBAND)
return 0;
- err = device->ops.query_gid(device, port_num, 0, &gid);
- if (err)
- return err;
-
- port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
+ ib_get_cached_subnet_prefix(device, port_num,
+ &port_attr->subnet_prefix);
return 0;
}
@@ -2135,11 +2222,15 @@ static void add_ndev_hash(struct ib_port_data *pdata)
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
u32 port)
{
+ enum rdma_nl_notify_event_type etype;
struct net_device *old_ndev;
struct ib_port_data *pdata;
unsigned long flags;
int ret;
+ if (!rdma_is_port_valid(ib_dev, port))
+ return -EINVAL;
+
/*
* Drivers wish to call this before ib_register_driver, so we have to
* setup the port data early.
@@ -2148,9 +2239,6 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
if (ret)
return ret;
- if (!rdma_is_port_valid(ib_dev, port))
- return -EINVAL;
-
pdata = &ib_dev->port_data[port];
spin_lock_irqsave(&pdata->netdev_lock, flags);
old_ndev = rcu_dereference_protected(
@@ -2160,14 +2248,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
return 0;
}
- if (ndev)
- dev_hold(ndev);
rcu_assign_pointer(pdata->netdev, ndev);
+ netdev_put(old_ndev, &pdata->netdev_tracker);
+ netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC);
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
add_ndev_hash(pdata);
- if (old_ndev)
- dev_put(old_ndev);
+
+ /* Make sure that the device is registered before we send events */
+ if (xa_load(&devices, ib_dev->index) != ib_dev)
+ return 0;
+
+ etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
+ rdma_nl_notify_event(ib_dev, port, etype);
return 0;
}
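
The switch from dev_hold()/dev_put() to netdev_hold()/netdev_put() attaches a reference tracker to the port's netdev reference; both tracked helpers accept a NULL netdev, which is why the explicit NULL checks disappear. The generic pattern, as a sketch:

    /* Tracked netdev references; both calls are NULL-safe. */
    netdevice_tracker tracker;

    netdev_hold(ndev, &tracker, GFP_ATOMIC);        /* take tracked ref */
    /* ... use ndev ... */
    netdev_put(ndev, &tracker);                     /* release tracked ref */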
@@ -2200,7 +2293,7 @@ static void free_netdevs(struct ib_device *ib_dev)
* comparisons after the put
*/
rcu_assign_pointer(pdata->netdev, NULL);
- dev_put(ndev);
+ netdev_put(ndev, &pdata->netdev_tracker);
}
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
}
@@ -2215,6 +2308,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
if (!rdma_is_port_valid(ib_dev, port))
return NULL;
+ if (!ib_dev->port_data)
+ return NULL;
+
pdata = &ib_dev->port_data[port];
/*
@@ -2227,22 +2323,40 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
spin_lock(&pdata->netdev_lock);
res = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
- if (res)
- dev_hold(res);
+ dev_hold(res);
spin_unlock(&pdata->netdev_lock);
}
- /*
- * If we are starting to unregister expedite things by preventing
- * propagation of an unregistering netdev.
- */
- if (res && res->reg_state != NETREG_REGISTERED) {
- dev_put(res);
- return NULL;
+ return res;
+}
+EXPORT_SYMBOL(ib_device_get_netdev);
+
+/**
+ * ib_query_netdev_port - Query the port number of a net_device
+ * associated with an ibdev
+ * @ibdev: IB device
+ * @ndev: Network device
+ * @port: IB port the net_device is connected to
+ */
+int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
+ u32 *port)
+{
+ struct net_device *ib_ndev;
+ u32 port_num;
+
+ rdma_for_each_port(ibdev, port_num) {
+ ib_ndev = ib_device_get_netdev(ibdev, port_num);
+ if (ndev == ib_ndev) {
+ *port = port_num;
+ dev_put(ib_ndev);
+ return 0;
+ }
+ dev_put(ib_ndev);
}
- return res;
+ return -ENOENT;
}
+EXPORT_SYMBOL(ib_query_netdev_port);
/**
* ib_device_get_by_netdev - Find an IB device associated with a netdev
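
ib_query_netdev_port() is the inverse of ib_device_get_netdev(): given a netdev, it reports the IB port that netdev is attached to. An illustrative caller (assumed context), mirroring the notifier code later in this file:

    /* Sketch: resolve a netdev to its IB port before dispatching. */
    u32 port;

    if (ib_query_netdev_port(ibdev, ndev, &port))
            return;         /* ndev is not attached to any port of ibdev */
    /* ... act on (ibdev, port) ... */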
@@ -2303,9 +2417,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);
-
- if (idev)
- dev_put(idev);
+ dev_put(idev);
}
}
@@ -2463,7 +2575,8 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
++i) {
ret = rdma_query_gid(device, port, i, &tmp_gid);
if (ret)
- return ret;
+ continue;
+
if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
*port_num = port;
if (index)
@@ -2592,8 +2705,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
ops->uverbs_no_driver_id_binding;
SET_DEVICE_OP(dev_ops, add_gid);
+ SET_DEVICE_OP(dev_ops, add_sub_dev);
SET_DEVICE_OP(dev_ops, advise_mr);
SET_DEVICE_OP(dev_ops, alloc_dm);
+ SET_DEVICE_OP(dev_ops, alloc_dmah);
SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
SET_DEVICE_OP(dev_ops, alloc_mr);
@@ -2608,25 +2723,28 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, counter_alloc_stats);
SET_DEVICE_OP(dev_ops, counter_bind_qp);
SET_DEVICE_OP(dev_ops, counter_dealloc);
+ SET_DEVICE_OP(dev_ops, counter_init);
SET_DEVICE_OP(dev_ops, counter_unbind_qp);
SET_DEVICE_OP(dev_ops, counter_update_stats);
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
+ SET_DEVICE_OP(dev_ops, create_cq_umem);
SET_DEVICE_OP(dev_ops, create_flow);
- SET_DEVICE_OP(dev_ops, create_flow_action_esp);
SET_DEVICE_OP(dev_ops, create_qp);
SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
SET_DEVICE_OP(dev_ops, create_srq);
SET_DEVICE_OP(dev_ops, create_user_ah);
SET_DEVICE_OP(dev_ops, create_wq);
SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_dmah);
SET_DEVICE_OP(dev_ops, dealloc_driver);
SET_DEVICE_OP(dev_ops, dealloc_mw);
SET_DEVICE_OP(dev_ops, dealloc_pd);
SET_DEVICE_OP(dev_ops, dealloc_ucontext);
SET_DEVICE_OP(dev_ops, dealloc_xrcd);
SET_DEVICE_OP(dev_ops, del_gid);
+ SET_DEVICE_OP(dev_ops, del_sub_dev);
SET_DEVICE_OP(dev_ops, dereg_mr);
SET_DEVICE_OP(dev_ops, destroy_ah);
SET_DEVICE_OP(dev_ops, destroy_counters);
@@ -2650,12 +2768,15 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
+ SET_DEVICE_OP(dev_ops, fill_res_srq_entry);
+ SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw);
SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
SET_DEVICE_OP(dev_ops, get_dev_fw_str);
SET_DEVICE_OP(dev_ops, get_dma_mr);
SET_DEVICE_OP(dev_ops, get_hw_stats);
SET_DEVICE_OP(dev_ops, get_link_layer);
SET_DEVICE_OP(dev_ops, get_netdev);
+ SET_DEVICE_OP(dev_ops, get_numa_node);
SET_DEVICE_OP(dev_ops, get_port_immutable);
SET_DEVICE_OP(dev_ops, get_vector_affinity);
SET_DEVICE_OP(dev_ops, get_vf_config);
@@ -2676,14 +2797,16 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, modify_ah);
SET_DEVICE_OP(dev_ops, modify_cq);
SET_DEVICE_OP(dev_ops, modify_device);
- SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
+ SET_DEVICE_OP(dev_ops, modify_hw_stat);
SET_DEVICE_OP(dev_ops, modify_port);
SET_DEVICE_OP(dev_ops, modify_qp);
SET_DEVICE_OP(dev_ops, modify_srq);
SET_DEVICE_OP(dev_ops, modify_wq);
SET_DEVICE_OP(dev_ops, peek_cq);
+ SET_DEVICE_OP(dev_ops, pre_destroy_cq);
SET_DEVICE_OP(dev_ops, poll_cq);
SET_DEVICE_OP(dev_ops, port_groups);
+ SET_DEVICE_OP(dev_ops, post_destroy_cq);
SET_DEVICE_OP(dev_ops, post_recv);
SET_DEVICE_OP(dev_ops, post_send);
SET_DEVICE_OP(dev_ops, post_srq_recv);
@@ -2706,19 +2829,73 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, resize_cq);
SET_DEVICE_OP(dev_ops, set_vf_guid);
SET_DEVICE_OP(dev_ops, set_vf_link_state);
+ SET_DEVICE_OP(dev_ops, ufile_hw_cleanup);
+ SET_DEVICE_OP(dev_ops, report_port_event);
SET_OBJ_SIZE(dev_ops, ib_ah);
SET_OBJ_SIZE(dev_ops, ib_counters);
SET_OBJ_SIZE(dev_ops, ib_cq);
+ SET_OBJ_SIZE(dev_ops, ib_dmah);
SET_OBJ_SIZE(dev_ops, ib_mw);
SET_OBJ_SIZE(dev_ops, ib_pd);
+ SET_OBJ_SIZE(dev_ops, ib_qp);
SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
SET_OBJ_SIZE(dev_ops, ib_srq);
SET_OBJ_SIZE(dev_ops, ib_ucontext);
SET_OBJ_SIZE(dev_ops, ib_xrcd);
+ SET_OBJ_SIZE(dev_ops, rdma_counter);
}
EXPORT_SYMBOL(ib_set_device_ops);
+int ib_add_sub_device(struct ib_device *parent,
+ enum rdma_nl_dev_type type,
+ const char *name)
+{
+ struct ib_device *sub;
+ int ret = 0;
+
+ if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev)
+ return -EOPNOTSUPP;
+
+ if (!ib_device_try_get(parent))
+ return -EINVAL;
+
+ sub = parent->ops.add_sub_dev(parent, type, name);
+ if (IS_ERR(sub)) {
+ ib_device_put(parent);
+ return PTR_ERR(sub);
+ }
+
+ sub->type = type;
+ sub->parent = parent;
+
+ mutex_lock(&parent->subdev_lock);
+ list_add_tail(&parent->subdev_list_head, &sub->subdev_list);
+ mutex_unlock(&parent->subdev_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_add_sub_device);
+
+int ib_del_sub_device_and_put(struct ib_device *sub)
+{
+ struct ib_device *parent = sub->parent;
+
+ if (!parent)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&parent->subdev_lock);
+ list_del(&sub->subdev_list);
+ mutex_unlock(&parent->subdev_lock);
+
+ ib_device_put(sub);
+ parent->ops.del_sub_dev(sub);
+ ib_device_put(parent);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_del_sub_device_and_put);
+
#ifdef CONFIG_INFINIBAND_VIRT_DMA
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
{
@@ -2749,29 +2926,121 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
},
};
+void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev)
+{
+ enum ib_port_state curr_state;
+ struct ib_event ibevent = {};
+ u32 port;
+
+ if (ib_query_netdev_port(ibdev, ndev, &port))
+ return;
+
+ curr_state = ib_get_curr_port_state(ndev);
+
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port;
+ ib_dispatch_event(&ibevent);
+}
+EXPORT_SYMBOL(ib_dispatch_port_state_event);
+
+static void handle_port_event(struct net_device *ndev, unsigned long event)
+{
+ struct ib_device *ibdev;
+
+ /* Currently, link events in bonding scenarios are still
+ * reported by drivers that support bonding.
+ */
+ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev))
+ return;
+
+ ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
+ if (!ibdev)
+ return;
+
+ if (ibdev->ops.report_port_event) {
+ ibdev->ops.report_port_event(ibdev, ndev, event);
+ goto put_ibdev;
+ }
+
+ ib_dispatch_port_state_event(ibdev, ndev);
+
+put_ibdev:
+ ib_device_put(ibdev);
+};
+
+static int ib_netdevice_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct ib_device *ibdev;
+ u32 port;
+
+ switch (event) {
+ case NETDEV_CHANGENAME:
+ ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
+ if (!ibdev)
+ return NOTIFY_DONE;
+
+ if (ib_query_netdev_port(ibdev, ndev, &port)) {
+ ib_device_put(ibdev);
+ break;
+ }
+
+ rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT);
+ ib_device_put(ibdev);
+ break;
+
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ case NETDEV_DOWN:
+ handle_port_event(ndev, event);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = ib_netdevice_event,
+};
+
static int __init ib_core_init(void)
{
- int ret;
+ int ret = -ENOMEM;
- ib_wq = alloc_workqueue("infiniband", 0, 0);
+ ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
if (!ib_wq)
return -ENOMEM;
- ib_comp_wq = alloc_workqueue("ib-comp-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
- if (!ib_comp_wq) {
- ret = -ENOMEM;
+ ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ if (!ib_unreg_wq)
goto err;
- }
+
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
+ if (!ib_comp_wq)
+ goto err_unbound;
ib_comp_unbound_wq =
alloc_workqueue("ib-comp-unb-wq",
WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
- if (!ib_comp_unbound_wq) {
- ret = -ENOMEM;
+ if (!ib_comp_unbound_wq)
goto err_comp;
- }
ret = class_register(&ib_class);
if (ret) {
@@ -2813,10 +3082,20 @@ static int __init ib_core_init(void)
nldev_init();
rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
- roce_gid_mgmt_init();
+ ret = roce_gid_mgmt_init();
+ if (ret) {
+ pr_warn("Couldn't init RoCE GID management\n");
+ goto err_parent;
+ }
+
+ register_netdevice_notifier(&nb_netdevice);
return 0;
+err_parent:
+ rdma_nl_unregister(RDMA_NL_LS);
+ nldev_exit();
+ unregister_pernet_device(&rdma_dev_net_ops);
err_compat:
unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
err_sa:
@@ -2831,6 +3110,8 @@ err_comp_unbound:
destroy_workqueue(ib_comp_unbound_wq);
err_comp:
destroy_workqueue(ib_comp_wq);
+err_unbound:
+ destroy_workqueue(ib_unreg_wq);
err:
destroy_workqueue(ib_wq);
return ret;
@@ -2838,9 +3119,10 @@ err:
static void __exit ib_core_cleanup(void)
{
+ unregister_netdevice_notifier(&nb_netdevice);
roce_gid_mgmt_cleanup();
- nldev_exit();
rdma_nl_unregister(RDMA_NL_LS);
+ nldev_exit();
unregister_pernet_device(&rdma_dev_net_ops);
unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
ib_sa_cleanup();
@@ -2852,7 +3134,7 @@ static void __exit ib_core_cleanup(void)
destroy_workqueue(ib_comp_wq);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
- flush_workqueue(system_unbound_wq);
+ destroy_workqueue(ib_unreg_wq);
WARN_ON(!xa_empty(&clients));
WARN_ON(!xa_empty(&devices));
}
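
Unregistration work now runs on the dedicated ib_unreg_wq rather than system_unbound_wq, so the exit path's destroy_workqueue() doubles as a barrier guaranteeing no unregistration is still in flight. The lifecycle pattern in isolation (generic, assumed names):

    /* Sketch: a private unbound workqueue whose destruction waits for
     * all queued work; "my_wq" is a placeholder.
     */
    static struct workqueue_struct *my_wq;

    static int __init my_init(void)
    {
            my_wq = alloc_workqueue("my-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE);
            return my_wq ? 0 : -ENOMEM;
    }

    static void __exit my_exit(void)
    {
            destroy_workqueue(my_wq);       /* flushes, then frees */
    }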
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 42261152b489..62410578dec3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -109,9 +109,10 @@ static struct ctl_table iwcm_ctl_table[] = {
.data = &default_backlog,
.maxlen = sizeof(default_backlog),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
- { }
};
/*
@@ -144,8 +145,8 @@ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
if (list_empty(&cm_id_priv->work_free_list))
return NULL;
- work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
- free_list);
+ work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work,
+ free_list);
list_del_init(&work->free_list);
return work;
}
@@ -207,17 +208,17 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
/*
* Release a reference on cm_id. If the last reference is being
- * released, free the cm_id and return 1.
+ * released, free the cm_id and return 'true'.
*/
-static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
if (refcount_dec_and_test(&cm_id_priv->refcount)) {
BUG_ON(!list_empty(&cm_id_priv->work_list));
free_cm_id(cm_id_priv);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static void add_ref(struct iw_cm_id *cm_id)
@@ -367,8 +368,7 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
+ * Clean up all resources associated with the connection.
*/
static void destroy_cm_id(struct iw_cm_id *cm_id)
{
@@ -439,19 +439,22 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
-
- (void)iwcm_deref_id(cm_id_priv);
}
/*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
destroy_cm_id(cm_id);
+ if (refcount_read(&cm_id_priv->refcount) > 1)
+ flush_workqueue(iwcm_wq);
+ iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -1018,30 +1021,27 @@ static void cm_work_handler(struct work_struct *_work)
struct iw_cm_event levent;
struct iwcm_id_private *cm_id_priv = work->cm_id;
unsigned long flags;
- int empty;
int ret = 0;
spin_lock_irqsave(&cm_id_priv->lock, flags);
- empty = list_empty(&cm_id_priv->work_list);
- while (!empty) {
- work = list_entry(cm_id_priv->work_list.next,
- struct iwcm_work, list);
+ while (!list_empty(&cm_id_priv->work_list)) {
+ work = list_first_entry(&cm_id_priv->work_list,
+ struct iwcm_work, list);
list_del_init(&work->list);
- empty = list_empty(&cm_id_priv->work_list);
levent = work->event;
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
- if (ret)
+ if (ret) {
destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
return;
- if (empty)
- return;
spin_lock_irqsave(&cm_id_priv->lock, flags);
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -1094,11 +1094,8 @@ static int cm_event_handler(struct iw_cm_id *cm_id,
}
refcount_inc(&cm_id_priv->refcount);
- if (list_empty(&cm_id_priv->work_list)) {
- list_add_tail(&work->list, &cm_id_priv->work_list);
- queue_work(iwcm_wq, &work->work);
- } else
- list_add_tail(&work->list, &cm_id_priv->work_list);
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
@@ -1186,29 +1183,34 @@ static int __init iw_cm_init(void)
ret = iwpm_init(RDMA_NL_IWCM);
if (ret)
- pr_err("iw_cm: couldn't init iwpm\n");
- else
- rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table);
- iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0);
+ return ret;
+
+ iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
if (!iwcm_wq)
- return -ENOMEM;
+ goto err_alloc;
iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
iwcm_ctl_table);
if (!iwcm_ctl_table_hdr) {
pr_err("iw_cm: couldn't register sysctl paths\n");
- destroy_workqueue(iwcm_wq);
- return -ENOMEM;
+ goto err_sysctl;
}
+ rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table);
return 0;
+
+err_sysctl:
+ destroy_workqueue(iwcm_wq);
+err_alloc:
+ iwpm_exit(RDMA_NL_IWCM);
+ return -ENOMEM;
}
static void __exit iw_cm_cleanup(void)
{
+ rdma_nl_unregister(RDMA_NL_IWCM);
unregister_net_sysctl_table(iwcm_ctl_table_hdr);
destroy_workqueue(iwcm_wq);
- rdma_nl_unregister(RDMA_NL_IWCM);
iwpm_exit(RDMA_NL_IWCM);
}
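
The sysctl change swaps proc_dointvec for proc_dointvec_minmax with SYSCTL_ZERO/SYSCTL_INT_MAX bounds, so a negative backlog can no longer be written. The clamping pattern on its own (generic table; "my_value" is a placeholder):

    /* Sketch: clamp an int sysctl to [0, INT_MAX] at write time. */
    static int my_value;

    static struct ctl_table my_table[] = {
            {
                    .procname       = "my_value",
                    .data           = &my_value,
                    .maxlen         = sizeof(my_value),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_minmax,
                    .extra1         = SYSCTL_ZERO,
                    .extra2         = SYSCTL_INT_MAX,
            },
    };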
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 12a9816fc0e2..3c9a9869212b 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -69,10 +69,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
const char *err_str = "";
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client)) {
- err_str = "Invalid port mapper client";
- goto pid_query_error;
- }
if (iwpm_check_registration(nl_client, IWPM_REG_VALID) ||
iwpm_user_pid == IWPM_PID_UNAVAILABLE)
return 0;
@@ -153,10 +149,6 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
const char *err_str = "";
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client)) {
- err_str = "Invalid port mapper client";
- goto add_mapping_error;
- }
if (!iwpm_valid_pid())
return 0;
if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
@@ -240,10 +232,6 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
const char *err_str = "";
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client)) {
- err_str = "Invalid port mapper client";
- goto query_mapping_error;
- }
if (!iwpm_valid_pid())
return 0;
if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
@@ -331,10 +319,6 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
const char *err_str = "";
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client)) {
- err_str = "Invalid port mapper client";
- goto remove_mapping_error;
- }
if (!iwpm_valid_pid())
return 0;
if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) {
@@ -444,8 +428,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
__func__, iwpm_user_pid);
- if (iwpm_valid_client(nl_client))
- iwpm_set_registration(nl_client, IWPM_REG_VALID);
+ iwpm_set_registration(nl_client, IWPM_REG_VALID);
register_pid_response_exit:
nlmsg_request->request_done = 1;
/* always for found nlmsg_request */
@@ -649,11 +632,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
return ret;
nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
- if (!iwpm_valid_client(nl_client)) {
- pr_info("%s: Invalid port mapper client = %u\n",
- __func__, nl_client);
- return ret;
- }
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
local_sockaddr = (struct sockaddr_storage *)
@@ -736,11 +714,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
return ret;
}
nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
- if (!iwpm_valid_client(nl_client)) {
- pr_info("%s: Invalid port mapper client = %u\n",
- __func__, nl_client);
- return ret;
- }
iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
iwpm_user_pid = cb->nlh->nlmsg_pid;
@@ -863,11 +836,6 @@ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
}
abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
- if (!iwpm_valid_client(nl_client)) {
- pr_info("%s: Invalid port mapper client = %u\n",
- __func__, nl_client);
- return ret;
- }
iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index 3f8c019c7260..eecb369898f5 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -48,7 +48,6 @@ static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
static struct hlist_head *iwpm_reminfo_bucket;
static DEFINE_SPINLOCK(iwpm_reminfo_lock);
-static DEFINE_MUTEX(iwpm_admin_lock);
static struct iwpm_admin_data iwpm_admin;
/**
@@ -59,39 +58,21 @@ static struct iwpm_admin_data iwpm_admin;
*/
int iwpm_init(u8 nl_client)
{
- int ret = 0;
- mutex_lock(&iwpm_admin_lock);
- if (!refcount_read(&iwpm_admin.refcount)) {
- iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE,
- sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!iwpm_hash_bucket) {
- ret = -ENOMEM;
- goto init_exit;
- }
- iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE,
- sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!iwpm_reminfo_bucket) {
- kfree(iwpm_hash_bucket);
- ret = -ENOMEM;
- goto init_exit;
- }
+ iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE,
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_hash_bucket)
+ return -ENOMEM;
- refcount_set(&iwpm_admin.refcount, 1);
- } else {
- refcount_inc(&iwpm_admin.refcount);
+ iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE,
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_reminfo_bucket) {
+ kfree(iwpm_hash_bucket);
+ return -ENOMEM;
}
-init_exit:
- mutex_unlock(&iwpm_admin_lock);
- if (!ret) {
- iwpm_set_valid(nl_client, 1);
- iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
- pr_debug("%s: Mapinfo and reminfo tables are created\n",
- __func__);
- }
- return ret;
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
+ pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__);
+ return 0;
}
static void free_hash_bucket(void);
@@ -105,22 +86,9 @@ static void free_reminfo_bucket(void);
*/
int iwpm_exit(u8 nl_client)
{
-
- if (!iwpm_valid_client(nl_client))
- return -EINVAL;
- mutex_lock(&iwpm_admin_lock);
- if (!refcount_read(&iwpm_admin.refcount)) {
- mutex_unlock(&iwpm_admin_lock);
- pr_err("%s Incorrect usage - negative refcount\n", __func__);
- return -EINVAL;
- }
- if (refcount_dec_and_test(&iwpm_admin.refcount)) {
- free_hash_bucket();
- free_reminfo_bucket();
- pr_debug("%s: Resources are destroyed\n", __func__);
- }
- mutex_unlock(&iwpm_admin_lock);
- iwpm_set_valid(nl_client, 0);
+ free_hash_bucket();
+ free_reminfo_bucket();
+ pr_debug("%s: Resources are destroyed\n", __func__);
iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
return 0;
}
@@ -145,8 +113,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
unsigned long flags;
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client))
- return ret;
map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL);
if (!map_info)
return -ENOMEM;
@@ -306,10 +272,6 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
unsigned long flags;
int ret = -EINVAL;
- if (!iwpm_valid_client(nl_client)) {
- pr_info("%s: Invalid client = %u\n", __func__, nl_client);
- return ret;
- }
spin_lock_irqsave(&iwpm_reminfo_lock, flags);
if (iwpm_reminfo_bucket) {
hash_bucket_head = get_reminfo_hash_bucket(
@@ -345,7 +307,7 @@ get_remote_info_exit:
struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
u8 nl_client, gfp_t gfp)
{
- struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request;
unsigned long flags;
nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp);
@@ -424,16 +386,6 @@ int iwpm_get_nlmsg_seq(void)
return atomic_inc_return(&iwpm_admin.nlmsg_seq);
}
-int iwpm_valid_client(u8 nl_client)
-{
- return iwpm_admin.client_list[nl_client];
-}
-
-void iwpm_set_valid(u8 nl_client, int valid)
-{
- iwpm_admin.client_list[nl_client] = valid;
-}
-
/* valid client */
u32 iwpm_get_registration(u8 nl_client)
{
@@ -810,7 +762,7 @@ int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
{
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
- const char *err_str = "";
+ const char *err_str;
int ret = -EINVAL;
skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index e201835de733..d6fc8402158a 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -33,7 +33,6 @@
#ifndef _IWPM_UTIL_H
#define _IWPM_UTIL_H
-#include <linux/module.h>
#include <linux/io.h>
#include <linux/in.h>
#include <linux/in6.h>
@@ -90,9 +89,7 @@ struct iwpm_remote_info {
};
struct iwpm_admin_data {
- refcount_t refcount;
atomic_t nlmsg_seq;
- int client_list[RDMA_NL_NUM_CLIENTS];
u32 reg_list[RDMA_NL_NUM_CLIENTS];
};
@@ -148,22 +145,6 @@ int iwpm_get_nlmsg_seq(void);
void iwpm_add_remote_info(struct iwpm_remote_info *reminfo);
/**
- * iwpm_valid_client - Check if the port mapper client is valid
- * @nl_client: The index of the netlink client
- *
- * Valid clients need to call iwpm_init() before using
- * the port mapper
- */
-int iwpm_valid_client(u8 nl_client);
-
-/**
- * iwpm_set_valid - Set the port mapper client to valid or not
- * @nl_client: The index of the netlink client
- * @valid: 1 if valid or 0 if invalid
- */
-void iwpm_set_valid(u8 nl_client, int valid);
-
-/**
* iwpm_check_registration - Check if the client registration
* matches the given one
* @nl_client: The index of the netlink client
diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c
index 7063e41eaf26..8fd80adfe833 100644
--- a/drivers/infiniband/core/lag.c
+++ b/drivers/infiniband/core/lag.c
@@ -7,8 +7,7 @@
#include <rdma/ib_cache.h>
#include <rdma/lag.h>
-static struct sk_buff *rdma_build_skb(struct ib_device *device,
- struct net_device *netdev,
+static struct sk_buff *rdma_build_skb(struct net_device *netdev,
struct rdma_ah_attr *ah_attr,
gfp_t flags)
{
@@ -86,7 +85,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
struct net_device *slave;
struct sk_buff *skb;
- skb = rdma_build_skb(device, master, ah_attr, flags);
+ skb = rdma_build_skb(master, ah_attr, flags);
if (!skb)
return ERR_PTR(-ENOMEM);
@@ -94,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
slave = netdev_get_xmit_slave(master, skb,
!!(device->lag_flags &
RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
- if (slave)
- dev_hold(slave);
+ dev_hold(slave);
rcu_read_unlock();
kfree_skb(skb);
return slave;
@@ -103,8 +101,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave)
{
- if (xmit_slave)
- dev_put(xmit_slave);
+ dev_put(xmit_slave);
}
struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device,
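
Both lag.c hunks lean on dev_hold() and dev_put() being NULL-safe (each checks its argument internally), so the call-site NULL tests were redundant:

    /* Sketch: the pair is safe even when no xmit slave was found. */
    dev_hold(slave);        /* no-op when slave == NULL */
    /* ... transmit via slave ... */
    dev_put(slave);         /* likewise a no-op for NULL */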
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 1893aa613ad7..8f26bfb69586 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -59,9 +59,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
struct ib_mad_qp_info *qp_info,
struct trace_event_raw_ib_mad_send_template *entry)
{
- u16 pkey;
- struct ib_device *dev = qp_info->port_priv->device;
- u32 pnum = qp_info->port_priv->port_num;
struct ib_ud_wr *wr = &mad_send_wr->send_wr;
struct rdma_ah_attr attr = {};
@@ -69,8 +66,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
/* These are common */
entry->sl = attr.sl;
- ib_query_pkey(dev, pnum, wr->pkey_index, &pkey);
- entry->pkey = pkey;
entry->rqpn = wr->remote_qpn;
entry->rqkey = wr->remote_qkey;
entry->dlid = rdma_ah_get_dlid(&attr);
@@ -215,6 +210,29 @@ int ib_response_mad(const struct ib_mad_hdr *hdr)
}
EXPORT_SYMBOL(ib_response_mad);
+#define SOL_FC_MAX_DEFAULT_FRAC 4
+#define SOL_FC_MAX_SA_FRAC 32
+
+static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req)
+{
+ if (!mad_reg_req)
+ /* Send-only agent */
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+
+ switch (mad_reg_req->mgmt_class) {
+ case IB_MGMT_CLASS_CM:
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ return mad_recvq_size / SOL_FC_MAX_SA_FRAC;
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) /
+ SOL_FC_MAX_DEFAULT_FRAC;
+ default:
+ return 0;
+ }
+}
+
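
For scale, a worked example of the resulting limits, assuming the default mad_recvq_size of IB_MAD_QP_RECV_SIZE (512 in mad_priv.h):

    /* Worked example with mad_recvq_size == 512 (the default): */
    int cm_max  = 512 / SOL_FC_MAX_DEFAULT_FRAC;            /* 128 */
    int sa_max  = 512 / SOL_FC_MAX_SA_FRAC;                 /*  16 */
    int smp_max = min(512, IB_MAD_QP_RECV_SIZE) /
                  SOL_FC_MAX_DEFAULT_FRAC;                  /* 128 */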
/*
* ib_register_mad_agent - Register to send/receive MADs
*
@@ -396,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
spin_lock_init(&mad_agent_priv->lock);
INIT_LIST_HEAD(&mad_agent_priv->send_list);
INIT_LIST_HEAD(&mad_agent_priv->wait_list);
- INIT_LIST_HEAD(&mad_agent_priv->done_list);
INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_LIST_HEAD(&mad_agent_priv->backlog_list);
INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
INIT_LIST_HEAD(&mad_agent_priv->local_list);
INIT_WORK(&mad_agent_priv->local_work, local_completions);
refcount_set(&mad_agent_priv->refcount, 1);
init_completion(&mad_agent_priv->comp);
+ mad_agent_priv->sol_fc_send_count = 0;
+ mad_agent_priv->sol_fc_wait_count = 0;
+ mad_agent_priv->sol_fc_max =
+ recv_handler ? get_sol_fc_max_outstanding(mad_reg_req) : 0;
ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
if (ret2) {
@@ -1060,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
return ret;
}
+static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) {
+ mad_agent_priv->sol_fc_wait_count--;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ } else {
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ }
+}
+
+static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_INIT) {
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ } else {
+ expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP,
+ IB_MAD_STATE_QUEUED);
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ }
+
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ mad_agent_priv->sol_fc_send_count++;
+ }
+}
+
+static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START,
+ IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED);
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->is_solicited_fc) {
+ mad_agent_priv->sol_fc_send_count--;
+ mad_agent_priv->sol_fc_wait_count++;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item,
+ &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(
+ list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else {
+ list_item = &mad_agent_priv->wait_list;
+ }
+
+ list_add(&mad_send_wr->agent_list, list_item);
+}
+
+static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc;
+}
+
+static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+}
+
+static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+}
+
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ switch (new_state) {
+ case IB_MAD_STATE_INIT:
+ break;
+ case IB_MAD_STATE_QUEUED:
+ handle_queued_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_SEND_START:
+ handle_send_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_WAIT_RESP:
+ handle_wait_state(mad_send_wr, mad_agent_priv);
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ return;
+ break;
+ case IB_MAD_STATE_EARLY_RESP:
+ handle_early_resp_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_CANCELED:
+ handle_canceled_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_DONE:
+ handle_done_state(mad_send_wr, mad_agent_priv);
+ break;
+ }
+
+ mad_send_wr->state = new_state;
+}
+
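
Taken together, the handlers above imply the following transition map (a sketch for orientation, not normative):

    /*
     * INIT -> SEND_START -> WAIT_RESP -> SEND_START     (retry)
     * INIT -> QUEUED     -> SEND_START                  (backlog drained)
     * WAIT_RESP -> QUEUED                               (backlogged retry)
     * SEND_START -> EARLY_RESP                          (response arrived
     *                                                    before send completion)
     * any state except DONE -> CANCELED
     * DONE is terminal
     */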
+static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ u8 mgmt_class;
+
+ if (!mad_send_wr->timeout)
+ return false;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (mad_send_wr->mad_agent_priv->agent.rmpp_version &&
+ (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return false;
+
+ mgmt_class =
+ ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class;
+ return mgmt_class == IB_MGMT_CLASS_CM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_ADM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+}
+
+static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max)
+ return false;
+
+ if (!list_empty(&mad_agent_priv->backlog_list))
+ return true;
+
+ return mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count >=
+ mad_agent_priv->sol_fc_max;
+}
+
/*
* ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
* with the registered client
@@ -1085,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret)
goto error;
- if (!send_buf->mad_agent->send_handler ||
- (send_buf->timeout_ms &&
- !send_buf->mad_agent->recv_handler)) {
+ if (!send_buf->mad_agent->send_handler) {
ret = -EINVAL;
goto error;
}
@@ -1123,15 +1317,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
send_buf->retries = 0;
- /* Reference for work request to QP + response */
- mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
- mad_send_wr->status = IB_WC_SUCCESS;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
/* Reference MAD agent until send completes */
refcount_inc(&mad_agent_priv->refcount);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_add_tail(&mad_send_wr->agent_list,
- &mad_agent_priv->send_list);
+ mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr);
+ if (mad_is_for_backlog(mad_send_wr)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+ }
+
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
@@ -1143,7 +1341,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret < 0) {
/* Fail send request */
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_del(&mad_send_wr->agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
deref_mad_agent(mad_agent_priv);
goto error;
@@ -1751,7 +1949,19 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
*/
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
+ }
+
+ list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) {
+ if ((wr->tid == mad_hdr->tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
/*
@@ -1770,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
/* Verify request has not been canceled */
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
return NULL;
}
+static void
+process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc = {};
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->backlog_list) &&
+ (mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count <
+ mad_agent_priv->sol_fc_max)) {
+ mad_send_wr = list_entry(mad_agent_priv->backlog_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ret = ib_send_mad(mad_send_wr);
+ if (ret) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ deref_mad_agent(mad_agent_priv);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = IB_WC_LOC_QP_OP_ERR;
+ mad_agent_priv->agent.send_handler(
+ &mad_agent_priv->agent, &mad_send_wc);
+ }
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
{
mad_send_wr->timeout = 0;
- if (mad_send_wr->refcount == 1)
- list_move_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->done_list);
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP ||
+ mad_send_wr->state == IB_MAD_STATE_QUEUED)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ else
+ change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP);
}
static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
@@ -1789,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_send_wc mad_send_wc;
unsigned long flags;
+ bool is_mad_done;
int ret;
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
@@ -1837,6 +2086,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
} else {
ib_mark_mad_done(mad_send_wr);
+ is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
/* Defined behavior is to complete response before request */
@@ -1846,10 +2096,13 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
mad_recv_wc);
deref_mad_agent(mad_agent_priv);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ if (is_mad_done) {
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr,
+ &mad_send_wc);
+ }
}
} else {
mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
@@ -2177,30 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wr_private *temp_mad_send_wr;
- struct list_head *list_item;
unsigned long delay;
mad_agent_priv = mad_send_wr->mad_agent_priv;
- list_del(&mad_send_wr->agent_list);
-
delay = mad_send_wr->timeout;
- mad_send_wr->timeout += jiffies;
-
- if (delay) {
- list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
- temp_mad_send_wr = list_entry(list_item,
- struct ib_mad_send_wr_private,
- agent_list);
- if (time_after(mad_send_wr->timeout,
- temp_mad_send_wr->timeout))
- break;
- }
- } else {
- list_item = &mad_agent_priv->wait_list;
- }
-
- list_add(&mad_send_wr->agent_list, list_item);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP);
/* Reschedule a work item if we have a shorter timeout */
if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
@@ -2234,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
} else
ret = IB_RMPP_RESULT_UNHANDLED;
- if (mad_send_wc->status != IB_WC_SUCCESS &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = mad_send_wc->status;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
-
- if (--mad_send_wr->refcount > 0) {
- if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- wait_for_response(mad_send_wr);
- }
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ mad_send_wc->status = IB_WC_WR_FLUSH_ERR;
+ else if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->timeout) {
+ wait_for_response(mad_send_wr);
goto done;
}
/* Remove send from MAD agent and notify client of completion */
- list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->state != IB_MAD_STATE_DONE)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
adjust_timeout(mad_agent_priv);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- if (mad_send_wr->status != IB_WC_SUCCESS)
- mad_send_wc->status = mad_send_wr->status;
- if (ret == IB_RMPP_RESULT_INTERNAL)
+ if (ret == IB_RMPP_RESULT_INTERNAL) {
ib_rmpp_send_handler(mad_send_wc);
- else
+ } else {
+ if (mad_send_wr->is_solicited_fc)
+ process_backlog_mads(mad_agent_priv);
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
mad_send_wc);
+ }
/* Release reference on agent taken when sending */
deref_mad_agent(mad_agent_priv);
@@ -2401,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
return true;
}
+static void clear_mad_error_list(struct list_head *list,
+ enum ib_wc_status wc_status,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *n;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_send_wc.status = wc_status;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, n, list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
{
unsigned long flags;
struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
- struct ib_mad_send_wc mad_send_wc;
struct list_head cancel_list;
INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &mad_agent_priv->send_list, agent_list) {
- if (mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
- }
+ &mad_agent_priv->send_list, agent_list)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
- /* Empty wait list to prevent receives from finding a request */
- list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
- /* Report all cancelled requests */
- mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
- mad_send_wc.vendor_err = 0;
+	/* Empty wait & backlog lists to prevent receives from finding a request */
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->wait_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
+ }
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &cancel_list, agent_list) {
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- list_del(&mad_send_wr->agent_list);
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- deref_mad_agent(mad_agent_priv);
+ &mad_agent_priv->backlog_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
}
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ /* Report all cancelled requests */
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
static struct ib_mad_send_wr_private*
@@ -2456,6 +2699,13 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
&mad_send_wr->send_buf == send_buf)
return mad_send_wr;
}
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
return NULL;
}
@@ -2473,16 +2723,16 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
struct ib_mad_agent_private, agent);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
- if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return -EINVAL;
}
- active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
- if (!timeout_ms) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
+ active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) ||
+ (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) ||
+ (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms));
+ if (!timeout_ms)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
mad_send_wr->send_buf.timeout_ms = timeout_ms;
if (active)
@@ -2594,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->send_buf.retries++;
mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->is_solicited_fc &&
+ !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ return 0;
+ }
if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
ret = ib_retry_rmpp(mad_send_wr);
@@ -2611,24 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
} else
ret = ib_send_mad(mad_send_wr);
- if (!ret) {
- mad_send_wr->refcount++;
- list_add_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->send_list);
- }
+ if (!ret)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+
return ret;
}
static void timeout_sends(struct work_struct *work)
{
- struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
- struct ib_mad_send_wc mad_send_wc;
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct list_head timeout_list;
+ struct list_head cancel_list;
+ struct list_head *list_item;
unsigned long flags, delay;
mad_agent_priv = container_of(work, struct ib_mad_agent_private,
timed_work.work);
- mad_send_wc.vendor_err = 0;
+ INIT_LIST_HEAD(&timeout_list);
+ INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->wait_list)) {
@@ -2646,25 +2902,22 @@ static void timeout_sends(struct work_struct *work)
break;
}
- list_del(&mad_send_wr->agent_list);
- if (mad_send_wr->status == IB_WC_SUCCESS &&
- !retry_send(mad_send_wr))
- continue;
-
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
- if (mad_send_wr->status == IB_WC_SUCCESS)
- mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ list_item = &cancel_list;
+ else if (retry_send(mad_send_wr))
+ list_item = &timeout_list;
else
- mad_send_wc.status = mad_send_wr->status;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
+ continue;
- deref_mad_agent(mad_agent_priv);
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, list_item);
}
+
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ process_backlog_mads(mad_agent_priv);
+ clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR,
+ mad_agent_priv);
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
/*
@@ -2674,11 +2927,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad)
{
unsigned long flags;
- int post, ret;
struct ib_mad_private *mad_priv;
struct ib_sge sg_list;
struct ib_recv_wr recv_wr;
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+ int ret = 0;
/* Initialize common scatter list fields */
sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
@@ -2688,7 +2941,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
recv_wr.sg_list = &sg_list;
recv_wr.num_sge = 1;
- do {
+ while (true) {
/* Allocate and map receive buffer */
if (mad) {
mad_priv = mad;
@@ -2696,10 +2949,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
} else {
mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
GFP_ATOMIC);
- if (!mad_priv) {
- ret = -ENOMEM;
- break;
- }
+ if (!mad_priv)
+ return -ENOMEM;
}
sg_list.length = mad_priv_dma_size(mad_priv);
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
@@ -2708,37 +2959,41 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
sg_list.addr))) {
- kfree(mad_priv);
ret = -ENOMEM;
- break;
+ goto free_mad_priv;
}
mad_priv->header.mapping = sg_list.addr;
mad_priv->header.mad_list.mad_queue = recv_queue;
mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
-
- /* Post receive WR */
spin_lock_irqsave(&recv_queue->lock, flags);
- post = (++recv_queue->count < recv_queue->max_active);
- list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ if (recv_queue->count >= recv_queue->max_active) {
+ /* Fully populated the receive queue */
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ break;
+ }
+ recv_queue->count++;
+ list_add_tail(&mad_priv->header.mad_list.list,
+ &recv_queue->list);
spin_unlock_irqrestore(&recv_queue->lock, flags);
+
ret = ib_post_recv(qp_info->qp, &recv_wr, NULL);
if (ret) {
spin_lock_irqsave(&recv_queue->lock, flags);
list_del(&mad_priv->header.mad_list.list);
recv_queue->count--;
spin_unlock_irqrestore(&recv_queue->lock, flags);
- ib_dma_unmap_single(qp_info->port_priv->device,
- mad_priv->header.mapping,
- mad_priv_dma_size(mad_priv),
- DMA_FROM_DEVICE);
- kfree(mad_priv);
dev_err(&qp_info->port_priv->device->dev,
"ib_post_recv failed: %d\n", ret);
break;
}
- } while (post);
+ }
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE);
+free_mad_priv:
+ kfree(mad_priv);
return ret;
}
@@ -2942,7 +3197,6 @@ static int ib_mad_port_open(struct ib_device *device,
int ret, cq_size;
struct ib_mad_port_private *port_priv;
unsigned long flags;
- char name[sizeof "ib_mad123"];
int has_smi;
if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
@@ -2988,12 +3242,15 @@ static int ib_mad_port_open(struct ib_device *device,
if (ret)
goto error6;
}
- ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
- if (ret)
- goto error7;
- snprintf(name, sizeof(name), "ib_mad%u", port_num);
- port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (rdma_cap_ib_cm(device, port_num)) {
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+ }
+
+ port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM,
+ port_num);
if (!port_priv->wq) {
ret = -ENOMEM;
goto error8;
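
With the refcount/status pair gone, cancellation and timeout changes are expressed
as state transitions. A caller-side sketch (hypothetical client code; ib_modify_mad()
is the existing exported API) of what this looks like under the new model:

	/* Sketch only: passing timeout_ms == 0 moves the send to
	 * IB_MAD_STATE_CANCELED; its completion is later reported through
	 * the agent's send_handler with IB_WC_WR_FLUSH_ERR. */
	static void example_adjust_or_cancel(struct ib_mad_send_buf *msg,
					     bool cancel)
	{
		if (cancel)
			ib_modify_mad(msg, 0);		/* cancel */
		else
			ib_modify_mad(msg, 5000);	/* new 5 s timeout */
	}
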
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 1b7445a6f671..f444357d33f4 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -95,13 +95,16 @@ struct ib_mad_agent_private {
spinlock_t lock;
struct list_head send_list;
+ unsigned int sol_fc_send_count;
struct list_head wait_list;
- struct list_head done_list;
+ unsigned int sol_fc_wait_count;
struct delayed_work timed_work;
unsigned long timeout;
struct list_head local_list;
struct work_struct local_work;
struct list_head rmpp_list;
+ unsigned int sol_fc_max;
+ struct list_head backlog_list;
refcount_t refcount;
union {
@@ -118,6 +121,32 @@ struct ib_mad_snoop_private {
struct completion comp;
};
+enum ib_mad_state {
+	/* MAD is being composed and is not yet on any list */
+	IB_MAD_STATE_INIT,
+	/* MAD is on the backlog list */
+	IB_MAD_STATE_QUEUED,
+	/*
+	 * MAD was sent to the QP and is on the send list, waiting for
+	 * its completion notification.
+	 */
+	IB_MAD_STATE_SEND_START,
+	/*
+	 * MAD send completed successfully; MAD is on the wait list,
+	 * waiting for a response.
+	 */
+	IB_MAD_STATE_WAIT_RESP,
+	/*
+	 * Response arrived early, before the send completion
+	 * notification; MAD is still on the send list.
+	 */
+	IB_MAD_STATE_EARLY_RESP,
+	/* MAD was canceled while on the wait or send list */
+	IB_MAD_STATE_CANCELED,
+	/* MAD processing completed; MAD is on no list */
+	IB_MAD_STATE_DONE
+};
+
struct ib_mad_send_wr_private {
struct ib_mad_list_head mad_list;
struct list_head agent_list;
@@ -132,8 +161,6 @@ struct ib_mad_send_wr_private {
int max_retries;
int retries_left;
int retry;
- int refcount;
- enum ib_wc_status status;
/* RMPP control */
struct list_head rmpp_list;
@@ -143,8 +170,48 @@ struct ib_mad_send_wr_private {
int seg_num;
int newwin;
int pad;
+
+ enum ib_mad_state state;
+
+ /* Solicited MAD flow control */
+ bool is_solicited_fc;
};
+static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state);
+}
+
+static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2);
+}
+
+static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2,
+ enum ib_mad_state expected_state3)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2 &&
+ mad_send_wr->state != expected_state3);
+}
+
+static inline void
+not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state wrong_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state == wrong_state);
+}
+
struct ib_mad_local_private {
struct list_head completion_list;
struct ib_mad_private *mad_priv;
@@ -222,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
unsigned long timeout_ms);
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state);
+
#endif /* __IB_MAD_PRIV_H__ */
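
change_mad_state() is declared here but implemented in mad.c (that hunk is not
shown). Purely as an illustration of the list bookkeeping the state model implies,
with the solicited flow-control counters elided, a simplified sketch could read:

	/* Illustrative sketch only, not the actual implementation.
	 * Caller must hold mad_agent_priv->lock. */
	void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
			      enum ib_mad_state new_state)
	{
		struct ib_mad_agent_private *agent =
			mad_send_wr->mad_agent_priv;

		switch (new_state) {
		case IB_MAD_STATE_SEND_START:
			list_add_tail(&mad_send_wr->agent_list,
				      &agent->send_list);
			break;
		case IB_MAD_STATE_WAIT_RESP:
			list_move_tail(&mad_send_wr->agent_list,
				       &agent->wait_list);
			break;
		case IB_MAD_STATE_QUEUED:
			list_add_tail(&mad_send_wr->agent_list,
				      &agent->backlog_list);
			break;
		case IB_MAD_STATE_DONE:
			list_del(&mad_send_wr->agent_list);
			break;
		default:
			/* INIT/EARLY_RESP/CANCELED keep the current list */
			break;
		}
		mad_send_wr->state = new_state;
	}
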
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 8af0619a39cd..1c5e0eaf1c94 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
- return (void *) ah;
+ return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
@@ -608,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent,
goto out; /* Unmatched send */
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_REM_ABORT_ERR;
- wc.vendor_err = rmpp_status;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
out:
spin_unlock_irqrestore(&agent->lock, flags);
@@ -684,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
}
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
if (seg_num > mad_send_wr->send_buf.seg_count ||
@@ -709,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
struct ib_mad_send_wc wc;
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_SUCCESS;
- wc.vendor_err = 0;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
}
- if (mad_send_wr->refcount == 1)
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
ib_reset_mad_timeout(mad_send_wr,
mad_send_wr->send_buf.timeout_ms);
spin_unlock_irqrestore(&agent->lock, flags);
ack_ds_ack(agent, mad_recv_wc);
return;
- } else if (mad_send_wr->refcount == 1 &&
+ } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP &&
mad_send_wr->seg_num < mad_send_wr->newwin &&
mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
/* Send failure will just result in a timeout/retry */
@@ -731,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
if (ret)
goto out;
- mad_send_wr->refcount++;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
list_move_tail(&mad_send_wr->agent_list,
&mad_send_wr->mad_agent_priv->send_list);
}
@@ -890,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->newwin = init_newwin(mad_send_wr);
/* We need to wait for the final ACK even if there isn't a response */
- mad_send_wr->refcount += (mad_send_wr->timeout == 0);
ret = send_next_seg(mad_send_wr);
if (!ret)
return IB_RMPP_RESULT_CONSUMED;
@@ -912,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
if (mad_send_wc->status != IB_WC_SUCCESS ||
- mad_send_wr->status != IB_WC_SUCCESS)
+ mad_send_wr->state == IB_MAD_STATE_CANCELED)
return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
if (!mad_send_wr->timeout)
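
ERR_CAST() (used in alloc_response_msg() above) is the standard idiom for
forwarding an ERR_PTR-encoded error across pointer types. A self-contained
sketch of the pattern, with a hypothetical function name:

	#include <linux/err.h>

	/* Forward the encoded errno instead of an opaque (void *) cast. */
	static struct ib_mad_send_buf *example_wrap(struct ib_ah *ah)
	{
		if (IS_ERR(ah))
			return ERR_CAST(ah);	/* keeps the errno */
		return NULL;			/* real allocation elided */
	}
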
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 1b2cc9e45ade..def14c54b648 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -75,7 +75,7 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op)
if (type >= RDMA_NL_NUM_CLIENTS)
return false;
- return (op < max_num_ops[type]) ? true : false;
+ return op < max_num_ops[type];
}
static const struct rdma_nl_cbs *
@@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet)
struct net *net = read_pnet(&rnet->net);
struct netlink_kernel_cfg cfg = {
.input = rdma_nl_rcv,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
struct sock *nls;
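
NL_CFG_F_NONROOT_RECV lets unprivileged processes listen on this socket, which
the RDMA monitor (RDMA_NLDEV_CMD_MONITOR, added in nldev.c below) relies on. A
userspace sketch, assuming the uapi headers that define NETLINK_RDMA and
RDMA_NL_GROUP_NOTIFY, of subscribing to the notify group:

	#include <linux/netlink.h>
	#include <rdma/rdma_netlink.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int open_rdma_monitor(void)
	{
		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
		int grp = RDMA_NL_GROUP_NOTIFY;
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);

		if (fd < 0)
			return -1;
		if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
		    setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
			       &grp, sizeof(grp)) < 0) {
			close(fd);
			return -1;
		}
		return fd;	/* recvmsg() now yields monitor events */
	}
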
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index e9b4b2cccaa0..2220a2dfab24 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -43,6 +43,13 @@
#include "restrack.h"
#include "uverbs.h"
+/*
+ * Controls whether a non-privileged user is allowed to specify a
+ * controlled QKEY: when true, non-privileged users may do so.
+ */
+static bool privileged_qkey;
+
typedef int (*res_fill_func_t)(struct sk_buff*, bool,
struct rdma_restrack_entry*, uint32_t);
@@ -130,6 +137,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
[RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED },
@@ -154,6 +163,15 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 },
[RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 },
[RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 },
+ [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING },
+ [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -235,6 +253,12 @@ int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value)
}
EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex);
+bool rdma_nl_get_privileged_qkey(void)
+{
+ return privileged_qkey;
+}
+EXPORT_SYMBOL(rdma_nl_get_privileged_qkey);
+
static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
{
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
@@ -282,6 +306,19 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
return -EMSGSIZE;
+ if (device->type &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type))
+ return -EMSGSIZE;
+
+ if (device->parent &&
+ nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME,
+ dev_name(&device->parent->dev)))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE,
+ device->name_assign_type))
+ return -EMSGSIZE;
+
/*
* Link type is determined on first port and mlx4 device
* which can potentially have two different link type for the same
@@ -355,8 +392,7 @@ static int fill_port_info(struct sk_buff *msg,
}
out:
- if (netdev)
- dev_put(netdev);
+ dev_put(netdev);
return ret;
}
@@ -384,7 +420,8 @@ err:
return -EMSGSIZE;
}
-static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device,
+ bool show_details)
{
static const char * const names[RDMA_RESTRACK_MAX] = {
[RDMA_RESTRACK_PD] = "pd",
@@ -409,7 +446,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
if (!names[i])
continue;
- curr = rdma_restrack_count(device, i);
+ curr = rdma_restrack_count(device, i, show_details);
ret = fill_res_info_entry(msg, names[i], curr);
if (ret)
goto err;
@@ -511,7 +548,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
/* In create_qp() port is not set yet */
if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port))
- return -EINVAL;
+ return -EMSGSIZE;
ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num);
if (ret)
@@ -550,7 +587,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_cm_id *cm_id = &id_priv->id;
if (port && port != cm_id->port_num)
- return 0;
+ return -EAGAIN;
if (cm_id->port_num &&
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
@@ -816,6 +853,7 @@ static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_srq *srq = container_of(res, struct ib_srq, res);
+ struct ib_device *dev = srq->device;
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id))
goto err;
@@ -835,12 +873,29 @@ static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin,
if (fill_res_srq_qps(msg, srq))
goto err;
- return fill_res_name_pid(msg, res);
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (dev->ops.fill_res_srq_entry)
+ return dev->ops.fill_res_srq_entry(msg, srq);
+
+ return 0;
err:
return -EMSGSIZE;
}
+static int fill_res_srq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_srq *srq = container_of(res, struct ib_srq, res);
+ struct ib_device *dev = srq->device;
+
+ if (!dev->ops.fill_res_srq_entry_raw)
+ return -EINVAL;
+ return dev->ops.fill_res_srq_entry_raw(msg, srq);
+}
+
static int fill_stat_counter_mode(struct sk_buff *msg,
struct rdma_counter *counter)
{
@@ -892,6 +947,8 @@ static int fill_stat_counter_qps(struct sk_buff *msg,
int ret = 0;
table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+ if (!table_attr)
+ return -EMSGSIZE;
rt = &counter->device->res[RDMA_RESTRACK_QP];
xa_lock(&rt->xa);
@@ -968,14 +1025,21 @@ static int fill_stat_counter_hwcounters(struct sk_buff *msg,
if (!table_attr)
return -EMSGSIZE;
- for (i = 0; i < st->num_counters; i++)
- if (rdma_nl_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+ mutex_lock(&st->lock);
+ for (i = 0; i < st->num_counters; i++) {
+ if (test_bit(i, st->is_disabled))
+ continue;
+ if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name,
+ st->value[i]))
goto err;
+ }
+ mutex_unlock(&st->lock);
nla_nest_end(msg, table_attr);
return 0;
err:
+ mutex_unlock(&st->lock);
nla_nest_cancel(msg, table_attr);
return -EMSGSIZE;
}
@@ -1012,8 +1076,8 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1032,6 +1096,10 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
0, 0);
+ if (!nlh) {
+ err = -EMSGSIZE;
+ goto err_free;
+ }
err = fill_dev_info(msg, device);
if (err)
@@ -1057,8 +1125,8 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1117,7 +1185,7 @@ static int _nldev_get_dumpit(struct ib_device *device,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
0, NLM_F_MULTI);
- if (fill_dev_info(skb, device)) {
+ if (!nlh || fill_dev_info(skb, device)) {
nlmsg_cancel(skb, nlh);
goto out;
}
@@ -1149,8 +1217,8 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 port;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err ||
!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
!tb[RDMA_NLDEV_ATTR_PORT_INDEX])
@@ -1176,6 +1244,10 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
0, 0);
+ if (!nlh) {
+ err = -EMSGSIZE;
+ goto err_free;
+ }
err = fill_port_info(msg, device, port, sock_net(skb->sk));
if (err)
@@ -1205,8 +1277,8 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
int err;
unsigned int p;
- err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1237,7 +1309,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
RDMA_NLDEV_CMD_PORT_GET),
0, NLM_F_MULTI);
- if (fill_port_info(skb, device, p, sock_net(skb->sk))) {
+ if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) {
nlmsg_cancel(skb, nlh);
goto out;
}
@@ -1255,13 +1327,14 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ bool show_details = false;
struct ib_device *device;
struct sk_buff *msg;
u32 index;
int ret;
- ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1270,6 +1343,9 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!device)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+ show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
@@ -1279,8 +1355,12 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
0, 0);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto err_free;
+ }
- ret = fill_res_info(msg, device);
+ ret = fill_res_info(msg, device, show_details);
if (ret)
goto err_free;
@@ -1310,7 +1390,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
0, NLM_F_MULTI);
- if (fill_res_info(skb, device)) {
+ if (!nlh || fill_res_info(skb, device, false)) {
nlmsg_cancel(skb, nlh);
goto out;
}
@@ -1389,10 +1469,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
};
-static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- enum rdma_restrack_type res_type,
- res_fill_func_t fill_func)
+static noinline_for_stack int
+res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
{
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
@@ -1403,8 +1484,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct sk_buff *msg;
int ret;
- ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
return -EINVAL;
@@ -1445,7 +1526,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
RDMA_NL_GET_OP(nlh->nlmsg_type)),
0, 0);
- if (fill_nldev_handle(msg, device)) {
+ if (!nlh || fill_nldev_handle(msg, device)) {
ret = -EMSGSIZE;
goto err_free;
}
@@ -1480,6 +1561,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
struct rdma_restrack_entry *res;
struct rdma_restrack_root *rt;
int err, ret = 0, idx = 0;
+ bool show_details = false;
struct nlattr *table_attr;
struct nlattr *entry_attr;
struct ib_device *device;
@@ -1490,8 +1572,8 @@ static int res_get_common_dumpit(struct sk_buff *skb,
u32 index, port = 0;
bool filled = false;
- err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
/*
* Right now, we are expecting the device index to get res information,
* but it is possible to extend this code to return all devices in
@@ -1508,6 +1590,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
if (!device)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+ show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
/*
* If no PORT_INDEX is supplied, we will return all QPs from that device
*/
@@ -1524,7 +1609,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
RDMA_NL_GET_OP(cb->nlh->nlmsg_type)),
0, NLM_F_MULTI);
- if (fill_nldev_handle(skb, device)) {
+ if (!nlh || fill_nldev_handle(skb, device)) {
ret = -EMSGSIZE;
goto err;
}
@@ -1545,6 +1630,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
* objects.
*/
xa_for_each(&rt->xa, id, res) {
+ if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details)
+ goto next;
+
if (idx < start || !rdma_restrack_get(res))
goto next;
@@ -1629,6 +1717,7 @@ RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR);
RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX);
RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ);
+RES_GET_FUNCS(srq_raw, RDMA_RESTRACK_SRQ);
static LIST_HEAD(link_ops);
static DECLARE_RWSEM(link_ops_rwsem);
@@ -1676,8 +1765,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
char type[IFNAMSIZ];
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
!tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
return -EINVAL;
@@ -1720,8 +1809,8 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1730,7 +1819,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!device)
return -EINVAL;
- if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+ if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) {
ib_device_put(device);
return -EINVAL;
}
@@ -1750,8 +1839,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
- extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+ NL_VALIDATE_LIBERAL, extack);
if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
return -EINVAL;
@@ -1786,6 +1875,10 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
RDMA_NLDEV_CMD_GET_CHARDEV),
0, 0);
+ if (!nlh) {
+ err = -EMSGSIZE;
+ goto out_nlmsg;
+ }
data.nl_msg = msg;
err = ib_get_client_nl_info(ibdev, client_name, &data);
@@ -1830,8 +1923,8 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct sk_buff *msg;
int err;
- err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err)
return err;
@@ -1843,6 +1936,10 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
RDMA_NLDEV_CMD_SYS_GET),
0, 0);
+ if (!nlh) {
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+ }
err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE,
(u8)ib_devices_shared_netns);
@@ -1851,6 +1948,18 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
}
+ err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE,
+ (u8)privileged_qkey);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
/*
* Copy-on-fork is supported.
* See commits:
@@ -1867,45 +1976,167 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
}
+static int nldev_set_sys_set_netns_doit(struct nlattr *tb[])
+{
+ u8 enable;
+ int err;
+
+ enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
+ /* Only 0 and 1 are supported */
+ if (enable > 1)
+ return -EINVAL;
+
+ err = rdma_compatdev_set(enable);
+ return err;
+}
+
+static int nldev_set_sys_set_pqkey_doit(struct nlattr *tb[])
+{
+ u8 enable;
+
+ enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]);
+ /* Only 0 and 1 are supported */
+ if (enable > 1)
+ return -EINVAL;
+
+ privileged_qkey = enable;
+ return 0;
+}
+
static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
- u8 enable;
int err;
err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
nldev_policy, extack);
- if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+ if (err)
return -EINVAL;
- enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
- /* Only 0 and 1 are supported */
- if (enable > 1)
+ if (tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+ return nldev_set_sys_set_netns_doit(tb);
+
+ if (tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE])
+ return nldev_set_sys_set_pqkey_doit(tb);
+
+ return -EINVAL;
+}
+
+static int nldev_stat_set_mode_doit(struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[],
+ struct ib_device *device, u32 port)
+{
+ u32 mode, mask = 0, qpn, cntn = 0;
+ bool opcnt = false;
+ int ret;
+
+	/* Currently only QP counters are supported */
+ if (!tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+ nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
return -EINVAL;
- err = rdma_compatdev_set(enable);
- return err;
+ if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED])
+ opcnt = !!nla_get_u8(
+ tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]);
+
+ mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+ if (mode == RDMA_COUNTER_MODE_AUTO) {
+ if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+ mask = nla_get_u32(
+ tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+ return rdma_counter_set_auto_mode(device, port, mask, opcnt,
+ extack);
+ }
+
+ if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
+ return -EINVAL;
+
+ qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+ if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+ cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+ ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+ if (ret)
+ return ret;
+ } else {
+ ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn);
+ if (ret)
+ return ret;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+ ret = -EMSGSIZE;
+ goto err_fill;
+ }
+
+ return 0;
+
+err_fill:
+ rdma_counter_unbind_qpn(device, port, qpn, cntn);
+ return ret;
+}
+
+static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[],
+ struct ib_device *device,
+ u32 port)
+{
+ struct rdma_hw_stats *stats;
+ struct nlattr *entry_attr;
+ unsigned long *target;
+ int rem, i, ret = 0;
+ u32 index;
+
+ stats = ib_get_hw_stats_port(device, port);
+ if (!stats)
+ return -EINVAL;
+
+ target = kcalloc(BITS_TO_LONGS(stats->num_counters),
+ sizeof(*stats->is_disabled), GFP_KERNEL);
+ if (!target)
+ return -ENOMEM;
+
+ nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS],
+ rem) {
+ index = nla_get_u32(entry_attr);
+ if ((index >= stats->num_counters) ||
+ !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ set_bit(index, target);
+ }
+
+ for (i = 0; i < stats->num_counters; i++) {
+ if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL))
+ continue;
+
+ ret = rdma_counter_modify(device, port, i, test_bit(i, target));
+ if (ret)
+ goto out;
+ }
+
+out:
+ kfree(target);
+ return ret;
}
static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- u32 index, port, mode, mask = 0, qpn, cntn = 0;
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
struct ib_device *device;
struct sk_buff *msg;
+ u32 index, port;
int ret;
- ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
- /* Currently only counter for QP is supported */
- if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
- !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
- !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
- return -EINVAL;
-
- if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+ extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
@@ -1916,59 +2147,49 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
if (!rdma_is_port_valid(device, port)) {
ret = -EINVAL;
- goto err;
+ goto err_put_device;
+ }
+
+ if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] &&
+ !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) {
+ ret = -EINVAL;
+ goto err_put_device;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
- goto err;
+ goto err_put_device;
}
nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
RDMA_NLDEV_CMD_STAT_SET),
0, 0);
+ if (!nlh || fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+ ret = -EMSGSIZE;
+ goto err_free_msg;
+ }
- mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
- if (mode == RDMA_COUNTER_MODE_AUTO) {
- if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
- mask = nla_get_u32(
- tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
- ret = rdma_counter_set_auto_mode(device, port, mask, extack);
- if (ret)
- goto err_msg;
- } else {
- if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
- goto err_msg;
- qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
- if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
- cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
- ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
- } else {
- ret = rdma_counter_bind_qpn_alloc(device, port,
- qpn, &cntn);
- }
+ if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) {
+ ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port);
if (ret)
- goto err_msg;
+ goto err_free_msg;
+ }
- if (fill_nldev_handle(msg, device) ||
- nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
- nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
- nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
- ret = -EMSGSIZE;
- goto err_fill;
- }
+ if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) {
+ ret = nldev_stat_set_counter_dynamic_doit(tb, device, port);
+ if (ret)
+ goto err_free_msg;
}
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
-err_fill:
- rdma_counter_unbind_qpn(device, port, qpn, cntn);
-err_msg:
+err_free_msg:
nlmsg_free(msg);
-err:
+err_put_device:
ib_device_put(device);
return ret;
}
@@ -2013,6 +2234,10 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
RDMA_NLDEV_CMD_STAT_SET),
0, 0);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto err_fill;
+ }
cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
@@ -2039,10 +2264,10 @@ err:
return ret;
}
-static int stat_get_doit_default_counter(struct sk_buff *skb,
- struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[])
{
struct rdma_hw_stats *stats;
struct nlattr *table_attr;
@@ -2083,7 +2308,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb,
RDMA_NLDEV_CMD_STAT_GET),
0, 0);
- if (fill_nldev_handle(msg, device) ||
+ if (!nlh || fill_nldev_handle(msg, device) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
ret = -EMSGSIZE;
goto err_msg;
@@ -2103,9 +2328,13 @@ static int stat_get_doit_default_counter(struct sk_buff *skb,
goto err_stats;
}
for (i = 0; i < num_cnts; i++) {
+ if (test_bit(i, stats->is_disabled))
+ continue;
+
v = stats->value[i] +
rdma_counter_get_hwstat_value(device, port, i);
- if (rdma_nl_stat_hwcounter_entry(msg, stats->names[i], v)) {
+ if (rdma_nl_stat_hwcounter_entry(msg,
+ stats->descs[i].name, v)) {
ret = -EMSGSIZE;
goto err_table;
}
@@ -2128,8 +2357,9 @@ err:
return ret;
}
-static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack, struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, struct nlattr *tb[])
{
static enum rdma_nl_counter_mode mode;
@@ -2137,6 +2367,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
struct ib_device *device;
struct sk_buff *msg;
u32 index, port;
+ bool opcnt;
int ret;
if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
@@ -2167,8 +2398,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
RDMA_NLDEV_CMD_STAT_GET),
0, 0);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
- ret = rdma_counter_get_mode(device, port, &mode, &mask);
+ ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt);
if (ret)
goto err_msg;
@@ -2185,6 +2420,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_msg;
}
+ if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
@@ -2202,8 +2443,8 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
int ret;
- ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret)
return -EINVAL;
@@ -2232,8 +2473,8 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb,
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
int ret;
- ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
return -EINVAL;
@@ -2253,6 +2494,149 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb,
return ret;
}
+static int nldev_stat_get_counter_status_doit(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry;
+ struct rdma_hw_stats *stats;
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 devid, port;
+ int ret, i;
+
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), devid);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ stats = ib_get_hw_stats_port(device, port);
+ if (!stats) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(
+ msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS),
+ 0, 0);
+
+ ret = -EMSGSIZE;
+ if (!nlh || fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
+ goto err_msg;
+
+ table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table)
+ goto err_msg;
+
+ mutex_lock(&stats->lock);
+ for (i = 0; i < stats->num_counters; i++) {
+ entry = nla_nest_start(msg,
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+ if (!entry)
+ goto err_msg_table;
+
+ if (nla_put_string(msg,
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+ stats->descs[i].name) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i))
+ goto err_msg_entry;
+
+ if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) &&
+ (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC,
+ !test_bit(i, stats->is_disabled))))
+ goto err_msg_entry;
+
+ nla_nest_end(msg, entry);
+ }
+ mutex_unlock(&stats->lock);
+
+ nla_nest_end(msg, table);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_msg_entry:
+ nla_nest_cancel(msg, entry);
+err_msg_table:
+ mutex_unlock(&stats->lock);
+ nla_nest_cancel(msg, table);
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ enum rdma_nl_dev_type type;
+ struct ib_device *parent;
+ char name[IFNAMSIZ] = {};
+ u32 parentid;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE])
+ return -EINVAL;
+
+ nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name));
+ type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]);
+ parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ parent = ib_device_get_by_index(sock_net(skb->sk), parentid);
+ if (!parent)
+ return -EINVAL;
+
+ ret = ib_add_sub_device(parent, type, name);
+ ib_device_put(parent);
+
+ return ret;
+}
+
+static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ u32 devid;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), devid);
+ if (!device)
+ return -EINVAL;
+
+ return ib_del_sub_device_and_put(device);
+}
+
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_GET] = {
.doit = nldev_get_doit,
@@ -2314,6 +2698,7 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
},
[RDMA_NLDEV_CMD_SYS_SET] = {
.doit = nldev_set_sys_set_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
},
[RDMA_NLDEV_CMD_STAT_SET] = {
.doit = nldev_stat_set_doit,
@@ -2342,14 +2727,192 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.dump = nldev_res_get_mr_raw_dumpit,
.flags = RDMA_NL_ADMIN_PERM,
},
+ [RDMA_NLDEV_CMD_RES_SRQ_GET_RAW] = {
+ .doit = nldev_res_get_srq_raw_doit,
+ .dump = nldev_res_get_srq_raw_dumpit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_STAT_GET_STATUS] = {
+ .doit = nldev_stat_get_counter_status_doit,
+ },
+ [RDMA_NLDEV_CMD_NEWDEV] = {
+ .doit = nldev_newdev,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_DELDEV] = {
+ .doit = nldev_deldev,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
};
+static int fill_mon_netdev_rename(struct sk_buff *msg,
+ struct ib_device *device, u32 port,
+ const struct net *net)
+{
+ struct net_device *netdev = ib_device_get_netdev(device, port);
+ int ret = 0;
+
+ if (!netdev || !net_eq(dev_net(netdev), net))
+ goto out;
+
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
+ if (ret)
+ goto out;
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name);
+out:
+ dev_put(netdev);
+ return ret;
+}
+
+static int fill_mon_netdev_association(struct sk_buff *msg,
+ struct ib_device *device, u32 port,
+ const struct net *net)
+{
+ struct net_device *netdev = ib_device_get_netdev(device, port);
+ int ret = 0;
+
+ if (netdev && !net_eq(dev_net(netdev), net))
+ goto out;
+
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
+ if (ret)
+ goto out;
+
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+ dev_name(&device->dev));
+ if (ret)
+ goto out;
+
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port);
+ if (ret)
+ goto out;
+
+ if (netdev) {
+ ret = nla_put_u32(msg,
+ RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
+ if (ret)
+ goto out;
+
+ ret = nla_put_string(msg,
+ RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name);
+ }
+
+out:
+ dev_put(netdev);
+ return ret;
+}
+
+static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num,
+ enum rdma_nl_notify_event_type type)
+{
+ struct net_device *netdev;
+
+ switch (type) {
+ case RDMA_REGISTER_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor register device event\n");
+ break;
+ case RDMA_UNREGISTER_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor unregister device event\n");
+ break;
+ case RDMA_NETDEV_ATTACH_EVENT:
+ netdev = ib_device_get_netdev(device, port_num);
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n",
+ port_num, netdev->ifindex);
+ dev_put(netdev);
+ break;
+ case RDMA_NETDEV_DETACH_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor netdev detach event: port %d\n",
+ port_num);
+ break;
+ case RDMA_RENAME_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor rename device event\n");
+ break;
+
+ case RDMA_NETDEV_RENAME_EVENT:
+ netdev = ib_device_get_netdev(device, port_num);
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n",
+ port_num, netdev->ifindex);
+ dev_put(netdev);
+ break;
+ default:
+ break;
+ }
+}
+
+int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
+ enum rdma_nl_notify_event_type type)
+{
+ struct sk_buff *skb;
+ int ret = -EMSGSIZE;
+ struct net *net;
+ void *nlh;
+
+ net = read_pnet(&device->coredev.rdma_net);
+ if (!net)
+ return -EINVAL;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ nlh = nlmsg_put(skb, 0, 0,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR),
+ 0, 0);
+ if (!nlh)
+ goto err_free;
+
+ switch (type) {
+ case RDMA_REGISTER_EVENT:
+ case RDMA_UNREGISTER_EVENT:
+ case RDMA_RENAME_EVENT:
+ ret = fill_nldev_handle(skb, device);
+ if (ret)
+ goto err_free;
+ break;
+ case RDMA_NETDEV_ATTACH_EVENT:
+ case RDMA_NETDEV_DETACH_EVENT:
+ ret = fill_mon_netdev_association(skb, device, port_num, net);
+ if (ret)
+ goto err_free;
+ break;
+ case RDMA_NETDEV_RENAME_EVENT:
+ ret = fill_mon_netdev_rename(skb, device, port_num, net);
+ if (ret)
+ goto err_free;
+ break;
+ default:
+ break;
+ }
+
+ ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(skb, nlh);
+ ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL);
+ if (ret && ret != -ESRCH) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ goto err_free;
+ }
+ return 0;
+
+err_free:
+ rdma_nl_notify_err_msg(device, port_num, type);
+ nlmsg_free(skb);
+ return ret;
+}
+
void __init nldev_init(void)
{
rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
}
-void __exit nldev_exit(void)
+void nldev_exit(void)
{
rdma_nl_unregister(RDMA_NL_NLDEV);
}
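
On the read side, drivers and uverbs consult rdma_nl_get_privileged_qkey(). A
simplified sketch (hypothetical helper; the real uverbs check lives in the
QP-modify path) of how a controlled QKEY request might be validated:

	/* Controlled QKEYs have the high-order bit set; a non-privileged
	 * requester needs the global toggle enabled. */
	static bool qkey_permitted(u32 qkey, bool has_net_raw)
	{
		if (!(qkey & 0x80000000))	/* ordinary QKEY */
			return true;
		return has_net_raw || rdma_nl_get_privileged_qkey();
	}
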
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 94d83b665a2f..18918f463361 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -58,8 +58,8 @@ void uverbs_uobject_put(struct ib_uobject *uobject)
}
EXPORT_SYMBOL(uverbs_uobject_put);
-static int uverbs_try_lock_object(struct ib_uobject *uobj,
- enum rdma_lookup_mode mode)
+int uverbs_try_lock_object(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
{
/*
* When a shared access is required, we use a positive counter. Each
@@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj,
* In exclusive access mode, we check that the counter is zero (nobody
* claimed this object) and we set it to -1. Releasing a shared access
* lock is done simply by decreasing the counter. As for exclusive
- * access locks, since only a single one of them is is allowed
+ * access locks, since only a single one of them is allowed
* concurrently, setting the counter to zero is enough for releasing
* this lock.
*/
@@ -84,6 +84,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj,
}
return 0;
}
+EXPORT_SYMBOL(uverbs_try_lock_object);
static void assert_uverbs_usecnt(struct ib_uobject *uobj,
enum rdma_lookup_mode mode)
@@ -880,9 +881,14 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
enum rdma_remove_reason reason)
{
+ struct uverbs_attr_bundle attrs = { .ufile = ufile };
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ struct ib_device *ib_dev = ucontext->device;
struct ib_uobject *obj, *next_obj;
int ret = -EINVAL;
- struct uverbs_attr_bundle attrs = { .ufile = ufile };
+
+ if (ib_dev->ops.ufile_hw_cleanup)
+ ib_dev->ops.ufile_hw_cleanup(ufile);
/*
* This shouldn't run while executing other commands on this
@@ -1013,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj,
WARN_ON(true);
}
}
+
+/**
+ * rdma_uattrs_has_raw_cap() - Report whether the RDMA device linked to
+ *			       the uverbs attributes file has the
+ *			       CAP_NET_RAW capability
+ *
+ * @attrs: Pointer to uverbs attributes
+ *
+ * Returns true if the RDMA device's owning user namespace has the
+ * CAP_NET_RAW capability, otherwise false.
+ */
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_ucontext *ucontext;
+ bool has_cap = false;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu);
+ ucontext = ib_uverbs_get_ucontext_file(ufile);
+ if (IS_ERR(ucontext))
+ goto out;
+ has_cap = rdma_dev_has_raw_cap(ucontext->device);
+
+out:
+ srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key);
+ return has_cap;
+}
+EXPORT_SYMBOL(rdma_uattrs_has_raw_cap);
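
A hypothetical driver-side caller, sketching the intended use of the newly
exported helper to gate raw flow-steering commands on the device's owning user
namespace rather than the caller's credentials:

	/* Sketch only; example_create_steering_rule() is not a real API. */
	static int example_create_steering_rule(struct uverbs_attr_bundle *attrs)
	{
		if (!rdma_uattrs_has_raw_cap(attrs))
			return -EPERM;
		/* ... program the rule ... */
		return 0;
	}
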
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 33706dad6c0f..a59b087611cb 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
extern const struct uapi_definition uverbs_def_obj_cq[];
extern const struct uapi_definition uverbs_def_obj_device[];
extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_dmah[];
extern const struct uapi_definition uverbs_def_obj_flow_action[];
extern const struct uapi_definition uverbs_def_obj_intf[];
extern const struct uapi_definition uverbs_def_obj_mr[];
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 033207882c82..b097cfcade1c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -37,22 +37,6 @@ int rdma_restrack_init(struct ib_device *dev)
return 0;
}
-static const char *type2str(enum rdma_restrack_type type)
-{
- static const char * const names[RDMA_RESTRACK_MAX] = {
- [RDMA_RESTRACK_PD] = "PD",
- [RDMA_RESTRACK_CQ] = "CQ",
- [RDMA_RESTRACK_QP] = "QP",
- [RDMA_RESTRACK_CM_ID] = "CM_ID",
- [RDMA_RESTRACK_MR] = "MR",
- [RDMA_RESTRACK_CTX] = "CTX",
- [RDMA_RESTRACK_COUNTER] = "COUNTER",
- [RDMA_RESTRACK_SRQ] = "SRQ",
- };
-
- return names[type];
-};
-
/**
* rdma_restrack_clean() - clean resource tracking
* @dev: IB device
@@ -60,47 +44,14 @@ static const char *type2str(enum rdma_restrack_type type)
void rdma_restrack_clean(struct ib_device *dev)
{
struct rdma_restrack_root *rt = dev->res;
- struct rdma_restrack_entry *e;
- char buf[TASK_COMM_LEN];
- bool found = false;
- const char *owner;
int i;
for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
struct xarray *xa = &dev->res[i].xa;
- if (!xa_empty(xa)) {
- unsigned long index;
-
- if (!found) {
- pr_err("restrack: %s", CUT_HERE);
- dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
- }
- xa_for_each(xa, index, e) {
- if (rdma_is_kernel_res(e)) {
- owner = e->kern_name;
- } else {
- /*
- * There is no need to call get_task_struct here,
- * because we can be here only if there are more
- * get_task_struct() call than put_task_struct().
- */
- get_task_comm(buf, e->task);
- owner = buf;
- }
-
- pr_err("restrack: %s %s object allocated by %s is not freed\n",
- rdma_is_kernel_res(e) ? "Kernel" :
- "User",
- type2str(e->type), owner);
- }
- found = true;
- }
+ WARN_ON(!xa_empty(xa));
xa_destroy(xa);
}
- if (found)
- pr_err("restrack: %s", CUT_HERE);
-
kfree(rt);
}
@@ -108,8 +59,10 @@ void rdma_restrack_clean(struct ib_device *dev)
* rdma_restrack_count() - the current usage of specific object
* @dev: IB device
* @type: actual type of object to operate
+ * @show_details: also count driver-specific objects
*/
-int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
+ bool show_details)
{
struct rdma_restrack_root *rt = &dev->res[type];
struct rdma_restrack_entry *e;
@@ -117,8 +70,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
u32 cnt = 0;
xa_lock(&rt->xa);
- xas_for_each(&xas, e, U32_MAX)
+ xas_for_each(&xas, e, U32_MAX) {
+ if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details)
+ continue;
cnt++;
+ }
xa_unlock(&rt->xa);
return cnt;
}
@@ -144,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
return container_of(res, struct rdma_counter, res)->device;
case RDMA_RESTRACK_SRQ:
return container_of(res, struct ib_srq, res)->device;
+ case RDMA_RESTRACK_DMAH:
+ return container_of(res, struct ib_dmah, res)->device;
default:
WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
return NULL;
@@ -217,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res,
EXPORT_SYMBOL(rdma_restrack_new);
/**
- * rdma_restrack_add() - add object to the reource tracking database
+ * rdma_restrack_add() - add object to the resource tracking database
* @res: resource entry
*/
void rdma_restrack_add(struct rdma_restrack_entry *res)
@@ -247,6 +205,9 @@ void rdma_restrack_add(struct rdma_restrack_entry *res)
ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL);
if (ret)
res->id = 0;
+
+ if (qp->qp_type >= IB_QPT_DRIVER)
+ xa_set_mark(&rt->xa, res->id, RESTRACK_DD);
} else if (res->type == RDMA_RESTRACK_COUNTER) {
/* Special case to ensure that cntn points to right counter */
struct rdma_counter *counter;
@@ -316,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res)
EXPORT_SYMBOL(rdma_restrack_put);
/**
- * rdma_restrack_del() - delete object from the reource tracking database
+ * rdma_restrack_del() - delete object from the resource tracking database
* @res: resource entry
*/
void rdma_restrack_del(struct rdma_restrack_entry *res)
@@ -343,8 +304,6 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
rt = &dev->res[res->type];
old = xa_erase(&rt->xa, res->id);
- if (res->type == RDMA_RESTRACK_MR || res->type == RDMA_RESTRACK_QP)
- return;
WARN_ON(old != res);
out:
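
A short sketch of the new counting semantics (my_dump_counts() is hypothetical): QPs of type IB_QPT_DRIVER and above are marked RESTRACK_DD at insert time, so they are only counted when the caller opts into details:

static void my_dump_counts(struct ib_device *dev)
{
	int visible = rdma_restrack_count(dev, RDMA_RESTRACK_QP, false);
	int all = rdma_restrack_count(dev, RDMA_RESTRACK_QP, true);

	ibdev_dbg(dev, "QPs: %d visible, %d including driver-specific\n",
		  visible, all);
}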
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 68197e576433..a9f2c6b1b29e 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -250,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
/**
* is_upper_ndev_bond_master_filter - Check if a given netdevice
- * is bond master device of netdevice of the the RDMA device of port.
+ * is bond master device of netdevice of the RDMA device of port.
* @ib_dev: IB device to check
* @port: Port to consider for adding default GID
* @rdma_ndev: Pointer to rdma netdevice
@@ -515,6 +515,27 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev)
}
EXPORT_SYMBOL(rdma_roce_rescan_device);
+/**
+ * rdma_roce_rescan_port - Rescan all of the network devices in the system
+ * and add their GIDs if they are relevant to the given port of the RoCE
+ * device.
+ *
+ * @ib_dev: IB device
+ * @port: Port number
+ */
+void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port)
+{
+ struct net_device *ndev = NULL;
+
+ if (rdma_protocol_roce(ib_dev, port)) {
+ ndev = ib_device_get_netdev(ib_dev, port);
+ if (!ndev)
+ return;
+ enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev);
+ dev_put(ndev);
+ }
+}
+EXPORT_SYMBOL(rdma_roce_rescan_port);
+
static void callback_for_addr_gid_device_scan(struct ib_device *device,
u32 port,
struct net_device *rdma_ndev,
@@ -575,16 +596,17 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u32 port,
}
}
-static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
- struct net_device *event_ndev)
+void roce_del_all_netdev_gids(struct ib_device *ib_dev,
+ u32 port, struct net_device *ndev)
{
- ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev);
}
+EXPORT_SYMBOL(roce_del_all_netdev_gids);
static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
struct net_device *rdma_ndev, void *cookie)
{
- handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+ handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids);
}
static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
@@ -601,8 +623,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
rcu_read_lock();
master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
- if (master_ndev)
- dev_hold(master_ndev);
+ dev_hold(master_ndev);
rcu_read_unlock();
if (master_ndev) {
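
A hedged sketch of how the two newly exported helpers pair up: a driver that re-binds a port to a different netdevice (hypothetical my_rebind_port()) drops the stale GIDs and repopulates them from the current system addresses:

static void my_rebind_port(struct ib_device *ibdev, u32 port,
			   struct net_device *old_ndev)
{
	if (old_ndev)
		roce_del_all_netdev_gids(ibdev, port, old_ndev);
	/* repopulate default and IP-based GIDs for the RoCE port */
	rdma_roce_rescan_port(ibdev, port);
}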
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 5221cce65675..6354ddf2a274 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2016 HGST, a Western Digital Company.
*/
+#include <linux/memremap.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
#include <linux/pci-p2pdma.h>
@@ -273,26 +274,6 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return 1;
}
-static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
- u32 sg_cnt, enum dma_data_direction dir)
-{
- if (is_pci_p2pdma_page(sg_page(sg)))
- pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir);
- else
- ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
-}
-
-static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg,
- u32 sg_cnt, enum dma_data_direction dir)
-{
- if (is_pci_p2pdma_page(sg_page(sg))) {
- if (WARN_ON_ONCE(ib_uses_virt_dma(dev)))
- return 0;
- return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
- }
- return ib_dma_map_sg(dev, sg, sg_cnt, dir);
-}
-
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@@ -313,12 +294,16 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
+ struct sg_table sgt = {
+ .sgl = sg,
+ .orig_nents = sg_cnt,
+ };
int ret;
- ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
- if (!ret)
- return -ENOMEM;
- sg_cnt = ret;
+ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
+ if (ret)
+ return ret;
+ sg_cnt = sgt.nents;
/*
* Skip to the S/G entry that sg_offset falls into:
@@ -354,7 +339,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
return ret;
out_unmap_sg:
- rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
+ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
return ret;
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
@@ -385,6 +370,14 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct ib_device *dev = qp->pd->device;
u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
qp->integrity_en);
+ struct sg_table sgt = {
+ .sgl = sg,
+ .orig_nents = sg_cnt,
+ };
+ struct sg_table prot_sgt = {
+ .sgl = prot_sg,
+ .orig_nents = prot_sg_cnt,
+ };
struct ib_rdma_wr *rdma_wr;
int count = 0, ret;
@@ -394,18 +387,14 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return -EINVAL;
}
- ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
- if (!ret)
- return -ENOMEM;
- sg_cnt = ret;
+ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
+ if (ret)
+ return ret;
if (prot_sg_cnt) {
- ret = rdma_rw_map_sg(dev, prot_sg, prot_sg_cnt, dir);
- if (!ret) {
- ret = -ENOMEM;
+ ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0);
+ if (ret)
goto out_unmap_sg;
- }
- prot_sg_cnt = ret;
}
ctx->type = RDMA_RW_SIG_MR;
@@ -426,10 +415,11 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
- ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
- prot_sg_cnt, NULL, SZ_4K);
+ ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg,
+ prot_sgt.nents, NULL, SZ_4K);
if (unlikely(ret)) {
- pr_err("failed to map PI sg (%u)\n", sg_cnt + prot_sg_cnt);
+ pr_err("failed to map PI sg (%u)\n",
+ sgt.nents + prot_sgt.nents);
goto out_destroy_sig_mr;
}
@@ -468,10 +458,10 @@ out_destroy_sig_mr:
out_free_ctx:
kfree(ctx->reg);
out_unmap_prot_sg:
- if (prot_sg_cnt)
- rdma_rw_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+ if (prot_sgt.nents)
+ ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0);
out_unmap_sg:
- rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
+ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
return ret;
}
EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
@@ -604,7 +594,7 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
break;
}
- rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);
@@ -632,8 +622,8 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
kfree(ctx->reg);
if (prot_sg_cnt)
- rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
- rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+ ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
@@ -676,7 +666,7 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
factor = 1;
/*
- * If the devices needs MRs to perform RDMA READ or WRITE operations,
+ * If the device needs MRs to perform RDMA READ or WRITE operations,
* we'll need two additional MRs for the registrations and the
* invalidation.
*/
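
The conversion above leans on sg_table bookkeeping: orig_nents is what was handed to the mapper, nents is what the DMA layer produced (P2PDMA pages included), and unmap takes the same table. A minimal sketch of the pairing, with a hypothetical my_map_and_post():

static int my_map_and_post(struct ib_device *dev, struct scatterlist *sg,
			   u32 sg_cnt, enum dma_data_direction dir)
{
	struct sg_table sgt = {
		.sgl = sg,
		.orig_nents = sg_cnt,
	};
	int ret;

	ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
	if (ret)
		return ret;

	/* ... build and post WRs over the sgt.nents mapped entries ... */

	ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
	return 0;
}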
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index b61576f702b8..c23e9c847314 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -32,7 +32,6 @@
* SOFTWARE.
*/
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/random.h>
@@ -51,6 +50,7 @@
#include <rdma/ib_marshall.h>
#include <rdma/ib_addr.h>
#include <rdma/opa_addr.h>
+#include <rdma/rdma_cm.h>
#include "sa.h"
#include "core_priv.h"
@@ -105,7 +105,10 @@ struct ib_sa_device {
};
struct ib_sa_query {
- void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ void (*callback)(struct ib_sa_query *sa_query, int status,
+ struct ib_sa_mad *mad);
+ void (*rmpp_callback)(struct ib_sa_query *sa_query, int status,
+ struct ib_mad_recv_wc *mad);
void (*release)(struct ib_sa_query *);
struct ib_sa_client *client;
struct ib_sa_port *port;
@@ -123,14 +126,9 @@ struct ib_sa_query {
#define IB_SA_CANCEL 0x00000002
#define IB_SA_QUERY_OPA 0x00000004
-struct ib_sa_service_query {
- void (*callback)(int, struct ib_sa_service_rec *, void *);
- void *context;
- struct ib_sa_query sa_query;
-};
-
struct ib_sa_path_query {
- void (*callback)(int, struct sa_path_rec *, void *);
+ void (*callback)(int status, struct sa_path_rec *rec,
+ unsigned int num_paths, void *context);
void *context;
struct ib_sa_query sa_query;
struct sa_path_rec *conv_pr;
@@ -154,6 +152,13 @@ struct ib_sa_mcmember_query {
struct ib_sa_query sa_query;
};
+struct ib_sa_service_query {
+ void (*callback)(int status, struct sa_service_rec *rec,
+ unsigned int num_services, void *context);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
static LIST_HEAD(ib_nl_request_list);
static DEFINE_SPINLOCK(ib_nl_request_lock);
static atomic_t ib_nl_sa_request_seq;
@@ -502,54 +507,6 @@ static const struct ib_field mcmember_rec_table[] = {
.size_bits = 23 },
};
-#define SERVICE_REC_FIELD(field) \
- .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \
- .struct_size_bytes = sizeof_field(struct ib_sa_service_rec, field), \
- .field_name = "sa_service_rec:" #field
-
-static const struct ib_field service_rec_table[] = {
- { SERVICE_REC_FIELD(id),
- .offset_words = 0,
- .offset_bits = 0,
- .size_bits = 64 },
- { SERVICE_REC_FIELD(gid),
- .offset_words = 2,
- .offset_bits = 0,
- .size_bits = 128 },
- { SERVICE_REC_FIELD(pkey),
- .offset_words = 6,
- .offset_bits = 0,
- .size_bits = 16 },
- { SERVICE_REC_FIELD(lease),
- .offset_words = 7,
- .offset_bits = 0,
- .size_bits = 32 },
- { SERVICE_REC_FIELD(key),
- .offset_words = 8,
- .offset_bits = 0,
- .size_bits = 128 },
- { SERVICE_REC_FIELD(name),
- .offset_words = 12,
- .offset_bits = 0,
- .size_bits = 64*8 },
- { SERVICE_REC_FIELD(data8),
- .offset_words = 28,
- .offset_bits = 0,
- .size_bits = 16*8 },
- { SERVICE_REC_FIELD(data16),
- .offset_words = 32,
- .offset_bits = 0,
- .size_bits = 8*16 },
- { SERVICE_REC_FIELD(data32),
- .offset_words = 36,
- .offset_bits = 0,
- .size_bits = 4*32 },
- { SERVICE_REC_FIELD(data64),
- .offset_words = 40,
- .offset_bits = 0,
- .size_bits = 2*64 },
-};
-
#define CLASSPORTINFO_REC_FIELD(field) \
.struct_offset_bytes = offsetof(struct ib_class_port_info, field), \
.struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \
@@ -736,6 +693,60 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct sa_service_rec, field), \
+ .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \
+ .field_name = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 6,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 512 },
+ { SERVICE_REC_FIELD(data_8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 128 },
+};
+
+#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3
+
static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
{
query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
@@ -760,13 +771,14 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
/* Construct the family header first */
header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
- memcpy(header->device_name, dev_name(&query->port->agent->device->dev),
- LS_DEVICE_NAME_MAX);
+ strscpy_pad(header->device_name,
+ dev_name(&query->port->agent->device->dev),
+ LS_DEVICE_NAME_MAX);
header->port_num = query->port->port_num;
if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
sa_rec->reversible != 0)
- query->path_use = LS_RESOLVE_PATH_USE_GMP;
+ query->path_use = LS_RESOLVE_PATH_USE_ALL;
else
query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
header->path_use = query->path_use;
@@ -919,50 +931,77 @@ static void send_handler(struct ib_mad_agent *agent,
static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
const struct nlmsghdr *nlh)
{
+ struct sa_path_rec recs[RDMA_PRIMARY_PATH_MAX_REC_NUM];
+ struct ib_sa_path_query *path_query;
+ struct ib_path_rec_data *rec_data;
struct ib_mad_send_wc mad_send_wc;
- struct ib_sa_mad *mad = NULL;
const struct nlattr *head, *curr;
- struct ib_path_rec_data *rec;
- int len, rem;
+ struct ib_sa_mad *mad = NULL;
+ int len, rem, status = -EIO;
+ unsigned int num_prs = 0;
u32 mask = 0;
- int status = -EIO;
-
- if (query->callback) {
- head = (const struct nlattr *) nlmsg_data(nlh);
- len = nlmsg_len(nlh);
- switch (query->path_use) {
- case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
- mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
- break;
- case LS_RESOLVE_PATH_USE_ALL:
- case LS_RESOLVE_PATH_USE_GMP:
- default:
- mask = IB_PATH_PRIMARY | IB_PATH_GMP |
- IB_PATH_BIDIRECTIONAL;
- break;
- }
- nla_for_each_attr(curr, head, len, rem) {
- if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
- rec = nla_data(curr);
- /*
- * Get the first one. In the future, we may
- * need to get up to 6 pathrecords.
- */
- if ((rec->flags & mask) == mask) {
- mad = query->mad_buf->mad;
- mad->mad_hdr.method |=
- IB_MGMT_METHOD_RESP;
- memcpy(mad->data, rec->path_rec,
- sizeof(rec->path_rec));
- status = 0;
- break;
- }
- }
+ if (!query->callback)
+ goto out;
+
+ path_query = container_of(query, struct ib_sa_path_query, sa_query);
+ mad = query->mad_buf->mad;
+
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ mask = IB_PATH_PRIMARY;
+ break;
+
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
+ continue;
+
+ rec_data = nla_data(curr);
+ if ((rec_data->flags & mask) != mask)
+ continue;
+
+ if ((query->flags & IB_SA_QUERY_OPA) ||
+ path_query->conv_pr) {
+ mad->mad_hdr.method |= IB_MGMT_METHOD_RESP;
+ memcpy(mad->data, rec_data->path_rec,
+ sizeof(rec_data->path_rec));
+ query->callback(query, 0, mad);
+ goto out;
}
- query->callback(query, status, mad);
+
+ status = 0;
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ rec_data->path_rec, &recs[num_prs]);
+ recs[num_prs].flags = rec_data->flags;
+ recs[num_prs].rec_type = SA_PATH_REC_TYPE_IB;
+ sa_path_set_dmac_zero(&recs[num_prs]);
+
+ num_prs++;
+ if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
+ break;
}
+ if (!status) {
+ mad->mad_hdr.method |= IB_MGMT_METHOD_RESP;
+ path_query->callback(status, recs, num_prs,
+ path_query->context);
+	} else {
+		query->callback(query, status, mad);
+	}
+
+out:
mad_send_wc.send_buf = query->mad_buf;
mad_send_wc.status = IB_WC_SUCCESS;
send_handler(query->mad_buf->mad_agent, &mad_send_wc);
@@ -1035,6 +1074,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+
delta = timeout - sa_local_svc_timeout_ms;
if (delta < 0)
abs_delta = -delta;
@@ -1042,7 +1083,6 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
abs_delta = delta;
if (delta != 0) {
- spin_lock_irqsave(&ib_nl_request_lock, flags);
sa_local_svc_timeout_ms = timeout;
list_for_each_entry(query, &ib_nl_request_list, list) {
if (delta < 0 && abs_delta > query->timeout)
@@ -1060,9 +1100,10 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
if (delay)
mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
(unsigned long)delay);
- spin_unlock_irqrestore(&ib_nl_request_lock, flags);
}
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
settimeout_out:
return 0;
}
@@ -1088,10 +1129,9 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb,
struct netlink_ext_ack *extack)
{
unsigned long flags;
- struct ib_sa_query *query;
+ struct ib_sa_query *query = NULL, *iter;
struct ib_mad_send_buf *send_buf;
struct ib_mad_send_wc mad_send_wc;
- int found = 0;
int ret;
if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
@@ -1099,20 +1139,21 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb,
return -EPERM;
spin_lock_irqsave(&ib_nl_request_lock, flags);
- list_for_each_entry(query, &ib_nl_request_list, list) {
+ list_for_each_entry(iter, &ib_nl_request_list, list) {
/*
* If the query is cancelled, let the timeout routine
* take care of it.
*/
- if (nlh->nlmsg_seq == query->seq) {
- found = !ib_sa_query_cancelled(query);
- if (found)
- list_del(&query->list);
+ if (nlh->nlmsg_seq == iter->seq) {
+ if (!ib_sa_query_cancelled(iter)) {
+ list_del(&iter->list);
+ query = iter;
+ }
break;
}
}
- if (!found) {
+ if (!query) {
spin_unlock_irqrestore(&ib_nl_request_lock, flags);
goto resp_out;
}
@@ -1358,6 +1399,7 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
{
unsigned long flags;
int ret, id;
+ const int nmbr_sa_query_retries = 10;
xa_lock_irqsave(&queries, flags);
ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask);
@@ -1365,7 +1407,13 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
if (ret < 0)
return ret;
- query->mad_buf->timeout_ms = timeout_ms;
+ query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries;
+ query->mad_buf->retries = nmbr_sa_query_retries;
+ if (!query->mad_buf->timeout_ms) {
+ /* Special case, very small timeout_ms */
+ query->mad_buf->timeout_ms = 1;
+ query->mad_buf->retries = timeout_ms;
+ }
query->mad_buf->context[0] = query;
query->id = id;
@@ -1405,6 +1453,20 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute)
}
EXPORT_SYMBOL(ib_sa_pack_path);
+void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute)
+{
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec,
+ attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_service);
+
+void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec)
+{
+ ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute,
+ rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_service);
+
static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
struct ib_sa_device *sa_dev,
u32 port_num)
@@ -1435,7 +1497,7 @@ enum opa_pr_supported {
/*
* opa_pr_query_possible - Check if current PR query can be an OPA query.
*
- * Retuns PR_NOT_SUPPORTED if a path record query is not
+ * Returns PR_NOT_SUPPORTED if a path record query is not
* possible, PR_OPA_SUPPORTED if an OPA path record query
* is possible and PR_IB_SUPPORTED if an IB path record
* query is possible.
@@ -1459,40 +1521,101 @@ static int opa_pr_query_possible(struct ib_sa_client *client,
}
static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
+ int status, struct ib_sa_mad *mad)
{
struct ib_sa_path_query *query =
container_of(sa_query, struct ib_sa_path_query, sa_query);
+ struct sa_path_rec rec = {};
- if (mad) {
- struct sa_path_rec rec;
+ if (!mad) {
+ query->callback(status, NULL, 0, query->context);
+ return;
+ }
- if (sa_query->flags & IB_SA_QUERY_OPA) {
- ib_unpack(opa_path_rec_table,
- ARRAY_SIZE(opa_path_rec_table),
- mad->data, &rec);
- rec.rec_type = SA_PATH_REC_TYPE_OPA;
- query->callback(status, &rec, query->context);
- } else {
- ib_unpack(path_rec_table,
- ARRAY_SIZE(path_rec_table),
- mad->data, &rec);
- rec.rec_type = SA_PATH_REC_TYPE_IB;
- sa_path_set_dmac_zero(&rec);
+ if (sa_query->flags & IB_SA_QUERY_OPA) {
+ ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_OPA;
+ query->callback(status, &rec, 1, query->context);
+ return;
+ }
+
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_IB;
+ sa_path_set_dmac_zero(&rec);
+
+ if (query->conv_pr) {
+ struct sa_path_rec opa;
+
+ memset(&opa, 0, sizeof(struct sa_path_rec));
+ sa_convert_path_ib_to_opa(&opa, &rec);
+ query->callback(status, &opa, 1, query->context);
+ } else {
+ query->callback(status, &rec, 1, query->context);
+ }
+}
- if (query->conv_pr) {
- struct sa_path_rec opa;
+#define IB_SA_DATA_OFFS 56
+#define IB_SERVICE_REC_SZ 176
- memset(&opa, 0, sizeof(struct sa_path_rec));
- sa_convert_path_ib_to_opa(&opa, &rec);
- query->callback(status, &opa, query->context);
- } else {
- query->callback(status, &rec, query->context);
+static void ib_unpack_service_rmpp(struct sa_service_rec *rec,
+ struct ib_mad_recv_wc *mad_wc,
+ int num_services)
+{
+ unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0;
+ struct ib_mad_recv_buf *mad_buf;
+ u8 buf[IB_SERVICE_REC_SZ];
+ u8 *data;
+
+	data_size = sizeof_field(struct ib_sa_mad, data);
+
+ list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) {
+ data = ((struct ib_sa_mad *) mad_buf->mad)->data;
+ data_i = 0;
+ while (data_i < data_size && rec_i < num_services) {
+ cp_sz = min(IB_SERVICE_REC_SZ - buf_i,
+ data_size - data_i);
+ memcpy(buf + buf_i, data + data_i, cp_sz);
+ data_i += cp_sz;
+ buf_i += cp_sz;
+ if (buf_i == IB_SERVICE_REC_SZ) {
+ ib_sa_unpack_service(buf, rec + rec_i);
+ buf_i = 0;
+ rec_i++;
}
}
- } else
- query->callback(status, NULL, query->context);
+ }
+}
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status,
+ struct ib_mad_recv_wc *mad_wc)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+ struct sa_service_rec *rec;
+ int num_services;
+
+ if (!mad_wc || !mad_wc->recv_buf.mad) {
+ query->callback(status, NULL, 0, query->context);
+ return;
+ }
+
+ num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ;
+ if (!num_services) {
+ query->callback(-ENODATA, NULL, 0, query->context);
+ return;
+ }
+
+ rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL);
+ if (!rec) {
+ query->callback(-ENOMEM, NULL, 0, query->context);
+ return;
+ }
+
+ ib_unpack_service_rmpp(rec, mad_wc, num_services);
+ query->callback(status, rec, num_services, query->context);
+ kfree(rec);
}
static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
@@ -1504,6 +1627,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
kfree(query);
}
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+ kfree(query);
+}
+
/**
* ib_sa_path_rec_get - Start a Path get query
* @client:SA client
@@ -1536,7 +1667,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
unsigned long timeout_ms, gfp_t gfp_mask,
void (*callback)(int status,
struct sa_path_rec *resp,
- void *context),
+ unsigned int num_paths, void *context),
void *context,
struct ib_sa_query **sa_query)
{
@@ -1634,89 +1765,61 @@ err1:
}
EXPORT_SYMBOL(ib_sa_path_rec_get);
-static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
-{
- struct ib_sa_service_query *query =
- container_of(sa_query, struct ib_sa_service_query, sa_query);
-
- if (mad) {
- struct ib_sa_service_rec rec;
-
- ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
- mad->data, &rec);
- query->callback(status, &rec, query->context);
- } else
- query->callback(status, NULL, query->context);
-}
-
-static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
-{
- kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
-}
-
/**
- * ib_sa_service_rec_query - Start Service Record operation
- * @client:SA client
- * @device:device to send request on
- * @port_num: port number to send request on
- * @method:SA method - should be get, set, or delete
- * @rec:Service Record to send in request
- * @comp_mask:component mask to send in request
- * @timeout_ms:time to wait for response
- * @gfp_mask:GFP mask to use for internal allocations
- * @callback:function called when request completes, times out or is
+ * ib_sa_service_rec_get - Start a Service get query
+ * @client: SA client
+ * @device: device to send query on
+ * @port_num: port number to send query on
+ * @rec: Service Record to send in query
+ * @comp_mask: component mask to send in query
+ * @timeout_ms: time to wait for response
+ * @gfp_mask: GFP mask to use for internal allocations
+ * @callback: function called when query completes, times out or is
* canceled
- * @context:opaque user context passed to callback
- * @sa_query:request context, used to cancel request
+ * @context: opaque user context passed to callback
+ * @sa_query: query context, used to cancel query
*
- * Send a Service Record set/get/delete to the SA to register,
- * unregister or query a service record.
- * The callback function will be called when the request completes (or
+ * Send a Service Record Get query to the SA to look up service records. The
+ * callback function will be called when the query completes (or
* fails); status is 0 for a successful response, -EINTR if the query
* is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
* occurred sending the query. The resp parameter of the callback is
* only valid if status is 0.
*
- * If the return value of ib_sa_service_rec_query() is negative, it is an
- * error code. Otherwise it is a request ID that can be used to cancel
+ * If the return value of ib_sa_service_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
* the query.
*/
-int ib_sa_service_rec_query(struct ib_sa_client *client,
- struct ib_device *device, u32 port_num, u8 method,
- struct ib_sa_service_rec *rec,
- ib_sa_comp_mask comp_mask,
- unsigned long timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_service_rec *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query)
+int ib_sa_service_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct sa_service_rec *resp,
+ unsigned int num_services,
+ void *context),
+ void *context, struct ib_sa_query **sa_query)
{
- struct ib_sa_service_query *query;
struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
- struct ib_sa_port *port;
+ struct ib_sa_service_query *query;
struct ib_mad_agent *agent;
+ struct ib_sa_port *port;
struct ib_sa_mad *mad;
int ret;
if (!sa_dev)
return -ENODEV;
- port = &sa_dev->port[port_num - sa_dev->start_port];
+ port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- if (method != IB_MGMT_METHOD_GET &&
- method != IB_MGMT_METHOD_SET &&
- method != IB_SA_METHOD_DELETE)
- return -EINVAL;
-
query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
- query->sa_query.port = port;
+ query->sa_query.port = port;
+
ret = alloc_mad(&query->sa_query, gfp_mask);
if (ret)
goto err1;
@@ -1729,16 +1832,17 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
mad = query->sa_query.mad_buf->mad;
init_mad(&query->sa_query, agent);
- query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
- query->sa_query.release = ib_sa_service_rec_release;
- mad->mad_hdr.method = method;
- mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
- mad->sa_hdr.comp_mask = comp_mask;
+ query->sa_query.rmpp_callback = callback ? ib_sa_service_rec_callback :
+ NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
- ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
- rec, mad->data);
+ ib_sa_pack_service(rec, mad->data);
*sa_query = &query->sa_query;
+ query->sa_query.mad_buf->context[1] = rec;
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
@@ -1750,16 +1854,14 @@ err2:
*sa_query = NULL;
ib_sa_client_put(query->sa_query.client);
free_mad(&query->sa_query);
-
err1:
kfree(query);
return ret;
}
-EXPORT_SYMBOL(ib_sa_service_rec_query);
+EXPORT_SYMBOL(ib_sa_service_rec_get);
static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
+ int status, struct ib_sa_mad *mad)
{
struct ib_sa_mcmember_query *query =
container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
@@ -1850,8 +1952,7 @@ err1:
/* Support GuidInfoRecord */
static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
+ int status, struct ib_sa_mad *mad)
{
struct ib_sa_guidinfo_query *query =
container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
@@ -1960,8 +2061,7 @@ static void ib_classportinfo_cb(void *context)
}
static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
+ int status, struct ib_sa_mad *mad)
{
unsigned long flags;
struct ib_sa_classport_info_query *query =
@@ -2129,23 +2229,29 @@ static void send_handler(struct ib_mad_agent *agent,
{
struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
unsigned long flags;
+ int status = 0;
- if (query->callback)
+ if (query->callback || query->rmpp_callback) {
switch (mad_send_wc->status) {
case IB_WC_SUCCESS:
/* No callback -- already got recv */
break;
case IB_WC_RESP_TIMEOUT_ERR:
- query->callback(query, -ETIMEDOUT, NULL);
+ status = -ETIMEDOUT;
break;
case IB_WC_WR_FLUSH_ERR:
- query->callback(query, -EINTR, NULL);
+ status = -EINTR;
break;
default:
- query->callback(query, -EIO, NULL);
+ status = -EIO;
break;
}
+		if (status) {
+			if (query->callback)
+				query->callback(query, status, NULL);
+			else
+				query->rmpp_callback(query, status, NULL);
+		}
+ }
+
xa_lock_irqsave(&queries, flags);
__xa_erase(&queries, query->id);
xa_unlock_irqrestore(&queries, flags);
@@ -2161,17 +2267,25 @@ static void recv_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_sa_query *query;
+ struct ib_mad *mad;
+
if (!send_buf)
return;
query = send_buf->context[0];
- if (query->callback) {
+ mad = mad_recv_wc->recv_buf.mad;
+
+ if (query->rmpp_callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->rmpp_callback(query, mad->mad_hdr.status ?
+ -EINVAL : 0, mad_recv_wc);
+ else
+ query->rmpp_callback(query, -EIO, NULL);
+ } else if (query->callback) {
if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
- query->callback(query,
- mad_recv_wc->recv_buf.mad->mad_hdr.status ?
- -EINVAL : 0,
- (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ query->callback(query, mad->mad_hdr.status ?
+ -EINVAL : 0, (struct ib_sa_mad *)mad);
else
query->callback(query, -EIO, NULL);
}
@@ -2301,7 +2415,9 @@ static int ib_sa_add_one(struct ib_device *device)
s = rdma_start_port(device);
e = rdma_end_port(device);
- sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL);
+ sa_dev = kzalloc(struct_size(sa_dev, port,
+ size_add(size_sub(e, s), 1)),
+ GFP_KERNEL);
if (!sa_dev)
return -ENOMEM;
@@ -2321,8 +2437,9 @@ static int ib_sa_add_one(struct ib_device *device)
sa_dev->port[i].agent =
ib_register_mad_agent(device, i + s, IB_QPT_GSI,
- NULL, 0, send_handler,
- recv_handler, sa_dev, 0);
+ NULL, IB_MGMT_RMPP_VERSION,
+ send_handler, recv_handler,
+ sa_dev, 0);
if (IS_ERR(sa_dev->port[i].agent)) {
ret = PTR_ERR(sa_dev->port[i].agent);
goto err;
@@ -2431,7 +2548,6 @@ err1:
void ib_sa_cleanup(void)
{
cancel_delayed_work(&ib_nl_timed_work);
- flush_workqueue(ib_nl_wq);
destroy_workqueue(ib_nl_wq);
mcast_cleanup();
ib_unregister_client(&sa_client);
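
A hedged caller sketch for the reworked service-record API; my_sa_client, my_service_cb and the 3000 ms budget are illustrative only (send_mad() above splits that budget across up to 10 retries), and the records handed to the callback are freed once it returns:

static struct ib_sa_client my_sa_client; /* ib_sa_register_client() at init */

static void my_service_cb(int status, struct sa_service_rec *resp,
			  unsigned int num_services, void *context)
{
	if (status)
		return;
	/* resp[0..num_services - 1] is valid only inside this callback */
}

static int my_lookup(struct ib_device *dev, u32 port_num,
		     struct sa_service_rec *rec, ib_sa_comp_mask comp_mask,
		     struct ib_sa_query **query)
{
	return ib_sa_service_rec_get(&my_sa_client, dev, port_num, rec,
				     comp_mask, 3000, GFP_KERNEL,
				     my_service_cb, NULL, query);
}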
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 6146c3c1cbe5..0ed862b38b44 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -216,24 +216,12 @@ static ssize_t state_show(struct ib_device *ibdev, u32 port_num,
struct ib_port_attr attr;
ssize_t ret;
- static const char *state_name[] = {
- [IB_PORT_NOP] = "NOP",
- [IB_PORT_DOWN] = "DOWN",
- [IB_PORT_INIT] = "INIT",
- [IB_PORT_ARMED] = "ARMED",
- [IB_PORT_ACTIVE] = "ACTIVE",
- [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
- };
-
ret = ib_query_port(ibdev, port_num, &attr);
if (ret)
return ret;
return sysfs_emit(buf, "%d: %s\n", attr.state,
- attr.state >= 0 &&
- attr.state < ARRAY_SIZE(state_name) ?
- state_name[attr.state] :
- "UNKNOWN");
+ ib_port_state_to_str(attr.state));
}
static ssize_t lid_show(struct ib_device *ibdev, u32 port_num,
@@ -342,6 +330,10 @@ static ssize_t rate_show(struct ib_device *ibdev, u32 port_num,
speed = " NDR";
rate = 1000;
break;
+ case IB_SPEED_XDR:
+ speed = " XDR";
+ rate = 2000;
+ break;
case IB_SPEED_SDR:
default: /* default to SDR for invalid rates */
speed = " SDR";
@@ -433,6 +425,7 @@ static struct attribute *port_default_attrs[] = {
&ib_port_attr_link_layer.attr,
NULL
};
+ATTRIBUTE_GROUPS(port_default);
static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
{
@@ -755,9 +748,9 @@ static void ib_port_release(struct kobject *kobj)
for (i = 0; i != ARRAY_SIZE(port->groups); i++)
kfree(port->groups[i].attrs);
if (port->hw_stats_data)
- kfree(port->hw_stats_data->stats);
+ rdma_free_hw_stats_struct(port->hw_stats_data->stats);
kfree(port->hw_stats_data);
- kfree(port);
+ kvfree(port);
}
static void ib_port_gid_attr_release(struct kobject *kobj)
@@ -774,7 +767,7 @@ static void ib_port_gid_attr_release(struct kobject *kobj)
static struct kobj_type port_type = {
.release = ib_port_release,
.sysfs_ops = &port_sysfs_ops,
- .default_attrs = port_default_attrs
+ .default_groups = port_default_groups,
};
static struct kobj_type gid_attr_type = {
@@ -895,14 +888,14 @@ alloc_hw_stats_device(struct ib_device *ibdev)
stats = ibdev->ops.alloc_hw_device_stats(ibdev);
if (!stats)
return ERR_PTR(-ENOMEM);
- if (!stats->names || stats->num_counters <= 0)
+ if (!stats->descs || stats->num_counters <= 0)
goto err_free_stats;
/*
	 * Two extra attribute elements here, one for the lifespan entry and
* one to NULL terminate the list for the sysfs core code
*/
- data = kzalloc(struct_size(data, attrs, stats->num_counters + 1),
+ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)),
GFP_KERNEL);
if (!data)
goto err_free_stats;
@@ -911,7 +904,6 @@ alloc_hw_stats_device(struct ib_device *ibdev)
if (!data->group.attrs)
goto err_free_data;
- mutex_init(&stats->lock);
data->group.name = "hw_counters";
data->stats = stats;
return data;
@@ -919,14 +911,14 @@ alloc_hw_stats_device(struct ib_device *ibdev)
err_free_data:
kfree(data);
err_free_stats:
- kfree(stats);
+ rdma_free_hw_stats_struct(stats);
return ERR_PTR(-ENOMEM);
}
void ib_device_release_hw_stats(struct hw_stats_device_data *data)
{
kfree(data->group.attrs);
- kfree(data->stats);
+ rdma_free_hw_stats_struct(data->stats);
kfree(data);
}
@@ -934,7 +926,8 @@ int ib_setup_device_attrs(struct ib_device *ibdev)
{
struct hw_stats_device_attribute *attr;
struct hw_stats_device_data *data;
- int i, ret;
+ bool opstat_skipped = false;
+ int i, ret, pos = 0;
data = alloc_hw_stats_device(ibdev);
if (IS_ERR(data)) {
@@ -955,16 +948,23 @@ int ib_setup_device_attrs(struct ib_device *ibdev)
data->stats->timestamp = jiffies;
for (i = 0; i < data->stats->num_counters; i++) {
- attr = &data->attrs[i];
+ if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) {
+ opstat_skipped = true;
+ continue;
+ }
+
+ WARN_ON(opstat_skipped);
+ attr = &data->attrs[pos];
sysfs_attr_init(&attr->attr.attr);
- attr->attr.attr.name = data->stats->names[i];
+ attr->attr.attr.name = data->stats->descs[i].name;
attr->attr.attr.mode = 0444;
attr->attr.show = hw_stat_device_show;
attr->show = show_hw_stats;
- data->group.attrs[i] = &attr->attr.attr;
+ data->group.attrs[pos] = &attr->attr.attr;
+ pos++;
}
- attr = &data->attrs[i];
+ attr = &data->attrs[pos];
sysfs_attr_init(&attr->attr.attr);
attr->attr.attr.name = "lifespan";
attr->attr.attr.mode = 0644;
@@ -972,10 +972,11 @@ int ib_setup_device_attrs(struct ib_device *ibdev)
attr->show = show_stats_lifespan;
attr->attr.store = hw_stat_device_store;
attr->store = set_stats_lifespan;
- data->group.attrs[i] = &attr->attr.attr;
+ data->group.attrs[pos] = &attr->attr.attr;
for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++)
if (!ibdev->groups[i]) {
ibdev->groups[i] = &data->group;
+ ibdev->hw_stats_attr_index = i;
return 0;
}
WARN(true, "struct ib_device->groups is too small");
@@ -994,14 +995,14 @@ alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group)
stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num);
if (!stats)
return ERR_PTR(-ENOMEM);
- if (!stats->names || stats->num_counters <= 0)
+ if (!stats->descs || stats->num_counters <= 0)
goto err_free_stats;
/*
	 * Two extra attribute elements here, one for the lifespan entry and
* one to NULL terminate the list for the sysfs core code
*/
- data = kzalloc(struct_size(data, attrs, stats->num_counters + 1),
+ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)),
GFP_KERNEL);
if (!data)
goto err_free_stats;
@@ -1010,7 +1011,6 @@ alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group)
if (!group->attrs)
goto err_free_data;
- mutex_init(&stats->lock);
group->name = "hw_counters";
data->stats = stats;
return data;
@@ -1018,7 +1018,7 @@ alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group)
err_free_data:
kfree(data);
err_free_stats:
- kfree(stats);
+ rdma_free_hw_stats_struct(stats);
return ERR_PTR(-ENOMEM);
}
@@ -1027,7 +1027,8 @@ static int setup_hw_port_stats(struct ib_port *port,
{
struct hw_stats_port_attribute *attr;
struct hw_stats_port_data *data;
- int i, ret;
+ bool opstat_skipped = false;
+ int i, ret, pos = 0;
data = alloc_hw_stats_port(port, group);
if (IS_ERR(data))
@@ -1045,16 +1046,23 @@ static int setup_hw_port_stats(struct ib_port *port,
data->stats->timestamp = jiffies;
for (i = 0; i < data->stats->num_counters; i++) {
- attr = &data->attrs[i];
+ if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) {
+ opstat_skipped = true;
+ continue;
+ }
+
+ WARN_ON(opstat_skipped);
+ attr = &data->attrs[pos];
sysfs_attr_init(&attr->attr.attr);
- attr->attr.attr.name = data->stats->names[i];
+ attr->attr.attr.name = data->stats->descs[i].name;
attr->attr.attr.mode = 0444;
attr->attr.show = hw_stat_port_show;
attr->show = show_hw_stats;
- group->attrs[i] = &attr->attr.attr;
+ group->attrs[pos] = &attr->attr.attr;
+ pos++;
}
- attr = &data->attrs[i];
+ attr = &data->attrs[pos];
sysfs_attr_init(&attr->attr.attr);
attr->attr.attr.name = "lifespan";
attr->attr.attr.mode = 0644;
@@ -1062,7 +1070,7 @@ static int setup_hw_port_stats(struct ib_port *port,
attr->show = show_stats_lifespan;
attr->attr.store = hw_stat_port_store;
attr->store = set_stats_lifespan;
- group->attrs[i] = &attr->attr.attr;
+ group->attrs[pos] = &attr->attr.attr;
port->hw_stats_data = data;
return 0;
@@ -1125,7 +1133,7 @@ static int setup_gid_attrs(struct ib_port *port,
int ret;
gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list,
- attr->gid_tbl_len * 2),
+ size_mul(attr->gid_tbl_len, 2)),
GFP_KERNEL);
if (!gid_attr_group)
return -ENOMEM;
@@ -1189,15 +1197,18 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num,
struct ib_port *p;
int ret;
- p = kzalloc(struct_size(p, attrs_list,
- attr->gid_tbl_len + attr->pkey_tbl_len),
- GFP_KERNEL);
+ p = kvzalloc(struct_size(p, attrs_list,
+ size_add(attr->gid_tbl_len, attr->pkey_tbl_len)),
+ GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
p->ibdev = device;
p->port_num = port_num;
kobject_init(&p->kobj, &port_type);
+ if (device->port_data && is_full_dev)
+ device->port_data[port_num].sysfs = p;
+
cur_group = p->groups_list;
ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list,
attr->gid_tbl_len, show_port_gid);
@@ -1243,9 +1254,6 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num,
}
list_add_tail(&p->kobj.entry, &coredev->port_list);
- if (device->port_data && is_full_dev)
- device->port_data[port_num].sysfs = p;
-
return p;
err_groups:
@@ -1253,6 +1261,8 @@ err_groups:
err_del:
kobject_del(&p->kobj);
err_put:
+ if (device->port_data && is_full_dev)
+ device->port_data[port_num].sysfs = NULL;
kobject_put(&p->kobj);
return ERR_PTR(ret);
}
@@ -1261,14 +1271,17 @@ static void destroy_port(struct ib_core_device *coredev, struct ib_port *port)
{
bool is_full_dev = &port->ibdev->coredev == coredev;
- if (port->ibdev->port_data &&
- port->ibdev->port_data[port->port_num].sysfs == port)
- port->ibdev->port_data[port->port_num].sysfs = NULL;
list_del(&port->kobj.entry);
if (is_full_dev)
sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups);
+
sysfs_remove_groups(&port->kobj, port->groups_list);
kobject_del(&port->kobj);
+
+ if (port->ibdev->port_data &&
+ port->ibdev->port_data[port->port_num].sysfs == port)
+ port->ibdev->port_data[port->port_num].sysfs = NULL;
+
kobject_put(&port->kobj);
}
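
Sketch of the descriptor-based tables a driver now returns from its alloc_hw_port_stats() op (names below are hypothetical); per the WARN_ON(opstat_skipped) above, optional counters must follow all mandatory ones, and they are skipped in sysfs, remaining reachable only through the counters netlink interface:

static const struct rdma_stat_desc my_port_descs[] = {
	{ .name = "rx_packets" },
	{ .name = "tx_packets" },
	/* optional: not exposed in the hw_counters sysfs group */
	{ .name = "rare_event", .flags = IB_STAT_FLAG_OPTIONAL },
};

static struct rdma_hw_stats *my_alloc_hw_port_stats(struct ib_device *ibdev,
						    u32 port_num)
{
	return rdma_alloc_hw_stats_struct(my_port_descs,
					  ARRAY_SIZE(my_port_descs),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}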
diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c
new file mode 100644
index 000000000000..de5cb8bf0a61
--- /dev/null
+++ b/drivers/infiniband/core/ucaps.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/kref.h>
+#include <linux/cdev.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <rdma/ib_ucaps.h>
+
+#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL
+
+static DEFINE_MUTEX(ucaps_mutex);
+static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX];
+static bool ucaps_class_is_registered;
+static dev_t ucaps_base_dev;
+
+struct ib_ucap {
+ struct cdev cdev;
+ struct device dev;
+ struct kref ref;
+};
+
+static const char *ucap_names[RDMA_UCAP_MAX] = {
+ [RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local",
+ [RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca"
+};
+
+static char *ucaps_devnode(const struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0600;
+
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static const struct class ucaps_class = {
+ .name = "infiniband_ucaps",
+ .devnode = ucaps_devnode,
+};
+
+static const struct file_operations ucaps_cdev_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+};
+
+/**
+ * ib_cleanup_ucaps - cleanup all API resources and class.
+ *
+ * This is called once, when removing the ib_uverbs module.
+ */
+void ib_cleanup_ucaps(void)
+{
+ mutex_lock(&ucaps_mutex);
+ if (!ucaps_class_is_registered) {
+ mutex_unlock(&ucaps_mutex);
+ return;
+ }
+
+ for (int i = RDMA_UCAP_FIRST; i < RDMA_UCAP_MAX; i++)
+ WARN_ON(ucaps_list[i]);
+
+ class_unregister(&ucaps_class);
+ ucaps_class_is_registered = false;
+ unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX);
+ mutex_unlock(&ucaps_mutex);
+}
+
+static int get_ucap_from_devt(dev_t devt, u64 *idx_mask)
+{
+ for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+ if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) {
+			*idx_mask |= 1ULL << type;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ *ret_dev = file_inode(file)->i_rdev;
+ fput(file);
+ return 0;
+}
+
+/**
+ * ib_ucaps_init - Initialization required before ucap creation.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+static int ib_ucaps_init(void)
+{
+ int ret = 0;
+
+ if (ucaps_class_is_registered)
+ return ret;
+
+ ret = class_register(&ucaps_class);
+ if (ret)
+ return ret;
+
+ ret = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX,
+ ucaps_class.name);
+ if (ret < 0) {
+ class_unregister(&ucaps_class);
+ return ret;
+ }
+
+ ucaps_class_is_registered = true;
+
+ return 0;
+}
+
+static void ucap_dev_release(struct device *device)
+{
+ struct ib_ucap *ucap = container_of(device, struct ib_ucap, dev);
+
+ kfree(ucap);
+}
+
+/**
+ * ib_create_ucap - Add a ucap character device
+ * @type: UCAP type
+ *
+ * Creates a ucap character device in the /dev/infiniband directory. By default,
+ * the device has root-only read-write access.
+ *
+ * A driver may call this multiple times with the same UCAP type. A reference
+ * count tracks creations and deletions.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_create_ucap(enum rdma_user_cap type)
+{
+ struct ib_ucap *ucap;
+ int ret;
+
+ if (type >= RDMA_UCAP_MAX)
+ return -EINVAL;
+
+ mutex_lock(&ucaps_mutex);
+ ret = ib_ucaps_init();
+ if (ret)
+ goto unlock;
+
+ ucap = ucaps_list[type];
+ if (ucap) {
+ kref_get(&ucap->ref);
+ mutex_unlock(&ucaps_mutex);
+ return 0;
+ }
+
+ ucap = kzalloc(sizeof(*ucap), GFP_KERNEL);
+ if (!ucap) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ device_initialize(&ucap->dev);
+ ucap->dev.class = &ucaps_class;
+ ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type);
+ ucap->dev.release = ucap_dev_release;
+ ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]);
+ if (ret)
+ goto err_device;
+
+ cdev_init(&ucap->cdev, &ucaps_cdev_fops);
+ ucap->cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&ucap->cdev, &ucap->dev);
+ if (ret)
+ goto err_device;
+
+ kref_init(&ucap->ref);
+ ucaps_list[type] = ucap;
+ mutex_unlock(&ucaps_mutex);
+
+ return 0;
+
+err_device:
+ put_device(&ucap->dev);
+unlock:
+ mutex_unlock(&ucaps_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_create_ucap);
+
+static void ib_release_ucap(struct kref *ref)
+{
+ struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref);
+ enum rdma_user_cap type;
+
+ for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+ if (ucaps_list[type] == ucap)
+ break;
+ }
+ WARN_ON(type == RDMA_UCAP_MAX);
+
+ ucaps_list[type] = NULL;
+ cdev_device_del(&ucap->cdev, &ucap->dev);
+ put_device(&ucap->dev);
+}
+
+/**
+ * ib_remove_ucap - Remove a ucap character device
+ * @type: User cap type
+ *
+ * Removes the ucap character device according to type. The device is completely
+ * removed from the filesystem when its reference count reaches 0.
+ */
+void ib_remove_ucap(enum rdma_user_cap type)
+{
+ struct ib_ucap *ucap;
+
+ mutex_lock(&ucaps_mutex);
+ ucap = ucaps_list[type];
+ if (WARN_ON(!ucap))
+ goto end;
+
+ kref_put(&ucap->ref, ib_release_ucap);
+end:
+ mutex_unlock(&ucaps_mutex);
+}
+EXPORT_SYMBOL(ib_remove_ucap);
+
+/**
+ * ib_get_ucaps - Get bitmask of ucap types from file descriptors
+ * @fds: Array of file descriptors
+ * @fd_count: Number of file descriptors in the array
+ * @idx_mask: Bitmask to be updated based on the ucaps in the fd list
+ *
+ * Given an array of file descriptors, this function returns a bitmask of
+ * the ucaps where a bit is set if an FD for that ucap type was in the array.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask)
+{
+ int ret = 0;
+ dev_t dev;
+
+ *idx_mask = 0;
+ mutex_lock(&ucaps_mutex);
+ for (int i = 0; i < fd_count; i++) {
+ ret = get_devt_from_fd(fds[i], &dev);
+ if (ret)
+ goto end;
+
+ ret = get_ucap_from_devt(dev, idx_mask);
+ if (ret)
+ goto end;
+ }
+
+end:
+ mutex_unlock(&ucaps_mutex);
+ return ret;
+}
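
Hedged driver-side sketch: a device that supports a capability creates the node once at probe (pairing it with ib_remove_ucap() on removal); user space opens /dev/infiniband/mlx5_perm_ctrl_local and hands the fd back, which ib_get_ucaps() folds into a bitmask:

static int my_probe_caps(void)
{
	/* creates /dev/infiniband/mlx5_perm_ctrl_local (root-only, 0600) */
	return ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
}

static int my_check_caps(int *fds, int fd_count)
{
	u64 mask;
	int ret;

	ret = ib_get_ucaps(fds, fd_count, &mask);
	if (ret)
		return ret;

	if (!(mask & (1ULL << RDMA_UCAP_MLX5_CTRL_LOCAL)))
		return -EPERM;
	return 0;
}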
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 2b72c4fa9550..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -69,9 +69,10 @@ static struct ctl_table ucma_ctl_table[] = {
.data = &max_backlog,
.maxlen = sizeof max_backlog,
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
- { }
};
struct ucma_file {
@@ -95,6 +96,7 @@ struct ucma_context {
u64 uid;
struct list_head list;
+ struct list_head mc_list;
struct work_struct close_work;
};
@@ -105,6 +107,7 @@ struct ucma_multicast {
u64 uid;
u8 join_state;
+ struct list_head list;
struct sockaddr_storage addr;
};
@@ -198,6 +201,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
INIT_WORK(&ctx->close_work, ucma_close_id);
init_completion(&ctx->comp);
+ INIT_LIST_HEAD(&ctx->mc_list);
/* So list_del() will work if we don't do ucma_finish_ctx() */
INIT_LIST_HEAD(&ctx->list);
ctx->file = file;
@@ -278,6 +282,10 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx,
}
uevent->resp.event = event->event;
uevent->resp.status = event->status;
+
+ if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED)
+ goto out;
+
if (ctx->cm_id->qp_type == IB_QPT_UD)
ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud,
&event->param.ud);
@@ -285,6 +293,7 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx,
ucma_copy_conn_event(&uevent->resp.param.conn,
&event->param.conn);
+out:
uevent->resp.ece.vendor_id = event->ece.vendor_id;
uevent->resp.ece.attr_mod = event->ece.attr_mod;
return uevent;
@@ -357,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
xa_lock(&ctx_table);
if (xa_load(&ctx_table, ctx->id) == ctx)
- queue_work(system_unbound_wq, &ctx->close_work);
+ queue_work(system_dfl_wq, &ctx->close_work);
xa_unlock(&ctx_table);
}
return 0;
@@ -484,19 +493,19 @@ err1:
static void ucma_cleanup_multicast(struct ucma_context *ctx)
{
- struct ucma_multicast *mc;
- unsigned long index;
+ struct ucma_multicast *mc, *tmp;
- xa_for_each(&multicast_table, index, mc) {
- if (mc->ctx != ctx)
- continue;
+ xa_lock(&multicast_table);
+ list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+ list_del(&mc->list);
/*
		 * At this point mc->ctx->ref is 0, so no reader can still
		 * acquire the mc, and holding the xarray lock is enough
		 * serialization
*/
- xa_erase(&multicast_table, index);
+ __xa_erase(&multicast_table, mc->id);
kfree(mc);
}
+ xa_unlock(&multicast_table);
}
static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
@@ -724,6 +733,28 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file,
return ret;
}
+static ssize_t ucma_resolve_ib_service(struct ucma_file *file,
+ const char __user *inbuf, int in_len,
+ int out_len)
+{
+ struct rdma_ucm_resolve_ib_service cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
static ssize_t ucma_resolve_route(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -751,8 +782,8 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
{
struct rdma_dev_addr *dev_addr;
- resp->num_paths = route->num_paths;
- switch (route->num_paths) {
+ resp->num_paths = route->num_pri_alt_paths;
+ switch (route->num_pri_alt_paths) {
case 0:
dev_addr = &route->addr.dev_addr;
rdma_addr_get_dgid(dev_addr,
@@ -778,8 +809,8 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
struct rdma_route *route)
{
- resp->num_paths = route->num_paths;
- switch (route->num_paths) {
+ resp->num_paths = route->num_pri_alt_paths;
+ switch (route->num_pri_alt_paths) {
case 0:
rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
(union ib_gid *)&resp->ib_route[0].dgid);
@@ -918,7 +949,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx,
if (!resp)
return -ENOMEM;
- resp->num_paths = ctx->cm_id->route.num_paths;
+ resp->num_paths = ctx->cm_id->route.num_pri_alt_paths;
for (i = 0, out_len -= sizeof(*resp);
i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
i++, out_len -= sizeof(struct ib_path_rec_data)) {
@@ -990,6 +1021,43 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx,
return ret;
}
+static ssize_t ucma_query_ib_service(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_ib_service_resp *resp;
+ int n, ret = 0;
+
+ if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp))
+ return -ENOSPC;
+
+ if (!ctx->cm_id->route.service_recs)
+ return -ENODATA;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_service_recs = ctx->cm_id->route.num_service_recs;
+
+ n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) /
+ sizeof(struct ib_user_service_rec);
+
+ if (!n)
+ goto out;
+
+ if (n > ctx->cm_id->route.num_service_recs)
+ n = ctx->cm_id->route.num_service_recs;
+
+ memcpy(resp->recs, ctx->cm_id->route.service_recs,
+ sizeof(*resp->recs) * n);
+ if (copy_to_user(response, resp, struct_size(resp, recs, n)))
+ ret = -EFAULT;
+
+out:
+ kfree(resp);
+ return ret;
+}
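
The copy_to_user() above is bounded by struct_size() from <linux/overflow.h>, which computes header-plus-n-records with saturating arithmetic, so the copy can never exceed the kzalloc(out_len) buffer. The same idiom in isolation, against a hypothetical demo struct:

	struct demo_resp {
		__u32 num_recs;
		struct ib_user_service_rec recs[];	/* flexible array member */
	};

	if (copy_to_user(ubuf, resp, struct_size(resp, recs, n)))
		ret = -EFAULT;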
+
static ssize_t ucma_query(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -1018,6 +1086,9 @@ static ssize_t ucma_query(struct ucma_file *file,
case RDMA_USER_CM_QUERY_GID:
ret = ucma_query_gid(ctx, response, out_len);
break;
+ case RDMA_USER_CM_QUERY_IB_SERVICE:
+ ret = ucma_query_ib_service(ctx, response, out_len);
+ break;
default:
ret = -ENOSYS;
break;
@@ -1469,12 +1540,16 @@ static ssize_t ucma_process_join(struct ucma_file *file,
mc->uid = cmd->uid;
memcpy(&mc->addr, addr, cmd->addr_size);
- if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b,
+ xa_lock(&multicast_table);
+ if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b,
GFP_KERNEL)) {
ret = -ENOMEM;
goto err_free_mc;
}
+ list_add_tail(&mc->list, &ctx->mc_list);
+ xa_unlock(&multicast_table);
+
mutex_lock(&ctx->mutex);
ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
join_state, mc);
@@ -1500,8 +1575,11 @@ err_leave_multicast:
mutex_unlock(&ctx->mutex);
ucma_cleanup_mc_events(mc);
err_xa_erase:
- xa_erase(&multicast_table, mc->id);
+ xa_lock(&multicast_table);
+ list_del(&mc->list);
+ __xa_erase(&multicast_table, mc->id);
err_free_mc:
+ xa_unlock(&multicast_table);
kfree(mc);
err_put_ctx:
ucma_put_ctx(ctx);
@@ -1569,15 +1647,17 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
mc = ERR_PTR(-EINVAL);
else if (!refcount_inc_not_zero(&mc->ctx->ref))
mc = ERR_PTR(-ENXIO);
- else
- __xa_erase(&multicast_table, mc->id);
- xa_unlock(&multicast_table);
if (IS_ERR(mc)) {
+ xa_unlock(&multicast_table);
ret = PTR_ERR(mc);
goto out;
}
+ list_del(&mc->list);
+ __xa_erase(&multicast_table, mc->id);
+ xa_unlock(&multicast_table);
+
mutex_lock(&mc->ctx->mutex);
rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
mutex_unlock(&mc->ctx->mutex);
@@ -1604,7 +1684,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
struct ucma_event *uevent, *tmp;
struct ucma_context *ctx;
LIST_HEAD(event_list);
- struct fd f;
struct ucma_file *cur_file;
int ret = 0;
@@ -1612,21 +1691,17 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
return -EFAULT;
/* Get current fd to protect against it being closed */
- f = fdget(cmd.fd);
- if (!f.file)
+ CLASS(fd, f)(cmd.fd);
+ if (fd_empty(f))
return -ENOENT;
- if (f.file->f_op != &ucma_fops) {
- ret = -EINVAL;
- goto file_put;
- }
- cur_file = f.file->private_data;
+ if (fd_file(f)->f_op != &ucma_fops)
+ return -EINVAL;
+ cur_file = fd_file(f)->private_data;
/* Validate current fd and prevent destruction of id. */
ctx = ucma_get_ctx(cur_file, cmd.id);
- if (IS_ERR(ctx)) {
- ret = PTR_ERR(ctx);
- goto file_put;
- }
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
rdma_lock_handler(ctx->cm_id);
/*
@@ -1667,8 +1742,55 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
err_unlock:
rdma_unlock_handler(ctx->cm_id);
ucma_put_ctx(ctx);
-file_put:
- fdput(f);
+ return ret;
+}
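
CLASS(fd, f)(cmd.fd) declares f with scope-based cleanup: fdput() runs automatically on every return path, which is why the file_put: label and both explicit fdput() calls could be deleted. The idiom reduced to a sketch (demo_use_fd is hypothetical; CLASS, fd_empty() and fd_file() come from <linux/file.h>):

	static int demo_use_fd(int ufd)
	{
		CLASS(fd, f)(ufd);	/* auto fdput() when f leaves scope */

		if (fd_empty(f))
			return -EBADF;
		/* ... use fd_file(f) freely, early returns included ... */
		return 0;
	}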
+
+static ssize_t ucma_write_cm_event(struct ucma_file *file,
+ const char __user *inbuf, int in_len,
+ int out_len)
+{
+ struct rdma_ucm_write_cm_event cmd;
+ struct rdma_cm_event event = {};
+ struct ucma_event *uevent;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if ((cmd.event != RDMA_CM_EVENT_USER) &&
+ (cmd.event != RDMA_CM_EVENT_INTERNAL))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ event.event = cmd.event;
+ event.status = cmd.status;
+ event.param.arg = cmd.param.arg;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ uevent->ctx = ctx;
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ uevent->resp.event = event.event;
+ uevent->resp.status = event.status;
+ memcpy(uevent->resp.param.arg32, &event.param.arg,
+ sizeof(event.param.arg));
+
+ mutex_lock(&ctx->file->mut);
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ mutex_unlock(&ctx->file->mut);
+ wake_up_interruptible(&ctx->file->poll_wait);
+
+out:
+ ucma_put_ctx(ctx);
return ret;
}
@@ -1697,7 +1819,9 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
[RDMA_USER_CM_CMD_QUERY] = ucma_query,
[RDMA_USER_CM_CMD_BIND] = ucma_bind,
[RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
- [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast,
+ [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service,
+ [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event,
};
static ssize_t ucma_write(struct file *filp, const char __user *buf,
@@ -1806,7 +1930,6 @@ static const struct file_operations ucma_fops = {
.release = ucma_close,
.write = ucma_write,
.poll = ucma_poll,
- .llseek = no_llseek,
};
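
This deletion (and the matching ones in user_mad.c below) tracks the tree-wide removal of no_llseek: FMODE_LSEEK is now derived from whether ->llseek is populated, so a NULL hook already means the descriptor is not seekable and the stub had no effect.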
static struct miscdevice ucma_misc = {
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 64d9c492de64..8d3dfef9ebaa 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header,
return len;
}
EXPORT_SYMBOL(ib_ud_header_pack);
-
-/**
- * ib_ud_header_unpack - Unpack UD header struct from wire format
- * @header:UD header struct
- * @buf:Buffer to pack into
- *
- * ib_ud_header_pack() unpacks the UD header structure @header from wire
- * format in the buffer @buf.
- */
-int ib_ud_header_unpack(void *buf,
- struct ib_ud_header *header)
-{
- ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
- buf, &header->lrh);
- buf += IB_LRH_BYTES;
-
- if (header->lrh.link_version != 0) {
- pr_warn("Invalid LRH.link_version %u\n",
- header->lrh.link_version);
- return -EINVAL;
- }
-
- switch (header->lrh.link_next_header) {
- case IB_LNH_IBA_LOCAL:
- header->grh_present = 0;
- break;
-
- case IB_LNH_IBA_GLOBAL:
- header->grh_present = 1;
- ib_unpack(grh_table, ARRAY_SIZE(grh_table),
- buf, &header->grh);
- buf += IB_GRH_BYTES;
-
- if (header->grh.ip_version != 6) {
- pr_warn("Invalid GRH.ip_version %u\n",
- header->grh.ip_version);
- return -EINVAL;
- }
- if (header->grh.next_header != 0x1b) {
- pr_warn("Invalid GRH.next_header 0x%02x\n",
- header->grh.next_header);
- return -EINVAL;
- }
- break;
-
- default:
- pr_warn("Invalid LRH.link_next_header %u\n",
- header->lrh.link_next_header);
- return -EINVAL;
- }
-
- ib_unpack(bth_table, ARRAY_SIZE(bth_table),
- buf, &header->bth);
- buf += IB_BTH_BYTES;
-
- switch (header->bth.opcode) {
- case IB_OPCODE_UD_SEND_ONLY:
- header->immediate_present = 0;
- break;
- case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
- header->immediate_present = 1;
- break;
- default:
- pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
- return -EINVAL;
- }
-
- if (header->bth.transport_header_version != 0) {
- pr_warn("Invalid BTH.transport_header_version %u\n",
- header->bth.transport_header_version);
- return -EINVAL;
- }
-
- ib_unpack(deth_table, ARRAY_SIZE(deth_table),
- buf, &header->deth);
- buf += IB_DETH_BYTES;
-
- if (header->immediate_present)
- memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
-
- return 0;
-}
-EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 0eb40025075f..8137031c2a65 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,21 +45,27 @@
#include "uverbs.h"
+#define RESCHED_LOOP_CNT_THRESHOLD 0x1000
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
struct scatterlist *sg;
unsigned int i;
- if (umem->nmap > 0)
- ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
- DMA_BIDIRECTIONAL);
+ if (dirty)
+ ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
+ DMA_BIDIRECTIONAL, 0);
- for_each_sg(umem->sg_head.sgl, sg, umem->sg_nents, i)
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
- sg_free_table(&umem->sg_head);
+ if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
+ cond_resched();
+ }
+
+ sg_free_append_table(&umem->sgt_append);
}
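
The new cond_resched() every RESCHED_LOOP_CNT_THRESHOLD (0x1000) entries is the standard guard for long kernel loops: releasing a huge MR can walk millions of scatterlist entries, and yielding periodically avoids soft-lockup splats. The pattern in isolation (do_unit_of_work is a placeholder):

	for (i = 0; i < nr_entries; i++) {
		do_unit_of_work(i);
		if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
			cond_resched();		/* let the scheduler breathe */
	}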
/**
@@ -80,11 +86,16 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
unsigned long pgsz_bitmap,
unsigned long virt)
{
- struct scatterlist *sg;
+ unsigned long curr_len = 0;
+ dma_addr_t curr_base = ~0;
unsigned long va, pgoff;
+ struct scatterlist *sg;
dma_addr_t mask;
+ dma_addr_t end;
int i;
+ umem->iova = va = virt;
+
if (umem->is_odp) {
unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);
@@ -94,13 +105,6 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
return page_size;
}
- /* rdma_for_each_block() has a bug if the page size is smaller than the
- * page size used to build the umem. For now prevent smaller page sizes
- * from being returned.
- */
- pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);
-
- umem->iova = va = virt;
/* The best result is the smallest page size that results in the minimum
* number of required pages. Compute the largest page size that could
* work based on VA address bits that don't change.
@@ -111,18 +115,31 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
/* offset into first SGL */
pgoff = umem->address & ~PAGE_MASK;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
- /* Walk SGL and reduce max page size if VA/PA bits differ
- * for any address.
+ for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
+ /* If the current entry is physically contiguous with the previous
+ * one, no need to take its start address into consideration.
*/
- mask |= (sg_dma_address(sg) + pgoff) ^ va;
+ if (check_add_overflow(curr_base, curr_len, &end) ||
+ end != sg_dma_address(sg)) {
+
+ curr_base = sg_dma_address(sg);
+ curr_len = 0;
+
+ /* Reduce max page size if VA/PA bits differ */
+ mask |= (curr_base + pgoff) ^ va;
+
+ /* The alignment of any VA matching a discontinuity point
+ * in the physical memory sets the maximum possible page
+ * size as this must be a starting point of a new page that
+ * needs to be aligned.
+ */
+ if (i != 0)
+ mask |= va;
+ }
+
+ curr_len += sg_dma_len(sg);
va += sg_dma_len(sg) - pgoff;
- /* Except for the last entry, the ending iova alignment sets
- * the maximum possible page size as the low bits of the iova
- * must be zero when starting the next chunk.
- */
- if (i != (umem->nmap - 1))
- mask |= va;
+
pgoff = 0;
}
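
The accumulated mask feeds the unchanged tail of ib_umem_find_best_pgsz() (not shown in this hunk): every bit position where the VA and DMA address can disagree gets set, so only page sizes at or below the lowest set bit survive. Worked example under that assumption: mask = 0x21000 has bit 12 as its lowest set bit, so nothing above 4 KiB (1 << 12) keeps VA and DMA offsets congruent, and 4 KiB is reported even if the HCA also supports 2 MiB mappings.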
@@ -155,9 +172,8 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
unsigned long dma_attr = 0;
struct mm_struct *mm;
unsigned long npages;
- int ret;
- struct scatterlist *sg = NULL;
- unsigned int gup_flags = FOLL_WRITE;
+ int pinned, ret;
+ unsigned int gup_flags = FOLL_LONGTERM;
/*
* If the combination of the addr and size requested for this memory
@@ -211,29 +227,29 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
cur_base = addr & PAGE_MASK;
- if (!umem->writable)
- gup_flags |= FOLL_FORCE;
+ if (umem->writable)
+ gup_flags |= FOLL_WRITE;
while (npages) {
cond_resched();
- ret = pin_user_pages_fast(cur_base,
+ pinned = pin_user_pages_fast(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE /
sizeof(struct page *)),
- gup_flags | FOLL_LONGTERM, page_list);
- if (ret < 0)
+ gup_flags, page_list);
+ if (pinned < 0) {
+ ret = pinned;
goto umem_release;
+ }
- cur_base += ret * PAGE_SIZE;
- npages -= ret;
- sg = __sg_alloc_table_from_pages(&umem->sg_head, page_list, ret,
- 0, ret << PAGE_SHIFT,
- ib_dma_max_seg_size(device), sg, npages,
- GFP_KERNEL);
- umem->sg_nents = umem->sg_head.nents;
- if (IS_ERR(sg)) {
- unpin_user_pages_dirty_lock(page_list, ret, 0);
- ret = PTR_ERR(sg);
+ cur_base += pinned * PAGE_SIZE;
+ npages -= pinned;
+ ret = sg_alloc_append_table_from_pages(
+ &umem->sgt_append, page_list, pinned, 0,
+ pinned << PAGE_SHIFT, ib_dma_max_seg_size(device),
+ npages, GFP_KERNEL);
+ if (ret) {
+ unpin_user_pages_dirty_lock(page_list, pinned, 0);
goto umem_release;
}
}
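
pin_user_pages_fast() can pin fewer pages than requested or return a negative errno; splitting ret and pinned keeps the count and the error code out of one variable. The consumption loop reduced to its skeleton (batch is a hypothetical batch size):

	while (npages) {
		cond_resched();
		pinned = pin_user_pages_fast(cur_base,
					     min_t(unsigned long, npages, batch),
					     gup_flags, page_list);
		if (pinned < 0)
			return pinned;		/* negative errno */
		cur_base += pinned * PAGE_SIZE;	/* short pins are fine */
		npages -= pinned;
	}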
@@ -241,16 +257,10 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
if (access & IB_ACCESS_RELAXED_ORDERING)
dma_attr |= DMA_ATTR_WEAK_ORDERING;
- umem->nmap =
- ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
- DMA_BIDIRECTIONAL, dma_attr);
-
- if (!umem->nmap) {
- ret = -ENOMEM;
+ ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt,
+ DMA_BIDIRECTIONAL, dma_attr);
+ if (ret)
goto umem_release;
- }
-
- ret = 0;
goto out;
umem_release:
@@ -310,7 +320,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
return -EINVAL;
}
- ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
+ ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl,
+ umem->sgt_append.sgt.orig_nents, dst, length,
offset + ib_umem_offset(umem));
if (ret < 0)
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
index c6e875619fac..0ec2e4120cc9 100644
--- a/drivers/infiniband/core/umem_dmabuf.c
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -6,24 +6,31 @@
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/dma-mapping.h>
+#include <linux/module.h>
#include "uverbs.h"
+MODULE_IMPORT_NS("DMA_BUF");
+
int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
{
struct sg_table *sgt;
struct scatterlist *sg;
- struct dma_fence *fence;
unsigned long start, end, cur = 0;
unsigned int nmap = 0;
+ long ret;
int i;
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+ if (umem_dmabuf->revoked)
+ return -EINVAL;
+
if (umem_dmabuf->sgt)
goto wait_fence;
- sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
+ sgt = dma_buf_map_attachment(umem_dmabuf->attach,
+ DMA_BIDIRECTIONAL);
if (IS_ERR(sgt))
return PTR_ERR(sgt);
@@ -55,9 +62,8 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
cur += sg_dma_len(sg);
}
- umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg;
- umem_dmabuf->umem.sg_head.nents = nmap;
- umem_dmabuf->umem.nmap = nmap;
+ umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg;
+ umem_dmabuf->umem.sgt_append.sgt.nents = nmap;
umem_dmabuf->sgt = sgt;
wait_fence:
@@ -66,10 +72,13 @@ wait_fence:
* may be not up-to-date. Wait for the exporter to finish
* the migration.
*/
- fence = dma_resv_excl_fence(umem_dmabuf->attach->dmabuf->resv);
- if (fence)
- return dma_fence_wait(fence, false);
-
+ ret = dma_resv_wait_timeout(umem_dmabuf->attach->dmabuf->resv,
+ DMA_RESV_USAGE_KERNEL,
+ false, MAX_SCHEDULE_TIMEOUT);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -ETIMEDOUT;
return 0;
}
EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
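
dma_resv_wait_timeout() folds the old fetch-exclusive-fence-then-wait pair into one call, and its return convention drives the new error handling: negative is an errno, zero means the timeout expired, positive is the time remaining. With MAX_SCHEDULE_TIMEOUT a zero return should be unreachable, so the -ETIMEDOUT mapping is defensive. Shape of the call with a finite timeout:

	long ret = dma_resv_wait_timeout(resv, DMA_RESV_USAGE_KERNEL,
					 false, msecs_to_jiffies(100));
	if (ret < 0)
		return ret;		/* interrupted or other errno */
	if (ret == 0)
		return -ETIMEDOUT;	/* fences still pending after 100 ms */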
@@ -104,10 +113,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
}
EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
-struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
- unsigned long offset, size_t size,
- int fd, int access,
- const struct dma_buf_attach_ops *ops)
+static struct ib_umem_dmabuf *
+ib_umem_dmabuf_get_with_dma_device(struct ib_device *device,
+ struct device *dma_device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops)
{
struct dma_buf *dmabuf;
struct ib_umem_dmabuf *umem_dmabuf;
@@ -146,7 +157,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
umem_dmabuf->attach = dma_buf_dynamic_attach(
dmabuf,
- device->dma_device,
+ dma_device,
ops,
umem_dmabuf);
if (IS_ERR(umem_dmabuf->attach)) {
@@ -162,15 +173,101 @@ out_release_dmabuf:
dma_buf_put(dmabuf);
return ret;
}
+
+struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops)
+{
+ return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device,
+ offset, size, fd, access, ops);
+}
EXPORT_SYMBOL(ib_umem_dmabuf_get);
-void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+static void
+ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach)
+{
+ struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+
+ ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev,
+ "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
+ .allow_peer2peer = true,
+ .move_notify = ib_umem_dmabuf_unsupported_move_notify,
+};
+
+struct ib_umem_dmabuf *
+ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device,
+ struct device *dma_device,
+ unsigned long offset, size_t size,
+ int fd, int access)
+{
+ struct ib_umem_dmabuf *umem_dmabuf;
+ int err;
+
+ umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset,
+ size, fd, access,
+ &ib_umem_dmabuf_attach_pinned_ops);
+ if (IS_ERR(umem_dmabuf))
+ return umem_dmabuf;
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ err = dma_buf_pin(umem_dmabuf->attach);
+ if (err)
+ goto err_release;
+ umem_dmabuf->pinned = 1;
+
+ err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+ if (err)
+ goto err_unpin;
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+ return umem_dmabuf;
+
+err_unpin:
+ dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ ib_umem_release(&umem_dmabuf->umem);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device);
+
+struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
+ unsigned long offset,
+ size_t size, int fd,
+ int access)
+{
+ return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device,
+ offset, size, fd, access);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned);
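
A plausible caller for a driver that wants a pinned (non-ODP) dma-buf region; mydev, fd and length are hypothetical:

	umem_dmabuf = ib_umem_dmabuf_get_pinned(&mydev->ib_dev, 0, length,
						fd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(umem_dmabuf))
		return PTR_ERR(umem_dmabuf);
	/* pages stay pinned and mapped until ib_umem_release(&umem_dmabuf->umem) */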
+
+void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf)
{
struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
dma_resv_lock(dmabuf->resv, NULL);
+ if (umem_dmabuf->revoked)
+ goto end;
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+ if (umem_dmabuf->pinned) {
+ dma_buf_unpin(umem_dmabuf->attach);
+ umem_dmabuf->pinned = 0;
+ }
+ umem_dmabuf->revoked = 1;
+end:
dma_resv_unlock(dmabuf->resv);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_revoke);
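
Because the revoked flag makes ib_umem_dmabuf_revoke() idempotent, ib_umem_dmabuf_release() below can call it unconditionally rather than tracking whether the driver already revoked the mapping.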
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
+
+ ib_umem_dmabuf_revoke(umem_dmabuf);
dma_buf_detach(dmabuf, umem_dmabuf->attach);
dma_buf_put(dmabuf);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 9462dbe66014..572a91a62a7b 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,67 +41,83 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
- const struct mmu_interval_notifier_ops *ops)
+static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
- int ret;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+}
+
+static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t page_size = 1UL << umem_odp->page_shift;
+ struct hmm_dma_map *map;
+ unsigned long start;
+ unsigned long end;
+ size_t nr_entries;
+ int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
- if (!umem_odp->dma_list) {
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+ /*
+ * The mmu notifier can be called within reclaim contexts and takes the
+ * umem_mutex. This is rare to trigger in testing, teach lockdep about
+ * it.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ mutex_lock(&umem_odp->umem_mutex);
+ mutex_unlock(&umem_odp->umem_mutex);
+ fs_reclaim_release(GFP_KERNEL);
+ }
+
+ nr_entries = (end - start) >> PAGE_SHIFT;
+ if (!(nr_entries * PAGE_SIZE / page_size))
+ return -EINVAL;
+
+ map = &umem_odp->map;
+ if (ib_uses_virt_dma(dev)) {
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
ret = -ENOMEM;
- goto out_pfn_list;
- }
+ } else
+ ret = hmm_dma_map_alloc(dev->dma_device, map,
+ (end - start) >> PAGE_SHIFT,
+ 1 << umem_odp->page_shift);
+ if (ret)
+ return ret;
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_dma_list;
- }
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_map;
return 0;
-out_dma_list:
- kvfree(umem_odp->dma_list);
-out_pfn_list:
- kvfree(umem_odp->pfn_list);
+out_free_map:
+ if (ib_uses_virt_dma(dev))
+ kvfree(map->pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, map);
return ret;
}
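
The fs_reclaim_acquire()/mutex_lock() pair above is a lockdep-only priming trick: it records once, at init time, that umem_mutex can be taken inside reclaim, so any later GFP_KERNEL allocation under that mutex is flagged immediately rather than only under real memory pressure. The generic form, usable for any lock with the same constraint (my_lock is a placeholder):

	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);
		mutex_lock(&my_lock);
		mutex_unlock(&my_lock);
		fs_reclaim_release(GFP_KERNEL);
	}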
@@ -120,7 +136,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
- int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@@ -132,16 +147,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
- umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- ret = ib_init_umem_odp(umem_odp, NULL);
- if (ret) {
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
- return ERR_PTR(ret);
- }
+ ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@@ -227,7 +236,6 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
const struct mmu_interval_notifier_ops *ops)
{
struct ib_umem_odp *umem_odp;
- struct mm_struct *mm;
int ret;
if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
@@ -241,7 +249,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
umem_odp->umem.length = size;
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);
- umem_odp->umem.owning_mm = mm = current->mm;
+ umem_odp->umem.owning_mm = current->mm;
umem_odp->notifier.ops = ops;
umem_odp->page_shift = PAGE_SHIFT;
@@ -263,77 +271,41 @@ err_put_pid:
}
EXPORT_SYMBOL(ib_umem_odp_get);
-void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- if (!umem_odp->is_implicit_odp) {
- mutex_lock(&umem_odp->umem_mutex);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- mutex_unlock(&umem_odp->umem_mutex);
- mmu_interval_notifier_remove(&umem_odp->notifier);
- kvfree(umem_odp->dma_list);
- kvfree(umem_odp->pfn_list);
- }
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ if (ib_uses_virt_dma(dev))
+ kvfree(umem_odp->map.pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
-EXPORT_SYMBOL(ib_umem_odp_release);
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- * @current_seq: sequence number for synchronization with invalidations.
- * the sequence number is taken from
- * umem_odp->notifiers_seq.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
+ if (!umem_odp->is_implicit_odp)
+ ib_umem_odp_free(umem_odp);
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- *dma_addr |= access_mask;
- return 0;
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -361,9 +333,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -389,11 +358,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
- range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@@ -421,22 +390,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+ continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -449,24 +413,15 @@ retry:
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
}
- /* upon sucesss lock should stay on hold for the callee */
+ /* upon success lock should stay on hold for the callee */
if (!ret)
ret = dma_index - start_idx;
else
mutex_unlock(&umem_odp->umem_mutex);
out_put_mm:
- mmput(owning_mm);
+ mmput_async(owning_mm);
out_put_task:
if (owning_process)
put_task_struct(owning_process);
@@ -477,45 +432,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
- dma_addr_t dma;
- int idx;
- u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
- idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];
-
- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ u64 offset = addr - ib_umem_start(umem_odp);
+ size_t idx = offset >> umem_odp->page_shift;
+ unsigned long pfn = umem_odp->map.pfn_list[idx];
+
+ if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
+ goto clear;
+
+ if (pfn & HMM_PFN_WRITE) {
+ struct page *page = hmm_pfn_to_page(pfn);
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->npages--;
+clear:
+ umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 98cb594cd9a6..fd67fc9fe85a 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -63,6 +63,8 @@ MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
MODULE_LICENSE("Dual BSD/GPL");
+#define MAX_UMAD_RECV_LIST_SIZE 200000
+
enum {
IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS,
IB_UMAD_MAX_AGENTS = 32,
@@ -113,6 +115,7 @@ struct ib_umad_file {
struct mutex mutex;
struct ib_umad_port *port;
struct list_head recv_list;
+ atomic_t recv_list_size;
struct list_head send_list;
struct list_head port_list;
spinlock_t send_lock;
@@ -131,6 +134,11 @@ struct ib_umad_packet {
struct ib_user_mad mad;
};
+struct ib_rmpp_mad_hdr {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+} __packed;
+
#define CREATE_TRACE_POINTS
#include <trace/events/ib_umad.h>
@@ -175,24 +183,28 @@ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
return file->agents_dead ? NULL : file->agent[id];
}
-static int queue_packet(struct ib_umad_file *file,
- struct ib_mad_agent *agent,
- struct ib_umad_packet *packet)
+static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet, bool is_recv_mad)
{
int ret = 1;
mutex_lock(&file->mutex);
+ if (is_recv_mad &&
+ atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE)
+ goto unlock;
+
for (packet->mad.hdr.id = 0;
packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
packet->mad.hdr.id++)
if (agent == __get_agent(file, packet->mad.hdr.id)) {
list_add_tail(&packet->list, &file->recv_list);
+ atomic_inc(&file->recv_list_size);
wake_up_interruptible(&file->recv_wait);
ret = 0;
break;
}
-
+unlock:
mutex_unlock(&file->mutex);
return ret;
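
queue_packet() now refuses unsolicited receives once the per-file list reaches MAX_UMAD_RECV_LIST_SIZE, so a reader that stops draining can no longer pin unbounded kernel memory, while send-timeout packets (is_recv_mad == false) are still always queued. The drop-instead-of-grow skeleton:

	if (is_recv_mad &&
	    atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE)
		goto unlock;			/* caller frees the packet */

	list_add_tail(&packet->list, &file->recv_list);
	atomic_inc(&file->recv_list_size);	/* decremented in the read path */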
@@ -219,7 +231,7 @@ static void send_handler(struct ib_mad_agent *agent,
if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
packet->length = IB_MGMT_MAD_HDR;
packet->mad.hdr.status = ETIMEDOUT;
- if (!queue_packet(file, agent, packet))
+ if (!queue_packet(file, agent, packet, false))
return;
}
kfree(packet);
@@ -279,7 +291,7 @@ static void recv_handler(struct ib_mad_agent *agent,
rdma_destroy_ah_attr(&ah_attr);
}
- if (queue_packet(file, agent, packet))
+ if (queue_packet(file, agent, packet, true))
goto err2;
return;
@@ -404,6 +416,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
list_del(&packet->list);
+ atomic_dec(&file->recv_list_size);
mutex_unlock(&file->mutex);
@@ -416,6 +429,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
/* Requeue packet */
mutex_lock(&file->mutex);
list_add(&packet->list, &file->recv_list);
+ atomic_inc(&file->recv_list_size);
mutex_unlock(&file->mutex);
} else {
if (packet->recv_wc)
@@ -494,11 +508,11 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_umad_file *file = filp->private_data;
+ struct ib_rmpp_mad_hdr *rmpp_mad_hdr;
struct ib_umad_packet *packet;
struct ib_mad_agent *agent;
struct rdma_ah_attr ah_attr;
struct ib_ah *ah;
- struct ib_rmpp_mad *rmpp_mad;
__be64 *tid;
int ret, data_len, hdr_len, copy_offset, rmpp_active;
u8 base_version;
@@ -506,7 +520,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
return -EINVAL;
- packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ packet = kzalloc(sizeof(*packet) + IB_MGMT_RMPP_HDR, GFP_KERNEL);
if (!packet)
return -ENOMEM;
@@ -560,13 +574,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
goto err_up;
}
- rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
- hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ rmpp_mad_hdr = (struct ib_rmpp_mad_hdr *)packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad_hdr->mad_hdr.mgmt_class);
- if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ if (ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class)
&& ib_mad_kernel_rmpp_agent(agent)) {
copy_offset = IB_MGMT_RMPP_HDR;
- rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) &
IB_MGMT_RMPP_FLAG_ACTIVE;
} else {
copy_offset = IB_MGMT_MAD_HDR;
@@ -615,12 +629,12 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
*tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
(be64_to_cpup(tid) & 0xffffffff));
- rmpp_mad->mad_hdr.tid = *tid;
+ rmpp_mad_hdr->mad_hdr.tid = *tid;
}
if (!ib_mad_kernel_rmpp_agent(agent)
- && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
- && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ && ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
spin_lock_irq(&file->send_lock);
list_add_tail(&packet->list, &file->send_list);
spin_unlock_irq(&file->send_lock);
@@ -1068,7 +1082,6 @@ static const struct file_operations umad_fops = {
#endif
.open = ib_umad_open,
.release = ib_umad_close,
- .llseek = no_llseek,
};
static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -1136,7 +1149,6 @@ static const struct file_operations umad_sm_fops = {
.owner = THIS_MODULE,
.open = ib_umad_sm_open,
.release = ib_umad_sm_close,
- .llseek = no_llseek,
};
static struct ib_umad_port *get_port(struct ib_device *ibdev,
@@ -1224,13 +1236,13 @@ static struct attribute *umad_class_dev_attrs[] = {
};
ATTRIBUTE_GROUPS(umad_class_dev);
-static char *umad_devnode(struct device *dev, umode_t *mode)
+static char *umad_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
}
-static ssize_t abi_version_show(struct class *class,
- struct class_attribute *attr, char *buf)
+static ssize_t abi_version_show(const struct class *class,
+ const struct class_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
}
@@ -1307,15 +1319,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
if (ret)
goto err_cdev;
- ib_umad_init_port_dev(&port->sm_dev, port, device);
- port->sm_dev.devt = base_issm;
- dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
- cdev_init(&port->sm_cdev, &umad_sm_fops);
- port->sm_cdev.owner = THIS_MODULE;
+ if (rdma_cap_ib_smi(device, port_num)) {
+ ib_umad_init_port_dev(&port->sm_dev, port, device);
+ port->sm_dev.devt = base_issm;
+ dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
- ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
- if (ret)
- goto err_dev;
+ ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
+ if (ret)
+ goto err_dev;
+ }
return 0;
@@ -1331,9 +1345,13 @@ err_cdev:
static void ib_umad_kill_port(struct ib_umad_port *port)
{
struct ib_umad_file *file;
+ bool has_smi = false;
int id;
- cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) {
+ cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ has_smi = true;
+ }
cdev_device_del(&port->cdev, &port->dev);
mutex_lock(&port->file_mutex);
@@ -1359,7 +1377,8 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
ida_free(&umad_ida, port->dev_num);
/* balances device_initialize() */
- put_device(&port->sm_dev);
+ if (has_smi)
+ put_device(&port->sm_dev);
put_device(&port->dev);
}
@@ -1373,7 +1392,9 @@ static int ib_umad_add_one(struct ib_device *device)
s = rdma_start_port(device);
e = rdma_end_port(device);
- umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL);
+ umad_dev = kzalloc(struct_size(umad_dev, ports,
+ size_add(size_sub(e, s), 1)),
+ GFP_KERNEL);
if (!umad_dev)
return -ENOMEM;
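
size_add(size_sub(e, s), 1) moves the port-count arithmetic into the saturating size_t helpers from <linux/overflow.h>: if a bogus e < s ever slipped through, the result pins at SIZE_MAX and kzalloc() fails cleanly instead of allocating a too-small buffer. The idiom in isolation:

	dev = kzalloc(struct_size(dev, ports,
				  size_add(size_sub(end, start), 1)),
		      GFP_KERNEL);
	if (!dev)
		return -ENOMEM;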
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 821d93c8f712..797e2fcc8072 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -133,35 +133,6 @@ struct ib_uverbs_completion_event_file {
struct ib_uverbs_event_queue ev_queue;
};
-struct ib_uverbs_file {
- struct kref ref;
- struct ib_uverbs_device *device;
- struct mutex ucontext_lock;
- /*
- * ucontext must be accessed via ib_uverbs_get_ucontext() or with
- * ucontext_lock held
- */
- struct ib_ucontext *ucontext;
- struct ib_uverbs_async_event_file *default_async_file;
- struct list_head list;
-
- /*
- * To access the uobjects list hw_destroy_rwsem must be held for write
- * OR hw_destroy_rwsem held for read AND uobjects_lock held.
- * hw_destroy_rwsem should be called across any destruction of the HW
- * object of an associated uobject.
- */
- struct rw_semaphore hw_destroy_rwsem;
- spinlock_t uobjects_lock;
- struct list_head uobjects;
-
- struct mutex umap_lock;
- struct list_head umaps;
- struct page *disassociate_page;
-
- struct xarray idr;
-};
-
struct ib_uverbs_event {
union {
struct ib_uverbs_async_event_desc async;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 8c8ca7bce3ca..ce16404cdfb8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -42,6 +42,7 @@
#include <rdma/uverbs_types.h>
#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_ucaps.h>
#include "rdma_core.h"
#include "uverbs.h"
@@ -161,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
{
const void __user *res = iter->cur;
- if (iter->cur + len > iter->end)
+ if (len > iter->end - iter->cur)
return (void __force __user *)ERR_PTR(-ENOSPC);
iter->cur += len;
return res;
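
The rewritten bounds check avoids computing iter->cur + len, which can wrap around the top of the address space when len is attacker-controlled; iter->end - iter->cur is the remaining room and cannot overflow. The pattern in isolation:

	if (len > (size_t)(end - cur))	/* compare against remaining bytes */
		return ERR_PTR(-ENOSPC);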
@@ -192,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
- return (void *)uobj;
+ return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);
@@ -232,6 +233,8 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
{
struct ib_ucontext *ucontext = attrs->context;
struct ib_uverbs_file *file = attrs->ufile;
+ int *fd_array;
+ int fd_count;
int ret;
if (!down_read_trylock(&file->hw_destroy_rwsem))
@@ -247,6 +250,22 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
if (ret)
goto err;
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) {
+ fd_count = uverbs_attr_ptr_get_array_size(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ sizeof(int));
+ if (fd_count < 0) {
+ ret = fd_count;
+ goto err_uncharge;
+ }
+
+ fd_array = uverbs_attr_get_alloced_ptr(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR);
+ ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps);
+ if (ret)
+ goto err_uncharge;
+ }
+
ret = ucontext->device->ops.alloc_ucontext(ucontext,
&attrs->driver_udata);
if (ret)
@@ -337,7 +356,7 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext,
resp->hw_ver = attr->hw_ver;
resp->max_qp = attr->max_qp;
resp->max_qp_wr = attr->max_qp_wr;
- resp->device_cap_flags = lower_32_bits(attr->device_cap_flags);
+ resp->device_cap_flags = lower_32_bits(attr->device_cap_flags);
resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge);
resp->max_sge_rd = attr->max_sge_rd;
resp->max_cq = attr->max_cq;
@@ -572,7 +591,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
struct inode *inode = NULL;
int new_xrcd = 0;
struct ib_device *ib_dev;
- struct fd f = {};
+ struct fd f = EMPTY_FD;
int ret;
ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -584,12 +603,12 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
if (cmd.fd != -1) {
/* search for file descriptor */
f = fdget(cmd.fd);
- if (!f.file) {
+ if (fd_empty(f)) {
ret = -EBADF;
goto err_tree_mutex_unlock;
}
- inode = file_inode(f.file);
+ inode = file_inode(fd_file(f));
xrcd = find_xrcd(ibudev, inode);
if (!xrcd && !(cmd.oflags & O_CREAT)) {
/* no file descriptor. Need CREATE flag */
@@ -632,8 +651,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
atomic_inc(&xrcd->usecnt);
}
- if (f.file)
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
uobj_finalize_uobj_create(&obj->uobject, attrs);
@@ -648,8 +666,7 @@ err:
uobj_alloc_abort(&obj->uobject, attrs);
err_tree_mutex_unlock:
- if (f.file)
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
@@ -718,13 +735,13 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
goto err_free;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags,
+ cmd.access_flags, NULL,
&attrs->driver_udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
@@ -739,6 +756,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
mr->uobject = uobj;
atomic_inc(&pd->usecnt);
mr->iova = cmd.hca_va;
+ mr->length = cmd.length;
rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
rdma_restrack_set_name(&mr->res, NULL);
@@ -808,8 +826,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
if (cmd.flags & IB_MR_REREG_PD) {
new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
attrs);
- if (!new_pd) {
- ret = -EINVAL;
+ if (IS_ERR(new_pd)) {
+ ret = PTR_ERR(new_pd);
goto put_uobjs;
}
} else {
@@ -837,11 +855,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
new_mr->device = new_pd->device;
new_mr->pd = new_pd;
new_mr->type = IB_MR_TYPE_USER;
- new_mr->dm = NULL;
- new_mr->sig_attrs = NULL;
new_mr->uobject = uobj;
atomic_inc(&new_pd->usecnt);
- new_mr->iova = cmd.hca_va;
new_uobj->object = new_mr;
rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR);
@@ -864,8 +879,10 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
mr->pd = new_pd;
atomic_inc(&new_pd->usecnt);
}
- if (cmd.flags & IB_MR_REREG_TRANS)
+ if (cmd.flags & IB_MR_REREG_TRANS) {
mr->iova = cmd.hca_va;
+ mr->length = cmd.length;
+ }
}
memset(&resp, 0, sizeof(resp));
@@ -919,8 +936,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
return PTR_ERR(uobj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
@@ -1051,7 +1068,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs,
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ ret = ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
rdma_restrack_add(&cq->res);
@@ -1127,8 +1144,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
if (ret)
@@ -1189,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
/* we copy a struct ib_uverbs_poll_cq_resp to user space */
header_ptr = attrs->ucore.outbuf;
@@ -1238,8 +1255,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ib_req_notify_cq(cq, cmd.solicited_only ?
IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
@@ -1295,9 +1312,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
switch (cmd->qp_type) {
case IB_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
- break;
+ fallthrough;
case IB_QPT_RC:
case IB_QPT_UC:
case IB_QPT_UD:
@@ -1321,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
ind_tbl = uobj_get_obj_read(rwq_ind_table,
UVERBS_OBJECT_RWQ_IND_TBL,
cmd->rwq_ind_tbl_handle, attrs);
- if (!ind_tbl) {
- ret = -EINVAL;
+ if (IS_ERR(ind_tbl)) {
+ ret = PTR_ERR(ind_tbl);
goto err_put;
}
@@ -1360,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
if (cmd->is_srq) {
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
cmd->srq_handle, attrs);
- if (!srq || srq->srq_type == IB_SRQT_XRC) {
- ret = -EINVAL;
+ if (IS_ERR(srq) ||
+ srq->srq_type == IB_SRQT_XRC) {
+ ret = IS_ERR(srq) ? PTR_ERR(srq) :
+ -EINVAL;
goto err_put;
}
}
@@ -1371,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
rcq = uobj_get_obj_read(
cq, UVERBS_OBJECT_CQ,
cmd->recv_cq_handle, attrs);
- if (!rcq) {
- ret = -EINVAL;
+ if (IS_ERR(rcq)) {
+ ret = PTR_ERR(rcq);
goto err_put;
}
}
}
}
- if (has_sq)
+ if (has_sq) {
scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->send_cq_handle, attrs);
+ if (IS_ERR(scq)) {
+ ret = PTR_ERR(scq);
+ goto err_put;
+ }
+ }
+
if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI)
rcq = rcq ?: scq;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
attrs);
- if (!pd || (!scq && has_sq)) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put;
}
@@ -1402,7 +1427,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR :
IB_SIGNAL_REQ_WR;
attr.qp_type = cmd->qp_type;
- attr.create_flags = 0;
attr.cap.max_send_wr = cmd->max_send_wr;
attr.cap.max_recv_wr = cmd->max_recv_wr;
@@ -1427,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
}
if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
- if (!capable(CAP_NET_RAW)) {
+ if (!rdma_uattrs_has_raw_cap(attrs)) {
ret = -EPERM;
goto err_put;
}
@@ -1435,35 +1459,13 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
attr.source_qpn = cmd->source_qpn;
}
- if (cmd->qp_type == IB_QPT_XRC_TGT)
- qp = ib_create_qp(pd, &attr);
- else
- qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, obj,
- NULL);
-
+ qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj,
+ KBUILD_MODNAME);
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
goto err_put;
}
-
- if (cmd->qp_type != IB_QPT_XRC_TGT) {
- ret = ib_create_qp_security(qp, device);
- if (ret)
- goto err_cb;
-
- atomic_inc(&pd->usecnt);
- if (attr.send_cq)
- atomic_inc(&attr.send_cq->usecnt);
- if (attr.recv_cq)
- atomic_inc(&attr.recv_cq->usecnt);
- if (attr.srq)
- atomic_inc(&attr.srq->usecnt);
- if (ind_tbl)
- atomic_inc(&ind_tbl->usecnt);
- } else {
- /* It is done in _ib_create_qp for other QP types */
- qp->uobject = obj;
- }
+ ib_qp_usecnt_inc(qp);
obj->uevent.uobject.object = qp;
obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
@@ -1502,24 +1504,21 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
resp.response_length = uverbs_response_length(attrs, sizeof(resp));
return uverbs_response(attrs, &resp, sizeof(resp));
-err_cb:
- ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs));
-
err_put:
if (!IS_ERR(xrcd_uobj))
uobj_put_read(xrcd_uobj);
- if (pd)
+ if (!IS_ERR_OR_NULL(pd))
uobj_put_obj_read(pd);
- if (scq)
+ if (!IS_ERR_OR_NULL(scq))
rdma_lookup_put_uobject(&scq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (rcq && rcq != scq)
+ if (!IS_ERR_OR_NULL(rcq) && rcq != scq)
rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (srq)
+ if (!IS_ERR_OR_NULL(srq))
rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (ind_tbl)
+ if (!IS_ERR_OR_NULL(ind_tbl))
uobj_put_obj_read(ind_tbl);
uobj_alloc_abort(&obj->uevent.uobject, attrs);
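
ib_create_qp_user() plus ib_qp_usecnt_inc() subsume both the XRC_TGT special case and the open-coded atomic_inc() chain for the PD, send/recv CQs, SRQ and indirection table, which is what lets the err_cb unwind path disappear along with them.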
@@ -1681,8 +1680,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -1787,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -1876,8 +1875,15 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
attr->path_mtu = cmd->base.path_mtu;
if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE)
attr->path_mig_state = cmd->base.path_mig_state;
- if (cmd->base.attr_mask & IB_QP_QKEY)
+ if (cmd->base.attr_mask & IB_QP_QKEY) {
+ if (cmd->base.qkey & IB_QP_SET_QKEY &&
+ !(rdma_nl_get_privileged_qkey() ||
+ rdma_uattrs_has_raw_cap(attrs))) {
+ ret = -EPERM;
+ goto release_qp;
+ }
attr->qkey = cmd->base.qkey;
+ }
if (cmd->base.attr_mask & IB_QP_RQ_PSN)
attr->rq_psn = cmd->base.rq_psn;
if (cmd->base.attr_mask & IB_QP_SQ_PSN)
@@ -2030,11 +2036,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
if (ret)
return ret;
- wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+ wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size,
+ cmd.wr_count));
if (IS_ERR(wqes))
return PTR_ERR(wqes);
- sgls = uverbs_request_next_ptr(
- &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(&iter,
+ size_mul(cmd.sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return PTR_ERR(sgls);
ret = uverbs_request_finish(&iter);
@@ -2046,8 +2054,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
return -ENOMEM;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2084,9 +2092,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
user_wr->wr.ud.ah, attrs);
- if (!ud->ah) {
+ if (IS_ERR(ud->ah)) {
+ ret = PTR_ERR(ud->ah);
kfree(ud);
- ret = -EINVAL;
goto out_put;
}
ud->remote_qpn = user_wr->wr.ud.remote_qpn;
@@ -2220,11 +2228,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
if (wqe_size < sizeof(struct ib_uverbs_recv_wr))
return ERR_PTR(-EINVAL);
- wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+ wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count));
if (IS_ERR(wqes))
return ERR_CAST(wqes);
- sgls = uverbs_request_next_ptr(
- iter, sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(iter, size_mul(sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return ERR_CAST(sgls);
ret = uverbs_request_finish(iter);
@@ -2323,8 +2331,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2374,8 +2382,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq) {
- ret = -EINVAL;
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
goto out;
}
@@ -2431,8 +2439,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err;
}
@@ -2501,8 +2509,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
@@ -2551,8 +2559,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
mutex_lock(&obj->mcast_lock);
@@ -2686,8 +2694,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_FLOW_ACTION,
kern_spec->action.handle,
attrs);
- if (!ib_spec->action.act)
- return -EINVAL;
+ if (IS_ERR(ib_spec->action.act))
+ return PTR_ERR(ib_spec->action.act);
ib_spec->action.size =
sizeof(struct ib_flow_spec_action_handle);
flow_resources_add(uflow_res,
@@ -2704,8 +2712,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_COUNTERS,
kern_spec->flow_count.handle,
attrs);
- if (!ib_spec->flow_count.counters)
- return -EINVAL;
+ if (IS_ERR(ib_spec->flow_count.counters))
+ return PTR_ERR(ib_spec->flow_count.counters);
ib_spec->flow_count.size =
sizeof(struct ib_flow_spec_action_count);
flow_resources_add(uflow_res,
@@ -2757,7 +2765,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
case IB_FLOW_SPEC_ETH:
- ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_eth_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2768,7 +2776,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz);
break;
case IB_FLOW_SPEC_IPV4:
- ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_ipv4_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2779,7 +2787,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz);
break;
case IB_FLOW_SPEC_IPV6:
- ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_ipv6_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2795,7 +2803,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
break;
case IB_FLOW_SPEC_TCP:
case IB_FLOW_SPEC_UDP:
- ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_tcp_udp_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2806,7 +2814,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz);
break;
case IB_FLOW_SPEC_VXLAN_TUNNEL:
- ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_tunnel_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2821,7 +2829,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
return -EINVAL;
break;
case IB_FLOW_SPEC_ESP:
- ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_esp_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2832,7 +2840,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz);
break;
case IB_FLOW_SPEC_GRE:
- ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_gre_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2843,7 +2851,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz);
break;
case IB_FLOW_SPEC_MPLS:
- ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz);
+ ib_filter_sz = sizeof(struct ib_flow_mpls_filter);
actual_filter_sz = spec_filter_size(kern_spec_mask,
kern_filter_sz,
ib_filter_sz);
@@ -2923,14 +2931,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
return PTR_ERR(obj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- err = -EINVAL;
+ if (IS_ERR(pd)) {
+ err = PTR_ERR(pd);
goto err_uobj;
}
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq) {
- err = -EINVAL;
+ if (IS_ERR(cq)) {
+ err = PTR_ERR(cq);
goto err_put_pd;
}
@@ -3031,8 +3039,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
return -EINVAL;
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
- if (!wq)
- return -EINVAL;
+ if (IS_ERR(wq))
+ return PTR_ERR(wq);
if (cmd.attr_mask & IB_WQ_FLAGS) {
wq_attr.flags = cmd.flags;
@@ -3115,8 +3123,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
num_read_wqs++) {
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
wqs_handles[num_read_wqs], attrs);
- if (!wq) {
- err = -EINVAL;
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
goto put_wqs;
}
@@ -3218,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
if (cmd.comp_mask)
return -EINVAL;
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
@@ -3271,8 +3279,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- err = -EINVAL;
+ if (IS_ERR(qp)) {
+ err = PTR_ERR(qp);
goto err_uobj;
}
@@ -3418,15 +3426,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
if (ib_srq_has_cq(cmd->srq_type)) {
attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->cq_handle, attrs);
- if (!attr.ext.cq) {
- ret = -EINVAL;
+ if (IS_ERR(attr.ext.cq)) {
+ ret = PTR_ERR(attr.ext.cq);
goto err_put_xrcd;
}
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put_cq;
}
@@ -3533,8 +3541,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
attr.max_wr = cmd.max_wr;
attr.srq_limit = cmd.srq_limit;
@@ -3561,8 +3569,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
ret = ib_query_srq(srq, &attr);
@@ -3687,8 +3695,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
return -EOPNOTSUPP;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period);
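Two idioms recur throughout the uverbs_cmd.c hunks above: uobj_get_obj_read() callers now propagate the looked-up error pointer instead of flattening every failure to -EINVAL, and user-controlled size multiplications go through size_mul(), which saturates to SIZE_MAX on overflow. A minimal sketch of the combined pattern, reusing the helper names from the hunks (illustrative only, not part of the patch):

/*
 * Sketch of the error-pointer and saturating-multiply idioms above;
 * presumes the uverbs_cmd.c context (uverbs_request_next_ptr() etc.).
 */
static int recv_bounds_sketch(struct uverbs_req_iter *iter,
                              u32 wqe_size, u32 wr_count)
{
        const void __user *wqes;

        /*
         * size_mul() (<linux/overflow.h>) returns SIZE_MAX when the
         * product overflows, so an attacker-chosen wr_count makes the
         * request-length check inside the helper fail cleanly instead
         * of wrapping into a short read.
         */
        wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count));
        if (IS_ERR(wqes))               /* <linux/err.h> error pointer */
                return PTR_ERR(wqes);   /* keep the original errno */
        return 0;
}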
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 990f0724acc6..f80da6a67e24 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -36,13 +36,15 @@
#include "uverbs.h"
struct bundle_alloc_head {
- struct bundle_alloc_head *next;
+ struct_group_tagged(bundle_alloc_head_hdr, hdr,
+ struct bundle_alloc_head *next;
+ );
u8 data[];
};
struct bundle_priv {
/* Must be first */
- struct bundle_alloc_head alloc_head;
+ struct bundle_alloc_head_hdr alloc_head;
struct bundle_alloc_head *allocated_mem;
size_t internal_avail;
size_t internal_used;
@@ -64,7 +66,7 @@ struct bundle_priv {
* Must be last. bundle ends in a flex array which overlaps
* internal_buffer.
*/
- struct uverbs_attr_bundle bundle;
+ struct uverbs_attr_bundle_hdr bundle;
u64 internal_buffer[32];
};
@@ -77,9 +79,10 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
unsigned int num_attrs)
{
struct bundle_priv *pbundle;
+ struct uverbs_attr_bundle *bundle;
size_t bundle_size =
offsetof(struct bundle_priv, internal_buffer) +
- sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len +
+ sizeof(*bundle->attrs) * method_elm->key_bitmap_len +
sizeof(*pbundle->uattrs) * num_attrs;
method_elm->use_stack = bundle_size <= sizeof(*pbundle);
@@ -107,7 +110,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
gfp_t flags)
{
struct bundle_priv *pbundle =
- container_of(bundle, struct bundle_priv, bundle);
+ container_of(&bundle->hdr, struct bundle_priv, bundle);
size_t new_used;
void *res;
@@ -149,7 +152,7 @@ static int uverbs_set_output(const struct uverbs_attr_bundle *bundle,
const struct uverbs_attr *attr)
{
struct bundle_priv *pbundle =
- container_of(bundle, struct bundle_priv, bundle);
+ container_of(&bundle->hdr, struct bundle_priv, bundle);
u16 flags;
flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
@@ -166,6 +169,8 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
struct ib_uverbs_attr *uattr,
u32 attr_bkey)
{
+ struct uverbs_attr_bundle *bundle =
+ container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
const struct uverbs_attr_spec *spec = &attr_uapi->spec;
size_t array_len;
u32 *idr_vals;
@@ -184,7 +189,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
return -EINVAL;
attr->uobjects =
- uverbs_alloc(&pbundle->bundle,
+ uverbs_alloc(bundle,
array_size(array_len, sizeof(*attr->uobjects)));
if (IS_ERR(attr->uobjects))
return PTR_ERR(attr->uobjects);
@@ -209,7 +214,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
for (i = 0; i != array_len; i++) {
attr->uobjects[i] = uverbs_get_uobject_from_file(
spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
- idr_vals[i], &pbundle->bundle);
+ idr_vals[i], bundle);
if (IS_ERR(attr->uobjects[i])) {
ret = PTR_ERR(attr->uobjects[i]);
break;
@@ -240,7 +245,9 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
struct ib_uverbs_attr *uattr, u32 attr_bkey)
{
const struct uverbs_attr_spec *spec = &attr_uapi->spec;
- struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
+ struct uverbs_attr_bundle *bundle =
+ container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
+ struct uverbs_attr *e = &bundle->attrs[attr_bkey];
const struct uverbs_attr_spec *val_spec = spec;
struct uverbs_obj_attr *o_attr;
@@ -288,7 +295,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) {
void *p;
- p = uverbs_alloc(&pbundle->bundle, uattr->len);
+ p = uverbs_alloc(bundle, uattr->len);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -321,7 +328,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
*/
o_attr->uobject = uverbs_get_uobject_from_file(
spec->u.obj.obj_type, spec->u.obj.access,
- uattr->data_s64, &pbundle->bundle);
+ uattr->data_s64, bundle);
if (IS_ERR(o_attr->uobject))
return PTR_ERR(o_attr->uobject);
__set_bit(attr_bkey, pbundle->uobj_finalize);
@@ -337,6 +344,14 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
break;
+ case UVERBS_ATTR_TYPE_RAW_FD:
+ if (uattr->attr_data.reserved || uattr->len != 0 ||
+ uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX)
+ return -EINVAL;
+ /* _uverbs_get_const_signed() is the accessor */
+ e->ptr_attr.data = uattr->data_s64;
+ break;
+
case UVERBS_ATTR_TYPE_IDRS_ARRAY:
return uverbs_process_idrs_array(pbundle, attr_uapi,
&e->objs_arr_attr, uattr,
@@ -414,6 +429,8 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
unsigned int num_attrs)
{
int (*handler)(struct uverbs_attr_bundle *attrs);
+ struct uverbs_attr_bundle *bundle =
+ container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
unsigned int i;
@@ -426,7 +443,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
if (!handler)
return -EIO;
- pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
+ pbundle->uattrs = uverbs_alloc(bundle, uattrs_size);
if (IS_ERR(pbundle->uattrs))
return PTR_ERR(pbundle->uattrs);
if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
@@ -445,25 +462,23 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
return -EINVAL;
if (pbundle->method_elm->has_udata)
- uverbs_fill_udata(&pbundle->bundle,
- &pbundle->bundle.driver_udata,
+ uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata,
UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
else
pbundle->bundle.driver_udata = (struct ib_udata){};
if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
- struct uverbs_obj_attr *destroy_attr =
- &pbundle->bundle.attrs[destroy_bkey].obj_attr;
+ struct uverbs_obj_attr *destroy_attr =
+ &bundle->attrs[destroy_bkey].obj_attr;
- ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
+ ret = uobj_destroy(destroy_attr->uobject, bundle);
if (ret)
return ret;
__clear_bit(destroy_bkey, pbundle->uobj_finalize);
- ret = handler(&pbundle->bundle);
+ ret = handler(bundle);
uobj_put_destroy(destroy_attr->uobject);
} else {
- ret = handler(&pbundle->bundle);
+ ret = handler(bundle);
}
/*
@@ -473,10 +488,10 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
*/
if (!ret && pbundle->method_elm->has_udata) {
const struct uverbs_attr *attr =
- uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT);
+ uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);
if (!IS_ERR(attr))
- ret = uverbs_set_output(&pbundle->bundle, attr);
+ ret = uverbs_set_output(bundle, attr);
}
/*
@@ -493,6 +508,8 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
static void bundle_destroy(struct bundle_priv *pbundle, bool commit)
{
unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len;
+ struct uverbs_attr_bundle *bundle =
+ container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
struct bundle_alloc_head *memblock;
unsigned int i;
@@ -500,20 +517,19 @@ static void bundle_destroy(struct bundle_priv *pbundle, bool commit)
i = -1;
while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
i + 1)) < key_bitmap_len) {
- struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+ struct uverbs_attr *attr = &bundle->attrs[i];
uverbs_finalize_object(
attr->obj_attr.uobject,
attr->obj_attr.attr_elm->spec.u.obj.access,
test_bit(i, pbundle->uobj_hw_obj_valid),
- commit,
- &pbundle->bundle);
+ commit, bundle);
}
i = -1;
while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len,
i + 1)) < key_bitmap_len) {
- struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+ struct uverbs_attr *attr = &bundle->attrs[i];
const struct uverbs_api_attr *attr_uapi;
void __rcu **slot;
@@ -527,7 +543,7 @@ static void bundle_destroy(struct bundle_priv *pbundle, bool commit)
if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr,
- commit, &pbundle->bundle);
+ commit, bundle);
}
}
@@ -570,7 +586,8 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
method_elm->bundle_size -
offsetof(struct bundle_priv, internal_buffer);
pbundle->alloc_head.next = NULL;
- pbundle->allocated_mem = &pbundle->alloc_head;
+ pbundle->allocated_mem = container_of(&pbundle->alloc_head,
+ struct bundle_alloc_head, hdr);
} else {
pbundle = &onstack;
pbundle->internal_avail = sizeof(pbundle->internal_buffer);
@@ -588,8 +605,9 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
pbundle->user_attrs = user_attrs;
pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
- sizeof(*pbundle->bundle.attrs),
- sizeof(*pbundle->internal_buffer));
+ sizeof(*container_of(&pbundle->bundle,
+ struct uverbs_attr_bundle, hdr)->attrs),
+ sizeof(*pbundle->internal_buffer));
memset(pbundle->bundle.attr_present, 0,
sizeof(pbundle->bundle.attr_present));
memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
@@ -692,11 +710,13 @@ void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
unsigned int attr_out)
{
struct bundle_priv *pbundle =
- container_of(bundle, struct bundle_priv, bundle);
+ container_of(&bundle->hdr, struct bundle_priv, bundle);
+ struct uverbs_attr_bundle *bundle_aux =
+ container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
const struct uverbs_attr *in =
- uverbs_attr_get(&pbundle->bundle, attr_in);
+ uverbs_attr_get(bundle_aux, attr_in);
const struct uverbs_attr *out =
- uverbs_attr_get(&pbundle->bundle, attr_out);
+ uverbs_attr_get(bundle_aux, attr_out);
if (!IS_ERR(in)) {
udata->inlen = in->ptr_attr.len;
@@ -821,7 +841,7 @@ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle,
u16 idx)
{
struct bundle_priv *pbundle =
- container_of(bundle, struct bundle_priv, bundle);
+ container_of(&bundle->hdr, struct bundle_priv, bundle);
__set_bit(uapi_bkey_attr(uapi_key_attr(idx)),
pbundle->uobj_hw_obj_valid);
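The bundle_alloc_head change relies on struct_group_tagged() from <linux/stddef.h>: it wraps the leading members in an anonymous union so they stay addressable directly while also gaining a standalone tagged type that bundle_priv can embed without carrying a mid-struct flexible array. A rough expansion, simplified for illustration (attributes elided):

/* Approximate expansion of the struct_group_tagged() above; both
 * spellings of ->next name the same storage. */
struct bundle_alloc_head {
        union {
                struct {
                        struct bundle_alloc_head *next;
                };
                struct bundle_alloc_head_hdr {
                        struct bundle_alloc_head *next;
                } hdr;
        };
        u8 data[];
};

container_of(&pbundle->alloc_head, struct bundle_alloc_head, hdr) then recovers the full flex-array type, which is exactly what the ib_uverbs_cmd_verbs() hunk does.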
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index d54434088727..973fe2c7ef53 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -52,6 +52,7 @@
#include <rdma/ib.h>
#include <rdma/uverbs_std_types.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_ucaps.h>
#include "uverbs.h"
#include "core_priv.h"
@@ -72,11 +73,23 @@ enum {
#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
static dev_t dynamic_uverbs_dev;
-static struct class *uverbs_class;
static DEFINE_IDA(uverbs_ida);
static int ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
+static struct ib_client uverbs_client;
+
+static char *uverbs_devnode(const struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static const struct class uverbs_class = {
+ .name = "infiniband_verbs",
+ .devnode = uverbs_devnode,
+};
/*
* Must be called with the ufile->device->disassociate_srcu held, and the lock
@@ -206,6 +219,7 @@ void ib_uverbs_release_file(struct kref *ref)
if (file->disassociate_page)
__free_pages(file->disassociate_page, 0);
+ mutex_destroy(&file->disassociation_lock);
mutex_destroy(&file->umap_lock);
mutex_destroy(&file->ucontext_lock);
kfree(file);
@@ -222,8 +236,12 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue,
spin_lock_irq(&ev_queue->lock);
while (list_empty(&ev_queue->event_list)) {
- spin_unlock_irq(&ev_queue->lock);
+ if (ev_queue->is_closed) {
+ spin_unlock_irq(&ev_queue->lock);
+ return -EIO;
+ }
+ spin_unlock_irq(&ev_queue->lock);
if (filp->f_flags & O_NONBLOCK)
return -EAGAIN;
@@ -233,12 +251,6 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue,
return -ERESTARTSYS;
spin_lock_irq(&ev_queue->lock);
-
- /* If device was disassociated and no event exists set an error */
- if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) {
- spin_unlock_irq(&ev_queue->lock);
- return -EIO;
- }
}
event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list);
@@ -344,7 +356,6 @@ const struct file_operations uverbs_event_fops = {
.poll = ib_uverbs_comp_event_poll,
.release = uverbs_uobject_fd_release,
.fasync = ib_uverbs_comp_event_fasync,
- .llseek = no_llseek,
};
const struct file_operations uverbs_async_event_fops = {
@@ -353,7 +364,6 @@ const struct file_operations uverbs_async_event_fops = {
.poll = ib_uverbs_async_event_poll,
.release = uverbs_async_event_release,
.fasync = ib_uverbs_async_event_fasync,
- .llseek = no_llseek,
};
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
@@ -537,7 +547,7 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
if (hdr->in_words * 4 != count)
return -EINVAL;
- if (count < method_elm->req_size + sizeof(hdr)) {
+ if (count < method_elm->req_size + sizeof(*hdr)) {
/*
* rdma-core v18 and v19 have a bug where they send DESTROY_CQ
* with a 16 byte write instead of 24. Old kernels didn't
@@ -691,8 +701,13 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
ret = PTR_ERR(ucontext);
goto out;
}
+
+ mutex_lock(&file->disassociation_lock);
+
vma->vm_ops = &rdma_umap_ops;
ret = ucontext->device->ops.mmap(ucontext, vma);
+
+ mutex_unlock(&file->disassociation_lock);
out:
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
return ret;
@@ -714,6 +729,8 @@ static void rdma_umap_open(struct vm_area_struct *vma)
/* We are racing with disassociation */
if (!down_read_trylock(&ufile->hw_destroy_rwsem))
goto out_zap;
+ mutex_lock(&ufile->disassociation_lock);
+
/*
* Disassociation already completed, the VMA should already be zapped.
*/
@@ -725,10 +742,12 @@ static void rdma_umap_open(struct vm_area_struct *vma)
goto out_unlock;
rdma_umap_priv_init(priv, vma, opriv->entry);
+ mutex_unlock(&ufile->disassociation_lock);
up_read(&ufile->hw_destroy_rwsem);
return;
out_unlock:
+ mutex_unlock(&ufile->disassociation_lock);
up_read(&ufile->hw_destroy_rwsem);
out_zap:
/*
@@ -812,7 +831,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
{
struct rdma_umap_priv *priv, *next_priv;
- lockdep_assert_held(&ufile->hw_destroy_rwsem);
+ mutex_lock(&ufile->disassociation_lock);
while (1) {
struct mm_struct *mm = NULL;
@@ -838,8 +857,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
break;
}
mutex_unlock(&ufile->umap_lock);
- if (!mm)
+ if (!mm) {
+ mutex_unlock(&ufile->disassociation_lock);
return;
+ }
/*
* The umap_lock is nested under mmap_lock since it used within
@@ -869,7 +890,31 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
mmap_read_unlock(mm);
mmput(mm);
}
+
+ mutex_unlock(&ufile->disassociation_lock);
+}
+
+/**
+ * rdma_user_mmap_disassociate() - Revoke mmaps for a device
+ * @device: device to revoke
+ *
+ * This function should be called by drivers that need to disable mmaps for the
+ * device, for instance because it is going to be reset.
+ */
+void rdma_user_mmap_disassociate(struct ib_device *device)
+{
+ struct ib_uverbs_device *uverbs_dev =
+ ib_get_client_data(device, &uverbs_client);
+ struct ib_uverbs_file *ufile;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) {
+ if (ufile->ucontext)
+ uverbs_user_mmap_disassociate(ufile);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
}
+EXPORT_SYMBOL(rdma_user_mmap_disassociate);
/*
* ib_uverbs_open() does not need the BKL:
@@ -940,6 +985,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
mutex_init(&file->umap_lock);
INIT_LIST_HEAD(&file->umaps);
+ mutex_init(&file->disassociation_lock);
+
filp->private_data = file;
list_add_tail(&file->list, &dev->uverbs_file_list);
mutex_unlock(&dev->lists_mutex);
@@ -982,7 +1029,6 @@ static const struct file_operations uverbs_fops = {
.write = ib_uverbs_write,
.open = ib_uverbs_open,
.release = ib_uverbs_close,
- .llseek = no_llseek,
.unlocked_ioctl = ib_uverbs_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -993,7 +1039,6 @@ static const struct file_operations uverbs_mmap_fops = {
.mmap = ib_uverbs_mmap,
.open = ib_uverbs_open,
.release = ib_uverbs_close,
- .llseek = no_llseek,
.unlocked_ioctl = ib_uverbs_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -1105,7 +1150,8 @@ static int ib_uverbs_add_one(struct ib_device *device)
struct ib_uverbs_device *uverbs_dev;
int ret;
- if (!device->ops.alloc_ucontext)
+ if (!device->ops.alloc_ucontext ||
+ device->type == RDMA_DEVICE_TYPE_SMI)
return -EOPNOTSUPP;
uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
@@ -1119,7 +1165,7 @@ static int ib_uverbs_add_one(struct ib_device *device)
}
device_initialize(&uverbs_dev->dev);
- uverbs_dev->dev.class = uverbs_class;
+ uverbs_dev->dev.class = &uverbs_class;
uverbs_dev->dev.parent = device->dev.parent;
uverbs_dev->dev.release = ib_uverbs_release_dev;
uverbs_dev->groups[0] = &dev_attr_group;
@@ -1237,13 +1283,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
put_device(&uverbs_dev->dev);
}
-static char *uverbs_devnode(struct device *dev, umode_t *mode)
-{
- if (mode)
- *mode = 0666;
- return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
-}
-
static int __init ib_uverbs_init(void)
{
int ret;
@@ -1264,16 +1303,13 @@ static int __init ib_uverbs_init(void)
goto out_alloc;
}
- uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
- if (IS_ERR(uverbs_class)) {
- ret = PTR_ERR(uverbs_class);
+ ret = class_register(&uverbs_class);
+ if (ret) {
pr_err("user_verbs: couldn't create class infiniband_verbs\n");
goto out_chrdev;
}
- uverbs_class->devnode = uverbs_devnode;
-
- ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+ ret = class_create_file(&uverbs_class, &class_attr_abi_version.attr);
if (ret) {
pr_err("user_verbs: couldn't create abi_version attribute\n");
goto out_class;
@@ -1288,7 +1324,7 @@ static int __init ib_uverbs_init(void)
return 0;
out_class:
- class_destroy(uverbs_class);
+ class_unregister(&uverbs_class);
out_chrdev:
unregister_chrdev_region(dynamic_uverbs_dev,
@@ -1305,11 +1341,12 @@ out:
static void __exit ib_uverbs_cleanup(void)
{
ib_unregister_client(&uverbs_client);
- class_destroy(uverbs_class);
+ class_unregister(&uverbs_class);
unregister_chrdev_region(IB_UVERBS_BASE_DEV,
IB_UVERBS_NUM_FIXED_MINOR);
unregister_chrdev_region(dynamic_uverbs_dev,
IB_UVERBS_NUM_DYNAMIC_MINOR);
+ ib_cleanup_ucaps();
mmu_notifier_synchronize();
}
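rdma_user_mmap_disassociate(), exported above, lets a driver revoke every userspace mapping before touching the hardware; after the zap, faults on the old VMAs are satisfied with a dummy page rather than device BARs. A hypothetical caller under those assumptions (the foo_* names are placeholders, not from this patch):

/* Hypothetical driver; the foo_* names are placeholders only. */
struct foo_dev {
        struct ib_device ib_dev;
        /* ... device-specific state ... */
};

static void foo_reset_device(struct foo_dev *fdev)
{
        /* Kill all user mappings of doorbells/BARs first so userspace
         * cannot poke the device while it is being reset. */
        rdma_user_mmap_disassociate(&fdev->ib_dev);

        foo_hw_reset(fdev);     /* assumed device-specific reset */
}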
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index b8d715c68ca4..e803f609ec87 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -66,7 +66,7 @@ void ib_copy_ah_attr_to_user(struct ib_device *device,
struct rdma_ah_attr *src = ah_attr;
struct rdma_ah_attr conv_ah;
- memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved));
+ memset(&dst->grh, 0, sizeof(dst->grh));
if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) &&
(rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) &&
@@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
__ib_copy_path_rec_to_user(dst, src);
}
EXPORT_SYMBOL(ib_copy_path_rec_to_user);
-
-void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
- struct ib_user_path_rec *src)
-{
- u32 slid, dlid;
-
- memset(dst, 0, sizeof(*dst));
- if ((ib_is_opa_gid((union ib_gid *)src->sgid)) ||
- (ib_is_opa_gid((union ib_gid *)src->dgid))) {
- dst->rec_type = SA_PATH_REC_TYPE_OPA;
- slid = opa_get_lid_from_gid((union ib_gid *)src->sgid);
- dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid);
- } else {
- dst->rec_type = SA_PATH_REC_TYPE_IB;
- slid = ntohs(src->slid);
- dlid = ntohs(src->dlid);
- }
- memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
- memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
-
- sa_path_set_dlid(dst, dlid);
- sa_path_set_slid(dst, slid);
- sa_path_set_raw_traffic(dst, src->raw_traffic);
- dst->flow_label = src->flow_label;
- dst->hop_limit = src->hop_limit;
- dst->traffic_class = src->traffic_class;
- dst->reversible = src->reversible;
- dst->numb_path = src->numb_path;
- dst->pkey = src->pkey;
- dst->sl = src->sl;
- dst->mtu_selector = src->mtu_selector;
- dst->mtu = src->mtu;
- dst->rate_selector = src->rate_selector;
- dst->rate = src->rate;
- dst->packet_life_time = src->packet_life_time;
- dst->preference = src->preference;
- dst->packet_life_time_selector = src->packet_life_time_selector;
-
- /* TODO: No need to set this */
- sa_path_set_dmac_zero(dst);
-}
-EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index 999da9c79866..381aa5797641 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -107,6 +107,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
return ret;
uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF);
+ if (IS_ERR(uattr))
+ return PTR_ERR(uattr);
read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64);
read_attr.counters_buff = uverbs_zalloc(
attrs, array_size(read_attr.ncounters, sizeof(u64)));
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 370ad7c83f88..fab5d914029d 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
struct ib_ucq_object *obj = container_of(
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
typeof(*obj), uevent.uobject);
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_device *ib_dev = attrs->context->device;
- int ret;
- u64 user_handle;
+ struct ib_umem_dmabuf *umem_dmabuf;
struct ib_cq_init_attr attr = {};
- struct ib_cq *cq;
- struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_uobject *ev_file_uobj;
+ struct ib_umem *umem = NULL;
+ u64 buffer_length;
+ u64 buffer_offset;
+ struct ib_cq *cq;
+ u64 user_handle;
+ u64 buffer_va;
+ int buffer_fd;
+ int ret;
- if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
+ if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->uevent.event_list);
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) {
+
+ ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem)) {
+ ret = PTR_ERR(umem);
+ goto err_event_file;
+ }
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) {
+
+ ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length,
+ buffer_fd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem_dmabuf)) {
+ ret = PTR_ERR(umem_dmabuf);
+ goto err_event_file;
+ }
+ umem = &umem_dmabuf->umem;
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) ||
+ !ib_dev->ops.create_cq) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
if (!cq) {
ret = -ENOMEM;
+ ib_umem_release(umem);
goto err_event_file;
}
@@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) :
+ ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
@@ -142,6 +206,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
return ret;
err_free:
+ ib_umem_release(umem);
rdma_restrack_put(&cq->res);
kfree(cq);
err_event_file:
@@ -180,6 +245,17 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_OBJECT_ASYNC_EVENT,
UVERBS_ACCESS_READ,
UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
UVERBS_ATTR_UHW());
static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
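The new create-CQ attributes split into two mutually exclusive groups: (BUFFER_VA, BUFFER_LENGTH) builds a plain umem via ib_umem_get(), while (BUFFER_FD, BUFFER_OFFSET, BUFFER_LENGTH) pins a dmabuf; either buffered path requires the driver to provide create_cq_umem, and with no buffer attributes the classic create_cq path is used. A condensed restatement of that validation, as a sketch rather than the handler itself:

/* Sketch of the attribute matrix enforced by the handler above. */
static bool cq_buffer_attrs_valid(bool has_va, bool has_fd,
                                  bool has_offset, bool has_len,
                                  bool drv_has_create_cq,
                                  bool drv_has_create_cq_umem)
{
        if (has_va)     /* user VA path: umem via ib_umem_get() */
                return has_len && !has_fd && !has_offset &&
                       drv_has_create_cq_umem;
        if (has_fd)     /* dmabuf path: ib_umem_dmabuf_get_pinned() */
                return has_len && has_offset &&
                       drv_has_create_cq_umem;
        /* no buffer: classic create_cq(); stray attrs are rejected */
        return !has_offset && !has_len && drv_has_create_cq;
}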
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index 049684880ae0..c0fd283d9d6c 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -203,6 +203,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
resp.port_cap_flags2 = attr.port_cap_flags2;
+ resp.active_speed_ex = attr.active_speed;
return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
&resp, sizeof(resp));
@@ -436,6 +437,10 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
UVERBS_ATTR_TYPE(u64), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ UVERBS_ATTR_MIN_SIZE(sizeof(int)),
+ UA_OPTIONAL,
+ UA_ALLOC_AND_COPY),
UVERBS_ATTR_UHW());
DECLARE_UVERBS_NAMED_METHOD(
@@ -461,7 +466,7 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_PTR_OUT(
UVERBS_ATTR_QUERY_PORT_RESP,
UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
- reserved),
+ active_speed_ex),
UA_MANDATORY));
DECLARE_UVERBS_NAMED_METHOD(
diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c
new file mode 100644
index 000000000000..453ce656c6f2
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dmah.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+#include "restrack.h"
+
+static int uverbs_free_dmah(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dmah *dmah = uobject->object;
+ int ret;
+
+ if (atomic_read(&dmah->usecnt))
+ return -EBUSY;
+
+ ret = dmah->device->ops.dealloc_dmah(dmah, attrs);
+ if (ret)
+ return ret;
+
+ rdma_restrack_del(&dmah->res);
+ kfree(dmah);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE)
+ ->obj_attr.uobject;
+ struct ib_device *ib_dev = attrs->context->device;
+ struct ib_dmah *dmah;
+ int ret;
+
+ dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah);
+ if (!dmah)
+ return -ENOMEM;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) {
+ ret = uverbs_copy_from(&dmah->cpu_id, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_CPU_ID);
+ if (ret)
+ goto err;
+
+ if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) {
+ dmah->mem_type = uverbs_attr_get_enum_id(attrs,
+ UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE);
+ dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) {
+ ret = uverbs_copy_from(&dmah->ph, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_PH);
+ if (ret)
+ goto err;
+
+ /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */
+ if (dmah->ph & 0xFC) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS);
+ }
+
+ dmah->device = ib_dev;
+ dmah->uobject = uobj;
+ atomic_set(&dmah->usecnt, 0);
+
+ rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH);
+ rdma_restrack_set_name(&dmah->res, NULL);
+
+ ret = ib_dev->ops.alloc_dmah(dmah, attrs);
+ if (ret) {
+ rdma_restrack_put(&dmah->res);
+ goto err;
+ }
+
+ uobj->object = dmah;
+ rdma_restrack_add(&dmah->res);
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE);
+ return 0;
+err:
+ kfree(dmah);
+ return ret;
+}
+
+static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = {
+ [TPH_MEM_TYPE_VM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+ [TPH_MEM_TYPE_PM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DMAH_ALLOC,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE,
+ uverbs_dmah_mem_type,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH,
+ UVERBS_ATTR_TYPE(u8),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_DMAH_FREE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE));
+
+const struct uapi_definition uverbs_def_obj_dmah[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)),
+ {}
+};
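In the DMAH allocation handler above, only the two low-order bits of the PH attribute carry a PCIe TPH processing hint (values 0-3), which is why anything with bits set in 0xFC is rejected; the CPU_ID attribute is likewise accepted only when the CPU is in the caller's allowed mask. A sketch of the PH rule (the macro name is hypothetical, not from the patch):

/* Sketch: validate a TPH processing hint as the handler above does.
 * Per PCIe, PH is a 2-bit field; the upper bits must be zero. */
#define DMAH_PH_MASK 0x3        /* hypothetical name for illustration */

static int dmah_validate_ph_sketch(u8 ph)
{
        if (ph & ~DMAH_PH_MASK) /* same test as (ph & 0xFC) */
                return -EINVAL;
        return 0;               /* 0..3 select the steering behavior */
}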
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index d42ed7ff223e..0ddcf6da66c4 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -46,385 +46,6 @@ static int uverbs_free_flow_action(struct ib_uobject *uobject,
return action->device->ops.destroy_flow_action(action);
}
-static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs,
- u32 flags, bool is_modify)
-{
- u64 verbs_flags = flags;
-
- if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN))
- verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED;
-
- if (is_modify && uverbs_attr_is_valid(attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS))
- verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS;
-
- return verbs_flags;
-};
-
-static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat)
-{
- struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm =
- &keymat->keymat.aes_gcm;
-
- if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
- return -EOPNOTSUPP;
-
- if (aes_gcm->key_len != 32 &&
- aes_gcm->key_len != 24 &&
- aes_gcm->key_len != 16)
- return -EINVAL;
-
- if (aes_gcm->icv_len != 16 &&
- aes_gcm->icv_len != 8 &&
- aes_gcm->icv_len != 12)
- return -EINVAL;
-
- return 0;
-}
-
-static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = {
- [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm,
-};
-
-static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay,
- bool is_modify)
-{
- /* This is used in order to modify an esp flow action with an enabled
- * replay protection to a disabled one. This is only supported via
- * modify, as in create verb we can simply drop the REPLAY attribute and
- * achieve the same thing.
- */
- return is_modify ? 0 : -EINVAL;
-}
-
-static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay,
- bool is_modify)
-{
- /* Some replay protections could always be enabled without validating
- * anything.
- */
- return 0;
-}
-
-static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay,
- bool is_modify) = {
- [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none,
- [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok,
-};
-
-static int parse_esp_ip(enum ib_flow_spec_type proto,
- const void __user *val_ptr,
- size_t len, union ib_flow_spec *out)
-{
- int ret;
- const struct ib_uverbs_flow_ipv4_filter ipv4 = {
- .src_ip = cpu_to_be32(0xffffffffUL),
- .dst_ip = cpu_to_be32(0xffffffffUL),
- .proto = 0xff,
- .tos = 0xff,
- .ttl = 0xff,
- .flags = 0xff,
- };
- const struct ib_uverbs_flow_ipv6_filter ipv6 = {
- .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
- .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
- .flow_label = cpu_to_be32(0xffffffffUL),
- .next_hdr = 0xff,
- .traffic_class = 0xff,
- .hop_limit = 0xff,
- };
- union {
- struct ib_uverbs_flow_ipv4_filter ipv4;
- struct ib_uverbs_flow_ipv6_filter ipv6;
- } user_val = {};
- const void *user_pmask;
- size_t val_len;
-
- /* If the flow IPv4/IPv6 flow specifications are extended, the mask
- * should be changed as well.
- */
- BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) +
- sizeof(ipv4.flags) != sizeof(ipv4));
- BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) +
- sizeof(ipv6.reserved) != sizeof(ipv6));
-
- switch (proto) {
- case IB_FLOW_SPEC_IPV4:
- if (len > sizeof(user_val.ipv4) &&
- !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4),
- len - sizeof(user_val.ipv4)))
- return -EOPNOTSUPP;
-
- val_len = min_t(size_t, len, sizeof(user_val.ipv4));
- ret = copy_from_user(&user_val.ipv4, val_ptr,
- val_len);
- if (ret)
- return -EFAULT;
-
- user_pmask = &ipv4;
- break;
- case IB_FLOW_SPEC_IPV6:
- if (len > sizeof(user_val.ipv6) &&
- !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6),
- len - sizeof(user_val.ipv6)))
- return -EOPNOTSUPP;
-
- val_len = min_t(size_t, len, sizeof(user_val.ipv6));
- ret = copy_from_user(&user_val.ipv6, val_ptr,
- val_len);
- if (ret)
- return -EFAULT;
-
- user_pmask = &ipv6;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask,
- &user_val,
- val_len, out);
-}
-
-static int flow_action_esp_get_encap(struct ib_flow_spec_list *out,
- struct uverbs_attr_bundle *attrs)
-{
- struct ib_uverbs_flow_action_esp_encap uverbs_encap;
- int ret;
-
- ret = uverbs_copy_from(&uverbs_encap, attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP);
- if (ret)
- return ret;
-
- /* We currently support only one encap */
- if (uverbs_encap.next_ptr)
- return -EOPNOTSUPP;
-
- if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 &&
- uverbs_encap.type != IB_FLOW_SPEC_IPV6)
- return -EOPNOTSUPP;
-
- return parse_esp_ip(uverbs_encap.type,
- u64_to_user_ptr(uverbs_encap.val_ptr),
- uverbs_encap.len,
- &out->spec);
-}
-
-struct ib_flow_action_esp_attr {
- struct ib_flow_action_attrs_esp hdr;
- struct ib_flow_action_attrs_esp_keymats keymat;
- struct ib_flow_action_attrs_esp_replays replay;
- /* We currently support only one spec */
- struct ib_flow_spec_list encap;
-};
-
-#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW
-static int parse_flow_action_esp(struct ib_device *ib_dev,
- struct uverbs_attr_bundle *attrs,
- struct ib_flow_action_esp_attr *esp_attr,
- bool is_modify)
-{
- struct ib_uverbs_flow_action_esp uverbs_esp = {};
- int ret;
-
- /* Optional param, if it doesn't exist, we get -ENOENT and skip it */
- ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_ESN);
- if (IS_UVERBS_COPY_ERR(ret))
- return ret;
-
- /* This can be called from FLOW_ACTION_ESP_MODIFY where
- * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional
- */
- if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) {
- ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS);
- if (ret)
- return ret;
-
- if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1))
- return -EOPNOTSUPP;
-
- esp_attr->hdr.spi = uverbs_esp.spi;
- esp_attr->hdr.seq = uverbs_esp.seq;
- esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad;
- esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts;
- }
- esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags,
- is_modify);
-
- if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) {
- esp_attr->keymat.protocol =
- uverbs_attr_get_enum_id(attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT);
- ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat,
- attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT);
- if (ret)
- return ret;
-
- ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat);
- if (ret)
- return ret;
-
- esp_attr->hdr.keymat = &esp_attr->keymat;
- }
-
- if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) {
- esp_attr->replay.protocol =
- uverbs_attr_get_enum_id(attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY);
-
- ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay,
- attrs,
- UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY);
- if (ret)
- return ret;
-
- ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay,
- is_modify);
- if (ret)
- return ret;
-
- esp_attr->hdr.replay = &esp_attr->replay;
- }
-
- if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) {
- ret = flow_action_esp_get_encap(&esp_attr->encap, attrs);
- if (ret)
- return ret;
-
- esp_attr->hdr.encap = &esp_attr->encap;
- }
-
- return 0;
-}
-
-static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
- struct uverbs_attr_bundle *attrs)
-{
- struct ib_uobject *uobj = uverbs_attr_get_uobject(
- attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
- struct ib_device *ib_dev = attrs->context->device;
- int ret;
- struct ib_flow_action *action;
- struct ib_flow_action_esp_attr esp_attr = {};
-
- if (!ib_dev->ops.create_flow_action_esp)
- return -EOPNOTSUPP;
-
- ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false);
- if (ret)
- return ret;
-
- /* No need to check as this attribute is marked as MANDATORY */
- action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr,
- attrs);
- if (IS_ERR(action))
- return PTR_ERR(action);
-
- uverbs_flow_action_fill_action(action, uobj, ib_dev,
- IB_FLOW_ACTION_ESP);
-
- return 0;
-}
-
-static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(
- struct uverbs_attr_bundle *attrs)
-{
- struct ib_uobject *uobj = uverbs_attr_get_uobject(
- attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE);
- struct ib_flow_action *action = uobj->object;
- int ret;
- struct ib_flow_action_esp_attr esp_attr = {};
-
- if (!action->device->ops.modify_flow_action_esp)
- return -EOPNOTSUPP;
-
- ret = parse_flow_action_esp(action->device, attrs, &esp_attr, true);
- if (ret)
- return ret;
-
- if (action->type != IB_FLOW_ACTION_ESP)
- return -EINVAL;
-
- return action->device->ops.modify_flow_action_esp(action,
- &esp_attr.hdr,
- attrs);
-}
-
-static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = {
- [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = {
- .type = UVERBS_ATTR_TYPE_PTR_IN,
- UVERBS_ATTR_STRUCT(
- struct ib_uverbs_flow_action_esp_keymat_aes_gcm,
- aes_key),
- },
-};
-
-static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = {
- [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = {
- .type = UVERBS_ATTR_TYPE_PTR_IN,
- UVERBS_ATTR_NO_DATA(),
- },
- [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = {
- .type = UVERBS_ATTR_TYPE_PTR_IN,
- UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp,
- size),
- },
-};
-
-DECLARE_UVERBS_NAMED_METHOD(
- UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
- UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE,
- UVERBS_OBJECT_FLOW_ACTION,
- UVERBS_ACCESS_NEW,
- UA_MANDATORY),
- UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS,
- UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp,
- hard_limit_pkts),
- UA_MANDATORY),
- UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN,
- UVERBS_ATTR_TYPE(__u32),
- UA_OPTIONAL),
- UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT,
- uverbs_flow_action_esp_keymat,
- UA_MANDATORY),
- UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY,
- uverbs_flow_action_esp_replay,
- UA_OPTIONAL),
- UVERBS_ATTR_PTR_IN(
- UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP,
- UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap),
- UA_OPTIONAL));
-
-DECLARE_UVERBS_NAMED_METHOD(
- UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY,
- UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE,
- UVERBS_OBJECT_FLOW_ACTION,
- UVERBS_ACCESS_WRITE,
- UA_MANDATORY),
- UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS,
- UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp,
- hard_limit_pkts),
- UA_OPTIONAL),
- UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN,
- UVERBS_ATTR_TYPE(__u32),
- UA_OPTIONAL),
- UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT,
- uverbs_flow_action_esp_keymat,
- UA_OPTIONAL),
- UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY,
- uverbs_flow_action_esp_replay,
- UA_OPTIONAL),
- UVERBS_ATTR_PTR_IN(
- UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP,
- UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap),
- UA_OPTIONAL));
-
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
UVERBS_METHOD_FLOW_ACTION_DESTROY,
UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE,
@@ -435,9 +56,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY(
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_FLOW_ACTION,
UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action),
- &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE),
- &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY),
- &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY));
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY));
const struct uapi_definition uverbs_def_obj_flow_action[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index f782d5e1aa25..570b9656801d 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -238,8 +238,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
- access_flags,
- &attrs->driver_udata);
+ access_flags, NULL,
+ attrs);
if (IS_ERR(mr))
return PTR_ERR(mr);
@@ -249,6 +249,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
mr->uobject = uobj;
atomic_inc(&pd->usecnt);
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
uobj->object = mr;
uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
@@ -263,6 +266,135 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
}
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE);
+ u32 valid_access_flags = IB_ACCESS_SUPPORTED;
+ u64 length, iova, fd_offset = 0, addr = 0;
+ struct ib_device *ib_dev = pd->device;
+ struct ib_dmah *dmah = NULL;
+ bool has_fd_offset = false;
+ bool has_addr = false;
+ bool has_fd = false;
+ u32 access_flags;
+ struct ib_mr *mr;
+ int fd;
+ int ret;
+
+ ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) {
+ ret = uverbs_copy_from(&addr, attrs,
+ UVERBS_ATTR_REG_MR_ADDR);
+ if (ret)
+ return ret;
+ has_addr = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) {
+ ret = uverbs_copy_from(&fd_offset, attrs,
+ UVERBS_ATTR_REG_MR_FD_OFFSET);
+ if (ret)
+ return ret;
+ has_fd_offset = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) {
+ ret = uverbs_get_raw_fd(&fd, attrs,
+ UVERBS_ATTR_REG_MR_FD);
+ if (ret)
+ return ret;
+ has_fd = true;
+ }
+
+ if (has_fd) {
+ if (!ib_dev->ops.reg_user_mr_dmabuf)
+ return -EOPNOTSUPP;
+
+ /* FD requires offset and can't come with addr */
+ if (!has_fd_offset || has_addr)
+ return -EINVAL;
+
+ if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+
+ valid_access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_RELAXED_ORDERING;
+ } else {
+ if (!has_addr || has_fd_offset)
+ return -EINVAL;
+
+ if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) {
+ dmah = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_REG_MR_DMA_HANDLE);
+ if (IS_ERR(dmah))
+ return PTR_ERR(dmah);
+ }
+
+ ret = uverbs_get_flags32(&access_flags, attrs,
+ UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ valid_access_flags);
+ if (ret)
+ return ret;
+
+ ret = ib_check_mr_access(ib_dev, access_flags);
+ if (ret)
+ return ret;
+
+ if (has_fd)
+ mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length,
+ iova, fd, access_flags,
+ dmah, attrs);
+ else
+ mr = pd->device->ops.reg_user_mr(pd, addr, length, iova,
+ access_flags, dmah, NULL);
+
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ if (dmah) {
+ mr->dmah = dmah;
+ atomic_inc(&dmah->usecnt);
+ }
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY,
+ &mr->lkey, sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_ADVISE_MR,
UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -359,6 +491,44 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_REG_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ enum ib_access_flags,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
UVERBS_METHOD_MR_DESTROY,
UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
@@ -373,7 +543,8 @@ DECLARE_UVERBS_NAMED_OBJECT(
&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
- &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
+ &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_REG_MR));
const struct uapi_definition uverbs_def_obj_mr[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
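Both REG_MR paths above insist that the buffer start (addr or fd_offset) and the requested iova agree modulo PAGE_SIZE: the HCA maps whole pages, so only the page-aligned portion of a mapping can be relocated. A sketch of the congruence rule:

/* Sketch of the offset check above: with 4 KiB pages, a buffer at
 * ...0x1234 can only be exposed at an iova whose low bits are 0x234. */
static bool mr_offsets_congruent(u64 start, u64 iova)
{
        return (start & ~PAGE_MASK) == (iova & ~PAGE_MASK);
}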
diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c
index c00cfb5ed387..be0730e8509e 100644
--- a/drivers/infiniband/core/uverbs_std_types_qp.c
+++ b/drivers/infiniband/core/uverbs_std_types_qp.c
@@ -133,7 +133,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
device = xrcd->device;
break;
case IB_UVERBS_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
fallthrough;
case IB_UVERBS_QPT_RC:
@@ -163,7 +163,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
UVERBS_ATTR_CREATE_QP_SRQ_HANDLE))
return -EINVAL;
- /* send_cq is optinal */
+ /* send_cq is optional */
if (cap.max_send_wr) {
send_cq = uverbs_attr_get_obj(attrs,
UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE);
@@ -248,44 +248,23 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
set_caps(&attr, &cap, true);
mutex_init(&obj->mcast_lock);
- if (attr.qp_type == IB_QPT_XRC_TGT)
- qp = ib_create_qp(pd, &attr);
- else
- qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, obj,
- NULL);
-
+ qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj,
+ KBUILD_MODNAME);
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
goto err_put;
}
+ ib_qp_usecnt_inc(qp);
- if (attr.qp_type != IB_QPT_XRC_TGT) {
- atomic_inc(&pd->usecnt);
- if (attr.send_cq)
- atomic_inc(&attr.send_cq->usecnt);
- if (attr.recv_cq)
- atomic_inc(&attr.recv_cq->usecnt);
- if (attr.srq)
- atomic_inc(&attr.srq->usecnt);
- if (attr.rwq_ind_tbl)
- atomic_inc(&attr.rwq_ind_tbl->usecnt);
- } else {
+ if (attr.qp_type == IB_QPT_XRC_TGT) {
obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
uobject);
atomic_inc(&obj->uxrcd->refcnt);
- /* It is done in _ib_create_qp for other QP types */
- qp->uobject = obj;
}
obj->uevent.uobject.object = qp;
uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE);
- if (attr.qp_type != IB_QPT_XRC_TGT) {
- ret = ib_create_qp_security(qp, device);
- if (ret)
- return ret;
- }
-
set_caps(&attr, &cap, false);
ret = uverbs_copy_to_struct_or_zero(attrs,
UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap,
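The QP hunk replaces the open-coded ib_create_qp()/_ib_create_qp() split and the per-object refcount bumps with ib_create_qp_user() plus ib_qp_usecnt_inc(). The deleted branch shows what the helper must cover; roughly (a sketch of the expected behavior, not the core/verbs.c implementation):

/* Roughly what ib_qp_usecnt_inc() takes over from the deleted
 * branch above; a sketch under that assumption. */
static void qp_usecnt_inc_sketch(struct ib_qp *qp)
{
        if (qp->pd)
                atomic_inc(&qp->pd->usecnt);
        if (qp->send_cq)
                atomic_inc(&qp->send_cq->usecnt);
        if (qp->recv_cq)
                atomic_inc(&qp->recv_cq->usecnt);
        if (qp->srq)
                atomic_inc(&qp->srq->usecnt);
        if (qp->rwq_ind_tbl)
                atomic_inc(&qp->rwq_ind_tbl->usecnt);
}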
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 2f2c7646fce1..e00ea63175bd 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -447,6 +447,9 @@ static int uapi_finalize(struct uverbs_api *uapi)
uapi->num_write_ex = max_write_ex + 1;
data = kmalloc_array(uapi->num_write + uapi->num_write_ex,
sizeof(*uapi->write_methods), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++)
data[i] = &uapi->notsupp_method;
uapi->write_methods = data;
@@ -628,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+ UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),
UAPI_DEF_CHAIN(uverbs_def_obj_mr),
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 7036967e4c0b..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -147,6 +147,8 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_50_GBPS: return 20;
case IB_RATE_400_GBPS: return 160;
case IB_RATE_600_GBPS: return 240;
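+	/* the multiplier is rate / 2.5 Gb/s, the 1X SDR base rate */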
+ case IB_RATE_800_GBPS: return 320;
+ case IB_RATE_1600_GBPS: return 640;
default: return -1;
}
}
@@ -176,6 +178,8 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 20: return IB_RATE_50_GBPS;
case 160: return IB_RATE_400_GBPS;
case 240: return IB_RATE_600_GBPS;
+ case 320: return IB_RATE_800_GBPS;
+ case 640: return IB_RATE_1600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -205,6 +209,8 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_50_GBPS: return 53125;
case IB_RATE_400_GBPS: return 425000;
case IB_RATE_600_GBPS: return 637500;
+ case IB_RATE_800_GBPS: return 850000;
+ case IB_RATE_1600_GBPS: return 1700000;
default: return -1;
}
}
@@ -268,9 +274,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
return ERR_PTR(-ENOMEM);
pd->device = device;
- pd->uobject = NULL;
- pd->__internal_mr = NULL;
- atomic_set(&pd->usecnt, 0);
pd->flags = flags;
rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD);
@@ -284,7 +287,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
}
rdma_restrack_add(&pd->res);
- if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)
pd->local_dma_lkey = device->local_dma_lkey;
else
mr_access_flags |= IB_ACCESS_LOCAL_WRITE;
@@ -311,7 +314,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
pd->__internal_mr = mr;
- if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))
+ if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY))
pd->local_dma_lkey = pd->__internal_mr->lkey;
if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
@@ -341,11 +344,6 @@ int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
pd->__internal_mr = NULL;
}
- /* uverbs manipulates usecnt with proper locking, while the kabi
- * requires the caller to guarantee we can't race here.
- */
- WARN_ON(atomic_read(&pd->usecnt));
-
ret = pd->device->ops.dealloc_pd(pd, udata);
if (ret)
return ret;
@@ -374,7 +372,7 @@ void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
EXPORT_SYMBOL(rdma_copy_ah_attr);
/**
- * rdma_replace_ah_attr - Replace valid ah_attr with new new one.
+ * rdma_replace_ah_attr - Replace valid ah_attr with new one.
* @old: Pointer to existing ah_attr which needs to be replaced.
* old is assumed to be valid or zero'd
* @new: Pointer to the new ah_attr.
@@ -540,6 +538,8 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
else
ret = device->ops.create_ah(ah, &init_attr, NULL);
if (ret) {
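+		/* drop the reference held in ah->sgid_attr before freeing */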
+ if (ah->sgid_attr)
+ rdma_put_gid_attr(ah->sgid_attr);
kfree(ah);
return ERR_PTR(ret);
}
@@ -575,7 +575,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
- return (void *)slave;
+ return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);
@@ -750,7 +750,7 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
/* Resolve destination MAC address and hop limit for unicast destination
* GID entry, considering the source GID entry as well.
- * ah_attribute must have have valid port_num, sgid_index.
+ * ah_attribute must have valid port_num, sgid_index.
*/
static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
struct rdma_ah_attr *ah_attr)
@@ -1035,7 +1035,8 @@ struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
}
if (srq->srq_type == IB_SRQT_XRC) {
srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
- atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ if (srq->ext.xrc.xrcd)
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
}
atomic_inc(&pd->usecnt);
@@ -1045,8 +1046,8 @@ struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
ret = pd->device->ops.create_srq(srq, srq_init_attr, udata);
if (ret) {
rdma_restrack_put(&srq->res);
- atomic_dec(&srq->pd->usecnt);
- if (srq->srq_type == IB_SRQT_XRC)
+ atomic_dec(&pd->usecnt);
+ if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
atomic_dec(&srq->ext.xrc.xrcd->usecnt);
if (ib_srq_has_cq(srq->srq_type))
atomic_dec(&srq->ext.cq->usecnt);
@@ -1090,7 +1091,7 @@ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
return ret;
atomic_dec(&srq->pd->usecnt);
- if (srq->srq_type == IB_SRQT_XRC)
+ if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
atomic_dec(&srq->ext.xrc.xrcd->usecnt);
if (ib_srq_has_cq(srq->srq_type))
atomic_dec(&srq->ext.cq->usecnt);
@@ -1103,6 +1104,16 @@ EXPORT_SYMBOL(ib_destroy_srq_user);
/* Queue pairs */
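+/*
+ * Dispatch a QP event to the consumer: complete srq_completion on the
+ * Last WQE Reached event (so __ib_drain_srq() can wait for it), then
+ * forward the event to the handler registered at QP creation time.
+ */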
+static void __ib_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = event->element.qp;
+
+ if (event->event == IB_EVENT_QP_LAST_WQE_REACHED)
+ complete(&qp->srq_completion);
+ if (qp->registered_event_handler)
+ qp->registered_event_handler(event, qp->qp_context);
+}
+
static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
{
struct ib_qp *qp = context;
@@ -1199,34 +1210,147 @@ static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
return qp;
}
+static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller)
+{
+ struct ib_udata dummy = {};
+ struct ib_qp *qp;
+ int ret;
+
+ if (!dev->ops.create_qp)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ qp = rdma_zalloc_drv_obj_numa(dev, ib_qp);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->device = dev;
+ qp->pd = pd;
+ qp->uobject = uobj;
+ qp->real_qp = qp;
+
+ qp->qp_type = attr->qp_type;
+ qp->rwq_ind_tbl = attr->rwq_ind_tbl;
+ qp->srq = attr->srq;
+ qp->event_handler = __ib_qp_event_handler;
+ qp->registered_event_handler = attr->event_handler;
+ qp->port = attr->port_num;
+ qp->qp_context = attr->qp_context;
+
+ spin_lock_init(&qp->mr_lock);
+ INIT_LIST_HEAD(&qp->rdma_mrs);
+ INIT_LIST_HEAD(&qp->sig_mrs);
+ init_completion(&qp->srq_completion);
+
+ qp->send_cq = attr->send_cq;
+ qp->recv_cq = attr->recv_cq;
+
+ rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP);
+ WARN_ONCE(!udata && !caller, "Missing kernel QP owner");
+ rdma_restrack_set_name(&qp->res, udata ? NULL : caller);
+ ret = dev->ops.create_qp(qp, attr, udata);
+ if (ret)
+ goto err_create;
+
+ /*
+	 * TODO: The mlx4 driver internally overwrites send_cq and recv_cq.
+ * Unfortunately, it is not an easy task to fix that driver.
+ */
+ qp->send_cq = attr->send_cq;
+ qp->recv_cq = attr->recv_cq;
+
+ ret = ib_create_qp_security(qp, dev);
+ if (ret)
+ goto err_security;
+
+ rdma_restrack_add(&qp->res);
+ return qp;
+
+err_security:
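+	/* a dummy udata marks this as a user-QP teardown for the driver */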
+ qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL);
+err_create:
+ rdma_restrack_put(&qp->res);
+ kfree(qp);
+ return ERR_PTR(ret);
+}
+
/**
- * ib_create_named_qp - Creates a kernel QP associated with the specified protection
+ * ib_create_qp_user - Creates a QP associated with the specified protection
* domain.
+ * @dev: IB device
* @pd: The protection domain associated with the QP.
- * @qp_init_attr: A list of initial attributes required to create the
+ * @attr: A list of initial attributes required to create the
* QP. If QP creation succeeds, then the attributes are updated to
* the actual capabilities of the created QP.
+ * @udata: User data
+ * @uobj: uverbs object
* @caller: caller's build-time module name
- *
- * NOTE: for user qp use ib_create_qp_user with valid udata!
*/
-struct ib_qp *ib_create_named_qp(struct ib_pd *pd,
- struct ib_qp_init_attr *qp_init_attr,
- const char *caller)
+struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller)
{
- struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
- struct ib_qp *qp;
- int ret;
+ struct ib_qp *qp, *xrc_qp;
- if (qp_init_attr->rwq_ind_tbl &&
- (qp_init_attr->recv_cq ||
- qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
- qp_init_attr->cap.max_recv_sge))
- return ERR_PTR(-EINVAL);
+ if (attr->qp_type == IB_QPT_XRC_TGT)
+ qp = create_qp(dev, pd, attr, NULL, NULL, caller);
+ else
+ qp = create_qp(dev, pd, attr, udata, uobj, NULL);
+ if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp))
+ return qp;
- if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
- !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
- return ERR_PTR(-EINVAL);
+ xrc_qp = create_xrc_qp_user(qp, attr);
+ if (IS_ERR(xrc_qp)) {
+ ib_destroy_qp(qp);
+ return xrc_qp;
+ }
+
+ xrc_qp->uobject = uobj;
+ return xrc_qp;
+}
+EXPORT_SYMBOL(ib_create_qp_user);
+
+void ib_qp_usecnt_inc(struct ib_qp *qp)
+{
+ if (qp->pd)
+ atomic_inc(&qp->pd->usecnt);
+ if (qp->send_cq)
+ atomic_inc(&qp->send_cq->usecnt);
+ if (qp->recv_cq)
+ atomic_inc(&qp->recv_cq->usecnt);
+ if (qp->srq)
+ atomic_inc(&qp->srq->usecnt);
+ if (qp->rwq_ind_tbl)
+ atomic_inc(&qp->rwq_ind_tbl->usecnt);
+}
+EXPORT_SYMBOL(ib_qp_usecnt_inc);
+
+void ib_qp_usecnt_dec(struct ib_qp *qp)
+{
+ if (qp->rwq_ind_tbl)
+ atomic_dec(&qp->rwq_ind_tbl->usecnt);
+ if (qp->srq)
+ atomic_dec(&qp->srq->usecnt);
+ if (qp->recv_cq)
+ atomic_dec(&qp->recv_cq->usecnt);
+ if (qp->send_cq)
+ atomic_dec(&qp->send_cq->usecnt);
+ if (qp->pd)
+ atomic_dec(&qp->pd->usecnt);
+}
+EXPORT_SYMBOL(ib_qp_usecnt_dec);
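+
+/*
+ * Typical caller pattern, a sketch of what the uverbs QP_CREATE handler
+ * above does: take the references once the QP exists, and rely on
+ * ib_destroy_qp_user() to drop them again via ib_qp_usecnt_dec():
+ *
+ *	qp = ib_create_qp_user(device, pd, &attr, udata, uobj, caller);
+ *	if (IS_ERR(qp))
+ *		return PTR_ERR(qp);
+ *	ib_qp_usecnt_inc(qp);
+ */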
+
+struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ const char *caller)
+{
+ struct ib_device *device = pd->device;
+ struct ib_qp *qp;
+ int ret;
/*
* If the caller is using the RDMA API, calculate the resources
@@ -1237,47 +1361,11 @@ struct ib_qp *ib_create_named_qp(struct ib_pd *pd,
if (qp_init_attr->cap.max_rdma_ctxs)
rdma_rw_init_qp(device, qp_init_attr);
- qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL, caller);
+ qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller);
if (IS_ERR(qp))
return qp;
- ret = ib_create_qp_security(qp, device);
- if (ret)
- goto err;
-
- if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
- struct ib_qp *xrc_qp =
- create_xrc_qp_user(qp, qp_init_attr);
-
- if (IS_ERR(xrc_qp)) {
- ret = PTR_ERR(xrc_qp);
- goto err;
- }
- return xrc_qp;
- }
-
- qp->event_handler = qp_init_attr->event_handler;
- qp->qp_context = qp_init_attr->qp_context;
- if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
- qp->recv_cq = NULL;
- qp->srq = NULL;
- } else {
- qp->recv_cq = qp_init_attr->recv_cq;
- if (qp_init_attr->recv_cq)
- atomic_inc(&qp_init_attr->recv_cq->usecnt);
- qp->srq = qp_init_attr->srq;
- if (qp->srq)
- atomic_inc(&qp_init_attr->srq->usecnt);
- }
-
- qp->send_cq = qp_init_attr->send_cq;
- qp->xrcd = NULL;
-
- atomic_inc(&pd->usecnt);
- if (qp_init_attr->send_cq)
- atomic_inc(&qp_init_attr->send_cq->usecnt);
- if (qp_init_attr->rwq_ind_tbl)
- atomic_inc(&qp->rwq_ind_tbl->usecnt);
+ ib_qp_usecnt_inc(qp);
if (qp_init_attr->cap.max_rdma_ctxs) {
ret = rdma_rw_init_mrs(qp, qp_init_attr);
@@ -1303,7 +1391,7 @@ err:
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ib_create_named_qp);
+EXPORT_SYMBOL(ib_create_qp_kernel);
static const struct {
int valid;
@@ -1810,12 +1898,95 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
}
EXPORT_SYMBOL(ib_modify_qp_with_udata);
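+
+/*
+ * Derive an IB (width, speed) pair from an Ethernet link. If the lane
+ * count is unknown (lanes == 0), fall back to a plausible combination
+ * for the aggregate speed; otherwise take the width from the lane count
+ * and the speed from the per-lane rate.
+ */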
+static void ib_get_width_and_speed(u32 netdev_speed, u32 lanes,
+ u16 *speed, u8 *width)
+{
+ if (!lanes) {
+ if (netdev_speed <= SPEED_1000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_SDR;
+ } else if (netdev_speed <= SPEED_10000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_FDR10;
+ } else if (netdev_speed <= SPEED_20000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_DDR;
+ } else if (netdev_speed <= SPEED_25000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_EDR;
+ } else if (netdev_speed <= SPEED_40000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_FDR10;
+ } else if (netdev_speed <= SPEED_50000) {
+ *width = IB_WIDTH_2X;
+ *speed = IB_SPEED_EDR;
+ } else if (netdev_speed <= SPEED_100000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_EDR;
+ } else if (netdev_speed <= SPEED_200000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_HDR;
+ } else {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_NDR;
+ }
+
+ return;
+ }
+
+ switch (lanes) {
+ case 1:
+ *width = IB_WIDTH_1X;
+ break;
+ case 2:
+ *width = IB_WIDTH_2X;
+ break;
+ case 4:
+ *width = IB_WIDTH_4X;
+ break;
+ case 8:
+ *width = IB_WIDTH_8X;
+ break;
+ case 12:
+ *width = IB_WIDTH_12X;
+ break;
+ default:
+ *width = IB_WIDTH_1X;
+ }
+
+ switch (netdev_speed / lanes) {
+ case SPEED_2500:
+ *speed = IB_SPEED_SDR;
+ break;
+ case SPEED_5000:
+ *speed = IB_SPEED_DDR;
+ break;
+ case SPEED_10000:
+ *speed = IB_SPEED_FDR10;
+ break;
+ case SPEED_14000:
+ *speed = IB_SPEED_FDR;
+ break;
+ case SPEED_25000:
+ *speed = IB_SPEED_EDR;
+ break;
+ case SPEED_50000:
+ *speed = IB_SPEED_HDR;
+ break;
+ case SPEED_100000:
+ *speed = IB_SPEED_NDR;
+ break;
+ default:
+ *speed = IB_SPEED_SDR;
+ }
+}
+
int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width)
{
int rc;
u32 netdev_speed;
struct net_device *netdev;
- struct ethtool_link_ksettings lksettings;
+ struct ethtool_link_ksettings lksettings = {};
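+	/* zero-init: lksettings.lanes stays 0 if the ethtool query fails */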
if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
return -EINVAL;
@@ -1834,29 +2005,13 @@ int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width)
netdev_speed = lksettings.base.speed;
} else {
netdev_speed = SPEED_1000;
- pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name,
- netdev_speed);
+ if (rc)
+ pr_warn("%s speed is unknown, defaulting to %u\n",
+ netdev->name, netdev_speed);
}
- if (netdev_speed <= SPEED_1000) {
- *width = IB_WIDTH_1X;
- *speed = IB_SPEED_SDR;
- } else if (netdev_speed <= SPEED_10000) {
- *width = IB_WIDTH_1X;
- *speed = IB_SPEED_FDR10;
- } else if (netdev_speed <= SPEED_20000) {
- *width = IB_WIDTH_4X;
- *speed = IB_SPEED_DDR;
- } else if (netdev_speed <= SPEED_25000) {
- *width = IB_WIDTH_1X;
- *speed = IB_SPEED_EDR;
- } else if (netdev_speed <= SPEED_40000) {
- *width = IB_WIDTH_4X;
- *speed = IB_SPEED_FDR10;
- } else {
- *width = IB_WIDTH_4X;
- *speed = IB_SPEED_EDR;
- }
+ ib_get_width_and_speed(netdev_speed, lksettings.lanes,
+ speed, width);
return 0;
}
@@ -1935,10 +2090,6 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
{
const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
- struct ib_pd *pd;
- struct ib_cq *scq, *rcq;
- struct ib_srq *srq;
- struct ib_rwq_ind_table *ind_tbl;
struct ib_qp_security *sec;
int ret;
@@ -1950,11 +2101,6 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
if (qp->real_qp != qp)
return __ib_destroy_shared_qp(qp);
- pd = qp->pd;
- scq = qp->send_cq;
- rcq = qp->recv_cq;
- srq = qp->srq;
- ind_tbl = qp->rwq_ind_tbl;
sec = qp->qp_sec;
if (sec)
ib_destroy_qp_security_begin(sec);
@@ -1962,31 +2108,25 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
if (!qp->uobject)
rdma_rw_cleanup_mrs(qp);
- rdma_counter_unbind_qp(qp, true);
- rdma_restrack_del(&qp->res);
+ rdma_counter_unbind_qp(qp, qp->port, true);
ret = qp->device->ops.destroy_qp(qp, udata);
- if (!ret) {
- if (alt_path_sgid_attr)
- rdma_put_gid_attr(alt_path_sgid_attr);
- if (av_sgid_attr)
- rdma_put_gid_attr(av_sgid_attr);
- if (pd)
- atomic_dec(&pd->usecnt);
- if (scq)
- atomic_dec(&scq->usecnt);
- if (rcq)
- atomic_dec(&rcq->usecnt);
- if (srq)
- atomic_dec(&srq->usecnt);
- if (ind_tbl)
- atomic_dec(&ind_tbl->usecnt);
- if (sec)
- ib_destroy_qp_security_end(sec);
- } else {
+ if (ret) {
if (sec)
ib_destroy_qp_security_abort(sec);
+ return ret;
}
+ if (alt_path_sgid_attr)
+ rdma_put_gid_attr(alt_path_sgid_attr);
+ if (av_sgid_attr)
+ rdma_put_gid_attr(av_sgid_attr);
+
+ ib_qp_usecnt_dec(qp);
+ if (sec)
+ ib_destroy_qp_security_end(sec);
+
+ rdma_restrack_del(&qp->res);
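+	/* create_qp() allocated the QP, so the core frees it here */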
+ kfree(qp);
return ret;
}
EXPORT_SYMBOL(ib_destroy_qp_user);
@@ -2078,23 +2218,26 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct ib_mr *mr;
if (access_flags & IB_ACCESS_ON_DEMAND) {
- if (!(pd->device->attrs.device_cap_flags &
- IB_DEVICE_ON_DEMAND_PAGING)) {
+ if (!(pd->device->attrs.kernel_cap_flags &
+ IBK_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
return ERR_PTR(-EINVAL);
}
}
mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
- access_flags, NULL);
+ access_flags, NULL, NULL);
if (IS_ERR(mr))
return mr;
mr->device = pd->device;
+ mr->type = IB_MR_TYPE_USER;
mr->pd = pd;
mr->dm = NULL;
atomic_inc(&pd->usecnt);
+ mr->iova = virt_addr;
+ mr->length = length;
rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
rdma_restrack_parent_name(&mr->res, &pd->res);
@@ -2122,6 +2265,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;
struct ib_dm *dm = mr->dm;
+ struct ib_dmah *dmah = mr->dmah;
struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
int ret;
@@ -2132,6 +2276,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
atomic_dec(&pd->usecnt);
if (dm)
atomic_dec(&dm->usecnt);
+ if (dmah)
+ atomic_dec(&dmah->usecnt);
kfree(sig_attrs);
}
@@ -2756,6 +2902,72 @@ static void __ib_drain_rq(struct ib_qp *qp)
wait_for_completion(&rdrain.done);
}
+/*
+ * __ib_drain_srq() - Block until the Last WQE Reached event arrives, or a
+ * timeout expires.
+ * @qp: queue pair associated with the SRQ to drain
+ *
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the first option.
+ */
+static void __ib_drain_srq(struct ib_qp *qp)
+{
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_cq *cq;
+ int n, polled = 0;
+ int ret;
+
+ if (!qp->srq) {
+ WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp);
+ return;
+ }
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret);
+ return;
+ }
+
+ if (ib_srq_has_cq(qp->srq->srq_type)) {
+ cq = qp->srq->ext.cq;
+ } else if (qp->recv_cq) {
+ cq = qp->recv_cq;
+ } else {
+ WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp);
+ return;
+ }
+
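+	/*
+	 * Wait up to 60 seconds for the Last WQE Reached event signalled
+	 * via __ib_qp_event_handler(), then poll the CQ directly until it
+	 * is empty or cq->cqe entries have been reaped.
+	 */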
+ if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) {
+ while (polled != cq->cqe) {
+ n = ib_process_cq_direct(cq, cq->cqe - polled);
+ if (!n)
+ return;
+ polled += n;
+ }
+ }
+}
+
/**
* ib_drain_sq() - Block until all SQ CQEs have been consumed by the
* application.
@@ -2834,6 +3046,8 @@ void ib_drain_qp(struct ib_qp *qp)
ib_drain_sq(qp);
if (!qp->srq)
ib_drain_rq(qp);
+ else
+ __ib_drain_srq(qp);
}
EXPORT_SYMBOL(ib_drain_qp);
@@ -2901,20 +3115,73 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
unsigned int block_offset;
+ unsigned int delta;
if (!biter->__sg_nents || !biter->__sg)
return false;
biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
- biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+ delta = BIT_ULL(biter->__pg_bit) - block_offset;
- if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
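+	/*
+	 * One block may span several scatterlist entries: walk past every
+	 * entry that the remaining delta fully consumes.
+	 */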
+ while (biter->__sg_nents && biter->__sg &&
+ sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) {
+ delta -= sg_dma_len(biter->__sg) - biter->__sg_advance;
biter->__sg_advance = 0;
biter->__sg = sg_next(biter->__sg);
biter->__sg_nents--;
}
+ biter->__sg_advance += delta;
return true;
}
EXPORT_SYMBOL(__rdma_block_iter_next);
+
+/**
+ * rdma_alloc_hw_stats_struct - Helper function to allocate a dynamic
+ * rdma_hw_stats struct for drivers.
+ * @descs: array of static descriptors
+ * @num_counters: number of elements in array
+ * @lifespan: milliseconds between updates
+ */
+struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+ const struct rdma_stat_desc *descs, int num_counters,
+ unsigned long lifespan)
+{
+ struct rdma_hw_stats *stats;
+
+ stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL);
+ if (!stats)
+ return NULL;
+
+ stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters),
+ sizeof(*stats->is_disabled), GFP_KERNEL);
+ if (!stats->is_disabled)
+ goto err;
+
+ stats->descs = descs;
+ stats->num_counters = num_counters;
+ stats->lifespan = msecs_to_jiffies(lifespan);
+ mutex_init(&stats->lock);
+
+ return stats;
+
+err:
+ kfree(stats);
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_alloc_hw_stats_struct);
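+
+/*
+ * Minimal usage sketch. The descriptor table and counter names are
+ * illustrative only, not taken from any real driver:
+ *
+ *	static const struct rdma_stat_desc demo_descs[] = {
+ *		[0] = { .name = "demo_rx_pkts" },
+ *		[1] = { .name = "demo_tx_pkts" },
+ *	};
+ *
+ *	stats = rdma_alloc_hw_stats_struct(demo_descs,
+ *					   ARRAY_SIZE(demo_descs),
+ *					   RDMA_HW_STATS_DEFAULT_LIFESPAN);
+ *	if (!stats)
+ *		return NULL;
+ */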
+
+/**
+ * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats
+ * @stats: statistics to release
+ */
+void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats)
+{
+ if (!stats)
+ return;
+
+ kfree(stats->is_disabled);
+ kfree(stats);
+}
+EXPORT_SYMBOL(rdma_free_hw_stats_struct);