summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile4
-rw-r--r--drivers/infiniband/core/addr.c83
-rw-r--r--drivers/infiniband/core/agent.c3
-rw-r--r--drivers/infiniband/core/cache.c45
-rw-r--r--drivers/infiniband/core/cm.c138
-rw-r--r--drivers/infiniband/core/cm_trace.h2
-rw-r--r--drivers/infiniband/core/cma.c207
-rw-r--r--drivers/infiniband/core/cma_priv.h4
-rw-r--r--drivers/infiniband/core/cma_trace.h2
-rw-r--r--drivers/infiniband/core/counters.c54
-rw-r--r--drivers/infiniband/core/cq.c12
-rw-r--r--drivers/infiniband/core/device.c195
-rw-r--r--drivers/infiniband/core/iwcm.c33
-rw-r--r--drivers/infiniband/core/mad.c506
-rw-r--r--drivers/infiniband/core/mad_priv.h76
-rw-r--r--drivers/infiniband/core/mad_rmpp.c43
-rw-r--r--drivers/infiniband/core/nldev.c44
-rw-r--r--drivers/infiniband/core/rdma_core.c29
-rw-r--r--drivers/infiniband/core/rdma_core.h1
-rw-r--r--drivers/infiniband/core/restrack.c6
-rw-r--r--drivers/infiniband/core/sa_query.c283
-rw-r--r--drivers/infiniband/core/sysfs.c15
-rw-r--r--drivers/infiniband/core/ucaps.c267
-rw-r--r--drivers/infiniband/core/ucma.c126
-rw-r--r--drivers/infiniband/core/ud_header.c83
-rw-r--r--drivers/infiniband/core/umem.c44
-rw-r--r--drivers/infiniband/core/umem_dmabuf.c2
-rw-r--r--drivers/infiniband/core/umem_odp.c280
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c194
-rw-r--r--drivers/infiniband/core/uverbs_main.c2
-rw-r--r--drivers/infiniband/core/uverbs_marshall.c42
-rw-r--r--drivers/infiniband/core/uverbs_std_types_cq.c88
-rw-r--r--drivers/infiniband/core/uverbs_std_types_device.c4
-rw-r--r--drivers/infiniband/core/uverbs_std_types_dmah.c145
-rw-r--r--drivers/infiniband/core/uverbs_std_types_mr.c172
-rw-r--r--drivers/infiniband/core/uverbs_std_types_qp.c2
-rw-r--r--drivers/infiniband/core/uverbs_uapi.c1
-rw-r--r--drivers/infiniband/core/verbs.c23
38 files changed, 2415 insertions, 845 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 8ab4eea5a0a5..f483e0c12444 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -33,12 +33,14 @@ ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_std_types_cq.o \
+ uverbs_std_types_dmah.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \
uverbs_uapi.o uverbs_std_types_device.o \
uverbs_std_types_async_fd.o \
uverbs_std_types_srq.o \
uverbs_std_types_wq.o \
- uverbs_std_types_qp.o
+ uverbs_std_types_qp.o \
+ ucaps.o
ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index be0743dac3ff..61596cda2b65 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -446,63 +446,41 @@ static int addr6_resolve(struct sockaddr *src_sock,
}
#endif
+static bool is_dst_local(const struct dst_entry *dst)
+{
+ if (dst->ops->family == AF_INET)
+ return !!(dst_rtable(dst)->rt_type & RTN_LOCAL);
+ else if (dst->ops->family == AF_INET6)
+ return !!(dst_rt6_info(dst)->rt6i_flags & RTF_LOCAL);
+ else
+ return false;
+}
+
static int addr_resolve_neigh(const struct dst_entry *dst,
const struct sockaddr *dst_in,
struct rdma_dev_addr *addr,
- unsigned int ndev_flags,
u32 seq)
{
- int ret = 0;
-
- if (ndev_flags & IFF_LOOPBACK) {
+ if (is_dst_local(dst)) {
+ /* When the destination is local entry, source and destination
+ * are same. Skip the neighbour lookup.
+ */
memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
- } else {
- if (!(ndev_flags & IFF_NOARP)) {
- /* If the device doesn't do ARP internally */
- ret = fetch_ha(dst, addr, dst_in, seq);
- }
+ return 0;
}
- return ret;
-}
-
-static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
- const struct sockaddr *dst_in,
- const struct dst_entry *dst,
- const struct net_device *ndev)
-{
- int ret = 0;
-
- if (dst->dev->flags & IFF_LOOPBACK)
- ret = rdma_translate_ip(dst_in, dev_addr);
- else
- rdma_copy_src_l2_addr(dev_addr, dst->dev);
-
- /*
- * If there's a gateway and type of device not ARPHRD_INFINIBAND,
- * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
- * network type accordingly.
- */
- if (has_gateway(dst, dst_in->sa_family) &&
- ndev->type != ARPHRD_INFINIBAND)
- dev_addr->network = dst_in->sa_family == AF_INET ?
- RDMA_NETWORK_IPV4 :
- RDMA_NETWORK_IPV6;
- else
- dev_addr->network = RDMA_NETWORK_IB;
- return ret;
+ return fetch_ha(dst, addr, dst_in, seq);
}
static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
- unsigned int *ndev_flags,
const struct sockaddr *dst_in,
const struct dst_entry *dst)
{
struct net_device *ndev = READ_ONCE(dst->dev);
- *ndev_flags = ndev->flags;
/* A physical device must be the RDMA device to use */
- if (ndev->flags & IFF_LOOPBACK) {
+ if (is_dst_local(dst)) {
+ int ret;
/*
* RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
* loopback IP address. So if route is resolved to loopback
@@ -512,9 +490,27 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
if (IS_ERR(ndev))
return -ENODEV;
+ ret = rdma_translate_ip(dst_in, dev_addr);
+ if (ret)
+ return ret;
+ } else {
+ rdma_copy_src_l2_addr(dev_addr, dst->dev);
}
- return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+ /*
+ * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+ * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
+ * network type accordingly.
+ */
+ if (has_gateway(dst, dst_in->sa_family) &&
+ ndev->type != ARPHRD_INFINIBAND)
+ dev_addr->network = dst_in->sa_family == AF_INET ?
+ RDMA_NETWORK_IPV4 :
+ RDMA_NETWORK_IPV6;
+ else
+ dev_addr->network = RDMA_NETWORK_IB;
+
+ return 0;
}
static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
@@ -551,7 +547,6 @@ static int addr_resolve(struct sockaddr *src_in,
u32 seq)
{
struct dst_entry *dst = NULL;
- unsigned int ndev_flags = 0;
struct rtable *rt = NULL;
int ret;
@@ -588,7 +583,7 @@ static int addr_resolve(struct sockaddr *src_in,
rcu_read_unlock();
goto done;
}
- ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+ ret = rdma_set_src_addr_rcu(addr, dst_in, dst);
rcu_read_unlock();
/*
@@ -596,7 +591,7 @@ static int addr_resolve(struct sockaddr *src_in,
* only if src addr translation didn't fail.
*/
if (!ret && resolve_neigh)
- ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
+ ret = addr_resolve_neigh(dst, dst_in, addr, seq);
if (src_in->sa_family == AF_INET)
ip_rt_put(rt);
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index 3bb46696731e..25a060a28301 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -110,8 +110,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *
agent = port_priv->agent[qpn];
ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
if (IS_ERR(ah)) {
- dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
- PTR_ERR(ah));
+ dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah);
return;
}
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index b7c078b7f7cf..81cf3c902e81 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -582,8 +582,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
out_unlock:
mutex_unlock(&table->lock);
if (ret)
- pr_warn("%s: unable to add gid %pI6 error=%d\n",
- __func__, gid->raw, ret);
+ pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
return ret;
}
@@ -1127,41 +1127,6 @@ err:
}
EXPORT_SYMBOL(ib_find_cached_pkey);
-int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num,
- u16 pkey, u16 *index)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int i;
- int ret = -ENOENT;
-
- if (!rdma_is_port_valid(device, port_num))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache_lock, flags);
-
- cache = device->port_data[port_num].cache.pkey;
- if (!cache) {
- ret = -EINVAL;
- goto err;
- }
-
- *index = -1;
-
- for (i = 0; i < cache->table_len; ++i)
- if (cache->table[i] == pkey) {
- *index = i;
- ret = 0;
- break;
- }
-
-err:
- read_unlock_irqrestore(&device->cache_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_find_exact_cached_pkey);
-
int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
{
unsigned long flags;
@@ -1536,6 +1501,12 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
device->port_data[port].cache.pkey = pkey_cache;
}
device->port_data[port].cache.lmc = tprops->lmc;
+
+ if (device->port_data[port].cache.port_state != IB_PORT_NOP &&
+ device->port_data[port].cache.port_state != tprops->state)
+ ibdev_info(device, "Port: %d Link %s\n", port,
+ ib_port_state_to_str(tprops->state));
+
device->port_data[port].cache.port_state = tprops->state;
device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 142170473e75..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -34,8 +34,8 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
+#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
@@ -160,6 +160,7 @@ struct cm_counter_attribute {
struct cm_port {
struct cm_device *cm_dev;
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
u32 port_num;
atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT];
};
@@ -167,7 +168,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@@ -241,7 +242,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
- u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@@ -274,7 +274,8 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
complete(&cm_id_priv->comp);
}
-static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+static struct ib_mad_send_buf *
+cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent)
{
struct ib_mad_agent *mad_agent;
struct ib_mad_send_buf *m;
@@ -285,8 +286,9 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
- mad_agent = cm_id_priv->av.port->mad_agent;
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ mad_agent = rep_agent ? cm_id_priv->av.port->rep_agent :
+ cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
goto out;
@@ -311,10 +313,15 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
m->ah = ah;
out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
+static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+{
+ return cm_alloc_msg_agent(cm_id_priv, false);
+}
+
static void cm_free_msg(struct ib_mad_send_buf *msg)
{
if (msg->ah)
@@ -323,13 +330,14 @@ static void cm_free_msg(struct ib_mad_send_buf *msg)
}
static struct ib_mad_send_buf *
-cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
+cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state,
+ bool rep_agent)
{
struct ib_mad_send_buf *msg;
lockdep_assert_held(&cm_id_priv->lock);
- msg = cm_alloc_msg(cm_id_priv);
+ msg = cm_alloc_msg_agent(cm_id_priv, rep_agent);
if (IS_ERR(msg))
return msg;
@@ -344,6 +352,12 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
return msg;
}
+static struct ib_mad_send_buf *
+cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
+{
+ return cm_alloc_priv_msg_rep(cm_id_priv, state, false);
+}
+
static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
{
struct cm_id_private *cm_id_priv = msg->context[0];
@@ -1034,14 +1048,15 @@ static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id,
struct cm_id_private *cm_id_priv;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
- pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__,
- cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount));
+ pr_err_ratelimited("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__,
+ cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount));
}
static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
enum ib_cm_state old_state;
+ unsigned long timeout;
struct cm_work *work;
int ret;
@@ -1152,10 +1167,9 @@ retest:
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
+ timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4);
do {
- ret = wait_for_completion_timeout(&cm_id_priv->comp,
- msecs_to_jiffies(
- CM_DESTROY_ID_WAIT_TIMEOUT));
+ ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout);
if (!ret) /* timeout happened */
cm_destroy_id_wait_timeout(cm_id, old_state);
} while (!ret);
@@ -1297,10 +1311,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@@ -1872,7 +1886,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
- enum cm_msg_response msg_mraed, u8 service_timeout,
+ enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@@ -1881,7 +1895,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
- IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@@ -1960,7 +1974,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@@ -2295,7 +2309,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
goto out;
}
- msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT);
+ msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out;
@@ -2454,7 +2468,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@@ -3094,26 +3108,13 @@ out:
return -EINVAL;
}
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len)
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
- struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
- enum cm_msg_response msg_response;
- void *data;
unsigned long flags;
- int ret;
-
- if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
- return -EINVAL;
-
- data = cm_copy_private_data(private_data, private_data_len);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3122,58 +3123,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
- msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
- trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
- if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
- msg = cm_alloc_msg(cm_id_priv);
- if (IS_ERR(msg)) {
- ret = PTR_ERR(msg);
- goto error_unlock;
- }
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- msg_response, service_timeout,
- private_data, private_data_len);
- trace_icm_send_mra(cm_id);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error_free_msg;
- }
-
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
- cm_id_priv->service_timeout = service_timeout;
- cm_set_private_data(cm_id_priv, data, private_data_len);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return 0;
+ cm_set_private_data(cm_id_priv, NULL, 0);
-error_free_msg:
- cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- kfree(data);
return ret;
}
-EXPORT_SYMBOL(ib_send_cm_mra);
+EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@@ -3377,7 +3353,6 @@ static int cm_lap_handler(struct cm_work *work)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
- cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
@@ -3786,7 +3761,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv,
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
- cm_free_priv_msg(msg);
+ cm_free_msg(msg);
+ cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@@ -4378,7 +4354,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@@ -4418,9 +4394,22 @@ static int cm_add_one(struct ib_device *ib_device)
goto error2;
}
+ port->rep_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ NULL,
+ 0,
+ cm_send_handler,
+ NULL,
+ port,
+ 0);
+ if (IS_ERR(port->rep_agent)) {
+ ret = PTR_ERR(port->rep_agent);
+ goto error3;
+ }
+
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
- goto error3;
+ goto error4;
count++;
}
@@ -4435,6 +4424,8 @@ static int cm_add_one(struct ib_device *ib_device)
write_unlock_irqrestore(&cm.device_lock, flags);
return 0;
+error4:
+ ib_unregister_mad_agent(port->rep_agent);
error3:
ib_unregister_mad_agent(port->mad_agent);
error2:
@@ -4448,6 +4439,7 @@ error1:
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->rep_agent);
ib_unregister_mad_agent(port->mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
@@ -4477,12 +4469,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
rdma_for_each_port (ib_device, i) {
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
mad_agent = port->mad_agent;
+ rep_agent = port->rep_agent;
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
/*
* We flush the queue here after the going_down set, this
@@ -4494,10 +4488,12 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ port->rep_agent = NULL;
+ write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
+ ib_unregister_mad_agent(rep_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
}
@@ -4521,7 +4517,7 @@ static int __init ib_cm_init(void)
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
- cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
index 944d9071245d..4a4987da69d4 100644
--- a/drivers/infiniband/core/cm_trace.h
+++ b/drivers/infiniband/core/cm_trace.h
@@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
-DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 64ace0b968f0..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
-#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
@@ -72,6 +71,8 @@ static const char * const cma_events[] = {
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
enum ib_gid_type gid_type);
+static void cma_netevent_work_handler(struct work_struct *_work);
+
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
size_t index = event;
@@ -144,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
-/**
- * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
- * @res: rdma resource tracking entry pointer
- */
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
-{
- struct rdma_id_private *id_priv =
- container_of(res, struct rdma_id_private, res);
-
- return &id_priv->id;
-}
-EXPORT_SYMBOL(rdma_res_to_id);
-
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@@ -690,6 +678,7 @@ cma_validate_port(struct ib_device *device, u32 port,
int bound_if_index = dev_addr->bound_dev_if;
int dev_type = dev_addr->dev_type;
struct net_device *ndev = NULL;
+ struct net_device *pdev = NULL;
if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
goto out;
@@ -714,6 +703,21 @@ cma_validate_port(struct ib_device *device, u32 port,
rcu_read_lock();
ndev = rcu_dereference(sgid_attr->ndev);
+ if (ndev->ifindex != bound_if_index) {
+ pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index);
+ if (pdev) {
+ if (is_vlan_dev(pdev)) {
+ pdev = vlan_dev_real_dev(pdev);
+ if (ndev->ifindex == pdev->ifindex)
+ bound_if_index = pdev->ifindex;
+ }
+ if (is_vlan_dev(ndev)) {
+ pdev = vlan_dev_real_dev(ndev);
+ if (bound_if_index == pdev->ifindex)
+ bound_if_index = ndev->ifindex;
+ }
+ }
+ }
if (!net_eq(dev_net(ndev), dev_addr->net) ||
ndev->ifindex != bound_if_index) {
rdma_put_gid_attr(sgid_attr);
@@ -723,12 +727,26 @@ cma_validate_port(struct ib_device *device, u32 port,
goto out;
}
- if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
- ndev = dev_get_by_index(dev_addr->net, bound_if_index);
- if (!ndev)
- goto out;
+ /*
+ * For a RXE device, it should work with TUN device and normal ethernet
+ * devices. Use driver_id to check if a device is a RXE device or not.
+ * ARPHDR_NONE means a TUN device.
+ */
+ if (device->ops.driver_id == RDMA_DRIVER_RXE) {
+ if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER)
+ && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ }
} else {
- gid_type = IB_GID_TYPE_IB;
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
}
sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
@@ -1017,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
id_priv->id.route.addr.dev_addr.net = get_net(net);
id_priv->seq_num &= 0x00ffffff;
+ INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler);
rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
if (parent)
@@ -2057,6 +2076,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
kfree(id_priv->id.route.path_rec);
kfree(id_priv->id.route.path_rec_inbound);
kfree(id_priv->id.route.path_rec_outbound);
+ kfree(id_priv->id.route.service_recs);
put_net(id_priv->id.route.addr.dev_addr.net);
kfree(id_priv);
@@ -2181,8 +2201,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
- trace_cm_send_mra(id_priv);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(id_priv);
+ ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@@ -2443,8 +2463,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
- trace_cm_send_mra(cm_id->context);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(cm_id->context);
+ ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@@ -3363,13 +3383,18 @@ err1:
int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
{
struct rdma_id_private *id_priv;
+ enum rdma_cm_state state;
int ret;
if (!timeout_ms)
return -EINVAL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+ state = id_priv->state;
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_QUERY) &&
+ !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED,
+ RDMA_CM_ROUTE_QUERY))
return -EINVAL;
cma_id_get(id_priv);
@@ -3390,7 +3415,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
return 0;
err:
- cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state);
cma_id_put(id_priv);
return ret;
}
@@ -4450,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
container_of(id, struct rdma_id_private, id);
int ret;
+ lockdep_assert_held(&id_priv->handler_mutex);
+
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
@@ -5211,9 +5238,9 @@ static int cma_netevent_callback(struct notifier_block *self,
if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
neigh->ha, ETH_ALEN))
continue;
- INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
cma_id_get(current_id);
- queue_work(cma_wq, &current_id->id.net_work);
+ if (!queue_work(cma_wq, &current_id->id.net_work))
+ cma_id_put(current_id);
}
out:
spin_unlock_irqrestore(&id_table_lock, flags);
@@ -5487,3 +5514,129 @@ static void __exit cma_cleanup(void)
module_init(cma_init);
module_exit(cma_cleanup);
+
+static void cma_query_ib_service_handler(int status,
+ struct sa_service_rec *recs,
+ unsigned int num_recs, void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_id_private *id_priv = work->id;
+ struct sockaddr_ib *addr;
+
+ if (status)
+ goto fail;
+
+ if (!num_recs) {
+ status = -ENOENT;
+ goto fail;
+ }
+
+ if (id_priv->id.route.service_recs) {
+ status = -EALREADY;
+ goto fail;
+ }
+
+ id_priv->id.route.service_recs =
+ kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL);
+ if (!id_priv->id.route.service_recs) {
+ status = -ENOMEM;
+ goto fail;
+ }
+
+ id_priv->id.route.num_service_recs = num_recs;
+ memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs);
+
+ addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr;
+ addr->sib_family = AF_IB;
+ addr->sib_addr = *(struct ib_addr *)&recs->gid;
+ addr->sib_pkey = recs->pkey;
+ addr->sib_sid = recs->id;
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr,
+ (union ib_gid *)&addr->sib_addr);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr,
+ ntohs(addr->sib_pkey));
+
+ queue_work(cma_wq, &work->work);
+ return;
+
+fail:
+ work->old_state = RDMA_CM_ADDRINFO_QUERY;
+ work->new_state = RDMA_CM_ADDR_BOUND;
+ work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR;
+ work->event.status = status;
+ pr_debug_ratelimited(
+ "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n",
+ status);
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_resolve_ib_service(struct rdma_id_private *id_priv,
+ struct rdma_ucm_ib_service *ibs)
+{
+ struct sa_service_rec sr = {};
+ ib_sa_comp_mask mask = 0;
+ struct cma_work *work;
+
+ work = kzalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_id_get(id_priv);
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDRINFO_QUERY;
+ work->new_state = RDMA_CM_ADDRINFO_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED;
+
+ if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) {
+ sr.id = cpu_to_be64(ibs->service_id);
+ mask |= IB_SA_SERVICE_REC_SERVICE_ID;
+ }
+ if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) {
+ strscpy(sr.name, ibs->service_name, sizeof(sr.name));
+ mask |= IB_SA_SERVICE_REC_SERVICE_NAME;
+ }
+
+ id_priv->query_id = ib_sa_service_rec_get(&sa_client,
+ id_priv->id.device,
+ id_priv->id.port_num,
+ &sr, mask,
+ 2000, GFP_KERNEL,
+ cma_query_ib_service_handler,
+ work, &id_priv->query);
+
+ if (id_priv->query_id < 0) {
+ cma_id_put(id_priv);
+ kfree(work);
+ return id_priv->query_id;
+ }
+
+ return 0;
+}
+
+int rdma_resolve_ib_service(struct rdma_cm_id *id,
+ struct rdma_ucm_ib_service *ibs)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cma_dev ||
+ !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY))
+ return -EINVAL;
+
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_service(id_priv, ibs);
+ else
+ ret = -EOPNOTSUPP;
+
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ib_service);
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index b7354c94cf1b..c604b601f4d9 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -47,7 +47,9 @@ enum rdma_cm_state {
RDMA_CM_ADDR_BOUND,
RDMA_CM_LISTEN,
RDMA_CM_DEVICE_REMOVAL,
- RDMA_CM_DESTROYING
+ RDMA_CM_DESTROYING,
+ RDMA_CM_ADDRINFO_QUERY,
+ RDMA_CM_ADDRINFO_RESOLVED
};
struct rdma_id_private {
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index dc622f3778be..3456d5f3aa47 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
-DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index af59486fe418..c3aa6d7fc66b 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -12,7 +12,8 @@
static int __counter_set_mode(struct rdma_port_counter *port_counter,
enum rdma_nl_counter_mode new_mode,
- enum rdma_nl_counter_mask new_mask)
+ enum rdma_nl_counter_mask new_mask,
+ bool bind_opcnt)
{
if (new_mode == RDMA_COUNTER_MODE_AUTO) {
if (new_mask & (~ALL_AUTO_MODE_MASKS))
@@ -23,6 +24,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
port_counter->mode.mode = new_mode;
port_counter->mode.mask = new_mask;
+ port_counter->mode.bind_opcnt = bind_opcnt;
return 0;
}
@@ -41,6 +43,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
*/
int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mask mask,
+ bool bind_opcnt,
struct netlink_ext_ack *extack)
{
struct rdma_port_counter *port_counter;
@@ -59,12 +62,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
RDMA_COUNTER_MODE_NONE;
if (port_counter->mode.mode == mode &&
- port_counter->mode.mask == mask) {
+ port_counter->mode.mask == mask &&
+ port_counter->mode.bind_opcnt == bind_opcnt) {
ret = 0;
goto out;
}
- ret = __counter_set_mode(port_counter, mode, mask);
+ ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt);
out:
mutex_unlock(&port_counter->lock);
@@ -89,7 +93,7 @@ static void auto_mode_init_counter(struct rdma_counter *counter,
}
static int __rdma_counter_bind_qp(struct rdma_counter *counter,
- struct ib_qp *qp)
+ struct ib_qp *qp, u32 port)
{
int ret;
@@ -100,7 +104,7 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter,
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_bind_qp(counter, qp);
+ ret = qp->device->ops.counter_bind_qp(counter, qp, port);
mutex_unlock(&counter->lock);
return ret;
@@ -140,7 +144,8 @@ out:
static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
struct ib_qp *qp,
- enum rdma_nl_counter_mode mode)
+ enum rdma_nl_counter_mode mode,
+ bool bind_opcnt)
{
struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
@@ -149,13 +154,15 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
return NULL;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = rdma_zalloc_drv_obj(dev, rdma_counter);
if (!counter)
return NULL;
counter->device = dev;
counter->port = port;
+ dev->ops.counter_init(counter);
+
rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER);
counter->stats = dev->ops.counter_alloc_stats(counter);
if (!counter->stats)
@@ -166,7 +173,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
switch (mode) {
case RDMA_COUNTER_MODE_MANUAL:
ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL,
- 0);
+ 0, bind_opcnt);
if (ret) {
mutex_unlock(&port_counter->lock);
goto err_mode;
@@ -185,10 +192,11 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
mutex_unlock(&port_counter->lock);
counter->mode.mode = mode;
+ counter->mode.bind_opcnt = bind_opcnt;
kref_init(&counter->kref);
mutex_init(&counter->lock);
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_mode;
@@ -213,7 +221,8 @@ static void rdma_counter_free(struct rdma_counter *counter)
port_counter->num_counters--;
if (!port_counter->num_counters &&
(port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
- __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0);
+ __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0,
+ false);
mutex_unlock(&port_counter->lock);
@@ -238,7 +247,7 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
return match;
}
-static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -247,7 +256,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp)
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_unbind_qp(qp);
+ ret = qp->device->ops.counter_unbind_qp(qp, port);
mutex_unlock(&counter->lock);
return ret;
@@ -339,13 +348,14 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
counter = rdma_get_counter_auto_mode(qp, port);
if (counter) {
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret) {
kref_put(&counter->kref, counter_release);
return ret;
}
} else {
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO,
+ port_counter->mode.bind_opcnt);
if (!counter)
return -ENOMEM;
}
@@ -358,7 +368,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
* @force:
* true - Decrease the counter ref-count anyway (e.g., qp destroy)
*/
-int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -366,7 +376,7 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
if (!counter)
return -EINVAL;
- ret = __rdma_counter_unbind_qp(qp);
+ ret = __rdma_counter_unbind_qp(qp, port);
if (ret && !force)
return ret;
@@ -451,7 +461,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
return NULL;
qp = container_of(res, struct ib_qp, res);
- if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev))
goto err;
return qp;
@@ -513,7 +523,7 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
goto err_task;
}
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_task;
@@ -558,7 +568,7 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
goto err;
}
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true);
if (!counter) {
ret = -ENOMEM;
goto err;
@@ -604,7 +614,7 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
goto out;
}
- ret = rdma_counter_unbind_qp(qp, false);
+ ret = rdma_counter_unbind_qp(qp, port, false);
out:
rdma_restrack_put(&qp->res);
@@ -613,13 +623,15 @@ out:
int rdma_counter_get_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mode *mode,
- enum rdma_nl_counter_mask *mask)
+ enum rdma_nl_counter_mask *mask,
+ bool *opcnt)
{
struct rdma_port_counter *port_counter;
port_counter = &dev->port_data[port].port_counter;
*mode = port_counter->mode.mode;
*mask = port_counter->mode.mask;
+ *opcnt = port_counter->mode.bind_opcnt;
return 0;
}
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a70876a0a231..584537c71545 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -317,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any);
*/
void ib_free_cq(struct ib_cq *cq)
{
- int ret;
+ int ret = 0;
if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
return;
if (WARN_ON_ONCE(cq->cqe_used))
return;
+ if (cq->device->ops.pre_destroy_cq) {
+ ret = cq->device->ops.pre_destroy_cq(cq);
+ WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail");
+ }
+
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
break;
@@ -340,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq)
rdma_dim_destroy(cq);
trace_cq_free(cq);
- ret = cq->device->ops.destroy_cq(cq, NULL);
+ if (cq->device->ops.post_destroy_cq)
+ cq->device->ops.post_destroy_cq(cq);
+ else
+ ret = cq->device->ops.destroy_cq(cq, NULL);
WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
rdma_restrack_del(&cq->res);
kfree(cq->wc);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index ca9b956c034d..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
}
EXPORT_SYMBOL(rdma_dev_access_netns);
+/**
+ * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has
+ * CAP_NET_RAW capability or not.
+ *
+ * @dev: Pointer to rdma device whose capability to be checked
+ *
+ * Returns true if a rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false. When rdma subsystem is in legacy shared network,
+ * namespace mode, the default net namespace is considered.
+ */
+bool rdma_dev_has_raw_cap(const struct ib_device *dev)
+{
+ const struct net *net;
+
+ /* Network namespace is the resource whose user namespace
+ * to be considered. When in shared mode, there is no reliable
+ * network namespace resource, so consider the default net namespace.
+ */
+ if (ib_devices_shared_netns)
+ net = &init_net;
+ else
+ net = read_pnet(&dev->coredev.rdma_net);
+
+ return ns_capable(net->user_ns, CAP_NET_RAW);
+}
+EXPORT_SYMBOL(rdma_dev_has_raw_cap);
+
/*
* xarray has this behavior where it won't iterate over NULL values stored in
* allocated arrays. So we need our own iterator to see all values stored in
@@ -209,23 +236,6 @@ static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
printk("%s(NULL ib_device): %pV", level, vaf);
}
-void ibdev_printk(const char *level, const struct ib_device *ibdev,
- const char *format, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, format);
-
- vaf.fmt = format;
- vaf.va = &args;
-
- __ibdev_printk(level, ibdev, &vaf);
-
- va_end(args);
-}
-EXPORT_SYMBOL(ibdev_printk);
-
#define define_ibdev_printk_level(func, level) \
void func(const struct ib_device *ibdev, const char *fmt, ...) \
{ \
@@ -545,6 +555,8 @@ static struct class ib_class = {
static void rdma_init_coredev(struct ib_core_device *coredev,
struct ib_device *dev, struct net *net)
{
+ bool is_full_dev = &dev->coredev == coredev;
+
/* This BUILD_BUG_ON is intended to catch layout change
* of union of ib_core_device and device.
* dev must be the first element as ib_core and providers
@@ -556,6 +568,13 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
coredev->dev.class = &ib_class;
coredev->dev.groups = dev->groups;
+
+ /*
+ * Don't expose hw counters outside of the init namespace.
+ */
+ if (!is_full_dev && dev->hw_stats_attr_index)
+ coredev->dev.groups[dev->hw_stats_attr_index] = NULL;
+
device_initialize(&coredev->dev);
coredev->owner = dev;
INIT_LIST_HEAD(&coredev->port_list);
@@ -565,6 +584,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
/**
* _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
+ * @net: network namespace device should be located in, namespace
+ * must stay valid until ib_register_device() is completed.
*
* Low-level drivers should use ib_alloc_device() to allocate &struct
* ib_device. @size is the size of the structure to be allocated,
@@ -572,7 +593,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
* ib_dealloc_device() must be used to free structures allocated with
* ib_alloc_device().
*/
-struct ib_device *_ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size, struct net *net)
{
struct ib_device *device;
unsigned int i;
@@ -589,7 +610,15 @@ struct ib_device *_ib_alloc_device(size_t size)
return NULL;
}
- rdma_init_coredev(&device->coredev, device, &init_net);
+ /* ib_devices_shared_netns can't change while we have active namespaces
+ * in the system which means either init_net is passed or the user has
+ * no idea what they are doing.
+ *
+ * To avoid breaking backward compatibility, when in shared mode,
+ * force to init the device in the init_net.
+ */
+ net = ib_devices_shared_netns ? &init_net : net;
+ rdma_init_coredev(&device->coredev, device, net);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->qp_open_list_lock);
@@ -1358,9 +1387,14 @@ static void ib_device_notify_register(struct ib_device *device)
u32 port;
int ret;
+ down_read(&devices_rwsem);
+
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
if (ret)
- return;
+ goto out;
rdma_for_each_port(device, port) {
netdev = ib_device_get_netdev(device, port);
@@ -1371,8 +1405,11 @@ static void ib_device_notify_register(struct ib_device *device)
RDMA_NETDEV_ATTACH_EVENT);
dev_put(netdev);
if (ret)
- return;
+ goto out;
}
+
+out:
+ up_read(&devices_rwsem);
}
/**
@@ -1471,10 +1508,9 @@ int ib_register_device(struct ib_device *device, const char *name,
return ret;
}
dev_set_uevent_suppress(&device->dev, false);
- /* Mark for userspace that device is ready */
- kobject_uevent(&device->dev.kobj, KOBJ_ADD);
ib_device_notify_register(device);
+
ib_device_put(device);
return 0;
@@ -1507,7 +1543,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
/*
* We have a registration lock so that all the calls to unregister are
- * fully fenced, once any unregister returns the device is truely
+ * fully fenced, once any unregister returns the device is truly
* unregistered even if multiple callers are unregistering it at the
* same time. This also interacts with the registration flow and
* provides sane semantics if register and unregister are racing.
@@ -2296,6 +2332,33 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
EXPORT_SYMBOL(ib_device_get_netdev);
/**
+ * ib_query_netdev_port - Query the port number of a net_device
+ * associated with an ibdev
+ * @ibdev: IB device
+ * @ndev: Network device
+ * @port: IB port the net_device is connected to
+ */
+int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
+ u32 *port)
+{
+ struct net_device *ib_ndev;
+ u32 port_num;
+
+ rdma_for_each_port(ibdev, port_num) {
+ ib_ndev = ib_device_get_netdev(ibdev, port_num);
+ if (ndev == ib_ndev) {
+ *port = port_num;
+ dev_put(ib_ndev);
+ return 0;
+ }
+ dev_put(ib_ndev);
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_query_netdev_port);
+
+/**
* ib_device_get_by_netdev - Find an IB device associated with a netdev
* @ndev: netdev to locate
* @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
@@ -2645,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, add_sub_dev);
SET_DEVICE_OP(dev_ops, advise_mr);
SET_DEVICE_OP(dev_ops, alloc_dm);
+ SET_DEVICE_OP(dev_ops, alloc_dmah);
SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
SET_DEVICE_OP(dev_ops, alloc_mr);
@@ -2659,11 +2723,13 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, counter_alloc_stats);
SET_DEVICE_OP(dev_ops, counter_bind_qp);
SET_DEVICE_OP(dev_ops, counter_dealloc);
+ SET_DEVICE_OP(dev_ops, counter_init);
SET_DEVICE_OP(dev_ops, counter_unbind_qp);
SET_DEVICE_OP(dev_ops, counter_update_stats);
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
+ SET_DEVICE_OP(dev_ops, create_cq_umem);
SET_DEVICE_OP(dev_ops, create_flow);
SET_DEVICE_OP(dev_ops, create_qp);
SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
@@ -2671,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, create_user_ah);
SET_DEVICE_OP(dev_ops, create_wq);
SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_dmah);
SET_DEVICE_OP(dev_ops, dealloc_driver);
SET_DEVICE_OP(dev_ops, dealloc_mw);
SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -2736,8 +2803,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, modify_srq);
SET_DEVICE_OP(dev_ops, modify_wq);
SET_DEVICE_OP(dev_ops, peek_cq);
+ SET_DEVICE_OP(dev_ops, pre_destroy_cq);
SET_DEVICE_OP(dev_ops, poll_cq);
SET_DEVICE_OP(dev_ops, port_groups);
+ SET_DEVICE_OP(dev_ops, post_destroy_cq);
SET_DEVICE_OP(dev_ops, post_recv);
SET_DEVICE_OP(dev_ops, post_send);
SET_DEVICE_OP(dev_ops, post_srq_recv);
@@ -2761,10 +2830,12 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, set_vf_guid);
SET_DEVICE_OP(dev_ops, set_vf_link_state);
SET_DEVICE_OP(dev_ops, ufile_hw_cleanup);
+ SET_DEVICE_OP(dev_ops, report_port_event);
SET_OBJ_SIZE(dev_ops, ib_ah);
SET_OBJ_SIZE(dev_ops, ib_counters);
SET_OBJ_SIZE(dev_ops, ib_cq);
+ SET_OBJ_SIZE(dev_ops, ib_dmah);
SET_OBJ_SIZE(dev_ops, ib_mw);
SET_OBJ_SIZE(dev_ops, ib_pd);
SET_OBJ_SIZE(dev_ops, ib_qp);
@@ -2772,6 +2843,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_OBJ_SIZE(dev_ops, ib_srq);
SET_OBJ_SIZE(dev_ops, ib_ucontext);
SET_OBJ_SIZE(dev_ops, ib_xrcd);
+ SET_OBJ_SIZE(dev_ops, rdma_counter);
}
EXPORT_SYMBOL(ib_set_device_ops);
@@ -2854,11 +2926,62 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
},
};
+void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev)
+{
+ enum ib_port_state curr_state;
+ struct ib_event ibevent = {};
+ u32 port;
+
+ if (ib_query_netdev_port(ibdev, ndev, &port))
+ return;
+
+ curr_state = ib_get_curr_port_state(ndev);
+
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port;
+ ib_dispatch_event(&ibevent);
+}
+EXPORT_SYMBOL(ib_dispatch_port_state_event);
+
+static void handle_port_event(struct net_device *ndev, unsigned long event)
+{
+ struct ib_device *ibdev;
+
+ /* Currently, link events in bonding scenarios are still
+ * reported by drivers that support bonding.
+ */
+ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev))
+ return;
+
+ ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
+ if (!ibdev)
+ return;
+
+ if (ibdev->ops.report_port_event) {
+ ibdev->ops.report_port_event(ibdev, ndev, event);
+ goto put_ibdev;
+ }
+
+ ib_dispatch_port_state_event(ibdev, ndev);
+
+put_ibdev:
+ ib_device_put(ibdev);
+};
+
static int ib_netdevice_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
- struct net_device *ib_ndev;
struct ib_device *ibdev;
u32 port;
@@ -2868,15 +2991,21 @@ static int ib_netdevice_event(struct notifier_block *this,
if (!ibdev)
return NOTIFY_DONE;
- rdma_for_each_port(ibdev, port) {
- ib_ndev = ib_device_get_netdev(ibdev, port);
- if (ndev == ib_ndev)
- rdma_nl_notify_event(ibdev, port,
- RDMA_NETDEV_RENAME_EVENT);
- dev_put(ib_ndev);
+ if (ib_query_netdev_port(ibdev, ndev, &port)) {
+ ib_device_put(ibdev);
+ break;
}
+
+ rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT);
ib_device_put(ibdev);
break;
+
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ case NETDEV_DOWN:
+ handle_port_event(ndev, event);
+ break;
+
default:
break;
}
@@ -2892,7 +3021,7 @@ static int __init ib_core_init(void)
{
int ret = -ENOMEM;
- ib_wq = alloc_workqueue("infiniband", 0, 0);
+ ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
if (!ib_wq)
return -ENOMEM;
@@ -2902,7 +3031,7 @@ static int __init ib_core_init(void)
goto err;
ib_comp_wq = alloc_workqueue("ib-comp-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
if (!ib_comp_wq)
goto err_unbound;
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 7e3a55349e10..62410578dec3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -109,7 +109,9 @@ static struct ctl_table iwcm_ctl_table[] = {
.data = &default_backlog,
.maxlen = sizeof(default_backlog),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
};
@@ -366,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
- *
- * Returns true if and only if the last cm_id_priv reference has been dropped.
+ * Clean up all resources associated with the connection.
*/
-static bool destroy_cm_id(struct iw_cm_id *cm_id)
+static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
struct ib_qp *qp;
@@ -440,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
-
- return iwcm_deref_id(cm_id_priv);
}
/*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
- if (!destroy_cm_id(cm_id))
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ destroy_cm_id(cm_id);
+ if (refcount_read(&cm_id_priv->refcount) > 1)
flush_workqueue(iwcm_wq);
+ iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -1033,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work)
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
- if (ret)
- WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
+ if (ret) {
+ destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 1fd54d5c4dd8..8f26bfb69586 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -210,6 +210,29 @@ int ib_response_mad(const struct ib_mad_hdr *hdr)
}
EXPORT_SYMBOL(ib_response_mad);
+#define SOL_FC_MAX_DEFAULT_FRAC 4
+#define SOL_FC_MAX_SA_FRAC 32
+
+static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req)
+{
+ if (!mad_reg_req)
+ /* Send only agent */
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+
+ switch (mad_reg_req->mgmt_class) {
+ case IB_MGMT_CLASS_CM:
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ return mad_recvq_size / SOL_FC_MAX_SA_FRAC;
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) /
+ SOL_FC_MAX_DEFAULT_FRAC;
+ default:
+ return 0;
+ }
+}
+
/*
* ib_register_mad_agent - Register to send/receive MADs
*
@@ -391,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
spin_lock_init(&mad_agent_priv->lock);
INIT_LIST_HEAD(&mad_agent_priv->send_list);
INIT_LIST_HEAD(&mad_agent_priv->wait_list);
- INIT_LIST_HEAD(&mad_agent_priv->done_list);
INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_LIST_HEAD(&mad_agent_priv->backlog_list);
INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
INIT_LIST_HEAD(&mad_agent_priv->local_list);
INIT_WORK(&mad_agent_priv->local_work, local_completions);
refcount_set(&mad_agent_priv->refcount, 1);
init_completion(&mad_agent_priv->comp);
+ mad_agent_priv->sol_fc_send_count = 0;
+ mad_agent_priv->sol_fc_wait_count = 0;
+ mad_agent_priv->sol_fc_max =
+ recv_handler ? get_sol_fc_max_outstanding(mad_reg_req) : 0;
ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
if (ret2) {
@@ -1055,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
return ret;
}
+static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) {
+ mad_agent_priv->sol_fc_wait_count--;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ } else {
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ }
+}
+
+static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_INIT) {
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ } else {
+ expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP,
+ IB_MAD_STATE_QUEUED);
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ }
+
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ mad_agent_priv->sol_fc_send_count++;
+ }
+}
+
+static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START,
+ IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED);
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->is_solicited_fc) {
+ mad_agent_priv->sol_fc_send_count--;
+ mad_agent_priv->sol_fc_wait_count++;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item,
+ &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(
+ list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else {
+ list_item = &mad_agent_priv->wait_list;
+ }
+
+ list_add(&mad_send_wr->agent_list, list_item);
+}
+
+static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc;
+}
+
+static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+}
+
+static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+}
+
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ switch (new_state) {
+ case IB_MAD_STATE_INIT:
+ break;
+ case IB_MAD_STATE_QUEUED:
+ handle_queued_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_SEND_START:
+ handle_send_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_WAIT_RESP:
+ handle_wait_state(mad_send_wr, mad_agent_priv);
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ return;
+ break;
+ case IB_MAD_STATE_EARLY_RESP:
+ handle_early_resp_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_CANCELED:
+ handle_canceled_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_DONE:
+ handle_done_state(mad_send_wr, mad_agent_priv);
+ break;
+ }
+
+ mad_send_wr->state = new_state;
+}
+
+static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ u8 mgmt_class;
+
+ if (!mad_send_wr->timeout)
+ return 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (mad_send_wr->mad_agent_priv->agent.rmpp_version &&
+ (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return 0;
+
+ mgmt_class =
+ ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class;
+ return mgmt_class == IB_MGMT_CLASS_CM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_ADM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+}
+
+static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max)
+ return false;
+
+ if (!list_empty(&mad_agent_priv->backlog_list))
+ return true;
+
+ return mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count >=
+ mad_agent_priv->sol_fc_max;
+}
+
/*
* ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
* with the registered client
@@ -1080,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret)
goto error;
- if (!send_buf->mad_agent->send_handler ||
- (send_buf->timeout_ms &&
- !send_buf->mad_agent->recv_handler)) {
+ if (!send_buf->mad_agent->send_handler) {
ret = -EINVAL;
goto error;
}
@@ -1118,15 +1317,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
send_buf->retries = 0;
- /* Reference for work request to QP + response */
- mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
- mad_send_wr->status = IB_WC_SUCCESS;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
/* Reference MAD agent until send completes */
refcount_inc(&mad_agent_priv->refcount);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_add_tail(&mad_send_wr->agent_list,
- &mad_agent_priv->send_list);
+ mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr);
+ if (mad_is_for_backlog(mad_send_wr)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+ }
+
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
@@ -1138,7 +1341,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret < 0) {
/* Fail send request */
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_del(&mad_send_wr->agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
deref_mad_agent(mad_agent_priv);
goto error;
@@ -1746,7 +1949,19 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
*/
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
+ }
+
+ list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) {
+ if ((wr->tid == mad_hdr->tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
/*
@@ -1765,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
/* Verify request has not been canceled */
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
return NULL;
}
+static void
+process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc = {};
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->backlog_list) &&
+ (mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count <
+ mad_agent_priv->sol_fc_max)) {
+ mad_send_wr = list_entry(mad_agent_priv->backlog_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ret = ib_send_mad(mad_send_wr);
+ if (ret) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ deref_mad_agent(mad_agent_priv);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = IB_WC_LOC_QP_OP_ERR;
+ mad_agent_priv->agent.send_handler(
+ &mad_agent_priv->agent, &mad_send_wc);
+ }
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
{
mad_send_wr->timeout = 0;
- if (mad_send_wr->refcount == 1)
- list_move_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->done_list);
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP ||
+ mad_send_wr->state == IB_MAD_STATE_QUEUED)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ else
+ change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP);
}
static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
@@ -1784,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_send_wc mad_send_wc;
unsigned long flags;
+ bool is_mad_done;
int ret;
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
@@ -1832,6 +2086,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
} else {
ib_mark_mad_done(mad_send_wr);
+ is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
/* Defined behavior is to complete response before request */
@@ -1841,10 +2096,13 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
mad_recv_wc);
deref_mad_agent(mad_agent_priv);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ if (is_mad_done) {
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr,
+ &mad_send_wc);
+ }
}
} else {
mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
@@ -2172,30 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wr_private *temp_mad_send_wr;
- struct list_head *list_item;
unsigned long delay;
mad_agent_priv = mad_send_wr->mad_agent_priv;
- list_del(&mad_send_wr->agent_list);
-
delay = mad_send_wr->timeout;
- mad_send_wr->timeout += jiffies;
-
- if (delay) {
- list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
- temp_mad_send_wr = list_entry(list_item,
- struct ib_mad_send_wr_private,
- agent_list);
- if (time_after(mad_send_wr->timeout,
- temp_mad_send_wr->timeout))
- break;
- }
- } else {
- list_item = &mad_agent_priv->wait_list;
- }
-
- list_add(&mad_send_wr->agent_list, list_item);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP);
/* Reschedule a work item if we have a shorter timeout */
if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
@@ -2229,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
} else
ret = IB_RMPP_RESULT_UNHANDLED;
- if (mad_send_wc->status != IB_WC_SUCCESS &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = mad_send_wc->status;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
-
- if (--mad_send_wr->refcount > 0) {
- if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- wait_for_response(mad_send_wr);
- }
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ mad_send_wc->status = IB_WC_WR_FLUSH_ERR;
+ else if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->timeout) {
+ wait_for_response(mad_send_wr);
goto done;
}
/* Remove send from MAD agent and notify client of completion */
- list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->state != IB_MAD_STATE_DONE)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
adjust_timeout(mad_agent_priv);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- if (mad_send_wr->status != IB_WC_SUCCESS)
- mad_send_wc->status = mad_send_wr->status;
- if (ret == IB_RMPP_RESULT_INTERNAL)
+ if (ret == IB_RMPP_RESULT_INTERNAL) {
ib_rmpp_send_handler(mad_send_wc);
- else
+ } else {
+ if (mad_send_wr->is_solicited_fc)
+ process_backlog_mads(mad_agent_priv);
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
mad_send_wc);
+ }
/* Release reference on agent taken when sending */
deref_mad_agent(mad_agent_priv);
@@ -2396,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
return true;
}
+static void clear_mad_error_list(struct list_head *list,
+ enum ib_wc_status wc_status,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *n;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_send_wc.status = wc_status;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, n, list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
{
unsigned long flags;
struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
- struct ib_mad_send_wc mad_send_wc;
struct list_head cancel_list;
INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &mad_agent_priv->send_list, agent_list) {
- if (mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
- }
-
- /* Empty wait list to prevent receives from finding a request */
- list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ &mad_agent_priv->send_list, agent_list)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
- /* Report all cancelled requests */
- mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
- mad_send_wc.vendor_err = 0;
+ /* Empty wait & backlog list to prevent receives from finding request */
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->wait_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
+ }
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &cancel_list, agent_list) {
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- list_del(&mad_send_wr->agent_list);
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- deref_mad_agent(mad_agent_priv);
+ &mad_agent_priv->backlog_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
}
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ /* Report all cancelled requests */
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
static struct ib_mad_send_wr_private*
@@ -2451,6 +2699,13 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
&mad_send_wr->send_buf == send_buf)
return mad_send_wr;
}
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
return NULL;
}
@@ -2468,16 +2723,16 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
struct ib_mad_agent_private, agent);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
- if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return -EINVAL;
}
- active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
- if (!timeout_ms) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
+ active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) ||
+ (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) ||
+ (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms));
+ if (!timeout_ms)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
mad_send_wr->send_buf.timeout_ms = timeout_ms;
if (active)
@@ -2589,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->send_buf.retries++;
mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->is_solicited_fc &&
+ !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ return 0;
+ }
if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
ret = ib_retry_rmpp(mad_send_wr);
@@ -2606,26 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
} else
ret = ib_send_mad(mad_send_wr);
- if (!ret) {
- mad_send_wr->refcount++;
- list_add_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->send_list);
- }
+ if (!ret)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+
return ret;
}
static void timeout_sends(struct work_struct *work)
{
- struct ib_mad_send_wr_private *mad_send_wr, *n;
+ struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wc mad_send_wc;
- struct list_head local_list;
+ struct list_head timeout_list;
+ struct list_head cancel_list;
+ struct list_head *list_item;
unsigned long flags, delay;
mad_agent_priv = container_of(work, struct ib_mad_agent_private,
timed_work.work);
- mad_send_wc.vendor_err = 0;
- INIT_LIST_HEAD(&local_list);
+ INIT_LIST_HEAD(&timeout_list);
+ INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->wait_list)) {
@@ -2643,25 +2902,22 @@ static void timeout_sends(struct work_struct *work)
break;
}
- list_del_init(&mad_send_wr->agent_list);
- if (mad_send_wr->status == IB_WC_SUCCESS &&
- !retry_send(mad_send_wr))
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ list_item = &cancel_list;
+ else if (retry_send(mad_send_wr))
+ list_item = &timeout_list;
+ else
continue;
- list_add_tail(&mad_send_wr->agent_list, &local_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, list_item);
}
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) {
- if (mad_send_wr->status == IB_WC_SUCCESS)
- mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
- else
- mad_send_wc.status = mad_send_wr->status;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- deref_mad_agent(mad_agent_priv);
- }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ process_backlog_mads(mad_agent_priv);
+ clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR,
+ mad_agent_priv);
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
/*
@@ -2671,11 +2927,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad)
{
unsigned long flags;
- int post, ret;
struct ib_mad_private *mad_priv;
struct ib_sge sg_list;
struct ib_recv_wr recv_wr;
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+ int ret = 0;
/* Initialize common scatter list fields */
sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
@@ -2685,7 +2941,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
recv_wr.sg_list = &sg_list;
recv_wr.num_sge = 1;
- do {
+ while (true) {
/* Allocate and map receive buffer */
if (mad) {
mad_priv = mad;
@@ -2693,10 +2949,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
} else {
mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
GFP_ATOMIC);
- if (!mad_priv) {
- ret = -ENOMEM;
- break;
- }
+ if (!mad_priv)
+ return -ENOMEM;
}
sg_list.length = mad_priv_dma_size(mad_priv);
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
@@ -2705,37 +2959,41 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
sg_list.addr))) {
- kfree(mad_priv);
ret = -ENOMEM;
- break;
+ goto free_mad_priv;
}
mad_priv->header.mapping = sg_list.addr;
mad_priv->header.mad_list.mad_queue = recv_queue;
mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
-
- /* Post receive WR */
spin_lock_irqsave(&recv_queue->lock, flags);
- post = (++recv_queue->count < recv_queue->max_active);
- list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ if (recv_queue->count >= recv_queue->max_active) {
+ /* Fully populated the receive queue */
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ break;
+ }
+ recv_queue->count++;
+ list_add_tail(&mad_priv->header.mad_list.list,
+ &recv_queue->list);
spin_unlock_irqrestore(&recv_queue->lock, flags);
+
ret = ib_post_recv(qp_info->qp, &recv_wr, NULL);
if (ret) {
spin_lock_irqsave(&recv_queue->lock, flags);
list_del(&mad_priv->header.mad_list.list);
recv_queue->count--;
spin_unlock_irqrestore(&recv_queue->lock, flags);
- ib_dma_unmap_single(qp_info->port_priv->device,
- mad_priv->header.mapping,
- mad_priv_dma_size(mad_priv),
- DMA_FROM_DEVICE);
- kfree(mad_priv);
dev_err(&qp_info->port_priv->device->dev,
"ib_post_recv failed: %d\n", ret);
break;
}
- } while (post);
+ }
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE);
+free_mad_priv:
+ kfree(mad_priv);
return ret;
}
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 1b7445a6f671..f444357d33f4 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -95,13 +95,16 @@ struct ib_mad_agent_private {
spinlock_t lock;
struct list_head send_list;
+ unsigned int sol_fc_send_count;
struct list_head wait_list;
- struct list_head done_list;
+ unsigned int sol_fc_wait_count;
struct delayed_work timed_work;
unsigned long timeout;
struct list_head local_list;
struct work_struct local_work;
struct list_head rmpp_list;
+ unsigned int sol_fc_max;
+ struct list_head backlog_list;
refcount_t refcount;
union {
@@ -118,6 +121,32 @@ struct ib_mad_snoop_private {
struct completion comp;
};
+enum ib_mad_state {
+ /* MAD is in the making and is not yet in any list */
+ IB_MAD_STATE_INIT,
+ /* MAD is in backlog list */
+ IB_MAD_STATE_QUEUED,
+ /*
+ * MAD was sent to the QP and is waiting for completion
+ * notification in send list.
+ */
+ IB_MAD_STATE_SEND_START,
+ /*
+ * MAD send completed successfully, waiting for a response
+ * in wait list.
+ */
+ IB_MAD_STATE_WAIT_RESP,
+ /*
+ * Response came early, before send completion notification,
+ * in send list.
+ */
+ IB_MAD_STATE_EARLY_RESP,
+ /* MAD was canceled while in wait or send list */
+ IB_MAD_STATE_CANCELED,
+ /* MAD processing completed, MAD in no list */
+ IB_MAD_STATE_DONE
+};
+
struct ib_mad_send_wr_private {
struct ib_mad_list_head mad_list;
struct list_head agent_list;
@@ -132,8 +161,6 @@ struct ib_mad_send_wr_private {
int max_retries;
int retries_left;
int retry;
- int refcount;
- enum ib_wc_status status;
/* RMPP control */
struct list_head rmpp_list;
@@ -143,8 +170,48 @@ struct ib_mad_send_wr_private {
int seg_num;
int newwin;
int pad;
+
+ enum ib_mad_state state;
+
+ /* Solicited MAD flow control */
+ bool is_solicited_fc;
};
+static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state);
+}
+
+static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2);
+}
+
+static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2,
+ enum ib_mad_state expected_state3)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2 &&
+ mad_send_wr->state != expected_state3);
+}
+
+static inline void
+not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state wrong_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state == wrong_state);
+}
+
struct ib_mad_local_private {
struct list_head completion_list;
struct ib_mad_private *mad_priv;
@@ -222,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
unsigned long timeout_ms);
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state);
+
#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 8af0619a39cd..1c5e0eaf1c94 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
- return (void *) ah;
+ return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
@@ -608,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent,
goto out; /* Unmatched send */
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_REM_ABORT_ERR;
- wc.vendor_err = rmpp_status;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
out:
spin_unlock_irqrestore(&agent->lock, flags);
@@ -684,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
}
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
if (seg_num > mad_send_wr->send_buf.seg_count ||
@@ -709,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
struct ib_mad_send_wc wc;
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_SUCCESS;
- wc.vendor_err = 0;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
}
- if (mad_send_wr->refcount == 1)
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
ib_reset_mad_timeout(mad_send_wr,
mad_send_wr->send_buf.timeout_ms);
spin_unlock_irqrestore(&agent->lock, flags);
ack_ds_ack(agent, mad_recv_wc);
return;
- } else if (mad_send_wr->refcount == 1 &&
+ } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP &&
mad_send_wr->seg_num < mad_send_wr->newwin &&
mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
/* Send failure will just result in a timeout/retry */
@@ -731,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
if (ret)
goto out;
- mad_send_wr->refcount++;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
list_move_tail(&mad_send_wr->agent_list,
&mad_send_wr->mad_agent_priv->send_list);
}
@@ -890,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->newwin = init_newwin(mad_send_wr);
/* We need to wait for the final ACK even if there isn't a response */
- mad_send_wr->refcount += (mad_send_wr->timeout == 0);
ret = send_next_seg(mad_send_wr);
if (!ret)
return IB_RMPP_RESULT_CONSUMED;
@@ -912,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
if (mad_send_wc->status != IB_WC_SUCCESS ||
- mad_send_wr->status != IB_WC_SUCCESS)
+ mad_send_wr->state == IB_MAD_STATE_CANCELED)
return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
if (!mad_send_wr->timeout)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index ff121e59b9c0..2220a2dfab24 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -171,6 +171,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING },
[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 },
[RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -254,7 +255,7 @@ EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex);
bool rdma_nl_get_privileged_qkey(void)
{
- return privileged_qkey || capable(CAP_NET_RAW);
+ return privileged_qkey;
}
EXPORT_SYMBOL(rdma_nl_get_privileged_qkey);
@@ -1468,10 +1469,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
};
-static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- enum rdma_restrack_type res_type,
- res_fill_func_t fill_func)
+static noinline_for_stack int
+res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
{
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
@@ -2028,6 +2030,7 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg,
struct ib_device *device, u32 port)
{
u32 mode, mask = 0, qpn, cntn = 0;
+ bool opcnt = false;
int ret;
/* Currently only counter for QP is supported */
@@ -2035,12 +2038,17 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg,
nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED])
+ opcnt = !!nla_get_u8(
+ tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]);
+
mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
if (mode == RDMA_COUNTER_MODE_AUTO) {
if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
mask = nla_get_u32(
tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
- return rdma_counter_set_auto_mode(device, port, mask, extack);
+ return rdma_counter_set_auto_mode(device, port, mask, opcnt,
+ extack);
}
if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
@@ -2256,10 +2264,10 @@ err:
return ret;
}
-static int stat_get_doit_default_counter(struct sk_buff *skb,
- struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[])
{
struct rdma_hw_stats *stats;
struct nlattr *table_attr;
@@ -2349,8 +2357,9 @@ err:
return ret;
}
-static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack, struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, struct nlattr *tb[])
{
static enum rdma_nl_counter_mode mode;
@@ -2358,6 +2367,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
struct ib_device *device;
struct sk_buff *msg;
u32 index, port;
+ bool opcnt;
int ret;
if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
@@ -2393,7 +2403,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_msg;
}
- ret = rdma_counter_get_mode(device, port, &mode, &mask);
+ ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt);
if (ret)
goto err_msg;
@@ -2410,6 +2420,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_msg;
}
+ if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
@@ -2833,8 +2849,8 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
enum rdma_nl_notify_event_type type)
{
struct sk_buff *skb;
+ int ret = -EMSGSIZE;
struct net *net;
- int ret = 0;
void *nlh;
net = read_pnet(&device->coredev.rdma_net);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 90c177edf9b0..18918f463361 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj,
WARN_ON(true);
}
}
+
+/**
+ * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the
+ * uverbs attributes file has CAP_NET_RAW
+ * capability or not.
+ *
+ * @attrs: Pointer to uverbs attributes
+ *
+ * Returns true if a rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false.
+ */
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_ucontext *ucontext;
+ bool has_cap = false;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu);
+ ucontext = ib_uverbs_get_ucontext_file(ufile);
+ if (IS_ERR(ucontext))
+ goto out;
+ has_cap = rdma_dev_has_raw_cap(ucontext->device);
+
+out:
+ srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key);
+ return has_cap;
+}
+EXPORT_SYMBOL(rdma_uattrs_has_raw_cap);
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 33706dad6c0f..a59b087611cb 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
extern const struct uapi_definition uverbs_def_obj_cq[];
extern const struct uapi_definition uverbs_def_obj_device[];
extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_dmah[];
extern const struct uapi_definition uverbs_def_obj_flow_action[];
extern const struct uapi_definition uverbs_def_obj_intf[];
extern const struct uapi_definition uverbs_def_obj_mr[];
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3313410014cd..b097cfcade1c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
return container_of(res, struct rdma_counter, res)->device;
case RDMA_RESTRACK_SRQ:
return container_of(res, struct ib_srq, res)->device;
+ case RDMA_RESTRACK_DMAH:
+ return container_of(res, struct ib_dmah, res)->device;
default:
WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
return NULL;
@@ -173,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res,
EXPORT_SYMBOL(rdma_restrack_new);
/**
- * rdma_restrack_add() - add object to the reource tracking database
+ * rdma_restrack_add() - add object to the resource tracking database
* @res: resource entry
*/
void rdma_restrack_add(struct rdma_restrack_entry *res)
@@ -275,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res)
EXPORT_SYMBOL(rdma_restrack_put);
/**
- * rdma_restrack_del() - delete object from the reource tracking database
+ * rdma_restrack_del() - delete object from the resource tracking database
* @res: resource entry
*/
void rdma_restrack_del(struct rdma_restrack_entry *res)
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 53571e6b3162..c23e9c847314 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -107,6 +107,8 @@ struct ib_sa_device {
struct ib_sa_query {
void (*callback)(struct ib_sa_query *sa_query, int status,
struct ib_sa_mad *mad);
+ void (*rmpp_callback)(struct ib_sa_query *sa_query, int status,
+ struct ib_mad_recv_wc *mad);
void (*release)(struct ib_sa_query *);
struct ib_sa_client *client;
struct ib_sa_port *port;
@@ -150,6 +152,13 @@ struct ib_sa_mcmember_query {
struct ib_sa_query sa_query;
};
+struct ib_sa_service_query {
+ void (*callback)(int status, struct sa_service_rec *rec,
+ unsigned int num_services, void *context);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
static LIST_HEAD(ib_nl_request_list);
static DEFINE_SPINLOCK(ib_nl_request_lock);
static atomic_t ib_nl_sa_request_seq;
@@ -684,6 +693,58 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct sa_service_rec, field), \
+ .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \
+ .field_name = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 6,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 512 },
+ { SERVICE_REC_FIELD(data_8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(data_64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 128 },
+};
+
#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3
static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
@@ -1013,6 +1074,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+
delta = timeout - sa_local_svc_timeout_ms;
if (delta < 0)
abs_delta = -delta;
@@ -1020,7 +1083,6 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
abs_delta = delta;
if (delta != 0) {
- spin_lock_irqsave(&ib_nl_request_lock, flags);
sa_local_svc_timeout_ms = timeout;
list_for_each_entry(query, &ib_nl_request_list, list) {
if (delta < 0 && abs_delta > query->timeout)
@@ -1038,9 +1100,10 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
if (delay)
mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
(unsigned long)delay);
- spin_unlock_irqrestore(&ib_nl_request_lock, flags);
}
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
settimeout_out:
return 0;
}
@@ -1390,6 +1453,20 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute)
}
EXPORT_SYMBOL(ib_sa_pack_path);
+void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute)
+{
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec,
+ attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_service);
+
+void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec)
+{
+ ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute,
+ rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_service);
+
static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
struct ib_sa_device *sa_dev,
u32 port_num)
@@ -1479,6 +1556,68 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
}
}
+#define IB_SA_DATA_OFFS 56
+#define IB_SERVICE_REC_SZ 176
+
+static void ib_unpack_service_rmpp(struct sa_service_rec *rec,
+ struct ib_mad_recv_wc *mad_wc,
+ int num_services)
+{
+ unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0;
+ struct ib_mad_recv_buf *mad_buf;
+ u8 buf[IB_SERVICE_REC_SZ];
+ u8 *data;
+
+ data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data);
+
+ list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) {
+ data = ((struct ib_sa_mad *) mad_buf->mad)->data;
+ data_i = 0;
+ while (data_i < data_size && rec_i < num_services) {
+ cp_sz = min(IB_SERVICE_REC_SZ - buf_i,
+ data_size - data_i);
+ memcpy(buf + buf_i, data + data_i, cp_sz);
+ data_i += cp_sz;
+ buf_i += cp_sz;
+ if (buf_i == IB_SERVICE_REC_SZ) {
+ ib_sa_unpack_service(buf, rec + rec_i);
+ buf_i = 0;
+ rec_i++;
+ }
+ }
+ }
+}
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status,
+ struct ib_mad_recv_wc *mad_wc)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+ struct sa_service_rec *rec;
+ int num_services;
+
+ if (!mad_wc || !mad_wc->recv_buf.mad) {
+ query->callback(status, NULL, 0, query->context);
+ return;
+ }
+
+ num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ;
+ if (!num_services) {
+ query->callback(-ENODATA, NULL, 0, query->context);
+ return;
+ }
+
+ rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL);
+ if (!rec) {
+ query->callback(-ENOMEM, NULL, 0, query->context);
+ return;
+ }
+
+ ib_unpack_service_rmpp(rec, mad_wc, num_services);
+ query->callback(status, rec, num_services, query->context);
+ kfree(rec);
+}
+
static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
{
struct ib_sa_path_query *query =
@@ -1488,6 +1627,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
kfree(query);
}
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+ kfree(query);
+}
+
/**
* ib_sa_path_rec_get - Start a Path get query
* @client:SA client
@@ -1618,6 +1765,101 @@ err1:
}
EXPORT_SYMBOL(ib_sa_path_rec_get);
+/**
+ * ib_sa_service_rec_get - Start a Service get query
+ * @client: SA client
+ * @device: device to send query on
+ * @port_num: port number to send query on
+ * @rec: Service Record to send in query
+ * @comp_mask: component mask to send in query
+ * @timeout_ms: time to wait for response
+ * @gfp_mask: GFP mask to use for internal allocations
+ * @callback: function called when query completes, times out or is
+ * canceled
+ * @context: opaque user context passed to callback
+ * @sa_query: query context, used to cancel query
+ *
+ * Send a Service Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct sa_service_rec *resp,
+ unsigned int num_services,
+ void *context),
+ void *context, struct ib_sa_query **sa_query)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_service_query *query;
+ struct ib_mad_agent *agent;
+ struct ib_sa_port *port;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.rmpp_callback = callback ? ib_sa_service_rec_callback :
+ NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_sa_pack_service(rec, mad->data);
+
+ *sa_query = &query->sa_query;
+ query->sa_query.mad_buf->context[1] = rec;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_get);
+
static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
int status, struct ib_sa_mad *mad)
{
@@ -1987,23 +2229,29 @@ static void send_handler(struct ib_mad_agent *agent,
{
struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
unsigned long flags;
+ int status = 0;
- if (query->callback)
+ if (query->callback || query->rmpp_callback) {
switch (mad_send_wc->status) {
case IB_WC_SUCCESS:
/* No callback -- already got recv */
break;
case IB_WC_RESP_TIMEOUT_ERR:
- query->callback(query, -ETIMEDOUT, NULL);
+ status = -ETIMEDOUT;
break;
case IB_WC_WR_FLUSH_ERR:
- query->callback(query, -EINTR, NULL);
+ status = -EINTR;
break;
default:
- query->callback(query, -EIO, NULL);
+ status = -EIO;
break;
}
+ if (status)
+ query->callback ? query->callback(query, status, NULL) :
+ query->rmpp_callback(query, status, NULL);
+ }
+
xa_lock_irqsave(&queries, flags);
__xa_erase(&queries, query->id);
xa_unlock_irqrestore(&queries, flags);
@@ -2019,17 +2267,25 @@ static void recv_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_sa_query *query;
+ struct ib_mad *mad;
+
if (!send_buf)
return;
query = send_buf->context[0];
- if (query->callback) {
+ mad = mad_recv_wc->recv_buf.mad;
+
+ if (query->rmpp_callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->rmpp_callback(query, mad->mad_hdr.status ?
+ -EINVAL : 0, mad_recv_wc);
+ else
+ query->rmpp_callback(query, -EIO, NULL);
+ } else if (query->callback) {
if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
- query->callback(query,
- mad_recv_wc->recv_buf.mad->mad_hdr.status ?
- -EINVAL : 0,
- (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ query->callback(query, mad->mad_hdr.status ?
+ -EINVAL : 0, (struct ib_sa_mad *)mad);
else
query->callback(query, -EIO, NULL);
}
@@ -2181,8 +2437,9 @@ static int ib_sa_add_one(struct ib_device *device)
sa_dev->port[i].agent =
ib_register_mad_agent(device, i + s, IB_QPT_GSI,
- NULL, 0, send_handler,
- recv_handler, sa_dev, 0);
+ NULL, IB_MGMT_RMPP_VERSION,
+ send_handler, recv_handler,
+ sa_dev, 0);
if (IS_ERR(sa_dev->port[i].agent)) {
ret = PTR_ERR(sa_dev->port[i].agent);
goto err;
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 9f97bef02149..0ed862b38b44 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -216,24 +216,12 @@ static ssize_t state_show(struct ib_device *ibdev, u32 port_num,
struct ib_port_attr attr;
ssize_t ret;
- static const char *state_name[] = {
- [IB_PORT_NOP] = "NOP",
- [IB_PORT_DOWN] = "DOWN",
- [IB_PORT_INIT] = "INIT",
- [IB_PORT_ARMED] = "ARMED",
- [IB_PORT_ACTIVE] = "ACTIVE",
- [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
- };
-
ret = ib_query_port(ibdev, port_num, &attr);
if (ret)
return ret;
return sysfs_emit(buf, "%d: %s\n", attr.state,
- attr.state >= 0 &&
- attr.state < ARRAY_SIZE(state_name) ?
- state_name[attr.state] :
- "UNKNOWN");
+ ib_port_state_to_str(attr.state));
}
static ssize_t lid_show(struct ib_device *ibdev, u32 port_num,
@@ -988,6 +976,7 @@ int ib_setup_device_attrs(struct ib_device *ibdev)
for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++)
if (!ibdev->groups[i]) {
ibdev->groups[i] = &data->group;
+ ibdev->hw_stats_attr_index = i;
return 0;
}
WARN(true, "struct ib_device->groups is too small");
diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c
new file mode 100644
index 000000000000..de5cb8bf0a61
--- /dev/null
+++ b/drivers/infiniband/core/ucaps.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/kref.h>
+#include <linux/cdev.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <rdma/ib_ucaps.h>
+
+#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL
+
+static DEFINE_MUTEX(ucaps_mutex);
+static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX];
+static bool ucaps_class_is_registered;
+static dev_t ucaps_base_dev;
+
+struct ib_ucap {
+ struct cdev cdev;
+ struct device dev;
+ struct kref ref;
+};
+
+static const char *ucap_names[RDMA_UCAP_MAX] = {
+ [RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local",
+ [RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca"
+};
+
+static char *ucaps_devnode(const struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0600;
+
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static const struct class ucaps_class = {
+ .name = "infiniband_ucaps",
+ .devnode = ucaps_devnode,
+};
+
+static const struct file_operations ucaps_cdev_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+};
+
+/**
+ * ib_cleanup_ucaps - cleanup all API resources and class.
+ *
+ * This is called once, when removing the ib_uverbs module.
+ */
+void ib_cleanup_ucaps(void)
+{
+ mutex_lock(&ucaps_mutex);
+ if (!ucaps_class_is_registered) {
+ mutex_unlock(&ucaps_mutex);
+ return;
+ }
+
+ for (int i = RDMA_UCAP_FIRST; i < RDMA_UCAP_MAX; i++)
+ WARN_ON(ucaps_list[i]);
+
+ class_unregister(&ucaps_class);
+ ucaps_class_is_registered = false;
+ unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX);
+ mutex_unlock(&ucaps_mutex);
+}
+
+static int get_ucap_from_devt(dev_t devt, u64 *idx_mask)
+{
+ for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+ if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) {
+ *idx_mask |= 1 << type;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ *ret_dev = file_inode(file)->i_rdev;
+ fput(file);
+ return 0;
+}
+
+/**
+ * ib_ucaps_init - Initialization required before ucap creation.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+static int ib_ucaps_init(void)
+{
+ int ret = 0;
+
+ if (ucaps_class_is_registered)
+ return ret;
+
+ ret = class_register(&ucaps_class);
+ if (ret)
+ return ret;
+
+ ret = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX,
+ ucaps_class.name);
+ if (ret < 0) {
+ class_unregister(&ucaps_class);
+ return ret;
+ }
+
+ ucaps_class_is_registered = true;
+
+ return 0;
+}
+
+static void ucap_dev_release(struct device *device)
+{
+ struct ib_ucap *ucap = container_of(device, struct ib_ucap, dev);
+
+ kfree(ucap);
+}
+
+/**
+ * ib_create_ucap - Add a ucap character device
+ * @type: UCAP type
+ *
+ * Creates a ucap character device in the /dev/infiniband directory. By default,
+ * the device has root-only read-write access.
+ *
+ * A driver may call this multiple times with the same UCAP type. A reference
+ * count tracks creations and deletions.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_create_ucap(enum rdma_user_cap type)
+{
+ struct ib_ucap *ucap;
+ int ret;
+
+ if (type >= RDMA_UCAP_MAX)
+ return -EINVAL;
+
+ mutex_lock(&ucaps_mutex);
+ ret = ib_ucaps_init();
+ if (ret)
+ goto unlock;
+
+ ucap = ucaps_list[type];
+ if (ucap) {
+ kref_get(&ucap->ref);
+ mutex_unlock(&ucaps_mutex);
+ return 0;
+ }
+
+ ucap = kzalloc(sizeof(*ucap), GFP_KERNEL);
+ if (!ucap) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ device_initialize(&ucap->dev);
+ ucap->dev.class = &ucaps_class;
+ ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type);
+ ucap->dev.release = ucap_dev_release;
+ ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]);
+ if (ret)
+ goto err_device;
+
+ cdev_init(&ucap->cdev, &ucaps_cdev_fops);
+ ucap->cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&ucap->cdev, &ucap->dev);
+ if (ret)
+ goto err_device;
+
+ kref_init(&ucap->ref);
+ ucaps_list[type] = ucap;
+ mutex_unlock(&ucaps_mutex);
+
+ return 0;
+
+err_device:
+ put_device(&ucap->dev);
+unlock:
+ mutex_unlock(&ucaps_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_create_ucap);
+
+static void ib_release_ucap(struct kref *ref)
+{
+ struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref);
+ enum rdma_user_cap type;
+
+ for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+ if (ucaps_list[type] == ucap)
+ break;
+ }
+ WARN_ON(type == RDMA_UCAP_MAX);
+
+ ucaps_list[type] = NULL;
+ cdev_device_del(&ucap->cdev, &ucap->dev);
+ put_device(&ucap->dev);
+}
+
+/**
+ * ib_remove_ucap - Remove a ucap character device
+ * @type: User cap type
+ *
+ * Removes the ucap character device according to type. The device is completely
+ * removed from the filesystem when its reference count reaches 0.
+ */
+void ib_remove_ucap(enum rdma_user_cap type)
+{
+ struct ib_ucap *ucap;
+
+ mutex_lock(&ucaps_mutex);
+ ucap = ucaps_list[type];
+ if (WARN_ON(!ucap))
+ goto end;
+
+ kref_put(&ucap->ref, ib_release_ucap);
+end:
+ mutex_unlock(&ucaps_mutex);
+}
+EXPORT_SYMBOL(ib_remove_ucap);
+
+/**
+ * ib_get_ucaps - Get bitmask of ucap types from file descriptors
+ * @fds: Array of file descriptors
+ * @fd_count: Number of file descriptors in the array
+ * @idx_mask: Bitmask to be updated based on the ucaps in the fd list
+ *
+ * Given an array of file descriptors, this function returns a bitmask of
+ * the ucaps where a bit is set if an FD for that ucap type was in the array.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask)
+{
+ int ret = 0;
+ dev_t dev;
+
+ *idx_mask = 0;
+ mutex_lock(&ucaps_mutex);
+ for (int i = 0; i < fd_count; i++) {
+ ret = get_devt_from_fd(fds[i], &dev);
+ if (ret)
+ goto end;
+
+ ret = get_ucap_from_devt(dev, idx_mask);
+ if (ret)
+ goto end;
+ }
+
+end:
+ mutex_unlock(&ucaps_mutex);
+ return ret;
+}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 02f1666f3cba..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -69,7 +69,9 @@ static struct ctl_table ucma_ctl_table[] = {
.data = &max_backlog,
.maxlen = sizeof max_backlog,
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
};
@@ -280,6 +282,10 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx,
}
uevent->resp.event = event->event;
uevent->resp.status = event->status;
+
+ if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED)
+ goto out;
+
if (ctx->cm_id->qp_type == IB_QPT_UD)
ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud,
&event->param.ud);
@@ -287,6 +293,7 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx,
ucma_copy_conn_event(&uevent->resp.param.conn,
&event->param.conn);
+out:
uevent->resp.ece.vendor_id = event->ece.vendor_id;
uevent->resp.ece.attr_mod = event->ece.attr_mod;
return uevent;
@@ -359,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
xa_lock(&ctx_table);
if (xa_load(&ctx_table, ctx->id) == ctx)
- queue_work(system_unbound_wq, &ctx->close_work);
+ queue_work(system_dfl_wq, &ctx->close_work);
xa_unlock(&ctx_table);
}
return 0;
@@ -726,6 +733,28 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file,
return ret;
}
+static ssize_t ucma_resolve_ib_service(struct ucma_file *file,
+ const char __user *inbuf, int in_len,
+ int out_len)
+{
+ struct rdma_ucm_resolve_ib_service cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
static ssize_t ucma_resolve_route(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -992,6 +1021,43 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx,
return ret;
}
+static ssize_t ucma_query_ib_service(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_ib_service_resp *resp;
+ int n, ret = 0;
+
+ if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp))
+ return -ENOSPC;
+
+ if (!ctx->cm_id->route.service_recs)
+ return -ENODATA;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_service_recs = ctx->cm_id->route.num_service_recs;
+
+ n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) /
+ sizeof(struct ib_user_service_rec);
+
+ if (!n)
+ goto out;
+
+ if (n > ctx->cm_id->route.num_service_recs)
+ n = ctx->cm_id->route.num_service_recs;
+
+ memcpy(resp->recs, ctx->cm_id->route.service_recs,
+ sizeof(*resp->recs) * n);
+ if (copy_to_user(response, resp, struct_size(resp, recs, n)))
+ ret = -EFAULT;
+
+out:
+ kfree(resp);
+ return ret;
+}
+
static ssize_t ucma_query(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -1020,6 +1086,9 @@ static ssize_t ucma_query(struct ucma_file *file,
case RDMA_USER_CM_QUERY_GID:
ret = ucma_query_gid(ctx, response, out_len);
break;
+ case RDMA_USER_CM_QUERY_IB_SERVICE:
+ ret = ucma_query_ib_service(ctx, response, out_len);
+ break;
default:
ret = -ENOSYS;
break;
@@ -1676,6 +1745,55 @@ err_unlock:
return ret;
}
+static ssize_t ucma_write_cm_event(struct ucma_file *file,
+ const char __user *inbuf, int in_len,
+ int out_len)
+{
+ struct rdma_ucm_write_cm_event cmd;
+ struct rdma_cm_event event = {};
+ struct ucma_event *uevent;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if ((cmd.event != RDMA_CM_EVENT_USER) &&
+ (cmd.event != RDMA_CM_EVENT_INTERNAL))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ event.event = cmd.event;
+ event.status = cmd.status;
+ event.param.arg = cmd.param.arg;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ uevent->ctx = ctx;
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ uevent->resp.event = event.event;
+ uevent->resp.status = event.status;
+ memcpy(uevent->resp.param.arg32, &event.param.arg,
+ sizeof(event.param.arg));
+
+ mutex_lock(&ctx->file->mut);
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ mutex_unlock(&ctx->file->mut);
+ wake_up_interruptible(&ctx->file->poll_wait);
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len) = {
@@ -1701,7 +1819,9 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
[RDMA_USER_CM_CMD_QUERY] = ucma_query,
[RDMA_USER_CM_CMD_BIND] = ucma_bind,
[RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
- [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast,
+ [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service,
+ [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event,
};
static ssize_t ucma_write(struct file *filp, const char __user *buf,
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 64d9c492de64..8d3dfef9ebaa 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header,
return len;
}
EXPORT_SYMBOL(ib_ud_header_pack);
-
-/**
- * ib_ud_header_unpack - Unpack UD header struct from wire format
- * @header:UD header struct
- * @buf:Buffer to pack into
- *
- * ib_ud_header_pack() unpacks the UD header structure @header from wire
- * format in the buffer @buf.
- */
-int ib_ud_header_unpack(void *buf,
- struct ib_ud_header *header)
-{
- ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
- buf, &header->lrh);
- buf += IB_LRH_BYTES;
-
- if (header->lrh.link_version != 0) {
- pr_warn("Invalid LRH.link_version %u\n",
- header->lrh.link_version);
- return -EINVAL;
- }
-
- switch (header->lrh.link_next_header) {
- case IB_LNH_IBA_LOCAL:
- header->grh_present = 0;
- break;
-
- case IB_LNH_IBA_GLOBAL:
- header->grh_present = 1;
- ib_unpack(grh_table, ARRAY_SIZE(grh_table),
- buf, &header->grh);
- buf += IB_GRH_BYTES;
-
- if (header->grh.ip_version != 6) {
- pr_warn("Invalid GRH.ip_version %u\n",
- header->grh.ip_version);
- return -EINVAL;
- }
- if (header->grh.next_header != 0x1b) {
- pr_warn("Invalid GRH.next_header 0x%02x\n",
- header->grh.next_header);
- return -EINVAL;
- }
- break;
-
- default:
- pr_warn("Invalid LRH.link_next_header %u\n",
- header->lrh.link_next_header);
- return -EINVAL;
- }
-
- ib_unpack(bth_table, ARRAY_SIZE(bth_table),
- buf, &header->bth);
- buf += IB_BTH_BYTES;
-
- switch (header->bth.opcode) {
- case IB_OPCODE_UD_SEND_ONLY:
- header->immediate_present = 0;
- break;
- case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
- header->immediate_present = 1;
- break;
- default:
- pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
- return -EINVAL;
- }
-
- if (header->bth.transport_header_version != 0) {
- pr_warn("Invalid BTH.transport_header_version %u\n",
- header->bth.transport_header_version);
- return -EINVAL;
- }
-
- ib_unpack(deth_table, ARRAY_SIZE(deth_table),
- buf, &header->deth);
- buf += IB_DETH_BYTES;
-
- if (header->immediate_present)
- memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
-
- return 0;
-}
-EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 07c571c7b699..8137031c2a65 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,6 +45,8 @@
#include "uverbs.h"
+#define RESCHED_LOOP_CNT_THRESHOLD 0x1000
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
@@ -55,10 +57,14 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
DMA_BIDIRECTIONAL, 0);
- for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
+ if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
+ cond_resched();
+ }
+
sg_free_append_table(&umem->sgt_append);
}
@@ -80,9 +86,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
unsigned long pgsz_bitmap,
unsigned long virt)
{
- struct scatterlist *sg;
+ unsigned long curr_len = 0;
+ dma_addr_t curr_base = ~0;
unsigned long va, pgoff;
+ struct scatterlist *sg;
dma_addr_t mask;
+ dma_addr_t end;
int i;
umem->iova = va = virt;
@@ -107,17 +116,30 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
pgoff = umem->address & ~PAGE_MASK;
for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
- /* Walk SGL and reduce max page size if VA/PA bits differ
- * for any address.
+ /* If the current entry is physically contiguous with the previous
+ * one, no need to take its start addresses into consideration.
*/
- mask |= (sg_dma_address(sg) + pgoff) ^ va;
+ if (check_add_overflow(curr_base, curr_len, &end) ||
+ end != sg_dma_address(sg)) {
+
+ curr_base = sg_dma_address(sg);
+ curr_len = 0;
+
+ /* Reduce max page size if VA/PA bits differ */
+ mask |= (curr_base + pgoff) ^ va;
+
+ /* The alignment of any VA matching a discontinuity point
+ * in the physical memory sets the maximum possible page
+ * size as this must be a starting point of a new page that
+ * needs to be aligned.
+ */
+ if (i != 0)
+ mask |= va;
+ }
+
+ curr_len += sg_dma_len(sg);
va += sg_dma_len(sg) - pgoff;
- /* Except for the last entry, the ending iova alignment sets
- * the maximum possible page size as the low bits of the iova
- * must be zero when starting the next chunk.
- */
- if (i != (umem->sgt_append.sgt.nents - 1))
- mask |= va;
+
pgoff = 0;
}
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
index 9fcd37761264..0ec2e4120cc9 100644
--- a/drivers/infiniband/core/umem_dmabuf.c
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -10,7 +10,7 @@
#include "uverbs.h"
-MODULE_IMPORT_NS(DMA_BUF);
+MODULE_IMPORT_NS("DMA_BUF");
int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
{
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d31c23..572a91a62a7b 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,65 +41,83 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
- const struct mmu_interval_notifier_ops *ops)
+static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
- int ret;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+}
+
+static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t page_size = 1UL << umem_odp->page_shift;
+ struct hmm_dma_map *map;
+ unsigned long start;
+ unsigned long end;
+ size_t nr_entries;
+ int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
- if (!umem_odp->dma_list) {
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+ /*
+ * The mmu notifier can be called within reclaim contexts and takes the
+ * umem_mutex. This is rare to trigger in testing, teach lockdep about
+ * it.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ mutex_lock(&umem_odp->umem_mutex);
+ mutex_unlock(&umem_odp->umem_mutex);
+ fs_reclaim_release(GFP_KERNEL);
+ }
+
+ nr_entries = (end - start) >> PAGE_SHIFT;
+ if (!(nr_entries * PAGE_SIZE / page_size))
+ return -EINVAL;
+
+ map = &umem_odp->map;
+ if (ib_uses_virt_dma(dev)) {
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
ret = -ENOMEM;
- goto out_pfn_list;
- }
+ } else
+ ret = hmm_dma_map_alloc(dev->dma_device, map,
+ (end - start) >> PAGE_SHIFT,
+ 1 << umem_odp->page_shift);
+ if (ret)
+ return ret;
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_dma_list;
- }
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_map;
return 0;
-out_dma_list:
- kvfree(umem_odp->dma_list);
-out_pfn_list:
- kvfree(umem_odp->pfn_list);
+out_free_map:
+ if (ib_uses_virt_dma(dev))
+ kvfree(map->pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, map);
return ret;
}
@@ -118,7 +136,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
- int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@@ -130,16 +147,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
- umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- ret = ib_init_umem_odp(umem_odp, NULL);
- if (ret) {
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
- return ERR_PTR(ret);
- }
+ ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@@ -260,74 +271,41 @@ err_put_pid:
}
EXPORT_SYMBOL(ib_umem_odp_get);
-void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- if (!umem_odp->is_implicit_odp) {
- mutex_lock(&umem_odp->umem_mutex);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- mutex_unlock(&umem_odp->umem_mutex);
- mmu_interval_notifier_remove(&umem_odp->notifier);
- kvfree(umem_odp->dma_list);
- kvfree(umem_odp->pfn_list);
- }
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ if (ib_uses_virt_dma(dev))
+ kvfree(umem_odp->map.pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
-EXPORT_SYMBOL(ib_umem_odp_release);
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
+ if (!umem_odp->is_implicit_odp)
+ ib_umem_odp_free(umem_odp);
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- *dma_addr |= access_mask;
- return 0;
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -355,9 +333,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -383,11 +358,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
- range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@@ -415,22 +390,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+ continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -443,15 +413,6 @@ retry:
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -471,45 +432,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
- dma_addr_t dma;
- int idx;
- u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
- idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];
-
- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ u64 offset = addr - ib_umem_start(umem_odp);
+ size_t idx = offset >> umem_odp->page_shift;
+ unsigned long pfn = umem_odp->map.pfn_list[idx];
+
+ if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
+ goto clear;
+
+ if (pfn & HMM_PFN_WRITE) {
+ struct page *page = hmm_pfn_to_page(pfn);
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->npages--;
+clear:
+ umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 66b02fbf077a..ce16404cdfb8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -42,6 +42,7 @@
#include <rdma/uverbs_types.h>
#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_ucaps.h>
#include "rdma_core.h"
#include "uverbs.h"
@@ -161,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
{
const void __user *res = iter->cur;
- if (iter->cur + len > iter->end)
+ if (len > iter->end - iter->cur)
return (void __force __user *)ERR_PTR(-ENOSPC);
iter->cur += len;
return res;
@@ -192,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
- return (void *)uobj;
+ return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);
@@ -232,6 +233,8 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
{
struct ib_ucontext *ucontext = attrs->context;
struct ib_uverbs_file *file = attrs->ufile;
+ int *fd_array;
+ int fd_count;
int ret;
if (!down_read_trylock(&file->hw_destroy_rwsem))
@@ -247,6 +250,22 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
if (ret)
goto err;
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) {
+ fd_count = uverbs_attr_ptr_get_array_size(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ sizeof(int));
+ if (fd_count < 0) {
+ ret = fd_count;
+ goto err_uncharge;
+ }
+
+ fd_array = uverbs_attr_get_alloced_ptr(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR);
+ ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps);
+ if (ret)
+ goto err_uncharge;
+ }
+
ret = ucontext->device->ops.alloc_ucontext(ucontext,
&attrs->driver_udata);
if (ret)
@@ -716,13 +735,13 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
goto err_free;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags,
+ cmd.access_flags, NULL,
&attrs->driver_udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
@@ -807,8 +826,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
if (cmd.flags & IB_MR_REREG_PD) {
new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
attrs);
- if (!new_pd) {
- ret = -EINVAL;
+ if (IS_ERR(new_pd)) {
+ ret = PTR_ERR(new_pd);
goto put_uobjs;
}
} else {
@@ -917,8 +936,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
return PTR_ERR(uobj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
@@ -1125,8 +1144,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
if (ret)
@@ -1187,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
/* we copy a struct ib_uverbs_poll_cq_resp to user space */
header_ptr = attrs->ucore.outbuf;
@@ -1236,8 +1255,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ib_req_notify_cq(cq, cmd.solicited_only ?
IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
@@ -1293,9 +1312,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
switch (cmd->qp_type) {
case IB_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
- break;
+ fallthrough;
case IB_QPT_RC:
case IB_QPT_UC:
case IB_QPT_UD:
@@ -1319,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
ind_tbl = uobj_get_obj_read(rwq_ind_table,
UVERBS_OBJECT_RWQ_IND_TBL,
cmd->rwq_ind_tbl_handle, attrs);
- if (!ind_tbl) {
- ret = -EINVAL;
+ if (IS_ERR(ind_tbl)) {
+ ret = PTR_ERR(ind_tbl);
goto err_put;
}
@@ -1358,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
if (cmd->is_srq) {
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
cmd->srq_handle, attrs);
- if (!srq || srq->srq_type == IB_SRQT_XRC) {
- ret = -EINVAL;
+ if (IS_ERR(srq) ||
+ srq->srq_type == IB_SRQT_XRC) {
+ ret = IS_ERR(srq) ? PTR_ERR(srq) :
+ -EINVAL;
goto err_put;
}
}
@@ -1369,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
rcq = uobj_get_obj_read(
cq, UVERBS_OBJECT_CQ,
cmd->recv_cq_handle, attrs);
- if (!rcq) {
- ret = -EINVAL;
+ if (IS_ERR(rcq)) {
+ ret = PTR_ERR(rcq);
goto err_put;
}
}
}
}
- if (has_sq)
+ if (has_sq) {
scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->send_cq_handle, attrs);
+ if (IS_ERR(scq)) {
+ ret = PTR_ERR(scq);
+ goto err_put;
+ }
+ }
+
if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI)
rcq = rcq ?: scq;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
attrs);
- if (!pd || (!scq && has_sq)) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put;
}
@@ -1424,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
}
if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
- if (!capable(CAP_NET_RAW)) {
+ if (!rdma_uattrs_has_raw_cap(attrs)) {
ret = -EPERM;
goto err_put;
}
@@ -1480,18 +1507,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
err_put:
if (!IS_ERR(xrcd_uobj))
uobj_put_read(xrcd_uobj);
- if (pd)
+ if (!IS_ERR_OR_NULL(pd))
uobj_put_obj_read(pd);
- if (scq)
+ if (!IS_ERR_OR_NULL(scq))
rdma_lookup_put_uobject(&scq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (rcq && rcq != scq)
+ if (!IS_ERR_OR_NULL(rcq) && rcq != scq)
rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (srq)
+ if (!IS_ERR_OR_NULL(srq))
rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (ind_tbl)
+ if (!IS_ERR_OR_NULL(ind_tbl))
uobj_put_obj_read(ind_tbl);
uobj_alloc_abort(&obj->uevent.uobject, attrs);
@@ -1653,8 +1680,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -1759,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -1850,7 +1877,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
attr->path_mig_state = cmd->base.path_mig_state;
if (cmd->base.attr_mask & IB_QP_QKEY) {
if (cmd->base.qkey & IB_QP_SET_QKEY &&
- !rdma_nl_get_privileged_qkey()) {
+ !(rdma_nl_get_privileged_qkey() ||
+ rdma_uattrs_has_raw_cap(attrs))) {
ret = -EPERM;
goto release_qp;
}
@@ -2008,11 +2036,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
if (ret)
return ret;
- wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+ wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size,
+ cmd.wr_count));
if (IS_ERR(wqes))
return PTR_ERR(wqes);
- sgls = uverbs_request_next_ptr(
- &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(&iter,
+ size_mul(cmd.sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return PTR_ERR(sgls);
ret = uverbs_request_finish(&iter);
@@ -2024,8 +2054,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
return -ENOMEM;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2062,9 +2092,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
user_wr->wr.ud.ah, attrs);
- if (!ud->ah) {
+ if (IS_ERR(ud->ah)) {
+ ret = PTR_ERR(ud->ah);
kfree(ud);
- ret = -EINVAL;
goto out_put;
}
ud->remote_qpn = user_wr->wr.ud.remote_qpn;
@@ -2198,11 +2228,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
if (wqe_size < sizeof(struct ib_uverbs_recv_wr))
return ERR_PTR(-EINVAL);
- wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+ wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count));
if (IS_ERR(wqes))
return ERR_CAST(wqes);
- sgls = uverbs_request_next_ptr(
- iter, sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(iter, size_mul(sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return ERR_CAST(sgls);
ret = uverbs_request_finish(iter);
@@ -2301,8 +2331,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2352,8 +2382,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq) {
- ret = -EINVAL;
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
goto out;
}
@@ -2409,8 +2439,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err;
}
@@ -2479,8 +2509,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
@@ -2529,8 +2559,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
mutex_lock(&obj->mcast_lock);
@@ -2664,8 +2694,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_FLOW_ACTION,
kern_spec->action.handle,
attrs);
- if (!ib_spec->action.act)
- return -EINVAL;
+ if (IS_ERR(ib_spec->action.act))
+ return PTR_ERR(ib_spec->action.act);
ib_spec->action.size =
sizeof(struct ib_flow_spec_action_handle);
flow_resources_add(uflow_res,
@@ -2682,8 +2712,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_COUNTERS,
kern_spec->flow_count.handle,
attrs);
- if (!ib_spec->flow_count.counters)
- return -EINVAL;
+ if (IS_ERR(ib_spec->flow_count.counters))
+ return PTR_ERR(ib_spec->flow_count.counters);
ib_spec->flow_count.size =
sizeof(struct ib_flow_spec_action_count);
flow_resources_add(uflow_res,
@@ -2901,14 +2931,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
return PTR_ERR(obj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- err = -EINVAL;
+ if (IS_ERR(pd)) {
+ err = PTR_ERR(pd);
goto err_uobj;
}
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq) {
- err = -EINVAL;
+ if (IS_ERR(cq)) {
+ err = PTR_ERR(cq);
goto err_put_pd;
}
@@ -3009,8 +3039,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
return -EINVAL;
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
- if (!wq)
- return -EINVAL;
+ if (IS_ERR(wq))
+ return PTR_ERR(wq);
if (cmd.attr_mask & IB_WQ_FLAGS) {
wq_attr.flags = cmd.flags;
@@ -3093,8 +3123,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
num_read_wqs++) {
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
wqs_handles[num_read_wqs], attrs);
- if (!wq) {
- err = -EINVAL;
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
goto put_wqs;
}
@@ -3196,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
if (cmd.comp_mask)
return -EINVAL;
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
@@ -3249,8 +3279,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- err = -EINVAL;
+ if (IS_ERR(qp)) {
+ err = PTR_ERR(qp);
goto err_uobj;
}
@@ -3396,15 +3426,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
if (ib_srq_has_cq(cmd->srq_type)) {
attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->cq_handle, attrs);
- if (!attr.ext.cq) {
- ret = -EINVAL;
+ if (IS_ERR(attr.ext.cq)) {
+ ret = PTR_ERR(attr.ext.cq);
goto err_put_xrcd;
}
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put_cq;
}
@@ -3511,8 +3541,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
attr.max_wr = cmd.max_wr;
attr.srq_limit = cmd.srq_limit;
@@ -3539,8 +3569,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
ret = ib_query_srq(srq, &attr);
@@ -3665,8 +3695,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
return -EOPNOTSUPP;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 85cfc790a7bb..973fe2c7ef53 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -52,6 +52,7 @@
#include <rdma/ib.h>
#include <rdma/uverbs_std_types.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_ucaps.h>
#include "uverbs.h"
#include "core_priv.h"
@@ -1345,6 +1346,7 @@ static void __exit ib_uverbs_cleanup(void)
IB_UVERBS_NUM_FIXED_MINOR);
unregister_chrdev_region(dynamic_uverbs_dev,
IB_UVERBS_NUM_DYNAMIC_MINOR);
+ ib_cleanup_ucaps();
mmu_notifier_synchronize();
}
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 11a080646916..e803f609ec87 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
__ib_copy_path_rec_to_user(dst, src);
}
EXPORT_SYMBOL(ib_copy_path_rec_to_user);
-
-void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
- struct ib_user_path_rec *src)
-{
- u32 slid, dlid;
-
- memset(dst, 0, sizeof(*dst));
- if ((ib_is_opa_gid((union ib_gid *)src->sgid)) ||
- (ib_is_opa_gid((union ib_gid *)src->dgid))) {
- dst->rec_type = SA_PATH_REC_TYPE_OPA;
- slid = opa_get_lid_from_gid((union ib_gid *)src->sgid);
- dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid);
- } else {
- dst->rec_type = SA_PATH_REC_TYPE_IB;
- slid = ntohs(src->slid);
- dlid = ntohs(src->dlid);
- }
- memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
- memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
-
- sa_path_set_dlid(dst, dlid);
- sa_path_set_slid(dst, slid);
- sa_path_set_raw_traffic(dst, src->raw_traffic);
- dst->flow_label = src->flow_label;
- dst->hop_limit = src->hop_limit;
- dst->traffic_class = src->traffic_class;
- dst->reversible = src->reversible;
- dst->numb_path = src->numb_path;
- dst->pkey = src->pkey;
- dst->sl = src->sl;
- dst->mtu_selector = src->mtu_selector;
- dst->mtu = src->mtu;
- dst->rate_selector = src->rate_selector;
- dst->rate = src->rate;
- dst->packet_life_time = src->packet_life_time;
- dst->preference = src->preference;
- dst->packet_life_time_selector = src->packet_life_time_selector;
-
- /* TODO: No need to set this */
- sa_path_set_dmac_zero(dst);
-}
-EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 432054f0a8a4..fab5d914029d 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
struct ib_ucq_object *obj = container_of(
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
typeof(*obj), uevent.uobject);
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_device *ib_dev = attrs->context->device;
- int ret;
- u64 user_handle;
+ struct ib_umem_dmabuf *umem_dmabuf;
struct ib_cq_init_attr attr = {};
- struct ib_cq *cq;
- struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_uobject *ev_file_uobj;
+ struct ib_umem *umem = NULL;
+ u64 buffer_length;
+ u64 buffer_offset;
+ struct ib_cq *cq;
+ u64 user_handle;
+ u64 buffer_va;
+ int buffer_fd;
+ int ret;
- if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
+ if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->uevent.event_list);
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) {
+
+ ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem)) {
+ ret = PTR_ERR(umem);
+ goto err_event_file;
+ }
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) {
+
+ ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length,
+ buffer_fd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem_dmabuf)) {
+ ret = PTR_ERR(umem_dmabuf);
+ goto err_event_file;
+ }
+ umem = &umem_dmabuf->umem;
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) ||
+ !ib_dev->ops.create_cq) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
if (!cq) {
ret = -ENOMEM;
+ ib_umem_release(umem);
goto err_event_file;
}
@@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, attrs);
+ ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) :
+ ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
@@ -142,6 +206,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
return ret;
err_free:
+ ib_umem_release(umem);
rdma_restrack_put(&cq->res);
kfree(cq);
err_event_file:
@@ -180,6 +245,17 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_OBJECT_ASYNC_EVENT,
UVERBS_ACCESS_READ,
UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
UVERBS_ATTR_UHW());
static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index fb0555647336..c0fd283d9d6c 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -437,6 +437,10 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
UVERBS_ATTR_TYPE(u64), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ UVERBS_ATTR_MIN_SIZE(sizeof(int)),
+ UA_OPTIONAL,
+ UA_ALLOC_AND_COPY),
UVERBS_ATTR_UHW());
DECLARE_UVERBS_NAMED_METHOD(
diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c
new file mode 100644
index 000000000000..453ce656c6f2
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dmah.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+#include "restrack.h"
+
+static int uverbs_free_dmah(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dmah *dmah = uobject->object;
+ int ret;
+
+ if (atomic_read(&dmah->usecnt))
+ return -EBUSY;
+
+ ret = dmah->device->ops.dealloc_dmah(dmah, attrs);
+ if (ret)
+ return ret;
+
+ rdma_restrack_del(&dmah->res);
+ kfree(dmah);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE)
+ ->obj_attr.uobject;
+ struct ib_device *ib_dev = attrs->context->device;
+ struct ib_dmah *dmah;
+ int ret;
+
+ dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah);
+ if (!dmah)
+ return -ENOMEM;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) {
+ ret = uverbs_copy_from(&dmah->cpu_id, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_CPU_ID);
+ if (ret)
+ goto err;
+
+ if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) {
+ dmah->mem_type = uverbs_attr_get_enum_id(attrs,
+ UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE);
+ dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) {
+ ret = uverbs_copy_from(&dmah->ph, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_PH);
+ if (ret)
+ goto err;
+
+ /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */
+ if (dmah->ph & 0xFC) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS);
+ }
+
+ dmah->device = ib_dev;
+ dmah->uobject = uobj;
+ atomic_set(&dmah->usecnt, 0);
+
+ rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH);
+ rdma_restrack_set_name(&dmah->res, NULL);
+
+ ret = ib_dev->ops.alloc_dmah(dmah, attrs);
+ if (ret) {
+ rdma_restrack_put(&dmah->res);
+ goto err;
+ }
+
+ uobj->object = dmah;
+ rdma_restrack_add(&dmah->res);
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE);
+ return 0;
+err:
+ kfree(dmah);
+ return ret;
+}
+
+static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = {
+ [TPH_MEM_TYPE_VM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+ [TPH_MEM_TYPE_PM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DMAH_ALLOC,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE,
+ uverbs_dmah_mem_type,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH,
+ UVERBS_ATTR_TYPE(u8),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_DMAH_FREE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE));
+
+const struct uapi_definition uverbs_def_obj_dmah[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 7ebc7bd3caae..570b9656801d 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
- access_flags,
+ access_flags, NULL,
attrs);
if (IS_ERR(mr))
return PTR_ERR(mr);
@@ -266,6 +266,135 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
}
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE);
+ u32 valid_access_flags = IB_ACCESS_SUPPORTED;
+ u64 length, iova, fd_offset = 0, addr = 0;
+ struct ib_device *ib_dev = pd->device;
+ struct ib_dmah *dmah = NULL;
+ bool has_fd_offset = false;
+ bool has_addr = false;
+ bool has_fd = false;
+ u32 access_flags;
+ struct ib_mr *mr;
+ int fd;
+ int ret;
+
+ ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) {
+ ret = uverbs_copy_from(&addr, attrs,
+ UVERBS_ATTR_REG_MR_ADDR);
+ if (ret)
+ return ret;
+ has_addr = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) {
+ ret = uverbs_copy_from(&fd_offset, attrs,
+ UVERBS_ATTR_REG_MR_FD_OFFSET);
+ if (ret)
+ return ret;
+ has_fd_offset = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) {
+ ret = uverbs_get_raw_fd(&fd, attrs,
+ UVERBS_ATTR_REG_MR_FD);
+ if (ret)
+ return ret;
+ has_fd = true;
+ }
+
+ if (has_fd) {
+ if (!ib_dev->ops.reg_user_mr_dmabuf)
+ return -EOPNOTSUPP;
+
+ /* FD requires offset and can't come with addr */
+ if (!has_fd_offset || has_addr)
+ return -EINVAL;
+
+ if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+
+ valid_access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_RELAXED_ORDERING;
+ } else {
+ if (!has_addr || has_fd_offset)
+ return -EINVAL;
+
+ if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) {
+ dmah = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_REG_MR_DMA_HANDLE);
+ if (IS_ERR(dmah))
+ return PTR_ERR(dmah);
+ }
+
+ ret = uverbs_get_flags32(&access_flags, attrs,
+ UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ valid_access_flags);
+ if (ret)
+ return ret;
+
+ ret = ib_check_mr_access(ib_dev, access_flags);
+ if (ret)
+ return ret;
+
+ if (has_fd)
+ mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length,
+ iova, fd, access_flags,
+ dmah, attrs);
+ else
+ mr = pd->device->ops.reg_user_mr(pd, addr, length, iova,
+ access_flags, dmah, NULL);
+
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ if (dmah) {
+ mr->dmah = dmah;
+ atomic_inc(&dmah->usecnt);
+ }
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY,
+ &mr->lkey, sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_ADVISE_MR,
UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -362,6 +491,44 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_REG_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ enum ib_access_flags,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
UVERBS_METHOD_MR_DESTROY,
UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
@@ -376,7 +543,8 @@ DECLARE_UVERBS_NAMED_OBJECT(
&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
- &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
+ &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_REG_MR));
const struct uapi_definition uverbs_def_obj_mr[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c
index 7b4773fa4bc0..be0730e8509e 100644
--- a/drivers/infiniband/core/uverbs_std_types_qp.c
+++ b/drivers/infiniband/core/uverbs_std_types_qp.c
@@ -133,7 +133,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
device = xrcd->device;
break;
case IB_UVERBS_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
fallthrough;
case IB_UVERBS_QPT_RC:
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index a02916a3a79c..e00ea63175bd 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+ UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),
UAPI_DEF_CHAIN(uverbs_def_obj_mr),
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 473ee0831307..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_400_GBPS: return 160;
case IB_RATE_600_GBPS: return 240;
case IB_RATE_800_GBPS: return 320;
+ case IB_RATE_1600_GBPS: return 640;
default: return -1;
}
}
@@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 160: return IB_RATE_400_GBPS;
case 240: return IB_RATE_600_GBPS;
case 320: return IB_RATE_800_GBPS;
+ case 640: return IB_RATE_1600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_400_GBPS: return 425000;
case IB_RATE_600_GBPS: return 637500;
case IB_RATE_800_GBPS: return 850000;
+ case IB_RATE_1600_GBPS: return 1700000;
default: return -1;
}
}
@@ -572,7 +575,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
- return (void *)slave;
+ return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);
@@ -2105,7 +2108,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
if (!qp->uobject)
rdma_rw_cleanup_mrs(qp);
- rdma_counter_unbind_qp(qp, true);
+ rdma_counter_unbind_qp(qp, qp->port, true);
ret = qp->device->ops.destroy_qp(qp, udata);
if (ret) {
if (sec)
@@ -2223,7 +2226,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
- access_flags, NULL);
+ access_flags, NULL, NULL);
if (IS_ERR(mr))
return mr;
@@ -2262,6 +2265,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;
struct ib_dm *dm = mr->dm;
+ struct ib_dmah *dmah = mr->dmah;
struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
int ret;
@@ -2272,6 +2276,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
atomic_dec(&pd->usecnt);
if (dm)
atomic_dec(&dm->usecnt);
+ if (dmah)
+ atomic_dec(&dmah->usecnt);
kfree(sig_attrs);
}
@@ -3109,22 +3115,23 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
unsigned int block_offset;
- unsigned int sg_delta;
+ unsigned int delta;
if (!biter->__sg_nents || !biter->__sg)
return false;
biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
- sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
+ delta = BIT_ULL(biter->__pg_bit) - block_offset;
- if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
- biter->__sg_advance += sg_delta;
- } else {
+ while (biter->__sg_nents && biter->__sg &&
+ sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) {
+ delta -= sg_dma_len(biter->__sg) - biter->__sg_advance;
biter->__sg_advance = 0;
biter->__sg = sg_next(biter->__sg);
biter->__sg_nents--;
}
+ biter->__sg_advance += delta;
return true;
}