summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r-- drivers/infiniband/core/Makefile | 3
-rw-r--r-- drivers/infiniband/core/addr.c | 12
-rw-r--r-- drivers/infiniband/core/agent.c | 32
-rw-r--r-- drivers/infiniband/core/cache.c | 59
-rw-r--r-- drivers/infiniband/core/cm.c | 244
-rw-r--r-- drivers/infiniband/core/cm_trace.h | 2
-rw-r--r-- drivers/infiniband/core/cma.c | 73
-rw-r--r-- drivers/infiniband/core/cma_trace.h | 6
-rw-r--r-- drivers/infiniband/core/core_priv.h | 3
-rw-r--r-- drivers/infiniband/core/counters.c | 52
-rw-r--r-- drivers/infiniband/core/device.c | 298
-rw-r--r-- drivers/infiniband/core/iwcm.c | 58
-rw-r--r-- drivers/infiniband/core/lag.c | 3
-rw-r--r-- drivers/infiniband/core/mad.c | 66
-rw-r--r-- drivers/infiniband/core/mad_rmpp.c | 2
-rw-r--r-- drivers/infiniband/core/netlink.c | 1
-rw-r--r-- drivers/infiniband/core/nldev.c | 340
-rw-r--r-- drivers/infiniband/core/rdma_core.c | 12
-rw-r--r-- drivers/infiniband/core/restrack.c | 63
-rw-r--r-- drivers/infiniband/core/roce_gid_mgmt.c | 33
-rw-r--r-- drivers/infiniband/core/sa_query.c | 2
-rw-r--r-- drivers/infiniband/core/sysfs.c | 15
-rw-r--r-- drivers/infiniband/core/ucaps.c | 267
-rw-r--r-- drivers/infiniband/core/ucma.c | 26
-rw-r--r-- drivers/infiniband/core/ud_header.c | 83
-rw-r--r-- drivers/infiniband/core/umem.c | 36
-rw-r--r-- drivers/infiniband/core/umem_dmabuf.c | 68
-rw-r--r-- drivers/infiniband/core/umem_odp.c | 269
-rw-r--r-- drivers/infiniband/core/user_mad.c | 50
-rw-r--r-- drivers/infiniband/core/uverbs.h | 29
-rw-r--r-- drivers/infiniband/core/uverbs_cmd.c | 195
-rw-r--r-- drivers/infiniband/core/uverbs_main.c | 52
-rw-r--r-- drivers/infiniband/core/uverbs_marshall.c | 42
-rw-r--r-- drivers/infiniband/core/uverbs_std_types_cq.c | 2
-rw-r--r-- drivers/infiniband/core/uverbs_std_types_device.c | 4
-rw-r--r-- drivers/infiniband/core/uverbs_std_types_mr.c | 2
-rw-r--r-- drivers/infiniband/core/verbs.c | 97
37 files changed, 1655 insertions, 946 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 8ab4eea5a0a5..d49ded7e95f0 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -39,6 +39,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_async_fd.o \
uverbs_std_types_srq.o \
uverbs_std_types_wq.o \
- uverbs_std_types_qp.o
+ uverbs_std_types_qp.o \
+ ucaps.o
ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f253295795f0..be0743dac3ff 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -348,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst,
static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
{
- struct rtable *rt;
- struct rt6_info *rt6;
-
- if (family == AF_INET) {
- rt = container_of(dst, struct rtable, dst);
- return rt->rt_uses_gateway;
- }
+ if (family == AF_INET)
+ return dst_rtable(dst)->rt_uses_gateway;
- rt6 = container_of(dst, struct rt6_info, dst);
- return rt6->rt6i_flags & RTF_GATEWAY;
+ return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY;
}
static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index f82b4260de42..3bb46696731e 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -59,7 +59,16 @@ __ib_get_agent_port(const struct ib_device *device, int port_num)
struct ib_agent_port_private *entry;
list_for_each_entry(entry, &ib_agent_port_list, port_list) {
- if (entry->agent[1]->device == device &&
+ /* Need to check both agent[0] and agent[1], as an agent port
+ * may only have one of them
+ */
+ if (entry->agent[0] &&
+ entry->agent[0]->device == device &&
+ entry->agent[0]->port_num == port_num)
+ return entry;
+
+ if (entry->agent[1] &&
+ entry->agent[1]->device == device &&
entry->agent[1]->port_num == port_num)
return entry;
}
@@ -172,14 +181,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
}
}
- /* Obtain send only MAD agent for GSI QP */
- port_priv->agent[1] = ib_register_mad_agent(device, port_num,
- IB_QPT_GSI, NULL, 0,
- &agent_send_handler,
- NULL, NULL, 0);
- if (IS_ERR(port_priv->agent[1])) {
- ret = PTR_ERR(port_priv->agent[1]);
- goto error3;
+ if (rdma_cap_ib_cm(device, port_num)) {
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
}
spin_lock_irqsave(&ib_agent_port_list_lock, flags);
@@ -212,7 +223,8 @@ int ib_agent_port_close(struct ib_device *device, int port_num)
list_del(&port_priv->port_list);
spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
- ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[1])
+ ib_unregister_mad_agent(port_priv->agent[1]);
if (port_priv->agent[0])
ib_unregister_mad_agent(port_priv->agent[0]);
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index c02a96d3572a..9979a351577f 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -794,7 +794,6 @@ err_free_table:
static void release_gid_table(struct ib_device *device,
struct ib_gid_table *table)
{
- bool leak = false;
int i;
if (!table)
@@ -803,15 +802,12 @@ static void release_gid_table(struct ib_device *device,
for (i = 0; i < table->sz; i++) {
if (is_gid_entry_free(table->data_vec[i]))
continue;
- if (kref_read(&table->data_vec[i]->kref) > 1) {
- dev_err(&device->dev,
- "GID entry ref leak for index %d ref=%u\n", i,
- kref_read(&table->data_vec[i]->kref));
- leak = true;
- }
+
+ WARN_ONCE(true,
+ "GID entry ref leak for dev %s index %d ref=%u\n",
+ dev_name(&device->dev), i,
+ kref_read(&table->data_vec[i]->kref));
}
- if (leak)
- return;
mutex_destroy(&table->lock);
kfree(table->data_vec);
@@ -1131,41 +1127,6 @@ err:
}
EXPORT_SYMBOL(ib_find_cached_pkey);
-int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num,
- u16 pkey, u16 *index)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int i;
- int ret = -ENOENT;
-
- if (!rdma_is_port_valid(device, port_num))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache_lock, flags);
-
- cache = device->port_data[port_num].cache.pkey;
- if (!cache) {
- ret = -EINVAL;
- goto err;
- }
-
- *index = -1;
-
- for (i = 0; i < cache->table_len; ++i)
- if (cache->table[i] == pkey) {
- *index = i;
- ret = 0;
- break;
- }
-
-err:
- read_unlock_irqrestore(&device->cache_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_find_exact_cached_pkey);
-
int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
{
unsigned long flags;
@@ -1540,6 +1501,12 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
device->port_data[port].cache.pkey = pkey_cache;
}
device->port_data[port].cache.lmc = tprops->lmc;
+
+ if (device->port_data[port].cache.port_state != IB_PORT_NOP &&
+ device->port_data[port].cache.port_state != tprops->state)
+ ibdev_info(device, "Port: %d Link %s\n", port,
+ ib_port_state_to_str(tprops->state));
+
device->port_data[port].cache.port_state = tprops->state;
device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
@@ -1644,8 +1611,10 @@ int ib_cache_setup_one(struct ib_device *device)
rdma_for_each_port (device, p) {
err = ib_cache_update(device, p, true, true, true);
- if (err)
+ if (err) {
+ gid_table_cleanup_one(device);
return err;
+ }
}
return 0;
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 07fb8d3c037f..8670e58675c6 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -35,6 +35,9 @@ MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
+#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
+#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
+
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
[IB_CM_REJ_NO_EEC] = "no EEC",
@@ -93,8 +96,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
struct cm_work *work);
static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_rep_param *param);
-static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
- const void *private_data, u8 private_data_len);
+static void cm_issue_dreq(struct cm_id_private *cm_id_priv);
static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
void *private_data, u8 private_data_len);
static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
@@ -166,7 +168,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@@ -240,7 +242,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
- u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@@ -284,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
mad_agent = cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
@@ -307,30 +308,22 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
goto out;
}
- /* Timeout set by caller if response is expected. */
m->ah = ah;
- m->retries = cm_id_priv->max_cm_retries;
-
- refcount_inc(&cm_id_priv->refcount);
- m->context[0] = cm_id_priv;
out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
static void cm_free_msg(struct ib_mad_send_buf *msg)
{
- struct cm_id_private *cm_id_priv = msg->context[0];
-
if (msg->ah)
rdma_destroy_ah(msg->ah, 0);
- cm_deref_id(cm_id_priv);
ib_free_send_mad(msg);
}
static struct ib_mad_send_buf *
-cm_alloc_priv_msg(struct cm_id_private *cm_id_priv)
+cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
{
struct ib_mad_send_buf *msg;
@@ -339,7 +332,15 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv)
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return msg;
+
cm_id_priv->msg = msg;
+ refcount_inc(&cm_id_priv->refcount);
+ msg->context[0] = cm_id_priv;
+ msg->context[1] = (void *) (unsigned long) state;
+
+ msg->retries = cm_id_priv->max_cm_retries;
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+
return msg;
}
@@ -358,13 +359,20 @@ static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
ib_free_send_mad(msg);
}
-static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
- struct ib_mad_recv_wc *mad_recv_wc)
+static struct ib_mad_send_buf *
+cm_alloc_response_msg_no_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ bool direct_retry)
{
- return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
- 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC,
- IB_MGMT_BASE_VERSION);
+ struct ib_mad_send_buf *m;
+
+ m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+ if (!IS_ERR(m))
+ m->context[0] = direct_retry ? CM_DIRECT_RETRY_CTX : NULL;
+
+ return m;
}
static int cm_create_response_msg_ah(struct cm_port *port,
@@ -384,12 +392,13 @@ static int cm_create_response_msg_ah(struct cm_port *port,
static int cm_alloc_response_msg(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc,
+ bool direct_retry,
struct ib_mad_send_buf **msg)
{
struct ib_mad_send_buf *m;
int ret;
- m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
+ m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry);
if (IS_ERR(m))
return PTR_ERR(m);
@@ -403,13 +412,6 @@ static int cm_alloc_response_msg(struct cm_port *port,
return 0;
}
-static void cm_free_response_msg(struct ib_mad_send_buf *msg)
-{
- if (msg->ah)
- rdma_destroy_ah(msg->ah, 0);
- ib_free_send_mad(msg);
-}
-
static void *cm_copy_private_data(const void *private_data, u8 private_data_len)
{
void *data;
@@ -1109,7 +1111,8 @@ retest:
cm_id->state = IB_CM_IDLE;
break;
}
- cm_send_dreq_locked(cm_id_priv, NULL, 0);
+ cm_issue_dreq(cm_id_priv);
+ cm_enter_timewait(cm_id_priv);
goto retest;
case IB_CM_DREQ_SENT:
ib_cancel_mad(cm_id_priv->msg);
@@ -1294,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@@ -1557,7 +1560,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
if (param->alternate_path)
cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
@@ -1566,8 +1569,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
req_msg = (struct cm_req_msg *)msg->mad;
cm_format_req(req_msg, cm_id_priv, param);
cm_id_priv->tid = req_msg->hdr.tid;
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT;
cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
@@ -1598,7 +1599,7 @@ static int cm_issue_rej(struct cm_port *port,
struct cm_rej_msg *rej_msg, *rcv_msg;
int ret;
- ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg);
if (ret)
return ret;
@@ -1624,7 +1625,7 @@ static int cm_issue_rej(struct cm_port *port,
IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
return ret;
}
@@ -1871,7 +1872,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
- enum cm_msg_response msg_mraed, u8 service_timeout,
+ enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@@ -1880,7 +1881,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
- IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@@ -1951,7 +1952,7 @@ static void cm_dup_req_handler(struct cm_work *work,
}
spin_unlock_irq(&cm_id_priv->lock);
- ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
if (ret)
return;
@@ -1959,7 +1960,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@@ -1980,7 +1981,7 @@ static void cm_dup_req_handler(struct cm_work *work,
return;
unlock: spin_unlock_irq(&cm_id_priv->lock);
-free: cm_free_response_msg(msg);
+free: cm_free_msg(msg);
}
static struct cm_id_private *cm_match_req(struct cm_work *work,
@@ -2294,7 +2295,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
goto out;
}
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out;
@@ -2302,8 +2303,6 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
rep_msg = (struct cm_rep_msg *) msg->mad;
cm_format_rep(rep_msg, cm_id_priv, param);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
trace_icm_send_rep(cm_id);
ret = ib_post_send_mad(msg, NULL);
@@ -2444,7 +2443,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
atomic_long_inc(
&work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]);
- ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
if (ret)
goto deref;
@@ -2455,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@@ -2469,7 +2468,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
goto deref;
unlock: spin_unlock_irq(&cm_id_priv->lock);
-free: cm_free_response_msg(msg);
+free: cm_free_msg(msg);
deref: cm_deref_id(cm_id_priv);
}
@@ -2653,59 +2652,68 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
private_data_len);
}
-static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
- const void *private_data, u8 private_data_len)
+static void cm_issue_dreq(struct cm_id_private *cm_id_priv)
{
struct ib_mad_send_buf *msg;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return;
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0);
+
+ trace_icm_send_dreq(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
return -EINVAL;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
trace_icm_dreq_skipped(&cm_id_priv->id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto unlock;
}
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
ib_cancel_mad(cm_id_priv->msg);
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT);
if (IS_ERR(msg)) {
cm_enter_timewait(cm_id_priv);
- return PTR_ERR(msg);
+ ret = PTR_ERR(msg);
+ goto unlock;
}
cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
private_data, private_data_len);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
trace_icm_send_dreq(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_enter_timewait(cm_id_priv);
cm_free_priv_msg(msg);
- return ret;
+ goto unlock;
}
cm_id_priv->id.state = IB_CM_DREQ_SENT;
- return 0;
-}
-
-int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
- u8 private_data_len)
-{
- struct cm_id_private *cm_id_priv =
- container_of(cm_id, struct cm_id_private, id);
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&cm_id_priv->lock, flags);
- ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len);
+unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
@@ -2791,7 +2799,7 @@ static int cm_issue_drep(struct cm_port *port,
struct cm_drep_msg *drep_msg;
int ret;
- ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg);
if (ret)
return ret;
@@ -2809,7 +2817,7 @@ static int cm_issue_drep(struct cm_port *port,
IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
return ret;
}
@@ -2856,7 +2864,8 @@ static int cm_dreq_handler(struct cm_work *work)
case IB_CM_TIMEWAIT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_DREQ_COUNTER]);
- msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
+ true);
if (IS_ERR(msg))
goto unlock;
@@ -2867,7 +2876,7 @@ static int cm_dreq_handler(struct cm_work *work)
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
goto deref;
case IB_CM_DREQ_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
@@ -3085,26 +3094,13 @@ out:
return -EINVAL;
}
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len)
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
- struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
- enum cm_msg_response msg_response;
- void *data;
unsigned long flags;
- int ret;
-
- if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
- return -EINVAL;
-
- data = cm_copy_private_data(private_data, private_data_len);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3113,58 +3109,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
- msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
- trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
- if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
- msg = cm_alloc_msg(cm_id_priv);
- if (IS_ERR(msg)) {
- ret = PTR_ERR(msg);
- goto error_unlock;
- }
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- msg_response, service_timeout,
- private_data, private_data_len);
- trace_icm_send_mra(cm_id);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error_free_msg;
- }
-
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
- cm_id_priv->service_timeout = service_timeout;
- cm_set_private_data(cm_id_priv, data, private_data_len);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return 0;
+ cm_set_private_data(cm_id_priv, NULL, 0);
-error_free_msg:
- cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- kfree(data);
return ret;
}
-EXPORT_SYMBOL(ib_send_cm_mra);
+EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@@ -3361,20 +3332,20 @@ static int cm_lap_handler(struct cm_work *work)
case IB_CM_MRA_LAP_SENT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_LAP_COUNTER]);
- msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
+ true);
if (IS_ERR(msg))
goto unlock;
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
- cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
goto deref;
case IB_CM_LAP_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
@@ -3513,7 +3484,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
goto out_unlock;
}
- msg = cm_alloc_priv_msg(cm_id_priv);
+ msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
@@ -3521,8 +3492,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv,
param);
- msg->timeout_ms = cm_id_priv->timeout_ms;
- msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT;
trace_icm_send_sidr_req(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
@@ -3768,17 +3737,18 @@ out:
static void cm_process_send_error(struct cm_id_private *cm_id_priv,
struct ib_mad_send_buf *msg,
- enum ib_cm_state state,
enum ib_wc_status wc_status)
{
+ enum ib_cm_state state = (unsigned long) msg->context[1];
struct ib_cm_event cm_event = {};
int ret;
- /* Discard old sends or ones without a response. */
+ /* Discard old sends. */
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
cm_free_msg(msg);
+ cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@@ -3826,9 +3796,7 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_send_wc)
{
struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
- struct cm_id_private *cm_id_priv = msg->context[0];
- enum ib_cm_state state =
- (enum ib_cm_state)(unsigned long)msg->context[1];
+ struct cm_id_private *cm_id_priv;
struct cm_port *port;
u16 attr_index;
@@ -3836,13 +3804,12 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
attr_index = be16_to_cpu(((struct ib_mad_hdr *)
msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
- /*
- * If the send was in response to a received message (context[0] is not
- * set to a cm_id), and is not a REJ, then it is a send that was
- * manually retried.
- */
- if (!cm_id_priv && (attr_index != CM_REJ_COUNTER))
+ if (msg->context[0] == CM_DIRECT_RETRY_CTX) {
msg->retries = 1;
+ cm_id_priv = NULL;
+ } else {
+ cm_id_priv = msg->context[0];
+ }
atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]);
if (msg->retries)
@@ -3850,10 +3817,9 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
&port->counters[CM_XMIT_RETRIES][attr_index]);
if (cm_id_priv)
- cm_process_send_error(cm_id_priv, msg, state,
- mad_send_wc->status);
+ cm_process_send_error(cm_id_priv, msg, mad_send_wc->status);
else
- cm_free_response_msg(msg);
+ cm_free_msg(msg);
}
static void cm_work_handler(struct work_struct *_work)
@@ -4374,7 +4340,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@@ -4490,9 +4456,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
index 944d9071245d..4a4987da69d4 100644
--- a/drivers/infiniband/core/cm_trace.h
+++ b/drivers/infiniband/core/cm_trace.h
@@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
-DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 1e2cd7c8716e..9b471548e7ae 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
-#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
@@ -72,6 +71,8 @@ static const char * const cma_events[] = {
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
enum ib_gid_type gid_type);
+static void cma_netevent_work_handler(struct work_struct *_work);
+
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
size_t index = event;
@@ -144,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
-/**
- * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
- * @res: rdma resource tracking entry pointer
- */
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
-{
- struct rdma_id_private *id_priv =
- container_of(res, struct rdma_id_private, res);
-
- return &id_priv->id;
-}
-EXPORT_SYMBOL(rdma_res_to_id);
-
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@@ -690,6 +678,7 @@ cma_validate_port(struct ib_device *device, u32 port,
int bound_if_index = dev_addr->bound_dev_if;
int dev_type = dev_addr->dev_type;
struct net_device *ndev = NULL;
+ struct net_device *pdev = NULL;
if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
goto out;
@@ -714,19 +703,50 @@ cma_validate_port(struct ib_device *device, u32 port,
rcu_read_lock();
ndev = rcu_dereference(sgid_attr->ndev);
+ if (ndev->ifindex != bound_if_index) {
+ pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index);
+ if (pdev) {
+ if (is_vlan_dev(pdev)) {
+ pdev = vlan_dev_real_dev(pdev);
+ if (ndev->ifindex == pdev->ifindex)
+ bound_if_index = pdev->ifindex;
+ }
+ if (is_vlan_dev(ndev)) {
+ pdev = vlan_dev_real_dev(ndev);
+ if (bound_if_index == pdev->ifindex)
+ bound_if_index = ndev->ifindex;
+ }
+ }
+ }
if (!net_eq(dev_net(ndev), dev_addr->net) ||
- ndev->ifindex != bound_if_index)
+ ndev->ifindex != bound_if_index) {
+ rdma_put_gid_attr(sgid_attr);
sgid_attr = ERR_PTR(-ENODEV);
+ }
rcu_read_unlock();
goto out;
}
- if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
- ndev = dev_get_by_index(dev_addr->net, bound_if_index);
- if (!ndev)
- goto out;
+ /*
+ * For a RXE device, it should work with TUN device and normal ethernet
+ * devices. Use driver_id to check if a device is a RXE device or not.
+ * ARPHRD_NONE means a TUN device.
+ */
+ if (device->ops.driver_id == RDMA_DRIVER_RXE) {
+ if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER)
+ && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ }
} else {
- gid_type = IB_GID_TYPE_IB;
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ goto out;
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
}
sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
@@ -1015,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
id_priv->id.route.addr.dev_addr.net = get_net(net);
id_priv->seq_num &= 0x00ffffff;
+ INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler);
rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
if (parent)
@@ -2179,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
- trace_cm_send_mra(id_priv);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(id_priv);
+ ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@@ -2441,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
- trace_cm_send_mra(cm_id->context);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(cm_id->context);
+ ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@@ -5209,9 +5230,9 @@ static int cma_netevent_callback(struct notifier_block *self,
if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
neigh->ha, ETH_ALEN))
continue;
- INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
cma_id_get(current_id);
- queue_work(cma_wq, &current_id->id.net_work);
+ if (!queue_work(cma_wq, &current_id->id.net_work))
+ cma_id_put(current_id);
}
out:
spin_unlock_irqrestore(&id_table_lock, flags);
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index 47f3c6e4be89..3456d5f3aa47 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
-DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);
@@ -84,7 +84,7 @@ TRACE_EVENT(cm_id_attach,
sizeof(struct sockaddr_in6));
memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
sizeof(struct sockaddr_in6));
- __assign_str(devname, device->name);
+ __assign_str(devname);
),
TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s",
@@ -334,7 +334,7 @@ DECLARE_EVENT_CLASS(cma_client_class,
),
TP_fast_assign(
- __assign_str(name, device->name);
+ __assign_str(name);
),
TP_printk("device name=%s",
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index dd7715ba9fd1..05102769a918 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -325,9 +325,6 @@ void ib_qp_usecnt_inc(struct ib_qp *qp);
void ib_qp_usecnt_dec(struct ib_qp *qp);
struct rdma_dev_addr;
-int rdma_resolve_ip_route(struct sockaddr *src_addr,
- const struct sockaddr *dst_addr,
- struct rdma_dev_addr *addr);
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
const union ib_gid *dgid,
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index af59486fe418..e6ec7b7a40af 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -12,7 +12,8 @@
static int __counter_set_mode(struct rdma_port_counter *port_counter,
enum rdma_nl_counter_mode new_mode,
- enum rdma_nl_counter_mask new_mask)
+ enum rdma_nl_counter_mask new_mask,
+ bool bind_opcnt)
{
if (new_mode == RDMA_COUNTER_MODE_AUTO) {
if (new_mask & (~ALL_AUTO_MODE_MASKS))
@@ -23,6 +24,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
port_counter->mode.mode = new_mode;
port_counter->mode.mask = new_mask;
+ port_counter->mode.bind_opcnt = bind_opcnt;
return 0;
}
@@ -41,6 +43,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
*/
int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mask mask,
+ bool bind_opcnt,
struct netlink_ext_ack *extack)
{
struct rdma_port_counter *port_counter;
@@ -59,12 +62,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
RDMA_COUNTER_MODE_NONE;
if (port_counter->mode.mode == mode &&
- port_counter->mode.mask == mask) {
+ port_counter->mode.mask == mask &&
+ port_counter->mode.bind_opcnt == bind_opcnt) {
ret = 0;
goto out;
}
- ret = __counter_set_mode(port_counter, mode, mask);
+ ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt);
out:
mutex_unlock(&port_counter->lock);
@@ -89,7 +93,7 @@ static void auto_mode_init_counter(struct rdma_counter *counter,
}
static int __rdma_counter_bind_qp(struct rdma_counter *counter,
- struct ib_qp *qp)
+ struct ib_qp *qp, u32 port)
{
int ret;
@@ -100,7 +104,7 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter,
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_bind_qp(counter, qp);
+ ret = qp->device->ops.counter_bind_qp(counter, qp, port);
mutex_unlock(&counter->lock);
return ret;
@@ -140,7 +144,8 @@ out:
static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
struct ib_qp *qp,
- enum rdma_nl_counter_mode mode)
+ enum rdma_nl_counter_mode mode,
+ bool bind_opcnt)
{
struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
@@ -149,13 +154,15 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
return NULL;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = rdma_zalloc_drv_obj(dev, rdma_counter);
if (!counter)
return NULL;
counter->device = dev;
counter->port = port;
+ dev->ops.counter_init(counter);
+
rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER);
counter->stats = dev->ops.counter_alloc_stats(counter);
if (!counter->stats)
@@ -166,7 +173,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
switch (mode) {
case RDMA_COUNTER_MODE_MANUAL:
ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL,
- 0);
+ 0, bind_opcnt);
if (ret) {
mutex_unlock(&port_counter->lock);
goto err_mode;
@@ -185,10 +192,11 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
mutex_unlock(&port_counter->lock);
counter->mode.mode = mode;
+ counter->mode.bind_opcnt = bind_opcnt;
kref_init(&counter->kref);
mutex_init(&counter->lock);
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_mode;
@@ -213,7 +221,8 @@ static void rdma_counter_free(struct rdma_counter *counter)
port_counter->num_counters--;
if (!port_counter->num_counters &&
(port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
- __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0);
+ __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0,
+ false);
mutex_unlock(&port_counter->lock);
@@ -238,7 +247,7 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
return match;
}
-static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -247,7 +256,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp)
return -EOPNOTSUPP;
mutex_lock(&counter->lock);
- ret = qp->device->ops.counter_unbind_qp(qp);
+ ret = qp->device->ops.counter_unbind_qp(qp, port);
mutex_unlock(&counter->lock);
return ret;
@@ -339,13 +348,14 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
counter = rdma_get_counter_auto_mode(qp, port);
if (counter) {
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret) {
kref_put(&counter->kref, counter_release);
return ret;
}
} else {
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO,
+ port_counter->mode.bind_opcnt);
if (!counter)
return -ENOMEM;
}
@@ -358,7 +368,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
* @force:
* true - Decrease the counter ref-count anyway (e.g., qp destroy)
*/
-int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force)
{
struct rdma_counter *counter = qp->counter;
int ret;
@@ -366,7 +376,7 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
if (!counter)
return -EINVAL;
- ret = __rdma_counter_unbind_qp(qp);
+ ret = __rdma_counter_unbind_qp(qp, port);
if (ret && !force)
return ret;
@@ -513,7 +523,7 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
goto err_task;
}
- ret = __rdma_counter_bind_qp(counter, qp);
+ ret = __rdma_counter_bind_qp(counter, qp, port);
if (ret)
goto err_task;
@@ -558,7 +568,7 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
goto err;
}
- counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL);
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true);
if (!counter) {
ret = -ENOMEM;
goto err;
@@ -604,7 +614,7 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
goto out;
}
- ret = rdma_counter_unbind_qp(qp, false);
+ ret = rdma_counter_unbind_qp(qp, port, false);
out:
rdma_restrack_put(&qp->res);
@@ -613,13 +623,15 @@ out:
int rdma_counter_get_mode(struct ib_device *dev, u32 port,
enum rdma_nl_counter_mode *mode,
- enum rdma_nl_counter_mask *mask)
+ enum rdma_nl_counter_mask *mask,
+ bool *opcnt)
{
struct rdma_port_counter *port_counter;
port_counter = &dev->port_data[port].port_counter;
*mode = port_counter->mode.mode;
*mask = port_counter->mode.mask;
+ *opcnt = port_counter->mode.bind_opcnt;
return 0;
}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 07cb6c5ffda0..d4263385850a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -209,23 +209,6 @@ static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
printk("%s(NULL ib_device): %pV", level, vaf);
}
-void ibdev_printk(const char *level, const struct ib_device *ibdev,
- const char *format, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, format);
-
- vaf.fmt = format;
- vaf.va = &args;
-
- __ibdev_printk(level, ibdev, &vaf);
-
- va_end(args);
-}
-EXPORT_SYMBOL(ibdev_printk);
-
#define define_ibdev_printk_level(func, level) \
void func(const struct ib_device *ibdev, const char *fmt, ...) \
{ \
@@ -437,6 +420,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
client->rename(ibdev, client_data);
}
up_read(&ibdev->client_data_rwsem);
+ rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT);
up_read(&devices_rwsem);
return 0;
}
@@ -503,6 +487,7 @@ static void ib_device_release(struct device *device)
rcu_head);
}
+ mutex_destroy(&dev->subdev_lock);
mutex_destroy(&dev->unregistration_lock);
mutex_destroy(&dev->compat_devs_mutex);
@@ -543,6 +528,8 @@ static struct class ib_class = {
static void rdma_init_coredev(struct ib_core_device *coredev,
struct ib_device *dev, struct net *net)
{
+ bool is_full_dev = &dev->coredev == coredev;
+
/* This BUILD_BUG_ON is intended to catch layout change
* of union of ib_core_device and device.
* dev must be the first element as ib_core and providers
@@ -554,6 +541,13 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
coredev->dev.class = &ib_class;
coredev->dev.groups = dev->groups;
+
+ /*
+ * Don't expose hw counters outside of the init namespace.
+ */
+ if (!is_full_dev && dev->hw_stats_attr_index)
+ coredev->dev.groups[dev->hw_stats_attr_index] = NULL;
+
device_initialize(&coredev->dev);
coredev->owner = dev;
INIT_LIST_HEAD(&coredev->port_list);
@@ -641,6 +635,11 @@ struct ib_device *_ib_alloc_device(size_t size)
BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);
+
+ mutex_init(&device->subdev_lock);
+ INIT_LIST_HEAD(&device->subdev_list_head);
+ INIT_LIST_HEAD(&device->subdev_list);
+
return device;
}
EXPORT_SYMBOL(_ib_alloc_device);
@@ -1345,6 +1344,37 @@ static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}
+/*
+ * Announce a newly registered device to userspace.
+ *
+ * With devices_rwsem held for read, emits the KOBJ_ADD uevent, then an
+ * RDMA_REGISTER_EVENT notification, then one RDMA_NETDEV_ATTACH_EVENT for
+ * each port for which ib_device_get_netdev() returns a netdev.  The first
+ * failing rdma_nl_notify_event() call ends the sequence; the failure is
+ * not reported to the caller.
+ */
+static void ib_device_notify_register(struct ib_device *device)
+{
+	struct net_device *ndev;
+	u32 port_num;
+	int err;
+
+	down_read(&devices_rwsem);
+
+	/* Mark for userspace that device is ready */
+	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+	err = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
+	if (err)
+		goto unlock;
+
+	rdma_for_each_port(device, port_num) {
+		ndev = ib_device_get_netdev(device, port_num);
+		if (ndev) {
+			err = rdma_nl_notify_event(device, port_num,
+						   RDMA_NETDEV_ATTACH_EVENT);
+			/* The netdev was only looked up to test for presence. */
+			dev_put(ndev);
+			if (err)
+				goto unlock;
+		}
+	}
+
+unlock:
+	up_read(&devices_rwsem);
+}
+
/**
* ib_register_device - Register an IB device with IB core
* @device: Device to register
@@ -1441,8 +1471,9 @@ int ib_register_device(struct ib_device *device, const char *name,
return ret;
}
dev_set_uevent_suppress(&device->dev, false);
- /* Mark for userspace that device is ready */
- kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ib_device_notify_register(device);
+
ib_device_put(device);
return 0;
@@ -1461,6 +1492,18 @@ EXPORT_SYMBOL(ib_register_device);
/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
+ struct ib_device *sub, *tmp;
+
+ mutex_lock(&ib_dev->subdev_lock);
+ list_for_each_entry_safe_reverse(sub, tmp,
+ &ib_dev->subdev_list_head,
+ subdev_list) {
+ list_del(&sub->subdev_list);
+ ib_dev->ops.del_sub_dev(sub);
+ ib_device_put(ib_dev);
+ }
+ mutex_unlock(&ib_dev->subdev_lock);
+
/*
* We have a registration lock so that all the calls to unregister are
* fully fenced, once any unregister returns the device is truly
@@ -1473,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
goto out;
disable_device(ib_dev);
+ rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);
/* Expedite removing unregistered pointers from the hash table */
free_netdevs(ib_dev);
@@ -2141,11 +2185,15 @@ static void add_ndev_hash(struct ib_port_data *pdata)
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
u32 port)
{
+ enum rdma_nl_notify_event_type etype;
struct net_device *old_ndev;
struct ib_port_data *pdata;
unsigned long flags;
int ret;
+ if (!rdma_is_port_valid(ib_dev, port))
+ return -EINVAL;
+
/*
* Drivers wish to call this before ib_register_driver, so we have to
* setup the port data early.
@@ -2154,9 +2202,6 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
if (ret)
return ret;
- if (!rdma_is_port_valid(ib_dev, port))
- return -EINVAL;
-
pdata = &ib_dev->port_data[port];
spin_lock_irqsave(&pdata->netdev_lock, flags);
old_ndev = rcu_dereference_protected(
@@ -2166,16 +2211,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
return 0;
}
- if (old_ndev)
- netdev_tracker_free(ndev, &pdata->netdev_tracker);
- if (ndev)
- netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC);
rcu_assign_pointer(pdata->netdev, ndev);
+ netdev_put(old_ndev, &pdata->netdev_tracker);
+ netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC);
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
add_ndev_hash(pdata);
- if (old_ndev)
- __dev_put(old_ndev);
+
+ /* Make sure that the device is registered before we send events */
+ if (xa_load(&devices, ib_dev->index) != ib_dev)
+ return 0;
+
+ etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
+ rdma_nl_notify_event(ib_dev, port, etype);
return 0;
}
@@ -2223,6 +2271,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
if (!rdma_is_port_valid(ib_dev, port))
return NULL;
+ if (!ib_dev->port_data)
+ return NULL;
+
pdata = &ib_dev->port_data[port];
/*
@@ -2235,22 +2286,40 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
spin_lock(&pdata->netdev_lock);
res = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
- if (res)
- dev_hold(res);
+ dev_hold(res);
spin_unlock(&pdata->netdev_lock);
}
- /*
- * If we are starting to unregister expedite things by preventing
- * propagation of an unregistering netdev.
- */
- if (res && res->reg_state != NETREG_REGISTERED) {
- dev_put(res);
- return NULL;
+ return res;
+}
+EXPORT_SYMBOL(ib_device_get_netdev);
+
+/**
+ * ib_query_netdev_port - Query the port number of a net_device
+ * associated with an ibdev
+ * @ibdev: IB device
+ * @ndev: Network device
+ * @port: IB port the net_device is connected to
+ */
+int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
+ u32 *port)
+{
+ struct net_device *ib_ndev;
+ u32 port_num;
+
+ rdma_for_each_port(ibdev, port_num) {
+ ib_ndev = ib_device_get_netdev(ibdev, port_num);
+ if (ndev == ib_ndev) {
+ *port = port_num;
+ dev_put(ib_ndev);
+ return 0;
+ }
+ dev_put(ib_ndev);
}
- return res;
+ return -ENOENT;
}
+EXPORT_SYMBOL(ib_query_netdev_port);
/**
* ib_device_get_by_netdev - Find an IB device associated with a netdev
@@ -2311,9 +2380,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);
-
- if (idev)
- dev_put(idev);
+ dev_put(idev);
}
}
@@ -2601,6 +2668,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
ops->uverbs_no_driver_id_binding;
SET_DEVICE_OP(dev_ops, add_gid);
+ SET_DEVICE_OP(dev_ops, add_sub_dev);
SET_DEVICE_OP(dev_ops, advise_mr);
SET_DEVICE_OP(dev_ops, alloc_dm);
SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
@@ -2617,6 +2685,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, counter_alloc_stats);
SET_DEVICE_OP(dev_ops, counter_bind_qp);
SET_DEVICE_OP(dev_ops, counter_dealloc);
+ SET_DEVICE_OP(dev_ops, counter_init);
SET_DEVICE_OP(dev_ops, counter_unbind_qp);
SET_DEVICE_OP(dev_ops, counter_update_stats);
SET_DEVICE_OP(dev_ops, create_ah);
@@ -2635,6 +2704,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, dealloc_ucontext);
SET_DEVICE_OP(dev_ops, dealloc_xrcd);
SET_DEVICE_OP(dev_ops, del_gid);
+ SET_DEVICE_OP(dev_ops, del_sub_dev);
SET_DEVICE_OP(dev_ops, dereg_mr);
SET_DEVICE_OP(dev_ops, destroy_ah);
SET_DEVICE_OP(dev_ops, destroy_counters);
@@ -2717,6 +2787,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, resize_cq);
SET_DEVICE_OP(dev_ops, set_vf_guid);
SET_DEVICE_OP(dev_ops, set_vf_link_state);
+ SET_DEVICE_OP(dev_ops, ufile_hw_cleanup);
+ SET_DEVICE_OP(dev_ops, report_port_event);
SET_OBJ_SIZE(dev_ops, ib_ah);
SET_OBJ_SIZE(dev_ops, ib_counters);
@@ -2728,9 +2800,59 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_OBJ_SIZE(dev_ops, ib_srq);
SET_OBJ_SIZE(dev_ops, ib_ucontext);
SET_OBJ_SIZE(dev_ops, ib_xrcd);
+ SET_OBJ_SIZE(dev_ops, rdma_counter);
}
EXPORT_SYMBOL(ib_set_device_ops);
+/**
+ * ib_add_sub_device - Create a sub device and link it to its parent
+ * @parent: IB device that the new sub device is attached to
+ * @type: device type stored in the new sub device
+ * @name: name passed through to the driver's add_sub_dev callback
+ *
+ * Takes a reference on @parent before calling the driver.  The reference
+ * is dropped again if the driver fails and is otherwise left held.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the driver does not implement both
+ * add_sub_dev and del_sub_dev, -EINVAL if no reference on @parent could
+ * be obtained, or the error encoded in the pointer returned by
+ * add_sub_dev.
+ */
+int ib_add_sub_device(struct ib_device *parent,
+		      enum rdma_nl_dev_type type,
+		      const char *name)
+{
+	struct ib_device *sub;
+
+	if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev)
+		return -EOPNOTSUPP;
+
+	if (!ib_device_try_get(parent))
+		return -EINVAL;
+
+	sub = parent->ops.add_sub_dev(parent, type, name);
+	if (IS_ERR(sub)) {
+		ib_device_put(parent);
+		return PTR_ERR(sub);
+	}
+
+	sub->type = type;
+	sub->parent = parent;
+
+	mutex_lock(&parent->subdev_lock);
+	/*
+	 * list_add_tail(new, head): the sub device's node is the new entry
+	 * and the parent's subdev_list_head is the list it is appended to.
+	 */
+	list_add_tail(&sub->subdev_list, &parent->subdev_list_head);
+	mutex_unlock(&parent->subdev_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_add_sub_device);
+
+/**
+ * ib_del_sub_device_and_put - Unlink and delete a sub device
+ * @sub: sub device to remove
+ *
+ * On success @sub is removed from its parent's sub device list, one
+ * reference on @sub is dropped, the parent driver's del_sub_dev callback
+ * is invoked for it, and one reference on the parent is dropped.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if @sub has no parent.
+ */
+int ib_del_sub_device_and_put(struct ib_device *sub)
+{
+	struct ib_device *owner = sub->parent;
+
+	if (owner == NULL)
+		return -EOPNOTSUPP;
+
+	/* The parent's list is only modified under the parent's lock. */
+	mutex_lock(&owner->subdev_lock);
+	list_del(&sub->subdev_list);
+	mutex_unlock(&owner->subdev_lock);
+
+	ib_device_put(sub);
+	owner->ops.del_sub_dev(sub);
+	ib_device_put(owner);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_del_sub_device_and_put);
+
#ifdef CONFIG_INFINIBAND_VIRT_DMA
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
{
@@ -2761,6 +2883,97 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
},
};
+/*
+ * Dispatch IB_EVENT_PORT_ERR or IB_EVENT_PORT_ACTIVE for the port that
+ * @ndev is attached to, but only if the state obtained from @ndev differs
+ * from the state last cached for that port.  Returns without doing
+ * anything when ib_query_netdev_port() cannot map @ndev to a port of
+ * @ibdev.
+ */
+void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev)
+{
+	enum ib_port_state new_state;
+	struct ib_event ev = {};
+	bool changed;
+	u32 port_num;
+
+	if (ib_query_netdev_port(ibdev, ndev, &port_num))
+		return;
+
+	new_state = ib_get_curr_port_state(ndev);
+
+	/* Compare and update the cached state in one cache_lock section. */
+	write_lock_irq(&ibdev->cache_lock);
+	changed = ibdev->port_data[port_num].cache.last_port_state != new_state;
+	if (changed)
+		ibdev->port_data[port_num].cache.last_port_state = new_state;
+	write_unlock_irq(&ibdev->cache_lock);
+
+	if (!changed)
+		return;
+
+	if (new_state == IB_PORT_DOWN)
+		ev.event = IB_EVENT_PORT_ERR;
+	else
+		ev.event = IB_EVENT_PORT_ACTIVE;
+	ev.device = ibdev;
+	ev.element.port_num = port_num;
+	ib_dispatch_event(&ev);
+}
+EXPORT_SYMBOL(ib_dispatch_port_state_event);
+
+/*
+ * Handle a netdev link event for the IB device associated with @ndev.
+ *
+ * If the driver provides report_port_event, the event is handed to it;
+ * otherwise ib_dispatch_port_state_event() is called.  Netdevs that are
+ * LAG masters or LAG ports are ignored, as are netdevs with no associated
+ * IB device.
+ */
+static void handle_port_event(struct net_device *ndev, unsigned long event)
+{
+	struct ib_device *ibdev;
+
+	/* Currently, link events in bonding scenarios are still
+	 * reported by drivers that support bonding.
+	 */
+	if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev))
+		return;
+
+	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
+	if (!ibdev)
+		return;
+
+	if (ibdev->ops.report_port_event)
+		ibdev->ops.report_port_event(ibdev, ndev, event);
+	else
+		ib_dispatch_port_state_event(ibdev, ndev);
+
+	/* Pairs with ib_device_get_by_netdev() above. */
+	ib_device_put(ibdev);
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * NETDEV_CHANGENAME: if the netdev maps to an IB device and one of its
+ * ports, send RDMA_NETDEV_RENAME_EVENT for that port.
+ * NETDEV_UP, NETDEV_CHANGE, NETDEV_DOWN: forward to handle_port_event().
+ * All other events are ignored.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int ib_netdevice_event(struct notifier_block *this,
+			      unsigned long event, void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct ib_device *ibdev;
+	u32 port_num;
+
+	if (event == NETDEV_UP || event == NETDEV_CHANGE ||
+	    event == NETDEV_DOWN) {
+		handle_port_event(ndev, event);
+		return NOTIFY_DONE;
+	}
+
+	if (event != NETDEV_CHANGENAME)
+		return NOTIFY_DONE;
+
+	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
+	if (!ibdev)
+		return NOTIFY_DONE;
+
+	/* A zero return means the port number was found. */
+	if (!ib_query_netdev_port(ibdev, ndev, &port_num))
+		rdma_nl_notify_event(ibdev, port_num, RDMA_NETDEV_RENAME_EVENT);
+
+	ib_device_put(ibdev);
+	return NOTIFY_DONE;
+}
+
+/* Notifier block whose callback is ib_netdevice_event(). */
+static struct notifier_block nb_netdevice = {
+	.notifier_call = ib_netdevice_event,
+};
+
static int __init ib_core_init(void)
{
int ret = -ENOMEM;
@@ -2832,6 +3045,8 @@ static int __init ib_core_init(void)
goto err_parent;
}
+ register_netdevice_notifier(&nb_netdevice);
+
return 0;
err_parent:
@@ -2861,6 +3076,7 @@ err:
static void __exit ib_core_cleanup(void)
{
+ unregister_netdevice_notifier(&nb_netdevice);
roce_gid_mgmt_cleanup();
rdma_nl_unregister(RDMA_NL_LS);
nldev_exit();
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 0301fcad4b48..62410578dec3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -109,7 +109,9 @@ static struct ctl_table iwcm_ctl_table[] = {
.data = &default_backlog,
.maxlen = sizeof(default_backlog),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
};
@@ -143,8 +145,8 @@ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
if (list_empty(&cm_id_priv->work_free_list))
return NULL;
- work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
- free_list);
+ work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work,
+ free_list);
list_del_init(&work->free_list);
return work;
}
@@ -206,17 +208,17 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
/*
* Release a reference on cm_id. If the last reference is being
- * released, free the cm_id and return 1.
+ * released, free the cm_id and return 'true'.
*/
-static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
if (refcount_dec_and_test(&cm_id_priv->refcount)) {
BUG_ON(!list_empty(&cm_id_priv->work_list));
free_cm_id(cm_id_priv);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static void add_ref(struct iw_cm_id *cm_id)
@@ -366,8 +368,7 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
+ * Clean up all resources associated with the connection.
*/
static void destroy_cm_id(struct iw_cm_id *cm_id)
{
@@ -438,19 +439,22 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
-
- (void)iwcm_deref_id(cm_id_priv);
}
/*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
destroy_cm_id(cm_id);
+ if (refcount_read(&cm_id_priv->refcount) > 1)
+ flush_workqueue(iwcm_wq);
+ iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -1017,30 +1021,27 @@ static void cm_work_handler(struct work_struct *_work)
struct iw_cm_event levent;
struct iwcm_id_private *cm_id_priv = work->cm_id;
unsigned long flags;
- int empty;
int ret = 0;
spin_lock_irqsave(&cm_id_priv->lock, flags);
- empty = list_empty(&cm_id_priv->work_list);
- while (!empty) {
- work = list_entry(cm_id_priv->work_list.next,
- struct iwcm_work, list);
+ while (!list_empty(&cm_id_priv->work_list)) {
+ work = list_first_entry(&cm_id_priv->work_list,
+ struct iwcm_work, list);
list_del_init(&work->list);
- empty = list_empty(&cm_id_priv->work_list);
levent = work->event;
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
- if (ret)
+ if (ret) {
destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
return;
- if (empty)
- return;
spin_lock_irqsave(&cm_id_priv->lock, flags);
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -1093,11 +1094,8 @@ static int cm_event_handler(struct iw_cm_id *cm_id,
}
refcount_inc(&cm_id_priv->refcount);
- if (list_empty(&cm_id_priv->work_list)) {
- list_add_tail(&work->list, &cm_id_priv->work_list);
- queue_work(iwcm_wq, &work->work);
- } else
- list_add_tail(&work->list, &cm_id_priv->work_list);
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
@@ -1187,7 +1185,7 @@ static int __init iw_cm_init(void)
if (ret)
return ret;
- iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0);
+ iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
if (!iwcm_wq)
goto err_alloc;
diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c
index eca6e37c72ba..8fd80adfe833 100644
--- a/drivers/infiniband/core/lag.c
+++ b/drivers/infiniband/core/lag.c
@@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
slave = netdev_get_xmit_slave(master, skb,
!!(device->lag_flags &
RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
- if (slave)
- dev_hold(slave);
+ dev_hold(slave);
rcu_read_unlock();
kfree_skb(skb);
return slave;
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 674344eb8e2f..73f3a0b9a54b 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2616,14 +2616,16 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
static void timeout_sends(struct work_struct *work)
{
+ struct ib_mad_send_wr_private *mad_send_wr, *n;
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_send_wc mad_send_wc;
+ struct list_head local_list;
unsigned long flags, delay;
mad_agent_priv = container_of(work, struct ib_mad_agent_private,
timed_work.work);
mad_send_wc.vendor_err = 0;
+ INIT_LIST_HEAD(&local_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->wait_list)) {
@@ -2641,13 +2643,16 @@ static void timeout_sends(struct work_struct *work)
break;
}
- list_del(&mad_send_wr->agent_list);
+ list_del_init(&mad_send_wr->agent_list);
if (mad_send_wr->status == IB_WC_SUCCESS &&
!retry_send(mad_send_wr))
continue;
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list, &local_list);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) {
if (mad_send_wr->status == IB_WC_SUCCESS)
mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
else
@@ -2655,11 +2660,8 @@ static void timeout_sends(struct work_struct *work)
mad_send_wc.send_buf = &mad_send_wr->send_buf;
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
&mad_send_wc);
-
deref_mad_agent(mad_agent_priv);
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
}
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
/*
@@ -2669,11 +2671,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad)
{
unsigned long flags;
- int post, ret;
struct ib_mad_private *mad_priv;
struct ib_sge sg_list;
struct ib_recv_wr recv_wr;
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+ int ret = 0;
/* Initialize common scatter list fields */
sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
@@ -2683,7 +2685,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
recv_wr.sg_list = &sg_list;
recv_wr.num_sge = 1;
- do {
+ while (true) {
/* Allocate and map receive buffer */
if (mad) {
mad_priv = mad;
@@ -2691,10 +2693,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
} else {
mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
GFP_ATOMIC);
- if (!mad_priv) {
- ret = -ENOMEM;
- break;
- }
+ if (!mad_priv)
+ return -ENOMEM;
}
sg_list.length = mad_priv_dma_size(mad_priv);
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
@@ -2703,37 +2703,41 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
sg_list.addr))) {
- kfree(mad_priv);
ret = -ENOMEM;
- break;
+ goto free_mad_priv;
}
mad_priv->header.mapping = sg_list.addr;
mad_priv->header.mad_list.mad_queue = recv_queue;
mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
-
- /* Post receive WR */
spin_lock_irqsave(&recv_queue->lock, flags);
- post = (++recv_queue->count < recv_queue->max_active);
- list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ if (recv_queue->count >= recv_queue->max_active) {
+ /* Fully populated the receive queue */
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ break;
+ }
+ recv_queue->count++;
+ list_add_tail(&mad_priv->header.mad_list.list,
+ &recv_queue->list);
spin_unlock_irqrestore(&recv_queue->lock, flags);
+
ret = ib_post_recv(qp_info->qp, &recv_wr, NULL);
if (ret) {
spin_lock_irqsave(&recv_queue->lock, flags);
list_del(&mad_priv->header.mad_list.list);
recv_queue->count--;
spin_unlock_irqrestore(&recv_queue->lock, flags);
- ib_dma_unmap_single(qp_info->port_priv->device,
- mad_priv->header.mapping,
- mad_priv_dma_size(mad_priv),
- DMA_FROM_DEVICE);
- kfree(mad_priv);
dev_err(&qp_info->port_priv->device->dev,
"ib_post_recv failed: %d\n", ret);
break;
}
- } while (post);
+ }
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE);
+free_mad_priv:
+ kfree(mad_priv);
return ret;
}
@@ -2937,7 +2941,6 @@ static int ib_mad_port_open(struct ib_device *device,
int ret, cq_size;
struct ib_mad_port_private *port_priv;
unsigned long flags;
- char name[sizeof "ib_mad123"];
int has_smi;
if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
@@ -2983,12 +2986,15 @@ static int ib_mad_port_open(struct ib_device *device,
if (ret)
goto error6;
}
- ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
- if (ret)
- goto error7;
- snprintf(name, sizeof(name), "ib_mad%u", port_num);
- port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (rdma_cap_ib_cm(device, port_num)) {
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+ }
+
+ port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM,
+ port_num);
if (!port_priv->wq) {
ret = -ENOMEM;
goto error8;
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 8af0619a39cd..b4b10e8a6495 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
- return (void *) ah;
+ return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index ae2db0c70788..def14c54b648 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet)
struct net *net = read_pnet(&rnet->net);
struct netlink_kernel_cfg cfg = {
.input = rdma_nl_rcv,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
struct sock *nls;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 4900a0848124..a872643e8039 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -137,6 +137,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
[RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED },
@@ -164,6 +166,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 },
[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING },
+ [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -298,6 +306,19 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
return -EMSGSIZE;
+ if (device->type &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type))
+ return -EMSGSIZE;
+
+ if (device->parent &&
+ nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME,
+ dev_name(&device->parent->dev)))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE,
+ device->name_assign_type))
+ return -EMSGSIZE;
+
/*
* Link type is determined on first port and mlx4 device
* which can potentially have two different link type for the same
@@ -399,7 +420,8 @@ err:
return -EMSGSIZE;
}
-static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device,
+ bool show_details)
{
static const char * const names[RDMA_RESTRACK_MAX] = {
[RDMA_RESTRACK_PD] = "pd",
@@ -424,7 +446,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
if (!names[i])
continue;
- curr = rdma_restrack_count(device, i);
+ curr = rdma_restrack_count(device, i, show_details);
ret = fill_res_info_entry(msg, names[i], curr);
if (ret)
goto err;
@@ -1054,8 +1076,8 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1103,8 +1125,8 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1195,8 +1217,8 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 port;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err ||
!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
!tb[RDMA_NLDEV_ATTR_PORT_INDEX])
@@ -1255,8 +1277,8 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
int err;
unsigned int p;
- err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1305,13 +1327,14 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ bool show_details = false;
struct ib_device *device;
struct sk_buff *msg;
u32 index;
int ret;
- ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1320,6 +1343,9 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!device)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+ show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
@@ -1334,7 +1360,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_free;
}
- ret = fill_res_info(msg, device);
+ ret = fill_res_info(msg, device, show_details);
if (ret)
goto err_free;
@@ -1364,7 +1390,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device,
RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
0, NLM_F_MULTI);
- if (!nlh || fill_res_info(skb, device)) {
+ if (!nlh || fill_res_info(skb, device, false)) {
nlmsg_cancel(skb, nlh);
goto out;
}
@@ -1457,8 +1483,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct sk_buff *msg;
int ret;
- ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
return -EINVAL;
@@ -1534,6 +1560,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
struct rdma_restrack_entry *res;
struct rdma_restrack_root *rt;
int err, ret = 0, idx = 0;
+ bool show_details = false;
struct nlattr *table_attr;
struct nlattr *entry_attr;
struct ib_device *device;
@@ -1544,8 +1571,8 @@ static int res_get_common_dumpit(struct sk_buff *skb,
u32 index, port = 0;
bool filled = false;
- err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
/*
* Right now, we are expecting the device index to get res information,
* but it is possible to extend this code to return all devices in
@@ -1562,6 +1589,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
if (!device)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+ show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
/*
* If no PORT_INDEX is supplied, we will return all QPs from that device
*/
@@ -1599,6 +1629,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
* objects.
*/
xa_for_each(&rt->xa, id, res) {
+ if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details)
+ goto next;
+
if (idx < start || !rdma_restrack_get(res))
goto next;
@@ -1731,8 +1764,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
char type[IFNAMSIZ];
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
!tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
return -EINVAL;
@@ -1775,8 +1808,8 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
return -EINVAL;
@@ -1805,8 +1838,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 index;
int err;
- err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
- extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+ NL_VALIDATE_LIBERAL, extack);
if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
return -EINVAL;
@@ -1889,8 +1922,8 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct sk_buff *msg;
int err;
- err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (err)
return err;
@@ -1920,6 +1953,12 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nlmsg_free(msg);
return err;
}
+
+ err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
/*
* Copy-on-fork is supported.
* See commits:
@@ -1990,6 +2029,7 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg,
struct ib_device *device, u32 port)
{
u32 mode, mask = 0, qpn, cntn = 0;
+ bool opcnt = false;
int ret;
/* Currently only counter for QP is supported */
@@ -1997,12 +2037,17 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg,
nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
return -EINVAL;
+ if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED])
+ opcnt = !!nla_get_u8(
+ tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]);
+
mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
if (mode == RDMA_COUNTER_MODE_AUTO) {
if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
mask = nla_get_u32(
tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
- return rdma_counter_set_auto_mode(device, port, mask, extack);
+ return rdma_counter_set_auto_mode(device, port, mask, opcnt,
+ extack);
}
if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
@@ -2320,6 +2365,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
struct ib_device *device;
struct sk_buff *msg;
u32 index, port;
+ bool opcnt;
int ret;
if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
@@ -2355,7 +2401,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_msg;
}
- ret = rdma_counter_get_mode(device, port, &mode, &mask);
+ ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt);
if (ret)
goto err_msg;
@@ -2372,6 +2418,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_msg;
}
+ if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
@@ -2389,8 +2441,8 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
int ret;
- ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret)
return -EINVAL;
@@ -2419,8 +2471,8 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb,
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
int ret;
- ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, NULL);
+ ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, NULL);
if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
return -EINVAL;
@@ -2451,8 +2503,8 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb,
u32 devid, port;
int ret, i;
- ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
- nldev_policy, extack);
+ ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NL_VALIDATE_LIBERAL, extack);
if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
!tb[RDMA_NLDEV_ATTR_PORT_INDEX])
return -EINVAL;
@@ -2533,6 +2585,56 @@ err:
return ret;
}
+/*
+ * Netlink handler for RDMA_NLDEV_CMD_NEWDEV: add a sub device of the requested
+ * type and name to the parent IB device given by RDMA_NLDEV_ATTR_DEV_INDEX.
+ *
+ * Returns -EINVAL when the message fails to parse, one of the DEV_INDEX,
+ * DEV_NAME or DEV_TYPE attributes is missing, or the parent is not found;
+ * otherwise the result of ib_add_sub_device().
+ */
+static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	enum rdma_nl_dev_type sub_type;
+	struct ib_device *parent_dev;
+	char sub_name[IFNAMSIZ] = {};
+	u32 parent_index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err)
+		return -EINVAL;
+
+	/* All three attributes are mandatory. */
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_TYPE])
+		return -EINVAL;
+
+	nla_strscpy(sub_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(sub_name));
+	sub_type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]);
+	parent_index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+
+	parent_dev = ib_device_get_by_index(sock_net(skb->sk), parent_index);
+	if (!parent_dev)
+		return -EINVAL;
+
+	err = ib_add_sub_device(parent_dev, sub_type, sub_name);
+	/* Drop the reference taken by the lookup whether or not the add worked. */
+	ib_device_put(parent_dev);
+
+	return err;
+}
+
+/*
+ * Netlink handler for RDMA_NLDEV_CMD_DELDEV: look up the device given by
+ * RDMA_NLDEV_ATTR_DEV_INDEX and pass it to ib_del_sub_device_and_put().
+ *
+ * Returns -EINVAL when the message fails to parse, the index attribute is
+ * missing or the device is not found; otherwise the callee's result.
+ */
+static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *sub_dev;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err)
+		return -EINVAL;
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	sub_dev = ib_device_get_by_index(sock_net(skb->sk),
+					 nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]));
+	if (!sub_dev)
+		return -EINVAL;
+
+	/*
+	 * No ib_device_put() here: the lookup reference is handed to the
+	 * callee (per its name it is expected to drop it - confirm there).
+	 */
+	return ib_del_sub_device_and_put(sub_dev);
+}
+
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_GET] = {
.doit = nldev_get_doit,
@@ -2631,8 +2733,178 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_STAT_GET_STATUS] = {
.doit = nldev_stat_get_counter_status_doit,
},
+ [RDMA_NLDEV_CMD_NEWDEV] = {
+ .doit = nldev_newdev,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_DELDEV] = {
+ .doit = nldev_deldev,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
};
+/*
+ * Add the netdev index and name of @port's net device to a monitor message.
+ *
+ * Nothing is added, and 0 is returned, when the port has no netdev or the
+ * netdev is not in @net. Otherwise returns the result of the nla_put_*()
+ * calls.
+ */
+static int fill_mon_netdev_rename(struct sk_buff *msg,
+				  struct ib_device *device, u32 port,
+				  const struct net *net)
+{
+	struct net_device *ndev = ib_device_get_netdev(device, port);
+	int err = 0;
+
+	if (ndev && net_eq(dev_net(ndev), net)) {
+		err = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX,
+				  ndev->ifindex);
+		if (!err)
+			err = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME,
+					     ndev->name);
+	}
+
+	/* Called unconditionally, including when ndev is NULL. */
+	dev_put(ndev);
+	return err;
+}
+
+/*
+ * Add device index, device name and port index to a monitor message and,
+ * when the port currently has a netdev, that netdev's index and name.
+ *
+ * If the port has a netdev that is not in @net, nothing is added and 0 is
+ * returned. Otherwise returns the first nla_put_*() failure, or 0.
+ */
+static int fill_mon_netdev_association(struct sk_buff *msg,
+				       struct ib_device *device, u32 port,
+				       const struct net *net)
+{
+	struct net_device *ndev = ib_device_get_netdev(device, port);
+	int err = 0;
+
+	if (ndev && !net_eq(dev_net(ndev), net))
+		goto put_ndev;
+
+	err = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
+	if (!err)
+		err = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+				     dev_name(&device->dev));
+	if (!err)
+		err = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port);
+
+	/* The netdev attributes are only present when a netdev is attached. */
+	if (err || !ndev)
+		goto put_ndev;
+
+	err = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, ndev->ifindex);
+	if (!err)
+		err = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME,
+				     ndev->name);
+
+put_ndev:
+	dev_put(ndev);
+	return err;
+}
+
+/*
+ * Emit a rate-limited warning saying that the monitor event @type could not
+ * be sent for @device / @port_num.
+ *
+ * The attach and netdev-rename messages include the netdev ifindex. The
+ * netdev lookup can return NULL (the fill_mon_*() helpers check for that),
+ * so in that case the message is printed without the ifindex rather than
+ * dereferencing a NULL pointer.
+ */
+static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num,
+				   enum rdma_nl_notify_event_type type)
+{
+	struct net_device *netdev;
+
+	switch (type) {
+	case RDMA_REGISTER_EVENT:
+		dev_warn_ratelimited(&device->dev,
+				     "Failed to send RDMA monitor register device event\n");
+		break;
+	case RDMA_UNREGISTER_EVENT:
+		dev_warn_ratelimited(&device->dev,
+				     "Failed to send RDMA monitor unregister device event\n");
+		break;
+	case RDMA_NETDEV_ATTACH_EVENT:
+		netdev = ib_device_get_netdev(device, port_num);
+		if (netdev) {
+			dev_warn_ratelimited(&device->dev,
+					     "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n",
+					     port_num, netdev->ifindex);
+			dev_put(netdev);
+		} else {
+			dev_warn_ratelimited(&device->dev,
+					     "Failed to send RDMA monitor netdev attach event: port %d\n",
+					     port_num);
+		}
+		break;
+	case RDMA_NETDEV_DETACH_EVENT:
+		dev_warn_ratelimited(&device->dev,
+				     "Failed to send RDMA monitor netdev detach event: port %d\n",
+				     port_num);
+		break;
+	case RDMA_RENAME_EVENT:
+		dev_warn_ratelimited(&device->dev,
+				     "Failed to send RDMA monitor rename device event\n");
+		break;
+	case RDMA_NETDEV_RENAME_EVENT:
+		netdev = ib_device_get_netdev(device, port_num);
+		if (netdev) {
+			dev_warn_ratelimited(&device->dev,
+					     "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n",
+					     port_num, netdev->ifindex);
+			dev_put(netdev);
+		} else {
+			dev_warn_ratelimited(&device->dev,
+					     "Failed to send RDMA monitor netdev rename event: port %d\n",
+					     port_num);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * rdma_nl_notify_event - multicast an RDMA monitor event
+ * @device: IB device the event refers to
+ * @port_num: port the event refers to (passed to the netdev event fillers)
+ * @type: event type; selects which attributes are put in the message
+ *
+ * Builds an RDMA_NLDEV_CMD_MONITOR message in the device's net namespace,
+ * fills the per-event attributes, appends RDMA_NLDEV_ATTR_EVENT_TYPE and
+ * multicasts it to RDMA_NL_GROUP_NOTIFY.
+ *
+ * Return: 0 on success or when the multicast returns -ESRCH; -EINVAL when the
+ * device has no net namespace; -ENOMEM when the skb cannot be allocated;
+ * -EMSGSIZE when the message header cannot be added; otherwise the error
+ * from the attribute fill or the multicast. Every failure after the skb was
+ * allocated also logs a warning through rdma_nl_notify_err_msg().
+ */
+int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
+ enum rdma_nl_notify_event_type type)
+{
+ struct sk_buff *skb;
+ /* Default error, returned as-is when nlmsg_put() fails below. */
+ int ret = -EMSGSIZE;
+ struct net *net;
+ void *nlh;
+
+ net = read_pnet(&device->coredev.rdma_net);
+ if (!net)
+ return -EINVAL;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ nlh = nlmsg_put(skb, 0, 0,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR),
+ 0, 0);
+ if (!nlh)
+ goto err_free;
+
+ /* Device-level events carry the device handle; netdev events carry
+ * the port/netdev association or the renamed netdev.
+ */
+ switch (type) {
+ case RDMA_REGISTER_EVENT:
+ case RDMA_UNREGISTER_EVENT:
+ case RDMA_RENAME_EVENT:
+ ret = fill_nldev_handle(skb, device);
+ if (ret)
+ goto err_free;
+ break;
+ case RDMA_NETDEV_ATTACH_EVENT:
+ case RDMA_NETDEV_DETACH_EVENT:
+ ret = fill_mon_netdev_association(skb, device, port_num, net);
+ if (ret)
+ goto err_free;
+ break;
+ case RDMA_NETDEV_RENAME_EVENT:
+ ret = fill_mon_netdev_rename(skb, device, port_num, net);
+ if (ret)
+ goto err_free;
+ break;
+ default:
+ /* Unknown types still get a message carrying only the event type. */
+ break;
+ }
+
+ ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(skb, nlh);
+ ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL);
+ /* -ESRCH is not treated as a failure (presumably: nobody is
+ * subscribed to the group - confirm against rdma_nl_multicast()).
+ */
+ if (ret && ret != -ESRCH) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ goto err_free;
+ }
+ return 0;
+
+err_free:
+ rdma_nl_notify_err_msg(device, port_num, type);
+ /* skb is NULL here when the send path already consumed it. */
+ nlmsg_free(skb);
+ return ret;
+}
+
void __init nldev_init(void)
{
rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 29b1ab1d5f93..90c177edf9b0 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -58,8 +58,8 @@ void uverbs_uobject_put(struct ib_uobject *uobject)
}
EXPORT_SYMBOL(uverbs_uobject_put);
-static int uverbs_try_lock_object(struct ib_uobject *uobj,
- enum rdma_lookup_mode mode)
+int uverbs_try_lock_object(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
{
/*
* When a shared access is required, we use a positive counter. Each
@@ -84,6 +84,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj,
}
return 0;
}
+EXPORT_SYMBOL(uverbs_try_lock_object);
static void assert_uverbs_usecnt(struct ib_uobject *uobj,
enum rdma_lookup_mode mode)
@@ -880,9 +881,14 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
enum rdma_remove_reason reason)
{
+ struct uverbs_attr_bundle attrs = { .ufile = ufile };
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ struct ib_device *ib_dev = ucontext->device;
struct ib_uobject *obj, *next_obj;
int ret = -EINVAL;
- struct uverbs_attr_bundle attrs = { .ufile = ufile };
+
+ if (ib_dev->ops.ufile_hw_cleanup)
+ ib_dev->ops.ufile_hw_cleanup(ufile);
/*
* This shouldn't run while executing other commands on this
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 01a499a8b88d..3313410014cd 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -37,22 +37,6 @@ int rdma_restrack_init(struct ib_device *dev)
return 0;
}
-static const char *type2str(enum rdma_restrack_type type)
-{
- static const char * const names[RDMA_RESTRACK_MAX] = {
- [RDMA_RESTRACK_PD] = "PD",
- [RDMA_RESTRACK_CQ] = "CQ",
- [RDMA_RESTRACK_QP] = "QP",
- [RDMA_RESTRACK_CM_ID] = "CM_ID",
- [RDMA_RESTRACK_MR] = "MR",
- [RDMA_RESTRACK_CTX] = "CTX",
- [RDMA_RESTRACK_COUNTER] = "COUNTER",
- [RDMA_RESTRACK_SRQ] = "SRQ",
- };
-
- return names[type];
-};
-
/**
* rdma_restrack_clean() - clean resource tracking
* @dev: IB device
@@ -60,47 +44,14 @@ static const char *type2str(enum rdma_restrack_type type)
void rdma_restrack_clean(struct ib_device *dev)
{
struct rdma_restrack_root *rt = dev->res;
- struct rdma_restrack_entry *e;
- char buf[TASK_COMM_LEN];
- bool found = false;
- const char *owner;
int i;
for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
struct xarray *xa = &dev->res[i].xa;
- if (!xa_empty(xa)) {
- unsigned long index;
-
- if (!found) {
- pr_err("restrack: %s", CUT_HERE);
- dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
- }
- xa_for_each(xa, index, e) {
- if (rdma_is_kernel_res(e)) {
- owner = e->kern_name;
- } else {
- /*
- * There is no need to call get_task_struct here,
- * because we can be here only if there are more
- * get_task_struct() call than put_task_struct().
- */
- get_task_comm(buf, e->task);
- owner = buf;
- }
-
- pr_err("restrack: %s %s object allocated by %s is not freed\n",
- rdma_is_kernel_res(e) ? "Kernel" :
- "User",
- type2str(e->type), owner);
- }
- found = true;
- }
+ WARN_ON(!xa_empty(xa));
xa_destroy(xa);
}
- if (found)
- pr_err("restrack: %s", CUT_HERE);
-
kfree(rt);
}
@@ -108,8 +59,10 @@ void rdma_restrack_clean(struct ib_device *dev)
* rdma_restrack_count() - the current usage of specific object
* @dev: IB device
* @type: actual type of object to operate
+ * @show_details: count driver specific objects
*/
-int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
+ bool show_details)
{
struct rdma_restrack_root *rt = &dev->res[type];
struct rdma_restrack_entry *e;
@@ -117,8 +70,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
u32 cnt = 0;
xa_lock(&rt->xa);
- xas_for_each(&xas, e, U32_MAX)
+ xas_for_each(&xas, e, U32_MAX) {
+ if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details)
+ continue;
cnt++;
+ }
xa_unlock(&rt->xa);
return cnt;
}
@@ -247,6 +203,9 @@ void rdma_restrack_add(struct rdma_restrack_entry *res)
ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL);
if (ret)
res->id = 0;
+
+ if (qp->qp_type >= IB_QPT_DRIVER)
+ xa_set_mark(&rt->xa, res->id, RESTRACK_DD);
} else if (res->type == RDMA_RESTRACK_COUNTER) {
/* Special case to ensure that cntn points to right counter */
struct rdma_counter *counter;
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index e958c43dd28f..a9f2c6b1b29e 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -515,6 +515,27 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev)
}
EXPORT_SYMBOL(rdma_roce_rescan_device);
+/**
+ * rdma_roce_rescan_port - Rescan the GIDs of a single port of a RoCE device.
+ *
+ * @ib_dev: IB device
+ * @port: Port number
+ *
+ * Does nothing unless @port runs RoCE and currently has a net device. In that
+ * case the port's net device is handed to enum_all_gids_of_dev_cb() and the
+ * reference taken on it is dropped afterwards.
+ */
+void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port)
+{
+	struct net_device *port_ndev;
+
+	if (!rdma_protocol_roce(ib_dev, port))
+		return;
+
+	port_ndev = ib_device_get_netdev(ib_dev, port);
+	if (!port_ndev)
+		return;
+
+	enum_all_gids_of_dev_cb(ib_dev, port, port_ndev, port_ndev);
+	dev_put(port_ndev);
+}
+EXPORT_SYMBOL(rdma_roce_rescan_port);
+
static void callback_for_addr_gid_device_scan(struct ib_device *device,
u32 port,
struct net_device *rdma_ndev,
@@ -575,16 +596,17 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u32 port,
}
}
-static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
- struct net_device *event_ndev)
+void roce_del_all_netdev_gids(struct ib_device *ib_dev,
+ u32 port, struct net_device *ndev)
{
- ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev);
}
+EXPORT_SYMBOL(roce_del_all_netdev_gids);
static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
struct net_device *rdma_ndev, void *cookie)
{
- handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+ handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids);
}
static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
@@ -601,8 +623,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
rcu_read_lock();
master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
- if (master_ndev)
- dev_hold(master_ndev);
+ dev_hold(master_ndev);
rcu_read_unlock();
if (master_ndev) {
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 8175dde60b0a..53571e6b3162 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -1420,7 +1420,7 @@ enum opa_pr_supported {
/*
* opa_pr_query_possible - Check if current PR query can be an OPA query.
*
- * Retuns PR_NOT_SUPPORTED if a path record query is not
+ * Returns PR_NOT_SUPPORTED if a path record query is not
* possible, PR_OPA_SUPPORTED if an OPA path record query
* is possible and PR_IB_SUPPORTED if an IB path record
* query is possible.
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 9f97bef02149..0ed862b38b44 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -216,24 +216,12 @@ static ssize_t state_show(struct ib_device *ibdev, u32 port_num,
struct ib_port_attr attr;
ssize_t ret;
- static const char *state_name[] = {
- [IB_PORT_NOP] = "NOP",
- [IB_PORT_DOWN] = "DOWN",
- [IB_PORT_INIT] = "INIT",
- [IB_PORT_ARMED] = "ARMED",
- [IB_PORT_ACTIVE] = "ACTIVE",
- [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
- };
-
ret = ib_query_port(ibdev, port_num, &attr);
if (ret)
return ret;
return sysfs_emit(buf, "%d: %s\n", attr.state,
- attr.state >= 0 &&
- attr.state < ARRAY_SIZE(state_name) ?
- state_name[attr.state] :
- "UNKNOWN");
+ ib_port_state_to_str(attr.state));
}
static ssize_t lid_show(struct ib_device *ibdev, u32 port_num,
@@ -988,6 +976,7 @@ int ib_setup_device_attrs(struct ib_device *ibdev)
for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++)
if (!ibdev->groups[i]) {
ibdev->groups[i] = &data->group;
+ ibdev->hw_stats_attr_index = i;
return 0;
}
WARN(true, "struct ib_device->groups is too small");
diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c
new file mode 100644
index 000000000000..de5cb8bf0a61
--- /dev/null
+++ b/drivers/infiniband/core/ucaps.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/kref.h>
+#include <linux/cdev.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <rdma/ib_ucaps.h>
+
+#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL
+
+static DEFINE_MUTEX(ucaps_mutex);
+static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX];
+static bool ucaps_class_is_registered;
+static dev_t ucaps_base_dev;
+
+/*
+ * One user-capability (ucap) character device. At most one instance exists
+ * per enum rdma_user_cap value; it is stored in ucaps_list[] and freed from
+ * the device release callback (ucap_dev_release()).
+ */
+struct ib_ucap {
+ struct cdev cdev; /* character device, added with cdev_device_add() */
+ struct device dev; /* embedded device; its release callback frees the ucap */
+ struct kref ref; /* ib_create_ucap() calls not yet undone by ib_remove_ucap() */
+};
+
+/*
+ * Device names, indexed by enum rdma_user_cap. Used as the device name and
+ * therefore as the node name under /dev/infiniband/. The table itself is
+ * never modified, so the pointers are const as well as the strings.
+ */
+static const char * const ucap_names[RDMA_UCAP_MAX] = {
+	[RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local",
+	[RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca"
+};
+
+/*
+ * devnode callback of ucaps_class: name the node "infiniband/<device name>"
+ * and, when the caller asks for a mode, make it 0600.
+ *
+ * Returns the kasprintf()-allocated node name (NULL if that allocation fails).
+ */
+static char *ucaps_devnode(const struct device *dev, umode_t *mode)
+{
+	const char *node_name = dev_name(dev);
+
+	if (mode)
+		*mode = 0600;
+
+	return kasprintf(GFP_KERNEL, "infiniband/%s", node_name);
+}
+
+/*
+ * Device class of all ucap devices. Registered on first use by
+ * ib_ucaps_init() and unregistered by ib_cleanup_ucaps(); its devnode
+ * callback places the nodes under infiniband/ with mode 0600.
+ */
+static const struct class ucaps_class = {
+ .name = "infiniband_ucaps",
+ .devnode = ucaps_devnode,
+};
+
+/*
+ * File operations of a ucap character device. Only open is provided; no
+ * read/write/ioctl handlers are set here. ib_get_ucaps() identifies an open
+ * file by the device number of its inode.
+ */
+static const struct file_operations ucaps_cdev_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+};
+
+/**
+ * ib_cleanup_ucaps - cleanup all API resources and class.
+ *
+ * This is called once, when removing the ib_uverbs module.
+ *
+ * Does nothing when the class was never registered. Otherwise warns about
+ * every ucap that is still present, then unregisters the class and releases
+ * the character device region.
+ */
+void ib_cleanup_ucaps(void)
+{
+	int type;
+
+	mutex_lock(&ucaps_mutex);
+	if (ucaps_class_is_registered) {
+		/* Every ucap should have been removed by now. */
+		for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++)
+			WARN_ON(ucaps_list[type]);
+
+		class_unregister(&ucaps_class);
+		ucaps_class_is_registered = false;
+		unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX);
+	}
+	mutex_unlock(&ucaps_mutex);
+}
+
+/*
+ * Find the ucap whose device number is @devt and set its bit in @idx_mask.
+ *
+ * The bit is built with a 64-bit shift so the value matches the width of
+ * the mask instead of being computed as a plain int.
+ *
+ * Returns 0 when a matching ucap exists, -EINVAL otherwise (@idx_mask is
+ * then left untouched). ib_get_ucaps() calls this with ucaps_mutex held.
+ */
+static int get_ucap_from_devt(dev_t devt, u64 *idx_mask)
+{
+	for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+		if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) {
+			*idx_mask |= 1ULL << type;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Store in @ret_dev the device number (i_rdev) of the inode behind @fd.
+ *
+ * Returns 0 on success, -EBADF when @fd does not refer to an open file. The
+ * file reference is only held while the inode is read.
+ */
+static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev)
+{
+	struct file *filp = fget(fd);
+
+	if (!filp)
+		return -EBADF;
+
+	*ret_dev = file_inode(filp)->i_rdev;
+	fput(filp);
+
+	return 0;
+}
+
+/**
+ * ib_ucaps_init - Initialization required before ucap creation.
+ *
+ * Registers the ucaps class and allocates the character device region the
+ * first time it is called; later calls return immediately. ib_create_ucap()
+ * calls it with ucaps_mutex held.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+static int ib_ucaps_init(void)
+{
+	int err;
+
+	if (ucaps_class_is_registered)
+		return 0;
+
+	err = class_register(&ucaps_class);
+	if (err)
+		return err;
+
+	err = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX,
+				  ucaps_class.name);
+	if (err >= 0) {
+		ucaps_class_is_registered = true;
+		return 0;
+	}
+
+	/* Undo the class registration so a later call starts from scratch. */
+	class_unregister(&ucaps_class);
+	return err;
+}
+
+/* Release callback of a ucap's embedded device: free the enclosing ib_ucap. */
+static void ucap_dev_release(struct device *device)
+{
+	kfree(container_of(device, struct ib_ucap, dev));
+}
+
+/**
+ * ib_create_ucap - Add a ucap character device
+ * @type: UCAP type
+ *
+ * Creates a ucap character device in the /dev/infiniband directory. By default,
+ * the device has root-only read-write access.
+ *
+ * A driver may call this multiple times with the same UCAP type. A reference
+ * count tracks creations and deletions.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_create_ucap(enum rdma_user_cap type)
+{
+ struct ib_ucap *ucap;
+ int ret;
+
+ if (type >= RDMA_UCAP_MAX)
+ return -EINVAL;
+
+ mutex_lock(&ucaps_mutex);
+ /* Class and chrdev region are set up lazily, on the first creation. */
+ ret = ib_ucaps_init();
+ if (ret)
+ goto unlock;
+
+ /* Already created by an earlier call: just take another reference. */
+ ucap = ucaps_list[type];
+ if (ucap) {
+ kref_get(&ucap->ref);
+ mutex_unlock(&ucaps_mutex);
+ return 0;
+ }
+
+ ucap = kzalloc(sizeof(*ucap), GFP_KERNEL);
+ if (!ucap) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /*
+ * From device_initialize() on, failures go through put_device() at
+ * err_device instead of kfree(); the ucap is expected to be freed by
+ * the release callback installed below (ucap_dev_release()).
+ */
+ device_initialize(&ucap->dev);
+ ucap->dev.class = &ucaps_class;
+ /* The minor number is the ucap type. */
+ ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type);
+ ucap->dev.release = ucap_dev_release;
+ ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]);
+ if (ret)
+ goto err_device;
+
+ cdev_init(&ucap->cdev, &ucaps_cdev_fops);
+ ucap->cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&ucap->cdev, &ucap->dev);
+ if (ret)
+ goto err_device;
+
+ /* Publish only once the device has been added; still under the mutex. */
+ kref_init(&ucap->ref);
+ ucaps_list[type] = ucap;
+ mutex_unlock(&ucaps_mutex);
+
+ return 0;
+
+err_device:
+ put_device(&ucap->dev);
+unlock:
+ mutex_unlock(&ucaps_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_create_ucap);
+
+/*
+ * kref release callback: runs when the last ib_create_ucap() reference is
+ * dropped by ib_remove_ucap(), which holds ucaps_mutex at that point.
+ *
+ * Removes the ucap from ucaps_list[], deletes the character device and drops
+ * the device reference. If the ucap is unexpectedly not in the list, a
+ * warning is raised and the list is left alone, so ucaps_list[] is never
+ * written past its end; the device is still torn down.
+ */
+static void ib_release_ucap(struct kref *ref)
+{
+	struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref);
+	enum rdma_user_cap type;
+
+	for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) {
+		if (ucaps_list[type] == ucap)
+			break;
+	}
+
+	/* type == RDMA_UCAP_MAX means "not found"; it is not a valid index. */
+	if (!WARN_ON(type == RDMA_UCAP_MAX))
+		ucaps_list[type] = NULL;
+
+	cdev_device_del(&ucap->cdev, &ucap->dev);
+	put_device(&ucap->dev);
+}
+
+/**
+ * ib_remove_ucap - Remove a ucap character device
+ * @type: User cap type
+ *
+ * Removes the ucap character device according to type. The device is completely
+ * removed from the filesystem when its reference count reaches 0.
+ *
+ * An out-of-range @type, or a type with no ucap created, triggers a warning
+ * and is otherwise ignored.
+ */
+void ib_remove_ucap(enum rdma_user_cap type)
+{
+	struct ib_ucap *ucap;
+
+	/* Same range check as ib_create_ucap(); never index past ucaps_list[]. */
+	if (WARN_ON(type >= RDMA_UCAP_MAX))
+		return;
+
+	mutex_lock(&ucaps_mutex);
+	ucap = ucaps_list[type];
+	if (WARN_ON(!ucap))
+		goto end;
+
+	/* ib_release_ucap() runs under ucaps_mutex when the count hits zero. */
+	kref_put(&ucap->ref, ib_release_ucap);
+end:
+	mutex_unlock(&ucaps_mutex);
+}
+EXPORT_SYMBOL(ib_remove_ucap);
+
+/**
+ * ib_get_ucaps - Get bitmask of ucap types from file descriptors
+ * @fds: Array of file descriptors
+ * @fd_count: Number of file descriptors in the array
+ * @idx_mask: Bitmask to be updated based on the ucaps in the fd list
+ *
+ * Given an array of file descriptors, this function returns a bitmask of
+ * the ucaps where a bit is set if an FD for that ucap type was in the array.
+ *
+ * @idx_mask is cleared first. Processing stops at the first descriptor that
+ * is not open or does not refer to a ucap device; bits set for earlier
+ * descriptors remain in @idx_mask in that case.
+ *
+ * Return: 0 on success, or a negative errno value on failure
+ */
+int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask)
+{
+	dev_t devt;
+	int err = 0;
+
+	*idx_mask = 0;
+
+	mutex_lock(&ucaps_mutex);
+	for (int i = 0; i < fd_count; i++) {
+		err = get_devt_from_fd(fds[i], &devt);
+		if (!err)
+			err = get_ucap_from_devt(devt, idx_mask);
+		if (err)
+			break;
+	}
+	mutex_unlock(&ucaps_mutex);
+
+	return err;
+}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 5f5ad8faf86e..6e700b974033 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -69,7 +69,9 @@ static struct ctl_table ucma_ctl_table[] = {
.data = &max_backlog,
.maxlen = sizeof max_backlog,
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
};
@@ -1615,7 +1617,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
struct ucma_event *uevent, *tmp;
struct ucma_context *ctx;
LIST_HEAD(event_list);
- struct fd f;
struct ucma_file *cur_file;
int ret = 0;
@@ -1623,21 +1624,17 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
return -EFAULT;
/* Get current fd to protect against it being closed */
- f = fdget(cmd.fd);
- if (!f.file)
+ CLASS(fd, f)(cmd.fd);
+ if (fd_empty(f))
return -ENOENT;
- if (f.file->f_op != &ucma_fops) {
- ret = -EINVAL;
- goto file_put;
- }
- cur_file = f.file->private_data;
+ if (fd_file(f)->f_op != &ucma_fops)
+ return -EINVAL;
+ cur_file = fd_file(f)->private_data;
/* Validate current fd and prevent destruction of id. */
ctx = ucma_get_ctx(cur_file, cmd.id);
- if (IS_ERR(ctx)) {
- ret = PTR_ERR(ctx);
- goto file_put;
- }
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
rdma_lock_handler(ctx->cm_id);
/*
@@ -1678,8 +1675,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
err_unlock:
rdma_unlock_handler(ctx->cm_id);
ucma_put_ctx(ctx);
-file_put:
- fdput(f);
return ret;
}
@@ -1817,7 +1812,6 @@ static const struct file_operations ucma_fops = {
.release = ucma_close,
.write = ucma_write,
.poll = ucma_poll,
- .llseek = no_llseek,
};
static struct miscdevice ucma_misc = {
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 64d9c492de64..8d3dfef9ebaa 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header,
return len;
}
EXPORT_SYMBOL(ib_ud_header_pack);
-
-/**
- * ib_ud_header_unpack - Unpack UD header struct from wire format
- * @header:UD header struct
- * @buf:Buffer to pack into
- *
- * ib_ud_header_pack() unpacks the UD header structure @header from wire
- * format in the buffer @buf.
- */
-int ib_ud_header_unpack(void *buf,
- struct ib_ud_header *header)
-{
- ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
- buf, &header->lrh);
- buf += IB_LRH_BYTES;
-
- if (header->lrh.link_version != 0) {
- pr_warn("Invalid LRH.link_version %u\n",
- header->lrh.link_version);
- return -EINVAL;
- }
-
- switch (header->lrh.link_next_header) {
- case IB_LNH_IBA_LOCAL:
- header->grh_present = 0;
- break;
-
- case IB_LNH_IBA_GLOBAL:
- header->grh_present = 1;
- ib_unpack(grh_table, ARRAY_SIZE(grh_table),
- buf, &header->grh);
- buf += IB_GRH_BYTES;
-
- if (header->grh.ip_version != 6) {
- pr_warn("Invalid GRH.ip_version %u\n",
- header->grh.ip_version);
- return -EINVAL;
- }
- if (header->grh.next_header != 0x1b) {
- pr_warn("Invalid GRH.next_header 0x%02x\n",
- header->grh.next_header);
- return -EINVAL;
- }
- break;
-
- default:
- pr_warn("Invalid LRH.link_next_header %u\n",
- header->lrh.link_next_header);
- return -EINVAL;
- }
-
- ib_unpack(bth_table, ARRAY_SIZE(bth_table),
- buf, &header->bth);
- buf += IB_BTH_BYTES;
-
- switch (header->bth.opcode) {
- case IB_OPCODE_UD_SEND_ONLY:
- header->immediate_present = 0;
- break;
- case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
- header->immediate_present = 1;
- break;
- default:
- pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
- return -EINVAL;
- }
-
- if (header->bth.transport_header_version != 0) {
- pr_warn("Invalid BTH.transport_header_version %u\n",
- header->bth.transport_header_version);
- return -EINVAL;
- }
-
- ib_unpack(deth_table, ARRAY_SIZE(deth_table),
- buf, &header->deth);
- buf += IB_DETH_BYTES;
-
- if (header->immediate_present)
- memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
-
- return 0;
-}
-EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 07c571c7b699..c5b686394760 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -80,9 +80,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
unsigned long pgsz_bitmap,
unsigned long virt)
{
- struct scatterlist *sg;
+ unsigned long curr_len = 0;
+ dma_addr_t curr_base = ~0;
unsigned long va, pgoff;
+ struct scatterlist *sg;
dma_addr_t mask;
+ dma_addr_t end;
int i;
umem->iova = va = virt;
@@ -107,17 +110,30 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
pgoff = umem->address & ~PAGE_MASK;
for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
- /* Walk SGL and reduce max page size if VA/PA bits differ
- * for any address.
+ /* If the current entry is physically contiguous with the previous
+ * one, no need to take its start addresses into consideration.
*/
- mask |= (sg_dma_address(sg) + pgoff) ^ va;
+ if (check_add_overflow(curr_base, curr_len, &end) ||
+ end != sg_dma_address(sg)) {
+
+ curr_base = sg_dma_address(sg);
+ curr_len = 0;
+
+ /* Reduce max page size if VA/PA bits differ */
+ mask |= (curr_base + pgoff) ^ va;
+
+ /* The alignment of any VA matching a discontinuity point
+ * in the physical memory sets the maximum possible page
+ * size as this must be a starting point of a new page that
+ * needs to be aligned.
+ */
+ if (i != 0)
+ mask |= va;
+ }
+
+ curr_len += sg_dma_len(sg);
va += sg_dma_len(sg) - pgoff;
- /* Except for the last entry, the ending iova alignment sets
- * the maximum possible page size as the low bits of the iova
- * must be zero when starting the next chunk.
- */
- if (i != (umem->sgt_append.sgt.nents - 1))
- mask |= va;
+
pgoff = 0;
}
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
index 39357dc2d229..0ec2e4120cc9 100644
--- a/drivers/infiniband/core/umem_dmabuf.c
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -10,7 +10,7 @@
#include "uverbs.h"
-MODULE_IMPORT_NS(DMA_BUF);
+MODULE_IMPORT_NS("DMA_BUF");
int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
{
@@ -23,6 +23,9 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+ if (umem_dmabuf->revoked)
+ return -EINVAL;
+
if (umem_dmabuf->sgt)
goto wait_fence;
@@ -110,10 +113,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
}
EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
-struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
- unsigned long offset, size_t size,
- int fd, int access,
- const struct dma_buf_attach_ops *ops)
+static struct ib_umem_dmabuf *
+ib_umem_dmabuf_get_with_dma_device(struct ib_device *device,
+ struct device *dma_device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops)
{
struct dma_buf *dmabuf;
struct ib_umem_dmabuf *umem_dmabuf;
@@ -152,7 +157,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
umem_dmabuf->attach = dma_buf_dynamic_attach(
dmabuf,
- device->dma_device,
+ dma_device,
ops,
umem_dmabuf);
if (IS_ERR(umem_dmabuf->attach)) {
@@ -168,6 +173,15 @@ out_release_dmabuf:
dma_buf_put(dmabuf);
return ret;
}
+
+/*
+ * Convenience wrapper around ib_umem_dmabuf_get_with_dma_device() that uses
+ * the IB device's own dma_device for the dma-buf attachment. All other
+ * arguments are forwarded unchanged, and the callee's return value
+ * (including any ERR_PTR) is passed straight back.
+ */
+struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
+					  unsigned long offset, size_t size,
+					  int fd, int access,
+					  const struct dma_buf_attach_ops *ops)
+{
+	struct device *dma_device = device->dma_device;
+
+	return ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset,
+						   size, fd, access, ops);
+}
EXPORT_SYMBOL(ib_umem_dmabuf_get);
static void
@@ -184,16 +198,18 @@ static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
.move_notify = ib_umem_dmabuf_unsupported_move_notify,
};
-struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
- unsigned long offset,
- size_t size, int fd,
- int access)
+struct ib_umem_dmabuf *
+ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device,
+ struct device *dma_device,
+ unsigned long offset, size_t size,
+ int fd, int access)
{
struct ib_umem_dmabuf *umem_dmabuf;
int err;
- umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access,
- &ib_umem_dmabuf_attach_pinned_ops);
+ umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset,
+ size, fd, access,
+ &ib_umem_dmabuf_attach_pinned_ops);
if (IS_ERR(umem_dmabuf))
return umem_dmabuf;
@@ -217,17 +233,41 @@ err_release:
ib_umem_release(&umem_dmabuf->umem);
return ERR_PTR(err);
}
+EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device);
+
+/*
+ * Convenience wrapper around ib_umem_dmabuf_get_pinned_with_dma_device()
+ * that uses the IB device's own dma_device. All other arguments are
+ * forwarded unchanged, and the callee's return value (including any
+ * ERR_PTR) is passed straight back.
+ */
+struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
+						 unsigned long offset,
+						 size_t size, int fd,
+						 int access)
+{
+	struct device *dma_device = device->dma_device;
+
+	return ib_umem_dmabuf_get_pinned_with_dma_device(device, dma_device,
+							  offset, size, fd,
+							  access);
+}
EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned);
-void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+/*
+ * ib_umem_dmabuf_revoke - unmap and unpin a dma-buf umem and mark it revoked
+ *
+ * Runs under the dma-buf reservation lock. The first call unmaps the pages,
+ * unpins the attachment if it was pinned (clearing ->pinned) and sets
+ * ->revoked. Later calls see ->revoked and only take and drop the lock.
+ * Once ->revoked is set, ib_umem_dmabuf_map_pages() fails with -EINVAL.
+ */
+void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf)
{
struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
dma_resv_lock(dmabuf->resv, NULL);
+ /* Already revoked: nothing left to unmap or unpin. */
+ if (umem_dmabuf->revoked)
+ goto end;
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
- if (umem_dmabuf->pinned)
+ if (umem_dmabuf->pinned) {
dma_buf_unpin(umem_dmabuf->attach);
+ umem_dmabuf->pinned = 0;
+ }
+ umem_dmabuf->revoked = 1;
+end:
dma_resv_unlock(dmabuf->resv);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_revoke);
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
+
+ ib_umem_dmabuf_revoke(umem_dmabuf);
dma_buf_detach(dmabuf, umem_dmabuf->attach);
dma_buf_put(dmabuf);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d31c23..c752ae9fad6c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,65 +41,72 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
- const struct mmu_interval_notifier_ops *ops)
+/*
+ * Initialise an implicit ODP umem: flag it as implicit and as ODP, and set
+ * up umem_mutex. Unlike ib_init_umem_odp(), nothing is allocated and no MMU
+ * interval notifier is inserted here, so this cannot fail.
+ */
+static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
- int ret;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+}
+
+/*
+ * Initialise a non-implicit ODP umem.
+ *
+ * Computes the page_size-aligned [start, end) range covering the umem,
+ * allocates the PFN tracking state (a bare pfn_list for virt-DMA devices,
+ * a full hmm_dma_map otherwise) and inserts the MMU interval notifier
+ * described by @ops over that range.
+ *
+ * Returns 0 on success, -EOVERFLOW if the range wraps, -EINVAL if it holds
+ * no page_size entry, -ENOMEM or the hmm_dma_map_alloc() error if the
+ * allocation fails, or the mmu_interval_notifier_insert() error. The
+ * allocation is released again on notifier failure.
+ */
+static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t page_size = 1UL << umem_odp->page_shift;
+ struct hmm_dma_map *map;
+ unsigned long start;
+ unsigned long end;
+ size_t nr_entries;
+ int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
- if (!umem_odp->dma_list) {
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+
+ nr_entries = (end - start) >> PAGE_SHIFT;
+ if (!(nr_entries * PAGE_SIZE / page_size))
+ return -EINVAL;
+
+ map = &umem_odp->map;
+ if (ib_uses_virt_dma(dev)) {
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
ret = -ENOMEM;
- goto out_pfn_list;
- }
+ } else
+ ret = hmm_dma_map_alloc(dev->dma_device, map,
+ (end - start) >> PAGE_SHIFT,
+ 1 << umem_odp->page_shift);
+ if (ret)
+ return ret;
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_dma_list;
- }
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_map;
return 0;
-out_dma_list:
- kvfree(umem_odp->dma_list);
-out_pfn_list:
- kvfree(umem_odp->pfn_list);
+out_free_map:
+ /*
+ * pfn_list comes from kvcalloc() above and may be vmalloc-backed, so it
+ * must be released with kvfree(), not kfree().
+ */
+ if (ib_uses_virt_dma(dev))
+ kvfree(map->pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, map);
return ret;
}
@@ -118,7 +125,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
- int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@@ -130,16 +136,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
- umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- ret = ib_init_umem_odp(umem_odp, NULL);
- if (ret) {
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
- return ERR_PTR(ret);
- }
+ ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@@ -260,74 +260,41 @@ err_put_pid:
}
EXPORT_SYMBOL(ib_umem_odp_get);
-void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+/*
+ * Tear down the state set up by ib_init_umem_odp() for a non-implicit ODP
+ * umem: unmap the whole range under umem_mutex, remove the MMU interval
+ * notifier and free the PFN tracking state. The umem_odp structure itself
+ * is not freed here.
+ */
+static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- if (!umem_odp->is_implicit_odp) {
- mutex_lock(&umem_odp->umem_mutex);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- mutex_unlock(&umem_odp->umem_mutex);
- mmu_interval_notifier_remove(&umem_odp->notifier);
- kvfree(umem_odp->dma_list);
- kvfree(umem_odp->pfn_list);
- }
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ /*
+ * For virt-DMA devices pfn_list was allocated with kvcalloc() in
+ * ib_init_umem_odp() and may be vmalloc-backed, so it must be released
+ * with kvfree(), not kfree().
+ */
+ if (ib_uses_virt_dma(dev))
+ kvfree(umem_odp->map.pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
-EXPORT_SYMBOL(ib_umem_odp_release);
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+/*
+ * ib_umem_odp_release - release an ODP umem
+ *
+ * For a non-implicit umem, ib_umem_odp_free() first unmaps the range,
+ * removes the notifier and frees the PFN state. An implicit umem skips that
+ * step. In both cases the tgid reference is dropped and the structure freed.
+ */
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
+ if (!umem_odp->is_implicit_odp)
+ ib_umem_odp_free(umem_odp);
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- *dma_addr |= access_mask;
- return 0;
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -355,9 +322,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -383,11 +347,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
- range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@@ -415,22 +379,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+ continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -443,15 +402,6 @@ retry:
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -471,45 +421,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
- dma_addr_t dma;
- int idx;
- u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
- idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];
-
- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ /*
+ * NOTE(review): idx is in page_shift units, while map.pfn_list is
+ * sized in PAGE_SHIFT units in ib_init_umem_odp() and the removed
+ * code indexed pfn_list with a PAGE_SHIFT-based pfn_idx. Confirm
+ * these agree when page_shift > PAGE_SHIFT.
+ */
+ u64 offset = addr - ib_umem_start(umem_odp);
+ size_t idx = offset >> umem_odp->page_shift;
+ unsigned long pfn = umem_odp->map.pfn_list[idx];
+
+ /*
+ * When hmm_dma_unmap_pfn() returns false, skip the dirtying and the
+ * npages accounting and only clear the entry's flag bits.
+ */
+ if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
+ goto clear;
+
+ if (pfn & HMM_PFN_WRITE) {
+ struct page *page = hmm_pfn_to_page(pfn);
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->npages--;
+clear:
+ /* Drop all HMM_PFN_FLAGS bits from the stored entry. */
+ umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index f5feca7fa9b9..fd67fc9fe85a 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -63,6 +63,8 @@ MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
MODULE_LICENSE("Dual BSD/GPL");
+#define MAX_UMAD_RECV_LIST_SIZE 200000
+
enum {
IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS,
IB_UMAD_MAX_AGENTS = 32,
@@ -113,6 +115,7 @@ struct ib_umad_file {
struct mutex mutex;
struct ib_umad_port *port;
struct list_head recv_list;
+ atomic_t recv_list_size;
struct list_head send_list;
struct list_head port_list;
spinlock_t send_lock;
@@ -180,24 +183,28 @@ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
return file->agents_dead ? NULL : file->agent[id];
}
-static int queue_packet(struct ib_umad_file *file,
- struct ib_mad_agent *agent,
- struct ib_umad_packet *packet)
+static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet, bool is_recv_mad)
{
int ret = 1;
mutex_lock(&file->mutex);
+ if (is_recv_mad &&
+ atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE)
+ goto unlock;
+
for (packet->mad.hdr.id = 0;
packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
packet->mad.hdr.id++)
if (agent == __get_agent(file, packet->mad.hdr.id)) {
list_add_tail(&packet->list, &file->recv_list);
+ atomic_inc(&file->recv_list_size);
wake_up_interruptible(&file->recv_wait);
ret = 0;
break;
}
-
+unlock:
mutex_unlock(&file->mutex);
return ret;
@@ -224,7 +231,7 @@ static void send_handler(struct ib_mad_agent *agent,
if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
packet->length = IB_MGMT_MAD_HDR;
packet->mad.hdr.status = ETIMEDOUT;
- if (!queue_packet(file, agent, packet))
+ if (!queue_packet(file, agent, packet, false))
return;
}
kfree(packet);
@@ -284,7 +291,7 @@ static void recv_handler(struct ib_mad_agent *agent,
rdma_destroy_ah_attr(&ah_attr);
}
- if (queue_packet(file, agent, packet))
+ if (queue_packet(file, agent, packet, true))
goto err2;
return;
@@ -409,6 +416,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
list_del(&packet->list);
+ atomic_dec(&file->recv_list_size);
mutex_unlock(&file->mutex);
@@ -421,6 +429,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
/* Requeue packet */
mutex_lock(&file->mutex);
list_add(&packet->list, &file->recv_list);
+ atomic_inc(&file->recv_list_size);
mutex_unlock(&file->mutex);
} else {
if (packet->recv_wc)
@@ -1073,7 +1082,6 @@ static const struct file_operations umad_fops = {
#endif
.open = ib_umad_open,
.release = ib_umad_close,
- .llseek = no_llseek,
};
static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -1141,7 +1149,6 @@ static const struct file_operations umad_sm_fops = {
.owner = THIS_MODULE,
.open = ib_umad_sm_open,
.release = ib_umad_sm_close,
- .llseek = no_llseek,
};
static struct ib_umad_port *get_port(struct ib_device *ibdev,
@@ -1312,15 +1319,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
if (ret)
goto err_cdev;
- ib_umad_init_port_dev(&port->sm_dev, port, device);
- port->sm_dev.devt = base_issm;
- dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
- cdev_init(&port->sm_cdev, &umad_sm_fops);
- port->sm_cdev.owner = THIS_MODULE;
+ if (rdma_cap_ib_smi(device, port_num)) {
+ ib_umad_init_port_dev(&port->sm_dev, port, device);
+ port->sm_dev.devt = base_issm;
+ dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
- ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
- if (ret)
- goto err_dev;
+ ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
+ if (ret)
+ goto err_dev;
+ }
return 0;
@@ -1336,9 +1345,13 @@ err_cdev:
static void ib_umad_kill_port(struct ib_umad_port *port)
{
struct ib_umad_file *file;
+ bool has_smi = false;
int id;
- cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) {
+ cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ has_smi = true;
+ }
cdev_device_del(&port->cdev, &port->dev);
mutex_lock(&port->file_mutex);
@@ -1364,7 +1377,8 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
ida_free(&umad_ida, port->dev_num);
/* balances device_initialize() */
- put_device(&port->sm_dev);
+ if (has_smi)
+ put_device(&port->sm_dev);
put_device(&port->dev);
}
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 821d93c8f712..797e2fcc8072 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -133,35 +133,6 @@ struct ib_uverbs_completion_event_file {
struct ib_uverbs_event_queue ev_queue;
};
-struct ib_uverbs_file {
- struct kref ref;
- struct ib_uverbs_device *device;
- struct mutex ucontext_lock;
- /*
- * ucontext must be accessed via ib_uverbs_get_ucontext() or with
- * ucontext_lock held
- */
- struct ib_ucontext *ucontext;
- struct ib_uverbs_async_event_file *default_async_file;
- struct list_head list;
-
- /*
- * To access the uobjects list hw_destroy_rwsem must be held for write
- * OR hw_destroy_rwsem held for read AND uobjects_lock held.
- * hw_destroy_rwsem should be called across any destruction of the HW
- * object of an associated uobject.
- */
- struct rw_semaphore hw_destroy_rwsem;
- spinlock_t uobjects_lock;
- struct list_head uobjects;
-
- struct mutex umap_lock;
- struct list_head umaps;
- struct page *disassociate_page;
-
- struct xarray idr;
-};
-
struct ib_uverbs_event {
union {
struct ib_uverbs_async_event_desc async;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3d3ee3eca983..bc9fe3ceca4d 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -42,6 +42,7 @@
#include <rdma/uverbs_types.h>
#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_ucaps.h>
#include "rdma_core.h"
#include "uverbs.h"
@@ -161,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
{
const void __user *res = iter->cur;
- if (iter->cur + len > iter->end)
+ if (len > iter->end - iter->cur)
return (void __force __user *)ERR_PTR(-ENOSPC);
iter->cur += len;
return res;
@@ -192,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
- return (void *)uobj;
+ return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);
@@ -232,6 +233,8 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
{
struct ib_ucontext *ucontext = attrs->context;
struct ib_uverbs_file *file = attrs->ufile;
+ int *fd_array;
+ int fd_count;
int ret;
if (!down_read_trylock(&file->hw_destroy_rwsem))
@@ -247,6 +250,22 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
if (ret)
goto err;
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) {
+ fd_count = uverbs_attr_ptr_get_array_size(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ sizeof(int));
+ if (fd_count < 0) {
+ ret = fd_count;
+ goto err_uncharge;
+ }
+
+ fd_array = uverbs_attr_get_alloced_ptr(attrs,
+ UVERBS_ATTR_GET_CONTEXT_FD_ARR);
+ ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps);
+ if (ret)
+ goto err_uncharge;
+ }
+
ret = ucontext->device->ops.alloc_ucontext(ucontext,
&attrs->driver_udata);
if (ret)
@@ -572,7 +591,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
struct inode *inode = NULL;
int new_xrcd = 0;
struct ib_device *ib_dev;
- struct fd f = {};
+ struct fd f = EMPTY_FD;
int ret;
ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -584,12 +603,12 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
if (cmd.fd != -1) {
/* search for file descriptor */
f = fdget(cmd.fd);
- if (!f.file) {
+ if (fd_empty(f)) {
ret = -EBADF;
goto err_tree_mutex_unlock;
}
- inode = file_inode(f.file);
+ inode = file_inode(fd_file(f));
xrcd = find_xrcd(ibudev, inode);
if (!xrcd && !(cmd.oflags & O_CREAT)) {
/* no file descriptor. Need CREATE flag */
@@ -632,8 +651,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
atomic_inc(&xrcd->usecnt);
}
- if (f.file)
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
uobj_finalize_uobj_create(&obj->uobject, attrs);
@@ -648,8 +666,7 @@ err:
uobj_alloc_abort(&obj->uobject, attrs);
err_tree_mutex_unlock:
- if (f.file)
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
@@ -718,8 +735,8 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
goto err_free;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
@@ -809,8 +826,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
if (cmd.flags & IB_MR_REREG_PD) {
new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
attrs);
- if (!new_pd) {
- ret = -EINVAL;
+ if (IS_ERR(new_pd)) {
+ ret = PTR_ERR(new_pd);
goto put_uobjs;
}
} else {
@@ -919,8 +936,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
return PTR_ERR(uobj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_free;
}
@@ -1051,7 +1068,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs,
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ ret = ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
rdma_restrack_add(&cq->res);
@@ -1127,8 +1144,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
if (ret)
@@ -1189,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
/* we copy a struct ib_uverbs_poll_cq_resp to user space */
header_ptr = attrs->ucore.outbuf;
@@ -1238,8 +1255,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
return ret;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ib_req_notify_cq(cq, cmd.solicited_only ?
IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
@@ -1321,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
ind_tbl = uobj_get_obj_read(rwq_ind_table,
UVERBS_OBJECT_RWQ_IND_TBL,
cmd->rwq_ind_tbl_handle, attrs);
- if (!ind_tbl) {
- ret = -EINVAL;
+ if (IS_ERR(ind_tbl)) {
+ ret = PTR_ERR(ind_tbl);
goto err_put;
}
@@ -1360,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
if (cmd->is_srq) {
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
cmd->srq_handle, attrs);
- if (!srq || srq->srq_type == IB_SRQT_XRC) {
- ret = -EINVAL;
+ if (IS_ERR(srq) ||
+ srq->srq_type == IB_SRQT_XRC) {
+ ret = IS_ERR(srq) ? PTR_ERR(srq) :
+ -EINVAL;
goto err_put;
}
}
@@ -1371,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
rcq = uobj_get_obj_read(
cq, UVERBS_OBJECT_CQ,
cmd->recv_cq_handle, attrs);
- if (!rcq) {
- ret = -EINVAL;
+ if (IS_ERR(rcq)) {
+ ret = PTR_ERR(rcq);
goto err_put;
}
}
}
}
- if (has_sq)
+ if (has_sq) {
scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->send_cq_handle, attrs);
+ if (IS_ERR(scq)) {
+ ret = PTR_ERR(scq);
+ goto err_put;
+ }
+ }
+
if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI)
rcq = rcq ?: scq;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
attrs);
- if (!pd || (!scq && has_sq)) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put;
}
@@ -1482,18 +1507,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
err_put:
if (!IS_ERR(xrcd_uobj))
uobj_put_read(xrcd_uobj);
- if (pd)
+ if (!IS_ERR_OR_NULL(pd))
uobj_put_obj_read(pd);
- if (scq)
+ if (!IS_ERR_OR_NULL(scq))
rdma_lookup_put_uobject(&scq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (rcq && rcq != scq)
+ if (!IS_ERR_OR_NULL(rcq) && rcq != scq)
rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (srq)
+ if (!IS_ERR_OR_NULL(srq))
rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
UVERBS_LOOKUP_READ);
- if (ind_tbl)
+ if (!IS_ERR_OR_NULL(ind_tbl))
uobj_put_obj_read(ind_tbl);
uobj_alloc_abort(&obj->uevent.uobject, attrs);
@@ -1655,8 +1680,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -1761,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2010,11 +2035,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
if (ret)
return ret;
- wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+ wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size,
+ cmd.wr_count));
if (IS_ERR(wqes))
return PTR_ERR(wqes);
- sgls = uverbs_request_next_ptr(
- &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(&iter,
+ size_mul(cmd.sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return PTR_ERR(sgls);
ret = uverbs_request_finish(&iter);
@@ -2026,8 +2053,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
return -ENOMEM;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2064,9 +2091,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
user_wr->wr.ud.ah, attrs);
- if (!ud->ah) {
+ if (IS_ERR(ud->ah)) {
+ ret = PTR_ERR(ud->ah);
kfree(ud);
- ret = -EINVAL;
goto out_put;
}
ud->remote_qpn = user_wr->wr.ud.remote_qpn;
@@ -2200,11 +2227,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
if (wqe_size < sizeof(struct ib_uverbs_recv_wr))
return ERR_PTR(-EINVAL);
- wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+ wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count));
if (IS_ERR(wqes))
return ERR_CAST(wqes);
- sgls = uverbs_request_next_ptr(
- iter, sge_count * sizeof(struct ib_uverbs_sge));
+ sgls = uverbs_request_next_ptr(iter, size_mul(sge_count,
+ sizeof(struct ib_uverbs_sge)));
if (IS_ERR(sgls))
return ERR_CAST(sgls);
ret = uverbs_request_finish(iter);
@@ -2303,8 +2330,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- ret = -EINVAL;
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
goto out;
}
@@ -2354,8 +2381,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
return PTR_ERR(wr);
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq) {
- ret = -EINVAL;
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
goto out;
}
@@ -2411,8 +2438,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err;
}
@@ -2481,8 +2508,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
@@ -2531,8 +2558,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
return ret;
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp)
- return -EINVAL;
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
obj = qp->uobject;
mutex_lock(&obj->mcast_lock);
@@ -2666,8 +2693,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_FLOW_ACTION,
kern_spec->action.handle,
attrs);
- if (!ib_spec->action.act)
- return -EINVAL;
+ if (IS_ERR(ib_spec->action.act))
+ return PTR_ERR(ib_spec->action.act);
ib_spec->action.size =
sizeof(struct ib_flow_spec_action_handle);
flow_resources_add(uflow_res,
@@ -2684,8 +2711,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
UVERBS_OBJECT_COUNTERS,
kern_spec->flow_count.handle,
attrs);
- if (!ib_spec->flow_count.counters)
- return -EINVAL;
+ if (IS_ERR(ib_spec->flow_count.counters))
+ return PTR_ERR(ib_spec->flow_count.counters);
ib_spec->flow_count.size =
sizeof(struct ib_flow_spec_action_count);
flow_resources_add(uflow_res,
@@ -2903,14 +2930,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
return PTR_ERR(obj);
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
- if (!pd) {
- err = -EINVAL;
+ if (IS_ERR(pd)) {
+ err = PTR_ERR(pd);
goto err_uobj;
}
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq) {
- err = -EINVAL;
+ if (IS_ERR(cq)) {
+ err = PTR_ERR(cq);
goto err_put_pd;
}
@@ -3011,8 +3038,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
return -EINVAL;
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
- if (!wq)
- return -EINVAL;
+ if (IS_ERR(wq))
+ return PTR_ERR(wq);
if (cmd.attr_mask & IB_WQ_FLAGS) {
wq_attr.flags = cmd.flags;
@@ -3095,8 +3122,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
num_read_wqs++) {
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
wqs_handles[num_read_wqs], attrs);
- if (!wq) {
- err = -EINVAL;
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
goto put_wqs;
}
@@ -3251,8 +3278,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
}
qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
- if (!qp) {
- err = -EINVAL;
+ if (IS_ERR(qp)) {
+ err = PTR_ERR(qp);
goto err_uobj;
}
@@ -3398,15 +3425,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
if (ib_srq_has_cq(cmd->srq_type)) {
attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
cmd->cq_handle, attrs);
- if (!attr.ext.cq) {
- ret = -EINVAL;
+ if (IS_ERR(attr.ext.cq)) {
+ ret = PTR_ERR(attr.ext.cq);
goto err_put_xrcd;
}
}
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
- if (!pd) {
- ret = -EINVAL;
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
goto err_put_cq;
}
@@ -3513,8 +3540,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
attr.max_wr = cmd.max_wr;
attr.srq_limit = cmd.srq_limit;
@@ -3541,8 +3568,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
return ret;
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
- if (!srq)
- return -EINVAL;
+ if (IS_ERR(srq))
+ return PTR_ERR(srq);
ret = ib_query_srq(srq, &attr);
@@ -3667,8 +3694,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
return -EOPNOTSUPP;
cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
- if (!cq)
- return -EINVAL;
+ if (IS_ERR(cq))
+ return PTR_ERR(cq);
ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 495d5a5d0373..973fe2c7ef53 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -52,6 +52,7 @@
#include <rdma/ib.h>
#include <rdma/uverbs_std_types.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_ucaps.h>
#include "uverbs.h"
#include "core_priv.h"
@@ -76,6 +77,7 @@ static dev_t dynamic_uverbs_dev;
static DEFINE_IDA(uverbs_ida);
static int ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
+static struct ib_client uverbs_client;
static char *uverbs_devnode(const struct device *dev, umode_t *mode)
{
@@ -217,6 +219,7 @@ void ib_uverbs_release_file(struct kref *ref)
if (file->disassociate_page)
__free_pages(file->disassociate_page, 0);
+ mutex_destroy(&file->disassociation_lock);
mutex_destroy(&file->umap_lock);
mutex_destroy(&file->ucontext_lock);
kfree(file);
@@ -353,7 +356,6 @@ const struct file_operations uverbs_event_fops = {
.poll = ib_uverbs_comp_event_poll,
.release = uverbs_uobject_fd_release,
.fasync = ib_uverbs_comp_event_fasync,
- .llseek = no_llseek,
};
const struct file_operations uverbs_async_event_fops = {
@@ -362,7 +364,6 @@ const struct file_operations uverbs_async_event_fops = {
.poll = ib_uverbs_async_event_poll,
.release = uverbs_async_event_release,
.fasync = ib_uverbs_async_event_fasync,
- .llseek = no_llseek,
};
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
@@ -700,8 +701,13 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
ret = PTR_ERR(ucontext);
goto out;
}
+
+ mutex_lock(&file->disassociation_lock);
+
vma->vm_ops = &rdma_umap_ops;
ret = ucontext->device->ops.mmap(ucontext, vma);
+
+ mutex_unlock(&file->disassociation_lock);
out:
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
return ret;
@@ -723,6 +729,8 @@ static void rdma_umap_open(struct vm_area_struct *vma)
/* We are racing with disassociation */
if (!down_read_trylock(&ufile->hw_destroy_rwsem))
goto out_zap;
+ mutex_lock(&ufile->disassociation_lock);
+
/*
* Disassociation already completed, the VMA should already be zapped.
*/
@@ -734,10 +742,12 @@ static void rdma_umap_open(struct vm_area_struct *vma)
goto out_unlock;
rdma_umap_priv_init(priv, vma, opriv->entry);
+ mutex_unlock(&ufile->disassociation_lock);
up_read(&ufile->hw_destroy_rwsem);
return;
out_unlock:
+ mutex_unlock(&ufile->disassociation_lock);
up_read(&ufile->hw_destroy_rwsem);
out_zap:
/*
@@ -821,7 +831,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
{
struct rdma_umap_priv *priv, *next_priv;
- lockdep_assert_held(&ufile->hw_destroy_rwsem);
+ mutex_lock(&ufile->disassociation_lock);
while (1) {
struct mm_struct *mm = NULL;
@@ -847,8 +857,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
break;
}
mutex_unlock(&ufile->umap_lock);
- if (!mm)
+ if (!mm) {
+ mutex_unlock(&ufile->disassociation_lock);
return;
+ }
/*
* The umap_lock is nested under mmap_lock since it used within
@@ -878,7 +890,31 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
mmap_read_unlock(mm);
mmput(mm);
}
+
+ mutex_unlock(&ufile->disassociation_lock);
+}
+
+/**
+ * rdma_user_mmap_disassociate() - Revoke mmaps for a device
+ * @device: device to revoke
+ *
+ * This function should be called by drivers that need to disable mmaps for the
+ * device, for instance because it is going to be reset.
+ */
+void rdma_user_mmap_disassociate(struct ib_device *device)
+{
+ struct ib_uverbs_device *uverbs_dev =
+ ib_get_client_data(device, &uverbs_client); /* NOTE(review): dereferenced below without a NULL check - confirm uverbs client data is always set for @device */
+ struct ib_uverbs_file *ufile;
+
+ mutex_lock(&uverbs_dev->lists_mutex); /* held for the whole walk of uverbs_file_list */
+ list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) {
+ if (ufile->ucontext) /* files without a ucontext are skipped */
+ uverbs_user_mmap_disassociate(ufile); /* takes ufile->disassociation_lock itself */
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
}
+EXPORT_SYMBOL(rdma_user_mmap_disassociate);
/*
* ib_uverbs_open() does not need the BKL:
@@ -949,6 +985,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
mutex_init(&file->umap_lock);
INIT_LIST_HEAD(&file->umaps);
+ mutex_init(&file->disassociation_lock);
+
filp->private_data = file;
list_add_tail(&file->list, &dev->uverbs_file_list);
mutex_unlock(&dev->lists_mutex);
@@ -991,7 +1029,6 @@ static const struct file_operations uverbs_fops = {
.write = ib_uverbs_write,
.open = ib_uverbs_open,
.release = ib_uverbs_close,
- .llseek = no_llseek,
.unlocked_ioctl = ib_uverbs_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -1002,7 +1039,6 @@ static const struct file_operations uverbs_mmap_fops = {
.mmap = ib_uverbs_mmap,
.open = ib_uverbs_open,
.release = ib_uverbs_close,
- .llseek = no_llseek,
.unlocked_ioctl = ib_uverbs_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -1114,7 +1150,8 @@ static int ib_uverbs_add_one(struct ib_device *device)
struct ib_uverbs_device *uverbs_dev;
int ret;
- if (!device->ops.alloc_ucontext)
+ if (!device->ops.alloc_ucontext ||
+ device->type == RDMA_DEVICE_TYPE_SMI)
return -EOPNOTSUPP;
uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
@@ -1309,6 +1346,7 @@ static void __exit ib_uverbs_cleanup(void)
IB_UVERBS_NUM_FIXED_MINOR);
unregister_chrdev_region(dynamic_uverbs_dev,
IB_UVERBS_NUM_DYNAMIC_MINOR);
+ ib_cleanup_ucaps();
mmu_notifier_synchronize();
}
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 11a080646916..e803f609ec87 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
__ib_copy_path_rec_to_user(dst, src);
}
EXPORT_SYMBOL(ib_copy_path_rec_to_user);
-
-void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
- struct ib_user_path_rec *src)
-{
- u32 slid, dlid;
-
- memset(dst, 0, sizeof(*dst));
- if ((ib_is_opa_gid((union ib_gid *)src->sgid)) ||
- (ib_is_opa_gid((union ib_gid *)src->dgid))) {
- dst->rec_type = SA_PATH_REC_TYPE_OPA;
- slid = opa_get_lid_from_gid((union ib_gid *)src->sgid);
- dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid);
- } else {
- dst->rec_type = SA_PATH_REC_TYPE_IB;
- slid = ntohs(src->slid);
- dlid = ntohs(src->dlid);
- }
- memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
- memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
-
- sa_path_set_dlid(dst, dlid);
- sa_path_set_slid(dst, slid);
- sa_path_set_raw_traffic(dst, src->raw_traffic);
- dst->flow_label = src->flow_label;
- dst->hop_limit = src->hop_limit;
- dst->traffic_class = src->traffic_class;
- dst->reversible = src->reversible;
- dst->numb_path = src->numb_path;
- dst->pkey = src->pkey;
- dst->sl = src->sl;
- dst->mtu_selector = src->mtu_selector;
- dst->mtu = src->mtu;
- dst->rate_selector = src->rate_selector;
- dst->rate = src->rate;
- dst->packet_life_time = src->packet_life_time;
- dst->preference = src->preference;
- dst->packet_life_time_selector = src->packet_life_time_selector;
-
- /* TODO: No need to set this */
- sa_path_set_dmac_zero(dst);
-}
-EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 370ad7c83f88..432054f0a8a4 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -128,7 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ ret = ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index fb0555647336..c0fd283d9d6c 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -437,6 +437,10 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
UVERBS_ATTR_TYPE(u64), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR,
+ UVERBS_ATTR_MIN_SIZE(sizeof(int)),
+ UA_OPTIONAL,
+ UA_ALLOC_AND_COPY),
UVERBS_ATTR_UHW());
DECLARE_UVERBS_NAMED_METHOD(
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 03e1db5d1e8c..7ebc7bd3caae 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -239,7 +239,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
access_flags,
- &attrs->driver_udata);
+ attrs);
if (IS_ERR(mr))
return PTR_ERR(mr);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 94a7f3b0c71c..75fde0fe9989 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
- return (void *)slave;
+ return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);
@@ -1101,6 +1101,16 @@ EXPORT_SYMBOL(ib_destroy_srq_user);
/* Queue pairs */
+static void __ib_qp_event_handler(struct ib_event *event, void *context) /* installed as qp->event_handler by create_qp(); @context is not used here */
+{
+ struct ib_qp *qp = event->element.qp; /* the QP is taken from the event, not from @context */
+
+ if (event->event == IB_EVENT_QP_LAST_WQE_REACHED)
+ complete(&qp->srq_completion); /* wakes the waiter in __ib_drain_srq() */
+ if (qp->registered_event_handler) /* the handler is optional */
+ qp->registered_event_handler(event, qp->qp_context); /* forward to the handler passed in attr->event_handler */
+}
+
static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
{
struct ib_qp *qp = context;
@@ -1221,13 +1231,15 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd,
qp->qp_type = attr->qp_type;
qp->rwq_ind_tbl = attr->rwq_ind_tbl;
qp->srq = attr->srq;
- qp->event_handler = attr->event_handler;
+ qp->event_handler = __ib_qp_event_handler;
+ qp->registered_event_handler = attr->event_handler;
qp->port = attr->port_num;
qp->qp_context = attr->qp_context;
spin_lock_init(&qp->mr_lock);
INIT_LIST_HEAD(&qp->rdma_mrs);
INIT_LIST_HEAD(&qp->sig_mrs);
+ init_completion(&qp->srq_completion);
qp->send_cq = attr->send_cq;
qp->recv_cq = attr->recv_cq;
@@ -2093,7 +2105,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
if (!qp->uobject)
rdma_rw_cleanup_mrs(qp);
- rdma_counter_unbind_qp(qp, true);
+ rdma_counter_unbind_qp(qp, qp->port, true);
ret = qp->device->ops.destroy_qp(qp, udata);
if (ret) {
if (sec)
@@ -2884,6 +2896,72 @@ static void __ib_drain_rq(struct ib_qp *qp)
wait_for_completion(&rdrain.done);
}
+/*
+ * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout
+ * expires.
+ * @qp: queue pair associated with SRQ to drain
+ *
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the first option (polling the CQ); if the event times out, the CQ is not polled.
+ */
+static void __ib_drain_srq(struct ib_qp *qp)
+{
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_cq *cq;
+ int n, polled = 0;
+ int ret;
+
+ if (!qp->srq) { /* only meaningful for QPs attached to an SRQ */
+ WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp);
+ return;
+ }
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE); /* step 1: move the QP to the Error state */
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret);
+ return;
+ }
+
+ if (ib_srq_has_cq(qp->srq->srq_type)) { /* SRQ types that carry their own CQ */
+ cq = qp->srq->ext.cq;
+ } else if (qp->recv_cq) { /* otherwise use the QP's receive CQ */
+ cq = qp->recv_cq;
+ } else {
+ WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp);
+ return;
+ }
+
+ if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { /* step 2: wait up to 60 * HZ jiffies for Last WQE Reached */
+ while (polled != cq->cqe) { /* step 3: poll at most cq->cqe completions in total */
+ n = ib_process_cq_direct(cq, cq->cqe - polled);
+ if (!n) /* nothing was processed - presumably the CQ is empty */
+ return;
+ polled += n;
+ }
+ }
+}
+
/**
* ib_drain_sq() - Block until all SQ CQEs have been consumed by the
* application.
@@ -2962,6 +3040,8 @@ void ib_drain_qp(struct ib_qp *qp)
ib_drain_sq(qp);
if (!qp->srq)
ib_drain_rq(qp);
+ else
+ __ib_drain_srq(qp);
}
EXPORT_SYMBOL(ib_drain_qp);
@@ -3029,22 +3109,23 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
unsigned int block_offset;
- unsigned int sg_delta;
+ unsigned int delta;
if (!biter->__sg_nents || !biter->__sg)
return false;
biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
- sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
+ delta = BIT_ULL(biter->__pg_bit) - block_offset;
- if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
- biter->__sg_advance += sg_delta;
- } else {
+ while (biter->__sg_nents && biter->__sg &&
+ sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) {
+ delta -= sg_dma_len(biter->__sg) - biter->__sg_advance;
biter->__sg_advance = 0;
biter->__sg = sg_next(biter->__sg);
biter->__sg_nents--;
}
+ biter->__sg_advance += delta;
return true;
}