Diffstat (limited to 'drivers/infiniband/core')
21 files changed, 1024 insertions, 166 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d49ded7e95f0..f483e0c12444 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -33,6 +33,7 @@ ib_umad-y := user_mad.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 9979a351577f..81cf3c902e81 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -582,8 +582,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, out_unlock: mutex_unlock(&table->lock); if (ret) - pr_warn("%s: unable to add gid %pI6 error=%d\n", - __func__, gid->raw, ret); + pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); return ret; } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 8670e58675c6..92678e438ff4 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -161,6 +161,7 @@ struct cm_counter_attribute { struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; + struct ib_mad_agent *rep_agent; u32 port_num; atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; }; @@ -274,7 +275,8 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv) complete(&cm_id_priv->comp); } -static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +static struct ib_mad_send_buf * +cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; @@ -286,7 +288,8 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) return ERR_PTR(-EINVAL); read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); - mad_agent = cm_id_priv->av.port->mad_agent; + mad_agent = rep_agent ? 
cm_id_priv->av.port->rep_agent : + cm_id_priv->av.port->mad_agent; if (!mad_agent) { m = ERR_PTR(-EINVAL); goto out; @@ -315,6 +318,11 @@ out: return m; } +static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +{ + return cm_alloc_msg_agent(cm_id_priv, false); +} + static void cm_free_msg(struct ib_mad_send_buf *msg) { if (msg->ah) @@ -323,13 +331,14 @@ static void cm_free_msg(struct ib_mad_send_buf *msg) } static struct ib_mad_send_buf * -cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) +cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state, + bool rep_agent) { struct ib_mad_send_buf *msg; lockdep_assert_held(&cm_id_priv->lock); - msg = cm_alloc_msg(cm_id_priv); + msg = cm_alloc_msg_agent(cm_id_priv, rep_agent); if (IS_ERR(msg)) return msg; @@ -344,6 +353,12 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) return msg; } +static struct ib_mad_send_buf * +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) +{ + return cm_alloc_priv_msg_rep(cm_id_priv, state, false); +} + static void cm_free_priv_msg(struct ib_mad_send_buf *msg) { struct cm_id_private *cm_id_priv = msg->context[0]; @@ -2295,7 +2310,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, goto out; } - msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT); + msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out; @@ -4380,9 +4395,22 @@ static int cm_add_one(struct ib_device *ib_device) goto error2; } + port->rep_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + NULL, + 0, + cm_send_handler, + NULL, + port, + 0); + if (IS_ERR(port->rep_agent)) { + ret = PTR_ERR(port->rep_agent); + goto error3; + } + ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) - goto error3; + goto error4; count++; } @@ -4397,6 +4425,8 @@ static int cm_add_one(struct ib_device *ib_device) write_unlock_irqrestore(&cm.device_lock, flags); return 0; +error4: + ib_unregister_mad_agent(port->rep_agent); error3: ib_unregister_mad_agent(port->mad_agent); error2: @@ -4410,6 +4440,7 @@ error1: port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->rep_agent); ib_unregister_mad_agent(port->mad_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); @@ -4439,12 +4470,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) rdma_for_each_port (ib_device, i) { struct ib_mad_agent *mad_agent; + struct ib_mad_agent *rep_agent; if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; mad_agent = port->mad_agent; + rep_agent = port->rep_agent; ib_modify_port(ib_device, port->port_num, 0, &port_modify); /* * We flush the queue here after the going_down set, this @@ -4458,8 +4491,10 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) */ write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; + port->rep_agent = NULL; write_unlock(&cm_dev->mad_agent_lock); ib_unregister_mad_agent(mad_agent); + ib_unregister_mad_agent(rep_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); } diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index e6ec7b7a40af..c3aa6d7fc66b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -461,7 +461,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) return NULL; qp = container_of(res, struct 
ib_qp, res); - if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) goto err; return qp; diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a70876a0a231..584537c71545 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -317,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any); */ void ib_free_cq(struct ib_cq *cq) { - int ret; + int ret = 0; if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; if (WARN_ON_ONCE(cq->cqe_used)) return; + if (cq->device->ops.pre_destroy_cq) { + ret = cq->device->ops.pre_destroy_cq(cq); + WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail"); + } + switch (cq->poll_ctx) { case IB_POLL_DIRECT: break; @@ -340,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq) rdma_dim_destroy(cq); trace_cq_free(cq); - ret = cq->device->ops.destroy_cq(cq, NULL); + if (cq->device->ops.post_destroy_cq) + cq->device->ops.post_destroy_cq(cq); + else + ret = cq->device->ops.destroy_cq(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); kfree(cq->wc); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d4263385850a..3145cb34a1d2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) } EXPORT_SYMBOL(rdma_dev_access_netns); +/** + * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has + * CAP_NET_RAW capability or not. + * + * @dev: Pointer to rdma device whose capability to be checked + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. When rdma subsystem is in legacy shared network, + * namespace mode, the default net namespace is considered. + */ +bool rdma_dev_has_raw_cap(const struct ib_device *dev) +{ + const struct net *net; + + /* Network namespace is the resource whose user namespace + * to be considered. When in shared mode, there is no reliable + * network namespace resource, so consider the default net namespace. + */ + if (ib_devices_shared_netns) + net = &init_net; + else + net = read_pnet(&dev->coredev.rdma_net); + + return ns_capable(net->user_ns, CAP_NET_RAW); +} +EXPORT_SYMBOL(rdma_dev_has_raw_cap); + /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in @@ -557,6 +584,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev, /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate + * @net: network namespace device should be located in, namespace + * must stay valid until ib_register_device() is completed. * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, @@ -564,7 +593,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev, * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). 
*/ -struct ib_device *_ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size, struct net *net) { struct ib_device *device; unsigned int i; @@ -581,7 +610,15 @@ struct ib_device *_ib_alloc_device(size_t size) return NULL; } - rdma_init_coredev(&device->coredev, device, &init_net); + /* ib_devices_shared_netns can't change while we have active namespaces + * in the system which means either init_net is passed or the user has + * no idea what they are doing. + * + * To avoid breaking backward compatibility, when in shared mode, + * force to init the device in the init_net. + */ + net = ib_devices_shared_netns ? &init_net : net; + rdma_init_coredev(&device->coredev, device, net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); @@ -2671,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); + SET_DEVICE_OP(dev_ops, alloc_dmah); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); @@ -2691,6 +2729,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); + SET_DEVICE_OP(dev_ops, create_cq_umem); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); @@ -2698,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_dmah); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -2763,8 +2803,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, pre_destroy_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); + SET_DEVICE_OP(dev_ops, post_destroy_cq); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); @@ -2793,6 +2835,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_dmah); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 73f3a0b9a54b..8f26bfb69586 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -210,6 +210,29 @@ int ib_response_mad(const struct ib_mad_hdr *hdr) } EXPORT_SYMBOL(ib_response_mad); +#define SOL_FC_MAX_DEFAULT_FRAC 4 +#define SOL_FC_MAX_SA_FRAC 32 + +static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req) +{ + if (!mad_reg_req) + /* Send only agent */ + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + + switch (mad_reg_req->mgmt_class) { + case IB_MGMT_CLASS_CM: + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + case IB_MGMT_CLASS_SUBN_ADM: + return mad_recvq_size / SOL_FC_MAX_SA_FRAC; + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) / + 
SOL_FC_MAX_DEFAULT_FRAC; + default: + return 0; + } +} + /* * ib_register_mad_agent - Register to send/receive MADs * @@ -391,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); - INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_LIST_HEAD(&mad_agent_priv->backlog_list); INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); refcount_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); + mad_agent_priv->sol_fc_send_count = 0; + mad_agent_priv->sol_fc_wait_count = 0; + mad_agent_priv->sol_fc_max = + recv_handler ? get_sol_fc_max_outstanding(mad_reg_req) : 0; ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); if (ret2) { @@ -1055,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) return ret; } +static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) { + mad_agent_priv->sol_fc_wait_count--; + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } else { + expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } +} + +static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_INIT) { + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } else { + expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP, + IB_MAD_STATE_QUEUED); + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } + + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + mad_agent_priv->sol_fc_send_count++; + } +} + +static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START, + IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED); + if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->is_solicited_fc) { + mad_agent_priv->sol_fc_send_count--; + mad_agent_priv->sol_fc_wait_count++; + } + + list_del_init(&mad_send_wr->agent_list); + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, + &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry( + list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } else { + list_item = &mad_agent_priv->wait_list; + } + + list_add(&mad_send_wr->agent_list, list_item); +} + +static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc; +} + +static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + not_expect_mad_state(mad_send_wr, 
IB_MAD_STATE_DONE); + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } +} + +static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } + + list_del_init(&mad_send_wr->agent_list); +} + +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + switch (new_state) { + case IB_MAD_STATE_INIT: + break; + case IB_MAD_STATE_QUEUED: + handle_queued_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_SEND_START: + handle_send_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_WAIT_RESP: + handle_wait_state(mad_send_wr, mad_agent_priv); + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + return; + break; + case IB_MAD_STATE_EARLY_RESP: + handle_early_resp_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_CANCELED: + handle_canceled_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_DONE: + handle_done_state(mad_send_wr, mad_agent_priv); + break; + } + + mad_send_wr->state = new_state; +} + +static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + u8 mgmt_class; + + if (!mad_send_wr->timeout) + return 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (mad_send_wr->mad_agent_priv->agent.rmpp_version && + (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) + return 0; + + mgmt_class = + ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class; + return mgmt_class == IB_MGMT_CLASS_CM || + mgmt_class == IB_MGMT_CLASS_SUBN_ADM || + mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; +} + +static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max) + return false; + + if (!list_empty(&mad_agent_priv->backlog_list)) + return true; + + return mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count >= + mad_agent_priv->sol_fc_max; +} + /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client @@ -1080,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret) goto error; - if (!send_buf->mad_agent->send_handler || - (send_buf->timeout_ms && - !send_buf->mad_agent->recv_handler)) { + if (!send_buf->mad_agent->send_handler) { ret = -EINVAL; goto error; } @@ -1118,15 +1317,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; - /* Reference for work request to QP + response */ - mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); - mad_send_wr->status = IB_WC_SUCCESS; + change_mad_state(mad_send_wr, IB_MAD_STATE_INIT); /* Reference MAD agent until send completes */ refcount_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); - 
list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); + mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr); + if (mad_is_for_backlog(mad_send_wr)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; + } + + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { @@ -1138,7 +1341,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); deref_mad_agent(mad_agent_priv); goto error; @@ -1746,7 +1949,19 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, */ (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; + } + + list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) { + if ((wr->tid == mad_hdr->tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; } /* @@ -1765,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? 
wr : NULL; } return NULL; } +static void +process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc = {}; + unsigned long flags; + int ret; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->backlog_list) && + (mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count < + mad_agent_priv->sol_fc_max)) { + mad_send_wr = list_entry(mad_agent_priv->backlog_list.next, + struct ib_mad_send_wr_private, + agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + ret = ib_send_mad(mad_send_wr); + if (ret) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = IB_WC_LOC_QP_OP_ERR; + mad_agent_priv->agent.send_handler( + &mad_agent_priv->agent, &mad_send_wc); + } + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; - if (mad_send_wr->refcount == 1) - list_move_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->done_list); + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP || + mad_send_wr->state == IB_MAD_STATE_QUEUED) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + else + change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, @@ -1784,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; + bool is_mad_done; int ret; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); @@ -1832,6 +2086,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } else { ib_mark_mad_done(mad_send_wr); + is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ @@ -1841,10 +2096,13 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_recv_wc); deref_mad_agent(mad_agent_priv); - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + if (is_mad_done) { + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, + &mad_send_wc); + } } } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, @@ -2172,30 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *temp_mad_send_wr; - struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; - list_del(&mad_send_wr->agent_list); - delay = mad_send_wr->timeout; - mad_send_wr->timeout += jiffies; - - if (delay) { - list_for_each_prev(list_item, &mad_agent_priv->wait_list) { - temp_mad_send_wr = list_entry(list_item, - struct ib_mad_send_wr_private, - agent_list); - if 
(time_after(mad_send_wr->timeout, - temp_mad_send_wr->timeout)) - break; - } - } else { - list_item = &mad_agent_priv->wait_list; - } - - list_add(&mad_send_wr->agent_list, list_item); + change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) @@ -2229,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, } else ret = IB_RMPP_RESULT_UNHANDLED; - if (mad_send_wc->status != IB_WC_SUCCESS && - mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = mad_send_wc->status; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - - if (--mad_send_wr->refcount > 0) { - if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && - mad_send_wr->status == IB_WC_SUCCESS) { - wait_for_response(mad_send_wr); - } + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + mad_send_wc->status = IB_WC_WR_FLUSH_ERR; + else if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->timeout) { + wait_for_response(mad_send_wr); goto done; } /* Remove send from MAD agent and notify client of completion */ - list_del(&mad_send_wr->agent_list); + if (mad_send_wr->state != IB_MAD_STATE_DONE) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_send_wr->status != IB_WC_SUCCESS) - mad_send_wc->status = mad_send_wr->status; - if (ret == IB_RMPP_RESULT_INTERNAL) + if (ret == IB_RMPP_RESULT_INTERNAL) { ib_rmpp_send_handler(mad_send_wc); - else + } else { + if (mad_send_wr->is_solicited_fc) + process_backlog_mads(mad_agent_priv); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); + } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); @@ -2396,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, return true; } +static void clear_mad_error_list(struct list_head *list, + enum ib_wc_status wc_status, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr, *n; + struct ib_mad_send_wc mad_send_wc; + + mad_send_wc.status = wc_status; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, n, list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + deref_mad_agent(mad_agent_priv); + } +} + static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; - struct ib_mad_send_wc mad_send_wc; struct list_head cancel_list; INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &mad_agent_priv->send_list, agent_list) { - if (mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - } + &mad_agent_priv->send_list, agent_list) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); - /* Empty wait list to prevent receives from finding a request */ - list_splice_init(&mad_agent_priv->wait_list, &cancel_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - /* Report all cancelled requests */ - mad_send_wc.status = IB_WC_WR_FLUSH_ERR; - mad_send_wc.vendor_err = 0; + /* Empty wait & backlog list to prevent receives from finding request */ + list_for_each_entry_safe(mad_send_wr, 
temp_mad_send_wr, + &mad_agent_priv->wait_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); + } list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &cancel_list, agent_list) { - mad_send_wc.send_buf = &mad_send_wr->send_buf; - list_del(&mad_send_wr->agent_list); - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - deref_mad_agent(mad_agent_priv); + &mad_agent_priv->backlog_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + /* Report all cancelled requests */ + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } static struct ib_mad_send_wr_private* @@ -2451,6 +2699,13 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv, &mad_send_wr->send_buf == send_buf) return mad_send_wr; } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; } @@ -2468,16 +2723,16 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } - active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); - if (!timeout_ms) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } + active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) || + (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) || + (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms)); + if (!timeout_ms) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) @@ -2589,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + if (mad_send_wr->is_solicited_fc && + !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + return 0; + } if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); @@ -2606,26 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) } else ret = ib_send_mad(mad_send_wr); - if (!ret) { - mad_send_wr->refcount++; - list_add_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->send_list); - } + if (!ret) + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + return ret; } static void timeout_sends(struct work_struct *work) { - struct ib_mad_send_wr_private *mad_send_wr, *n; + struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wc mad_send_wc; - struct list_head local_list; + struct list_head timeout_list; + struct list_head cancel_list; + struct list_head *list_item; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); - mad_send_wc.vendor_err = 0; - INIT_LIST_HEAD(&local_list); + INIT_LIST_HEAD(&timeout_list); + INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while 
(!list_empty(&mad_agent_priv->wait_list)) { @@ -2643,25 +2902,22 @@ static void timeout_sends(struct work_struct *work) break; } - list_del_init(&mad_send_wr->agent_list); - if (mad_send_wr->status == IB_WC_SUCCESS && - !retry_send(mad_send_wr)) + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + list_item = &cancel_list; + else if (retry_send(mad_send_wr)) + list_item = &timeout_list; + else continue; - list_add_tail(&mad_send_wr->agent_list, &local_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, list_item); } - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) { - if (mad_send_wr->status == IB_WC_SUCCESS) - mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; - else - mad_send_wc.status = mad_send_wr->status; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - deref_mad_agent(mad_agent_priv); - } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + process_backlog_mads(mad_agent_priv); + clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR, + mad_agent_priv); + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } /* diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 1b7445a6f671..f444357d33f4 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -95,13 +95,16 @@ struct ib_mad_agent_private { spinlock_t lock; struct list_head send_list; + unsigned int sol_fc_send_count; struct list_head wait_list; - struct list_head done_list; + unsigned int sol_fc_wait_count; struct delayed_work timed_work; unsigned long timeout; struct list_head local_list; struct work_struct local_work; struct list_head rmpp_list; + unsigned int sol_fc_max; + struct list_head backlog_list; refcount_t refcount; union { @@ -118,6 +121,32 @@ struct ib_mad_snoop_private { struct completion comp; }; +enum ib_mad_state { + /* MAD is in the making and is not yet in any list */ + IB_MAD_STATE_INIT, + /* MAD is in backlog list */ + IB_MAD_STATE_QUEUED, + /* + * MAD was sent to the QP and is waiting for completion + * notification in send list. + */ + IB_MAD_STATE_SEND_START, + /* + * MAD send completed successfully, waiting for a response + * in wait list. + */ + IB_MAD_STATE_WAIT_RESP, + /* + * Response came early, before send completion notification, + * in send list. 
+ */ + IB_MAD_STATE_EARLY_RESP, + /* MAD was canceled while in wait or send list */ + IB_MAD_STATE_CANCELED, + /* MAD processing completed, MAD in no list */ + IB_MAD_STATE_DONE +}; + struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; @@ -132,8 +161,6 @@ struct ib_mad_send_wr_private { int max_retries; int retries_left; int retry; - int refcount; - enum ib_wc_status status; /* RMPP control */ struct list_head rmpp_list; @@ -143,8 +170,48 @@ struct ib_mad_send_wr_private { int seg_num; int newwin; int pad; + + enum ib_mad_state state; + + /* Solicited MAD flow control */ + bool is_solicited_fc; }; +static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state); +} + +static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2); +} + +static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2, + enum ib_mad_state expected_state3) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2 && + mad_send_wr->state != expected_state3); +} + +static inline void +not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state wrong_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state == wrong_state); +} + struct ib_mad_local_private { struct list_head completion_list; struct ib_mad_private *mad_priv; @@ -222,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, unsigned long timeout_ms); +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state); + #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index b4b10e8a6495..1c5e0eaf1c94 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -608,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent, goto out; /* Unmatched send */ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_REM_ABORT_ERR; - wc.vendor_err = rmpp_status; - wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &wc); return; out: spin_unlock_irqrestore(&agent->lock, flags); @@ -684,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) 
goto out; /* Send is already done */ if (seg_num > mad_send_wr->send_buf.seg_count || @@ -709,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_send_wc wc; ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; - wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &wc); return; } - if (mad_send_wr->refcount == 1) + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) ib_reset_mad_timeout(mad_send_wr, mad_send_wr->send_buf.timeout_ms); spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; - } else if (mad_send_wr->refcount == 1 && + } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP && mad_send_wr->seg_num < mad_send_wr->newwin && mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ @@ -731,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, if (ret) goto out; - mad_send_wr->refcount++; + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } @@ -890,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->newwin = init_newwin(mad_send_wr); /* We need to wait for the final ACK even if there isn't a response */ - mad_send_wr->refcount += (mad_send_wr->timeout == 0); ret = send_next_seg(mad_send_wr); if (!ret) return IB_RMPP_RESULT_CONSUMED; @@ -912,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ if (mad_send_wc->status != IB_WC_SUCCESS || - mad_send_wr->status != IB_WC_SUCCESS) + mad_send_wr->state == IB_MAD_STATE_CANCELED) return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ if (!mad_send_wr->timeout) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a872643e8039..2220a2dfab24 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -255,7 +255,7 @@ EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); bool rdma_nl_get_privileged_qkey(void) { - return privileged_qkey || capable(CAP_NET_RAW); + return privileged_qkey; } EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); @@ -1469,10 +1469,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { }; -static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, - enum rdma_restrack_type res_type, - res_fill_func_t fill_func) +static noinline_for_stack int +res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; @@ -2263,10 +2264,10 @@ err: return ret; } -static int stat_get_doit_default_counter(struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, - struct nlattr *tb[]) +static noinline_for_stack int +stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) { struct rdma_hw_stats *stats; struct nlattr 
*table_attr; @@ -2356,8 +2357,9 @@ err: return ret; } -static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, struct nlattr *tb[]) +static noinline_for_stack int +stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) { static enum rdma_nl_counter_mode mode; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 90c177edf9b0..18918f463361 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj, WARN_ON(true); } } + +/** + * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the + * uverbs attributes file has CAP_NET_RAW + * capability or not. + * + * @attrs: Pointer to uverbs attributes + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. + */ +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; + bool has_cap = false; + int srcu_key; + + srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu); + ucontext = ib_uverbs_get_ucontext_file(ufile); + if (IS_ERR(ucontext)) + goto out; + has_cap = rdma_dev_has_raw_cap(ucontext->device); + +out: + srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key); + return has_cap; +} +EXPORT_SYMBOL(rdma_uattrs_has_raw_cap); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 33706dad6c0f..a59b087611cb 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3313410014cd..a7de6f403fca 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct rdma_counter, res)->device; case RDMA_RESTRACK_SRQ: return container_of(res, struct ib_srq, res)->device; + case RDMA_RESTRACK_DMAH: + return container_of(res, struct ib_dmah, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c752ae9fad6c..b1c44ec1a3f3 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -76,6 +76,17 @@ static int ib_init_umem_odp(struct ib_umem_odp *umem_odp, end = ALIGN(end, page_size); if (unlikely(end < page_size)) return -EOVERFLOW; + /* + * The mmu notifier can be called within reclaim contexts and takes the + * umem_mutex. This is rare to trigger in testing, teach lockdep about + * it. 
+ */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + fs_reclaim_acquire(GFP_KERNEL); + mutex_lock(&umem_odp->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); + fs_reclaim_release(GFP_KERNEL); + } nr_entries = (end - start) >> PAGE_SHIFT; if (!(nr_entries * PAGE_SIZE / page_size)) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bc9fe3ceca4d..ce16404cdfb8 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -741,7 +741,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) } mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, + cmd.access_flags, NULL, &attrs->driver_udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); @@ -1312,9 +1312,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs, switch (cmd->qp_type) { case IB_QPT_RAW_PACKET: - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; - break; + fallthrough; case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1451,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, } if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { - if (!capable(CAP_NET_RAW)) { + if (!rdma_uattrs_has_raw_cap(attrs)) { ret = -EPERM; goto err_put; } @@ -1877,7 +1877,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mig_state = cmd->base.path_mig_state; if (cmd->base.attr_mask & IB_QP_QKEY) { if (cmd->base.qkey & IB_QP_SET_QKEY && - !rdma_nl_get_privileged_qkey()) { + !(rdma_nl_get_privileged_qkey() || + rdma_uattrs_has_raw_cap(attrs))) { ret = -EPERM; goto release_qp; } @@ -3225,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) if (cmd.comp_mask) return -EINVAL; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 432054f0a8a4..37cd37556510 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( struct ib_ucq_object *obj = container_of( uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), typeof(*obj), uevent.uobject); + struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_device *ib_dev = attrs->context->device; - int ret; - u64 user_handle; + struct ib_umem_dmabuf *umem_dmabuf; struct ib_cq_init_attr attr = {}; - struct ib_cq *cq; - struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_uobject *ev_file_uobj; + struct ib_umem *umem = NULL; + u64 buffer_length; + u64 buffer_offset; + struct ib_cq *cq; + u64 user_handle; + u64 buffer_va; + int buffer_fd; + int ret; - if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq) + if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->uevent.event_list); + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) { + + ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) || 
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + ret = PTR_ERR(umem); + goto err_event_file; + } + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) { + + ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length, + buffer_fd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem_dmabuf)) { + ret = PTR_ERR(umem_dmabuf); + goto err_event_file; + } + umem = &umem_dmabuf->umem; + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) || + !ib_dev->ops.create_cq) { + ret = -EINVAL; + goto err_event_file; + } + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); if (!cq) { ret = -ENOMEM; + ib_umem_release(umem); goto err_event_file; } @@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, attrs); + ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) : + ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; @@ -180,6 +244,17 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_ASYNC_EVENT, UVERBS_ACCESS_READ, UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c new file mode 100644 index 000000000000..453ce656c6f2 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmah.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include <rdma/uverbs_std_types.h> +#include "restrack.h" + +static int uverbs_free_dmah(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dmah *dmah = uobject->object; + int ret; + + if (atomic_read(&dmah->usecnt)) + return -EBUSY; + + ret = dmah->device->ops.dealloc_dmah(dmah, attrs); + if (ret) + return ret; + + rdma_restrack_del(&dmah->res); + kfree(dmah); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = attrs->context->device; + struct ib_dmah *dmah; + int ret; + + dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah); + if (!dmah) + return -ENOMEM; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) { + ret = uverbs_copy_from(&dmah->cpu_id, attrs, + UVERBS_ATTR_ALLOC_DMAH_CPU_ID); + if (ret) + goto err; + + if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) { + ret = -EPERM; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) { + dmah->mem_type = uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE); + dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) { + ret = uverbs_copy_from(&dmah->ph, attrs, + UVERBS_ATTR_ALLOC_DMAH_PH); + if (ret) + goto err; + + /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */ + if (dmah->ph & 0xFC) { + ret = -EINVAL; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS); + } + + dmah->device = ib_dev; + dmah->uobject = uobj; + atomic_set(&dmah->usecnt, 0); + + rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH); + rdma_restrack_set_name(&dmah->res, NULL); + + ret = ib_dev->ops.alloc_dmah(dmah, attrs); + if (ret) { + rdma_restrack_put(&dmah->res); + goto err; + } + + uobj->object = dmah; + rdma_restrack_add(&dmah->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE); + return 0; +err: + kfree(dmah); + return ret; +} + +static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = { + [TPH_MEM_TYPE_VM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [TPH_MEM_TYPE_PM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMAH_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE, + uverbs_dmah_mem_type, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH, + UVERBS_ATTR_TYPE(u8), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DMAH_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE)); + +const struct uapi_definition uverbs_def_obj_dmah[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah), + UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)), + {} +}; diff --git 
a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 7ebc7bd3caae..570b9656801d 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, - access_flags, + access_flags, NULL, attrs); if (IS_ERR(mr)) return PTR_ERR(mr); @@ -266,6 +266,135 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE); + u32 valid_access_flags = IB_ACCESS_SUPPORTED; + u64 length, iova, fd_offset = 0, addr = 0; + struct ib_device *ib_dev = pd->device; + struct ib_dmah *dmah = NULL; + bool has_fd_offset = false; + bool has_addr = false; + bool has_fd = false; + u32 access_flags; + struct ib_mr *mr; + int fd; + int ret; + + ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA); + if (ret) + return ret; + + ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) { + ret = uverbs_copy_from(&addr, attrs, + UVERBS_ATTR_REG_MR_ADDR); + if (ret) + return ret; + has_addr = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) { + ret = uverbs_copy_from(&fd_offset, attrs, + UVERBS_ATTR_REG_MR_FD_OFFSET); + if (ret) + return ret; + has_fd_offset = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) { + ret = uverbs_get_raw_fd(&fd, attrs, + UVERBS_ATTR_REG_MR_FD); + if (ret) + return ret; + has_fd = true; + } + + if (has_fd) { + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + /* FD requires offset and can't come with addr */ + if (!has_fd_offset || has_addr) + return -EINVAL; + + if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + valid_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING; + } else { + if (!has_addr || has_fd_offset) + return -EINVAL; + + if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) { + dmah = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_REG_MR_DMA_HANDLE); + if (IS_ERR(dmah)) + return PTR_ERR(dmah); + } + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + valid_access_flags); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + if (has_fd) + mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, + iova, fd, access_flags, + dmah, attrs); + else + mr = pd->device->ops.reg_user_mr(pd, addr, length, iova, + access_flags, dmah, NULL); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + if (dmah) { + mr->dmah = dmah; + atomic_inc(&dmah->usecnt); + } + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY, + 
&mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_ADVISE_MR, UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, @@ -362,6 +491,44 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + enum ib_access_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MR_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, @@ -376,7 +543,8 @@ DECLARE_UVERBS_NAMED_OBJECT( &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), - &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR)); + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index 7b4773fa4bc0..be0730e8509e 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -133,7 +133,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( device = xrcd->device; break; case IB_UVERBS_QPT_RAW_PACKET: - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; fallthrough; case IB_UVERBS_QPT_RC: diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index a02916a3a79c..e00ea63175bd 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 75fde0fe9989..3a5f81402d2f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2223,7 +2223,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, - access_flags, NULL); + access_flags, NULL, NULL); if (IS_ERR(mr)) return mr; @@ -2262,6 +2262,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd 
*pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_dmah *dmah = mr->dmah; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; @@ -2272,6 +2273,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + if (dmah) + atomic_dec(&dmah->usecnt); kfree(sig_attrs); } |
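
The counters.c, uverbs_cmd.c and uverbs_std_types_qp.c hunks above replace plain capable(CAP_NET_RAW) tests with the new namespace-aware helpers rdma_dev_has_raw_cap() and rdma_uattrs_has_raw_cap(). Below is a minimal sketch of a driver-side check built on them; mydrv_check_raw_allowed() is hypothetical and the declaration headers are assumed.

/*
 * Hedged sketch: gate a raw-packet resource on CAP_NET_RAW held in the user
 * namespace that owns the device's net namespace, not in the init namespace.
 */
#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>

static int mydrv_check_raw_allowed(struct ib_device *ibdev,
				   struct uverbs_attr_bundle *attrs)
{
	/* uverbs paths resolve through the attribute bundle (and its
	 * ucontext); kernel/netlink paths check the device directly. */
	if (attrs)
		return rdma_uattrs_has_raw_cap(attrs) ? 0 : -EPERM;

	return rdma_dev_has_raw_cap(ibdev) ? 0 : -EPERM;
}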
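
The cq.c hunk splits kernel-CQ teardown in two: an optional pre_destroy_cq() runs before the core drains its IRQ/workqueue poll context, and an optional post_destroy_cq() then replaces destroy_cq(). A sketch of a driver wiring these ops follows; the return types are inferred from how ib_free_cq() calls them, and the mydrv_* names are hypothetical.

static int mydrv_pre_destroy_cq(struct ib_cq *ibcq)
{
	/* Disarm the HW CQ here so no further completions are generated
	 * while the core flushes the IB_POLL_SOFTIRQ/WORKQUEUE context. */
	return 0;
}

static void mydrv_post_destroy_cq(struct ib_cq *ibcq)
{
	/* Polling has stopped; release HW and driver CQ resources here. */
}

static const struct ib_device_ops mydrv_dev_ops = {
	.pre_destroy_cq = mydrv_pre_destroy_cq,
	.post_destroy_cq = mydrv_post_destroy_cq,
	/* with post_destroy_cq set, ib_free_cq() skips destroy_cq() */
};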
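
The mad.c and mad_priv.h hunks replace the old refcount/status bookkeeping on ib_mad_send_wr_private with an explicit state machine plus per-agent flow control for solicited MADs. The transitions, as read from change_mad_state() and its handlers above (a reading aid, not an authoritative spec):

/*
 * INIT          -> SEND_START   posted and handed to the QP
 * INIT          -> QUEUED       solicited-FC limit reached, backlogged
 * QUEUED        -> SEND_START   backlog drained (process_backlog_mads)
 * SEND_START    -> WAIT_RESP    send completed, waiting for a response
 * SEND_START    -> EARLY_RESP   response arrived before send completion
 * WAIT_RESP     -> SEND_START   retry (timeout_sends/retry_send)
 * WAIT_RESP     -> QUEUED       retry while the backlog is non-empty
 * any but DONE  -> CANCELED     ib_modify_mad(.., 0) or cancel_mads()
 * any state     -> DONE         completed and removed from all lists
 *
 * sol_fc_send_count/sol_fc_wait_count count solicited-FC MADs in
 * SEND_START/WAIT_RESP; a new solicited send is QUEUED once their sum
 * reaches sol_fc_max, a fraction of the receive queue size chosen per
 * management class in get_sol_fc_max_outstanding().
 */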
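
The umem_odp.c hunk teaches lockdep that umem_mutex may be taken from reclaim context by acquiring it once under a simulated reclaim. The same idiom in isolation, as a generic sketch (my_mutex is hypothetical):

#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>

static DEFINE_MUTEX(my_mutex);

/* Record the reclaim -> my_mutex dependency for lockdep even if it never
 * triggers naturally in testing; compiles away without CONFIG_LOCKDEP. */
static void prime_reclaim_dep(void)
{
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);
		mutex_lock(&my_mutex);
		mutex_unlock(&my_mutex);
		fs_reclaim_release(GFP_KERNEL);
	}
}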
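
The uverbs_std_types_cq.c hunk lets userspace back a CQ either with a VA/length pair (pinned through ib_umem_get()) or with a dmabuf FD/offset (through ib_umem_dmabuf_get_pinned()), handing the resulting umem to a new create_cq_umem() device op. A skeleton of the driver side; the prototype is inferred from the call site above and the mydrv naming is hypothetical.

static int mydrv_create_cq_umem(struct ib_cq *ibcq,
				const struct ib_cq_init_attr *attr,
				struct ib_umem *umem,
				struct uverbs_attr_bundle *attrs)
{
	/* The core has already pinned the buffer with IB_ACCESS_LOCAL_WRITE;
	 * map umem's SG list into the HW CQ context instead of allocating
	 * kernel CQ memory, sizing it for attr->cqe entries. */
	if (!ib_umem_num_pages(umem))
		return -EINVAL;

	/* ... program the HW with the umem-backed queue ... */
	return 0;
}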
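
uverbs_std_types_dmah.c adds a DMA handle (DMAH) object carrying an optional CPU id, TPH memory type and processing hints (PH); the core validates that the CPU is in the caller's cpuset and that only the two PCIe-defined PH bits are set before calling into the driver. A hedged skeleton of the driver ops; the prototypes are inferred from the call sites above and the mydrv naming is hypothetical.

static int mydrv_alloc_dmah(struct ib_dmah *dmah,
			    struct uverbs_attr_bundle *attrs)
{
	/* Only fields flagged in valid_fields were supplied by userspace. */
	if (dmah->valid_fields & BIT(IB_DMAH_PH_EXISTS)) {
		/* e.g. derive a PCIe TPH steering tag from dmah->ph,
		 * dmah->cpu_id and dmah->mem_type (driver specific). */
	}
	return 0;
}

static int mydrv_dealloc_dmah(struct ib_dmah *dmah,
			      struct uverbs_attr_bundle *attrs)
{
	/* A non-zero return keeps the handle alive; on 0 the core does
	 * rdma_restrack_del() and frees the ib_dmah. */
	return 0;
}

A driver also advertises the object size (SET_OBJ_SIZE(dev_ops, ib_dmah) above) so rdma_zalloc_drv_obj() can allocate its wrapper structure.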
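
Finally, the MR registration paths grow a DMA-handle argument: reg_user_mr() and reg_user_mr_dmabuf() now take a struct ib_dmah * just before their final argument (existing callers pass NULL), and the new UVERBS_METHOD_REG_MR forwards the optional UVERBS_ATTR_REG_MR_DMA_HANDLE object, bumping dmah->usecnt for the MR's lifetime. A sketch of the updated driver prototype; the parameter order is inferred from the call sites above and the mydrv naming is hypothetical.

static struct ib_mr *mydrv_reg_user_mr(struct ib_pd *pd, u64 start,
				       u64 length, u64 iova,
				       int access_flags,
				       struct ib_dmah *dmah,
				       struct ib_udata *udata)
{
	/* ... pin and map [start, start + length) as before ... */

	if (dmah) {
		/* Optionally program per-MR TPH hints from the handle;
		 * ib_dereg_mr_user() drops the usecnt taken by the core. */
	}

	return ERR_PTR(-EOPNOTSUPP);	/* placeholder for the real MR setup */
}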