Diffstat (limited to 'drivers/infiniband'): 285 files changed, 14799 insertions, 5527 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 8ab4eea5a0a5..d49ded7e95f0 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -39,6 +39,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_async_fd.o \ uverbs_std_types_srq.o \ uverbs_std_types_wq.o \ - uverbs_std_types_qp.o + uverbs_std_types_qp.o \ + ucaps.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f253295795f0..be0743dac3ff 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -348,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst, static bool has_gateway(const struct dst_entry *dst, sa_family_t family) { - struct rtable *rt; - struct rt6_info *rt6; - - if (family == AF_INET) { - rt = container_of(dst, struct rtable, dst); - return rt->rt_uses_gateway; - } + if (family == AF_INET) + return dst_rtable(dst)->rt_uses_gateway; - rt6 = container_of(dst, struct rt6_info, dst); - return rt6->rt6i_flags & RTF_GATEWAY; + return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY; } static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index f82b4260de42..3bb46696731e 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -59,7 +59,16 @@ __ib_get_agent_port(const struct ib_device *device, int port_num) struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { - if (entry->agent[1]->device == device && + /* Need to check both agent[0] and agent[1], as an agent port + * may only have one of them + */ + if (entry->agent[0] && + entry->agent[0]->device == device && + entry->agent[0]->port_num == port_num) + return entry; + + if (entry->agent[1] && + entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } @@ -172,14 +181,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num) } } - /* Obtain send only MAD agent for GSI QP */ - port_priv->agent[1] = ib_register_mad_agent(device, port_num, - IB_QPT_GSI, NULL, 0, - &agent_send_handler, - NULL, NULL, 0); - if (IS_ERR(port_priv->agent[1])) { - ret = PTR_ERR(port_priv->agent[1]); - goto error3; + if (rdma_cap_ib_cm(device, port_num)) { + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } } spin_lock_irqsave(&ib_agent_port_list_lock, flags); @@ -212,7 +223,8 @@ int ib_agent_port_close(struct ib_device *device, int port_num) list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[1]) + ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index c02a96d3572a..9979a351577f 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -794,7 +794,6 @@ err_free_table: static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { - bool leak = false; int i; if (!table) @@ -803,15 
+802,12 @@ static void release_gid_table(struct ib_device *device, for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; - if (kref_read(&table->data_vec[i]->kref) > 1) { - dev_err(&device->dev, - "GID entry ref leak for index %d ref=%u\n", i, - kref_read(&table->data_vec[i]->kref)); - leak = true; - } + + WARN_ONCE(true, + "GID entry ref leak for dev %s index %d ref=%u\n", + dev_name(&device->dev), i, + kref_read(&table->data_vec[i]->kref)); } - if (leak) - return; mutex_destroy(&table->lock); kfree(table->data_vec); @@ -1131,41 +1127,6 @@ err: } EXPORT_SYMBOL(ib_find_cached_pkey); -int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, - u16 pkey, u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; - - read_lock_irqsave(&device->cache_lock, flags); - - cache = device->port_data[port_num].cache.pkey; - if (!cache) { - ret = -EINVAL; - goto err; - } - - *index = -1; - - for (i = 0; i < cache->table_len; ++i) - if (cache->table[i] == pkey) { - *index = i; - ret = 0; - break; - } - -err: - read_unlock_irqrestore(&device->cache_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_find_exact_cached_pkey); - int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; @@ -1540,6 +1501,12 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids, device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; + + if (device->port_data[port].cache.port_state != IB_PORT_NOP && + device->port_data[port].cache.port_state != tprops->state) + ibdev_info(device, "Port: %d Link %s\n", port, + ib_port_state_to_str(tprops->state)); + device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; @@ -1644,8 +1611,10 @@ int ib_cache_setup_one(struct ib_device *device) rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); - if (err) + if (err) { + gid_table_cleanup_one(device); return err; + } } return 0; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 07fb8d3c037f..8670e58675c6 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -35,6 +35,9 @@ MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); #define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ +#define CM_DIRECT_RETRY_CTX ((void *) 1UL) +#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ + static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", [IB_CM_REJ_NO_EEC] = "no EEC", @@ -93,8 +96,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work); static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param); -static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, - const void *private_data, u8 private_data_len); +static void cm_issue_dreq(struct cm_id_private *cm_id_priv); static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len); static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, @@ -166,7 +168,7 @@ struct cm_port { struct cm_device { struct kref kref; struct list_head list; - spinlock_t mad_agent_lock; + rwlock_t mad_agent_lock; struct ib_device *ib_device; u8 ack_delay; int going_down; @@ -240,7 +242,6 @@ struct cm_id_private { u8 initiator_depth; u8 retry_count; u8 
rnr_retry_count; - u8 service_timeout; u8 target_ack_delay; struct list_head work_list; @@ -284,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return ERR_PTR(-EINVAL); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); mad_agent = cm_id_priv->av.port->mad_agent; if (!mad_agent) { m = ERR_PTR(-EINVAL); @@ -307,30 +308,22 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) goto out; } - /* Timeout set by caller if response is expected. */ m->ah = ah; - m->retries = cm_id_priv->max_cm_retries; - - refcount_inc(&cm_id_priv->refcount); - m->context[0] = cm_id_priv; out: - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return m; } static void cm_free_msg(struct ib_mad_send_buf *msg) { - struct cm_id_private *cm_id_priv = msg->context[0]; - if (msg->ah) rdma_destroy_ah(msg->ah, 0); - cm_deref_id(cm_id_priv); ib_free_send_mad(msg); } static struct ib_mad_send_buf * -cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) { struct ib_mad_send_buf *msg; @@ -339,7 +332,15 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return msg; + cm_id_priv->msg = msg; + refcount_inc(&cm_id_priv->refcount); + msg->context[0] = cm_id_priv; + msg->context[1] = (void *) (unsigned long) state; + + msg->retries = cm_id_priv->max_cm_retries; + msg->timeout_ms = cm_id_priv->timeout_ms; + return msg; } @@ -358,13 +359,20 @@ static void cm_free_priv_msg(struct ib_mad_send_buf *msg) ib_free_send_mad(msg); } -static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc) +static struct ib_mad_send_buf * +cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry) { - return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); + struct ib_mad_send_buf *m; + + m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (!IS_ERR(m)) + m->context[0] = direct_retry ? 
CM_DIRECT_RETRY_CTX : NULL; + + return m; } static int cm_create_response_msg_ah(struct cm_port *port, @@ -384,12 +392,13 @@ static int cm_create_response_msg_ah(struct cm_port *port, static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; int ret; - m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry); if (IS_ERR(m)) return PTR_ERR(m); @@ -403,13 +412,6 @@ static int cm_alloc_response_msg(struct cm_port *port, return 0; } -static void cm_free_response_msg(struct ib_mad_send_buf *msg) -{ - if (msg->ah) - rdma_destroy_ah(msg->ah, 0); - ib_free_send_mad(msg); -} - static void *cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; @@ -1109,7 +1111,8 @@ retest: cm_id->state = IB_CM_IDLE; break; } - cm_send_dreq_locked(cm_id_priv, NULL, 0); + cm_issue_dreq(cm_id_priv); + cm_enter_timewait(cm_id_priv); goto retest; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->msg); @@ -1294,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return cpu_to_be64(low_tid); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); if (cm_id_priv->av.port->mad_agent) hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } @@ -1557,7 +1560,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, if (param->alternate_path) cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; @@ -1566,8 +1569,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, req_msg = (struct cm_req_msg *)msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT; cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); @@ -1598,7 +1599,7 @@ static int cm_issue_rej(struct cm_port *port, struct cm_rej_msg *rej_msg, *rcv_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg); if (ret) return ret; @@ -1624,7 +1625,7 @@ static int cm_issue_rej(struct cm_port *port, IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_response_msg(msg); + cm_free_msg(msg); return ret; } @@ -1871,7 +1872,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, - enum cm_msg_response msg_mraed, u8 service_timeout, + enum cm_msg_response msg_mraed, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); @@ -1880,7 +1881,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, be32_to_cpu(cm_id_priv->id.remote_id)); - IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout); + IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING); if (private_data && private_data_len) 
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, @@ -1951,7 +1952,7 @@ static void cm_dup_req_handler(struct cm_work *work, } spin_unlock_irq(&cm_id_priv->lock); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) return; @@ -1959,7 +1960,7 @@ static void cm_dup_req_handler(struct cm_work *work, switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REQ, cm_id_priv->private_data, cm_id_priv->private_data_len); break; @@ -1980,7 +1981,7 @@ static void cm_dup_req_handler(struct cm_work *work, return; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_response_msg(msg); +free: cm_free_msg(msg); } static struct cm_id_private *cm_match_req(struct cm_work *work, @@ -2294,7 +2295,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, goto out; } - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out; @@ -2302,8 +2303,6 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); @@ -2444,7 +2443,7 @@ static void cm_dup_rep_handler(struct cm_work *work) atomic_long_inc( &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) goto deref; @@ -2455,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work) cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REP, cm_id_priv->private_data, cm_id_priv->private_data_len); else @@ -2469,7 +2468,7 @@ static void cm_dup_rep_handler(struct cm_work *work) goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_response_msg(msg); +free: cm_free_msg(msg); deref: cm_deref_id(cm_id_priv); } @@ -2653,59 +2652,68 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, private_data_len); } -static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, - const void *private_data, u8 private_data_len) +static void cm_issue_dreq(struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg; int ret; lockdep_assert_held(&cm_id_priv->lock); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return; + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0); + + trace_icm_send_dreq(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; + spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { trace_icm_dreq_skipped(&cm_id_priv->id); - return -EINVAL; + ret = -EINVAL; + goto unlock; } if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == 
IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->msg); - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT); if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); - return PTR_ERR(msg); + ret = PTR_ERR(msg); + goto unlock; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); cm_free_priv_msg(msg); - return ret; + goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_SENT; - return 0; -} - -int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, - u8 private_data_len) -{ - struct cm_id_private *cm_id_priv = - container_of(cm_id, struct cm_id_private, id); - unsigned long flags; - int ret; - - spin_lock_irqsave(&cm_id_priv->lock, flags); - ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len); +unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } @@ -2791,7 +2799,7 @@ static int cm_issue_drep(struct cm_port *port, struct cm_drep_msg *drep_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg); if (ret) return ret; @@ -2809,7 +2817,7 @@ static int cm_issue_drep(struct cm_port *port, IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_response_msg(msg); + cm_free_msg(msg); return ret; } @@ -2856,7 +2864,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_DREQ_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; @@ -2867,7 +2876,7 @@ static int cm_dreq_handler(struct cm_work *work) if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_response_msg(msg); + cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] @@ -3085,26 +3094,13 @@ out: return -EINVAL; } -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len) +int ib_prepare_cm_mra(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; - enum cm_msg_response msg_response; - void *data; unsigned long flags; - int ret; - - if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) - return -EINVAL; - - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); + int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3113,58 +3109,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; - msg_response = CM_MSG_RESPONSE_OTHER; break; } fallthrough; default: - trace_icm_send_mra_unknown_err(&cm_id_priv->id); + trace_icm_prepare_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto 
error_unlock; } - if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { - msg = cm_alloc_msg(cm_id_priv); - if (IS_ERR(msg)) { - ret = PTR_ERR(msg); - goto error_unlock; - } - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - msg_response, service_timeout, - private_data, private_data_len); - trace_icm_send_mra(cm_id); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error_free_msg; - } - cm_id->state = cm_state; cm_id->lap_state = lap_state; - cm_id_priv->service_timeout = service_timeout; - cm_set_private_data(cm_id_priv, data, private_data_len); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return 0; + cm_set_private_data(cm_id_priv, NULL, 0); -error_free_msg: - cm_free_msg(msg); error_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); return ret; } -EXPORT_SYMBOL(ib_send_cm_mra); +EXPORT_SYMBOL(ib_prepare_cm_mra); static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { @@ -3361,20 +3332,20 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_LAP_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, - cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_response_msg(msg); + cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] @@ -3513,7 +3484,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, goto out_unlock; } - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; @@ -3521,8 +3492,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, param); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT; trace_icm_send_sidr_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); @@ -3768,17 +3737,18 @@ out: static void cm_process_send_error(struct cm_id_private *cm_id_priv, struct ib_mad_send_buf *msg, - enum ib_cm_state state, enum ib_wc_status wc_status) { + enum ib_cm_state state = (unsigned long) msg->context[1]; struct ib_cm_event cm_event = {}; int ret; - /* Discard old sends or ones without a response. */ + /* Discard old sends. 
*/ spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); cm_free_msg(msg); + cm_deref_id(cm_id_priv); return; } cm_free_priv_msg(msg); @@ -3826,9 +3796,7 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; - struct cm_id_private *cm_id_priv = msg->context[0]; - enum ib_cm_state state = - (enum ib_cm_state)(unsigned long)msg->context[1]; + struct cm_id_private *cm_id_priv; struct cm_port *port; u16 attr_index; @@ -3836,13 +3804,12 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; - /* - * If the send was in response to a received message (context[0] is not - * set to a cm_id), and is not a REJ, then it is a send that was - * manually retried. - */ - if (!cm_id_priv && (attr_index != CM_REJ_COUNTER)) + if (msg->context[0] == CM_DIRECT_RETRY_CTX) { msg->retries = 1; + cm_id_priv = NULL; + } else { + cm_id_priv = msg->context[0]; + } atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); if (msg->retries) @@ -3850,10 +3817,9 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, &port->counters[CM_XMIT_RETRIES][attr_index]); if (cm_id_priv) - cm_process_send_error(cm_id_priv, msg, state, - mad_send_wc->status); + cm_process_send_error(cm_id_priv, msg, mad_send_wc->status); else - cm_free_response_msg(msg); + cm_free_msg(msg); } static void cm_work_handler(struct work_struct *_work) @@ -4374,7 +4340,7 @@ static int cm_add_one(struct ib_device *ib_device) return -ENOMEM; kref_init(&cm_dev->kref); - spin_lock_init(&cm_dev->mad_agent_lock); + rwlock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; @@ -4490,9 +4456,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) * The above ensures no call paths from the work are running, * the remaining paths all take the mad_agent_lock. 
*/ - spin_lock(&cm_dev->mad_agent_lock); + write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; - spin_unlock(&cm_dev->mad_agent_lock); + write_unlock(&cm_dev->mad_agent_lock); ib_unregister_mad_agent(mad_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h index 944d9071245d..4a4987da69d4 100644 --- a/drivers/infiniband/core/cm_trace.h +++ b/drivers/infiniband/core/cm_trace.h @@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep); DEFINE_CM_ERR_EVENT(dreq_unknown); DEFINE_CM_ERR_EVENT(send_unknown_rej); DEFINE_CM_ERR_EVENT(rej_unknown); -DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(prepare_mra_unknown); DEFINE_CM_ERR_EVENT(mra_unknown); DEFINE_CM_ERR_EVENT(qp_init); DEFINE_CM_ERR_EVENT(qp_rtr); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1e2cd7c8716e..9b471548e7ae 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 -#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP @@ -72,6 +71,8 @@ static const char * const cma_events[] = { static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); +static void cma_netevent_work_handler(struct work_struct *_work); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -144,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) } EXPORT_SYMBOL(rdma_iw_cm_id); -/** - * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. - * @res: rdma resource tracking entry pointer - */ -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) -{ - struct rdma_id_private *id_priv = - container_of(res, struct rdma_id_private, res); - - return &id_priv->id; -} -EXPORT_SYMBOL(rdma_res_to_id); - static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); @@ -690,6 +678,7 @@ cma_validate_port(struct ib_device *device, u32 port, int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; @@ -714,19 +703,50 @@ cma_validate_port(struct ib_device *device, u32 port, rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } if (!net_eq(dev_net(ndev), dev_addr->net) || - ndev->ifindex != bound_if_index) + ndev->ifindex != bound_if_index) { + rdma_put_gid_attr(sgid_attr); sgid_attr = ERR_PTR(-ENODEV); + } rcu_read_unlock(); goto out; } - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { - ndev = dev_get_by_index(dev_addr->net, bound_if_index); - if (!ndev) - goto out; + /* + * For a RXE device, it should work with TUN device and normal ethernet + * devices. Use driver_id to check if a device is a RXE device or not. 
+ * ARPHDR_NONE means a TUN device. + */ + if (device->ops.driver_id == RDMA_DRIVER_RXE) { + if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER) + && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } } else { - gid_type = IB_GID_TYPE_IB; + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } else { + gid_type = IB_GID_TYPE_IB; + } } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); @@ -1015,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; + INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler); rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) @@ -2179,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { - trace_cm_send_mra(id_priv); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(id_priv); + ib_prepare_cm_mra(cm_id); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); @@ -2441,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { - trace_cm_send_mra(cm_id->context); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(cm_id->context); + ib_prepare_cm_mra(cm_id); } mutex_unlock(&conn_id->handler_mutex); @@ -5209,9 +5230,9 @@ static int cma_netevent_callback(struct notifier_block *self, if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; - INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); - queue_work(cma_wq, &current_id->id.net_work); + if (!queue_work(cma_wq, &current_id->id.net_work)) + cma_id_put(current_id); } out: spin_unlock_irqrestore(&id_table_lock, flags); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index 47f3c6e4be89..3456d5f3aa47 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class, DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); -DEFINE_CMA_FSM_EVENT(send_mra); +DEFINE_CMA_FSM_EVENT(prepare_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); @@ -84,7 +84,7 @@ TRACE_EVENT(cm_id_attach, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); - __assign_str(devname, device->name); + __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", @@ -334,7 +334,7 @@ DECLARE_EVENT_CLASS(cma_client_class, ), TP_fast_assign( - __assign_str(name, device->name); + __assign_str(name); ), TP_printk("device name=%s", diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index dd7715ba9fd1..05102769a918 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -325,9 +325,6 @@ void ib_qp_usecnt_inc(struct ib_qp *qp); void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr); int
rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index af59486fe418..e6ec7b7a40af 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -12,7 +12,8 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, - enum rdma_nl_counter_mask new_mask) + enum rdma_nl_counter_mask new_mask, + bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) @@ -23,6 +24,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; + port_counter->mode.bind_opcnt = bind_opcnt; return 0; } @@ -41,6 +43,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, + bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; @@ -59,12 +62,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && - port_counter->mode.mask == mask) { + port_counter->mode.mask == mask && + port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } - ret = __counter_set_mode(port_counter, mode, mask); + ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); @@ -89,7 +93,7 @@ static void auto_mode_init_counter(struct rdma_counter *counter, } static int __rdma_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) + struct ib_qp *qp, u32 port) { int ret; @@ -100,7 +104,7 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter, return -EOPNOTSUPP; mutex_lock(&counter->lock); - ret = qp->device->ops.counter_bind_qp(counter, qp); + ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; @@ -140,7 +144,8 @@ out: static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, - enum rdma_nl_counter_mode mode) + enum rdma_nl_counter_mode mode, + bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; @@ -149,13 +154,15 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = port; + dev->ops.counter_init(counter); + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) @@ -166,7 +173,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, - 0); + 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; @@ -185,10 +192,11 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, mutex_unlock(&port_counter->lock); counter->mode.mode = mode; + counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); - ret = __rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; @@ -213,7 +221,8 @@ static void 
rdma_counter_free(struct rdma_counter *counter) port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) - __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0); + __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, + false); mutex_unlock(&port_counter->lock); @@ -238,7 +247,7 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, return match; } -static int __rdma_counter_unbind_qp(struct ib_qp *qp) +static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; @@ -247,7 +256,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); - ret = qp->device->ops.counter_unbind_qp(qp); + ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; @@ -339,13 +348,14 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) counter = rdma_get_counter_auto_mode(qp, port); if (counter) { - ret = __rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { - counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, + port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } @@ -358,7 +368,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ -int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) +int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; @@ -366,7 +376,7 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) if (!counter) return -EINVAL; - ret = __rdma_counter_unbind_qp(qp); + ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; @@ -513,7 +523,7 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, goto err_task; } - ret = __rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; @@ -558,7 +568,7 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, goto err; } - counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; @@ -604,7 +614,7 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, goto out; } - ret = rdma_counter_unbind_qp(qp, false); + ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); @@ -613,13 +623,15 @@ out: int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, - enum rdma_nl_counter_mask *mask) + enum rdma_nl_counter_mask *mask, + bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; + *opcnt = port_counter->mode.bind_opcnt; return 0; } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 07cb6c5ffda0..d4263385850a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -209,23 +209,6 @@ static void __ibdev_printk(const char *level, const struct ib_device *ibdev, printk("%s(NULL ib_device): %pV", level, vaf); } -void ibdev_printk(const char *level, const struct ib_device *ibdev, - const char *format, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, format); - - vaf.fmt = format; - vaf.va = &args; - - __ibdev_printk(level, ibdev, &vaf); - - va_end(args); -} -EXPORT_SYMBOL(ibdev_printk); - #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ @@ -437,6 +420,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); + rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } @@ -503,6 +487,7 @@ static void ib_device_release(struct device *device) rcu_head); } + mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); @@ -543,6 +528,8 @@ static struct class ib_class = { static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { + bool is_full_dev = &dev->coredev == coredev; + /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers @@ -554,6 +541,13 @@ static void rdma_init_coredev(struct ib_core_device *coredev, coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; + + /* + * Don't expose hw counters outside of the init namespace. + */ + if (!is_full_dev && dev->hw_stats_attr_index) + coredev->dev.groups[dev->hw_stats_attr_index] = NULL; + device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); @@ -641,6 +635,11 @@ struct ib_device *_ib_alloc_device(size_t size) BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); + + mutex_init(&device->subdev_lock); + INIT_LIST_HEAD(&device->subdev_list_head); + INIT_LIST_HEAD(&device->subdev_list); + return device; } EXPORT_SYMBOL(_ib_alloc_device); @@ -1345,6 +1344,37 @@ static void prevent_dealloc_device(struct ib_device *ib_dev) { } +static void ib_device_notify_register(struct ib_device *device) +{ + struct net_device *netdev; + u32 port; + int ret; + + down_read(&devices_rwsem); + + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); + if (ret) + goto out; + + rdma_for_each_port(device, port) { + netdev = ib_device_get_netdev(device, port); + if (!netdev) + continue; + + ret = rdma_nl_notify_event(device, port, + RDMA_NETDEV_ATTACH_EVENT); + dev_put(netdev); + if (ret) + goto out; + } + +out: + up_read(&devices_rwsem); +} + /** * ib_register_device - Register an IB device with IB core * @device: Device to register @@ -1441,8 +1471,9 @@ int ib_register_device(struct ib_device *device, const char *name, return ret; } dev_set_uevent_suppress(&device->dev, false); - /* Mark for userspace that device is ready */ - kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ib_device_notify_register(device); + ib_device_put(device); return 0; @@ -1461,6 +1492,18 @@ EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. 
*/ static void __ib_unregister_device(struct ib_device *ib_dev) { + struct ib_device *sub, *tmp; + + mutex_lock(&ib_dev->subdev_lock); + list_for_each_entry_safe_reverse(sub, tmp, + &ib_dev->subdev_list_head, + subdev_list) { + list_del(&sub->subdev_list); + ib_dev->ops.del_sub_dev(sub); + ib_device_put(ib_dev); + } + mutex_unlock(&ib_dev->subdev_lock); + /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely @@ -1473,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) goto out; disable_device(ib_dev); + rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); @@ -2141,11 +2185,15 @@ static void add_ndev_hash(struct ib_port_data *pdata) int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { + enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. @@ -2154,9 +2202,6 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, if (ret) return ret; - if (!rdma_is_port_valid(ib_dev, port)) - return -EINVAL; - pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( @@ -2166,16 +2211,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, return 0; } - if (old_ndev) - netdev_tracker_free(ndev, &pdata->netdev_tracker); - if (ndev) - netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); + netdev_put(old_ndev, &pdata->netdev_tracker); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); - if (old_ndev) - __dev_put(old_ndev); + + /* Make sure that the device is registered before we send events */ + if (xa_load(&devices, ib_dev->index) != ib_dev) + return 0; + + etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; + rdma_nl_notify_event(ib_dev, port, etype); return 0; } @@ -2223,6 +2271,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, if (!rdma_is_port_valid(ib_dev, port)) return NULL; + if (!ib_dev->port_data) + return NULL; + pdata = &ib_dev->port_data[port]; /* @@ -2235,22 +2286,40 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); - if (res) - dev_hold(res); + dev_hold(res); spin_unlock(&pdata->netdev_lock); } - /* - * If we are starting to unregister expedite things by preventing - * propagation of an unregistering netdev. 
- */ - if (res && res->reg_state != NETREG_REGISTERED) { - dev_put(res); - return NULL; + return res; +} +EXPORT_SYMBOL(ib_device_get_netdev); + +/** + * ib_query_netdev_port - Query the port number of a net_device + * associated with an ibdev + * @ibdev: IB device + * @ndev: Network device + * @port: IB port the net_device is connected to + */ +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port) +{ + struct net_device *ib_ndev; + u32 port_num; + + rdma_for_each_port(ibdev, port_num) { + ib_ndev = ib_device_get_netdev(ibdev, port_num); + if (ndev == ib_ndev) { + *port = port_num; + dev_put(ib_ndev); + return 0; + } + dev_put(ib_ndev); } - return res; + return -ENOENT; } +EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev @@ -2311,9 +2380,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); - - if (idev) - dev_put(idev); + dev_put(idev); } } @@ -2601,6 +2668,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); + SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); @@ -2617,6 +2685,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); @@ -2635,6 +2704,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); + SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); @@ -2717,6 +2787,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); + SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); + SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); @@ -2728,9 +2800,59 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); + SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); +int ib_add_sub_device(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct ib_device *sub; + int ret = 0; + + if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) + return -EOPNOTSUPP; + + if (!ib_device_try_get(parent)) + return -EINVAL; + + sub = parent->ops.add_sub_dev(parent, type, name); + if (IS_ERR(sub)) { + ib_device_put(parent); + return PTR_ERR(sub); + } + + sub->type = type; + sub->parent = parent; + + mutex_lock(&parent->subdev_lock); + list_add_tail(&parent->subdev_list_head, &sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + return ret; +} +EXPORT_SYMBOL(ib_add_sub_device); + +int ib_del_sub_device_and_put(struct ib_device *sub) +{ + struct ib_device *parent = sub->parent; + + if (!parent) + return -EOPNOTSUPP; + + 
mutex_lock(&parent->subdev_lock); + list_del(&sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + ib_device_put(sub); + parent->ops.del_sub_dev(sub); + ib_device_put(parent); + + return 0; +} +EXPORT_SYMBOL(ib_del_sub_device_and_put); + #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { @@ -2761,6 +2883,97 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { }, }; +void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) +{ + enum ib_port_state curr_state; + struct ib_event ibevent = {}; + u32 port; + + if (ib_query_netdev_port(ibdev, ndev, &port)) + return; + + curr_state = ib_get_curr_port_state(ndev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port; + ib_dispatch_event(&ibevent); +} +EXPORT_SYMBOL(ib_dispatch_port_state_event); + +static void handle_port_event(struct net_device *ndev, unsigned long event) +{ + struct ib_device *ibdev; + + /* Currently, link events in bonding scenarios are still + * reported by drivers that support bonding. + */ + if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) + return; + + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return; + + if (ibdev->ops.report_port_event) { + ibdev->ops.report_port_event(ibdev, ndev, event); + goto put_ibdev; + } + + ib_dispatch_port_state_event(ibdev, ndev); + +put_ibdev: + ib_device_put(ibdev); +}; + +static int ib_netdevice_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct ib_device *ibdev; + u32 port; + + switch (event) { + case NETDEV_CHANGENAME: + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return NOTIFY_DONE; + + if (ib_query_netdev_port(ibdev, ndev, &port)) { + ib_device_put(ibdev); + break; + } + + rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); + ib_device_put(ibdev); + break; + + case NETDEV_UP: + case NETDEV_CHANGE: + case NETDEV_DOWN: + handle_port_event(ndev, event); + break; + + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_netdevice = { + .notifier_call = ib_netdevice_event, +}; + static int __init ib_core_init(void) { int ret = -ENOMEM; @@ -2832,6 +3045,8 @@ static int __init ib_core_init(void) goto err_parent; } + register_netdevice_notifier(&nb_netdevice); + return 0; err_parent: @@ -2861,6 +3076,7 @@ err: static void __exit ib_core_cleanup(void) { + unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 0301fcad4b48..62410578dec3 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -109,7 +109,9 @@ static struct ctl_table iwcm_ctl_table[] = { .data = &default_backlog, .maxlen = sizeof(default_backlog), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, }; @@ -143,8 +145,8 @@ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) if 
(list_empty(&cm_id_priv->work_free_list)) return NULL; - work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, - free_list); + work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work, + free_list); list_del_init(&work->free_list); return work; } @@ -206,17 +208,17 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) /* * Release a reference on cm_id. If the last reference is being - * released, free the cm_id and return 1. + * released, free the cm_id and return 'true'. */ -static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); - return 1; + return true; } - return 0; + return false; } static void add_ref(struct iw_cm_id *cm_id) @@ -366,8 +368,7 @@ EXPORT_SYMBOL(iw_cm_disconnect); /* * CM_ID <-- DESTROYING * - * Clean up all resources associated with the connection and release - * the initial reference taken by iw_create_cm_id. + * Clean up all resources associated with the connection. */ static void destroy_cm_id(struct iw_cm_id *cm_id) { @@ -438,19 +439,22 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } - - (void)iwcm_deref_id(cm_id_priv); } /* - * This function is only called by the application thread and cannot - * be called by the event thread. The function will wait for all - * references to be released on the cm_id and then kfree the cm_id - * object. + * Destroy cm_id. If the cm_id still has other references, wait for all + * references to be released on the cm_id and then release the initial + * reference taken by iw_create_cm_id. 
*/ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); destroy_cm_id(cm_id); + if (refcount_read(&cm_id_priv->refcount) > 1) + flush_workqueue(iwcm_wq); + iwcm_deref_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -1017,30 +1021,27 @@ static void cm_work_handler(struct work_struct *_work) struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; - int empty; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); - empty = list_empty(&cm_id_priv->work_list); - while (!empty) { - work = list_entry(cm_id_priv->work_list.next, - struct iwcm_work, list); + while (!list_empty(&cm_id_priv->work_list)) { + work = list_first_entry(&cm_id_priv->work_list, + struct iwcm_work, list); list_del_init(&work->list); - empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); - if (ret) + if (ret) { destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) return; - if (empty) - return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1093,11 +1094,8 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } refcount_inc(&cm_id_priv->refcount); - if (list_empty(&cm_id_priv->work_list)) { - list_add_tail(&work->list, &cm_id_priv->work_list); - queue_work(iwcm_wq, &work->work); - } else - list_add_tail(&work->list, &cm_id_priv->work_list); + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; @@ -1187,7 +1185,7 @@ static int __init iw_cm_init(void) if (ret) return ret; - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) goto err_alloc; diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c index eca6e37c72ba..8fd80adfe833 100644 --- a/drivers/infiniband/core/lag.c +++ b/drivers/infiniband/core/lag.c @@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, slave = netdev_get_xmit_slave(master, skb, !!(device->lag_flags & RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); - if (slave) - dev_hold(slave); + dev_hold(slave); rcu_read_unlock(); kfree_skb(skb); return slave; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 674344eb8e2f..73f3a0b9a54b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2616,14 +2616,16 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) static void timeout_sends(struct work_struct *work) { + struct ib_mad_send_wr_private *mad_send_wr, *n; struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; + struct list_head local_list; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; + INIT_LIST_HEAD(&local_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { @@ -2641,13 +2643,16 @@ static void timeout_sends(struct work_struct *work) break; } - list_del(&mad_send_wr->agent_list); + list_del_init(&mad_send_wr->agent_list); 
if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, &local_list); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else @@ -2655,11 +2660,8 @@ static void timeout_sends(struct work_struct *work) mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); - deref_mad_agent(mad_agent_priv); - spin_lock_irqsave(&mad_agent_priv->lock, flags); } - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } /* @@ -2669,11 +2671,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; - int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; struct ib_recv_wr recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + int ret = 0; /* Initialize common scatter list fields */ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; @@ -2683,7 +2685,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; - do { + while (true) { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; @@ -2691,10 +2693,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, } else { mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), GFP_ATOMIC); - if (!mad_priv) { - ret = -ENOMEM; - break; - } + if (!mad_priv) + return -ENOMEM; } sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, @@ -2703,37 +2703,41 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { - kfree(mad_priv); ret = -ENOMEM; - break; + goto free_mad_priv; } mad_priv->header.mapping = sg_list.addr; mad_priv->header.mad_list.mad_queue = recv_queue; mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; - - /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); - post = (++recv_queue->count < recv_queue->max_active); - list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + if (recv_queue->count >= recv_queue->max_active) { + /* Fully populated the receive queue */ + spin_unlock_irqrestore(&recv_queue->lock, flags); + break; + } + recv_queue->count++; + list_add_tail(&mad_priv->header.mad_list.list, + &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, NULL); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); - ib_dma_unmap_single(qp_info->port_priv->device, - mad_priv->header.mapping, - mad_priv_dma_size(mad_priv), - DMA_FROM_DEVICE); - kfree(mad_priv); dev_err(&qp_info->port_priv->device->dev, "ib_post_recv failed: %d\n", ret); break; } - } while (post); + } + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); +free_mad_priv: + kfree(mad_priv); return ret; } @@ -2937,7 +2941,6 @@ static int ib_mad_port_open(struct ib_device *device, int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; - char name[sizeof 
"ib_mad123"]; int has_smi; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) @@ -2983,12 +2986,15 @@ static int ib_mad_port_open(struct ib_device *device, if (ret) goto error6; } - ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); - if (ret) - goto error7; - snprintf(name, sizeof(name), "ib_mad%u", port_num); - port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (rdma_cap_ib_cm(device, port_num)) { + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + } + + port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM, + port_num); if (!port_priv->wq) { ret = -ENOMEM; goto error8; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 8af0619a39cd..b4b10e8a6495 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) - return (void *) ah; + return ERR_CAST(ah); hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index ae2db0c70788..def14c54b648 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet) struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, }; struct sock *nls; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 4900a0848124..a872643e8039 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -137,6 +137,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, @@ -164,6 +166,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -298,6 +306,19 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) return -EMSGSIZE; + if (device->type && + nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) + return -EMSGSIZE; + + if (device->parent && + nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, + dev_name(&device->parent->dev))) + return -EMSGSIZE; + + if (nla_put_u8(msg, 
RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, + device->name_assign_type)) + return -EMSGSIZE; + /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same @@ -399,7 +420,8 @@ err: return -EMSGSIZE; } -static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +static int fill_res_info(struct sk_buff *msg, struct ib_device *device, + bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", @@ -424,7 +446,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(device, i); + curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -1054,8 +1076,8 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1103,8 +1125,8 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1195,8 +1217,8 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 port; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) @@ -1255,8 +1277,8 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, int err; unsigned int p; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1305,13 +1327,14 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1320,6 +1343,9 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; @@ -1334,7 +1360,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_free; } - ret = fill_res_info(msg, device); + ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; @@ -1364,7 +1390,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); - if (!nlh || fill_res_info(skb, device)) { + if 
(!nlh || fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1457,8 +1483,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; @@ -1534,6 +1560,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; + bool show_details = false; struct nlattr *table_attr; struct nlattr *entry_attr; struct ib_device *device; @@ -1544,8 +1571,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, u32 index, port = 0; bool filled = false; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in @@ -1562,6 +1589,9 @@ static int res_get_common_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ @@ -1599,6 +1629,9 @@ static int res_get_common_dumpit(struct sk_buff *skb, * objects. */ xa_for_each(&rt->xa, id, res) { + if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) + goto next; + if (idx < start || !rdma_restrack_get(res)) goto next; @@ -1731,8 +1764,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char type[IFNAMSIZ]; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; @@ -1775,8 +1808,8 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1805,8 +1838,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, - extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; @@ -1889,8 +1922,8 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; @@ -1920,6 +1953,12 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_free(msg); return err; } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); + if (err) { + nlmsg_free(msg); + return err; + } /* * Copy-on-fork is supported. 
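The resource dump paths above hide driver-specific objects unless userspace sets RDMA_NLDEV_ATTR_DRIVER_DETAILS, relying on an xarray mark (RESTRACK_DD) applied when the entry is tracked. A condensed sketch of that two-part mechanism, with hypothetical helper names and a generic xarray mark standing in for RESTRACK_DD:

#include <linux/xarray.h>

/* Tag an entry as driver-detail when it is inserted ... */
static void track_driver_res(struct xarray *xa, unsigned long id, void *res)
{
	xa_store(xa, id, res, GFP_KERNEL);
	xa_set_mark(xa, id, XA_MARK_1);	/* stand-in for RESTRACK_DD */
}

/* ... and skip it on dump unless driver details were requested. */
static bool res_visible(struct xarray *xa, unsigned long id, bool show_details)
{
	return show_details || !xa_get_mark(xa, id, XA_MARK_1);
}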
* See commits: @@ -1990,6 +2029,7 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, struct ib_device *device, u32 port) { u32 mode, mask = 0, qpn, cntn = 0; + bool opcnt = false; int ret; /* Currently only counter for QP is supported */ @@ -1997,12 +2037,17 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) + opcnt = !!nla_get_u8( + tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); if (mode == RDMA_COUNTER_MODE_AUTO) { if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) mask = nla_get_u32( tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); - return rdma_counter_set_auto_mode(device, port, mask, extack); + return rdma_counter_set_auto_mode(device, port, mask, opcnt, + extack); } if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) @@ -2320,6 +2365,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct ib_device *device; struct sk_buff *msg; u32 index, port; + bool opcnt; int ret; if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) @@ -2355,7 +2401,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_msg; } - ret = rdma_counter_get_mode(device, port, &mode, &mask); + ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); if (ret) goto err_msg; @@ -2372,6 +2418,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_msg; } + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { + ret = -EMSGSIZE; + goto err_msg; + } + nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); @@ -2389,8 +2441,8 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return -EINVAL; @@ -2419,8 +2471,8 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; @@ -2451,8 +2503,8 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, u32 devid, port; int ret, i; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; @@ -2533,6 +2585,56 @@ err: return ret; } +static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + enum rdma_nl_dev_type type; + struct ib_device *parent; + char name[IFNAMSIZ] = {}; + u32 parentid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) + return -EINVAL; + + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); + type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); + parentid = 
nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + parent = ib_device_get_by_index(sock_net(skb->sk), parentid); + if (!parent) + return -EINVAL; + + ret = ib_add_sub_device(parent, type, name); + ib_device_put(parent); + + return ret; +} + +static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 devid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + return ib_del_sub_device_and_put(device); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2631,8 +2733,178 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { .doit = nldev_stat_get_counter_status_doit, }, + [RDMA_NLDEV_CMD_NEWDEV] = { + .doit = nldev_newdev, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELDEV] = { + .doit = nldev_deldev, + .flags = RDMA_NL_ADMIN_PERM, + }, }; +static int fill_mon_netdev_rename(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (!netdev || !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); +out: + dev_put(netdev); + return ret; +} + +static int fill_mon_netdev_association(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (netdev && !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); + if (ret) + goto out; + + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev)); + if (ret) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); + if (ret) + goto out; + + if (netdev) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + dev_put(netdev); + return ret; +} + +static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct net_device *netdev; + + switch (type) { + case RDMA_REGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor register device event\n"); + break; + case RDMA_UNREGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor unregister device event\n"); + break; + case RDMA_NETDEV_ATTACH_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + case RDMA_NETDEV_DETACH_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev detach event: port %d\n", + port_num); + break; + case RDMA_RENAME_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor rename device event\n"); + break; + + case RDMA_NETDEV_RENAME_EVENT: + netdev = 
ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + default: + break; + } +} + +int rdma_nl_notify_event(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct sk_buff *skb; + int ret = -EMSGSIZE; + struct net *net; + void *nlh; + + net = read_pnet(&device->coredev.rdma_net); + if (!net) + return -EINVAL; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, 0, 0, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), + 0, 0); + if (!nlh) + goto err_free; + + switch (type) { + case RDMA_REGISTER_EVENT: + case RDMA_UNREGISTER_EVENT: + case RDMA_RENAME_EVENT: + ret = fill_nldev_handle(skb, device); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_ATTACH_EVENT: + case RDMA_NETDEV_DETACH_EVENT: + ret = fill_mon_netdev_association(skb, device, port_num, net); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_RENAME_EVENT: + ret = fill_mon_netdev_rename(skb, device, port_num, net); + if (ret) + goto err_free; + break; + default: + break; + } + + ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); + if (ret) + goto err_free; + + nlmsg_end(skb, nlh); + ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); + if (ret && ret != -ESRCH) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + goto err_free; + } + return 0; + +err_free: + rdma_nl_notify_err_msg(device, port_num, type); + nlmsg_free(skb); + return ret; +} + void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 29b1ab1d5f93..90c177edf9b0 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -58,8 +58,8 @@ void uverbs_uobject_put(struct ib_uobject *uobject) } EXPORT_SYMBOL(uverbs_uobject_put); -static int uverbs_try_lock_object(struct ib_uobject *uobj, - enum rdma_lookup_mode mode) +int uverbs_try_lock_object(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { /* * When a shared access is required, we use a positive counter. 
Each @@ -84,6 +84,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, } return 0; } +EXPORT_SYMBOL(uverbs_try_lock_object); static void assert_uverbs_usecnt(struct ib_uobject *uobj, enum rdma_lookup_mode mode) @@ -880,9 +881,14 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { + struct uverbs_attr_bundle attrs = { .ufile = ufile }; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; struct ib_uobject *obj, *next_obj; int ret = -EINVAL; - struct uverbs_attr_bundle attrs = { .ufile = ufile }; + + if (ib_dev->ops.ufile_hw_cleanup) + ib_dev->ops.ufile_hw_cleanup(ufile); /* * This shouldn't run while executing other commands on this diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 01a499a8b88d..3313410014cd 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -37,22 +37,6 @@ int rdma_restrack_init(struct ib_device *dev) return 0; } -static const char *type2str(enum rdma_restrack_type type) -{ - static const char * const names[RDMA_RESTRACK_MAX] = { - [RDMA_RESTRACK_PD] = "PD", - [RDMA_RESTRACK_CQ] = "CQ", - [RDMA_RESTRACK_QP] = "QP", - [RDMA_RESTRACK_CM_ID] = "CM_ID", - [RDMA_RESTRACK_MR] = "MR", - [RDMA_RESTRACK_CTX] = "CTX", - [RDMA_RESTRACK_COUNTER] = "COUNTER", - [RDMA_RESTRACK_SRQ] = "SRQ", - }; - - return names[type]; -}; - /** * rdma_restrack_clean() - clean resource tracking * @dev: IB device @@ -60,47 +44,14 @@ static const char *type2str(enum rdma_restrack_type type) void rdma_restrack_clean(struct ib_device *dev) { struct rdma_restrack_root *rt = dev->res; - struct rdma_restrack_entry *e; - char buf[TASK_COMM_LEN]; - bool found = false; - const char *owner; int i; for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { struct xarray *xa = &dev->res[i].xa; - if (!xa_empty(xa)) { - unsigned long index; - - if (!found) { - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - } - xa_for_each(xa, index, e) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). - */ - get_task_comm(buf, e->task); - owner = buf; - } - - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? 
"Kernel" : - "User", - type2str(e->type), owner); - } - found = true; - } + WARN_ON(!xa_empty(xa)); xa_destroy(xa); } - if (found) - pr_err("restrack: %s", CUT_HERE); - kfree(rt); } @@ -108,8 +59,10 @@ void rdma_restrack_clean(struct ib_device *dev) * rdma_restrack_count() - the current usage of specific object * @dev: IB device * @type: actual type of object to operate + * @show_details: count driver specific objects */ -int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, + bool show_details) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; @@ -117,8 +70,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) u32 cnt = 0; xa_lock(&rt->xa); - xas_for_each(&xas, e, U32_MAX) + xas_for_each(&xas, e, U32_MAX) { + if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details) + continue; cnt++; + } xa_unlock(&rt->xa); return cnt; } @@ -247,6 +203,9 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); if (ret) res->id = 0; + + if (qp->qp_type >= IB_QPT_DRIVER) + xa_set_mark(&rt->xa, res->id, RESTRACK_DD); } else if (res->type == RDMA_RESTRACK_COUNTER) { /* Special case to ensure that cntn points to right counter */ struct rdma_counter *counter; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index e958c43dd28f..a9f2c6b1b29e 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -515,6 +515,27 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev) } EXPORT_SYMBOL(rdma_roce_rescan_device); +/** + * rdma_roce_rescan_port - Rescan all of the network devices in the system + * and add their gids if relevant to the port of the RoCE device. 
+ * + * @ib_dev: IB device + * @port: Port number + */ +void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) +{ + struct net_device *ndev = NULL; + + if (rdma_protocol_roce(ib_dev, port)) { + ndev = ib_device_get_netdev(ib_dev, port); + if (!ndev) + return; + enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); + dev_put(ndev); + } +} +EXPORT_SYMBOL(rdma_roce_rescan_port); + static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, @@ -575,16 +596,17 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, } } -static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, - struct net_device *event_ndev) +void roce_del_all_netdev_gids(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) { - ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); + ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } +EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { - handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); + handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, @@ -601,8 +623,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); - if (master_ndev) - dev_hold(master_ndev); + dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8175dde60b0a..53571e6b3162 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1420,7 +1420,7 @@ enum opa_pr_supported { /* * opa_pr_query_possible - Check if current PR query can be an OPA query. * - * Retuns PR_NOT_SUPPORTED if a path record query is not + * Returns PR_NOT_SUPPORTED if a path record query is not * possible, PR_OPA_SUPPORTED if an OPA path record query * is possible and PR_IB_SUPPORTED if an IB path record * query is possible. diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9f97bef02149..0ed862b38b44 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -216,24 +216,12 @@ static ssize_t state_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attr attr; ssize_t ret; - static const char *state_name[] = { - [IB_PORT_NOP] = "NOP", - [IB_PORT_DOWN] = "DOWN", - [IB_PORT_INIT] = "INIT", - [IB_PORT_ARMED] = "ARMED", - [IB_PORT_ACTIVE] = "ACTIVE", - [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" - }; - ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%d: %s\n", attr.state, - attr.state >= 0 && - attr.state < ARRAY_SIZE(state_name) ? 
- state_name[attr.state] : - "UNKNOWN"); + ib_port_state_to_str(attr.state)); } static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, @@ -988,6 +976,7 @@ int ib_setup_device_attrs(struct ib_device *ibdev) for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) if (!ibdev->groups[i]) { ibdev->groups[i] = &data->group; + ibdev->hw_stats_attr_index = i; return 0; } WARN(true, "struct ib_device->groups is too small"); diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c new file mode 100644 index 000000000000..de5cb8bf0a61 --- /dev/null +++ b/drivers/infiniband/core/ucaps.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include <linux/kref.h> +#include <linux/cdev.h> +#include <linux/mutex.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <rdma/ib_ucaps.h> + +#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL + +static DEFINE_MUTEX(ucaps_mutex); +static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX]; +static bool ucaps_class_is_registered; +static dev_t ucaps_base_dev; + +struct ib_ucap { + struct cdev cdev; + struct device dev; + struct kref ref; +}; + +static const char *ucap_names[RDMA_UCAP_MAX] = { + [RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local", + [RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca" +}; + +static char *ucaps_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static const struct class ucaps_class = { + .name = "infiniband_ucaps", + .devnode = ucaps_devnode, +}; + +static const struct file_operations ucaps_cdev_fops = { + .owner = THIS_MODULE, + .open = simple_open, +}; + +/** + * ib_cleanup_ucaps - cleanup all API resources and class. + * + * This is called once, when removing the ib_uverbs module. + */ +void ib_cleanup_ucaps(void) +{ + mutex_lock(&ucaps_mutex); + if (!ucaps_class_is_registered) { + mutex_unlock(&ucaps_mutex); + return; + } + + for (int i = RDMA_UCAP_FIRST; i < RDMA_UCAP_MAX; i++) + WARN_ON(ucaps_list[i]); + + class_unregister(&ucaps_class); + ucaps_class_is_registered = false; + unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX); + mutex_unlock(&ucaps_mutex); +} + +static int get_ucap_from_devt(dev_t devt, u64 *idx_mask) +{ + for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) { + *idx_mask |= 1 << type; + return 0; + } + } + + return -EINVAL; +} + +static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev) +{ + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + *ret_dev = file_inode(file)->i_rdev; + fput(file); + return 0; +} + +/** + * ib_ucaps_init - Initialization required before ucap creation. 
+ * + * Return: 0 on success, or a negative errno value on failure + */ +static int ib_ucaps_init(void) +{ + int ret = 0; + + if (ucaps_class_is_registered) + return ret; + + ret = class_register(&ucaps_class); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX, + ucaps_class.name); + if (ret < 0) { + class_unregister(&ucaps_class); + return ret; + } + + ucaps_class_is_registered = true; + + return 0; +} + +static void ucap_dev_release(struct device *device) +{ + struct ib_ucap *ucap = container_of(device, struct ib_ucap, dev); + + kfree(ucap); +} + +/** + * ib_create_ucap - Add a ucap character device + * @type: UCAP type + * + * Creates a ucap character device in the /dev/infiniband directory. By default, + * the device has root-only read-write access. + * + * A driver may call this multiple times with the same UCAP type. A reference + * count tracks creations and deletions. + * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_create_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + int ret; + + if (type >= RDMA_UCAP_MAX) + return -EINVAL; + + mutex_lock(&ucaps_mutex); + ret = ib_ucaps_init(); + if (ret) + goto unlock; + + ucap = ucaps_list[type]; + if (ucap) { + kref_get(&ucap->ref); + mutex_unlock(&ucaps_mutex); + return 0; + } + + ucap = kzalloc(sizeof(*ucap), GFP_KERNEL); + if (!ucap) { + ret = -ENOMEM; + goto unlock; + } + + device_initialize(&ucap->dev); + ucap->dev.class = &ucaps_class; + ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type); + ucap->dev.release = ucap_dev_release; + ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]); + if (ret) + goto err_device; + + cdev_init(&ucap->cdev, &ucaps_cdev_fops); + ucap->cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&ucap->cdev, &ucap->dev); + if (ret) + goto err_device; + + kref_init(&ucap->ref); + ucaps_list[type] = ucap; + mutex_unlock(&ucaps_mutex); + + return 0; + +err_device: + put_device(&ucap->dev); +unlock: + mutex_unlock(&ucaps_mutex); + return ret; +} +EXPORT_SYMBOL(ib_create_ucap); + +static void ib_release_ucap(struct kref *ref) +{ + struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref); + enum rdma_user_cap type; + + for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] == ucap) + break; + } + WARN_ON(type == RDMA_UCAP_MAX); + + ucaps_list[type] = NULL; + cdev_device_del(&ucap->cdev, &ucap->dev); + put_device(&ucap->dev); +} + +/** + * ib_remove_ucap - Remove a ucap character device + * @type: User cap type + * + * Removes the ucap character device according to type. The device is completely + * removed from the filesystem when its reference count reaches 0. + */ +void ib_remove_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + + mutex_lock(&ucaps_mutex); + ucap = ucaps_list[type]; + if (WARN_ON(!ucap)) + goto end; + + kref_put(&ucap->ref, ib_release_ucap); +end: + mutex_unlock(&ucaps_mutex); +} +EXPORT_SYMBOL(ib_remove_ucap); + +/** + * ib_get_ucaps - Get bitmask of ucap types from file descriptors + * @fds: Array of file descriptors + * @fd_count: Number of file descriptors in the array + * @idx_mask: Bitmask to be updated based on the ucaps in the fd list + * + * Given an array of file descriptors, this function returns a bitmask of + * the ucaps where a bit is set if an FD for that ucap type was in the array. 
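A minimal usage sketch for the ucap API added in this file: a driver exposes a capability chardev at init and retires it at teardown. The enable/disable wrappers are hypothetical; the calls, the enum value and the resulting /dev/infiniband/mlx5_perm_ctrl_local node come from the code above:

#include <rdma/ib_ucaps.h>

/* Repeated creations of the same type are reference counted. */
static int mydrv_enable_local_ctrl_ucap(void)
{
	return ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
}

/* Drops one reference; the chardev disappears once the count reaches zero. */
static void mydrv_disable_local_ctrl_ucap(void)
{
	ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
}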
+ * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask) +{ + int ret = 0; + dev_t dev; + + *idx_mask = 0; + mutex_lock(&ucaps_mutex); + for (int i = 0; i < fd_count; i++) { + ret = get_devt_from_fd(fds[i], &dev); + if (ret) + goto end; + + ret = get_ucap_from_devt(dev, idx_mask); + if (ret) + goto end; + } + +end: + mutex_unlock(&ucaps_mutex); + return ret; +} diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 5f5ad8faf86e..6e700b974033 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -69,7 +69,9 @@ static struct ctl_table ucma_ctl_table[] = { .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, }; @@ -1615,7 +1617,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, struct ucma_event *uevent, *tmp; struct ucma_context *ctx; LIST_HEAD(event_list); - struct fd f; struct ucma_file *cur_file; int ret = 0; @@ -1623,21 +1624,17 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, return -EFAULT; /* Get current fd to protect against it being closed */ - f = fdget(cmd.fd); - if (!f.file) + CLASS(fd, f)(cmd.fd); + if (fd_empty(f)) return -ENOENT; - if (f.file->f_op != &ucma_fops) { - ret = -EINVAL; - goto file_put; - } - cur_file = f.file->private_data; + if (fd_file(f)->f_op != &ucma_fops) + return -EINVAL; + cur_file = fd_file(f)->private_data; /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(cur_file, cmd.id); - if (IS_ERR(ctx)) { - ret = PTR_ERR(ctx); - goto file_put; - } + if (IS_ERR(ctx)) + return PTR_ERR(ctx); rdma_lock_handler(ctx->cm_id); /* @@ -1678,8 +1675,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, err_unlock: rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); -file_put: - fdput(f); return ret; } @@ -1817,7 +1812,6 @@ static const struct file_operations ucma_fops = { .release = ucma_close, .write = ucma_write, .poll = ucma_poll, - .llseek = no_llseek, }; static struct miscdevice ucma_misc = { diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 64d9c492de64..8d3dfef9ebaa 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header, return len; } EXPORT_SYMBOL(ib_ud_header_pack); - -/** - * ib_ud_header_unpack - Unpack UD header struct from wire format - * @header:UD header struct - * @buf:Buffer to pack into - * - * ib_ud_header_pack() unpacks the UD header structure @header from wire - * format in the buffer @buf. 
- */ -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header) -{ - ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), - buf, &header->lrh); - buf += IB_LRH_BYTES; - - if (header->lrh.link_version != 0) { - pr_warn("Invalid LRH.link_version %u\n", - header->lrh.link_version); - return -EINVAL; - } - - switch (header->lrh.link_next_header) { - case IB_LNH_IBA_LOCAL: - header->grh_present = 0; - break; - - case IB_LNH_IBA_GLOBAL: - header->grh_present = 1; - ib_unpack(grh_table, ARRAY_SIZE(grh_table), - buf, &header->grh); - buf += IB_GRH_BYTES; - - if (header->grh.ip_version != 6) { - pr_warn("Invalid GRH.ip_version %u\n", - header->grh.ip_version); - return -EINVAL; - } - if (header->grh.next_header != 0x1b) { - pr_warn("Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); - return -EINVAL; - } - break; - - default: - pr_warn("Invalid LRH.link_next_header %u\n", - header->lrh.link_next_header); - return -EINVAL; - } - - ib_unpack(bth_table, ARRAY_SIZE(bth_table), - buf, &header->bth); - buf += IB_BTH_BYTES; - - switch (header->bth.opcode) { - case IB_OPCODE_UD_SEND_ONLY: - header->immediate_present = 0; - break; - case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: - header->immediate_present = 1; - break; - default: - pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); - return -EINVAL; - } - - if (header->bth.transport_header_version != 0) { - pr_warn("Invalid BTH.transport_header_version %u\n", - header->bth.transport_header_version); - return -EINVAL; - } - - ib_unpack(deth_table, ARRAY_SIZE(deth_table), - buf, &header->deth); - buf += IB_DETH_BYTES; - - if (header->immediate_present) - memcpy(&header->immediate_data, buf, sizeof header->immediate_data); - - return 0; -} -EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 07c571c7b699..c5b686394760 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -80,9 +80,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, unsigned long pgsz_bitmap, unsigned long virt) { - struct scatterlist *sg; + unsigned long curr_len = 0; + dma_addr_t curr_base = ~0; unsigned long va, pgoff; + struct scatterlist *sg; dma_addr_t mask; + dma_addr_t end; int i; umem->iova = va = virt; @@ -107,17 +110,30 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, pgoff = umem->address & ~PAGE_MASK; for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* Walk SGL and reduce max page size if VA/PA bits differ - * for any address. + /* If the current entry is physically contiguous with the previous + * one, no need to take its start addresses into consideration. */ - mask |= (sg_dma_address(sg) + pgoff) ^ va; + if (check_add_overflow(curr_base, curr_len, &end) || + end != sg_dma_address(sg)) { + + curr_base = sg_dma_address(sg); + curr_len = 0; + + /* Reduce max page size if VA/PA bits differ */ + mask |= (curr_base + pgoff) ^ va; + + /* The alignment of any VA matching a discontinuity point + * in the physical memory sets the maximum possible page + * size as this must be a starting point of a new page that + * needs to be aligned. + */ + if (i != 0) + mask |= va; + } + + curr_len += sg_dma_len(sg); va += sg_dma_len(sg) - pgoff; - /* Except for the last entry, the ending iova alignment sets - * the maximum possible page size as the low bits of the iova - * must be zero when starting the next chunk. 
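The reworked ib_umem_find_best_pgsz() logic above reduces to a mask rule: any bit where the IOVA and a chunk's DMA address disagree, or any set low bit of the IOVA at a physical discontinuity, caps the usable page size at that bit. A standalone, userspace-style illustration of that rule (not the kernel helper itself):

#include <stdint.h>
#include <stdio.h>

/* Largest supported page size compatible with 'mask'; illustrative only. */
static uint64_t best_page_size(uint64_t mask, uint64_t pgsz_bitmap)
{
	if (mask) {
		uint64_t low = mask & -mask;	/* 2^(trailing zeros of mask) */
		pgsz_bitmap &= low | (low - 1);	/* keep sizes up to that bit */
	}
	return pgsz_bitmap ? 1ULL << (63 - __builtin_clzll(pgsz_bitmap)) : 0;
}

int main(void)
{
	/* VA 0x200000 backed at DMA 0x84200000: the two agree in the low 26
	 * bits, so 2 MiB survives from a {4 KiB, 2 MiB} candidate bitmap. */
	uint64_t mask = 0x84200000ULL ^ 0x200000ULL;

	printf("best page size: 0x%llx\n", (unsigned long long)
	       best_page_size(mask, (1ULL << 12) | (1ULL << 21)));
	return 0;
}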
- */ - if (i != (umem->sgt_append.sgt.nents - 1)) - mask |= va; + pgoff = 0; } diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 39357dc2d229..0ec2e4120cc9 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -10,7 +10,7 @@ #include "uverbs.h" -MODULE_IMPORT_NS(DMA_BUF); +MODULE_IMPORT_NS("DMA_BUF"); int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) { @@ -23,6 +23,9 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + if (umem_dmabuf->revoked) + return -EINVAL; + if (umem_dmabuf->sgt) goto wait_fence; @@ -110,10 +113,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) } EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); -struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, - unsigned long offset, size_t size, - int fd, int access, - const struct dma_buf_attach_ops *ops) +static struct ib_umem_dmabuf * +ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) { struct dma_buf *dmabuf; struct ib_umem_dmabuf *umem_dmabuf; @@ -152,7 +157,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, umem_dmabuf->attach = dma_buf_dynamic_attach( dmabuf, - device->dma_device, + dma_device, ops, umem_dmabuf); if (IS_ERR(umem_dmabuf->attach)) { @@ -168,6 +173,15 @@ out_release_dmabuf: dma_buf_put(dmabuf); return ret; } + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device, + offset, size, fd, access, ops); +} EXPORT_SYMBOL(ib_umem_dmabuf_get); static void @@ -184,16 +198,18 @@ static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .move_notify = ib_umem_dmabuf_unsupported_move_notify, }; -struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, - unsigned long offset, - size_t size, int fd, - int access) +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) { struct ib_umem_dmabuf *umem_dmabuf; int err; - umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, - &ib_umem_dmabuf_attach_pinned_ops); + umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset, + size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); if (IS_ERR(umem_dmabuf)) return umem_dmabuf; @@ -217,17 +233,41 @@ err_release: ib_umem_release(&umem_dmabuf->umem); return ERR_PTR(err); } +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device, + offset, size, fd, access); +} EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); -void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) { struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; dma_resv_lock(dmabuf->resv, NULL); + if (umem_dmabuf->revoked) + goto end; ib_umem_dmabuf_unmap_pages(umem_dmabuf); - if (umem_dmabuf->pinned) + if (umem_dmabuf->pinned) { 
dma_buf_unpin(umem_dmabuf->attach); + umem_dmabuf->pinned = 0; + } + umem_dmabuf->revoked = 1; +end: dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + ib_umem_dmabuf_revoke(umem_dmabuf); dma_buf_detach(dmabuf, umem_dmabuf->attach); dma_buf_put(dmabuf); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e9fa22d31c23..c752ae9fad6c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -41,65 +41,72 @@ #include <linux/hugetlb.h> #include <linux/interval_tree.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/pagemap.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" -static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, - const struct mmu_interval_notifier_ops *ops) +static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp) { - int ret; + umem_odp->is_implicit_odp = 1; + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); +} + +static int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) +{ + struct ib_device *dev = umem_odp->umem.ibdev; + size_t page_size = 1UL << umem_odp->page_shift; + struct hmm_dma_map *map; + unsigned long start; + unsigned long end; + size_t nr_entries; + int ret = 0; umem_odp->umem.is_odp = 1; mutex_init(&umem_odp->umem_mutex); - if (!umem_odp->is_implicit_odp) { - size_t page_size = 1UL << umem_odp->page_shift; - unsigned long start; - unsigned long end; - size_t ndmas, npfns; - - start = ALIGN_DOWN(umem_odp->umem.address, page_size); - if (check_add_overflow(umem_odp->umem.address, - (unsigned long)umem_odp->umem.length, - &end)) - return -EOVERFLOW; - end = ALIGN(end, page_size); - if (unlikely(end < page_size)) - return -EOVERFLOW; - - ndmas = (end - start) >> umem_odp->page_shift; - if (!ndmas) - return -EINVAL; - - npfns = (end - start) >> PAGE_SHIFT; - umem_odp->pfn_list = kvcalloc( - npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); - if (!umem_odp->pfn_list) - return -ENOMEM; - - umem_odp->dma_list = kvcalloc( - ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); - if (!umem_odp->dma_list) { + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + + nr_entries = (end - start) >> PAGE_SHIFT; + if (!(nr_entries * PAGE_SIZE / page_size)) + return -EINVAL; + + map = &umem_odp->map; + if (ib_uses_virt_dma(dev)) { + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) ret = -ENOMEM; - goto out_pfn_list; - } + } else + ret = hmm_dma_map_alloc(dev->dma_device, map, + (end - start) >> PAGE_SHIFT, + 1 << umem_odp->page_shift); + if (ret) + return ret; - ret = mmu_interval_notifier_insert(&umem_odp->notifier, - umem_odp->umem.owning_mm, - start, end - start, ops); - if (ret) - goto out_dma_list; - } + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_map; return 0; -out_dma_list: - kvfree(umem_odp->dma_list); -out_pfn_list: - kvfree(umem_odp->pfn_list); +out_free_map: + if (ib_uses_virt_dma(dev)) + kfree(map->pfn_list); + else + hmm_dma_map_free(dev->dma_device, map); return ret; } @@ -118,7 +125,6 @@ 
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, { struct ib_umem *umem; struct ib_umem_odp *umem_odp; - int ret; if (access & IB_ACCESS_HUGETLB) return ERR_PTR(-EINVAL); @@ -130,16 +136,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, umem->ibdev = device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; - umem_odp->is_implicit_odp = 1; umem_odp->page_shift = PAGE_SHIFT; umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - ret = ib_init_umem_odp(umem_odp, NULL); - if (ret) { - put_pid(umem_odp->tgid); - kfree(umem_odp); - return ERR_PTR(ret); - } + ib_init_umem_implicit_odp(umem_odp); return umem_odp; } EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); @@ -260,74 +260,41 @@ err_put_pid: } EXPORT_SYMBOL(ib_umem_odp_get); -void ib_umem_odp_release(struct ib_umem_odp *umem_odp) +static void ib_umem_odp_free(struct ib_umem_odp *umem_odp) { + struct ib_device *dev = umem_odp->umem.ibdev; + /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - if (!umem_odp->is_implicit_odp) { - mutex_lock(&umem_odp->umem_mutex); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - mutex_unlock(&umem_odp->umem_mutex); - mmu_interval_notifier_remove(&umem_odp->notifier); - kvfree(umem_odp->dma_list); - kvfree(umem_odp->pfn_list); - } - put_pid(umem_odp->tgid); - kfree(umem_odp); + mutex_lock(&umem_odp->umem_mutex); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); + mmu_interval_notifier_remove(&umem_odp->notifier); + if (ib_uses_virt_dma(dev)) + kfree(umem_odp->map.pfn_list); + else + hmm_dma_map_free(dev->dma_device, &umem_odp->map); } -EXPORT_SYMBOL(ib_umem_odp_release); -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @dma_index: index in the umem to add the dma to. - * @page: the page struct to map and add. - * @access_mask: access permissions needed for this page. - * - * The function returns -EFAULT if the DMA mapping operation fails. - * - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - unsigned int dma_index, - struct page *page, - u64 access_mask) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - - if (*dma_addr) { - /* - * If the page is already dma mapped it means it went through - * a non-invalidating trasition, like read-only to writable. - * Resync the flags. - */ - *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; - return 0; - } + if (!umem_odp->is_implicit_odp) + ib_umem_odp_free(umem_odp); - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, *dma_addr)) { - *dma_addr = 0; - return -EFAULT; - } - umem_odp->npages++; - *dma_addr |= access_mask; - return 0; + put_pid(umem_odp->tgid); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /** * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * * Maps the range passed in the argument to DMA addresses. - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. 
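The kernel-doc around ib_umem_odp_map_dma_and_lock() describes a specific contract: on success it returns the number of pages it handled with umem_mutex still held, so the caller can program its device page table against a stable pfn list and must unlock afterwards. A minimal, hypothetical caller modelled on that contract:

#include <rdma/ib_umem_odp.h>

/* Hypothetical driver fault handler; only the call/unlock pattern matters,
 * the actual device page-table update is elided. */
static int mydrv_fault_range(struct ib_umem_odp *odp, u64 user_va, u64 bcnt,
			     u64 access_mask, bool fault)
{
	int np;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/* ... program the np mapped pages into the device page table ... */

	mutex_unlock(&odp->umem_mutex);
	return np;
}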
* Upon success the ODP MR will be locked to let caller complete its device * page table update. * @@ -355,9 +322,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, struct hmm_range range = {}; unsigned long timeout; - if (access_mask == 0) - return -EINVAL; - if (user_virt < ib_umem_start(umem_odp) || user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; @@ -383,11 +347,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, if (fault) { range.default_flags = HMM_PFN_REQ_FAULT; - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & HMM_PFN_WRITE) range.default_flags |= HMM_PFN_REQ_WRITE; } - range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]); timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: @@ -415,22 +379,17 @@ retry: for (pfn_index = 0; pfn_index < num_pfns; pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - if (fault) { - /* - * Since we asked for hmm_range_fault() to populate - * pages it shouldn't return an error entry on success. - */ - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); - } else { - if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { - WARN_ON(umem_odp->dma_list[dma_index]); - continue; - } - access_mask = ODP_READ_ALLOWED_BIT; - if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) - access_mask |= ODP_WRITE_ALLOWED_BIT; - } + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. + */ + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; + + if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED) + continue; hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); /* If a hugepage was detected and ODP wasn't set for, the umem @@ -443,15 +402,6 @@ retry: __func__, hmm_order, page_shift); break; } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), - access_mask); - if (ret < 0) { - ibdev_dbg(umem_odp->umem.ibdev, - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -471,45 +421,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma_addr; - dma_addr_t dma; - int idx; - u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; + u64 addr; lockdep_assert_held(&umem_odp->umem_mutex); virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { - idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - dma = umem_odp->dma_list[idx]; - - /* The access flags guaranteed a valid DMA address in case was NULL */ - if (dma) { - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; - struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); - - dma_addr = dma & ODP_DMA_ADDR_MASK; - ib_dma_unmap_page(dev, dma_addr, - BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (dma & ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. 
However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - umem_odp->dma_list[idx] = 0; - umem_odp->npages--; + u64 offset = addr - ib_umem_start(umem_odp); + size_t idx = offset >> umem_odp->page_shift; + unsigned long pfn = umem_odp->map.pfn_list[idx]; + + if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) + goto clear; + + if (pfn & HMM_PFN_WRITE) { + struct page *page = hmm_pfn_to_page(pfn); + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); } + umem_odp->npages--; +clear: + umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS; } } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f5feca7fa9b9..fd67fc9fe85a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -63,6 +63,8 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); +#define MAX_UMAD_RECV_LIST_SIZE 200000 + enum { IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, @@ -113,6 +115,7 @@ struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct list_head recv_list; + atomic_t recv_list_size; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; @@ -180,24 +183,28 @@ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) return file->agents_dead ? 
NULL : file->agent[id]; } -static int queue_packet(struct ib_umad_file *file, - struct ib_mad_agent *agent, - struct ib_umad_packet *packet) +static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent, + struct ib_umad_packet *packet, bool is_recv_mad) { int ret = 1; mutex_lock(&file->mutex); + if (is_recv_mad && + atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE) + goto unlock; + for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); wake_up_interruptible(&file->recv_wait); ret = 0; break; } - +unlock: mutex_unlock(&file->mutex); return ret; @@ -224,7 +231,7 @@ static void send_handler(struct ib_mad_agent *agent, if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { packet->length = IB_MGMT_MAD_HDR; packet->mad.hdr.status = ETIMEDOUT; - if (!queue_packet(file, agent, packet)) + if (!queue_packet(file, agent, packet, false)) return; } kfree(packet); @@ -284,7 +291,7 @@ static void recv_handler(struct ib_mad_agent *agent, rdma_destroy_ah_attr(&ah_attr); } - if (queue_packet(file, agent, packet)) + if (queue_packet(file, agent, packet, true)) goto err2; return; @@ -409,6 +416,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); + atomic_dec(&file->recv_list_size); mutex_unlock(&file->mutex); @@ -421,6 +429,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, /* Requeue packet */ mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); mutex_unlock(&file->mutex); } else { if (packet->recv_wc) @@ -1073,7 +1082,6 @@ static const struct file_operations umad_fops = { #endif .open = ib_umad_open, .release = ib_umad_close, - .llseek = no_llseek, }; static int ib_umad_sm_open(struct inode *inode, struct file *filp) @@ -1141,7 +1149,6 @@ static const struct file_operations umad_sm_fops = { .owner = THIS_MODULE, .open = ib_umad_sm_open, .release = ib_umad_sm_close, - .llseek = no_llseek, }; static struct ib_umad_port *get_port(struct ib_device *ibdev, @@ -1312,15 +1319,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (ret) goto err_cdev; - ib_umad_init_port_dev(&port->sm_dev, port, device); - port->sm_dev.devt = base_issm; - dev_set_name(&port->sm_dev, "issm%d", port->dev_num); - cdev_init(&port->sm_cdev, &umad_sm_fops); - port->sm_cdev.owner = THIS_MODULE; + if (rdma_cap_ib_smi(device, port_num)) { + ib_umad_init_port_dev(&port->sm_dev, port, device); + port->sm_dev.devt = base_issm; + dev_set_name(&port->sm_dev, "issm%d", port->dev_num); + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; - ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); - if (ret) - goto err_dev; + ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); + if (ret) + goto err_dev; + } return 0; @@ -1336,9 +1345,13 @@ err_cdev: static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; + bool has_smi = false; int id; - cdev_device_del(&port->sm_cdev, &port->sm_dev); + if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) { + cdev_device_del(&port->sm_cdev, &port->sm_dev); + has_smi = true; + } cdev_device_del(&port->cdev, &port->dev); mutex_lock(&port->file_mutex); @@ -1364,7 +1377,8 @@ static void ib_umad_kill_port(struct ib_umad_port *port) ida_free(&umad_ida, 
port->dev_num); /* balances device_initialize() */ - put_device(&port->sm_dev); + if (has_smi) + put_device(&port->sm_dev); put_device(&port->dev); } diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 821d93c8f712..797e2fcc8072 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -133,35 +133,6 @@ struct ib_uverbs_completion_event_file { struct ib_uverbs_event_queue ev_queue; }; -struct ib_uverbs_file { - struct kref ref; - struct ib_uverbs_device *device; - struct mutex ucontext_lock; - /* - * ucontext must be accessed via ib_uverbs_get_ucontext() or with - * ucontext_lock held - */ - struct ib_ucontext *ucontext; - struct ib_uverbs_async_event_file *default_async_file; - struct list_head list; - - /* - * To access the uobjects list hw_destroy_rwsem must be held for write - * OR hw_destroy_rwsem held for read AND uobjects_lock held. - * hw_destroy_rwsem should be called across any destruction of the HW - * object of an associated uobject. - */ - struct rw_semaphore hw_destroy_rwsem; - spinlock_t uobjects_lock; - struct list_head uobjects; - - struct mutex umap_lock; - struct list_head umaps; - struct page *disassociate_page; - - struct xarray idr; -}; - struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3d3ee3eca983..bc9fe3ceca4d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -42,6 +42,7 @@ #include <rdma/uverbs_types.h> #include <rdma/uverbs_std_types.h> +#include <rdma/ib_ucaps.h> #include "rdma_core.h" #include "uverbs.h" @@ -161,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, { const void __user *res = iter->cur; - if (iter->cur + len > iter->end) + if (len > iter->end - iter->cur) return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; @@ -192,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) fd, attrs); if (IS_ERR(uobj)) - return (void *)uobj; + return ERR_CAST(uobj); uverbs_uobject_get(uobj); uobj_put_read(uobj); @@ -232,6 +233,8 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) { struct ib_ucontext *ucontext = attrs->context; struct ib_uverbs_file *file = attrs->ufile; + int *fd_array; + int fd_count; int ret; if (!down_read_trylock(&file->hw_destroy_rwsem)) @@ -247,6 +250,22 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) if (ret) goto err; + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) { + fd_count = uverbs_attr_ptr_get_array_size(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR, + sizeof(int)); + if (fd_count < 0) { + ret = fd_count; + goto err_uncharge; + } + + fd_array = uverbs_attr_get_alloced_ptr(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR); + ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps); + if (ret) + goto err_uncharge; + } + ret = ucontext->device->ops.alloc_ucontext(ucontext, &attrs->driver_udata); if (ret) @@ -572,7 +591,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) struct inode *inode = NULL; int new_xrcd = 0; struct ib_device *ib_dev; - struct fd f = {}; + struct fd f = EMPTY_FD; int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -584,12 +603,12 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) if (cmd.fd != -1) { /* search for file descriptor */ f = fdget(cmd.fd); - if (!f.file) { + if (fd_empty(f)) { ret = -EBADF; goto err_tree_mutex_unlock; } 
- inode = file_inode(f.file); + inode = file_inode(fd_file(f)); xrcd = find_xrcd(ibudev, inode); if (!xrcd && !(cmd.oflags & O_CREAT)) { /* no file descriptor. Need CREATE flag */ @@ -632,8 +651,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) atomic_inc(&xrcd->usecnt); } - if (f.file) - fdput(f); + fdput(f); mutex_unlock(&ibudev->xrcd_tree_mutex); uobj_finalize_uobj_create(&obj->uobject, attrs); @@ -648,8 +666,7 @@ err: uobj_alloc_abort(&obj->uobject, attrs); err_tree_mutex_unlock: - if (f.file) - fdput(f); + fdput(f); mutex_unlock(&ibudev->xrcd_tree_mutex); @@ -718,8 +735,8 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) goto err_free; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -809,8 +826,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) if (cmd.flags & IB_MR_REREG_PD) { new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!new_pd) { - ret = -EINVAL; + if (IS_ERR(new_pd)) { + ret = PTR_ERR(new_pd); goto put_uobjs; } } else { @@ -919,8 +936,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) return PTR_ERR(uobj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -1051,7 +1068,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs, rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; rdma_restrack_add(&cq->res); @@ -1127,8 +1144,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata); if (ret) @@ -1189,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = attrs->ucore.outbuf; @@ -1238,8 +1255,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); @@ -1321,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, cmd->rwq_ind_tbl_handle, attrs); - if (!ind_tbl) { - ret = -EINVAL; + if (IS_ERR(ind_tbl)) { + ret = PTR_ERR(ind_tbl); goto err_put; } @@ -1360,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, attrs); - if (!srq || srq->srq_type == IB_SRQT_XRC) { - ret = -EINVAL; + if (IS_ERR(srq) || + srq->srq_type == IB_SRQT_XRC) { + ret = IS_ERR(srq) ? 
PTR_ERR(srq) : + -EINVAL; goto err_put; } } @@ -1371,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs, rcq = uobj_get_obj_read( cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, attrs); - if (!rcq) { - ret = -EINVAL; + if (IS_ERR(rcq)) { + ret = PTR_ERR(rcq); goto err_put; } } } } - if (has_sq) + if (has_sq) { scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, attrs); + if (IS_ERR(scq)) { + ret = PTR_ERR(scq); + goto err_put; + } + } + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) rcq = rcq ?: scq; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd || (!scq && has_sq)) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put; } @@ -1482,18 +1507,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs, err_put: if (!IS_ERR(xrcd_uobj)) uobj_put_read(xrcd_uobj); - if (pd) + if (!IS_ERR_OR_NULL(pd)) uobj_put_obj_read(pd); - if (scq) + if (!IS_ERR_OR_NULL(scq)) rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (rcq && rcq != scq) + if (!IS_ERR_OR_NULL(rcq) && rcq != scq) rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (srq) + if (!IS_ERR_OR_NULL(srq)) rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (ind_tbl) + if (!IS_ERR_OR_NULL(ind_tbl)) uobj_put_obj_read(ind_tbl); uobj_alloc_abort(&obj->uevent.uobject, attrs); @@ -1655,8 +1680,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -1761,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2010,11 +2035,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); if (ret) return ret; - wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size, + cmd.wr_count)); if (IS_ERR(wqes)) return PTR_ERR(wqes); - sgls = uverbs_request_next_ptr( - &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(&iter, + size_mul(cmd.sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return PTR_ERR(sgls); ret = uverbs_request_finish(&iter); @@ -2026,8 +2053,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) return -ENOMEM; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2064,9 +2091,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, attrs); - if (!ud->ah) { + if (IS_ERR(ud->ah)) { + ret = PTR_ERR(ud->ah); kfree(ud); - ret = -EINVAL; goto out_put; } ud->remote_qpn = user_wr->wr.ud.remote_qpn; @@ -2200,11 +2227,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); - wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count)); if (IS_ERR(wqes)) return ERR_CAST(wqes); - sgls = uverbs_request_next_ptr( - iter, sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(iter, 
size_mul(sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return ERR_CAST(sgls); ret = uverbs_request_finish(iter); @@ -2303,8 +2330,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2354,8 +2381,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) { - ret = -EINVAL; + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); goto out; } @@ -2411,8 +2438,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err; } @@ -2481,8 +2508,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; @@ -2531,8 +2558,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; mutex_lock(&obj->mcast_lock); @@ -2666,8 +2693,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_FLOW_ACTION, kern_spec->action.handle, attrs); - if (!ib_spec->action.act) - return -EINVAL; + if (IS_ERR(ib_spec->action.act)) + return PTR_ERR(ib_spec->action.act); ib_spec->action.size = sizeof(struct ib_flow_spec_action_handle); flow_resources_add(uflow_res, @@ -2684,8 +2711,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_COUNTERS, kern_spec->flow_count.handle, attrs); - if (!ib_spec->flow_count.counters) - return -EINVAL; + if (IS_ERR(ib_spec->flow_count.counters)) + return PTR_ERR(ib_spec->flow_count.counters); ib_spec->flow_count.size = sizeof(struct ib_flow_spec_action_count); flow_resources_add(uflow_res, @@ -2903,14 +2930,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) return PTR_ERR(obj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - err = -EINVAL; + if (IS_ERR(pd)) { + err = PTR_ERR(pd); goto err_uobj; } cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) { - err = -EINVAL; + if (IS_ERR(cq)) { + err = PTR_ERR(cq); goto err_put_pd; } @@ -3011,8 +3038,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) return -EINVAL; wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); - if (!wq) - return -EINVAL; + if (IS_ERR(wq)) + return PTR_ERR(wq); if (cmd.attr_mask & IB_WQ_FLAGS) { wq_attr.flags = cmd.flags; @@ -3095,8 +3122,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) num_read_wqs++) { wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], attrs); - if (!wq) { - err = -EINVAL; + if (IS_ERR(wq)) { + err = PTR_ERR(wq); goto put_wqs; } @@ -3251,8 +3278,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - err = -EINVAL; + if (IS_ERR(qp)) { + err = PTR_ERR(qp); goto err_uobj; } @@ -3398,15 +3425,15 @@ static int __uverbs_create_xsrq(struct 
uverbs_attr_bundle *attrs, if (ib_srq_has_cq(cmd->srq_type)) { attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, attrs); - if (!attr.ext.cq) { - ret = -EINVAL; + if (IS_ERR(attr.ext.cq)) { + ret = PTR_ERR(attr.ext.cq); goto err_put_xrcd; } } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put_cq; } @@ -3513,8 +3540,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; @@ -3541,8 +3568,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); ret = ib_query_srq(srq, &attr); @@ -3667,8 +3694,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) return -EOPNOTSUPP; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 495d5a5d0373..973fe2c7ef53 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -52,6 +52,7 @@ #include <rdma/ib.h> #include <rdma/uverbs_std_types.h> #include <rdma/rdma_netlink.h> +#include <rdma/ib_ucaps.h> #include "uverbs.h" #include "core_priv.h" @@ -76,6 +77,7 @@ static dev_t dynamic_uverbs_dev; static DEFINE_IDA(uverbs_ida); static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +static struct ib_client uverbs_client; static char *uverbs_devnode(const struct device *dev, umode_t *mode) { @@ -217,6 +219,7 @@ void ib_uverbs_release_file(struct kref *ref) if (file->disassociate_page) __free_pages(file->disassociate_page, 0); + mutex_destroy(&file->disassociation_lock); mutex_destroy(&file->umap_lock); mutex_destroy(&file->ucontext_lock); kfree(file); @@ -353,7 +356,6 @@ const struct file_operations uverbs_event_fops = { .poll = ib_uverbs_comp_event_poll, .release = uverbs_uobject_fd_release, .fasync = ib_uverbs_comp_event_fasync, - .llseek = no_llseek, }; const struct file_operations uverbs_async_event_fops = { @@ -362,7 +364,6 @@ const struct file_operations uverbs_async_event_fops = { .poll = ib_uverbs_async_event_poll, .release = uverbs_async_event_release, .fasync = ib_uverbs_async_event_fasync, - .llseek = no_llseek, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) @@ -700,8 +701,13 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) ret = PTR_ERR(ucontext); goto out; } + + mutex_lock(&file->disassociation_lock); + vma->vm_ops = &rdma_umap_ops; ret = ucontext->device->ops.mmap(ucontext, vma); + + mutex_unlock(&file->disassociation_lock); out: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; @@ -723,6 +729,8 @@ static void rdma_umap_open(struct vm_area_struct *vma) /* We are racing with disassociation */ if (!down_read_trylock(&ufile->hw_destroy_rwsem)) goto out_zap; + mutex_lock(&ufile->disassociation_lock); + /* * Disassociation already completed, the VMA should already be zapped. 
*/ @@ -734,10 +742,12 @@ static void rdma_umap_open(struct vm_area_struct *vma) goto out_unlock; rdma_umap_priv_init(priv, vma, opriv->entry); + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); return; out_unlock: + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); out_zap: /* @@ -821,7 +831,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; - lockdep_assert_held(&ufile->hw_destroy_rwsem); + mutex_lock(&ufile->disassociation_lock); while (1) { struct mm_struct *mm = NULL; @@ -847,8 +857,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) break; } mutex_unlock(&ufile->umap_lock); - if (!mm) + if (!mm) { + mutex_unlock(&ufile->disassociation_lock); return; + } /* * The umap_lock is nested under mmap_lock since it used within @@ -878,7 +890,31 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) mmap_read_unlock(mm); mmput(mm); } + + mutex_unlock(&ufile->disassociation_lock); +} + +/** + * rdma_user_mmap_disassociate() - Revoke mmaps for a device + * @device: device to revoke + * + * This function should be called by drivers that need to disable mmaps for the + * device, for instance because it is going to be reset. + */ +void rdma_user_mmap_disassociate(struct ib_device *device) +{ + struct ib_uverbs_device *uverbs_dev = + ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_file *ufile; + + mutex_lock(&uverbs_dev->lists_mutex); + list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) { + if (ufile->ucontext) + uverbs_user_mmap_disassociate(ufile); + } + mutex_unlock(&uverbs_dev->lists_mutex); } +EXPORT_SYMBOL(rdma_user_mmap_disassociate); /* * ib_uverbs_open() does not need the BKL: @@ -949,6 +985,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->umap_lock); INIT_LIST_HEAD(&file->umaps); + mutex_init(&file->disassociation_lock); + filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); @@ -991,7 +1029,6 @@ static const struct file_operations uverbs_fops = { .write = ib_uverbs_write, .open = ib_uverbs_open, .release = ib_uverbs_close, - .llseek = no_llseek, .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = compat_ptr_ioctl, }; @@ -1002,7 +1039,6 @@ static const struct file_operations uverbs_mmap_fops = { .mmap = ib_uverbs_mmap, .open = ib_uverbs_open, .release = ib_uverbs_close, - .llseek = no_llseek, .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = compat_ptr_ioctl, }; @@ -1114,7 +1150,8 @@ static int ib_uverbs_add_one(struct ib_device *device) struct ib_uverbs_device *uverbs_dev; int ret; - if (!device->ops.alloc_ucontext) + if (!device->ops.alloc_ucontext || + device->type == RDMA_DEVICE_TYPE_SMI) return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); @@ -1309,6 +1346,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + ib_cleanup_ucaps(); mmu_notifier_synchronize(); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 11a080646916..e803f609ec87 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, __ib_copy_path_rec_to_user(dst, src); } EXPORT_SYMBOL(ib_copy_path_rec_to_user); - -void ib_copy_path_rec_from_user(struct 
sa_path_rec *dst, - struct ib_user_path_rec *src) -{ - u32 slid, dlid; - - memset(dst, 0, sizeof(*dst)); - if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || - (ib_is_opa_gid((union ib_gid *)src->dgid))) { - dst->rec_type = SA_PATH_REC_TYPE_OPA; - slid = opa_get_lid_from_gid((union ib_gid *)src->sgid); - dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid); - } else { - dst->rec_type = SA_PATH_REC_TYPE_IB; - slid = ntohs(src->slid); - dlid = ntohs(src->dlid); - } - memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); - memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); - - sa_path_set_dlid(dst, dlid); - sa_path_set_slid(dst, slid); - sa_path_set_raw_traffic(dst, src->raw_traffic); - dst->flow_label = src->flow_label; - dst->hop_limit = src->hop_limit; - dst->traffic_class = src->traffic_class; - dst->reversible = src->reversible; - dst->numb_path = src->numb_path; - dst->pkey = src->pkey; - dst->sl = src->sl; - dst->mtu_selector = src->mtu_selector; - dst->mtu = src->mtu; - dst->rate_selector = src->rate_selector; - dst->rate = src->rate; - dst->packet_life_time = src->packet_life_time; - dst->preference = src->preference; - dst->packet_life_time_selector = src->packet_life_time_selector; - - /* TODO: No need to set this */ - sa_path_set_dmac_zero(dst); -} -EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 370ad7c83f88..432054f0a8a4 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -128,7 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index fb0555647336..c0fd283d9d6c 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -437,6 +437,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR, + UVERBS_ATTR_MIN_SIZE(sizeof(int)), + UA_OPTIONAL, + UA_ALLOC_AND_COPY), UVERBS_ATTR_UHW()); DECLARE_UVERBS_NAMED_METHOD( diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 03e1db5d1e8c..7ebc7bd3caae 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -239,7 +239,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, access_flags, - &attrs->driver_udata); + attrs); if (IS_ERR(mr)) return PTR_ERR(mr); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 94a7f3b0c71c..75fde0fe9989 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, GFP_KERNEL : GFP_ATOMIC); if (IS_ERR(slave)) { rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); - return (void *)slave; + return ERR_CAST(slave); } ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); rdma_lag_put_ah_roce_slave(slave); @@ -1101,6 +1101,16 @@ 
EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ +static void __ib_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = event->element.qp; + + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp->srq_completion); + if (qp->registered_event_handler) + qp->registered_event_handler(event, qp->qp_context); +} + static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; @@ -1221,13 +1231,15 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, qp->qp_type = attr->qp_type; qp->rwq_ind_tbl = attr->rwq_ind_tbl; qp->srq = attr->srq; - qp->event_handler = attr->event_handler; + qp->event_handler = __ib_qp_event_handler; + qp->registered_event_handler = attr->event_handler; qp->port = attr->port_num; qp->qp_context = attr->qp_context; spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + init_completion(&qp->srq_completion); qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; @@ -2093,7 +2105,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); - rdma_counter_unbind_qp(qp, true); + rdma_counter_unbind_qp(qp, qp->port, true); ret = qp->device->ops.destroy_qp(qp, udata); if (ret) { if (sec) @@ -2884,6 +2896,72 @@ static void __ib_drain_rq(struct ib_qp *qp) wait_for_completion(&rdrain.done); } +/* + * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout + * expires. + * @qp: queue pair associated with SRQ to drain + * + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the first option. 
+ */ +static void __ib_drain_srq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_cq *cq; + int n, polled = 0; + int ret; + + if (!qp->srq) { + WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp); + return; + } + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret); + return; + } + + if (ib_srq_has_cq(qp->srq->srq_type)) { + cq = qp->srq->ext.cq; + } else if (qp->recv_cq) { + cq = qp->recv_cq; + } else { + WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp); + return; + } + + if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { + while (polled != cq->cqe) { + n = ib_process_cq_direct(cq, cq->cqe - polled); + if (!n) + return; + polled += n; + } + } +} + /** * ib_drain_sq() - Block until all SQ CQEs have been consumed by the * application. @@ -2962,6 +3040,8 @@ void ib_drain_qp(struct ib_qp *qp) ib_drain_sq(qp); if (!qp->srq) ib_drain_rq(qp); + else + __ib_drain_srq(qp); } EXPORT_SYMBOL(ib_drain_qp); @@ -3029,22 +3109,23 @@ EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; - unsigned int sg_delta; + unsigned int delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; + delta = BIT_ULL(biter->__pg_bit) - block_offset; - if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { - biter->__sg_advance += sg_delta; - } else { + while (biter->__sg_nents && biter->__sg && + sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { + delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; } + biter->__sg_advance += delta; return true; } diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 1211f4317a9f..aba96ca9bce5 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ -obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile index ee9bb1be61ea..f63417d2ccc6 100644 --- a/drivers/infiniband/hw/bnxt_re/Makefile +++ b/drivers/infiniband/hw/bnxt_re/Makefile @@ -4,4 +4,5 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o bnxt_re-y := main.o ib_verbs.o \ qplib_res.o qplib_rcfw.o \ - qplib_sp.o qplib_fp.o hw_counters.o + qplib_sp.o qplib_fp.o hw_counters.o \ + debugfs.o diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 9dca451ed522..6df5a2738c95 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -53,12 +53,6 @@ #define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH -#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) -#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) -#define BNXT_RE_MAX_CQ_COUNT (64 * 
1024) -#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) /* Number of MRs to reserve for PF, leaving remainder for VFs */ #define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) @@ -91,6 +85,15 @@ struct bnxt_re_ring_attr { u8 mode; }; +/* + * Data structure and defines to handle + * recovery + */ +#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1 +#define BNXT_RE_COMPLETE_REMOVE 0x2 +#define BNXT_RE_POST_RECOVERY_INIT 0x4 +#define BNXT_RE_COMPLETE_INIT 0x8 + struct bnxt_re_sqp_entries { struct bnxt_qplib_sge sge; u64 wrid; @@ -107,8 +110,11 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; -#define BNXT_RE_MIN_MSIX 2 -#define BNXT_RE_MAX_MSIX 9 +struct bnxt_re_en_dev_info { + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; +}; + #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -131,12 +137,36 @@ struct bnxt_re_pacing { #define BNXT_RE_PACING_ALARM_TH_MULTIPLE 2 /* Multiple of pacing algo threshold */ /* Default do_pacing value when there is no congestion */ #define BNXT_RE_DBR_DO_PACING_NO_CONGESTION 0x7F /* 1 in 512 probability */ -#define BNXT_RE_DB_FIFO_ROOM_MASK 0x1FFF8000 -#define BNXT_RE_MAX_FIFO_DEPTH 0x2c00 -#define BNXT_RE_DB_FIFO_ROOM_SHIFT 15 + +#define BNXT_RE_MAX_FIFO_DEPTH_P5 0x2c00 +#define BNXT_RE_MAX_FIFO_DEPTH_P7 0x8000 + +#define BNXT_RE_MAX_FIFO_DEPTH(ctx) \ + (bnxt_qplib_is_chip_gen_p7((ctx)) ? \ + BNXT_RE_MAX_FIFO_DEPTH_P7 :\ + BNXT_RE_MAX_FIFO_DEPTH_P5) + #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 +#define BNXT_RE_MIN_MSIX 2 +#define BNXT_RE_MAX_MSIX BNXT_MAX_ROCE_MSIX +struct bnxt_re_nq_record { + struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; + struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + int num_msix; + /* serialize NQ access */ + struct mutex load_lock; +}; + #define MAX_CQ_HASH_BITS (16) +#define MAX_SRQ_HASH_BITS (16) + +static inline bool bnxt_re_chip_gen_p7(u16 chip_num) +{ + return (chip_num == CHIP_NUM_58818 || + chip_num == CHIP_NUM_57608); +} + struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; @@ -150,31 +180,28 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; - struct notifier_block nb; + struct auxiliary_device *adev; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int num_msix; int id; struct delayed_work worker; u8 cur_prio_map; - /* FP Notification Queue (CQ & SRQ) */ - struct tasklet_struct nq_task; - /* RCFW Channel */ struct bnxt_qplib_rcfw rcfw; - /* NQ */ - struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + /* NQ record */ + struct bnxt_re_nq_record *nqr; /* Device Resources */ - struct bnxt_qplib_dev_attr dev_attr; + struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_ctx qplib_ctx; struct bnxt_qplib_res qplib_res; struct bnxt_qplib_dpi dpi_privileged; + struct bnxt_qplib_cq_coal_param cq_coalescing; struct mutex qp_lock; /* protect qp list */ struct list_head qp_list; @@ -192,6 +219,14 @@ struct bnxt_re_dev { struct work_struct dbq_fifo_check_work; struct delayed_work dbq_pacing_work; DECLARE_HASHTABLE(cq_hash, MAX_CQ_HASH_BITS); + DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); + struct dentry *dbg_root; + struct dentry *qp_debugfs; + unsigned long event_bitmap; + struct bnxt_qplib_cc_param cc_param; + struct workqueue_struct *dcb_wq; + struct dentry *cc_config; + struct bnxt_re_dbg_cc_config_params *cc_config_params; }; #define to_bnxt_re_dev(ptr, member) \ @@ -204,6 +239,10 @@ struct 
bnxt_re_dev { #define BNXT_RE_CHECK_RC(x) ((x) && ((x) != -ETIMEDOUT)) void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev); +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad); +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, + struct ib_mad *out_mad); + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) @@ -212,4 +251,29 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) } extern const struct uapi_definition bnxt_re_uapi_defs[]; + +static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev) +{ + rdev->qplib_res.pacing_data->dev_err_state = + test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); +} + +static inline int bnxt_re_read_context_allowed(struct bnxt_re_dev *rdev) +{ + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) || + rdev->rcfw.res->cctx->hwrm_intf_ver < HWRM_VERSION_READ_CTX) + return -EOPNOTSUPP; + return 0; +} + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5 192 + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 192 + #endif diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c new file mode 100644 index 000000000000..e632f1661b92 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * Description: Debugfs component of the bnxt_re driver + */ + +#include <linux/debugfs.h> +#include <linux/pci.h> +#include <rdma/ib_addr.h> + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "ib_verbs.h" +#include "debugfs.h" + +static struct dentry *bnxt_re_debugfs_root; + +static const char * const bnxt_re_cc_gen0_name[] = { + "enable_cc", + "run_avg_weight_g", + "num_phase_per_state", + "init_cr", + "init_tr", + "tos_ecn", + "tos_dscp", + "alt_vlan_pcp", + "alt_vlan_dscp", + "rtt", + "cc_mode", + "tcp_cp", + "tx_queue", + "inactivity_cp", +}; + +static inline const char *bnxt_re_qp_state_str(u8 state) +{ + switch (state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + return "RST"; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + return "INIT"; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + return "RTR"; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + return "RTS"; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + return "SQER"; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + return "SQD"; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + return "ERR"; + default: + return "Invalid QP state"; + } +} + +static inline const char *bnxt_re_qp_type_str(u8 type) +{ + switch (type) { + case CMDQ_CREATE_QP1_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_RC: return "RC"; + case CMDQ_CREATE_QP_TYPE_UD: return "UD"; + case CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE: return "RAW_ETHERTYPE"; + default: return "Invalid transport type"; + } +} + +static ssize_t qp_info_read(struct file *filep, + char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_qp *qp = filep->private_data; + char *buf; + int len; + + if (*ppos) + return 0; + + buf = 
kasprintf(GFP_KERNEL, + "QPN\t\t: %d\n" + "transport\t: %s\n" + "state\t\t: %s\n" + "mtu\t\t: %d\n" + "timeout\t\t: %d\n" + "remote QPN\t: %d\n", + qp->qplib_qp.id, + bnxt_re_qp_type_str(qp->qplib_qp.type), + bnxt_re_qp_state_str(qp->qplib_qp.state), + qp->qplib_qp.mtu, + qp->qplib_qp.timeout, + qp->qplib_qp.dest_qpn); + if (!buf) + return -ENOMEM; + if (count < strlen(buf)) { + kfree(buf); + return -ENOSPC; + } + len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); + kfree(buf); + return len; +} + +static const struct file_operations debugfs_qp_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qp_info_read, +}; + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + char resn[32]; + + sprintf(resn, "0x%x", qp->qplib_qp.id); + qp->dentry = debugfs_create_file(resn, 0400, rdev->qp_debugfs, qp, &debugfs_qp_fops); +} + +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + debugfs_remove(qp->dentry); +} + +static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param *ccparam, u32 *val) +{ + u64 map_offset; + + map_offset = BIT(offset); + + switch (map_offset) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + *val = ccparam->enable; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + *val = ccparam->g; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + *val = ccparam->nph_per_state; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + *val = ccparam->init_cr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + *val = ccparam->init_tr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + *val = ccparam->tos_ecn; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + *val = ccparam->tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + *val = ccparam->alt_vlan_pcp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + *val = ccparam->alt_tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + *val = ccparam->rtt; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + *val = ccparam->cc_mode; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + *val = ccparam->tcp_cp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + *val = ccparam->inact_th; + break; + default: + return -EINVAL; + } + + return 0; +} + +static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + struct bnxt_qplib_cc_param ccparam = {}; + u32 offset = dbg_cc_param->offset; + char buf[16]; + u32 val; + int rc; + + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &ccparam); + if (rc) + return rc; + + rc = map_cc_config_offset_gen0_ext0(offset, &ccparam, &val); + if (rc) + return rc; + + rc = snprintf(buf, sizeof(buf), "%d\n", val); + if (rc < 0) + return rc; + + return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc); +} + +static int bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val) +{ + u32 modify_mask; + + modify_mask = BIT(offset); + + switch (modify_mask) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + ccparam->enable = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + ccparam->g = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + ccparam->nph_per_state = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + ccparam->init_cr = val; + break; + case 
CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + ccparam->init_tr = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + ccparam->tos_ecn = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + ccparam->tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + ccparam->alt_vlan_pcp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + ccparam->alt_tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + ccparam->rtt = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + ccparam->cc_mode = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + ccparam->tcp_cp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE: + return -EOPNOTSUPP; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + ccparam->inact_th = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE: + ccparam->time_pph = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_PKTS_PER_PHASE: + ccparam->pkts_pph = val; + break; + } + + ccparam->mask = modify_mask; + return 0; +} + +static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val) +{ + struct bnxt_qplib_cc_param ccparam = { }; + int rc; + + if (gen_ext != CC_CONFIG_GEN0_EXT0) + return -EOPNOTSUPP; + + rc = bnxt_re_fill_gen0_ext0(&ccparam, offset, val); + if (rc) + return rc; + + bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam); + return 0; +} + +static ssize_t bnxt_re_cc_config_set(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + u32 offset = dbg_cc_param->offset; + u8 cc_gen = dbg_cc_param->cc_gen; + char buf[16]; + u32 val; + int rc; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + + buf[count] = '\0'; + if (kstrtou32(buf, 0, &val)) + return -EINVAL; + + rc = bnxt_re_configure_cc(rdev, cc_gen, offset, val); + return rc ? 
rc : count; +} + +static const struct file_operations bnxt_re_cc_config_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = bnxt_re_cc_config_get, + .write = bnxt_re_cc_config_set, +}; + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->en_dev->pdev; + struct bnxt_re_dbg_cc_config_params *cc_params; + int i; + + rdev->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), bnxt_re_debugfs_root); + + rdev->qp_debugfs = debugfs_create_dir("QPs", rdev->dbg_root); + rdev->cc_config = debugfs_create_dir("cc_config", rdev->dbg_root); + + rdev->cc_config_params = kzalloc(sizeof(*cc_params), GFP_KERNEL); + + for (i = 0; i < BNXT_RE_CC_PARAM_GEN0; i++) { + struct bnxt_re_cc_param *tmp_params = &rdev->cc_config_params->gen0_parms[i]; + + tmp_params->rdev = rdev; + tmp_params->offset = i; + tmp_params->cc_gen = CC_CONFIG_GEN0_EXT0; + tmp_params->dentry = debugfs_create_file(bnxt_re_cc_gen0_name[i], 0400, + rdev->cc_config, tmp_params, + &bnxt_re_cc_config_ops); + } +} + +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->qp_debugfs); + debugfs_remove_recursive(rdev->cc_config); + kfree(rdev->cc_config_params); + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bnxt_re_register_debugfs(void) +{ + bnxt_re_debugfs_root = debugfs_create_dir("bnxt_re", NULL); +} + +void bnxt_re_unregister_debugfs(void) +{ + debugfs_remove(bnxt_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h new file mode 100644 index 000000000000..8f101df4e838 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. 
+ * + * Description: Debugfs header + */ + +#ifndef __BNXT_RE_DEBUGFS__ +#define __BNXT_RE_DEBUGFS__ + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev); +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev); + +void bnxt_re_register_debugfs(void); +void bnxt_re_unregister_debugfs(void); + +#define CC_CONFIG_GEN_EXT(x, y) (((x) << 16) | (y)) +#define CC_CONFIG_GEN0_EXT0 CC_CONFIG_GEN_EXT(0, 0) + +#define BNXT_RE_CC_PARAM_GEN0 14 + +struct bnxt_re_cc_param { + struct bnxt_re_dev *rdev; + struct dentry *dentry; + u32 offset; + u8 cc_gen; +}; + +struct bnxt_re_dbg_cc_config_params { + struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; +}; +#endif diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 128651c01595..44bb082e0a60 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -37,18 +37,11 @@ * */ -#include <linux/interrupt.h> #include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/pci.h> -#include <linux/prefetch.h> -#include <linux/delay.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_pma.h> -#include <rdma/ib_addr.h> - -#include "bnxt_ulp.h" #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" @@ -294,6 +287,96 @@ static void bnxt_re_copy_db_pacing_stats(struct bnxt_re_dev *rdev, readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); } +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct ib_pma_portcounters_ext *pma_cnt_ext; + struct bnxt_qplib_ext_stat *estat = &rdev->stats.rstat.ext_stat; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt_ext->port_xmit_data = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_bytes) / 4); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_bytes) / 4); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + + } else { + pma_cnt_ext->port_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_rcv_data = cpu_to_be64(estat->rx_roce_good_bytes / 4); + pma_cnt_ext->port_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + pma_cnt_ext->port_xmit_data = cpu_to_be64(estat->tx_roce_bytes / 4); + pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + } + return 0; +} + +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct bnxt_qplib_ext_stat *estat 
= &rdev->stats.rstat.ext_stat; + struct ib_pma_portcounters *pma_cnt; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt->port_rcv_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->rx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_rcv_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->rx_ucast_bytes) & + 0xFFFFFFFF) / 4)); + pma_cnt->port_xmit_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->tx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_xmit_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->tx_ucast_bytes) + & 0xFFFFFFFF) / 4)); + } else { + pma_cnt->port_rcv_packets = cpu_to_be32(estat->rx_roce_good_pkts); + pma_cnt->port_rcv_data = cpu_to_be32((estat->rx_roce_good_bytes / 4)); + pma_cnt->port_xmit_packets = cpu_to_be32(estat->tx_roce_pkts); + pma_cnt->port_xmit_data = cpu_to_be32((estat->tx_roce_bytes / 4)); + } + pma_cnt->port_rcv_constraint_errors = (u8)(le64_to_cpu(hw_stats->rx_discard_pkts) & 0xFF); + pma_cnt->port_rcv_errors = cpu_to_be16((u16)(le64_to_cpu(hw_stats->rx_error_pkts) + & 0xFFFF)); + pma_cnt->port_xmit_constraint_errors = (u8)(le64_to_cpu(hw_stats->tx_error_pkts) & 0xFF); + pma_cnt->port_xmit_discards = cpu_to_be16((u16)(le64_to_cpu(hw_stats->tx_discard_pkts) + & 0xFFFF)); + + return 0; +} + int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) @@ -357,8 +440,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } bnxt_re_copy_err_stats(rdev, stats, err_s); - if (_is_ext_stats_supported(rdev->dev_attr.dev_cap_flags) && - !rdev->is_virtfn) { + if (bnxt_ext_stats_supported(rdev->chip_ctx, rdev->dev_attr->dev_cap_flags, + rdev->is_virtfn)) { rc = bnxt_re_get_ext_stat(rdev, stats); if (rc) { clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, @@ -366,7 +449,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } } - if (rdev->pacing.dbr_pacing) + if (rdev->pacing.dbr_pacing && bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) bnxt_re_copy_db_pacing_stats(rdev, stats); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ce9c5bae83bf..063801384b2b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -49,11 +49,10 @@ #include <rdma/ib_addr.h> #include <rdma/ib_mad.h> #include <rdma/ib_cache.h> +#include <rdma/ib_pma.h> #include <rdma/uverbs_ioctl.h> #include <linux/hashtable.h> -#include "bnxt_ulp.h" - #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" @@ -62,6 +61,7 @@ #include "bnxt_re.h" #include "ib_verbs.h" +#include "debugfs.h" #include <rdma/uverbs_types.h> #include <rdma/uverbs_std_types.h> @@ -94,9 +94,9 @@ static int __from_ib_access_flags(int iflags) return qflags; }; -static enum ib_access_flags __to_ib_access_flags(int qflags) +static int __to_ib_access_flags(int qflags) { - enum ib_access_flags iflags = 0; + int iflags = 0; if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE) iflags |= IB_ACCESS_LOCAL_WRITE; @@ -113,7 +113,57 @@ static enum ib_access_flags __to_ib_access_flags(int qflags) if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND) iflags |= IB_ACCESS_ON_DEMAND; 
return iflags; -}; +} + +static u8 __qp_access_flags_from_ib(struct bnxt_qplib_chip_ctx *cctx, int iflags) +{ + u8 qflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return (u8)__from_ib_access_flags(iflags); + + /* For P5, P7 and later chips */ + if (iflags & IB_ACCESS_LOCAL_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; + if (iflags & IB_ACCESS_REMOTE_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; + if (iflags & IB_ACCESS_REMOTE_READ) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + if (iflags & IB_ACCESS_REMOTE_ATOMIC) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC; + + return qflags; +} + +static int __qp_access_flags_to_ib(struct bnxt_qplib_chip_ctx *cctx, u8 qflags) +{ + int iflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return __to_ib_access_flags(qflags); + + /* For P5, P7 and later chips */ + if (qflags & CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE) + iflags |= IB_ACCESS_LOCAL_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE) + iflags |= IB_ACCESS_REMOTE_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_READ) + iflags |= IB_ACCESS_REMOTE_READ; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC) + iflags |= IB_ACCESS_REMOTE_ATOMIC; + + return iflags; +} + +static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, + struct bnxt_qplib_mrw *qplib_mr) +{ + if (_is_relaxed_ordering_supported(rdev->dev_attr->dev_cap_flags2) && + pcie_relaxed_ordering_enabled(rdev->en_dev->pdev)) + qplib_mr->flags |= CMDQ_REGISTER_MR_FLAGS_ENABLE_RO; +} static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, struct bnxt_qplib_sge *sg_list, int num) @@ -135,7 +185,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, struct ib_udata *udata) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; memset(ib_attr, 0, sizeof(*ib_attr)); memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, @@ -148,7 +198,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; - ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->hw_ver = rdev->en_dev->pdev->revision; ib_attr->max_qp = dev_attr->max_qp; ib_attr->max_qp_wr = dev_attr->max_qp_wqes; ib_attr->device_cap_flags = @@ -203,12 +253,28 @@ int bnxt_re_query_device(struct ib_device *ibdev, return 0; } +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + ibdev_dbg(ibdev, "Modify device with mask 0x%x", device_modify_mask); + + if (device_modify_mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + memcpy(ibdev->node_desc, device_modify->node_desc, IB_DEVICE_NODE_DESC_MAX); + return 0; +} + /* Port */ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -266,8 +332,8 @@ void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", - rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], - rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); + 
rdev->dev_attr->fw_ver[0], rdev->dev_attr->fw_ver[1], + rdev->dev_attr->fw_ver[2], rdev->dev_attr->fw_ver[3]); } int bnxt_re_query_pkey(struct ib_device *ibdev, u32 port_num, @@ -517,15 +583,19 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); - goto fail; - } + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); + goto fail; + } - /* Register MR */ - mr->ib_mr.lkey = mr->qplib_mr.lkey; + /* Register MR */ + mr->ib_mr.lkey = mr->qplib_mr.lkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; + } mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, @@ -896,13 +966,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) unsigned int flags; int rc; + bnxt_re_debug_rem_qpinfo(rdev, qp); + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Failed to destroy HW QP"); - return rc; - } if (rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -912,11 +982,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); - if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) { - rc = bnxt_re_destroy_gsi_sqp(qp); - if (rc) - return rc; - } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) + bnxt_re_destroy_gsi_sqp(qp); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -989,48 +1056,42 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; align = sizeof(struct sq_send_hdr); ilsize = ALIGN(init_attr->cap.max_inline_data, align); - sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); - if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) - return -EINVAL; - /* For gen p4 and gen p5 backward compatibility mode - * wqe size is fixed to 128 bytes + /* For gen p4 and gen p5 fixed wqe compatibility mode + * wqe size is fixed to 128 bytes - ie 6 SGEs */ - if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) && - qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) { + sq->wqe_size = bnxt_re_get_swqe_size(BNXT_STATIC_MAX_SGE); + sq->max_sge = BNXT_STATIC_MAX_SGE; + } else { + sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); + if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) + return -EINVAL; + } if (init_attr->cap.max_inline_data) { qplqp->max_inline_data = sq->wqe_size - sizeof(struct sq_send_hdr); init_attr->cap.max_inline_data = qplqp->max_inline_data; - if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->max_sge = qplqp->max_inline_data / - sizeof(struct sq_sge); } return 0; } static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, - struct 
bnxt_re_qp *qp, struct ib_udata *udata) + struct bnxt_re_qp *qp, struct bnxt_re_ucontext *cntx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_qp *qplib_qp; - struct bnxt_re_ucontext *cntx; - struct bnxt_re_qp_req ureq; int bytes = 0, psn_sz; struct ib_umem *umem; int psn_nume; qplib_qp = &qp->qplib_qp; - cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, - ib_uctx); - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return -EFAULT; bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size); /* Consider mapping PSN search memory only for RC QPs. */ @@ -1038,15 +1099,20 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, psn_sz = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - qplib_qp->sq.max_wqe : - ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / - sizeof(struct bnxt_qplib_sge)); + if (cntx && bnxt_re_is_var_size_supported(rdev, cntx)) { + psn_nume = ureq->sq_slots; + } else { + psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? + qplib_qp->sq.max_wqe : ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / + sizeof(struct bnxt_qplib_sge)); + } + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + psn_nume = roundup_pow_of_two(psn_nume); bytes += (psn_nume * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qpsva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -1055,12 +1121,12 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->sq.sg_info.umem = umem; qplib_qp->sq.sg_info.pgsize = PAGE_SIZE; qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT; - qplib_qp->qp_handle = ureq.qp_handle; + qplib_qp->qp_handle = ureq->qp_handle; if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1156,6 +1222,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp /* Shadow QP SQ depth should be same as QP1 RQ depth */ qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6); qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.sq.max_sge = 2; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.sq.q_full_delta = 1; @@ -1167,6 +1234,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6); qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.rq.q_full_delta = 1; @@ -1208,7 +1276,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; rq = &qplqp->rq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (init_attr->srq) { struct bnxt_re_srq *srq; @@ -1228,6 +1296,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, */ entries = bnxt_re_init_depth(init_attr->cap.max_recv_wr + 1, uctx); rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + rq->max_sw_wqe = rq->max_wqe; rq->q_full_delta = 0; rq->sg_info.pgsize = PAGE_SIZE; rq->sg_info.pgshft = PAGE_SHIFT; @@ -1244,7 +1313,7 @@ static void 
bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { qplqp->rq.max_sge = dev_attr->max_qp_sges; @@ -1256,37 +1325,49 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, struct ib_qp_init_attr *init_attr, - struct bnxt_re_ucontext *uctx) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_qplib_q *sq; + int diff = 0; int entries; - int diff; int rc; rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; sq->max_sge = init_attr->cap.max_send_sge; - if (sq->max_sge > dev_attr->max_qp_sges) { - sq->max_sge = dev_attr->max_qp_sges; - init_attr->cap.max_send_sge = sq->max_sge; - } + entries = init_attr->cap.max_send_wr; + if (uctx && qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) { + sq->max_wqe = ureq->sq_slots; + sq->max_sw_wqe = ureq->sq_slots; + sq->wqe_size = sizeof(struct sq_sge); + } else { + if (sq->max_sge > dev_attr->max_qp_sges) { + sq->max_sge = dev_attr->max_qp_sges; + init_attr->cap.max_send_sge = sq->max_sge; + } - rc = bnxt_re_setup_swqe_size(qp, init_attr); - if (rc) - return rc; + rc = bnxt_re_setup_swqe_size(qp, init_attr); + if (rc) + return rc; - entries = init_attr->cap.max_send_wr; - /* Allocate 128 + 1 more than what's provided */ - diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? - 0 : BNXT_QPLIB_RESERVED_QP_WRS; - entries = bnxt_re_init_depth(entries + diff + 1, uctx); - sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + /* Allocate 128 + 1 more than what's provided */ + diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? + 0 : BNXT_QPLIB_RESERVED_QP_WRS; + entries = bnxt_re_init_depth(entries + diff + 1, uctx); + sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + else + sq->max_sw_wqe = sq->max_wqe; + + } sq->q_full_delta = diff + 1; /* * Reserving one slot for Phantom WQE. 
Application can @@ -1311,7 +1392,7 @@ static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { entries = bnxt_re_init_depth(init_attr->cap.max_send_wr + 1, uctx); @@ -1349,10 +1430,10 @@ out: static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; - struct bnxt_re_ucontext *uctx; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_re_cq *cq; @@ -1360,9 +1441,8 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; - uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); /* Setup misc params */ ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr); qplqp->pd = &pd->qplib_pd; @@ -1375,8 +1455,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, goto out; } qplqp->type = (u8)qptype; - qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode; - + qplqp->wqe_mode = bnxt_re_is_var_size_supported(rdev, uctx); if (init_attr->qp_type == IB_QPT_RC) { qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom; qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; @@ -1411,14 +1490,14 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, bnxt_re_adjust_gsi_rq_attr(qp); /* Setup SQ */ - rc = bnxt_re_init_sq_attr(qp, init_attr, uctx); + rc = bnxt_re_init_sq_attr(qp, init_attr, uctx, ureq); if (rc) goto out; if (init_attr->qp_type == IB_QPT_GSI) bnxt_re_adjust_gsi_sq_attr(qp, init_attr, uctx); - if (udata) /* This will update DPI and qp_handle */ - rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (uctx) /* This will update DPI and qp_handle */ + rc = bnxt_re_init_user_qp(rdev, pd, qp, uctx, ureq); out: return rc; } @@ -1519,14 +1598,27 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { - struct ib_pd *ib_pd = ib_qp->pd; - struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); - struct bnxt_re_dev *rdev = pd->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_dev_attr *dev_attr; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_qp_req ureq; + struct bnxt_re_dev *rdev; + struct bnxt_re_pd *pd; + struct bnxt_re_qp *qp; + struct ib_pd *ib_pd; u32 active_qps; int rc; + ib_pd = ib_qp->pd; + pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + rdev = pd->rdev; + dev_attr = rdev->dev_attr; + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); + if (udata) + if (ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)))) + return -EFAULT; + rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr); if (!rc) { rc = -EINVAL; @@ -1534,7 +1626,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } qp->rdev = rdev; - rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata); + rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, uctx, &ureq); if (rc) goto fail; @@ -1585,6 +1677,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct 
ib_qp_init_attr *qp_init_attr, if (active_qps > rdev->stats.res.ud_qp_watermark) rdev->stats.res.ud_qp_watermark = active_qps; } + bnxt_re_debug_add_qpinfo(rdev, qp); return 0; qp_destroy: @@ -1681,15 +1774,14 @@ int bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) ib_srq); struct bnxt_re_dev *rdev = srq->rdev; struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; - struct bnxt_qplib_nq *nq = NULL; - if (qplib_srq->cq) - nq = qplib_srq->cq->nq; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + free_page((unsigned long)srq->uctx_srq_page); + hash_del(&srq->hash_entry); + } bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); ib_umem_release(srq->umem); atomic_dec(&rdev->stats.res.srq_count); - if (nq) - nq->budget--; return 0; } @@ -1730,7 +1822,6 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, struct ib_udata *udata) { struct bnxt_qplib_dev_attr *dev_attr; - struct bnxt_qplib_nq *nq = NULL; struct bnxt_re_ucontext *uctx; struct bnxt_re_dev *rdev; struct bnxt_re_srq *srq; @@ -1742,7 +1833,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, ib_pd = ib_srq->pd; pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); rdev = pd->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { @@ -1773,8 +1864,9 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size(dev_attr->max_srq_sges); srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; - srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; - nq = &rdev->nq[0]; + srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id; + srq->qplib_srq.sg_info.pgsize = PAGE_SIZE; + srq->qplib_srq.sg_info.pgshft = PAGE_SHIFT; if (udata) { rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); @@ -1789,9 +1881,18 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, } if (udata) { - struct bnxt_re_srq_resp resp; + struct bnxt_re_srq_resp resp = {}; resp.srqid = srq->qplib_srq.id; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + hash_add(rdev->srq_hash, &srq->hash_entry, srq->qplib_srq.id); + srq->uctx_srq_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!srq->uctx_srq_page) { + rc = -ENOMEM; + goto fail; + } + resp.comp_mask |= BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT; + } rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (rc) { ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!"); @@ -1800,8 +1901,6 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, goto fail; } } - if (nq) - nq->budget++; active_srqs = atomic_inc_return(&rdev->stats.res.srq_count); if (active_srqs > rdev->stats.res.srq_watermark) rdev->stats.res.srq_watermark = active_srqs; @@ -1937,7 +2036,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; enum ib_qp_state curr_qp_state, new_qp_state; int rc, entries; unsigned int flags; @@ -1991,12 +2090,10 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; qp->qplib_qp.access = - __from_ib_access_flags(qp_attr->qp_access_flags); + __qp_access_flags_from_ib(qp->qplib_qp.cctx, + qp_attr->qp_access_flags); /* LOCAL_WRITE access 
must be set to allow RC receive */ - qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; - /* Temp: Set all params on QP as of now */ - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; } if (qp_attr_mask & IB_QP_PKEY_INDEX) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; @@ -2030,7 +2127,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp->qplib_qp.ah.sgid_index = ctx->idx; qp->qplib_qp.ah.host_sgid_index = grh->sgid_index; qp->qplib_qp.ah.hop_limit = grh->hop_limit; - qp->qplib_qp.ah.traffic_class = grh->traffic_class; + qp->qplib_qp.ah.traffic_class = grh->traffic_class >> 2; qp->qplib_qp.ah.sl = rdma_ah_get_sl(&qp_attr->ah_attr); ether_addr_copy(qp->qplib_qp.ah.dmac, qp_attr->ah_attr.roce.dmac); @@ -2057,18 +2154,20 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } } - if (qp_attr_mask & IB_QP_PATH_MTU) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); - qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); - } else if (qp_attr->qp_state == IB_QPS_RTR) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = - __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); - qp->qplib_qp.mtu = - ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + if (qp_attr->qp_state == IB_QPS_RTR) { + enum ib_mtu qpmtu; + + qpmtu = iboe_get_mtu(rdev->netdev->mtu); + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (ib_mtu_enum_to_int(qp_attr->path_mtu) > + ib_mtu_enum_to_int(qpmtu)) + return -EINVAL; + qpmtu = qp_attr->path_mtu; + } + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qpmtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qpmtu); } if (qp_attr_mask & IB_QP_TIMEOUT) { @@ -2155,6 +2254,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, entries = bnxt_re_init_depth(qp_attr->cap.max_recv_wr, uctx); qp->qplib_qp.rq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.max_sw_wqe = qp->qplib_qp.rq.max_wqe; qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - qp_attr->cap.max_recv_wr; qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; @@ -2200,7 +2300,8 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); qp_attr->cur_qp_state = __to_ib_qp_state(qplib_qp->cur_qp_state); qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 
1 : 0; - qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->qp_access_flags = __qp_access_flags_to_ib(qp->qplib_qp.cctx, + qplib_qp->access); qp_attr->pkey_index = qplib_qp->pkey_index; qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; @@ -2216,6 +2317,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = qplib_qp->retry_cnt; qp_attr->rnr_retry = qplib_qp->rnr_retry; qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->port_num = __to_ib_port_num(qplib_qp->port_id); qp_attr->rq_psn = qplib_qp->rq.psn; qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; qp_attr->sq_psn = qplib_qp->sq.psn; @@ -2479,7 +2581,7 @@ static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, break; case IB_WR_SEND_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM; - wqe->send.imm_data = wr->ex.imm_data; + wqe->send.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_SEND_WITH_INV: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV; @@ -2509,7 +2611,7 @@ static int bnxt_re_build_rdma_wqe(const struct ib_send_wr *wr, break; case IB_WR_RDMA_WRITE_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM; - wqe->rdma.imm_data = wr->ex.imm_data; + wqe->rdma.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_RDMA_READ: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ; @@ -2712,7 +2814,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; } @@ -2824,7 +2927,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; @@ -2921,6 +3025,28 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, return rc; } +static struct bnxt_qplib_nq *bnxt_re_get_nq(struct bnxt_re_dev *rdev) +{ + int min, indx; + + mutex_lock(&rdev->nqr->load_lock); + for (indx = 0, min = 0; indx < (rdev->nqr->num_msix - 1); indx++) { + if (rdev->nqr->nq[min].load > rdev->nqr->nq[indx].load) + min = indx; + } + rdev->nqr->nq[min].load++; + mutex_unlock(&rdev->nqr->load_lock); + + return &rdev->nqr->nq[min]; +} + +static void bnxt_re_put_nq(struct bnxt_re_dev *rdev, struct bnxt_qplib_nq *nq) +{ + mutex_lock(&rdev->nqr->load_lock); + nq->load--; + mutex_unlock(&rdev->nqr->load_lock); +} + /* Completion Queues */ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { @@ -2939,25 +3065,25 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hash_del(&cq->hash_entry); } bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + + bnxt_re_put_nq(rdev, nq); ib_umem_release(cq->umem); atomic_dec(&rdev->stats.res.cq_count); - nq->budget--; kfree(cq->cql); return 0; } int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); + struct ib_udata *udata = &attrs->driver_udata; struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct 
bnxt_qplib_chip_ctx *cctx; - struct bnxt_qplib_nq *nq = NULL; - unsigned int nq_alloc_cnt; int cqe = attr->cqe; int rc, entries; u32 active_cqs; @@ -3008,15 +3134,10 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->qplib_cq.dpi = &rdev->dpi_privileged; } - /* - * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a - * used for getting the NQ index. - */ - nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); - nq = &rdev->nq[nq_alloc_cnt % (rdev->num_msix - 1)]; cq->qplib_cq.max_wqe = entries; - cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; - cq->qplib_cq.nq = nq; + cq->qplib_cq.coalescing = &rdev->cq_coalescing; + cq->qplib_cq.nq = bnxt_re_get_nq(rdev); + cq->qplib_cq.cnq_hw_ring_id = cq->qplib_cq.nq->ring_id; rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -3026,7 +3147,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ib_cq.cqe = entries; cq->cq_period = cq->qplib_cq.period; - nq->budget++; active_cqs = atomic_inc_return(&rdev->stats.res.cq_count); if (active_cqs > rdev->stats.res.cq_watermark) @@ -3097,7 +3217,7 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); rdev = cq->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!ibcq->uobject) { ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); return -EOPNOTSUPP; @@ -3581,7 +3701,7 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp, wc->byte_len = orig_cqe->length; wc->qp = &gsi_qp->ib_qp; - wc->ex.imm_data = orig_cqe->immdata; + wc->ex.imm_data = cpu_to_be32(orig_cqe->immdata); wc->src_qp = orig_cqe->src_qp; memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); if (bnxt_re_is_vlan_pkt(orig_cqe, &vlan_id, &sl)) { @@ -3726,7 +3846,10 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) (unsigned long)(cqe->qp_handle), struct bnxt_re_qp, qplib_qp); wc->qp = &qp->ib_qp; - wc->ex.imm_data = cqe->immdata; + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->ex.imm_data = cpu_to_be32(cqe->immdata); + else + wc->ex.invalidate_rkey = cqe->invrkey; wc->src_qp = cqe->src_qp; memcpy(wc->smac, cqe->smac, ETH_ALEN); wc->port_num = 1; @@ -3844,9 +3967,12 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + /* Allocate and register 0 as the address */ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) @@ -3944,7 +4070,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.access_flags = BNXT_QPLIB_FR_PMR; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); @@ -4061,21 +4187,28 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, 
&mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); - rc = -EIO; - goto free_mr; + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); + rc = -EIO; + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; } - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; mr->ib_umem = umem; mr->qplib_mr.va = virt_addr; mr->qplib_mr.total_size = length; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, umem_pgs, page_size); @@ -4121,7 +4254,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, - int mr_access_flags, struct ib_udata *udata) + int mr_access_flags, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; @@ -4148,7 +4282,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) struct bnxt_re_ucontext *uctx = container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct bnxt_re_user_mmap_entry *entry; struct bnxt_re_uctx_resp resp = {}; struct bnxt_re_uctx_req ureq = {}; @@ -4186,9 +4320,6 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; - resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; - resp.mode = rdev->chip_ctx->modes.wqe_mode; - if (rdev->chip_ctx->modes.db_push) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; @@ -4201,13 +4332,22 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (rdev->pacing.dbr_pacing) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED; + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; + if (udata->inlen >= sizeof(ureq)) { rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq))); if (rc) goto cfail; if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT) { resp.comp_mask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; - uctx->cmask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; + uctx->cmask |= BNXT_RE_UCNTX_CAP_POW2_DISABLED; + } + if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT) { + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; + resp.mode = rdev->chip_ctx->modes.wqe_mode; + if (resp.mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; } } @@ -4261,6 +4401,19 @@ static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq return cq; } +static struct bnxt_re_srq *bnxt_re_search_for_srq(struct bnxt_re_dev *rdev, u32 srq_id) +{ + struct bnxt_re_srq *srq = NULL, *tmp_srq; + + hash_for_each_possible(rdev->srq_hash, tmp_srq, hash_entry, srq_id) { + if (tmp_srq->qplib_srq.id == srq_id) { + srq = tmp_srq; + break; + } + } + 
return srq; +} + /* Helper function to mmap the virtual memory from user app */ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) { @@ -4305,9 +4458,10 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) case BNXT_RE_MMAP_TOGGLE_PAGE: /* Driver doesn't expect write access for user space */ if (vma->vm_flags & VM_WRITE) - return -EFAULT; - ret = vm_insert_page(vma, vma->vm_start, - virt_to_page((void *)bnxt_entry->mem_offset)); + ret = -EFAULT; + else + ret = vm_insert_page(vma, vma->vm_start, + virt_to_page((void *)bnxt_entry->mem_offset)); break; default: ret = -EINVAL; @@ -4328,6 +4482,41 @@ void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(bnxt_entry); } +int bnxt_re_process_mad(struct ib_device *ibdev, int mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct ib_class_port_info cpi = {}; + int ret = IB_MAD_RESULT_SUCCESS; + int rc = 0; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return ret; + + switch (in_mad->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + break; + case IB_PMA_PORT_COUNTERS_EXT: + rc = bnxt_re_assign_pma_port_ext_counters(rdev, out_mad); + break; + case IB_PMA_PORT_COUNTERS: + rc = bnxt_re_assign_pma_port_counters(rdev, out_mad); + break; + default: + rc = -EINVAL; + break; + } + if (rc) + return IB_MAD_RESULT_FAILURE; + ret |= IB_MAD_RESULT_REPLY; + return ret; +} + static int UVERBS_HANDLER(BNXT_RE_METHOD_NOTIFY_DRV)(struct uverbs_attr_bundle *attrs) { struct bnxt_re_ucontext *uctx; @@ -4489,12 +4678,13 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund struct bnxt_re_ucontext *uctx; struct ib_ucontext *ib_uctx; struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + u32 length = PAGE_SIZE; struct bnxt_re_cq *cq; u64 mem_offset; + u32 offset = 0; u64 addr = 0; - u32 length; - u32 offset; - u32 cq_id; + u32 res_id; int err; ib_uctx = ib_uverbs_get_ucontext(attrs); @@ -4507,23 +4697,24 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); rdev = uctx->rdev; + err = uverbs_copy_from(&res_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); + if (err) + return err; switch (res_type) { case BNXT_RE_CQ_TOGGLE_MEM: - err = uverbs_copy_from(&cq_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); - if (err) - return err; - - cq = bnxt_re_search_for_cq(rdev, cq_id); + cq = bnxt_re_search_for_cq(rdev, res_id); if (!cq) return -EINVAL; - length = PAGE_SIZE; addr = (u64)cq->uctx_cq_page; - mmap_flag = BNXT_RE_MMAP_TOGGLE_PAGE; - offset = 0; break; case BNXT_RE_SRQ_TOGGLE_MEM: + srq = bnxt_re_search_for_srq(rdev, res_id); + if (!srq) + return -EINVAL; + + addr = (u64)srq->uctx_srq_page; break; default: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index b267d6d5975f..22c9eb8e9cfc 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -77,6 +77,8 @@ struct bnxt_re_srq { struct bnxt_qplib_srq qplib_srq; struct ib_umem *umem; spinlock_t lock; /* protect srq */ + void *uctx_srq_page; + struct hlist_node hash_entry; }; struct bnxt_re_qp { @@ -93,6 +95,7 @@ struct bnxt_re_qp { struct 
ib_ud_header qp1_hdr; struct bnxt_re_cq *scq; struct bnxt_re_cq *rcq; + struct dentry *dentry; }; struct bnxt_re_cq { @@ -171,15 +174,32 @@ static inline u16 bnxt_re_get_rwqe_size(int nsge) return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge)); } +enum { + BNXT_RE_UCNTX_CAP_POW2_DISABLED = 0x1ULL, + BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED = 0x2ULL, +}; + static inline u32 bnxt_re_init_depth(u32 ent, struct bnxt_re_ucontext *uctx) { - return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CMASK_POW2_DISABLED) ? + return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CAP_POW2_DISABLED) ? ent : roundup_pow_of_two(ent) : ent; } +static inline bool bnxt_re_is_var_size_supported(struct bnxt_re_dev *rdev, + struct bnxt_re_ucontext *uctx) +{ + if (uctx) + return uctx->cmask & BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + else + return rdev->chip_ctx->modes.wqe_mode; +} + int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, @@ -221,7 +241,7 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); @@ -242,12 +262,22 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int bnxt_re_process_mad(struct ib_device *device, int process_mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index); + +static inline u32 __to_ib_port_num(u16 port_id) +{ + return (u32)port_id + 1; +} unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 54b4d2f3a5d8..293b0a96c8e3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -67,6 +67,7 @@ #include <rdma/bnxt_re-abi.h> #include "bnxt.h" #include "hw_counters.h" +#include "debugfs.h" static char version[] = BNXT_RE_DESC "\n"; @@ -78,16 +79,12 @@ MODULE_LICENSE("Dual BSD/GPL"); /* globals */ static DEFINE_MUTEX(bnxt_re_mutex); -static void bnxt_re_stop_irq(void *handle); -static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr); -static struct bnxt_re_dev 
*bnxt_re_from_netdev(struct net_device *netdev); -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); +static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, + u8 port_num, enum ib_event_type event); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -129,18 +126,20 @@ static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) } } -static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) +static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; cctx = rdev->chip_ctx; - cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? - mode : BNXT_QPLIB_WQE_MODE_STATIC; + cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? + BNXT_QPLIB_WQE_MODE_VARIABLE : BNXT_QPLIB_WQE_MODE_STATIC; if (bnxt_re_hwrm_qcaps(rdev)) dev_err(rdev_to_dev(rdev), "Failed to query hwrm qcaps\n"); - if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) + if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) { cctx->modes.toggle_bits |= BNXT_QPLIB_CQ_TOGGLE_BIT; + cctx->modes.toggle_bits |= BNXT_QPLIB_SRQ_TOGGLE_BIT; + } } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -149,6 +148,10 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) if (!rdev->chip_ctx) return; + + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + chip_ctx = rdev->chip_ctx; rdev->chip_ctx = NULL; rdev->rcfw.res = NULL; @@ -158,14 +161,15 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) kfree(chip_ctx); } -static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int rc; + int rc = -ENOMEM; en_dev = rdev->en_dev; + rdev->qplib_res.pdev = en_dev->pdev; chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; @@ -177,20 +181,31 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.cctx = rdev->chip_ctx; rdev->rcfw.res = &rdev->qplib_res; - rdev->qplib_res.dattr = &rdev->dev_attr; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->qplib_res.dattr = rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); + rdev->qplib_res.en_dev = en_dev; - bnxt_re_set_drv_mode(rdev, wqe_mode); + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); if (rc) - return rc; + goto free_dev_attr; if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, "platform doesn't support global atomics."); return 0; +free_dev_attr: + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; } /* SR-IOV helper functions */ @@ -212,7 +227,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) struct bnxt_qplib_ctx *ctx; int i; - attr = &rdev->dev_attr; + attr = rdev->dev_attr; ctx = &rdev->qplib_ctx; ctx->qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, @@ -226,7 +241,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) rdev->qplib_ctx.tqm_ctx.qcount[i] = - rdev->dev_attr.tqm_alloc_reqs[i]; + rdev->dev_attr->tqm_alloc_reqs[i]; } static void 
bnxt_re_limit_vf_res(struct bnxt_qplib_ctx *qplib_ctx, u32 num_vf) @@ -280,33 +295,163 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) { + /* + * Use the total VF count since the actual VF count may not be + * available at this point. + */ rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); - if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { - bnxt_re_set_resource_limits(rdev); - bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx); + if (!rdev->num_vfs) + return; + + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); +} + +struct bnxt_re_dcb_work { + struct work_struct work; + struct bnxt_re_dev *rdev; + struct hwrm_async_event_cmpl cmpl; +}; + +static bool bnxt_re_is_qp1_qp(struct bnxt_re_qp *qp) +{ + return qp->ib_qp.qp_type == IB_QPT_GSI; +} + +static struct bnxt_re_qp *bnxt_re_get_qp1_qp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + mutex_lock(&rdev->qp_lock); + list_for_each_entry(qp, &rdev->qp_list, list) { + if (bnxt_re_is_qp1_qp(qp)) { + mutex_unlock(&rdev->qp_lock); + return qp; + } } + mutex_unlock(&rdev->qp_lock); + return NULL; } -static void bnxt_re_shutdown(struct auxiliary_device *adev) +static int bnxt_re_update_qp1_tos_dscp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return 0; + + qp = bnxt_re_get_qp1_qp(rdev); + if (!qp) + return 0; + + qp->qplib_qp.modify_flags = CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP; + qp->qplib_qp.tos_dscp = rdev->cc_param.qp1_tos_dscp; + + return bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); +} + +static void bnxt_re_init_dcb_wq(struct bnxt_re_dev *rdev) +{ + rdev->dcb_wq = create_singlethread_workqueue("bnxt_re_dcb_wq"); +} + +static void bnxt_re_uninit_dcb_wq(struct bnxt_re_dev *rdev) +{ + if (!rdev->dcb_wq) + return; + destroy_workqueue(rdev->dcb_wq); +} + +static void bnxt_re_dcb_wq_task(struct work_struct *work) +{ + struct bnxt_re_dcb_work *dcb_work = + container_of(work, struct bnxt_re_dcb_work, work); + struct bnxt_re_dev *rdev = dcb_work->rdev; + struct bnxt_qplib_cc_param *cc_param; + int rc; + + if (!rdev) + goto free_dcb; + + cc_param = &rdev->cc_param; + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, cc_param); + if (rc) { + ibdev_dbg(&rdev->ibdev, "Failed to query ccparam rc:%d", rc); + goto free_dcb; + } + if (cc_param->qp1_tos_dscp != cc_param->tos_dscp) { + cc_param->qp1_tos_dscp = cc_param->tos_dscp; + rc = bnxt_re_update_qp1_tos_dscp(rdev); + if (rc) { + ibdev_dbg(&rdev->ibdev, "%s: Failed to modify QP1 rc:%d", + __func__, rc); + goto free_dcb; + } + } + +free_dcb: + kfree(dcb_work); +} + +static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_re_dcb_work *dcb_work; + struct bnxt_re_dev *rdev; + u32 data1, data2; + u16 event_id; + rdev = en_info->rdev; if (!rdev) return; - ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + + event_id = le16_to_cpu(cmpl->event_id); + data1 = le32_to_cpu(cmpl->event_data1); + data2 = le32_to_cpu(cmpl->event_data2); + + ibdev_dbg(&rdev->ibdev, "Async event_id = %d data1 = %d data2 = %d", + event_id, data1, data2); + + switch (event_id) { + case ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE: + dcb_work = kzalloc(sizeof(*dcb_work), GFP_ATOMIC); + 
if (!dcb_work) + break; + + dcb_work->rdev = rdev; + memcpy(&dcb_work->cmpl, cmpl, sizeof(*cmpl)); + INIT_WORK(&dcb_work->work, bnxt_re_dcb_wq_task); + queue_work(rdev->dcb_wq, &dcb_work->work); + break; + default: + break; + } } -static void bnxt_re_stop_irq(void *handle) +static void bnxt_re_stop_irq(void *handle, bool reset) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx; - for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + rdev = en_info->rdev; + if (!rdev) + return; + rcfw = &rdev->rcfw; + + if (reset) { + set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_DEVICE_FATAL); + } + + for (indx = BNXT_RE_NQ_IDX; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); } @@ -315,12 +460,18 @@ static void bnxt_re_stop_irq(void *handle) static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_msix_entry *msix_ent; + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx, rc; + rdev = en_info->rdev; + if (!rdev) + return; + msix_ent = rdev->nqr->msix_entries; + rcfw = &rdev->rcfw; if (!ent) { /* Not setting the f/w timeout bit in rcfw. * During the driver unload the first command @@ -334,8 +485,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) /* Vectors may change after restart, so update with new vectors * in device sctructure. 
*/ - for (indx = 0; indx < rdev->num_msix; indx++) - rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; + for (indx = 0; indx < rdev->nqr->num_msix; indx++) + rdev->nqr->msix_entries[indx].vector = ent[indx].vector; rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, false); @@ -343,8 +494,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) ibdev_warn(&rdev->ibdev, "Failed to reinit CREQ\n"); return; } - for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + for (indx = BNXT_RE_NQ_IDX ; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; rc = bnxt_qplib_nq_start_irq(nq, indx - 1, msix_ent[indx].vector, false); if (rc) { @@ -356,6 +507,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) } static struct bnxt_ulp_ops bnxt_re_ulp_ops = { + .ulp_async_notifier = bnxt_re_async_notifier, .ulp_irq_stop = bnxt_re_stop_irq, .ulp_irq_restart = bnxt_re_start_irq }; @@ -365,14 +517,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; - int rc; en_dev = rdev->en_dev; - - rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev); - if (!rc) - rdev->qplib_res.pdev = rdev->en_dev->pdev; - return rc; + return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); } static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) @@ -423,6 +570,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) struct hwrm_func_qcaps_input req = {}; struct bnxt_qplib_chip_ctx *cctx; struct bnxt_fw_msg fw_msg = {}; + u32 flags_ext2; int rc; cctx = rdev->chip_ctx; @@ -436,14 +584,15 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) return rc; cctx->modes.db_push = le32_to_cpu(resp.flags) & FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE; - cctx->modes.dbr_pacing = - le32_to_cpu(resp.flags_ext2) & - FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED; + flags_ext2 = le32_to_cpu(resp.flags_ext2); + cctx->modes.dbr_pacing = flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED || + flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED; return 0; } static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; struct hwrm_func_dbr_pacing_qcfg_output resp = {}; struct hwrm_func_dbr_pacing_qcfg_input req = {}; struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -465,6 +614,13 @@ static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) cctx->dbr_stat_db_fifo = le32_to_cpu(resp.dbr_stat_db_fifo_reg) & ~FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_MASK; + + pacing_data->fifo_max_depth = le32_to_cpu(resp.dbr_stat_db_max_fifo_depth); + if (!pacing_data->fifo_max_depth) + pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH(cctx); + pacing_data->fifo_room_mask = le32_to_cpu(resp.dbr_stat_db_fifo_reg_fifo_room_mask); + pacing_data->fifo_room_shift = resp.dbr_stat_db_fifo_reg_fifo_room_shift; + return 0; } @@ -479,24 +635,55 @@ static void bnxt_re_set_default_pacing_data(struct bnxt_re_dev *rdev) pacing_data->pacing_th * BNXT_RE_PACING_ALARM_TH_MULTIPLE; } -static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +static u32 __get_fifo_occupancy(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; u32 read_val, fifo_occup; + read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); + fifo_occup = pacing_data->fifo_max_depth - 
+ ((read_val & pacing_data->fifo_room_mask) >> + pacing_data->fifo_room_shift); + return fifo_occup; +} + +static bool is_dbr_fifo_full(struct bnxt_re_dev *rdev) +{ + u32 max_occup, fifo_occup; + + fifo_occup = __get_fifo_occupancy(rdev); + max_occup = BNXT_RE_MAX_FIFO_DEPTH(rdev->chip_ctx) - 1; + if (fifo_occup == max_occup) + return true; + + return false; +} + +static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; + u32 retry_fifo_check = 1000; + u32 fifo_occup; + /* loop shouldn't run infintely as the occupancy usually goes * below pacing algo threshold as soon as pacing kicks in. */ while (1) { - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = __get_fifo_occupancy(rdev); /* Fifo occupancy cannot be greater the MAX FIFO depth */ - if (fifo_occup > BNXT_RE_MAX_FIFO_DEPTH) + if (fifo_occup > pacing_data->fifo_max_depth) break; - if (fifo_occup < rdev->qplib_res.pacing_data->pacing_th) + if (fifo_occup < pacing_data->pacing_th) + break; + if (!retry_fifo_check--) { + dev_info_once(rdev_to_dev(rdev), + "%s: fifo_occup = 0x%xfifo_max_depth = 0x%x pacing_th = 0x%x\n", + __func__, fifo_occup, pacing_data->fifo_max_depth, + pacing_data->pacing_th); break; + } + } } @@ -546,16 +733,13 @@ static void bnxt_re_pacing_timer_exp(struct work_struct *work) struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, dbq_pacing_work.work); struct bnxt_qplib_db_pacing_data *pacing_data; - u32 read_val, fifo_occup; + u32 fifo_occup; if (!mutex_trylock(&rdev->pacing.dbq_lock)) return; pacing_data = rdev->qplib_res.pacing_data; - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = __get_fifo_occupancy(rdev); if (fifo_occup > pacing_data->pacing_th) goto restart_timer; @@ -594,7 +778,7 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) * Increase the alarm_th to max so that other user lib instances do not * keep alerting the driver. 
*/ - pacing_data->alarm_th = BNXT_RE_MAX_FIFO_DEPTH; + pacing_data->alarm_th = pacing_data->fifo_max_depth; pacing_data->do_pacing = BNXT_RE_MAX_DBR_DO_PACING; cancel_work_sync(&rdev->dbq_fifo_check_work); schedule_work(&rdev->dbq_fifo_check_work); @@ -603,9 +787,6 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) { - if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) - return -EIO; - /* Allocate a page for app use */ rdev->pacing.dbr_page = (void *)__get_free_page(GFP_KERNEL); if (!rdev->pacing.dbr_page) @@ -614,6 +795,12 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) memset((u8 *)rdev->pacing.dbr_page, 0, PAGE_SIZE); rdev->qplib_res.pacing_data = (struct bnxt_qplib_db_pacing_data *)rdev->pacing.dbr_page; + if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + /* MAP HW window 2 for reading db fifo depth */ writel(rdev->chip_ctx->dbr_stat_db_fifo & BNXT_GRC_BASE_MASK, rdev->en_dev->bar0 + BNXT_GRCPF_REG_WINDOW_BASE_OUT + 4); @@ -623,13 +810,16 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) rdev->pacing.dbr_bar_addr = pci_resource_start(rdev->qplib_res.pdev, 0) + rdev->pacing.dbr_db_fifo_reg_off; + if (is_dbr_fifo_full(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + rdev->pacing.pacing_algo_th = BNXT_RE_PACING_ALGO_THRESHOLD; rdev->pacing.dbq_pacing_time = BNXT_RE_DBR_PACING_TIME; rdev->pacing.dbr_def_do_pacing = BNXT_RE_DBR_DO_PACING_NO_CONGESTION; rdev->pacing.do_pacing_save = rdev->pacing.dbr_def_do_pacing; - rdev->qplib_res.pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH; - rdev->qplib_res.pacing_data->fifo_room_mask = BNXT_RE_DB_FIFO_ROOM_MASK; - rdev->qplib_res.pacing_data->fifo_room_shift = BNXT_RE_DB_FIFO_ROOM_SHIFT; rdev->qplib_res.pacing_data->grc_reg_offset = rdev->pacing.dbr_db_fifo_reg_off; bnxt_re_set_default_pacing_data(rdev); /* Initialize worker for DBR Pacing */ @@ -779,17 +969,6 @@ static void bnxt_re_disassociate_ucontext(struct ib_ucontext *ibcontext) } /* Device */ - -static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) -{ - struct ib_device *ibdev = - ib_device_get_by_netdev(netdev, RDMA_DRIVER_BNXT_RE); - if (!ibdev) - return NULL; - - return container_of(ibdev, struct bnxt_re_dev, ibdev); -} - static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { @@ -820,6 +999,253 @@ static const struct attribute_group bnxt_re_dev_attr_group = { .attrs = bnxt_re_attributes, }; +static int bnxt_re_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_qplib_hwq *mr_hwq; + struct nlattr *table_attr; + struct bnxt_re_mr *mr; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + mr_hwq = &mr->qplib_mr.hwq; + + if (rdma_nl_put_driver_u32(msg, "page_size", + mr_hwq->qe_ppg * mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", mr_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "hwq", (unsigned long)mr_hwq)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "va", mr->qplib_mr.va)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int 
bnxt_re_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_mr *mr; + int err, len; + void *data; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + rdev = mr->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_MRW, + mr->qplib_mr.lkey, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_qplib_hwq *cq_hwq; + struct nlattr *table_attr; + struct bnxt_re_cq *cq; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + cq_hwq = &cq->qplib_cq.hwq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + if (rdma_nl_put_driver_u32(msg, "cq_depth", cq_hwq->depth)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", cq_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", cq_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_wqe", cq->qplib_cq.max_wqe)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_cq *cq; + int err, len; + void *data; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? 
BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, + CMDQ_READ_CONTEXT_TYPE_CQ, + cq->qplib_cq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp) +{ + struct bnxt_qplib_qp *qplib_qp; + struct nlattr *table_attr; + struct bnxt_re_qp *qp; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + qplib_qp = &qp->qplib_qp; + + if (rdma_nl_put_driver_u32(msg, "sq_max_wqe", qplib_qp->sq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_max_sge", qplib_qp->sq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_wqe_size", qplib_qp->sq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_start", qplib_qp->sq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_last", qplib_qp->sq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_wqe", qplib_qp->rq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_sge", qplib_qp->rq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_wqe_size", qplib_qp->rq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_start", qplib_qp->rq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_last", qplib_qp->rq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "timeout", qplib_qp->timeout)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibqp->device, ibdev); + int err, len; + void *data; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_QPC, + ibqp->qp_num, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct nlattr *table_attr; + struct bnxt_re_srq *srq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + + if (rdma_nl_put_driver_u32_hex(msg, "wqe_size", srq->qplib_srq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_wqe", srq->qplib_srq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_sge", srq->qplib_srq.max_sge)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_srq_entry_raw(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + int err, len; + void *data; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + rdev = srq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? 
BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5; + + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_SRQ, + srq->qplib_srq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static const struct ib_device_ops bnxt_re_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, @@ -859,8 +1285,10 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .post_recv = bnxt_re_post_recv, .post_send = bnxt_re_post_send, .post_srq_recv = bnxt_re_post_srq_recv, + .process_mad = bnxt_re_process_mad, .query_ah = bnxt_re_query_ah, .query_device = bnxt_re_query_device, + .modify_device = bnxt_re_modify_device, .query_pkey = bnxt_re_query_pkey, .query_port = bnxt_re_query_port, .query_qp = bnxt_re_query_qp, @@ -877,6 +1305,17 @@ static const struct ib_device_ops bnxt_re_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), }; +static const struct ib_device_ops restrack_ops = { + .fill_res_cq_entry = bnxt_re_fill_res_cq_entry, + .fill_res_cq_entry_raw = bnxt_re_fill_res_cq_entry_raw, + .fill_res_qp_entry = bnxt_re_fill_res_qp_entry, + .fill_res_qp_entry_raw = bnxt_re_fill_res_qp_entry_raw, + .fill_res_mr_entry = bnxt_re_fill_res_mr_entry, + .fill_res_mr_entry_raw = bnxt_re_fill_res_mr_entry_raw, + .fill_res_srq_entry = bnxt_re_fill_res_srq_entry, + .fill_res_srq_entry_raw = bnxt_re_fill_res_srq_entry_raw, +}; + static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) { struct ib_device *ibdev = &rdev->ibdev; @@ -890,7 +1329,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) addrconf_addr_eui48((u8 *)&ibdev->node_guid, rdev->netdev->dev_addr); - ibdev->num_comp_vectors = rdev->num_msix - 1; + ibdev->num_comp_vectors = rdev->nqr->num_msix - 1; ibdev->dev.parent = &rdev->en_dev->pdev->dev; ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; @@ -898,6 +1337,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->driver_def = bnxt_re_uapi_defs; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); + ib_set_device_ops(ibdev, &restrack_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) return ret; @@ -907,7 +1347,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); } -static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, +static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, struct bnxt_en_dev *en_dev) { struct bnxt_re_dev *rdev; @@ -920,9 +1360,9 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, return NULL; } /* Default values */ - rdev->nb.notifier_call = NULL; rdev->netdev = en_dev->net; rdev->en_dev = en_dev; + rdev->adev = adev; rdev->id = rdev->en_dev->pdev->devfn; INIT_LIST_HEAD(&rdev->qp_list); mutex_init(&rdev->qp_lock); @@ -936,6 +1376,15 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; + if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; + rdev->cq_coalescing.during_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7; + } else { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5; + rdev->cq_coalescing.during_maxbuf = 
BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5; + } + rdev->cq_coalescing.en_ring_idle_mode = BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE; return rdev; } @@ -975,12 +1424,15 @@ static int bnxt_re_handle_unaffi_async_event(struct creq_func_event static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, struct bnxt_re_qp *qp) { - struct bnxt_re_srq *srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, - qplib_srq); struct creq_qp_error_notification *err_event; + struct bnxt_re_srq *srq = NULL; struct ib_event event = {}; unsigned int flags; + if (qp->qplib_qp.srq) + srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, + qplib_srq); + if (qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR && rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -1208,15 +1660,9 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, { struct bnxt_re_cq *cq = container_of(handle, struct bnxt_re_cq, qplib_cq); - u32 *cq_ptr; - if (cq->ib_cq.comp_handler) { - if (cq->uctx_cq_page) { - cq_ptr = (u32 *)cq->uctx_cq_page; - *cq_ptr = cq->qplib_cq.toggle; - } + if (cq->ib_cq.comp_handler) (*cq->ib_cq.comp_handler)(&cq->ib_cq, cq->ib_cq.cq_context); - } return 0; } @@ -1225,8 +1671,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; - for (i = 1; i < rdev->num_msix; i++) - bnxt_qplib_disable_nq(&rdev->nq[i - 1]); + for (i = 1; i < rdev->nqr->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nqr->nq[i - 1]); if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -1240,10 +1686,12 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) bnxt_qplib_init_res(&rdev->qplib_res); - for (i = 1; i < rdev->num_msix ; i++) { - db_offt = rdev->en_dev->msix_entries[i].db_offset; - rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], - i - 1, rdev->en_dev->msix_entries[i].vector, + mutex_init(&rdev->nqr->load_lock); + + for (i = 1; i < rdev->nqr->num_msix ; i++) { + db_offt = rdev->nqr->msix_entries[i].db_offset; + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nqr->nq[i - 1], + i - 1, rdev->nqr->msix_entries[i].vector, db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); if (rc) { @@ -1256,20 +1704,22 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) return 0; fail: for (i = num_vec_enabled; i >= 0; i--) - bnxt_qplib_disable_nq(&rdev->nq[i]); + bnxt_qplib_disable_nq(&rdev->nqr->nq[i]); return rc; } static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_nq *nq; u8 type; int i; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); - rdev->nq[i].res = NULL; + nq = &rdev->nqr->nq[i]; + bnxt_re_net_ring_free(rdev, nq->ring_id, type); + bnxt_qplib_free_nq(nq); + nq->res = NULL; } } @@ -1296,12 +1746,11 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto fail; - rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->en_dev->pdev, - rdev->netdev, &rdev->dev_attr); + rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->netdev); if (rc) goto fail; @@ -1311,12 +1760,12 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) goto dealloc_res; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { 
struct bnxt_qplib_nq *nq; - nq = &rdev->nq[i]; + nq = &rdev->nqr->nq[i]; nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; - rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]); + rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, nq); if (rc) { ibdev_err(&rdev->ibdev, "Alloc Failed NQ%d rc:%#x", i, rc); @@ -1324,17 +1773,17 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) } type = bnxt_qplib_get_ring_type(rdev->chip_ctx); rattr.dma_arr = nq->hwq.pbl[PBL_LVL_0].pg_map_arr; - rattr.pages = nq->hwq.pbl[rdev->nq[i].hwq.level].pg_count; + rattr.pages = nq->hwq.pbl[rdev->nqr->nq[i].hwq.level].pg_count; rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[i + 1].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[i + 1].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate NQ fw id with rc = 0x%x", rc); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_qplib_free_nq(nq); goto free_nq; } num_vec_created++; @@ -1343,8 +1792,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) free_nq: for (i = num_vec_created - 1; i >= 0; i--) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_re_net_ring_free(rdev, rdev->nqr->nq[i].ring_id, type); + bnxt_qplib_free_nq(&rdev->nqr->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->dpi_privileged); @@ -1384,11 +1833,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) { - int mask = IB_QP_STATE; - struct ib_qp_attr qp_attr; struct bnxt_re_qp *qp; - qp_attr.qp_state = IB_QPS_ERR; mutex_lock(&rdev->qp_lock); list_for_each_entry(qp, &rdev->qp_list, list) { /* Modify the state of all QPs except QP1/Shadow QP */ @@ -1396,12 +1842,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) if (qp->qplib_qp.state != CMDQ_MODIFY_QP_NEW_STATE_RESET && qp->qplib_qp.state != - CMDQ_MODIFY_QP_NEW_STATE_ERR) { + CMDQ_MODIFY_QP_NEW_STATE_ERR) bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, 1, IB_EVENT_QP_FATAL); - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, - NULL); - } } } mutex_unlock(&rdev->qp_lock); @@ -1482,6 +1925,26 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return 0; } +static void bnxt_re_net_unregister_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + memset(&rdev->event_bitmap, 0, sizeof(rdev->event_bitmap)); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + +static void bnxt_re_net_register_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + rdev->event_bitmap |= (1 << ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -1539,11 +2002,31 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) +static int bnxt_re_alloc_nqr_mem(struct bnxt_re_dev *rdev) +{ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) + return -ENOMEM; + + return 0; +} + +static void bnxt_re_free_nqr_mem(struct bnxt_re_dev *rdev) +{ + kfree(rdev->nqr); + rdev->nqr = NULL; +} + +static void 
bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; int rc; + bnxt_re_debugfs_rem_pdev(rdev); + + bnxt_re_net_unregister_async_event(rdev); + bnxt_re_uninit_dcb_wq(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); @@ -1566,14 +2049,17 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } - rdev->num_msix = 0; + rdev->nqr->num_msix = 0; if (rdev->pacing.dbr_pacing) bnxt_re_deinitialize_dbr_pacing(rdev); + bnxt_re_free_nqr_mem(rdev); bnxt_re_destroy_chip_ctx(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) - bnxt_unregister_dev(rdev->en_dev); + if (op_type == BNXT_RE_COMPLETE_REMOVE) { + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnxt_unregister_dev(rdev->en_dev); + } } /* worker thread for polling periodic events. Now used for QoS programming*/ @@ -1586,7 +2072,7 @@ static void bnxt_re_worker(struct work_struct *work) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } -static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; struct bnxt_qplib_creq_ctx *creq; @@ -1595,16 +2081,29 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) u8 type; int rc; - /* Registered a new RoCE device instance to netdev */ - rc = bnxt_re_register_netdev(rdev); - if (rc) { + if (op_type == BNXT_RE_COMPLETE_INIT) { + /* Registered a new RoCE device instance to netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + } + set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + if (rdev->en_dev->ulp_tbl->msix_requested < BNXT_RE_MIN_MSIX) { ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->en_dev->ulp_tbl->msix_requested); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); return -EINVAL; } - set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->en_dev->ulp_tbl->msix_requested); - rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode); + rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { bnxt_unregister_dev(rdev->en_dev); clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -1612,27 +2111,27 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) return -EINVAL; } + rc = bnxt_re_alloc_nqr_mem(rdev); + if (rc) { + bnxt_re_destroy_chip_ctx(rdev); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return rc; + } + rdev->nqr->num_msix = rdev->en_dev->ulp_tbl->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->en_dev->msix_entries, + sizeof(struct bnxt_msix_entry) * rdev->nqr->num_msix); + /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - if (!rdev->en_dev->ulp_tbl->msix_requested) { - ibdev_err(&rdev->ibdev, - "Failed to get MSI-X vectors: %#x\n", rc); - rc = -EINVAL; - goto fail; - } - ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", - rdev->en_dev->ulp_tbl->msix_requested); - rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; - bnxt_re_query_hwrm_intf_version(rdev); /* Establish RCFW Communication Channel to initialize the context * memory for the function and all child VFs */ rc = 
bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx, - BNXT_RE_MAX_QPC_COUNT); + &rdev->qplib_ctx); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate RCFW Channel: %#x\n", rc); @@ -1646,14 +2145,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); goto free_rcfw; } - db_offt = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].db_offset; - vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; + db_offt = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, vid, db_offt, &bnxt_re_aeq_handler); @@ -1673,7 +2172,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->pacing.dbr_pacing = false; } } - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto disable_rcfw; @@ -1722,6 +2221,11 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags); if (!rdev->is_virtfn) { + /* Query f/w defaults of CC params */ + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &rdev->cc_param); + if (rc) + ibdev_warn(&rdev->ibdev, "Failed to query CC defaults\n"); + rc = bnxt_re_setup_qos(rdev); if (rc) ibdev_info(&rdev->ibdev, @@ -1730,13 +2234,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); - /* - * Use the total VF count since the actual VF count may not be - * available at this point. - */ - bnxt_re_vf_res_config(rdev); + + if (!(rdev->qplib_res.en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT)) + bnxt_re_vf_res_config(rdev); } hash_init(rdev->cq_hash); + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) + hash_init(rdev->srq_hash); + + bnxt_re_debugfs_add_pdev(rdev); + + bnxt_re_init_dcb_wq(rdev); + bnxt_re_net_register_async_event(rdev); return 0; free_sctx: @@ -1751,50 +2260,11 @@ free_ring: free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); return rc; } -static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) -{ - struct bnxt_aux_priv *aux_priv = - container_of(adev, struct bnxt_aux_priv, aux_dev); - struct bnxt_en_dev *en_dev; - struct bnxt_re_dev *rdev; - int rc; - - /* en_dev should never be NULL as long as adev and aux_dev are valid. 
*/ - en_dev = aux_priv->edev; - - rdev = bnxt_re_dev_add(aux_priv, en_dev); - if (!rdev || !rdev_to_dev(rdev)) { - rc = -ENOMEM; - goto exit; - } - - rc = bnxt_re_dev_init(rdev, wqe_mode); - if (rc) - goto re_dev_dealloc; - - rc = bnxt_re_ib_init(rdev); - if (rc) { - pr_err("Failed to register with IB: %s", - aux_priv->aux_dev.name); - goto re_dev_uninit; - } - auxiliary_set_drvdata(adev, rdev); - - return 0; - -re_dev_uninit: - bnxt_re_dev_uninit(rdev); -re_dev_dealloc: - ib_dealloc_device(&rdev->ibdev); -exit: - return rc; -} - static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) { struct bnxt_qplib_cc_param cc_param = {}; @@ -1809,142 +2279,139 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) if (enable) { cc_param.enable = 1; - cc_param.cc_mode = CMDQ_MODIFY_ROCE_CC_CC_MODE_PROBABILISTIC_CC_MODE; + cc_param.tos_ecn = 1; } - cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE | - CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | + cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN); if (bnxt_qplib_modify_cc(&rdev->qplib_res, &cc_param)) ibdev_err(&rdev->ibdev, "Failed to setup CC enable = %d\n", enable); } -/* - * "Notifier chain callback can be invoked for the same chain from - * different CPUs at the same time". - * - * For cases when the netdev is already present, our call to the - * register_netdevice_notifier() will actually get the rtnl_lock() - * before sending NETDEV_REGISTER and (if up) NETDEV_UP - * events. - * - * But for cases when the netdev is not already present, the notifier - * chain is subjected to be invoked from different CPUs simultaneously. - * - * This is protected by the netdev_mutex. - */ -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr) +static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, + struct bnxt_re_en_dev_info *en_info, + struct auxiliary_device *adev) { - struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); + /* Before updating the rdev pointer in bnxt_re_en_dev_info structure, + * take the rtnl lock to avoid accessing invalid rdev pointer from + * L2 ULP callbacks. This is applicable in all the places where rdev + * pointer is updated in bnxt_re_en_dev_info. + */ + rtnl_lock(); + en_info->rdev = rdev; + rtnl_unlock(); +} + +static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) +{ + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; + int rc; + + en_info = auxiliary_get_drvdata(adev); + en_dev = en_info->en_dev; - real_dev = rdma_vlan_dev_real_dev(netdev); - if (!real_dev) - real_dev = netdev; - if (real_dev != netdev) + rdev = bnxt_re_dev_add(adev, en_dev); + if (!rdev || !rdev_to_dev(rdev)) { + rc = -ENOMEM; goto exit; + } - rdev = bnxt_re_from_netdev(real_dev); - if (!rdev) - return NOTIFY_DONE; + bnxt_re_update_en_info_rdev(rdev, en_info, adev); + rc = bnxt_re_dev_init(rdev, op_type); + if (rc) + goto re_dev_dealloc; - switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, - netif_carrier_ok(real_dev) ? 
- IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR); - break; - default: - break; + rc = bnxt_re_ib_init(rdev); + if (rc) { + pr_err("Failed to register with IB: %s", + aux_priv->aux_dev.name); + goto re_dev_uninit; } - ib_device_put(&rdev->ibdev); + + bnxt_re_setup_cc(rdev, true); + + return 0; + +re_dev_uninit: + bnxt_re_update_en_info_rdev(NULL, en_info, adev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); exit: - return NOTIFY_DONE; + return rc; } #define BNXT_ADEV_NAME "bnxt_en" -static void bnxt_re_remove(struct auxiliary_device *adev) +static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, + struct auxiliary_device *aux_dev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return; - - mutex_lock(&bnxt_re_mutex); - if (rdev->nb.notifier_call) { - unregister_netdevice_notifier(&rdev->nb); - rdev->nb.notifier_call = NULL; - } else { - /* If notifier is null, we should have already done a - * clean up before coming here. - */ - goto skip_remove; - } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, op_type); ib_dealloc_device(&rdev->ibdev); -skip_remove: +} + +static void bnxt_re_remove(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + mutex_lock(&bnxt_re_mutex); + rdev = en_info->rdev; + + if (rdev) + bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev); + kfree(en_info); mutex_unlock(&bnxt_re_mutex); } static int bnxt_re_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { - struct bnxt_re_dev *rdev; + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; int rc; + en_dev = aux_priv->edev; + mutex_lock(&bnxt_re_mutex); - rc = bnxt_re_add_device(adev, BNXT_QPLIB_WQE_MODE_STATIC); - if (rc) { + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) { mutex_unlock(&bnxt_re_mutex); - return rc; + return -ENOMEM; } + en_info->en_dev = en_dev; - rdev = auxiliary_get_drvdata(adev); + auxiliary_set_drvdata(adev, en_info); - rdev->nb.notifier_call = bnxt_re_netdev_event; - rc = register_netdevice_notifier(&rdev->nb); - if (rc) { - rdev->nb.notifier_call = NULL; - pr_err("%s: Cannot register to netdevice_notifier", - ROCE_DRV_MODULE_NAME); - goto err; - } - - bnxt_re_setup_cc(rdev, true); - mutex_unlock(&bnxt_re_mutex); - return 0; + rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); + if (rc) + kfree(en_info); -err: mutex_unlock(&bnxt_re_mutex); - bnxt_re_remove(adev); return rc; } static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return 0; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; + rdev = en_info->rdev; + en_dev = en_info->en_dev; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device error/crash or device - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. 
- */ ibdev_info(&rdev->ibdev, "Handle device suspend call"); /* Check the current device state from bnxt_en_dev and move the @@ -1952,17 +2419,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) * This prevents more commands to HW during clean-up, * in case the device is already in error. */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) + if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) { set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dev_stop(rdev); + } - bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(rdev); - /* Move the device states to detached and avoid sending any more - * commands to HW - */ - set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); - set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); - wake_up_all(&rdev->rcfw.cmdq.waitq); + if (rdev->pacing.dbr_pacing) + bnxt_re_set_pacing_dev_state(rdev); + + ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", + __func__, en_dev->en_state); + bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); + bnxt_re_update_en_info_rdev(NULL, en_info, adev); mutex_unlock(&bnxt_re_mutex); return 0; @@ -1970,25 +2440,28 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) static int bnxt_re_resume(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return 0; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device recovery, resume. - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. 
- */ - - ibdev_info(&rdev->ibdev, "Handle device resume call"); + bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT); + rdev = en_info->rdev; + ibdev_info(&rdev->ibdev, "Device resume completed"); mutex_unlock(&bnxt_re_mutex); return 0; } +static void bnxt_re_shutdown(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + rdev = en_info->rdev; + ib_unregister_device(&rdev->ibdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); +} + static const struct auxiliary_device_id bnxt_re_id_table[] = { { .name = BNXT_ADEV_NAME ".rdma", }, {}, @@ -2011,18 +2484,24 @@ static int __init bnxt_re_mod_init(void) int rc; pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + bnxt_re_register_debugfs(); + rc = auxiliary_driver_register(&bnxt_re_driver); if (rc) { pr_err("%s: Failed to register auxiliary driver\n", ROCE_DRV_MODULE_NAME); - return rc; + goto err_debug; } return 0; +err_debug: + bnxt_re_unregister_debugfs(); + return rc; } static void __exit bnxt_re_mod_exit(void) { auxiliary_driver_unregister(&bnxt_re_driver); + bnxt_re_unregister_debugfs(); } module_init(bnxt_re_mod_init); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 439d0c7c5d0c..be34c605d516 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -54,6 +54,10 @@ #include "qplib_rcfw.h" #include "qplib_sp.h" #include "qplib_fp.h" +#include <rdma/ib_addr.h> +#include "bnxt_ulp.h" +#include "bnxt_re.h" +#include "ib_verbs.h" static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); @@ -323,6 +327,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_CQ_NOTIFICATION: { struct nq_cn *nqcne = (struct nq_cn *)nqe; + struct bnxt_re_cq *cq_p; q_handle = le32_to_cpu(nqcne->cq_handle_low); q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) @@ -333,6 +338,10 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) cq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) >> NQ_CN_TOGGLE_SFT; cq->dbinfo.toggle = cq->toggle; + cq_p = container_of(cq, struct bnxt_re_cq, qplib_cq); + if (cq_p->uctx_cq_page) + *((u32 *)cq_p->uctx_cq_page) = cq->toggle; + bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA); spin_lock_bh(&cq->compl_lock); @@ -347,6 +356,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_SRQ_EVENT: { struct bnxt_qplib_srq *srq; + struct bnxt_re_srq *srq_p; struct nq_srq_event *nqsrqe = (struct nq_srq_event *)nqe; @@ -354,6 +364,12 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) << 32; srq = (struct bnxt_qplib_srq *)q_handle; + srq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) + >> NQ_CN_TOGGLE_SFT; + srq->dbinfo.toggle = srq->toggle; + srq_p = container_of(srq, struct bnxt_re_srq, qplib_srq); + if (srq_p->uctx_srq_page) + *((u32 *)srq_p->uctx_srq_page) = srq->toggle; bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); if (nq->srqn_handler(nq, @@ -540,6 +556,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, nq->pdev = pdev; nq->cqn_handler = cqn_handler; nq->srqn_handler = srqn_handler; + nq->load = 0; /* Have a task to schedule CQ notifiers in post send case */ nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq"); @@ -642,13 +659,6 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) 
return rc; - - srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), - GFP_KERNEL); - if (!srq->swq) { - rc = -ENOMEM; - goto fail; - } srq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_SRQ, @@ -677,9 +687,17 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, spin_lock_init(&srq->lock); srq->start_idx = 0; srq->last_idx = srq->hwq.max_elements - 1; - for (idx = 0; idx < srq->hwq.max_elements; idx++) - srq->swq[idx].next_idx = idx + 1; - srq->swq[srq->last_idx].next_idx = -1; + if (!srq->hwq.is_user) { + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + } srq->id = le32_to_cpu(resp.xid); srq->dbinfo.hwq = &srq->hwq; @@ -809,13 +827,13 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { int indx; - que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); + que->swq = kcalloc(que->max_sw_wqe, sizeof(*que->swq), GFP_KERNEL); if (!que->swq) return -ENOMEM; que->swq_start = 0; - que->swq_last = que->max_wqe - 1; - for (indx = 0; indx < que->max_wqe; indx++) + que->swq_last = que->max_sw_wqe - 1; + for (indx = 0; indx < que->max_sw_wqe; indx++) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; @@ -851,7 +869,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) @@ -879,7 +897,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) @@ -983,9 +1001,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u32 tbl_indx; u16 nsge; - if (res->dattr) - qp->dev_cap_flags = res->dattr->dev_cap_flags; - + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_QP, @@ -1002,7 +1018,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { psn_sz = sizeof(struct sq_msn_search); qp->msn = 0; } @@ -1011,12 +1027,18 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, true); hwq_attr.aux_stride = psn_sz; - hwq_attr.aux_depth = bnxt_qplib_set_sq_size(sq, qp->wqe_mode); + hwq_attr.aux_depth = psn_sz ? 
bnxt_qplib_set_sq_size(sq, qp->wqe_mode) + : 0; /* Update msn tbl size */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags) && psn_sz) { - hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + if (qp->is_host_msn_tbl && psn_sz) { + if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + else + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)) / 2; qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; } @@ -1026,13 +1048,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rc) return rc; - rc = bnxt_qplib_alloc_init_swq(sq); - if (rc) - goto fail_sq; - - if (psn_sz) - bnxt_qplib_init_psn_ptr(qp, psn_sz); + if (!sq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(sq); + if (rc) + goto fail_sq; + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + } req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -1051,16 +1074,18 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.aux_stride = 0; hwq_attr.aux_depth = 0; hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) goto sq_swq; - rc = bnxt_qplib_alloc_init_swq(rq); - if (rc) - goto fail_rq; + if (!rq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(rq); + if (rc) + goto fail_rq; + } req.rq_size = cpu_to_le32(rq->max_wqe); pbl = &rq->hwq.pbl[PBL_LVL_0]; @@ -1088,7 +1113,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED; - if (_is_ext_stats_supported(res->dattr->dev_cap_flags) && !res->is_vf) + if (bnxt_ext_stats_supported(res->cctx, res->dattr->dev_cap_flags, res->is_vf)) qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED; req.qp_flags = cpu_to_le32(qp_flags); @@ -1156,9 +1181,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rq->dbinfo.db = qp->dpi->dbr; rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size); } + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp; + spin_unlock_bh(&rcfw->tbl_lock); return 0; fail: @@ -1190,8 +1217,6 @@ static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) qp->path_mtu = CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; } - qp->modify_flags &= - ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; /* Bono FW require the max_dest_rd_atomic to be >= 1 */ if (qp->max_dest_rd_atomic < 1) qp->max_dest_rd_atomic = 1; @@ -1265,12 +1290,56 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp, + struct cmdq_modify_qp *req) +{ + u32 mandatory_flags = 0; + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; + + if (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC && qp->srq) + req->flags = 
cpu_to_le16(CMDQ_MODIFY_QP_FLAGS_SRQ_USED); + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + } + + if (_is_min_rnr_in_rtr_rts_mandatory(res->dattr->dev_cap_flags2) && + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + } + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || + qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + + qp->modify_flags |= mandatory_flags; + req->qp_type = qp->type; +} + +static bool is_optimized_state_transition(struct bnxt_qplib_qp *qp) +{ + if ((qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) || + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) + return true; + + return false; +} + int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_modify_qp_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_modify_qp req = {}; + u16 vlan_pcp_vlan_dei_vlan_id; u32 temp32[4]; u32 bmask; int rc; @@ -1281,6 +1350,12 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Filter out the qp_attr_mask based on the state->new transition */ __filter_modify_flags(qp); + if (qp->modify_flags & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { + /* Set mandatory attributes for INIT -> RTR and RTR -> RTS transition */ + if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && + is_optimized_state_transition(qp)) + bnxt_set_mandatory_attributes(res, qp, &req); + } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); req.qp_cid = cpu_to_le32(qp->id); @@ -1361,7 +1436,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID) req.dest_qp_id = cpu_to_le32(qp->dest_qpn); - req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID) { + vlan_pcp_vlan_dei_vlan_id = + ((res->sgid_tbl.tbl[qp->ah.sgid_index].vlan_id << + CMDQ_MODIFY_QP_VLAN_ID_SFT) & + CMDQ_MODIFY_QP_VLAN_ID_MASK); + vlan_pcp_vlan_dei_vlan_id |= + ((qp->ah.sl << CMDQ_MODIFY_QP_VLAN_PCP_SFT) & + CMDQ_MODIFY_QP_VLAN_PCP_MASK); + req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(vlan_pcp_vlan_dei_vlan_id); + } bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -1453,6 +1537,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); memcpy(qp->smac, sb->src_mac, 6); qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + qp->port_id = le16_to_cpu(sb->port_id); bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); @@ -1515,9 +1600,11 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, u32 tbl_indx; int rc; + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = BNXT_QPLIB_QP_ID_INVALID; rcfw->qp_tbl[tbl_indx].qp_handle = NULL; + spin_unlock_bh(&rcfw->tbl_lock); bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_DESTROY_QP, @@ -1528,8 +1615,10 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) { + spin_lock_bh(&rcfw->tbl_lock); 
rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = qp; + spin_unlock_bh(&rcfw->tbl_lock); return rc; } @@ -1636,7 +1725,7 @@ static void bnxt_qplib_fill_psn_search(struct bnxt_qplib_qp *qp, if (!swq->psn_search) return; /* Handle MSN differently on cap flags */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { bnxt_qplib_fill_msn_search(qp, wqe, swq); return; } @@ -1818,7 +1907,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, } swq = bnxt_qplib_get_swqe(sq, &wqe_idx); - bnxt_qplib_pull_psn_buff(qp, sq, swq, BNXT_RE_HW_RETX(qp->dev_cap_flags)); + bnxt_qplib_pull_psn_buff(qp, sq, swq, qp->is_host_msn_tbl); idx = 0; swq->slot_idx = hwq->prod; @@ -2008,7 +2097,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, rc = -EINVAL; goto done; } - if (!BNXT_RE_HW_RETX(qp->dev_cap_flags) || msn_update) { + if (!qp->is_host_msn_tbl || msn_update) { swq->next_psn = sq->psn & BTH_PSN_MASK; bnxt_qplib_fill_psn_search(qp, wqe, swq); } @@ -2130,6 +2219,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_create_cq req = {}; struct bnxt_qplib_pbl *pbl; + u32 coalescing = 0; u32 pg_sz_lvl; int rc; @@ -2156,6 +2246,25 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); + + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); + coalescing |= ((cq->coalescing->buf_maxtime << + CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & + CMDQ_CREATE_CQ_BUF_MAXTIME_MASK); + coalescing |= ((cq->coalescing->normal_maxbuf << + CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT) & + CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK); + coalescing |= ((cq->coalescing->during_maxbuf << + CMDQ_CREATE_CQ_DURING_MAXBUF_SFT) & + CMDQ_CREATE_CQ_DURING_MAXBUF_MASK); + if (cq->coalescing->en_ring_idle_mode) + coalescing |= CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + else + coalescing &= ~CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + req.coalescing = cpu_to_le32(coalescing); + } + pbl = &cq->hwq.pbl[PBL_LVL_0]; pg_sz_lvl = (bnxt_qplib_base_pg_size(&cq->hwq) << CMDQ_CREATE_CQ_PG_SIZE_SFT); @@ -2470,6 +2579,32 @@ out: return rc; } +static int bnxt_qplib_get_cqe_sq_cons(struct bnxt_qplib_q *sq, u32 cqe_slot) +{ + struct bnxt_qplib_hwq *sq_hwq; + struct bnxt_qplib_swq *swq; + int cqe_sq_cons = -1; + u32 start, last; + + sq_hwq = &sq->hwq; + + start = sq->swq_start; + last = sq->swq_last; + + while (last != start) { + swq = &sq->swq[last]; + if (swq->slot_idx == cqe_slot) { + cqe_sq_cons = swq->next_idx; + dev_err(&sq_hwq->pdev->dev, "%s: Found cons wqe = %d slot = %d\n", + __func__, cqe_sq_cons, cqe_slot); + break; + } + + last = swq->next_idx; + } + return cqe_sq_cons; +} + static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, struct cq_req *hwcqe, struct bnxt_qplib_cqe **pcqe, int *budget, @@ -2477,9 +2612,10 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_swq *swq; struct bnxt_qplib_cqe *cqe; + u32 cqe_sq_cons, slot_num; struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *sq; - u32 cqe_sq_cons; + int cqe_cons; int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) @@ -2491,12 +2627,26 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, } sq = &qp->sq; - cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe; + cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_sw_wqe; if 
(qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } + + if (__is_err_cqe_for_var_wqe(qp, hwcqe->status)) { + slot_num = le16_to_cpu(hwcqe->sq_cons_idx); + cqe_cons = bnxt_qplib_get_cqe_sq_cons(sq, slot_num); + if (cqe_cons < 0) { + dev_err(&cq->hwq.pdev->dev, "%s: Wrong SQ cons cqe_slot_indx = %d\n", + __func__, slot_num); + goto done; + } + cqe_sq_cons = cqe_cons; + dev_err(&cq->hwq.pdev->dev, "%s: cqe_sq_cons = %d swq_last = %d swq_start = %d\n", + __func__, cqe_sq_cons, sq->swq_last, sq->swq_start); + } + /* Require to walk the sq's swq to fabricate CQEs for all previously * signaled SWQEs due to CQE aggregation from the current sq cons * to the cqe_sq_cons @@ -2534,10 +2684,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } else { /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sq->swq_last, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->cctx)) { + if (do_wa9060(qp, cq, cq_cons, sq->swq_last, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { cqe->status = CQ_REQ_STATUS_OK; @@ -2881,7 +3033,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); if (cqe_cons == 0xFFFF) goto do_rq; - cqe_cons %= sq->max_wqe; + cqe_cons %= sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 7fd4506b3584..0d9487c889ff 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -105,6 +105,7 @@ struct bnxt_qplib_srq { struct bnxt_qplib_sg_info sg_info; u16 eventq_hw_ring_id; spinlock_t lock; /* protect SRQE link list */ + u8 toggle; }; struct bnxt_qplib_sge { @@ -113,7 +114,6 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_QP_MAX_SGL 6 struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -153,7 +153,7 @@ struct bnxt_qplib_swqe { #define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) #define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) #define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) - struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + struct bnxt_qplib_sge sg_list[BNXT_VAR_MAX_SGE]; int num_sge; /* Max inline data is 96 bytes */ u32 inline_len; @@ -164,12 +164,12 @@ struct bnxt_qplib_swqe { /* Send, with imm, inval key */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u32 q_key; u32 dst_qp; - u16 avid; + u32 avid; } send; /* Send Raw Ethernet and QP1 */ @@ -182,7 +182,7 @@ struct bnxt_qplib_swqe { /* RDMA write, with imm, read */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u64 remote_va; @@ -251,6 +251,7 @@ struct bnxt_qplib_q { struct bnxt_qplib_db_info dbinfo; struct bnxt_qplib_sg_info sg_info; u32 max_wqe; + u32 max_sw_wqe; u16 wqe_size; u16 q_full_delta; u16 max_sge; @@ -297,6 +298,7 @@ struct bnxt_qplib_qp { u32 dest_qpn; u8 smac[6]; u16 vlan_id; + u16 port_id; u8 nw_type; struct bnxt_qplib_ah ah; @@ -340,7 +342,8 @@ struct bnxt_qplib_qp { struct list_head rq_flush; u32 msn; u32 msn_tbl_sz; - u16 dev_cap_flags; + bool is_host_msn_tbl; + u8 tos_dscp; }; #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) @@ -381,6 +384,25 @@ static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *que, return avail <= slots; } +/* CQ coalescing parameters */ +struct bnxt_qplib_cq_coal_param { + u16 buf_maxtime; + u8 normal_maxbuf; + u8 
during_maxbuf; + u8 en_ring_idle_mode; +}; + +#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE 0x1 +#define BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME 0x1bf +#define BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE 0x1 + struct bnxt_qplib_cqe { u8 status; u8 type; @@ -389,7 +411,7 @@ struct bnxt_qplib_cqe { u16 cfa_meta; u64 wr_id; union { - __be32 immdata; + u32 immdata; u32 invrkey; }; u64 qp_handle; @@ -443,6 +465,7 @@ struct bnxt_qplib_cq { */ spinlock_t flush_lock; /* QP flush management */ u16 cnq_events; + struct bnxt_qplib_cq_coal_param *coalescing; }; #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) @@ -497,6 +520,7 @@ struct bnxt_qplib_nq { struct tasklet_struct nq_tasklet; bool requested; int budget; + u32 load; cqn_handler_t cqn_handler; srqn_handler_t srqn_handler; @@ -586,15 +610,22 @@ static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx) que->swq_start = que->swq[idx].next_idx; } -static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que) +static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que, u8 wqe_mode, bool is_sq) { - return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + u32 slots; + + /* Queue depth is the number of slots. */ + slots = (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + /* For variable WQE mode, need to align the slots to 256 */ + if (wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE && is_sq) + slots = ALIGN(slots, BNXT_VAR_MAX_SLOT_ALIGN); + return slots; } static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode) { return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? 
- que->max_wqe : bnxt_qplib_get_depth(que); + que->max_wqe : bnxt_qplib_get_depth(que, wqe_mode, true); } static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode) @@ -641,4 +672,14 @@ static inline __le64 bnxt_re_update_msn_tbl(u32 st_idx, u32 npsn, u32 start_psn) (((start_psn) << SQ_MSN_SEARCH_START_PSN_SFT) & SQ_MSN_SEARCH_START_PSN_MASK)); } + +static inline bool __is_var_wqe(struct bnxt_qplib_qp *qp) +{ + return (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE); +} + +static inline bool __is_err_cqe_for_var_wqe(struct bnxt_qplib_qp *qp, u8 status) +{ + return (status != CQ_REQ_STATUS_OK) && __is_var_wqe(qp); +} #endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3ffaef0c2651..804bc773b4ef 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -160,7 +160,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) wait_event_timeout(cmdq->waitq, !crsqe->is_in_used || test_bit(ERR_DEVICE_DETACHED, &cmdq->flags), - msecs_to_jiffies(rcfw->max_timeout * 1000)); + secs_to_jiffies(rcfw->max_timeout)); if (!crsqe->is_in_used) return 0; @@ -290,7 +290,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_hwq *hwq; u32 sw_prod, cmdq_prod; struct pci_dev *pdev; - unsigned long flags; u16 cookie; u8 *preq; @@ -301,7 +300,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ - spin_lock_irqsave(&hwq->lock, flags); + spin_lock_bh(&hwq->lock); required_slots = bnxt_qplib_get_cmd_slots(msg->req); free_slots = HWQ_FREE_SLOTS(hwq); cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; @@ -311,7 +310,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, dev_info_ratelimited(&pdev->dev, "CMDQ is full req/free %d/%d!", required_slots, free_slots); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); return -EAGAIN; } if (msg->block) @@ -367,7 +366,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, wmb(); writel(cmdq_prod, cmdq->cmdq_mbox.prod); writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); /* Return the CREQ response pointer */ return 0; } @@ -425,7 +424,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return bnxt_qplib_map_rc(opcode); + return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -486,7 +486,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; struct bnxt_qplib_crsqe *crsqe; - unsigned long flags; u16 cookie; int rc; u8 opcode; @@ -495,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) - return rc; + return rc == -ENXIO ? 
bnxt_qplib_map_rc(opcode) : rc; rc = __send_message(rcfw, msg, opcode); if (rc) @@ -512,12 +511,12 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __poll_for_resp(rcfw, cookie); if (rc) { - spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); + spin_lock_bh(&rcfw->cmdq.hwq.lock); crsqe = &rcfw->crsqe_tbl[cookie]; crsqe->is_waiter_alive = false; if (rc == -ENODEV) set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); - spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); + spin_unlock_bh(&rcfw->cmdq.hwq.lock); return -ETIMEDOUT; } @@ -525,7 +524,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* failed with status */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", cookie, opcode, evnt->status); - rc = -EFAULT; + rc = -EIO; } return rc; @@ -628,7 +627,6 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, u16 cookie, blocked = 0; bool is_waiter_alive; struct pci_dev *pdev; - unsigned long flags; u32 wait_cmds = 0; int rc = 0; @@ -637,17 +635,21 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: err_event = (struct creq_qp_error_notification *)qp_event; qp_id = le32_to_cpu(err_event->xid); + spin_lock(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp_id, rcfw); qp = rcfw->qp_tbl[tbl_indx].qp_handle; + if (!qp) { + spin_unlock(&rcfw->tbl_lock); + break; + } + bnxt_qplib_mark_qp_error(qp); + rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); + spin_unlock(&rcfw->tbl_lock); dev_dbg(&pdev->dev, "Received QP error notification\n"); dev_dbg(&pdev->dev, "qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", qp_id, err_event->req_err_state_reason, err_event->res_err_state_reason); - if (!qp) - break; - bnxt_qplib_mark_qp_error(qp); - rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); break; default: /* @@ -659,8 +661,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, * */ - spin_lock_irqsave_nested(&hwq->lock, flags, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING); cookie = le16_to_cpu(qp_event->cookie); blocked = cookie & RCFW_CMD_IS_BLOCKING; cookie &= RCFW_MAX_COOKIE_VALUE; @@ -672,7 +673,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, dev_info(&pdev->dev, "rcfw timedout: cookie = %#x, free_slots = %d", cookie, crsqe->free_slots); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock(&hwq->lock); return rc; } @@ -720,7 +721,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, __destroy_timedout_ah(rcfw, (struct creq_create_ah_resp *) qp_event); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock(&hwq->lock); } *num_wait += wait_cmds; return rc; @@ -734,12 +735,11 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) u32 type, budget = CREQ_ENTRY_POLL_BUDGET; struct bnxt_qplib_hwq *hwq = &creq->hwq; struct creq_base *creqe; - unsigned long flags; u32 num_wakeup = 0; u32 hw_polled = 0; /* Service the CREQ until budget is over */ - spin_lock_irqsave(&hwq->lock, flags); + spin_lock_bh(&hwq->lock); while (budget > 0) { creqe = bnxt_qplib_get_qe(hwq, hwq->cons, NULL); if (!CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags)) @@ -782,7 +782,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) if (hw_polled) bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); if (num_wakeup) wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); } @@ -832,6 +832,7 @@ int 
bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct creq_initialize_fw_resp resp = {}; struct cmdq_initialize_fw req = {}; struct bnxt_qplib_cmdqmsg msg = {}; + u16 flags = 0; u8 pgsz, lvl; int rc; @@ -850,10 +851,8 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, * shall setup this area for VF. Skipping the * HW programming */ - if (is_virtfn) + if (is_virtfn || bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) goto skip_ctx_setup; - if (bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) - goto config_vf_res; lvl = ctx->qpc_tbl.level; pgsz = bnxt_qplib_base_pg_size(&ctx->qpc_tbl); @@ -897,16 +896,14 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements); req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements); -config_vf_res: - req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); - req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); - req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); - req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); - req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); - skip_ctx_setup: if (BNXT_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) - req.flags |= cpu_to_le16(CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED); + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + if (rcfw->res->en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT) + flags |= CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT; + req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -918,7 +915,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq); @@ -927,8 +923,7 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz) + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; @@ -972,12 +967,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - /* Allocate one extra to hold the QP1 entries */ - rcfw->qp_tbl_size = qp_tbl_sz + 1; - rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), - GFP_KERNEL); - if (!rcfw->qp_tbl) - goto fail; + spin_lock_init(&rcfw->tbl_lock); rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 45996e60a0d0..ff873c5f1b25 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -131,6 +131,8 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_CMD_IS_BLOCKING 0x8000 #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL +/* HWRM version 1.10.3.18 */ +#define HWRM_VERSION_READ_CTX 0x1000A00030012 /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { @@ -224,6 +226,8 @@ struct bnxt_qplib_rcfw { struct bnxt_qplib_crsqe *crsqe_tbl; int qp_tbl_size; struct bnxt_qplib_qp_node *qp_tbl; + /* To synchronize the qp-handle hash table */ + 
spinlock_t tbl_lock; u64 oos_prev; u32 init_oos_stats; u32 cmdq_depth; @@ -258,8 +262,7 @@ static inline void bnxt_qplib_fill_cmdqmsg(struct bnxt_qplib_cmdqmsg *msg, void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz); + struct bnxt_qplib_ctx *ctx); void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill); void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, @@ -281,9 +284,10 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn); void bnxt_qplib_mark_qp_error(void *qp_handle); + static inline u32 map_qp_id_to_tbl_indx(u32 qid, struct bnxt_qplib_rcfw *rcfw) { /* Last index of the qp_tbl is for QP1 ie. qp_tbl_size - 1*/ - return (qid == 1) ? rcfw->qp_tbl_size - 1 : qid % rcfw->qp_tbl_size - 2; + return (qid == 1) ? rcfw->qp_tbl_size - 1 : (qid % (rcfw->qp_tbl_size - 2)); } #endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index dfc943fab87b..6cd05207ffed 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -244,6 +244,8 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, sginfo.pgsize = npde * pg_size; sginfo.npages = 1; rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo); + if (rc) + goto fail; /* Alloc PBL pages */ sginfo.npages = npbl; @@ -255,22 +257,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, dst_virt_ptr = (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; - if (hwq_attr->type == HWQ_TYPE_MR) { - /* For MR it is expected that we supply only 1 contigous - * page i.e only 1 entry in the PDL that will contain - * all the PBLs for the user supplied memory region - */ - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[0][i] = src_phys_ptr[i] | - flag; - } else { - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = - src_phys_ptr[i] | - PTU_PDE_VALID; - } + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + /* Alloc or init PTEs */ rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_2], hwq_attr->sginfo); @@ -882,19 +871,27 @@ int bnxt_qplib_init_res(struct bnxt_qplib_res *res) void bnxt_qplib_free_res(struct bnxt_qplib_res *res) { + kfree(res->rcfw->qp_tbl); bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); bnxt_qplib_free_pd_tbl(&res->pd_tbl); bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); } -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr) +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev) { + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct bnxt_qplib_dev_attr *dev_attr; int rc; - res->pdev = pdev; res->netdev = netdev; + dev_attr = res->dattr; + + /* Allocate one extra to hold the QP1 entries */ + rcfw->qp_tbl_size = max_t(u32, BNXT_RE_MAX_QPC_COUNT + 1, dev_attr->max_qp); + rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + return -ENOMEM; rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); if (rc) diff --git 
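The map_qp_id_to_tbl_indx() change just above is an operator-precedence fix: '%' binds tighter than '-', so the old expression evaluated as (qid % qp_tbl_size) - 2 and, being unsigned, wraps to a huge index whenever the remainder is 0 or 1; the new form confines the hash to indices below qp_tbl_size - 2, clear of the reserved QP1 slot at the end of the table. A standalone comparison of the two expressions with a made-up table size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t qp_tbl_size = 1025;	/* hypothetical table size */
	uint32_t qids[] = { 2, 1024, 1025, 2050 };
	unsigned int i;

	for (i = 0; i < sizeof(qids) / sizeof(qids[0]); i++) {
		uint32_t qid = qids[i];
		uint32_t old_idx = qid % qp_tbl_size - 2;	/* parsed as (qid % size) - 2 */
		uint32_t new_idx = qid % (qp_tbl_size - 2);	/* intended hashing */

		printf("qid=%-5u old=%-10u new=%u\n", qid, old_idx, new_idx);
	}
	return 0;
}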
a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 61628f7f1253..6a13927674b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -39,6 +39,8 @@ #ifndef __BNXT_QPLIB_RES_H__ #define __BNXT_QPLIB_RES_H__ +#include "bnxt_ulp.h" + extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_57508 0x1750 @@ -47,6 +49,13 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_58818 0xd818 #define CHIP_NUM_57608 0x1760 +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + #define BNXT_QPLIB_DBR_VALID (0x1UL << 26) #define BNXT_QPLIB_DBR_EPOCH_SHIFT 24 #define BNXT_QPLIB_DBR_TOGGLE_SHIFT 25 @@ -82,6 +91,7 @@ struct bnxt_qplib_db_pacing_data { u32 fifo_room_mask; u32 fifo_room_shift; u32 grc_reg_offset; + u32 dev_err_state; }; #define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 @@ -301,6 +311,7 @@ struct bnxt_qplib_res { struct bnxt_qplib_chip_ctx *cctx; struct bnxt_qplib_dev_attr *dattr; struct net_device *netdev; + struct bnxt_en_dev *en_dev; struct bnxt_qplib_rcfw *rcfw; struct bnxt_qplib_pd_tbl pd_tbl; /* To protect the pd table bit map */ @@ -420,9 +431,7 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr); +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev); void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, @@ -545,6 +554,14 @@ static inline bool _is_ext_stats_supported(u16 dev_cap_flags) CREQ_QUERY_FUNC_RESP_SB_EXT_STATS; } +static inline int bnxt_ext_stats_supported(struct bnxt_qplib_chip_ctx *ctx, + u16 flags, bool virtfn) +{ + /* ext stats supported if cap flag is set AND is a PF OR a Thor2 VF */ + return (_is_ext_stats_supported(flags) && + ((virtfn && bnxt_qplib_is_chip_gen_p7(ctx)) || (!virtfn))); +} + static inline bool _is_hw_retx_supported(u16 dev_cap_flags) { return dev_cap_flags & @@ -554,9 +571,45 @@ static inline bool _is_hw_retx_supported(u16 dev_cap_flags) #define BNXT_RE_HW_RETX(a) _is_hw_retx_supported((a)) +static inline bool _is_host_msn_table(u16 dev_cap_ext_flags2) +{ + return (dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK) == + CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE; +} + static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) { return cctx->modes.dbr_pacing; } +static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) +{ + return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; +} + +static inline bool _is_relaxed_ordering_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED; +} + +static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; +} + +static inline bool _is_min_rnr_in_rtr_rts_mandatory(u16 dev_cap_ext_flags2) +{ + return !!(dev_cap_ext_flags2 & 
CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED); +} + +static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; +} + +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 8beeedd15061..9efd32a3dc55 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -88,18 +88,20 @@ static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, fw_ver[3] = resp.fw_rsvd; } -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr) +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) { + struct bnxt_qplib_dev_attr *attr = rcfw->res->dattr; struct creq_query_func_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct creq_query_func_resp_sb *sb; struct bnxt_qplib_rcfw_sbuf sbuf; + struct bnxt_qplib_chip_ctx *cctx; struct cmdq_query_func req = {}; u8 *tqm_alloc; int i, rc; u32 temp; + cctx = rcfw->res->cctx; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_FUNC, sizeof(req)); @@ -127,16 +129,25 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_init_rd_atom = sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; - attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); - /* - * 128 WQEs needs to be reserved for the HW (8916). Prevent - * reporting the max number - */ - attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; - attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx) ? - 6 : sb->max_sge; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; + if (!bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) { + /* + * 128 WQEs needs to be reserved for the HW (8916). Prevent + * reporting the max number on legacy devices + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + } + + /* Adjust for max_qp_wqes for variable wqe */ + if (cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + attr->max_qp_wqes = BNXT_VAR_MAX_WQE - 1; + + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? + min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) + attr->max_cq_wqes = min_t(u32, BNXT_QPLIB_MAX_CQ_WQES, attr->max_cq_wqes); attr->max_cq_sges = attr->max_qp_sges; attr->max_mr = le32_to_cpu(sb->max_mr); attr->max_mw = le32_to_cpu(sb->max_mw); @@ -154,8 +165,19 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) attr->l2_db_size = (sb->l2_db_space_size + 1) * (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); - attr->max_sgid = BNXT_QPLIB_NUM_GIDS_SUPPORTED; + /* + * Read the max gid supported by HW. + * For each entry in HW GID in HW table, we consume 2 + * GID entries in the kernel GID table. So max_gid reported + * to stack can be up to twice the value reported by the HW, up to 256 gids. 
+ */ + attr->max_sgid = le32_to_cpu(sb->max_gid); + attr->max_sgid = min_t(u32, BNXT_QPLIB_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid); attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); + attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); bnxt_qplib_query_version(rcfw, attr->fw_ver); @@ -540,7 +562,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) req.pd_id = cpu_to_le32(mrw->pd->id); req.mrw_flags = mrw->type; if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && - mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->access_flags & BNXT_QPLIB_FR_PMR) || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; @@ -652,9 +674,12 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); - req.access = (mr->flags & 0xFFFF); + req.access = (mr->access_flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + req.key = cpu_to_le32(mr->pd->id); + req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), @@ -663,6 +688,11 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + mr->lkey = le32_to_cpu(resp.xid); + mr->rkey = mr->lkey; + } + return 0; fail: @@ -816,7 +846,12 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS; req.resp_addr = cpu_to_le64(sbuf.dma_addr); - req.function_id = cpu_to_le32(fid); + if (bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx) && rcfw->res->is_vf) + req.function_id = + cpu_to_le32(CMDQ_QUERY_ROCE_STATS_EXT_VF_VALID | + (fid << CMDQ_QUERY_ROCE_STATS_EXT_VF_NUM_SFT)); + else + req.function_id = cpu_to_le32(fid); req.flags = cpu_to_le16(CMDQ_QUERY_ROCE_STATS_EXT_FLAGS_FUNCTION_ID); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), @@ -960,3 +995,151 @@ int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); return rc; } + +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 res_type, + u32 xid, u32 resp_size, void *resp_va) +{ + struct creq_read_context resp = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_read_context req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + int rc; + + sbuf.size = resp_size; + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_READ_CONTEXT, sizeof(req)); + req.resp_addr = cpu_to_le64(sbuf.dma_addr); + req.resp_size = resp_size / BNXT_QPLIB_CMDQE_UNITS; + + req.xid = cpu_to_le32(xid); + req.type = res_type; + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); + if (rc) + goto free_mem; + + memcpy(resp_va, sbuf.sb, resp_size); +free_mem: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} + +static void bnxt_qplib_read_cc_gen1(struct bnxt_qplib_cc_param_ext *cc_ext, + struct 
creq_query_roce_cc_gen1_resp_sb_tlv *sb) +{ + cc_ext->inact_th_hi = le16_to_cpu(sb->inactivity_th_hi); + cc_ext->min_delta_cnp = le16_to_cpu(sb->min_time_between_cnps); + cc_ext->init_cp = le16_to_cpu(sb->init_cp); + cc_ext->tr_update_mode = sb->tr_update_mode; + cc_ext->tr_update_cyls = sb->tr_update_cycles; + cc_ext->fr_rtt = sb->fr_num_rtts; + cc_ext->ai_rate_incr = sb->ai_rate_increase; + cc_ext->rr_rtt_th = le16_to_cpu(sb->reduction_relax_rtts_th); + cc_ext->ar_cr_th = le16_to_cpu(sb->additional_relax_cr_th); + cc_ext->cr_min_th = le16_to_cpu(sb->cr_min_th); + cc_ext->bw_avg_weight = sb->bw_avg_weight; + cc_ext->cr_factor = sb->actual_cr_factor; + cc_ext->cr_th_max_cp = le16_to_cpu(sb->max_cp_cr_th); + cc_ext->cp_bias_en = sb->cp_bias_en; + cc_ext->cp_bias = sb->cp_bias; + cc_ext->cnp_ecn = sb->cnp_ecn; + cc_ext->rtt_jitter_en = sb->rtt_jitter_en; + cc_ext->bytes_per_usec = le16_to_cpu(sb->link_bytes_per_usec); + cc_ext->cc_cr_reset_th = le16_to_cpu(sb->reset_cc_cr_th); + cc_ext->cr_width = sb->cr_width; + cc_ext->min_quota = sb->quota_period_min; + cc_ext->max_quota = sb->quota_period_max; + cc_ext->abs_max_quota = sb->quota_period_abs_max; + cc_ext->tr_lb = le16_to_cpu(sb->tr_lower_bound); + cc_ext->cr_prob_fac = sb->cr_prob_factor; + cc_ext->tr_prob_fac = sb->tr_prob_factor; + cc_ext->fair_cr_th = le16_to_cpu(sb->fairness_cr_th); + cc_ext->red_div = sb->red_div; + cc_ext->cnp_ratio_th = sb->cnp_ratio_th; + cc_ext->ai_ext_rtt = le16_to_cpu(sb->exp_ai_rtts); + cc_ext->exp_crcp_ratio = sb->exp_ai_cr_cp_ratio; + cc_ext->low_rate_en = sb->use_rate_table; + cc_ext->cpcr_update_th = le16_to_cpu(sb->cp_exp_update_th); + cc_ext->ai_rtt_th1 = le16_to_cpu(sb->high_exp_ai_rtts_th1); + cc_ext->ai_rtt_th2 = le16_to_cpu(sb->high_exp_ai_rtts_th2); + cc_ext->cf_rtt_th = le16_to_cpu(sb->actual_cr_cong_free_rtts_th); + cc_ext->sc_cr_th1 = le16_to_cpu(sb->severe_cong_cr_th1); + cc_ext->sc_cr_th2 = le16_to_cpu(sb->severe_cong_cr_th2); + cc_ext->l64B_per_rtt = le32_to_cpu(sb->link64B_per_rtt); + cc_ext->cc_ack_bytes = sb->cc_ack_bytes; + cc_ext->reduce_cf_rtt_th = le16_to_cpu(sb->reduce_init_cong_free_rtts_th); +} + +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param) +{ + struct bnxt_qplib_tlv_query_rcc_sb *ext_sb; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_query_roce_cc_resp resp = {}; + struct creq_query_roce_cc_resp_sb *sb; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_query_roce_cc req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + size_t resp_size; + int rc; + + /* Query the parameters from chip */ + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_ROCE_CC, + sizeof(req)); + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) + resp_size = sizeof(*ext_sb); + else + resp_size = sizeof(*sb); + + sbuf.size = ALIGN(resp_size, BNXT_QPLIB_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS; + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); + if (rc) + goto out; + + ext_sb = sbuf.sb; + sb = bnxt_qplib_is_chip_gen_p5_p7(res->cctx) ? 
&ext_sb->base_sb : + (struct creq_query_roce_cc_resp_sb *)ext_sb; + + cc_param->enable = sb->enable_cc & CREQ_QUERY_ROCE_CC_RESP_SB_ENABLE_CC; + cc_param->tos_ecn = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_SFT; + cc_param->tos_dscp = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_SFT; + cc_param->alt_tos_dscp = sb->alt_tos_dscp; + cc_param->alt_vlan_pcp = sb->alt_vlan_pcp; + + cc_param->g = sb->g; + cc_param->nph_per_state = sb->num_phases_per_state; + cc_param->init_cr = le16_to_cpu(sb->init_cr); + cc_param->init_tr = le16_to_cpu(sb->init_tr); + cc_param->cc_mode = sb->cc_mode; + cc_param->inact_th = le16_to_cpu(sb->inactivity_th); + cc_param->rtt = le16_to_cpu(sb->rtt); + cc_param->tcp_cp = le16_to_cpu(sb->tcp_cp); + cc_param->time_pph = sb->time_per_phase; + cc_param->pkts_pph = sb->pkts_per_phase; + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) { + bnxt_qplib_read_cc_gen1(&cc_param->cc_ext, &ext_sb->gen1_sb); + cc_param->inact_th |= (cc_param->cc_ext.inact_th_hi & 0x3F) << 16; + } +out: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index d33c78b96217..e626b05038a1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -40,6 +40,7 @@ #ifndef __BNXT_QPLIB_SP_H__ #define __BNXT_QPLIB_SP_H__ +#include <rdma/bnxt_re-abi.h> #define BNXT_QPLIB_RESERVED_QP_WRS 128 struct bnxt_qplib_dev_attr { @@ -55,6 +56,7 @@ struct bnxt_qplib_dev_attr { u32 max_qp_wqes; u32 max_qp_sges; u32 max_cq; +#define BNXT_QPLIB_MAX_CQ_WQES 0xfffff u32 max_cq_wqes; u32 max_cq_sges; u32 max_mr; @@ -72,6 +74,7 @@ struct bnxt_qplib_dev_attr { u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; bool is_atomic; u16 dev_cap_flags; + u16 dev_cap_flags2; u32 max_dpi; }; @@ -107,7 +110,7 @@ struct bnxt_qplib_ah { struct bnxt_qplib_mrw { struct bnxt_qplib_pd *pd; int type; - u32 flags; + u32 access_flags; #define BNXT_QPLIB_FR_PMR 0x80000000 u32 lkey; u32 rkey; @@ -115,6 +118,7 @@ struct bnxt_qplib_mrw { u64 va; u64 total_size; u32 npages; + u16 flags; u64 mr_handle; struct bnxt_qplib_hwq hwq; }; @@ -292,6 +296,7 @@ struct bnxt_qplib_cc_param_ext { struct bnxt_qplib_cc_param { u8 alt_vlan_pcp; + u8 qp1_tos_dscp; u16 alt_tos_dscp; u8 cc_mode; u8 enable; @@ -321,8 +326,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 gid_idx, const u8 *smac); -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr); +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx); @@ -349,5 +353,16 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, struct bnxt_qplib_ext_stat *estat); int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, + u32 resp_size, void *resp_va); +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param); + +#define BNXT_VAR_MAX_WQE 4352 +#define BNXT_VAR_MAX_SLOT_ALIGN 256 +#define BNXT_VAR_MAX_SGE 13 +#define BNXT_RE_MAX_RQ_WQES 65536 + +#define BNXT_STATIC_MAX_SGE 6 #endif /* 
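In bnxt_qplib_query_cc_param() just above, the inactivity threshold on the newer chips is wider than the 16-bit inactivity_th field, so the upper bits come back separately in inactivity_th_hi and are stitched in with (inact_th_hi & 0x3F) << 16, giving a 22-bit value. A small sketch of that reassembly (field widths as in the code above; the sample numbers are made up):

#include <stdint.h>
#include <stdio.h>

/* Rebuild the 22-bit inactivity threshold from its two response fields. */
static uint32_t cc_inact_th(uint16_t inactivity_th, uint16_t inact_th_hi)
{
	return (uint32_t)inactivity_th | ((uint32_t)(inact_th_hi & 0x3F) << 16);
}

int main(void)
{
	/* Made-up sample: low 16 bits 0x1234, high 6 bits 0x2 -> 0x21234 */
	printf("inact_th = 0x%x\n", cc_inact_th(0x1234, 0x2));
	return 0;
}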
__BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 605c9463c408..7eceb3e9f4ce 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -216,6 +216,8 @@ struct cmdq_initialize_fw { __le16 flags; #define CMDQ_INITIALIZE_FW_FLAGS_MRAV_RESERVATION_SPLIT 0x1UL #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL + #define CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL + #define CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT 0x10UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -409,7 +411,7 @@ struct creq_deinitialize_fw_resp { u8 reserved48[6]; }; -/* cmdq_create_qp (size:768b/96B) */ +/* cmdq_create_qp (size:832b/104B) */ struct cmdq_create_qp { u8 opcode; #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL @@ -430,8 +432,11 @@ struct cmdq_create_qp { #define CMDQ_CREATE_QP_QP_FLAGS_OPTIMIZED_TRANSMIT_ENABLED 0x20UL #define CMDQ_CREATE_QP_QP_FLAGS_RESPONDER_UD_CQE_WITH_CFA 0x40UL #define CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED 0x80UL + #define CMDQ_CREATE_QP_QP_FLAGS_EXPRESS_MODE_ENABLED 0x100UL + #define CMDQ_CREATE_QP_QP_FLAGS_STEERING_TAG_VALID 0x200UL + #define CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED 0x400UL #define CMDQ_CREATE_QP_QP_FLAGS_LAST \ - CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED + CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED u8 type; #define CMDQ_CREATE_QP_TYPE_RC 0x2UL #define CMDQ_CREATE_QP_TYPE_UD 0x4UL @@ -492,6 +497,9 @@ struct cmdq_create_qp { __le64 rq_pbl; __le64 irrq_addr; __le64 orrq_addr; + __le32 request_xid; + __le16 steering_tag; + __le16 reserved16; }; /* creq_create_qp_resp (size:128b/16B) */ @@ -553,6 +561,7 @@ struct cmdq_modify_qp { #define CMDQ_MODIFY_QP_OPCODE_LAST CMDQ_MODIFY_QP_OPCODE_MODIFY_QP u8 cmd_size; __le16 flags; + #define CMDQ_MODIFY_QP_FLAGS_SRQ_USED 0x1UL __le16 cookie; u8 resp_size; u8 qp_type; @@ -972,13 +981,14 @@ struct creq_query_qp_extend_resp_sb_tlv { __le16 reserved_16; }; -/* cmdq_create_srq (size:384b/48B) */ +/* cmdq_create_srq (size:448b/56B) */ struct cmdq_create_srq { u8 opcode; #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL #define CMDQ_CREATE_SRQ_OPCODE_LAST CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ u8 cmd_size; __le16 flags; + #define CMDQ_CREATE_SRQ_FLAGS_STEERING_TAG_VALID 0x1UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1012,6 +1022,8 @@ struct cmdq_create_srq { __le32 dpi; __le32 pd_id; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_srq_resp (size:128b/16B) */ @@ -1118,7 +1130,7 @@ struct creq_query_srq_resp_sb { __le32 data[4]; }; -/* cmdq_create_cq (size:384b/48B) */ +/* cmdq_create_cq (size:448b/56B) */ struct cmdq_create_cq { u8 opcode; #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL @@ -1126,6 +1138,9 @@ struct cmdq_create_cq { u8 cmd_size; __le16 flags; #define CMDQ_CREATE_CQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION 0x1UL + #define CMDQ_CREATE_CQ_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_CREATE_CQ_FLAGS_INFINITE_CQ_MODE 0x4UL + #define CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID 0x8UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1157,6 +1172,19 @@ struct cmdq_create_cq { __le32 dpi; __le32 cq_size; __le64 pbl; + __le16 steering_tag; + u8 reserved48[2]; + __le32 coalescing; + #define CMDQ_CREATE_CQ_BUF_MAXTIME_MASK 0x1ffUL + #define CMDQ_CREATE_CQ_BUF_MAXTIME_SFT 0 + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK 0x3e00UL + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT 9 + #define CMDQ_CREATE_CQ_DURING_MAXBUF_MASK 0x7c000UL + #define 
CMDQ_CREATE_CQ_DURING_MAXBUF_SFT 14 + #define CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE 0x80000UL + #define CMDQ_CREATE_CQ_UNUSED12_MASK 0xfff00000UL + #define CMDQ_CREATE_CQ_UNUSED12_SFT 20 + __le64 reserved64; }; /* creq_create_cq_resp (size:128b/16B) */ @@ -1288,11 +1316,12 @@ struct cmdq_allocate_mrw { #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_LAST CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B - #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xf0UL - #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 4 + #define CMDQ_ALLOCATE_MRW_STEERING_TAG_VALID 0x10UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xe0UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 5 u8 access; #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL - __le16 unused16; + __le16 steering_tag; __le32 pd_id; }; @@ -1359,14 +1388,16 @@ struct creq_deallocate_key_resp { __le32 bound_window_info; }; -/* cmdq_register_mr (size:384b/48B) */ +/* cmdq_register_mr (size:448b/56B) */ struct cmdq_register_mr { u8 opcode; #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL #define CMDQ_REGISTER_MR_OPCODE_LAST CMDQ_REGISTER_MR_OPCODE_REGISTER_MR u8 cmd_size; __le16 flags; - #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_REGISTER_MR_FLAGS_ENABLE_RO 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1415,6 +1446,8 @@ struct cmdq_register_mr { __le64 pbl; __le64 va; __le64 mr_size; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_register_mr_resp (size:128b/16B) */ @@ -2157,8 +2190,38 @@ struct creq_query_func_resp_sb { __le32 tqm_alloc_reqs[12]; __le32 max_dpi; u8 max_sge_var_wqe; - u8 reserved_8; + u8 dev_cap_ext_flags; + #define CREQ_QUERY_FUNC_RESP_SB_ATOMIC_OPS_NOT_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_DRV_VERSION_RGTR_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_QP_BATCH_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_DESTROY_QP_BATCH_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_ROCE_STATS_EXT_CTX_SUPPORTED 0x10UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_SRQ_SGE_SUPPORTED 0x20UL + #define CREQ_QUERY_FUNC_RESP_SB_FIXED_SIZE_WQE_DISABLED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_DCN_SUPPORTED 0x80UL __le16 max_inline_data_var_wqe; + __le32 start_qid; + u8 max_msn_table_size; + u8 reserved8_1; + __le16 dev_cap_ext_flags_2; + #define CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_CHANGE_UDP_SRC_PORT_WQE_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK 0x30UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_SFT 4 + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_PSN_TABLE (0x0UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE (0x1UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ + CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL + __le16 max_xp_qp_size; + __le16 create_qp_batch_size; + __le16 destroy_qp_batch_size; + __le16 max_srq_ext; + __le64 reserved64; }; /* cmdq_set_func_resources (size:448b/56B) 
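The coalescing word added to cmdq_create_cq above packs four sub-fields into one 32-bit value using the BUF_MAXTIME, NORMAL_MAXBUF and DURING_MAXBUF masks plus the ring-idle-mode bit. A userspace sketch of that packing, fed with the P7 defaults from the qplib_fp.h hunk earlier in this diff (the driver's own helper may build the word differently):

#include <stdint.h>
#include <stdio.h>

/* Masks/shifts as defined for cmdq_create_cq in this patch. */
#define CQ_BUF_MAXTIME_MASK		0x1ffu
#define CQ_BUF_MAXTIME_SFT		0
#define CQ_NORMAL_MAXBUF_MASK		0x3e00u
#define CQ_NORMAL_MAXBUF_SFT		9
#define CQ_DURING_MAXBUF_MASK		0x7c000u
#define CQ_DURING_MAXBUF_SFT		14
#define CQ_ENABLE_RING_IDLE_MODE	0x80000u

static uint32_t pack_cq_coalescing(uint32_t maxtime, uint32_t normal_maxbuf,
				   uint32_t during_maxbuf, int ring_idle_mode)
{
	uint32_t w = 0;

	w |= (maxtime << CQ_BUF_MAXTIME_SFT) & CQ_BUF_MAXTIME_MASK;
	w |= (normal_maxbuf << CQ_NORMAL_MAXBUF_SFT) & CQ_NORMAL_MAXBUF_MASK;
	w |= (during_maxbuf << CQ_DURING_MAXBUF_SFT) & CQ_DURING_MAXBUF_MASK;
	if (ring_idle_mode)
		w |= CQ_ENABLE_RING_IDLE_MODE;
	return w;
}

int main(void)
{
	/* P7 defaults from qplib_fp.h: maxtime 0x1, both maxbuf 0x8, idle mode on. */
	printf("coalescing word: 0x%x\n", pack_cq_coalescing(0x1, 0x8, 0x8, 1));	/* 0xa1001 */
	return 0;
}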
*/ @@ -2205,6 +2268,46 @@ struct creq_set_func_resources_resp { u8 reserved48[6]; }; +/* cmdq_read_context (size:192b/24B) */ +struct cmdq_read_context { + u8 opcode; + #define CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT 0x85UL + #define CMDQ_READ_CONTEXT_OPCODE_LAST CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 xid; + u8 type; + #define CMDQ_READ_CONTEXT_TYPE_QPC 0x0UL + #define CMDQ_READ_CONTEXT_TYPE_CQ 0x1UL + #define CMDQ_READ_CONTEXT_TYPE_MRW 0x2UL + #define CMDQ_READ_CONTEXT_TYPE_SRQ 0x3UL + #define CMDQ_READ_CONTEXT_TYPE_LAST CMDQ_READ_CONTEXT_TYPE_SRQ + u8 unused_0[3]; +}; + +/* creq_read_context (size:128b/16B) */ +struct creq_read_context { + u8 type; + #define CREQ_READ_CONTEXT_TYPE_MASK 0x3fUL + #define CREQ_READ_CONTEXT_TYPE_SFT 0 + #define CREQ_READ_CONTEXT_TYPE_QP_EVENT 0x38UL + #define CREQ_READ_CONTEXT_TYPE_LAST CREQ_READ_CONTEXT_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_READ_CONTEXT_V 0x1UL + u8 event; + #define CREQ_READ_CONTEXT_EVENT_READ_CONTEXT 0x85UL + #define CREQ_READ_CONTEXT_EVENT_LAST CREQ_READ_CONTEXT_EVENT_READ_CONTEXT + __le16 reserved16; + __le32 reserved_32; +}; + /* cmdq_map_tc_to_cos (size:192b/24B) */ struct cmdq_map_tc_to_cos { u8 opcode; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 040ba2224f9f..e02721a9e288 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -191,7 +191,7 @@ static void start_ep_timer(struct c4iw_ep *ep) static int stop_ep_timer(struct c4iw_ep *ep) { pr_debug("ep %p stopping\n", ep); - del_timer_sync(&ep->timer); + timer_delete_sync(&ep->timer); if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { c4iw_put_ep(&ep->com); return 0; @@ -1222,6 +1222,8 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) int ret; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; pr_debug("ep %p tid %u snd_isn %u rcv_isn %u\n", ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); @@ -2084,7 +2086,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENOMEM; if (n->dev->flags & IFF_LOOPBACK) { if (iptype == 4) - pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + pdev = __ip_dev_find(&init_net, *(__be32 *)peer_ip, false); else if (IS_ENABLED(CONFIG_IPV6)) for_each_netdev(&init_net, pdev) { if (ipv6_chk_addr(&init_net, @@ -2099,12 +2101,12 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENODEV; goto out; } + if (is_vlan_dev(pdev)) + pdev = vlan_dev_real_dev(pdev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, n, pdev, rt_tos2priority(tos)); - if (!ep->l2t) { - dev_put(pdev); + if (!ep->l2t) goto out; - } ep->mtu = pdev->mtu; ep->tx_chan = cxgb4_port_chan(pdev); ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx; @@ -2117,7 +2119,6 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, ep->rss_qid = cdev->rdev.lldi.rxq_ids[ cxgb4_port_idx(pdev) * step]; set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); - dev_put(pdev); } else { pdev = get_real_dev(n->dev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, @@ -2279,6 +2280,9 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) int ret = 0; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; + la = (struct sockaddr_in *)&ep->com.local_addr; ra = (struct sockaddr_in *)&ep->com.remote_addr; la6 = (struct sockaddr_in6 *)&ep->com.local_addr; diff --git 
a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 7e2835dcbc1c..14ced7b667fa 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -995,8 +995,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) } int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; @@ -1125,13 +1126,19 @@ int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_free_mm2; mm->key = uresp.key; - mm->addr = virt_to_phys(chp->cq.queue); + mm->addr = 0; + mm->vaddr = chp->cq.queue; + mm->dma_addr = chp->cq.dma_addr; mm->len = chp->cq.memsize; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(ucontext, mm); mm2->key = uresp.gts_key; mm2->addr = chp->cq.bar2_pa; mm2->len = PAGE_SIZE; + mm2->vaddr = NULL; + mm2->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm2, mm2->addr); insert_mmap(ucontext, mm2); } diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 80970a1738f8..034b85c42255 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1114,8 +1114,10 @@ static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * The math here assumes sizeof cpl_pass_accept_req >= sizeof * cpl_rx_pkt. */ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + - sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)) - pktshift, + GFP_ATOMIC); if (unlikely(!skb)) return NULL; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index fb8a0c248866..5b3007acaa1f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -532,11 +532,21 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) return container_of(c, struct c4iw_ucontext, ibucontext); } +enum { + CXGB4_MMAP_BAR, + CXGB4_MMAP_BAR_WC, + CXGB4_MMAP_CONTIG, + CXGB4_MMAP_NON_CONTIG, +}; + struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; + void *vaddr; + dma_addr_t dma_addr; unsigned len; + u8 mmap_flag; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, @@ -561,6 +571,32 @@ static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, return NULL; } +static inline void insert_flag_to_mmap(struct c4iw_rdev *rdev, + struct c4iw_mm_entry *mm, u64 addr) +{ + if (addr >= pci_resource_start(rdev->lldi.pdev, 0) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) + mm->mmap_flag = CXGB4_MMAP_BAR; + else if (addr >= pci_resource_start(rdev->lldi.pdev, 2) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + if (addr >= rdev->oc_mw_pa) { + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } else { + if (is_t4(rdev->lldi.adapter_type)) + mm->mmap_flag = CXGB4_MMAP_BAR; + else + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } + } else { + if (addr) + mm->mmap_flag = CXGB4_MMAP_CONTIG; + else + mm->mmap_flag = CXGB4_MMAP_NON_CONTIG; + } +} + static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { @@ -936,7 +972,6 @@ u32 c4iw_get_resource(struct c4iw_id_table *id_table); void 
c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid, u32 nr_srqt); -int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); @@ -944,7 +979,6 @@ void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); void c4iw_destroy_resource(struct c4iw_resource *rscp); -int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); void c4iw_register_device(struct work_struct *work); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); @@ -978,7 +1012,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void c4iw_cq_rem_ref(struct c4iw_cq *chp); int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask srq_attr_mask, @@ -1006,8 +1040,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_flush_sq(struct c4iw_qp *qhp); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); -u16 c4iw_rqes_posted(struct c4iw_qp *qhp); -int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, struct c4iw_dev_ucontext *uctx); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 246b739ddb2b..e059f92d90fd 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -113,6 +113,9 @@ static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, mm->key = uresp.status_page_key; mm->addr = virt_to_phys(rhp->rdev.status_page); mm->len = PAGE_SIZE; + mm->vaddr = NULL; + mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(context, mm); } return 0; @@ -131,6 +134,11 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr; + u8 mmap_flag; + size_t size; + void *vaddr; + unsigned long vm_pgoff; + dma_addr_t dma_addr; pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff, key, len); @@ -145,47 +153,38 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (!mm) return -EINVAL; addr = mm->addr; + vaddr = mm->vaddr; + dma_addr = mm->dma_addr; + size = mm->len; + mmap_flag = mm->mmap_flag; kfree(mm); - if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 0) + - pci_resource_len(rdev->lldi.pdev, 0)))) { - - /* - * MA_SYNC register... 
- */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + switch (mmap_flag) { + case CXGB4_MMAP_BAR: + ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, + len, + pgprot_noncached(vma->vm_page_prot)); + break; + case CXGB4_MMAP_BAR_WC: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 2) + - pci_resource_len(rdev->lldi.pdev, 2)))) { - - /* - * Map user DB or OCQP memory... - */ - if (addr >= rdev->oc_mw_pa) - vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); - else { - if (!is_t4(rdev->lldi.adapter_type)) - vma->vm_page_prot = - t4_pgprot_wc(vma->vm_page_prot); - else - vma->vm_page_prot = - pgprot_noncached(vma->vm_page_prot); - } + len, t4_pgprot_wc(vma->vm_page_prot)); + break; + case CXGB4_MMAP_CONTIG: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); - } else { - - /* - * Map WQ or CQ contig dma memory... - */ - ret = remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); + break; + case CXGB4_MMAP_NON_CONTIG: + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = 0; + ret = dma_mmap_coherent(&rdev->lldi.pdev->dev, vma, + vaddr, dma_addr, size); + vma->vm_pgoff = vm_pgoff; + break; + default: + ret = -EINVAL; + break; } return ret; @@ -474,6 +473,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .fill_res_cq_entry = c4iw_fill_res_cq_entry, .fill_res_cm_id_entry = c4iw_fill_res_cm_id_entry, .fill_res_mr_entry = c4iw_fill_res_mr_entry, + .fill_res_qp_entry = c4iw_fill_res_qp_entry, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = c4iw_get_dma_mr, .get_hw_stats = c4iw_get_mib, diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index d16d8eaa1415..955f061a55e9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1599,6 +1599,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, int count; int rq_flushed = 0, sq_flushed; unsigned long flag; + struct ib_event ev; pr_debug("qhp %p rchp %p schp %p\n", qhp, rchp, schp); @@ -1607,6 +1608,13 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, if (schp != rchp) spin_lock(&schp->lock); spin_lock(&qhp->lock); + if (qhp->srq && qhp->attr.state == C4IW_QP_STATE_ERROR && + qhp->ibqp.event_handler) { + ev.device = qhp->ibqp.device; + ev.element.qp = &qhp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qhp->ibqp.event_handler(&ev, qhp->ibqp.qp_context); + } if (qhp->wq.flushed) { spin_unlock(&qhp->lock); @@ -2281,24 +2289,39 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, if (ret) goto err_free_ma_sync_key; sq_key_mm->key = uresp.sq_key; - sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->addr = 0; + sq_key_mm->vaddr = qhp->wq.sq.queue; + sq_key_mm->dma_addr = qhp->wq.sq.dma_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_flag_to_mmap(&rhp->rdev, sq_key_mm, sq_key_mm->addr); insert_mmap(ucontext, sq_key_mm); if (!attrs->srq) { rq_key_mm->key = uresp.rq_key; - rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->addr = 0; + rq_key_mm->vaddr = qhp->wq.rq.queue; + rq_key_mm->dma_addr = qhp->wq.rq.dma_addr; rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_flag_to_mmap(&rhp->rdev, rq_key_mm, + rq_key_mm->addr); insert_mmap(ucontext, rq_key_mm); } sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + 
sq_db_key_mm->vaddr = NULL; + sq_db_key_mm->dma_addr = 0; sq_db_key_mm->len = PAGE_SIZE; + insert_flag_to_mmap(&rhp->rdev, sq_db_key_mm, + sq_db_key_mm->addr); insert_mmap(ucontext, sq_db_key_mm); if (!attrs->srq) { rq_db_key_mm->key = uresp.rq_db_gts_key; rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; rq_db_key_mm->len = PAGE_SIZE; + rq_db_key_mm->vaddr = NULL; + rq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, rq_db_key_mm, + rq_db_key_mm->addr); insert_mmap(ucontext, rq_db_key_mm); } if (ma_sync_key_mm) { @@ -2307,6 +2330,10 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, (pci_resource_start(rhp->rdev.lldi.pdev, 0) + PCIE_MA_SYNC_A) & PAGE_MASK; ma_sync_key_mm->len = PAGE_SIZE; + ma_sync_key_mm->vaddr = NULL; + ma_sync_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, ma_sync_key_mm, + ma_sync_key_mm->addr); insert_mmap(ucontext, ma_sync_key_mm); } @@ -2761,12 +2788,19 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs, if (ret) goto err_free_srq_db_key_mm; srq_key_mm->key = uresp.srq_key; - srq_key_mm->addr = virt_to_phys(srq->wq.queue); + srq_key_mm->addr = 0; srq_key_mm->len = PAGE_ALIGN(srq->wq.memsize); + srq_key_mm->vaddr = srq->wq.queue; + srq_key_mm->dma_addr = srq->wq.dma_addr; + insert_flag_to_mmap(&rhp->rdev, srq_key_mm, srq_key_mm->addr); insert_mmap(ucontext, srq_key_mm); srq_db_key_mm->key = uresp.srq_db_gts_key; srq_db_key_mm->addr = (u64)(unsigned long)srq->wq.bar2_pa; srq_db_key_mm->len = PAGE_SIZE; + srq_db_key_mm->vaddr = NULL; + srq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, srq_db_key_mm, + srq_db_key_mm->addr); insert_mmap(ucontext, srq_db_key_mm); } diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 926f9ff1f60f..838182d0409c 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_H_ @@ -57,15 +57,15 @@ struct efa_dev { u64 db_bar_addr; u64 db_bar_len; - unsigned int num_irq_vectors; - int admin_msix_vector_idx; + u32 num_irq_vectors; + u32 admin_msix_vector_idx; struct efa_irq admin_irq; struct efa_stats stats; /* Array of completion EQs */ struct efa_eq *eqs; - unsigned int neqs; + u32 neqs; /* Only stores CQs with interrupts enabled */ struct xarray cqs_xa; @@ -161,14 +161,14 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable); diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 7377c8a9f4d5..fe0b6aec7839 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -30,7 +30,8 @@ enum efa_admin_aq_opcode { EFA_ADMIN_DEALLOC_UAR = 17, EFA_ADMIN_CREATE_EQ = 18, EFA_ADMIN_DESTROY_EQ = 19, - EFA_ADMIN_MAX_OPCODE = 19, + EFA_ADMIN_ALLOC_MR = 20, + EFA_ADMIN_MAX_OPCODE = 20, }; enum efa_admin_aq_feature_id { @@ -110,7 +111,10 @@ struct efa_admin_create_qp_cmd { * virtual (IOVA returned by MR registration) * 1 : rq_virt - If set, RQ ring base address is * virtual (IOVA returned by MR registration) - * 7:2 : reserved - MBZ + * 2 : unsolicited_write_recv - If set, work requests + * will not be consumed for incoming RDMA write with + * immediate + * 7:3 : reserved - MBZ */ u8 flags; @@ -147,8 +151,11 @@ struct efa_admin_create_qp_cmd { /* UAR number */ u16 uar; + /* Requested service level for the QP, 0 is the default SL */ + u8 sl; + /* MBZ */ - u16 reserved; + u8 reserved; /* MBZ */ u32 reserved2; @@ -456,6 +463,41 @@ struct efa_admin_dereg_mr_resp { struct efa_admin_acq_common_desc acq_common_desc; }; +/* + * Allocation of MemoryRegion, required for QP working with Virtual + * Addresses in kernel verbs semantics, ready for fast registration use. + */ +struct efa_admin_alloc_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved1; + + /* Maximum number of pages this MR supports. */ + u32 max_pages; +}; + +struct efa_admin_alloc_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + struct efa_admin_create_cq_cmd { struct efa_admin_aq_common_desc aq_common_desc; @@ -480,8 +522,8 @@ struct efa_admin_create_cq_cmd { */ u8 cq_caps_2; - /* completion queue depth in # of entries. must be power of 2 */ - u16 cq_depth; + /* Sub completion queue depth in # of entries. 
must be power of 2 */ + u16 sub_cq_depth; /* EQ number assigned to this cq */ u16 eqn; @@ -516,8 +558,8 @@ struct efa_admin_create_cq_resp { u16 cq_idx; - /* actual cq depth in number of entries */ - u16 cq_actual_depth; + /* actual sub cq depth in number of entries */ + u16 sub_cq_actual_depth; /* CQ doorbell address, as offset to PCIe DB BAR */ u32 db_offset; @@ -575,6 +617,8 @@ struct efa_admin_basic_stats { u64 rx_pkts; u64 rx_drops; + + u64 qkey_viol; }; struct efa_admin_messages_stats { @@ -663,12 +707,26 @@ struct efa_admin_feature_device_attr_desc { * polling is supported * 3 : rdma_write - If set, RDMA Write is supported * on TX queues - * 31:4 : reserved - MBZ + * 4 : unsolicited_write_recv - If set, unsolicited + * write with imm. receive is supported + * 31:5 : reserved - MBZ */ u32 device_caps; /* Max RDMA transfer size in bytes */ u32 max_rdma_size; + + /* Unique global ID for an EFA device */ + u64 guid; + + /* The device maximum link speed in Gbit/sec */ + u16 max_link_speed_gbps; + + /* MBZ */ + u16 reserved0; + + /* MBZ */ + u32 reserved1; }; struct efa_admin_feature_queue_attr_desc { @@ -1009,6 +1067,7 @@ struct efa_admin_host_info { /* create_qp_cmd */ #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) +#define EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV_MASK BIT(2) /* modify_qp_cmd */ #define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) @@ -1044,10 +1103,10 @@ struct efa_admin_host_info { #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK BIT(3) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_UNSOLICITED_WRITE_RECV_MASK BIT(4) /* create_eq_cmd */ #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) -#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) #define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) /* host_info */ diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h index 83f20c38a840..35700c93e639 100644 --- a/drivers/infiniband/hw/efa/efa_admin_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ADMIN_H_ @@ -96,7 +96,7 @@ struct efa_admin_acq_entry { struct efa_admin_aenq_common_desc { u16 group; - u16 syndrom; + u16 syndrome; /* * 0 : phase diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 16a24a05fc2a..bafd210dd43e 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_com.h" @@ -406,8 +406,8 @@ static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue return comp_ctx; } -static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, - struct efa_admin_acq_entry *cqe) +static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) { struct efa_comp_ctx *comp_ctx; u16 cmd_id; @@ -416,11 +416,11 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); - if (!comp_ctx) { + if (comp_ctx->status != EFA_CMD_SUBMITTED) { ibdev_err(aq->efa_dev, - "comp_ctx is NULL. Changing the admin queue running state\n"); - clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); - return; + "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", + cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); + return -EINVAL; } comp_ctx->status = EFA_CMD_COMPLETED; @@ -428,14 +428,17 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) complete(&comp_ctx->wait_event); + + return 0; } static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) { struct efa_admin_acq_entry *cqe; u16 queue_size_mask; - u16 comp_num = 0; + u16 comp_cmds = 0; u8 phase; + int err; u16 ci; queue_size_mask = aq->depth - 1; @@ -453,10 +456,12 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) * phase bit was validated */ dma_rmb(); - efa_com_handle_single_admin_completion(aq, cqe); + err = efa_com_handle_single_admin_completion(aq, cqe); + if (!err) + comp_cmds++; + aq->cq.cc++; ci++; - comp_num++; if (ci == aq->depth) { ci = 0; phase = !phase; @@ -465,10 +470,9 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) cqe = &aq->cq.entries[ci]; } - aq->cq.cc += comp_num; aq->cq.phase = phase; - aq->sq.cc += comp_num; - atomic64_add(comp_num, &aq->stats.completed_cmd); + aq->sq.cc += comp_cmds; + atomic64_add(comp_cmds, &aq->stats.completed_cmd); } static int efa_com_comp_status_to_errno(u8 comp_status) diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h index 77282234ce68..4d9ca97e4296 100644 --- a/drivers/infiniband/hw/efa/efa_com.h +++ b/drivers/infiniband/hw/efa/efa_com.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_COM_H_ @@ -65,7 +65,7 @@ struct efa_com_admin_queue { u16 depth; struct efa_com_admin_cq cq; struct efa_com_admin_sq sq; - u16 msix_vector_idx; + u32 msix_vector_idx; unsigned long state; @@ -89,7 +89,7 @@ struct efa_com_aenq { struct efa_aenq_handlers *aenq_handlers; dma_addr_t dma_addr; u32 cc; /* consumer counter */ - u16 msix_vector_idx; + u32 msix_vector_idx; u16 depth; u8 phase; }; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index d3398c7b0bd0..c6b89c45fdc9 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -31,6 +31,10 @@ int efa_com_create_qp(struct efa_com_dev *edev, create_qp_cmd.qp_alloc_size.recv_queue_depth = params->rq_depth; create_qp_cmd.uar = params->uarn; + create_qp_cmd.sl = params->sl; + + if (params->unsolicited_write_recv) + EFA_SET(&create_qp_cmd.flags, EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV, 1); err = efa_com_cmd_exec(aq, (struct efa_admin_aq_entry *)&create_qp_cmd, @@ -160,7 +164,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, EFA_SET(&create_cmd.cq_caps_2, EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, params->entry_size_in_bytes / 4); - create_cmd.cq_depth = params->cq_depth; + create_cmd.sub_cq_depth = params->sub_cq_depth; create_cmd.num_sub_cqs = params->num_sub_cqs; create_cmd.uar = params->uarn; if (params->interrupt_mode_enabled) { @@ -188,7 +192,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, } result->cq_idx = cmd_completion.cq_idx; - result->actual_depth = params->cq_depth; + result->actual_depth = params->sub_cq_depth; result->db_off = cmd_completion.db_offset; result->db_valid = EFA_GET(&cmd_completion.flags, EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); @@ -462,6 +466,8 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->db_bar = resp.u.device_attr.db_bar; result->max_rdma_size = resp.u.device_attr.max_rdma_size; result->device_caps = resp.u.device_attr.device_caps; + result->guid = resp.u.device_attr.guid; + result->max_link_speed_gbps = resp.u.device_attr.max_link_speed_gbps; if (result->admin_api_version < 1) { ibdev_err_ratelimited( diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 720a99ba0f7d..5511355b700d 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -27,6 +27,8 @@ struct efa_com_create_qp_params { u16 pd; u16 uarn; u8 qp_type; + u8 sl; + u8 unsolicited_write_recv : 1; }; struct efa_com_create_qp_result { @@ -70,7 +72,7 @@ struct efa_com_create_cq_params { /* cq physical base address in OS memory */ dma_addr_t dma_addr; /* completion queue depth in # of entries */ - u16 cq_depth; + u16 sub_cq_depth; u16 num_sub_cqs; u16 uarn; u16 eqn; @@ -111,6 +113,7 @@ struct efa_com_get_device_attr_result { u8 addr[EFA_GID_SIZE]; u64 page_size_cap; u64 max_mr_pages; + u64 guid; u32 mtu; u32 fw_version; u32 admin_api_version; @@ -139,6 +142,7 @@ struct efa_com_get_device_attr_result { u16 max_wr_rdma_sge; u16 max_tx_batch; u16 min_sq_depth; + u16 max_link_speed_gbps; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_io_defs.h b/drivers/infiniband/hw/efa/efa_io_defs.h index 2d8eb96eaa81..a4c9fd33da38 100644 --- a/drivers/infiniband/hw/efa/efa_io_defs.h +++ b/drivers/infiniband/hw/efa/efa_io_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. 
All rights reserved. */ #ifndef _EFA_IO_H_ @@ -10,6 +10,7 @@ #define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 #define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 #define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 +#define EFA_IO_TX_DESC_INLINE_PBL_SIZE 1 enum efa_io_queue_type { /* send queue (of a QP) */ @@ -25,6 +26,10 @@ enum efa_io_send_op_type { EFA_IO_RDMA_READ = 1, /* RDMA write */ EFA_IO_RDMA_WRITE = 2, + /* Fast MR registration */ + EFA_IO_FAST_REG = 3, + /* Fast MR invalidation */ + EFA_IO_FAST_INV = 4, }; enum efa_io_comp_status { @@ -34,15 +39,15 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_FLUSHED = 1, /* Internal QP error */ EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, - /* Bad operation type */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Unsupported operation */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP = 3, /* Bad AH */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, /* LKEY not registered or does not match IOVA */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, /* Message too long */ EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, - /* Destination ENI is down or does not run EFA */ + /* RKEY not registered or does not match remote IOVA */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, /* Connection was reset by remote side */ EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, @@ -54,8 +59,17 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, /* Unexpected status returned by responder */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, - /* Unresponsive remote - detected locally */ + /* Unresponsive remote - was previously responsive */ EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, + /* No valid AH at remote side (required for RDMA operations) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER = 14, + /* Unreachable remote - never received a response */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE = 15, +}; + +enum efa_io_frwr_pbl_mode { + EFA_IO_FRWR_INLINE_PBL = 0, + EFA_IO_FRWR_DIRECT_PBL = 1, }; struct efa_io_tx_meta_desc { @@ -95,13 +109,13 @@ struct efa_io_tx_meta_desc { /* * If inline_msg bit is set, length of inline message in bytes, - * otherwise length of SGL (number of buffers). + * otherwise length of SGL (number of buffers). */ u16 length; /* - * immediate data: if has_imm is set, then this field is included - * within Tx message and reported in remote Rx completion. + * immediate data: if has_imm is set, then this field is included within + * Tx message and reported in remote Rx completion. */ u32 immediate_data; @@ -158,6 +172,63 @@ struct efa_io_rdma_req { struct efa_io_tx_buf_desc local_mem[1]; }; +struct efa_io_fast_mr_reg_req { + /* Updated local key of the MR after lkey/rkey increment */ + u32 lkey; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : remote_write_enable - Remote write + * permissions: must be set to enable RDMA write to + * the region + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* + * control flags + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift) + * 6:5 : pbl_mode - enum efa_io_frwr_pbl_mode + * 7 : reserved - MBZ + */ + u8 flags; + + /* MBZ */ + u8 reserved[2]; + + /* IO Virtual Address associated with this MR */ + u64 iova; + + /* Memory region length, in bytes */ + u64 mr_length; + + /* Physical Buffer List, each element is page-aligned. 
*/ + union { + /* + * Inline array of physical page addresses (optimization + * for short region activation). + */ + u64 inline_array[1]; + + /* points to PBL (Currently only direct) */ + u64 dma_addr; + } pbl; +}; + +struct efa_io_fast_mr_inv_req { + /* Local key of the MR to invalidate */ + u32 lkey; + + /* MBZ */ + u8 reserved[28]; +}; + /* * Tx WQE, composed of tx meta descriptors followed by either tx buffer * descriptors or inline data @@ -174,6 +245,12 @@ struct efa_io_tx_wqe { /* RDMA local and remote memory addresses */ struct efa_io_rdma_req rdma_req; + + /* Fast registration */ + struct efa_io_fast_mr_reg_req reg_mr_req; + + /* Fast invalidation */ + struct efa_io_fast_mr_inv_req inv_mr_req; } data; }; @@ -208,7 +285,7 @@ struct efa_io_rx_desc { struct efa_io_cdesc_common { /* * verbs-generated request ID, as provided in the completed tx or rx - * descriptor. + * descriptor. */ u16 req_id; @@ -221,7 +298,8 @@ struct efa_io_cdesc_common { * 3 : has_imm - indicates that immediate data is * present - for RX completions only * 6:4 : op_type - enum efa_io_send_op_type - * 7 : reserved31 - MBZ + * 7 : unsolicited - indicates that there is no + * matching request - for RDMA with imm. RX only */ u8 flags; @@ -291,6 +369,13 @@ struct efa_io_rx_cdesc_ex { /* tx_buf_desc */ #define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) +/* fast_mr_reg_req */ +#define EFA_IO_FAST_MR_REG_REQ_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_WRITE_ENABLE_MASK BIT(1) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_READ_ENABLE_MASK BIT(2) +#define EFA_IO_FAST_MR_REG_REQ_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_IO_FAST_MR_REG_REQ_PBL_MODE_MASK GENMASK(6, 5) + /* rx_desc */ #define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) #define EFA_IO_RX_DESC_FIRST_MASK BIT(30) @@ -301,5 +386,6 @@ struct efa_io_rx_cdesc_ex { #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) #define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4) +#define EFA_IO_CDESC_COMMON_UNSOLICITED_MASK BIT(7) #endif /* _EFA_IO_H_ */ diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 5fa3603c80d8..4f03c0ec819f 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include <linux/module.h> @@ -16,11 +16,13 @@ #define PCI_DEV_ID_EFA0_VF 0xefa0 #define PCI_DEV_ID_EFA1_VF 0xefa1 #define PCI_DEV_ID_EFA2_VF 0xefa2 +#define PCI_DEV_ID_EFA3_VF 0xefa3 static const struct pci_device_id efa_pci_tbl[] = { { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA3_VF) }, { } }; @@ -139,8 +141,7 @@ static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) return 0; } -static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, - int vector) +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, u32 vector) { u32 cpu; @@ -190,15 +191,23 @@ static int efa_request_doorbell_bar(struct efa_dev *dev) { u8 db_bar_idx = dev->dev_attr.db_bar; struct pci_dev *pdev = dev->pdev; - int bars; + int pci_mem_bars; + int db_bar; int err; - if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { - bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + db_bar = BIT(db_bar_idx); + if (!(db_bar & EFA_BASE_BAR_MASK)) { + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (db_bar & ~pci_mem_bars) { + dev_err(&pdev->dev, + "Doorbells BAR unavailable. Requested %#x, available %#x\n", + db_bar, pci_mem_bars); + return -ENODEV; + } - err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + err = pci_request_selected_regions(pdev, db_bar, DRV_MODULE_NAME); if (err) { - dev_err(&dev->pdev->dev, + dev_err(&pdev->dev, "pci_request_selected_regions for bar %d failed %d\n", db_bar_idx, err); return err; @@ -295,7 +304,7 @@ static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) efa_free_irq(dev, &eq->irq); } -static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u32 msix_vec) { int err; @@ -318,21 +327,17 @@ err_free_comp_irq: static int efa_create_eqs(struct efa_dev *dev) { - unsigned int neqs = dev->dev_attr.max_eq; - int err; - int i; - - neqs = min_t(unsigned int, neqs, - dev->num_irq_vectors - EFA_COMP_EQS_VEC_BASE); + u32 neqs = dev->dev_attr.max_eq; + int err, i; + neqs = min_t(u32, neqs, dev->num_irq_vectors - EFA_COMP_EQS_VEC_BASE); dev->neqs = neqs; dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); if (!dev->eqs) return -ENOMEM; for (i = 0; i < neqs; i++) { - err = efa_create_eq(dev, &dev->eqs[i], - i + EFA_COMP_EQS_VEC_BASE); + err = efa_create_eq(dev, &dev->eqs[i], i + EFA_COMP_EQS_VEC_BASE); if (err) goto err_destroy_eqs; } @@ -431,6 +436,7 @@ static int efa_ib_device_add(struct efa_dev *dev) efa_set_host_info(dev); dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.node_guid = dev->dev_attr.guid; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->neqs ?: 1; dev->ibdev.dev.parent = &pdev->dev; @@ -459,7 +465,6 @@ static void efa_ib_device_remove(struct efa_dev *dev) ibdev_info(&dev->ibdev, "Unregister ib device\n"); ib_unregister_device(&dev->ibdev); efa_destroy_eqs(dev); - efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); efa_release_doorbell_bar(dev); } @@ -531,7 +536,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) { struct efa_com_dev *edev; struct efa_dev *dev; - int bars; + int pci_mem_bars; int err; err = pci_enable_device_mem(pdev); @@ -556,8 +561,14 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) dev->pdev = pdev; xa_init(&dev->cqs_xa); - bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; - err = 
pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (EFA_BASE_BAR_MASK & ~pci_mem_bars) { + dev_err(&pdev->dev, "BARs unavailable. Requested %#x, available %#x\n", + (int)EFA_BASE_BAR_MASK, pci_mem_bars); + err = -ENODEV; + goto err_ibdev_destroy; + } + err = pci_request_selected_regions(pdev, EFA_BASE_BAR_MASK, DRV_MODULE_NAME); if (err) { dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", err); @@ -626,12 +637,14 @@ err_disable_device: return ERR_PTR(err); } -static void efa_remove_device(struct pci_dev *pdev) +static void efa_remove_device(struct pci_dev *pdev, + enum efa_regs_reset_reason_types reset_reason) { struct efa_dev *dev = pci_get_drvdata(pdev); struct efa_com_dev *edev; edev = &dev->edev; + efa_com_dev_reset(edev, reset_reason); efa_com_admin_destroy(edev); efa_free_irq(dev, &dev->admin_irq); efa_disable_msix(dev); @@ -659,7 +672,7 @@ static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_remove_device: - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_INIT_ERR); return err; } @@ -668,7 +681,17 @@ static void efa_remove(struct pci_dev *pdev) struct efa_dev *dev = pci_get_drvdata(pdev); efa_ib_device_remove(dev); - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_NORMAL); +} + +static void efa_shutdown(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_SHUTDOWN); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); } static struct pci_driver efa_pci_driver = { @@ -676,6 +699,7 @@ static struct pci_driver efa_pci_driver = { .id_table = efa_pci_tbl, .probe = efa_probe, .remove = efa_remove, + .shutdown = efa_shutdown, }; module_pci_driver(efa_pci_driver); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 2f412db2edcd..a8645a40730f 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -26,10 +26,6 @@ enum { EFA_MMAP_IO_NC, }; -#define EFA_AENQ_ENABLED_GROUPS \ - (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ - BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) - struct efa_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; u64 address; @@ -89,6 +85,8 @@ static const struct rdma_stat_desc efa_port_stats_descs[] = { EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; +#define EFA_DEFAULT_LINK_SPEED_GBPS 100 + #define EFA_CHUNK_PAYLOAD_SHIFT 12 #define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) #define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 @@ -263,6 +261,9 @@ int efa_query_device(struct ib_device *ibdev, if (EFA_DEV_CAP(dev, RDMA_WRITE)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE; + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV; + if (dev->neqs) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; @@ -278,10 +279,47 @@ int efa_query_device(struct ib_device *ibdev, return 0; } +static void efa_link_gbps_to_speed_and_width(u16 gbps, + enum ib_port_speed *speed, + enum ib_port_width *width) +{ + if (gbps >= 400) { + *width = IB_WIDTH_8X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 200) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 120) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 100) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } else if (gbps >= 60) { + *width = IB_WIDTH_12X; + *speed = 
IB_SPEED_DDR; + } else if (gbps >= 50) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 40) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 30) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_SDR; + } else { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } +} + int efa_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { struct efa_dev *dev = to_edev(ibdev); + enum ib_port_speed link_speed; + enum ib_port_width link_width; + u16 link_gbps; props->lmc = 1; @@ -289,8 +327,10 @@ int efa_query_port(struct ib_device *ibdev, u32 port, props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; - props->active_speed = IB_SPEED_EDR; - props->active_width = IB_WIDTH_4X; + link_gbps = dev->dev_attr.max_link_speed_gbps ?: EFA_DEFAULT_LINK_SPEED_GBPS; + efa_link_gbps_to_speed_and_width(link_gbps, &link_speed, &link_width); + props->active_speed = link_speed; + props->active_width = link_width; props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->max_msg_sz = dev->dev_attr.mtu; @@ -521,7 +561,7 @@ static int qp_mmap_entries_setup(struct efa_qp *qp, address = dev->mem_bar_addr + resp->llq_desc_offset; length = PAGE_ALIGN(params->sq_ring_size_in_bytes + - (resp->llq_desc_offset & ~PAGE_MASK)); + offset_in_page(resp->llq_desc_offset)); qp->llq_desc_mmap_entry = efa_user_mmap_entry_insert(&ucontext->ibucontext, @@ -639,6 +679,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct efa_ibv_create_qp cmd = {}; struct efa_qp *qp = to_eqp(ibqp); struct efa_ucontext *ucontext; + u16 supported_efa_flags = 0; int err; ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, @@ -676,13 +717,23 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, goto err_out; } - if (cmd.comp_mask) { + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_98)) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; goto err_out; } + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + supported_efa_flags |= EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV; + + if (cmd.flags & ~supported_efa_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported EFA QP create flags[%#x], supported[%#x]\n", + cmd.flags, supported_efa_flags); + err = -EOPNOTSUPP; + goto err_out; + } + create_qp_params.uarn = ucontext->uarn; create_qp_params.pd = to_epd(ibqp->pd)->pdn; @@ -722,6 +773,11 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, create_qp_params.rq_base_addr = qp->rq_dma_addr; } + create_qp_params.sl = cmd.sl; + + if (cmd.flags & EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV) + create_qp_params.unsolicited_write_recv = true; + err = efa_com_create_qp(&dev->edev, &create_qp_params, &create_qp_resp); if (err) @@ -1067,8 +1123,9 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, } int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct efa_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct efa_ucontext, ibucontext); struct efa_com_create_cq_params params = {}; @@ -1153,7 +1210,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } params.uarn = cq->ucontext->uarn; - params.cq_depth = entries; + params.sub_cq_depth = entries; params.dma_addr = cq->dma_addr; 
params.entry_size_in_bytes = cmd.cq_entry_size; params.num_sub_cqs = cmd.num_sub_cqs; @@ -1670,14 +1727,14 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); struct ib_umem_dmabuf *umem_dmabuf; struct efa_mr *mr; int err; - mr = efa_alloc_mr(ibpd, access_flags, udata); + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto err_out; diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig index 169038e3ceb1..267fc1f3c42a 100644 --- a/drivers/infiniband/hw/erdma/Kconfig +++ b/drivers/infiniband/hw/erdma/Kconfig @@ -5,7 +5,7 @@ config INFINIBAND_ERDMA depends on INFINIBAND_ADDR_TRANS depends on INFINIBAND_USER_ACCESS help - This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + This is a RDMA driver for Alibaba Elastic RDMA Adapter(ERDMA), which supports RDMA features in Alibaba cloud environment. To compile this driver as module, choose M here. The module will be diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 5df401a30cb9..2a023b99f992 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -16,7 +16,7 @@ #include "erdma_hw.h" #define DRV_MODULE_NAME "erdma" -#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" +#define ERDMA_NODE_DESC "Elastic RDMA Adapter stack" struct erdma_eq { void *qbuf; @@ -33,7 +33,8 @@ struct erdma_eq { atomic64_t notify_num; void __iomem *db; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_sq { @@ -48,7 +49,8 @@ struct erdma_cmdq_sq { u16 wqebb_cnt; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_cq { @@ -61,7 +63,8 @@ struct erdma_cmdq_cq { u32 ci; u32 cmdsn; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; atomic64_t armed_num; }; @@ -98,8 +101,6 @@ struct erdma_cmdq { struct erdma_comp_wait *wait_pool; spinlock_t lock; - bool use_event; - struct erdma_cmdq_sq sq; struct erdma_cmdq_cq cq; struct erdma_eq eq; @@ -145,6 +146,8 @@ struct erdma_devattr { u32 max_mr; u32 max_pd; u32 max_mw; + u32 max_gid; + u32 max_ah; u32 local_dma_key; }; @@ -174,12 +177,10 @@ struct erdma_resource_cb { enum { ERDMA_RES_TYPE_PD = 0, ERDMA_RES_TYPE_STAG_IDX = 1, - ERDMA_RES_CNT = 2, + ERDMA_RES_TYPE_AH = 2, + ERDMA_RES_CNT = 3, }; -#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE -#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) - struct erdma_dev { struct ib_device ibdev; struct net_device *netdev; @@ -192,8 +193,6 @@ struct erdma_dev { u8 __iomem *func_bar; struct erdma_devattr attrs; - /* physical port state (only one port per device) */ - enum ib_port_state state; u32 mtu; /* cmdq and aeq use the same msix vector */ @@ -213,7 +212,9 @@ struct erdma_dev { atomic_t num_ctx; struct list_head cep_list; + struct dma_pool *db_pool; struct dma_pool *resp_pool; + enum erdma_proto_type proto; }; static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) @@ -264,7 +265,7 @@ void erdma_cmdq_destroy(struct erdma_dev *dev); void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1); + u64 *resp0, u64 *resp1, bool sleepable); void erdma_cmdq_completion_handler(struct 
erdma_cmdq *cmdq); int erdma_ceqs_init(struct erdma_dev *dev); @@ -273,7 +274,8 @@ void notify_eq(struct erdma_eq *eq); void *get_next_valid_eqe(struct erdma_eq *eq); int erdma_aeq_init(struct erdma_dev *dev); -void erdma_aeq_destroy(struct erdma_dev *dev); +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth); +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq); void erdma_aeq_event_handler(struct erdma_dev *dev); void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 771059a8eb7d..e0acc185e719 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -567,7 +567,8 @@ reject_conn: static int erdma_proc_mpareply(struct erdma_cep *cep) { - struct erdma_qp_attrs qp_attrs; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_mod_qp_params_iwarp params; struct erdma_qp *qp = cep->qp; struct mpa_rr *rep; int ret; @@ -597,26 +598,29 @@ static int erdma_proc_mpareply(struct erdma_cep *cep) return -EINVAL; } - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.irq_size = cep->ird; - qp_attrs.orq_size = cep->ord; - qp_attrs.state = ERDMA_QP_STATE_RTS; + memset(¶ms, 0, sizeof(params)); + params.state = ERDMA_QPS_IWARP_RTS; + params.irq_size = cep->ird; + params.orq_size = cep->ord; down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; up_write(&qp->state_lock); goto out_err; } - qp->attrs.qp_type = ERDMA_QP_ACTIVE; - if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_LLP_HANDLE | + ERDMA_QPA_IWARP_MPA | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_ORD; - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_MPA); + params.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + params.cc = COMPROMISE_CC; + } + + ret = erdma_modify_qp_state_iwarp(qp, ¶ms, to_modify_attrs); up_write(&qp->state_lock); @@ -705,7 +709,6 @@ error: erdma_cancel_mpatimer(new_cep); erdma_cep_put(new_cep); - new_cep->sock = NULL; } if (new_s) { @@ -722,7 +725,7 @@ static int erdma_newconn_connected(struct erdma_cep *cep) __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); @@ -1126,10 +1129,11 @@ error_put_qp: int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { - struct erdma_dev *dev = to_edev(id->device); struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_mod_qp_params_iwarp mod_qp_params; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_dev *dev = to_edev(id->device); struct erdma_qp *qp; - struct erdma_qp_attrs qp_attrs; int ret; erdma_cep_set_inuse(cep); @@ -1156,7 +1160,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) erdma_qp_get(qp); down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; 
up_write(&qp->state_lock); goto error; @@ -1181,11 +1185,11 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->cm_id = id; id->add_ref(id); - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.orq_size = params->ord; - qp_attrs.irq_size = params->ird; + memset(&mod_qp_params, 0, sizeof(mod_qp_params)); - qp_attrs.state = ERDMA_QP_STATE_RTS; + mod_qp_params.irq_size = params->ird; + mod_qp_params.orq_size = params->ord; + mod_qp_params.state = ERDMA_QPS_IWARP_RTS; /* Associate QP with CEP */ erdma_cep_get(cep); @@ -1194,19 +1198,21 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->state = ERDMA_EPSTATE_RDMA_MODE; - qp->attrs.qp_type = ERDMA_QP_PASSIVE; - qp->attrs.pd_len = params->private_data_len; + mod_qp_params.qp_type = ERDMA_QP_PASSIVE; + mod_qp_params.pd_len = params->private_data_len; - if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_ORD | + ERDMA_QPA_IWARP_LLP_HANDLE | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_MPA; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + mod_qp_params.cc = COMPROMISE_CC; + } /* move to rts */ - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_ORD | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_IRD | - ERDMA_QP_ATTR_MPA); + ret = erdma_modify_qp_state_iwarp(qp, &mod_qp_params, to_modify_attrs); + up_write(&qp->state_lock); if (ret) @@ -1214,7 +1220,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->mpa.ext_data.bits = 0; __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); ret = erdma_send_mpareqrep(cep, params->private_data, params->private_data_len); diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index a151a7bdd504..b867aefe83b2 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -14,7 +14,7 @@ static void arm_cmdq_cq(struct erdma_cmdq *cmdq) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); - *cmdq->cq.db_record = db_data; + *cmdq->cq.dbrec = db_data; writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); atomic64_inc(&cmdq->cq.armed_num); @@ -25,7 +25,7 @@ static void kick_cmdq_db(struct erdma_cmdq *cmdq) struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); - *cmdq->sq.db_record = db_data; + *cmdq->sq.dbrec = db_data; writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); } @@ -89,20 +89,18 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_sq *sq = &cmdq->sq; - u32 buf_size; sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; - buf_size = sq->depth << SQEBB_SHIFT; - - sq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &sq->qbuf_dma_addr, GFP_KERNEL); + sq->qbuf = dma_alloc_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + &sq->qbuf_dma_addr, GFP_KERNEL); if (!sq->qbuf) return -ENOMEM; - sq->db_record = (u64 *)(sq->qbuf + buf_size); + sq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &sq->dbrec_dma); + if (!sq->dbrec) + goto err_out; spin_lock_init(&sq->lock); @@ 
-111,30 +109,33 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, lower_32_bits(sq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, - sq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, sq->dbrec_dma); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + sq->qbuf, sq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_cq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_cq *cq = &cmdq->cq; - u32 buf_size; cq->depth = cmdq->sq.depth; - buf_size = cq->depth << CQE_SHIFT; - - cq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + cq->qbuf = dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + &cq->qbuf_dma_addr, GFP_KERNEL); if (!cq->qbuf) return -ENOMEM; spin_lock_init(&cq->lock); - cq->db_record = (u64 *)(cq->qbuf + buf_size); + cq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &cq->dbrec_dma); + if (!cq->dbrec) + goto err_out; atomic64_set(&cq->armed_num, 0); @@ -142,40 +143,35 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) upper_32_bits(cq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, lower_32_bits(cq->qbuf_dma_addr)); - erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, - cq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, cq->dbrec_dma); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->qbuf, + cq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_eq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_eq *eq = &cmdq->eq; - u32 buf_size; - - eq->depth = cmdq->max_outstandings; - buf_size = eq->depth << EQE_SHIFT; - - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); - if (!eq->qbuf) - return -ENOMEM; + int ret; - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); + ret = erdma_eq_common_init(dev, eq, cmdq->max_outstandings); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; } @@ -186,7 +182,6 @@ int erdma_cmdq_init(struct erdma_dev *dev) int err; cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; - cmdq->use_event = false; sema_init(&cmdq->credits, cmdq->max_outstandings); @@ -211,24 +206,22 @@ int erdma_cmdq_init(struct erdma_dev *dev) return 0; err_destroy_cq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); + err_destroy_sq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, 
cmdq->sq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); + return err; } void erdma_finish_cmdq_init(struct erdma_dev *dev) { - /* after device init successfully, change cmdq to event mode. */ - dev->cmdq.use_event = true; arm_cmdq_cq(&dev->cmdq); } @@ -238,18 +231,17 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); - dma_free_coherent(&dev->pdev->dev, - (cmdq->eq.depth << EQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, - cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + erdma_eq_destroy(dev, &cmdq->eq); + + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); + + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); } static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) @@ -317,8 +309,7 @@ static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) /* Copy 16B comp data after cqe hdr to outer */ be32_to_cpu_array(comp_wait->comp_data, cqe + 2, 4); - if (cmdq->use_event) - complete(&comp_wait->wait_event); + complete(&comp_wait->wait_event); return 0; } @@ -337,9 +328,6 @@ static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) if (erdma_poll_single_cmd_completion(cmdq)) break; - if (comp_num && cmdq->use_event) - arm_cmdq_cq(cmdq); - spin_unlock_irqrestore(&cmdq->cq.lock, flags); } @@ -347,8 +335,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) { int got_event = 0; - if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || - !cmdq->use_event) + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return; while (get_next_valid_eqe(&cmdq->eq)) { @@ -359,6 +346,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) if (got_event) { cmdq->cq.cmdsn++; erdma_polling_cmd_completions(cmdq); + arm_cmdq_cq(cmdq); } notify_eq(&cmdq->eq); @@ -377,7 +365,7 @@ static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, if (time_is_before_jiffies(comp_timeout)) return -ETIME; - msleep(20); + udelay(20); } return 0; @@ -408,7 +396,7 @@ void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) } int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1) + u64 *resp0, u64 *resp1, bool sleepable) { struct erdma_comp_wait *comp_wait; int ret; @@ -416,7 +404,12 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return -ENODEV; - down(&cmdq->credits); + if (!sleepable) { + while (down_trylock(&cmdq->credits)) + ; + } else { + down(&cmdq->credits); + } comp_wait = get_comp_wait(cmdq); if (IS_ERR(comp_wait)) { @@ -430,7 +423,7 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, push_cmdq_sqe(cmdq, req, req_size, comp_wait); spin_unlock(&cmdq->sq.lock); - if (cmdq->use_event) + if (sleepable) ret = erdma_wait_cmd_completion(comp_wait, cmdq, ERDMA_CMDQ_TIMEOUT_MS); else diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index c1cb5568eab2..1f456327e63c 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -26,7 +26,7 @@ 
static void notify_cq(struct erdma_cq *cq, u8 solcitied) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); - *cq->kern_cq.db_record = db_data; + *cq->kern_cq.dbrec = db_data; writeq(db_data, cq->kern_cq.db); } @@ -105,6 +105,22 @@ static const struct { { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, }; +static void erdma_process_ud_cqe(struct erdma_cqe *cqe, struct ib_wc *wc) +{ + u32 ud_info; + + wc->wc_flags |= (IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE); + ud_info = be32_to_cpu(cqe->ud.info); + wc->network_hdr_type = FIELD_GET(ERDMA_CQE_NTYPE_MASK, ud_info); + if (wc->network_hdr_type == ERDMA_NETWORK_TYPE_IPV4) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + wc->src_qp = FIELD_GET(ERDMA_CQE_SQPN_MASK, ud_info); + wc->sl = FIELD_GET(ERDMA_CQE_SL_MASK, ud_info); + wc->pkey_index = 0; +} + #define ERDMA_POLLCQ_NO_QP 1 static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) @@ -168,6 +184,10 @@ static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) wc->wc_flags |= IB_WC_WITH_INVALIDATE; } + if (erdma_device_rocev2(dev) && + (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_GSI)) + erdma_process_ud_cqe(cqe, wc); + if (syndrome >= ERDMA_NUM_WC_STATUS) syndrome = ERDMA_WC_GENERAL_ERR; @@ -201,3 +221,48 @@ int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return npolled; } + +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_cqe *cqe, *dst_cqe; + u32 prev_cq_ci, cur_cq_ci; + u32 ncqe = 0, nqp_cqe = 0; + unsigned long flags; + u8 owner; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + prev_cq_ci = cq->kern_cq.ci; + + while (ncqe < cq->depth && (cqe = get_next_valid_cqe(cq)) != NULL) { + ++cq->kern_cq.ci; + ++ncqe; + } + + while (ncqe > 0) { + cur_cq_ci = prev_cq_ci + ncqe - 1; + cqe = get_queue_entry(cq->kern_cq.qbuf, cur_cq_ci, cq->depth, + CQE_SHIFT); + + if (be32_to_cpu(cqe->qpn) == qpn) { + ++nqp_cqe; + } else if (nqp_cqe) { + dst_cqe = get_queue_entry(cq->kern_cq.qbuf, + cur_cq_ci + nqp_cqe, + cq->depth, CQE_SHIFT); + owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + be32_to_cpu(dst_cqe->hdr)); + cqe->hdr = cpu_to_be32( + (be32_to_cpu(cqe->hdr) & + ~ERDMA_CQE_HDR_OWNER_MASK) | + FIELD_PREP(ERDMA_CQE_HDR_OWNER_MASK, owner)); + memcpy(dst_cqe, cqe, sizeof(*cqe)); + } + + --ncqe; + } + + cq->kern_cq.ci = prev_cq_ci + nqp_cqe; + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); +} diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index ea47cb21fdb8..6486234a2360 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -13,7 +13,7 @@ void notify_eq(struct erdma_eq *eq) u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); - *eq->db_record = db_data; + *eq->dbrec = db_data; writeq(db_data, eq->db); atomic64_inc(&eq->notify_num); @@ -80,47 +80,62 @@ void erdma_aeq_event_handler(struct erdma_dev *dev) notify_eq(&dev->aeq); } -int erdma_aeq_init(struct erdma_dev *dev) +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth) { - struct erdma_eq *eq = &dev->aeq; - u32 buf_size; - - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - buf_size = eq->depth << EQE_SHIFT; + u32 buf_size = depth << EQE_SHIFT; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | 
__GFP_ZERO); + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, buf_size, + &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) + goto err_free_qbuf; + spin_lock_init(&eq->lock); atomic64_set(&eq->event_num, 0); atomic64_set(&eq->notify_num, 0); + eq->ci = 0; + eq->depth = depth; + + return 0; + +err_free_qbuf: + dma_free_coherent(&dev->pdev->dev, buf_size, eq->qbuf, + eq->qbuf_dma_addr); + + return -ENOMEM; +} + +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq) +{ + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + int ret; + + ret = erdma_eq_common_init(dev, &dev->aeq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; } -void erdma_aeq_destroy(struct erdma_dev *dev) -{ - struct erdma_eq *eq = &dev->aeq; - - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, - eq->qbuf_dma_addr); -} - void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) { struct erdma_dev *dev = ceq_cb->dev; @@ -209,7 +224,6 @@ static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) { struct erdma_cmdq_create_eq_req req; - dma_addr_t db_info_dma_addr; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_CREATE_EQ); @@ -219,39 +233,33 @@ static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) req.qtype = ERDMA_EQ_TYPE_CEQ; /* Vector index is the same as EQN. */ req.vector_idx = eqn; - db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); - req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); - req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); + req.db_dma_addr_l = lower_32_bits(eq->dbrec_dma); + req.db_dma_addr_h = upper_32_bits(eq->dbrec_dma); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); } static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; int ret; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); - if (!eq->qbuf) - return -ENOMEM; + ret = erdma_eq_common_init(dev, eq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); - atomic64_set(&eq->notify_num, 0); - - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - eq->db_record = (u64 *)(eq->qbuf + buf_size); - eq->ci = 0; dev->ceqs[ceqn].dev = dev; + dev->ceqs[ceqn].ready = true; /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ ret = create_eq_cmd(dev, ceqn + 1, eq); - dev->ceqs[ceqn].ready = ret ? 
false : true; + if (ret) { + erdma_eq_destroy(dev, eq); + dev->ceqs[ceqn].ready = false; + } return ret; } @@ -259,7 +267,6 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; struct erdma_cmdq_destroy_eq_req req; int err; @@ -272,12 +279,12 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) req.qtype = ERDMA_EQ_TYPE_CEQ; req.vector_idx = ceqn + 1; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); if (err) return; - dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, - eq->qbuf_dma_addr); + erdma_eq_destroy(dev, eq); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 3212a1222760..ea4db53901a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/types.h> +#include <linux/if_ether.h> /* PCIe device related definition. */ #define ERDMA_PCI_WIDTH 64 @@ -21,8 +22,21 @@ #define ERDMA_NUM_MSIX_VEC 32U #define ERDMA_MSIX_VECTOR_CMDQ 0 +/* RoCEv2 related */ +#define ERDMA_ROCEV2_GID_SIZE 16 +#define ERDMA_MAX_PKEYS 1 +#define ERDMA_DEFAULT_PKEY 0xFFFF + +/* erdma device protocol type */ +enum erdma_proto_type { + ERDMA_PROTO_IWARP = 0, + ERDMA_PROTO_ROCEV2 = 1, + ERDMA_PROTO_COUNT = 2, +}; + /* PCIe Bar0 Registers. */ #define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_PROTO_REG 0xC #define ERDMA_REGS_DEV_CTRL_REG 0x10 #define ERDMA_REGS_DEV_ST_REG 0x14 #define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 @@ -136,7 +150,11 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_DESTROY_CQ = 5, CMDQ_OPCODE_REFLUSH = 6, CMDQ_OPCODE_REG_MR = 8, - CMDQ_OPCODE_DEREG_MR = 9 + CMDQ_OPCODE_DEREG_MR = 9, + CMDQ_OPCODE_SET_GID = 14, + CMDQ_OPCODE_CREATE_AH = 15, + CMDQ_OPCODE_DESTROY_AH = 16, + CMDQ_OPCODE_QUERY_QP = 17, }; enum CMDQ_COMMON_OPCODE { @@ -240,7 +258,7 @@ struct erdma_cmdq_create_cq_req { u32 qbuf_addr_l; u32 qbuf_addr_h; u32 cfg1; - u64 cq_db_info_addr; + u64 cq_dbrec_dma; u32 first_page_offset; u32 cfg2; }; @@ -284,6 +302,36 @@ struct erdma_cmdq_dereg_mr_req { u32 cfg; }; +/* create_av cfg0 */ +#define ERDMA_CMD_CREATE_AV_FL_MASK GENMASK(19, 0) +#define ERDMA_CMD_CREATE_AV_NTYPE_MASK BIT(20) + +struct erdma_av_cfg { + u32 cfg0; + u8 traffic_class; + u8 hop_limit; + u8 sl; + u8 rsvd; + u16 udp_sport; + u16 sgid_index; + u8 dmac[ETH_ALEN]; + u8 padding[2]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; +}; + +struct erdma_cmdq_create_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; + struct erdma_av_cfg av_cfg; +}; + +struct erdma_cmdq_destroy_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; +}; + /* modify qp cfg */ #define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) #define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) @@ -301,6 +349,36 @@ struct erdma_cmdq_modify_qp_req { u32 recv_nxt; }; +/* modify qp cfg1 for roce device */ +#define ERDMA_CMD_MODIFY_QP_DQPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_mod_qp_req_rocev2 { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 attr_mask; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + struct erdma_av_cfg av_cfg; +}; + +/* query qp response mask */ +#define ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK GENMASK_ULL(23, 0) +#define ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK GENMASK_ULL(47, 24) +#define 
ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK GENMASK_ULL(55, 48) +#define ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK GENMASK_ULL(56, 56) + +struct erdma_cmdq_query_qp_req_rocev2 { + u64 hdr; + u32 qpn; +}; + +enum erdma_qp_type { + ERDMA_QPT_RC = 0, + ERDMA_QPT_UD = 1, +}; + /* create qp cfg0 */ #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) @@ -309,6 +387,9 @@ struct erdma_cmdq_modify_qp_req { #define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) +/* create qp cfg2 */ +#define ERDMA_CMD_CREATE_QP_TYPE_MASK GENMASK(3, 0) + /* create qp cqn_mtt_cfg */ #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) #define ERDMA_CMD_CREATE_QP_DB_CFG_MASK BIT(25) @@ -335,13 +416,14 @@ struct erdma_cmdq_create_qp_req { u64 rq_buf_addr; u32 sq_mtt_cfg; u32 rq_mtt_cfg; - u64 sq_db_info_dma_addr; - u64 rq_db_info_dma_addr; + u64 sq_dbrec_dma; + u64 rq_dbrec_dma; u64 sq_mtt_entry[3]; u64 rq_mtt_entry[3]; u32 db_cfg; + u32 cfg2; }; struct erdma_cmdq_destroy_qp_req { @@ -394,10 +476,33 @@ struct erdma_cmdq_query_stats_resp { u64 rx_pps_meter_drop_packets_cnt; }; +enum erdma_network_type { + ERDMA_NETWORK_TYPE_IPV4 = 0, + ERDMA_NETWORK_TYPE_IPV6 = 1, +}; + +enum erdma_set_gid_op { + ERDMA_SET_GID_OP_ADD = 0, + ERDMA_SET_GID_OP_DEL = 1, +}; + +/* set gid cfg */ +#define ERDMA_CMD_SET_GID_SGID_IDX_MASK GENMASK(15, 0) +#define ERDMA_CMD_SET_GID_NTYPE_MASK BIT(16) +#define ERDMA_CMD_SET_GID_OP_MASK BIT(31) + +struct erdma_cmdq_set_gid_req { + u64 hdr; + u32 cfg; + u8 gid[ERDMA_ROCEV2_GID_SIZE]; +}; + /* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_GID_MASK GENMASK_ULL(51, 48) #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_DEV_CAP_MAX_AH_MASK GENMASK_ULL(15, 8) #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) /* cap qword 1 definition */ @@ -426,6 +531,10 @@ enum { #define ERDMA_CQE_QTYPE_RQ 1 #define ERDMA_CQE_QTYPE_CMDQ 2 +#define ERDMA_CQE_NTYPE_MASK BIT(31) +#define ERDMA_CQE_SL_MASK GENMASK(27, 20) +#define ERDMA_CQE_SQPN_MASK GENMASK(19, 0) + struct erdma_cqe { __be32 hdr; __be32 qe_idx; @@ -435,7 +544,16 @@ struct erdma_cqe { __be32 inv_rkey; }; __be32 size; - __be32 rsvd[3]; + union { + struct { + __be32 rsvd[3]; + } rc; + + struct { + __be32 rsvd[2]; + __be32 info; + } ud; + }; }; struct erdma_sge { @@ -487,7 +605,7 @@ struct erdma_write_sqe { struct erdma_sge sgl[]; }; -struct erdma_send_sqe { +struct erdma_send_sqe_rc { __le64 hdr; union { __be32 imm_data; @@ -498,6 +616,17 @@ struct erdma_send_sqe { struct erdma_sge sgl[]; }; +struct erdma_send_sqe_ud { + __le64 hdr; + __be32 imm_data; + __le32 length; + __le32 qkey; + __le32 dst_qpn; + __le32 ahn; + __le32 rsvd; + struct erdma_sge sgl[]; +}; + struct erdma_readreq_sqe { __le64 hdr; __le32 invalid_stag; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 472939172f0c..f35b30235018 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -26,14 +26,6 @@ static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, goto done; switch (event) { - case NETDEV_UP: - dev->state = IB_PORT_ACTIVE; - erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); - break; - case NETDEV_DOWN: - dev->state = IB_PORT_DOWN; - erdma_port_event(dev, IB_EVENT_PORT_ERR); - break; case 
NETDEV_CHANGEMTU: if (dev->mtu != netdev->mtu) { erdma_set_mtu(dev, netdev->mtu); @@ -172,22 +164,34 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) { int ret; + dev->proto = erdma_reg_read32(dev, ERDMA_REGS_DEV_PROTO_REG); + dev->resp_pool = dma_pool_create("erdma_resp_pool", &pdev->dev, ERDMA_HW_RESP_SIZE, ERDMA_HW_RESP_SIZE, 0); if (!dev->resp_pool) return -ENOMEM; + dev->db_pool = dma_pool_create("erdma_db_pool", &pdev->dev, + ERDMA_DB_SIZE, ERDMA_DB_SIZE, 0); + if (!dev->db_pool) { + ret = -ENOMEM; + goto destroy_resp_pool; + } + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ERDMA_PCI_WIDTH)); if (ret) - goto destroy_pool; + goto destroy_db_pool; dma_set_max_seg_size(&pdev->dev, UINT_MAX); return 0; -destroy_pool: +destroy_db_pool: + dma_pool_destroy(dev->db_pool); + +destroy_resp_pool: dma_pool_destroy(dev->resp_pool); return ret; @@ -195,6 +199,7 @@ destroy_pool: static void erdma_device_uninit(struct erdma_dev *dev) { + dma_pool_destroy(dev->db_pool); dma_pool_destroy(dev->resp_pool); } @@ -322,7 +327,7 @@ err_uninit_cmdq: erdma_cmdq_destroy(dev); err_uninit_aeq: - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); err_uninit_comm_irq: erdma_comm_irq_uninit(dev); @@ -355,7 +360,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) erdma_ceqs_uninit(dev); erdma_hw_reset(dev); erdma_cmdq_destroy(dev); - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); erdma_comm_irq_uninit(dev); pci_free_irq_vectors(dev->pdev); erdma_device_uninit(dev); @@ -379,7 +384,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) CMDQ_OPCODE_QUERY_DEVICE); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, true); if (err) return err; @@ -387,6 +392,8 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.max_gid = 1 << ERDMA_GET_CAP(MAX_GID, cap0); + dev->attrs.max_ah = 1 << ERDMA_GET_CAP(MAX_AH, cap0); dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); @@ -404,12 +411,13 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + dev->res_cb[ERDMA_RES_TYPE_AH].max_cap = dev->attrs.max_ah; erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_QUERY_FW_INFO); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, true); if (!err) dev->attrs.fw_version = FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); @@ -430,7 +438,8 @@ static int erdma_device_config(struct erdma_dev *dev) req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) | FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_res_cb_init(struct erdma_dev *dev) @@ -463,6 +472,29 @@ static void erdma_res_cb_free(struct erdma_dev *dev) bitmap_free(dev->res_cb[i].bitmap); } +static const struct ib_device_ops erdma_device_ops_rocev2 = { + .get_link_layer = erdma_get_link_layer, + .add_gid = erdma_add_gid, + .del_gid = erdma_del_gid, + .query_pkey = erdma_query_pkey, + .create_ah 
= erdma_create_ah, + .destroy_ah = erdma_destroy_ah, + .query_ah = erdma_query_ah, + + INIT_RDMA_OBJ_SIZE(ib_ah, erdma_ah, ibah), +}; + +static const struct ib_device_ops erdma_device_ops_iwarp = { + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, +}; + static const struct ib_device_ops erdma_device_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_ERDMA, @@ -479,21 +511,13 @@ static const struct ib_device_ops erdma_device_ops = { .dereg_mr = erdma_dereg_mr, .destroy_cq = erdma_destroy_cq, .destroy_qp = erdma_destroy_qp, + .disassociate_ucontext = erdma_disassociate_ucontext, .get_dma_mr = erdma_get_dma_mr, .get_hw_stats = erdma_get_hw_stats, .get_port_immutable = erdma_get_port_immutable, - .iw_accept = erdma_accept, - .iw_add_ref = erdma_qp_get_ref, - .iw_connect = erdma_connect, - .iw_create_listen = erdma_create_listen, - .iw_destroy_listen = erdma_destroy_listen, - .iw_get_qp = erdma_get_ibqp, - .iw_reject = erdma_reject, - .iw_rem_ref = erdma_qp_put_ref, .map_mr_sg = erdma_map_mr_sg, .mmap = erdma_mmap, .mmap_free = erdma_mmap_free, - .modify_qp = erdma_modify_qp, .post_recv = erdma_post_recv, .post_send = erdma_post_send, .poll_cq = erdma_poll_cq, @@ -503,6 +527,7 @@ static const struct ib_device_ops erdma_device_ops = { .query_qp = erdma_query_qp, .req_notify_cq = erdma_req_notify_cq, .reg_user_mr = erdma_reg_user_mr, + .modify_qp = erdma_modify_qp, INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), @@ -525,7 +550,14 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; - ibdev->node_type = RDMA_NODE_RNIC; + if (erdma_device_iwarp(dev)) { + ibdev->node_type = RDMA_NODE_RNIC; + ib_set_device_ops(ibdev, &erdma_device_ops_iwarp); + } else { + ibdev->node_type = RDMA_NODE_IB_CA; + ib_set_device_ops(ibdev, &erdma_device_ops_rocev2); + } + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); /* diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 6d0330badd68..25f6c49aec77 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -11,20 +11,20 @@ void erdma_qp_llp_close(struct erdma_qp *qp) { - struct erdma_qp_attrs qp_attrs; + struct erdma_mod_qp_params_iwarp params; down_write(&qp->state_lock); - switch (qp->attrs.state) { - case ERDMA_QP_STATE_RTS: - case ERDMA_QP_STATE_RTR: - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_TERMINATE: - qp_attrs.state = ERDMA_QP_STATE_CLOSING; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_RTS: + case ERDMA_QPS_IWARP_RTR: + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_TERMINATE: + params.state = ERDMA_QPS_IWARP_CLOSING; + erdma_modify_qp_state_iwarp(qp, &params, ERDMA_QPA_IWARP_STATE); break; - case ERDMA_QP_STATE_CLOSING: - qp->attrs.state = ERDMA_QP_STATE_IDLE; + case ERDMA_QPS_IWARP_CLOSING: + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; break; default: break; @@ -48,9 +48,10 @@ struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) return NULL; } -static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + 
enum erdma_qpa_mask_iwarp mask) { int ret; struct erdma_dev *dev = qp->dev; @@ -59,12 +60,15 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, struct erdma_cep *cep = qp->cep; struct sockaddr_storage local_addr, remote_addr; - if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + if (!(mask & ERDMA_QPA_IWARP_LLP_HANDLE)) return -EINVAL; - if (!(mask & ERDMA_QP_ATTR_MPA)) + if (!(mask & ERDMA_QPA_IWARP_MPA)) return -EINVAL; + if (!(mask & ERDMA_QPA_IWARP_CC)) + params->cc = qp->attrs.cc; + ret = getname_local(cep->sock, &local_addr); if (ret < 0) return ret; @@ -73,18 +77,16 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, if (ret < 0) return ret; - qp->attrs.state = ERDMA_QP_STATE_RTS; - tp = tcp_sk(qp->cep->sock->sk); erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | - FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, params->cc) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.cookie = be32_to_cpu(cep->mpa.ext_data.cookie); req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; req.dport = to_sockaddr_in(remote_addr).sin_port; @@ -92,33 +94,57 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, req.send_nxt = tp->snd_nxt; /* rsvd tcp seq for mpa-rsp in server. */ - if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) - req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + if (params->qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + params->pd_len; req.recv_nxt = tp->rcv_nxt; - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); + if (ret) + return ret; + + if (mask & ERDMA_QPA_IWARP_IRD) + qp->attrs.irq_size = params->irq_size; + + if (mask & ERDMA_QPA_IWARP_ORD) + qp->attrs.orq_size = params->orq_size; + + if (mask & ERDMA_QPA_IWARP_CC) + qp->attrs.cc = params->cc; + + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_RTS; + + return 0; } -static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + enum erdma_qpa_mask_iwarp mask) { struct erdma_dev *dev = qp->dev; struct erdma_cmdq_modify_qp_req req; - - qp->attrs.state = attrs->state; + int ret; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); + if (ret) + return ret; + + qp->attrs.iwarp.state = params->state; + + return 0; } -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask) { bool need_reflush = false; int drop_conn, ret = 0; @@ -126,31 +152,31 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, if (!mask) return 0; - if 
(!(mask & ERDMA_QP_ATTR_STATE)) + if (!(mask & ERDMA_QPA_IWARP_STATE)) return 0; - switch (qp->attrs.state) { - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_RTR: - if (attrs->state == ERDMA_QP_STATE_RTS) { - ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - qp->attrs.state = ERDMA_QP_STATE_ERROR; + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_RTR: + if (params->state == ERDMA_QPS_IWARP_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, params, mask); + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; need_reflush = true; if (qp->cep) { erdma_cep_put(qp->cep); qp->cep = NULL; } - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + ret = erdma_modify_qp_state_to_stop(qp, params, mask); } break; - case ERDMA_QP_STATE_RTS: + case ERDMA_QPS_IWARP_RTS: drop_conn = 0; - if (attrs->state == ERDMA_QP_STATE_CLOSING || - attrs->state == ERDMA_QP_STATE_TERMINATE || - attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + if (params->state == ERDMA_QPS_IWARP_CLOSING || + params->state == ERDMA_QPS_IWARP_TERMINATE || + params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); drop_conn = 1; need_reflush = true; } @@ -159,17 +185,17 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, erdma_qp_cm_drop(qp); break; - case ERDMA_QP_STATE_TERMINATE: - if (attrs->state == ERDMA_QP_STATE_ERROR) - qp->attrs.state = ERDMA_QP_STATE_ERROR; + case ERDMA_QPS_IWARP_TERMINATE: + if (params->state == ERDMA_QPS_IWARP_ERROR) + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; break; - case ERDMA_QP_STATE_CLOSING: - if (attrs->state == ERDMA_QP_STATE_IDLE) { - qp->attrs.state = ERDMA_QP_STATE_IDLE; - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - qp->attrs.state = ERDMA_QP_STATE_ERROR; - } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + case ERDMA_QPS_IWARP_CLOSING: + if (params->state == ERDMA_QPS_IWARP_IDLE) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; + } else if (params->state != ERDMA_QPS_IWARP_CLOSING) { return -ECONNABORTED; } break; @@ -186,6 +212,98 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, return ret; } +static int modify_qp_cmd_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + enum erdma_qpa_mask_rocev2 attr_mask) +{ + struct erdma_cmdq_mod_qp_req_rocev2 req; + + memset(&req, 0, sizeof(req)); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + req.cfg0 |= FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, + params->state); + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + req.cfg1 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_DQPN_MASK, + params->dst_qpn); + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + req.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + erdma_set_av_cfg(&req.av_cfg, ¶ms->av); + + if (attr_mask & ERDMA_QPA_ROCEV2_SQ_PSN) + req.sq_psn = params->sq_psn; + + if (attr_mask & ERDMA_QPA_ROCEV2_RQ_PSN) + req.rq_psn = params->rq_psn; + + req.attr_mask = attr_mask; + + return erdma_post_cmd_wait(&qp->dev->cmdq, &req, 
sizeof(req), NULL, + NULL, true); +} + +static void erdma_reset_qp(struct erdma_qp *qp) +{ + qp->kern_qp.sq_pi = 0; + qp->kern_qp.sq_ci = 0; + qp->kern_qp.rq_pi = 0; + qp->kern_qp.rq_ci = 0; + memset(qp->kern_qp.swr_tbl, 0, qp->attrs.sq_size * sizeof(u64)); + memset(qp->kern_qp.rwr_tbl, 0, qp->attrs.rq_size * sizeof(u64)); + memset(qp->kern_qp.sq_buf, 0, qp->attrs.sq_size << SQEBB_SHIFT); + memset(qp->kern_qp.rq_buf, 0, qp->attrs.rq_size << RQE_SHIFT); + erdma_remove_cqes_of_qp(&qp->scq->ibcq, QP_ID(qp)); + if (qp->rcq != qp->scq) + erdma_remove_cqes_of_qp(&qp->rcq->ibcq, QP_ID(qp)); +} + +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask) +{ + struct erdma_dev *dev = to_edev(qp->ibqp.device); + int ret; + + ret = modify_qp_cmd_rocev2(qp, params, attr_mask); + if (ret) + return ret; + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + qp->attrs.rocev2.state = params->state; + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + qp->attrs.rocev2.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + qp->attrs.rocev2.dst_qpn = params->dst_qpn; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + memcpy(&qp->attrs.rocev2.av, &params->av, + sizeof(struct erdma_av)); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_RESET) + erdma_reset_qp(qp); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_ERROR) { + qp->flags |= ERDMA_QP_IN_FLUSHING; + mod_delayed_work(dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + } + + return 0; +} + static void erdma_qp_safe_free(struct kref *ref) { struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); @@ -282,17 +400,57 @@ static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, return 0; } +static void init_send_sqe_rc(struct erdma_qp *qp, struct erdma_send_sqe_rc *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } else if (wr->opcode == IB_WR_SEND_WITH_INV) { + op = ERDMA_OP_SEND_WITH_INV; + sqe->invalid_stag = cpu_to_le32(wr->ex.invalidate_rkey); + } + + *hw_op = op; +} + +static void init_send_sqe_ud(struct erdma_qp *qp, struct erdma_send_sqe_ud *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + const struct ib_ud_wr *uwr = ud_wr(wr); + struct erdma_ah *ah = to_eah(uwr->ah); + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } + + *hw_op = op; + + sqe->ahn = cpu_to_le32(ah->ahn); + sqe->dst_qpn = cpu_to_le32(uwr->remote_qpn); + /* Not allowed to send control qkey */ + if (uwr->remote_qkey & 0x80000000) + sqe->qkey = cpu_to_le32(qp->attrs.rocev2.qkey); + else + sqe->qkey = cpu_to_le32(uwr->remote_qkey); +} + static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, const struct ib_send_wr *send_wr) { u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; u32 idx = *pi & (qp->attrs.sq_size - 1); enum ib_wr_opcode op = send_wr->opcode; + struct erdma_send_sqe_rc *rc_send_sqe; + struct erdma_send_sqe_ud *ud_send_sqe; struct erdma_atomic_sqe *atomic_sqe; struct erdma_readreq_sqe *read_sqe; struct erdma_reg_mr_sqe *regmr_sge; struct erdma_write_sqe *write_sqe; - struct erdma_send_sqe *send_sqe; struct ib_rdma_wr *rdma_wr; struct erdma_sge *sge; __le32 *length_field; @@ -301,6 +459,10 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, u32 attrs; int ret; + if (qp->ibqp.qp_type != 
IB_QPT_RC && send_wr->opcode != IB_WR_SEND && + send_wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, SQEBB_SHIFT); @@ -374,21 +536,20 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: case IB_WR_SEND_WITH_INV: - send_sqe = (struct erdma_send_sqe *)entry; - hw_op = ERDMA_OP_SEND; - if (op == IB_WR_SEND_WITH_IMM) { - hw_op = ERDMA_OP_SEND_WITH_IMM; - send_sqe->imm_data = send_wr->ex.imm_data; - } else if (op == IB_WR_SEND_WITH_INV) { - hw_op = ERDMA_OP_SEND_WITH_INV; - send_sqe->invalid_stag = - cpu_to_le32(send_wr->ex.invalidate_rkey); + if (qp->ibqp.qp_type == IB_QPT_RC) { + rc_send_sqe = (struct erdma_send_sqe_rc *)entry; + init_send_sqe_rc(qp, rc_send_sqe, send_wr, &hw_op); + length_field = &rc_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_rc); + } else { + ud_send_sqe = (struct erdma_send_sqe_ud *)entry; + init_send_sqe_ud(qp, ud_send_sqe, send_wr, &hw_op); + length_field = &ud_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_ud); } - wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); - length_field = &send_sqe->length; - wqe_size = sizeof(struct erdma_send_sqe); - sgl_offset = wqe_size; + sgl_offset = wqe_size; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); break; case IB_WR_REG_MR: wqe_hdr |= @@ -492,7 +653,7 @@ static void kick_sq_db(struct erdma_qp *qp, u16 pi) u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); - *(u64 *)qp->kern_qp.sq_db_info = db_data; + *(u64 *)qp->kern_qp.sq_dbrec = db_data; writeq(db_data, qp->kern_qp.hw_sq_db); } @@ -557,7 +718,7 @@ static int erdma_post_recv_one(struct erdma_qp *qp, return -EINVAL; } - *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + *(u64 *)qp->kern_qp.rq_dbrec = *(u64 *)rqe; writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 23dfc01603f8..af36a8d2df22 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -55,6 +55,13 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) ilog2(qp->attrs.rq_size)) | FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + if (qp->ibqp.qp_type == IB_QPT_RC) + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_RC); + else + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_UD); + if (rdma_is_kernel_res(&qp->ibqp.res)) { u32 pgsz_range = ilog2(SZ_1M) - ERDMA_HW_PAGE_SHIFT; @@ -76,10 +83,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; - req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + - (qp->attrs.sq_size << SQEBB_SHIFT); - req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + - (qp->attrs.rq_size << RQE_SHIFT); + req.sq_dbrec_dma = qp->kern_qp.sq_dbrec_dma; + req.rq_dbrec_dma = qp->kern_qp.rq_dbrec_dma; } else { user_qp = &qp->user_qp; req.sq_cqn_mtt_cfg = FIELD_PREP( @@ -107,8 +112,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg, &req.rq_buf_addr, req.rq_mtt_entry); - req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; - req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + req.sq_dbrec_dma = 
user_qp->sq_dbrec_dma; + req.rq_dbrec_dma = user_qp->rq_dbrec_dma; if (uctx->ext_db.enable) { req.sq_cqn_mtt_cfg |= @@ -121,10 +126,10 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) } } - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, - &resp1); - if (!err) - qp->attrs.cookie = + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1, + true); + if (!err && erdma_device_iwarp(dev)) + qp->attrs.iwarp.cookie = FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); return err; @@ -180,7 +185,8 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) } post_cmd: - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) @@ -209,8 +215,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) ERDMA_MR_MTT_0LEVEL); req.first_page_offset = 0; - req.cq_db_info_addr = - cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + req.cq_dbrec_dma = cq->kern_cq.dbrec_dma; } else { mem = &cq->user_cq.qbuf_mem; req.cfg0 |= @@ -233,7 +238,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) mem->mtt_nents); req.first_page_offset = mem->page_offset; - req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + req.cq_dbrec_dma = cq->user_cq.dbrec_dma; if (uctx->ext_db.enable) { req.cfg1 |= FIELD_PREP( @@ -243,7 +248,8 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) } } - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) @@ -339,6 +345,11 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + if (erdma_device_rocev2(dev)) { + attr->max_pkeys = ERDMA_MAX_PKEYS; + attr->max_ah = dev->attrs.max_ah; + } + if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) attr->atomic_cap = IB_ATOMIC_GLOB; @@ -370,7 +381,14 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, memset(attr, 0, sizeof(*attr)); - attr->gid_tbl_len = 1; + if (erdma_device_iwarp(dev)) { + attr->gid_tbl_len = 1; + } else { + attr->gid_tbl_len = dev->attrs.max_gid; + attr->ip_gids = true; + attr->pkey_tbl_len = ERDMA_MAX_PKEYS; + } + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; attr->max_msg_sz = -1; @@ -380,14 +398,10 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); - if (netif_running(ndev) && netif_carrier_ok(ndev)) - dev->state = IB_PORT_ACTIVE; - else - dev->state = IB_PORT_DOWN; - attr->state = dev->state; + attr->state = ib_get_curr_port_state(ndev); out: - if (dev->state == IB_PORT_ACTIVE) + if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; @@ -398,8 +412,18 @@ out: int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, struct ib_port_immutable *port_immutable) { - port_immutable->gid_tbl_len = 1; - port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + struct erdma_dev *dev = to_edev(ibdev); + + if (erdma_device_iwarp(dev)) { + 
port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + port_immutable->gid_tbl_len = 1; + } else { + port_immutable->core_cap_flags = + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + port_immutable->max_mad_size = IB_MGMT_MAD_SIZE; + port_immutable->gid_tbl_len = dev->attrs.max_gid; + port_immutable->pkey_tbl_len = ERDMA_MAX_PKEYS; + } return 0; } @@ -441,7 +465,8 @@ static void erdma_flush_worker(struct work_struct *work) req.qpn = QP_ID(qp); req.sq_pi = qp->kern_qp.sq_pi; req.rq_pi = qp->kern_qp.rq_pi; - erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_qp_validate_cap(struct erdma_dev *dev, @@ -462,7 +487,11 @@ static int erdma_qp_validate_cap(struct erdma_dev *dev, static int erdma_qp_validate_attr(struct erdma_dev *dev, struct ib_qp_init_attr *attrs) { - if (attrs->qp_type != IB_QPT_RC) + if (erdma_device_iwarp(dev) && attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (erdma_device_rocev2(dev) && attrs->qp_type != IB_QPT_RC && + attrs->qp_type != IB_QPT_UD && attrs->qp_type != IB_QPT_GSI) return -EOPNOTSUPP; if (attrs->srq) @@ -482,16 +511,24 @@ static void free_kernel_qp(struct erdma_qp *qp) vfree(qp->kern_qp.rwr_tbl); if (qp->kern_qp.sq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.sq_size << SQEBB_SHIFT, + qp->kern_qp.sq_buf, + qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.sq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.sq_dbrec, + qp->kern_qp.sq_dbrec_dma); if (qp->kern_qp.rq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.rq_size << RQE_SHIFT, + qp->kern_qp.rq_buf, + qp->kern_qp.rq_buf_dma_addr); + + if (qp->kern_qp.rq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.rq_dbrec, + qp->kern_qp.rq_dbrec_dma); } static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, @@ -516,20 +553,27 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, if (!kqp->swr_tbl || !kqp->rwr_tbl) goto err_out; - size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + size = qp->attrs.sq_size << SQEBB_SHIFT; kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->sq_buf_dma_addr, GFP_KERNEL); if (!kqp->sq_buf) goto err_out; - size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->sq_dbrec_dma); + if (!kqp->sq_dbrec) + goto err_out; + + size = qp->attrs.rq_size << RQE_SHIFT; kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->rq_buf_dma_addr, GFP_KERNEL); if (!kqp->rq_buf) goto err_out; - kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); - kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + kqp->rq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->rq_dbrec_dma); + if (!kqp->rq_dbrec) + goto err_out; return 0; @@ -864,9 +908,9 @@ erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, } static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, - u64 va, u32 len, u64 db_info_va) + u64 va, u32 len, u64 dbrec_va) { - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; u32 rq_offset; int ret; @@ -889,14 +933,14 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, if (ret) 
goto put_sq_mtt; - ret = erdma_map_user_dbrecords(uctx, db_info_va, + ret = erdma_map_user_dbrecords(uctx, dbrec_va, &qp->user_qp.user_dbr_page, - &db_info_dma_addr); + &dbrec_dma); if (ret) goto put_rq_mtt; - qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; - qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + qp->user_qp.sq_dbrec_dma = dbrec_dma; + qp->user_qp.rq_dbrec_dma = dbrec_dma + ERDMA_DB_SIZE; return 0; @@ -925,7 +969,8 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, udata, struct erdma_ucontext, ibucontext); struct erdma_ureq_create_qp ureq; struct erdma_uresp_create_qp uresp; - int ret; + void *old_entry; + int ret = 0; ret = erdma_qp_validate_cap(dev, attrs); if (ret) @@ -944,9 +989,16 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, kref_init(&qp->ref); init_completion(&qp->safe_free); - ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, - XA_LIMIT(1, dev->attrs.max_qp - 1), - &dev->next_alloc_qpn, GFP_KERNEL); + if (qp->ibqp.qp_type == IB_QPT_GSI) { + old_entry = xa_store(&dev->qp_xa, 1, qp, GFP_KERNEL); + if (xa_is_err(old_entry)) + ret = xa_err(old_entry); + } else { + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + } + if (ret < 0) { ret = -ENOMEM; goto err_out; @@ -983,7 +1035,12 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_send_sge = attrs->cap.max_send_sge; qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; - qp->attrs.state = ERDMA_QP_STATE_IDLE; + + if (erdma_device_iwarp(qp->dev)) + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; + else + qp->attrs.rocev2.state = ERDMA_QPS_ROCEV2_RESET; + INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); ret = create_qp_cmd(uctx, qp); @@ -1207,7 +1264,8 @@ int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; @@ -1232,14 +1290,16 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_CQ); req.cqn = cq->cqn; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; if (rdma_is_kernel_res(&cq->ibcq.res)) { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } else { erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1256,13 +1316,20 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibqp->device); struct erdma_ucontext *ctx = rdma_udata_to_drv_context( udata, struct erdma_ucontext, ibucontext); - struct erdma_qp_attrs qp_attrs; - int err; struct erdma_cmdq_destroy_qp_req req; + union erdma_mod_qp_params params; + int err; down_write(&qp->state_lock); - qp_attrs.state = ERDMA_QP_STATE_ERROR; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + if (erdma_device_iwarp(dev)) { + params.iwarp.state = ERDMA_QPS_IWARP_ERROR; + erdma_modify_qp_state_iwarp(qp, 
&params.iwarp, + ERDMA_QPA_IWARP_STATE); + } else { + params.rocev2.state = ERDMA_QPS_ROCEV2_ERROR; + erdma_modify_qp_state_rocev2(qp, &params.rocev2, + ERDMA_QPA_ROCEV2_STATE); + } up_write(&qp->state_lock); cancel_delayed_work_sync(&qp->reflush_dwork); @@ -1271,7 +1338,8 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_QP); req.qpn = QP_ID(qp); - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; @@ -1279,16 +1347,7 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) wait_for_completion(&qp->safe_free); if (rdma_is_kernel_res(&qp->ibqp.res)) { - vfree(qp->kern_qp.swr_tbl); - vfree(qp->kern_qp.rwr_tbl); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + free_kernel_qp(qp); } else { put_mtt_entries(dev, &qp->user_qp.sq_mem); put_mtt_entries(dev, &qp->user_qp.rq_mem); @@ -1378,7 +1437,8 @@ static int alloc_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx, FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1, + true); if (ret) return ret; @@ -1413,7 +1473,8 @@ static void free_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx) req.rdb_off = ctx->ext_db.rdb_off; req.cdb_off = ctx->ext_db.cdb_off; - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) ibdev_err_ratelimited(&dev->ibdev, "free db resources failed %d", ret); @@ -1502,49 +1563,248 @@ void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) atomic_dec(&dev->num_ctx); } -static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, - [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, - [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, - [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, - [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, - [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, - [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +static void erdma_attr_to_av(const struct rdma_ah_attr *ah_attr, + struct erdma_av *av, u16 sport) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + av->port = rdma_ah_get_port_num(ah_attr); + av->sgid_index = grh->sgid_index; + av->hop_limit = grh->hop_limit; + av->traffic_class = grh->traffic_class; + av->sl = rdma_ah_get_sl(ah_attr); + + av->flow_label = grh->flow_label; + av->udp_sport = sport; + + ether_addr_copy(av->dmac, ah_attr->roce.dmac); + memcpy(av->dgid, grh->dgid.raw, ERDMA_ROCEV2_GID_SIZE); + + if (ipv6_addr_v4mapped((struct in6_addr *)&grh->dgid)) + av->ntype = ERDMA_NETWORK_TYPE_IPV4; + else + av->ntype = ERDMA_NETWORK_TYPE_IPV6; +} + +static void erdma_av_to_attr(struct erdma_av *av, struct rdma_ah_attr *ah_attr) +{ + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + + rdma_ah_set_sl(ah_attr, av->sl); + rdma_ah_set_port_num(ah_attr, av->port); + rdma_ah_set_ah_flags(ah_attr, IB_AH_GRH); + + rdma_ah_set_grh(ah_attr, NULL, av->flow_label, av->sgid_index, + av->hop_limit, av->traffic_class); + rdma_ah_set_dgid_raw(ah_attr, av->dgid); +} + +static int ib_qps_to_erdma_qps[ERDMA_PROTO_COUNT][IB_QPS_ERR + 
1] = { + [ERDMA_PROTO_IWARP] = { + [IB_QPS_RESET] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_INIT] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_RTR] = ERDMA_QPS_IWARP_RTR, + [IB_QPS_RTS] = ERDMA_QPS_IWARP_RTS, + [IB_QPS_SQD] = ERDMA_QPS_IWARP_CLOSING, + [IB_QPS_SQE] = ERDMA_QPS_IWARP_TERMINATE, + [IB_QPS_ERR] = ERDMA_QPS_IWARP_ERROR, + }, + [ERDMA_PROTO_ROCEV2] = { + [IB_QPS_RESET] = ERDMA_QPS_ROCEV2_RESET, + [IB_QPS_INIT] = ERDMA_QPS_ROCEV2_INIT, + [IB_QPS_RTR] = ERDMA_QPS_ROCEV2_RTR, + [IB_QPS_RTS] = ERDMA_QPS_ROCEV2_RTS, + [IB_QPS_SQD] = ERDMA_QPS_ROCEV2_SQD, + [IB_QPS_SQE] = ERDMA_QPS_ROCEV2_SQE, + [IB_QPS_ERR] = ERDMA_QPS_ROCEV2_ERROR, + }, }; -int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, - struct ib_udata *udata) +static int erdma_qps_to_ib_qps[ERDMA_PROTO_COUNT][ERDMA_QPS_ROCEV2_COUNT] = { + [ERDMA_PROTO_IWARP] = { + [ERDMA_QPS_IWARP_IDLE] = IB_QPS_INIT, + [ERDMA_QPS_IWARP_RTR] = IB_QPS_RTR, + [ERDMA_QPS_IWARP_RTS] = IB_QPS_RTS, + [ERDMA_QPS_IWARP_CLOSING] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_TERMINATE] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_ERROR] = IB_QPS_ERR, + }, + [ERDMA_PROTO_ROCEV2] = { + [ERDMA_QPS_ROCEV2_RESET] = IB_QPS_RESET, + [ERDMA_QPS_ROCEV2_INIT] = IB_QPS_INIT, + [ERDMA_QPS_ROCEV2_RTR] = IB_QPS_RTR, + [ERDMA_QPS_ROCEV2_RTS] = IB_QPS_RTS, + [ERDMA_QPS_ROCEV2_SQD] = IB_QPS_SQD, + [ERDMA_QPS_ROCEV2_SQE] = IB_QPS_SQE, + [ERDMA_QPS_ROCEV2_ERROR] = IB_QPS_ERR, + }, +}; + +static inline enum erdma_qps_iwarp ib_to_iwarp_qps(enum ib_qp_state state) { - struct erdma_qp_attrs new_attrs; - enum erdma_qp_attr_mask erdma_attr_mask = 0; - struct erdma_qp *qp = to_eqp(ibqp); - int ret = 0; + return ib_qps_to_erdma_qps[ERDMA_PROTO_IWARP][state]; +} - if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) - return -EOPNOTSUPP; +static inline enum erdma_qps_rocev2 ib_to_rocev2_qps(enum ib_qp_state state) +{ + return ib_qps_to_erdma_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static inline enum ib_qp_state iwarp_to_ib_qps(enum erdma_qps_iwarp state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_IWARP][state]; +} + +static inline enum ib_qp_state rocev2_to_ib_qps(enum erdma_qps_rocev2 state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static int erdma_check_qp_attrs(struct erdma_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + enum ib_qp_state cur_state, nxt_state; + struct erdma_dev *dev = qp->dev; + int ret = -EINVAL; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) { + ret = -EOPNOTSUPP; + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(&dev->ibdev, attr->port_num)) + goto out; + + if (erdma_device_rocev2(dev)) { + cur_state = (attr_mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : + rocev2_to_ib_qps(qp->attrs.rocev2.state); + + nxt_state = (attr_mask & IB_QP_STATE) ? 
attr->qp_state : + cur_state; + + if (!ib_modify_qp_is_ok(cur_state, nxt_state, qp->ibqp.qp_type, + attr_mask)) + goto out; + + if ((attr_mask & IB_QP_AV) && + erdma_check_gid_attr( + rdma_ah_read_grh(&attr->ah_attr)->sgid_attr)) + goto out; + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= ERDMA_MAX_PKEYS) + goto out; + } + + return 0; + +out: + return ret; +} + +static void erdma_init_mod_qp_params_rocev2( + struct erdma_qp *qp, struct erdma_mod_qp_params_rocev2 *params, + int *erdma_attr_mask, struct ib_qp_attr *attr, int ib_attr_mask) +{ + enum erdma_qpa_mask_rocev2 to_modify_attrs = 0; + enum erdma_qps_rocev2 cur_state, nxt_state; + u16 udp_sport; + + if (ib_attr_mask & IB_QP_CUR_STATE) + cur_state = ib_to_rocev2_qps(attr->cur_qp_state); + else + cur_state = qp->attrs.rocev2.state; + + if (ib_attr_mask & IB_QP_STATE) + nxt_state = ib_to_rocev2_qps(attr->qp_state); + else + nxt_state = cur_state; + + to_modify_attrs |= ERDMA_QPA_ROCEV2_STATE; + params->state = nxt_state; - memset(&new_attrs, 0, sizeof(new_attrs)); + if (ib_attr_mask & IB_QP_QKEY) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_QKEY; + params->qkey = attr->qkey; + } - if (attr_mask & IB_QP_STATE) { - new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; + if (ib_attr_mask & IB_QP_SQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_SQ_PSN; + params->sq_psn = attr->sq_psn; + } - erdma_attr_mask |= ERDMA_QP_ATTR_STATE; + if (ib_attr_mask & IB_QP_RQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_RQ_PSN; + params->rq_psn = attr->rq_psn; } + if (ib_attr_mask & IB_QP_DEST_QPN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_DST_QPN; + params->dst_qpn = attr->dest_qp_num; + } + + if (ib_attr_mask & IB_QP_AV) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_AV; + udp_sport = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + QP_ID(qp), params->dst_qpn); + erdma_attr_to_av(&attr->ah_attr, &params->av, udp_sport); + } + + *erdma_attr_mask = to_modify_attrs; +} + +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + union erdma_mod_qp_params params; + int ret = 0, erdma_attr_mask = 0; + down_write(&qp->state_lock); - ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); + ret = erdma_check_qp_attrs(qp, attr, attr_mask); + if (ret) + goto out; - up_write(&qp->state_lock); + if (erdma_device_iwarp(qp->dev)) { + if (attr_mask & IB_QP_STATE) { + erdma_attr_mask |= ERDMA_QPA_IWARP_STATE; + params.iwarp.state = ib_to_iwarp_qps(attr->qp_state); + } + ret = erdma_modify_qp_state_iwarp(qp, &params.iwarp, + erdma_attr_mask); + } else { + erdma_init_mod_qp_params_rocev2( + qp, &params.rocev2, &erdma_attr_mask, attr, attr_mask); + + ret = erdma_modify_qp_state_rocev2(qp, &params.rocev2, + erdma_attr_mask); + } + +out: + up_write(&qp->state_lock); return ret; } +static enum ib_qp_state query_qp_state(struct erdma_qp *qp) +{ + if (erdma_device_iwarp(qp->dev)) + return iwarp_to_ib_qps(qp->attrs.iwarp.state); + else + return rocev2_to_ib_qps(qp->attrs.rocev2.state); +} + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { - struct erdma_qp *qp; + struct erdma_cmdq_query_qp_req_rocev2 req; struct erdma_dev *dev; + struct erdma_qp *qp; + u64 resp0, resp1; + int ret; if (ibqp && qp_attr && qp_init_attr) { qp = to_eqp(ibqp); @@ -1571,6 +1831,38 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + if (erdma_device_rocev2(dev)) { + /* Query 
hardware to get some attributes */ + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_QP); + req.qpn = QP_ID(qp); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, + &resp1, true); + if (ret) + return ret; + + qp_attr->sq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp0); + qp_attr->rq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK, resp0); + qp_attr->qp_state = rocev2_to_ib_qps(FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK, resp0)); + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->sq_draining = FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK, resp0); + + qp_attr->pkey_index = 0; + qp_attr->dest_qp_num = qp->attrs.rocev2.dst_qpn; + + if (qp->ibqp.qp_type == IB_QPT_RC) + erdma_av_to_attr(&qp->attrs.rocev2.av, + &qp_attr->ah_attr); + } else { + qp_attr->qp_state = query_qp_state(qp); + qp_attr->cur_qp_state = qp_attr->qp_state; + } + return 0; } @@ -1588,7 +1880,7 @@ static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, &cq->user_cq.user_dbr_page, - &cq->user_cq.db_info_dma_addr); + &cq->user_cq.dbrec_dma); if (ret) put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1600,24 +1892,33 @@ static int erdma_init_kernel_cq(struct erdma_cq *cq) struct erdma_dev *dev = to_edev(cq->ibcq.device); cq->kern_cq.qbuf = - dma_alloc_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); if (!cq->kern_cq.qbuf) return -ENOMEM; - cq->kern_cq.db_record = - (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + cq->kern_cq.dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &cq->kern_cq.dbrec_dma); + if (!cq->kern_cq.dbrec) + goto err_out; + spin_lock_init(&cq->kern_cq.lock); /* use default cqdb addr */ cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + + return -ENOMEM; } int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct erdma_cq *cq = to_ecq(ibcq); struct erdma_dev *dev = to_edev(ibcq->device); unsigned int depth = attr->cqe; @@ -1676,9 +1977,10 @@ err_free_res: erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); } else { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } err_out_xa: @@ -1687,6 +1989,10 @@ err_out_xa: return ret; } +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) { struct erdma_cmdq_config_mtu_req req; @@ -1695,7 +2001,7 @@ void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) CMDQ_OPCODE_CONF_MTU); req.mtu = mtu; - erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, true); } void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) @@ -1765,7 +2071,8 @@ static int erdma_query_hw_stats(struct erdma_dev *dev, req.target_addr = dma_addr; req.target_length = ERDMA_HW_RESP_SIZE; - err = erdma_post_cmd_wait(&dev->cmdq, &req, 
sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) goto out; @@ -1798,3 +2105,159 @@ int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, return stats->num_counters; } + +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int erdma_set_gid(struct erdma_dev *dev, u8 op, u32 idx, + const union ib_gid *gid) +{ + struct erdma_cmdq_set_gid_req req; + u8 ntype; + + req.cfg = FIELD_PREP(ERDMA_CMD_SET_GID_SGID_IDX_MASK, idx) | + FIELD_PREP(ERDMA_CMD_SET_GID_OP_MASK, op); + + if (op == ERDMA_SET_GID_OP_ADD) { + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + ntype = ERDMA_NETWORK_TYPE_IPV4; + else + ntype = ERDMA_NETWORK_TYPE_IPV6; + + req.cfg |= FIELD_PREP(ERDMA_CMD_SET_GID_NTYPE_MASK, ntype); + + memcpy(&req.gid, gid, ERDMA_ROCEV2_GID_SIZE); + } + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_SET_GID); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); +} + +int erdma_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct erdma_dev *dev = to_edev(attr->device); + int ret; + + ret = erdma_check_gid_attr(attr); + if (ret) + return ret; + + return erdma_set_gid(dev, ERDMA_SET_GID_OP_ADD, attr->index, + &attr->gid); +} + +int erdma_del_gid(const struct ib_gid_attr *attr, void **context) +{ + return erdma_set_gid(to_edev(attr->device), ERDMA_SET_GID_OP_DEL, + attr->index, NULL); +} + +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + if (index >= ERDMA_MAX_PKEYS) + return -EINVAL; + + *pkey = ERDMA_DEFAULT_PKEY; + return 0; +} + +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av) +{ + av_cfg->cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_AV_FL_MASK, av->flow_label) | + FIELD_PREP(ERDMA_CMD_CREATE_AV_NTYPE_MASK, av->ntype); + + av_cfg->traffic_class = av->traffic_class; + av_cfg->hop_limit = av->hop_limit; + av_cfg->sl = av->sl; + + av_cfg->udp_sport = av->udp_sport; + av_cfg->sgid_index = av->sgid_index; + + ether_addr_copy(av_cfg->dmac, av->dmac); + memcpy(av_cfg->dgid, av->dgid, ERDMA_ROCEV2_GID_SIZE); +} + +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + const struct ib_global_route *grh = + rdma_ah_read_grh(init_attr->ah_attr); + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_create_ah_req req; + u32 udp_sport; + int ret; + + ret = erdma_check_gid_attr(grh->sgid_attr); + if (ret) + return ret; + + ret = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_AH]); + if (ret < 0) + return ret; + + ah->ahn = ret; + + if (grh->flow_label) + udp_sport = rdma_flow_label_to_udp_sport(grh->flow_label); + else + udp_sport = + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN + (ah->ahn & 0x3FFF); + + erdma_attr_to_av(init_attr->ah_attr, &ah->av, udp_sport); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + erdma_set_av_cfg(&req.av_cfg, &ah->av); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); + if (ret) { + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + return ret; + } + + return 0; +} + +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct 
erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_destroy_ah_req req; + int ret; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + flags & RDMA_DESTROY_AH_SLEEPABLE); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + + return 0; +} + +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct erdma_ah *ah = to_eah(ibah); + + memset(ah_attr, 0, sizeof(*ah_attr)); + erdma_av_to_attr(&ah->av, ah_attr); + + return 0; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index db6018529ccc..f9408ccc8bad 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -136,12 +136,31 @@ struct erdma_user_dbrecords_page { int refcnt; }; +struct erdma_av { + u8 port; + u8 hop_limit; + u8 traffic_class; + u8 sl; + u8 sgid_index; + u16 udp_sport; + u32 flow_label; + u8 dmac[ETH_ALEN]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; + enum erdma_network_type ntype; +}; + +struct erdma_ah { + struct ib_ah ibah; + struct erdma_av av; + u32 ahn; +}; + struct erdma_uqp { struct erdma_mem sq_mem; struct erdma_mem rq_mem; - dma_addr_t sq_db_info_dma_addr; - dma_addr_t rq_db_info_dma_addr; + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; struct erdma_user_dbrecords_page *user_dbr_page; @@ -167,39 +186,100 @@ struct erdma_kqp { void *rq_buf; dma_addr_t rq_buf_dma_addr; - void *sq_db_info; - void *rq_db_info; + void *sq_dbrec; + void *rq_dbrec; + + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; u8 sig_all; }; -enum erdma_qp_state { - ERDMA_QP_STATE_IDLE = 0, - ERDMA_QP_STATE_RTR = 1, - ERDMA_QP_STATE_RTS = 2, - ERDMA_QP_STATE_CLOSING = 3, - ERDMA_QP_STATE_TERMINATE = 4, - ERDMA_QP_STATE_ERROR = 5, - ERDMA_QP_STATE_UNDEF = 7, - ERDMA_QP_STATE_COUNT = 8 +enum erdma_qps_iwarp { + ERDMA_QPS_IWARP_IDLE = 0, + ERDMA_QPS_IWARP_RTR = 1, + ERDMA_QPS_IWARP_RTS = 2, + ERDMA_QPS_IWARP_CLOSING = 3, + ERDMA_QPS_IWARP_TERMINATE = 4, + ERDMA_QPS_IWARP_ERROR = 5, + ERDMA_QPS_IWARP_UNDEF = 6, + ERDMA_QPS_IWARP_COUNT = 7, +}; + +enum erdma_qpa_mask_iwarp { + ERDMA_QPA_IWARP_STATE = (1 << 0), + ERDMA_QPA_IWARP_LLP_HANDLE = (1 << 2), + ERDMA_QPA_IWARP_ORD = (1 << 3), + ERDMA_QPA_IWARP_IRD = (1 << 4), + ERDMA_QPA_IWARP_SQ_SIZE = (1 << 5), + ERDMA_QPA_IWARP_RQ_SIZE = (1 << 6), + ERDMA_QPA_IWARP_MPA = (1 << 7), + ERDMA_QPA_IWARP_CC = (1 << 8), }; -enum erdma_qp_attr_mask { - ERDMA_QP_ATTR_STATE = (1 << 0), - ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), - ERDMA_QP_ATTR_ORD = (1 << 3), - ERDMA_QP_ATTR_IRD = (1 << 4), - ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), - ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), - ERDMA_QP_ATTR_MPA = (1 << 7) +enum erdma_qps_rocev2 { + ERDMA_QPS_ROCEV2_RESET = 0, + ERDMA_QPS_ROCEV2_INIT = 1, + ERDMA_QPS_ROCEV2_RTR = 2, + ERDMA_QPS_ROCEV2_RTS = 3, + ERDMA_QPS_ROCEV2_SQD = 4, + ERDMA_QPS_ROCEV2_SQE = 5, + ERDMA_QPS_ROCEV2_ERROR = 6, + ERDMA_QPS_ROCEV2_COUNT = 7, +}; + +enum erdma_qpa_mask_rocev2 { + ERDMA_QPA_ROCEV2_STATE = (1 << 0), + ERDMA_QPA_ROCEV2_QKEY = (1 << 1), + ERDMA_QPA_ROCEV2_AV = (1 << 2), + ERDMA_QPA_ROCEV2_SQ_PSN = (1 << 3), + ERDMA_QPA_ROCEV2_RQ_PSN = (1 << 4), + ERDMA_QPA_ROCEV2_DST_QPN = (1 << 5), }; enum erdma_qp_flags { ERDMA_QP_IN_FLUSHING = (1 << 0), }; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + +struct erdma_mod_qp_params_iwarp { + enum erdma_qps_iwarp state; + enum erdma_cc_alg cc; + u8 qp_type; + u8 
pd_len; + u32 irq_size; + u32 orq_size; +}; + +struct erdma_qp_attrs_iwarp { + enum erdma_qps_iwarp state; + u32 cookie; +}; + +struct erdma_mod_qp_params_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 sq_psn; + u32 rq_psn; + u32 dst_qpn; + struct erdma_av av; +}; + +union erdma_mod_qp_params { + struct erdma_mod_qp_params_iwarp iwarp; + struct erdma_mod_qp_params_rocev2 rocev2; +}; + +struct erdma_qp_attrs_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 dst_qpn; + struct erdma_av av; +}; + struct erdma_qp_attrs { - enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ u32 sq_size; u32 rq_size; @@ -207,11 +287,10 @@ struct erdma_qp_attrs { u32 irq_size; u32 max_send_sge; u32 max_recv_sge; - u32 cookie; -#define ERDMA_QP_ACTIVE 0 -#define ERDMA_QP_PASSIVE 1 - u8 qp_type; - u8 pd_len; + union { + struct erdma_qp_attrs_iwarp iwarp; + struct erdma_qp_attrs_rocev2 rocev2; + }; }; struct erdma_qp { @@ -246,13 +325,14 @@ struct erdma_kcq_info { spinlock_t lock; u8 __iomem *db; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_ucq_info { struct erdma_mem qbuf_mem; struct erdma_user_dbrecords_page *user_dbr_page; - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; }; struct erdma_cq { @@ -282,11 +362,25 @@ static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) void erdma_qp_get(struct erdma_qp *qp); void erdma_qp_put(struct erdma_qp *qp); -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask); +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask); +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask); void erdma_qp_llp_close(struct erdma_qp *qp); void erdma_qp_cm_drop(struct erdma_qp *qp); +static inline bool erdma_device_iwarp(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_IWARP; +} + +static inline bool erdma_device_rocev2(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_ROCEV2; +} + static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) { return container_of(ibctx, struct erdma_ucontext, ibucontext); @@ -312,6 +406,21 @@ static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) return container_of(ibcq, struct erdma_cq, ibcq); } +static inline struct erdma_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct erdma_ah, ibah); +} + +static inline int erdma_check_gid_attr(const struct ib_gid_attr *attr) +{ + u8 ntype = rdma_gid_attr_network_type(attr); + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) + return -EINVAL; + + return 0; +} + static inline struct erdma_user_mmap_entry * to_emmap(struct rdma_user_mmap_entry *ibmmap) { @@ -325,7 +434,7 @@ int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, int erdma_get_port_immutable(struct ib_device *dev, u32 port, struct ib_port_immutable *ib_port_immutable); int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *data); + struct uverbs_attr_bundle *attrs); int erdma_query_port(struct ib_device *dev, u32 port, struct ib_port_attr *attr); int erdma_query_gid(struct ib_device *dev, u32 port, int idx, @@ -340,6 +449,7 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *data); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); 
+void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 virt, int access, struct ib_udata *udata); @@ -355,6 +465,7 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn); struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg); int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, @@ -365,5 +476,15 @@ struct rdma_hw_stats *erdma_alloc_hw_port_stats(struct ib_device *device, u32 port_num); int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index); +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, + u32 port_num); +int erdma_add_gid(const struct ib_gid_attr *attr, void **context); +int erdma_del_gid(const struct ib_gid_attr *attr, void **context); +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av); +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags); +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); #endif diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c index a3c53be4072c..9b508eaf441d 100644 --- a/drivers/infiniband/hw/hfi1/aspm.c +++ b/drivers/infiniband/hw/hfi1/aspm.c @@ -191,7 +191,7 @@ void aspm_disable_all(struct hfi1_devdata *dd) for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { rcd = hfi1_rcd_get_by_index(dd, i); if (rcd) { - del_timer_sync(&rcd->aspm_timer); + timer_delete_sync(&rcd->aspm_timer); spin_lock_irqsave(&rcd->aspm_lock, flags); rcd->aspm_intr_enable = false; spin_unlock_irqrestore(&rcd->aspm_lock, flags); diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 78f27f7b4203..e908f529335d 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -251,7 +251,7 @@ struct flag_table { /* * CCE Error flags. 
*/ -static struct flag_table cce_err_status_flags[] = { +static const struct flag_table cce_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("CceCsrParityErr", CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), /* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", @@ -341,7 +341,7 @@ static struct flag_table cce_err_status_flags[] = { * Misc Error flags */ #define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK -static struct flag_table misc_err_status_flags[] = { +static const struct flag_table misc_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), /* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), /* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), @@ -360,7 +360,7 @@ static struct flag_table misc_err_status_flags[] = { /* * TXE PIO Error flags and consequences */ -static struct flag_table pio_err_status_flags[] = { +static const struct flag_table pio_err_status_flags[] = { /* 0*/ FLAG_ENTRY("PioWriteBadCtxt", SEC_WRITE_DROPPED, SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), @@ -502,7 +502,7 @@ static struct flag_table pio_err_status_flags[] = { /* * TXE SDMA Error flags */ -static struct flag_table sdma_err_status_flags[] = { +static const struct flag_table sdma_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), /* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", @@ -530,7 +530,7 @@ static struct flag_table sdma_err_status_flags[] = { * TXE Egress Error flags */ #define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK -static struct flag_table egress_err_status_flags[] = { +static const struct flag_table egress_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), /* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), /* 2 reserved */ @@ -631,7 +631,7 @@ static struct flag_table egress_err_status_flags[] = { * TXE Egress Error Info flags */ #define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK -static struct flag_table egress_err_info_flags[] = { +static const struct flag_table egress_err_info_flags[] = { /* 0*/ FLAG_ENTRY0("Reserved", 0ull), /* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), /* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), @@ -680,7 +680,7 @@ static struct flag_table egress_err_info_flags[] = { * TXE Send error flags */ #define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK -static struct flag_table send_err_status_flags[] = { +static const struct flag_table send_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)), /* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), /* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) @@ -689,7 +689,7 @@ static struct flag_table send_err_status_flags[] = { /* * TXE Send Context Error flags and consequences */ -static struct flag_table sc_err_status_flags[] = { +static const struct flag_table sc_err_status_flags[] = { /* 0*/ FLAG_ENTRY("InconsistentSop", SEC_PACKET_DROPPED | SEC_SC_HALTED, SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), @@ -712,7 +712,7 @@ static struct flag_table sc_err_status_flags[] = { * RXE Receive Error flags */ #define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK -static struct flag_table rxe_err_status_flags[] = { +static const struct flag_table rxe_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), /* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), /* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), @@ -847,7 +847,7 @@ static struct flag_table 
rxe_err_status_flags[] = { * DCC Error Flags */ #define DCCE(name) DCC_ERR_FLG_##name##_SMASK -static struct flag_table dcc_err_flags[] = { +static const struct flag_table dcc_err_flags[] = { FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), @@ -900,7 +900,7 @@ static struct flag_table dcc_err_flags[] = { * LCB error flags */ #define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK -static struct flag_table lcb_err_flags[] = { +static const struct flag_table lcb_err_flags[] = { /* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), /* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), /* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), @@ -943,7 +943,7 @@ static struct flag_table lcb_err_flags[] = { * DC8051 Error Flags */ #define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK -static struct flag_table dc8051_err_flags[] = { +static const struct flag_table dc8051_err_flags[] = { FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), @@ -962,7 +962,7 @@ static struct flag_table dc8051_err_flags[] = { * * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. */ -static struct flag_table dc8051_info_err_flags[] = { +static const struct flag_table dc8051_info_err_flags[] = { FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), @@ -986,7 +986,7 @@ static struct flag_table dc8051_info_err_flags[] = { * * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. */ -static struct flag_table dc8051_info_host_msg_flags[] = { +static const struct flag_table dc8051_info_host_msg_flags[] = { FLAG_ENTRY0("Host request done", 0x0001), FLAG_ENTRY0("BC PWR_MGM message", 0x0002), FLAG_ENTRY0("BC SMA message", 0x0004), @@ -5275,7 +5275,7 @@ done: * the buffer. End in '*' if the buffer is too short. 
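The flag tables above are only ever read by the decoder, so marking them const lets the toolchain place them in read-only data; flag_string() correspondingly takes a const table pointer in the next hunk. A simplified sketch of the same table-plus-decoder pattern (the names here are illustrative, not the hfi1 implementation):

#include <linux/kernel.h>
#include <linux/bits.h>

struct flag_entry {
	u64 mask;
	const char *name;
};

/* a const table never changes at runtime and can live in rodata */
static const struct flag_entry example_err_flags[] = {
	{ BIT_ULL(0), "CSR_PARITY" },
	{ BIT_ULL(1), "CSR_READ_BAD_ADDR" },
	{ BIT_ULL(2), "CSR_WRITE_BAD_ADDR" },
};

/* decode set bits into a comma-separated list, in the spirit of flag_string() */
static void decode_flags(char *buf, size_t len, u64 flags,
			 const struct flag_entry *table, size_t n)
{
	size_t pos = 0;
	size_t i;

	buf[0] = '\0';
	for (i = 0; i < n && pos < len; i++) {
		if (!(flags & table[i].mask))
			continue;
		pos += scnprintf(buf + pos, len - pos, "%s%s",
				 pos ? "," : "", table[i].name);
	}
}
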
*/ static char *flag_string(char *buf, int buf_len, u64 flags, - struct flag_table *table, int table_size) + const struct flag_table *table, int table_size) { char extra[32]; char *p = buf; @@ -5576,7 +5576,7 @@ static int init_rcverr(struct hfi1_devdata *dd) static void free_rcverr(struct hfi1_devdata *dd) { if (dd->rcverr_timer.function) - del_timer_sync(&dd->rcverr_timer); + timer_delete_sync(&dd->rcverr_timer); } static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) @@ -12308,7 +12308,7 @@ static void free_cntrs(struct hfi1_devdata *dd) int i; if (dd->synth_stats_timer.function) - del_timer_sync(&dd->synth_stats_timer); + timer_delete_sync(&dd->synth_stats_timer); cancel_work_sync(&dd->update_cntr_work); ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) { @@ -12882,22 +12882,6 @@ u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) } } -/* return the OPA port logical state name */ -const char *opa_lstate_name(u32 lstate) -{ - static const char * const port_logical_names[] = { - "PORT_NOP", - "PORT_DOWN", - "PORT_INIT", - "PORT_ARMED", - "PORT_ACTIVE", - "PORT_ACTIVE_DEFER", - }; - if (lstate < ARRAY_SIZE(port_logical_names)) - return port_logical_names[lstate]; - return "unknown"; -} - /* return the OPA port physical state name */ const char *opa_pstate_name(u32 pstate) { @@ -12956,8 +12940,6 @@ static void update_statusp(struct hfi1_pportdata *ppd, u32 state) break; } } - dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", - opa_lstate_name(state), state); } /** @@ -13235,7 +13217,7 @@ int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) /* * Clear all interrupt sources on the chip. */ -void clear_all_interrupts(struct hfi1_devdata *dd) +static void clear_all_interrupts(struct hfi1_devdata *dd) { int i; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index d861aa8fc640..6992f6d40255 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -771,7 +771,6 @@ int is_bx(struct hfi1_devdata *dd); bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); -const char *opa_lstate_name(u32 lstate); const char *opa_pstate_name(u32 pstate); u32 driver_pstate(struct hfi1_pportdata *ppd); u32 driver_lstate(struct hfi1_pportdata *ppd); @@ -1404,7 +1403,6 @@ irqreturn_t receive_context_interrupt_napi(int irq, void *data); int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); void init_qsfp_int(struct hfi1_devdata *dd); -void clear_all_interrupts(struct hfi1_devdata *dd); void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); void reset_interrupts(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 37a6794885d3..3da90f2eb8e7 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -968,7 +968,7 @@ static bool __set_armed_to_active(struct hfi1_packet *packet) if (hwstate != IB_PORT_ACTIVE) { dd_dev_info(packet->rcd->dd, "Unexpected link state %s\n", - opa_lstate_name(hwstate)); + ib_port_state_to_str(hwstate)); return false; } @@ -1303,7 +1303,7 @@ void shutdown_led_override(struct hfi1_pportdata *ppd) */ smp_rmb(); if (atomic_read(&ppd->led_override_timer_active)) { - del_timer_sync(&ppd->led_override_timer); + 
timer_delete_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); /* Ensure the atomic_set is visible to all CPUs */ smp_wmb(); diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index 35d2382ee618..ec9ee59fcf0c 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -203,7 +203,6 @@ static const struct file_operations __fault_opcodes_fops = { .open = fault_opcodes_open, .read = fault_opcodes_read, .write = fault_opcodes_write, - .llseek = no_llseek }; void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4b3f1cb125fc..cb630551cf1a 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -2339,20 +2339,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) -/* - * this is used for formatting hw error messages... - */ -struct hfi1_hwerror_msgs { - u64 mask; - const char *msg; - size_t sz; -}; - -/* in intr.c... */ -void hfi1_format_hwerrors(u64 hwerrs, - const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t lmsg); - #define USER_OPCODE_CHECK_VAL 0xC0 #define USER_OPCODE_CHECK_MASK 0xC0 #define OPCODE_CHECK_VAL_DISABLED 0x0 @@ -2425,7 +2411,7 @@ static inline bool hfi1_need_drop(struct hfi1_devdata *dd) int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) -#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev) static inline void hfi1_update_ah_attr(struct ib_device *ibdev, struct rdma_ah_attr *attr) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index cbac4a442d9e..b35f92e7d865 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -635,12 +635,11 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cca_timer_lock); for (i = 0; i < OPA_MAX_SLS; i++) { - hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); ppd->cca_timer[i].ppd = ppd; ppd->cca_timer[i].sl = i; ppd->cca_timer[i].ccti = 0; - ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; @@ -986,7 +985,7 @@ static void stop_timers(struct hfi1_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (ppd->led_override_timer.function) { - del_timer_sync(&ppd->led_override_timer); + timer_delete_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); } } diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c index 3737f632d62a..d8dd1a599631 100644 --- a/drivers/infiniband/hw/hfi1/intr.c +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -47,37 +47,6 @@ static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) hfi1_event_pkey_change(ppd->dd, ppd->port); } -/** - * format_hwmsg - format a single hwerror message - * @msg: message buffer - * @msgl: length of message buffer - * @hwmsg: message to add to message buffer - */ -static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) -{ - strlcat(msg, "[", msgl); - strlcat(msg, hwmsg, msgl); - strlcat(msg, 
"]", msgl); -} - -/** - * hfi1_format_hwerrors - format hardware error messages for display - * @hwerrs: hardware errors bit vector - * @hwerrmsgs: hardware error descriptions - * @nhwerrmsgs: number of hwerrmsgs - * @msg: message buffer - * @msgl: message buffer length - */ -void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t msgl) -{ - int i; - - for (i = 0; i < nhwerrmsgs; i++) - if (hwerrs & hwerrmsgs[i].mask) - format_hwmsg(msg, msgl, hwerrmsgs[i].msg); -} - static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) { struct ib_event event; diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 49805a24bb0a..7259f4f55700 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -92,7 +92,7 @@ struct iowait_work { * * The lock field is used by waiters to record * the seqlock_t that guards the list head. - * Waiters explicity know that, but the destroy + * Waiters explicitly know that, but the destroy * code that unwaits QPs does not. */ struct iowait { diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 5d814afdf7f3..7c9d5203002b 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -21,36 +21,25 @@ static int hfi1_ipoib_dev_init(struct net_device *dev) struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); int ret; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = priv->netdev_ops->ndo_init(dev); if (ret) - goto out_ret; + return ret; ret = hfi1_netdev_add_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr), dev); if (ret < 0) { priv->netdev_ops->ndo_uninit(dev); - goto out_ret; + return ret; } return 0; -out_ret: - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); - free_percpu(dev->tstats); - dev->tstats = NULL; - hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); priv->netdev_ops->ndo_uninit(dev); @@ -107,7 +96,6 @@ static const struct net_device_ops hfi1_ipoib_netdev_ops = { .ndo_uninit = hfi1_ipoib_dev_uninit, .ndo_open = hfi1_ipoib_dev_open, .ndo_stop = hfi1_ipoib_dev_stop, - .ndo_get_stats64 = dev_get_tstats64, }; static int hfi1_ipoib_mcast_attach(struct net_device *dev, @@ -173,9 +161,6 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) hfi1_ipoib_txreq_deinit(priv); hfi1_ipoib_rxq_deinit(priv->netdev); - - free_percpu(dev->tstats); - dev->tstats = NULL; } static void hfi1_ipoib_set_id(struct net_device *dev, int id) @@ -234,6 +219,7 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; + netdev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; return 0; } diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index a9883295f4af..b39f63ce6dfc 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1160,8 +1160,8 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, if (ret == HFI_TRANSITION_DISALLOWED || ret == HFI_TRANSITION_UNDEFINED) { pr_warn("invalid logical state transition %s -> %s\n", - opa_lstate_name(logical_old), - opa_lstate_name(logical_new)); + ib_port_state_to_str(logical_old), + ib_port_state_to_str(logical_new)); return 
ret; } diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index b6e3141253c4..d6dde762921a 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -124,7 +124,6 @@ struct opa_mad_notice_attr { } __packed ntc_2048; }; - u8 class_data[]; }; #define IB_VLARB_LOWPRI_0_31 1 diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index d4a6acad0e65..67a5c410fb5e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -40,7 +40,7 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) } int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler) { diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 8e5d05454d70..3fa50dd64db6 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -42,7 +42,7 @@ struct mmu_rb_handler { /* Begin on a new cachline boundary here */ struct rb_root_cached root ____cacheline_aligned_in_smp; void *ops_arg; - struct mmu_rb_ops *ops; + const struct mmu_rb_ops *ops; struct list_head lru_list; struct work_struct del_work; struct list_head del_list; @@ -51,7 +51,7 @@ struct mmu_rb_handler { }; int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler); void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index 8aa074670a9c..07c8f77c9181 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -49,7 +49,7 @@ struct hfi1_netdev_rxq { * When 0 receive queues will be freed. 
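The mmu_rb_ops constification follows the same idea: the handler only calls through the ops pointers, so it can accept a const table that lives in rodata. A self-contained sketch under assumed, illustrative names (example_node, example_ops and example_register are not hfi1 symbols):

#include <linux/types.h>

struct example_node {
	unsigned long addr;
	unsigned long len;
};

struct example_ops {
	bool (*filter)(struct example_node *node, unsigned long addr,
		       unsigned long len);
	void (*remove)(void *arg, struct example_node *node);
};

/* the registration path only stores and calls through the pointers,
 * so it can take 'const struct example_ops *' */
int example_register(void *arg, const struct example_ops *ops);

static bool example_filter(struct example_node *node, unsigned long addr,
			   unsigned long len)
{
	return node->addr == addr && node->len == len;
}

static void example_remove(void *arg, struct example_node *node)
{
}

static const struct example_ops example_rb_ops = {
	.filter = example_filter,
	.remove = example_remove,
};
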
*/ struct hfi1_netdev_rx { - struct net_device rx_napi; + struct net_device *rx_napi; struct hfi1_devdata *dd; struct hfi1_netdev_rxq *rxq; int num_rx_q; diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 720d4c85c9c9..8608044203bb 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -188,7 +188,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) int i; int rc; struct hfi1_devdata *dd = rx->dd; - struct net_device *dev = &rx->rx_napi; + struct net_device *dev = rx->rx_napi; rx->num_rx_q = dd->num_netdev_contexts; rx->rxq = kcalloc_node(rx->num_rx_q, sizeof(*rx->rxq), @@ -360,7 +360,11 @@ int hfi1_alloc_rx(struct hfi1_devdata *dd) if (!rx) return -ENOMEM; rx->dd = dd; - init_dummy_netdev(&rx->rx_napi); + rx->rx_napi = alloc_netdev_dummy(0); + if (!rx->rx_napi) { + kfree(rx); + return -ENOMEM; + } xa_init(&rx->dev_tbl); atomic_set(&rx->enabled, 0); @@ -374,6 +378,7 @@ void hfi1_free_rx(struct hfi1_devdata *dd) { if (dd->netdev_rx) { dd_dev_info(dd, "hfi1 rx freed\n"); + free_netdev(dd->netdev_rx->rx_napi); kfree(dd->netdev_rx); dd->netdev_rx = NULL; } diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 119ec2f1382b..7133964749f8 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1207,14 +1207,11 @@ retry: (u32)lnkctl2); /* only write to parent if target is not as high as ours */ if ((lnkctl2 & PCI_EXP_LNKCTL2_TLS) < target_vector) { - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(parent, - PCI_EXP_LNKCTL2, lnkctl2); + ret = pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change parent PCI target speed\n"); return_error = 1; goto done; } @@ -1223,22 +1220,11 @@ retry: } dd_dev_info(dd, "%s: setting target link speed\n", __func__); - ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + ret = pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return_error = 1; - goto done; - } - - dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, - (u32)lnkctl2); - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change device PCI target speed\n"); return_error = 1; goto done; } diff --git a/drivers/infiniband/hw/hfi1/pin_system.c b/drivers/infiniband/hw/hfi1/pin_system.c index 384f722093e0..cce56134519b 100644 --- a/drivers/infiniband/hw/hfi1/pin_system.c +++ b/drivers/infiniband/hw/hfi1/pin_system.c @@ -26,7 +26,7 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); -static struct mmu_rb_ops sdma_rb_ops = { +static const struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, .evict = sdma_rb_evict, .remove = sdma_rb_remove, diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 5a91cbda4aee..764286da2ce8 100644 --- 
a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1361,16 +1361,6 @@ void sc_flush(struct send_context *sc) sc_wait_for_packet_egress(sc, 1); } -/* drop all packets on the context, no waiting until they are sent */ -void sc_drop(struct send_context *sc) -{ - if (!sc) - return; - - dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", - __func__, sc->sw_index, sc->hw_context); -} - /* * Start the software reaction to a context halt or SPC freeze: * - mark the context as halted or frozen diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h index d07cc6ea7c63..ab0f9a3a8d12 100644 --- a/drivers/infiniband/hw/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -246,7 +246,6 @@ void sc_disable(struct send_context *sc); int sc_restart(struct send_context *sc); void sc_return_credits(struct send_context *sc); void sc_flush(struct send_context *sc); -void sc_drop(struct send_context *sc); void sc_stop(struct send_context *sc, int bit); struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, pio_release_cb cb, void *arg); diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 52cce1c8b76a..3b7842a7f634 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -405,26 +405,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, } /* - * Perform a stand-alone single QSFP write. Acquire the resource, do the - * write, then release the resource. - */ -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len) -{ - struct hfi1_devdata *dd = ppd->dd; - u32 resource = qsfp_resource(dd); - int ret; - - ret = acquire_chip_resource(dd, resource, QSFP_WAIT); - if (ret) - return ret; - ret = qsfp_write(ppd, target, addr, bp, len); - release_chip_resource(dd, resource); - - return ret; -} - -/* * Access page n, offset m of QSFP memory as defined by SFF 8636 * by reading @addr = ((256 * n) + m) * diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h index df1389bad86b..5c59d53fcb63 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.h +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -195,8 +195,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len); int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); struct hfi1_asic_data; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index b67d23b1f286..16a749d16ee9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1521,24 +1521,6 @@ void sdma_all_running(struct hfi1_devdata *dd) } /** - * sdma_all_idle() - called when the link goes down - * @dd: hfi1_devdata - * - * This routine moves all engines to the idle state. 
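The pcie.c hunk a little earlier replaces an open-coded read/mask/write of Link Control 2 with pcie_capability_clear_and_set_word(), which performs all three steps in one call and returns a single error code. The helper in isolation (set_target_link_speed is an illustrative wrapper, not a kernel function):

#include <linux/pci.h>

/* clear the Target Link Speed field and set it to 'target_vector' in one call */
static int set_target_link_speed(struct pci_dev *pdev, u16 target_vector)
{
	return pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL2,
						  PCI_EXP_LNKCTL2_TLS,
						  target_vector);
}
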
- */ -void sdma_all_idle(struct hfi1_devdata *dd) -{ - struct sdma_engine *sde; - unsigned int i; - - /* idle all engines */ - for (i = 0; i < dd->num_sdma; ++i) { - sde = &dd->per_sdma[i]; - sdma_process_event(sde, sdma_event_e70_go_idle); - } -} - -/** * sdma_start() - called to kick off state processing for all engines * @dd: hfi1_devdata * @@ -1575,7 +1557,7 @@ void sdma_exit(struct hfi1_devdata *dd) sde->this_idx); sdma_process_event(sde, sdma_event_e00_go_hw_down); - del_timer_sync(&sde->err_progress_check_timer); + timer_delete_sync(&sde->err_progress_check_timer); /* * This waits for the state machine to exit so it is not diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d77246b48434..91dfd5d0c419 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -373,7 +373,6 @@ void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); void sdma_all_running(struct hfi1_devdata *dd); -void sdma_all_idle(struct hfi1_devdata *dd); void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); void sdma_freeze(struct hfi1_devdata *dd); void sdma_unfreeze(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index d62ba5fdd80c..d94216c7d576 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -27,8 +27,8 @@ static struct hfi1_pportdata *hfi1_get_pportdata_kobj(struct kobject *kobj) * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { int ret; struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -57,7 +57,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -65,7 +65,7 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. 
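The sysfs hunks here and just below move the congestion-control binary attributes to the constified interface: the read callback takes a const bin_attribute, the attribute object itself is const, and the group references the array through .bin_attrs_new. A sketch mirroring that pattern with illustrative names:

#include <linux/fs.h>
#include <linux/sysfs.h>

#define EXAMPLE_BLOB_SIZE 4096

static ssize_t example_blob_read(struct file *filp, struct kobject *kobj,
				 const struct bin_attribute *bin_attr,
				 char *buf, loff_t pos, size_t count)
{
	/* copy up to 'count' bytes of attribute data at 'pos' into 'buf' */
	return 0;
}
static const BIN_ATTR_RO(example_blob, EXAMPLE_BLOB_SIZE);

static const struct bin_attribute *const example_bin_attrs[] = {
	&bin_attr_example_blob,
	NULL
};

static const struct attribute_group example_group = {
	.name = "example",
	.bin_attrs_new = example_bin_attrs,
};
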
*/ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -93,9 +93,9 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_cc_bin_attributes[] = { +static const struct bin_attribute *const port_cc_bin_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL @@ -134,7 +134,7 @@ static struct attribute *port_cc_attributes[] = { static const struct attribute_group port_cc_group = { .name = "CCMgtA", .attrs = port_cc_attributes, - .bin_attrs = port_cc_bin_attributes, + .bin_attrs_new = port_cc_bin_attributes, }; /* Start sc2vl */ diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index c465966a1d9c..78bf4a48c035 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3965,7 +3965,7 @@ static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { - rval = del_timer(&qpriv->s_tid_timer); + rval = timer_delete(&qpriv->s_tid_timer); qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; } return rval; @@ -3975,7 +3975,7 @@ void hfi1_del_tid_reap_timer(struct rvt_qp *qp) { struct hfi1_qp_priv *qpriv = qp->priv; - del_timer_sync(&qpriv->s_tid_timer); + timer_delete_sync(&qpriv->s_tid_timer); qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; } @@ -4781,7 +4781,7 @@ static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { - rval = del_timer(&priv->s_tid_retry_timer); + rval = timer_delete(&priv->s_tid_retry_timer); priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; } return rval; @@ -4791,7 +4791,7 @@ void hfi1_del_tid_retry_timer(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - del_timer_sync(&priv->s_tid_retry_timer); + timer_delete_sync(&priv->s_tid_retry_timer); priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; } diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 75599d5168db..58304b91380f 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(hfi1_trace_template, TP_STRUCT__entry(__string(function, function) __vstring(msg, vaf->fmt, vaf->va) ), - TP_fast_assign(__assign_str(function, function); + TP_fast_assign(__assign_str(function); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("(%s) %s", diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index e6904aa80c00..8d5e12fe88a5 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -90,7 +90,7 @@ TRACE_EVENT(hfi1_mmu_invalidate, TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __assign_str(type, type); + __assign_str(type); __entry->start = start; __entry->end = end; ), diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index d129b8195959..e358f5b885fa 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(/* msg */ ), TP_fast_assign(/* assign */ __entry->qpn = qp ? 
qp->ibqp.qp_num : 0; - __assign_str(msg, msg); + __assign_str(msg); __entry->more = more; ), TP_printk(/* print */ @@ -651,7 +651,7 @@ DECLARE_EVENT_CLASS(/* tid_node */ TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; - __assign_str(msg, msg); + __assign_str(msg); __entry->index = index; __entry->base = base; __entry->map = map; diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c79856d4fdfb..c0ba6b0a2c4e 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -740,8 +740,8 @@ TRACE_EVENT(hfi1_sdma_state, __string(newstate, nstate) ), TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); + __assign_str(curstate); + __assign_str(newstate); ), TP_printk("[%s] current state %s new state %s", __get_str(dev), diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index cf2d29098406..62b4f16dab27 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -53,7 +53,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, int ret = 0; fd->entry_to_rb = kcalloc(uctxt->expected_count, - sizeof(struct rb_node *), + sizeof(*fd->entry_to_rb), GFP_KERNEL); if (!fd->entry_to_rb) return -ENOMEM; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 33af2196ef31..49e0f79b950c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1900,7 +1900,7 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd) if (!list_empty(&dev->memwait)) dd_dev_err(dd, "memwait list not empty!\n"); - del_timer_sync(&dev->mem_timer); + timer_delete_sync(&dev->mem_timer); verbs_txreq_exit(dev); kfree(dev_cntr_descs); diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index ab3fbba70789..44cdb706fe27 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,21 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only -config INFINIBAND_HNS - tristate "HNS RoCE Driver" - depends on NET_VENDOR_HISILICON - depends on ARM64 || (COMPILE_TEST && 64BIT) - depends on (HNS_DSAF && HNS_ENET) || HNS3 - help - This is a RoCE/RDMA driver for the Hisilicon RoCE engine. - - To compile HIP08 driver as module, choose M here. - config INFINIBAND_HNS_HIP08 - bool "Hisilicon Hip08 Family RoCE support" - depends on INFINIBAND_HNS && PCI && HNS3 - depends on INFINIBAND_HNS=m || HNS3=y + tristate "Hisilicon Hip08 Family RoCE support" + depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on PCI && HNS3 help RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC. The RoCE engine is a PCI device. - To compile this driver, choose Y here: if INFINIBAND_HNS is m, this - module will be called hns-roce-hw-v2. + To compile this driver, choose M here. This module will be called + hns-roce-hw-v2. 
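The trace-header changes throughout this series drop the second argument of __assign_str(): the source string is now taken from the matching __string() declaration in TP_STRUCT__entry, so repeating it in TP_fast_assign() is no longer needed. A fragment of a hypothetical trace event showing the new form (the TRACE_SYSTEM/define_trace.h boilerplate of a real trace header is omitted):

TRACE_EVENT(example_msg,
	TP_PROTO(const char *msg, int value),
	TP_ARGS(msg, value),
	TP_STRUCT__entry(
		__string(msg, msg)
		__field(int, value)
	),
	TP_fast_assign(
		__assign_str(msg);	/* was: __assign_str(msg, msg) */
		__entry->value = value;
	),
	TP_printk("%s value %d", __get_str(msg), __entry->value)
);
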
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index be1e1cdbcfa8..baf592e6f21b 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,13 +4,11 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(src) -hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ +hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o + hns_roce_debugfs.o hns_roce_hw_v2.o -ifdef CONFIG_INFINIBAND_HNS_HIP08 -hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) -obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v2.o -endif +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index b4209b6aed8d..307c35888b30 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -33,7 +33,6 @@ #include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> -#include "hnae3.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" @@ -59,11 +58,15 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device); struct hns_roce_ib_create_ah_resp resp = {}; struct hns_roce_ah *ah = to_hr_ah(ibah); - int ret = 0; - u32 max_sl; - - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) - return -EOPNOTSUPP; + u8 tclass = get_tclass(grh); + u8 priority = 0; + u8 tc_mode = 0; + int ret; + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { + ret = -EOPNOTSUPP; + goto err_out; + } ah->av.port = rdma_ah_get_port_num(ah_attr); ah->av.gid_index = grh->sgid_index; @@ -74,15 +77,24 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ah->av.hop_limit = grh->hop_limit; ah->av.flowlabel = grh->flow_label; ah->av.udp_sport = get_ah_udp_sport(ah_attr); - ah->av.tclass = get_tclass(grh); - - ah->av.sl = rdma_ah_get_sl(ah_attr); - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(ah->av.sl > max_sl)) { - ibdev_err_ratelimited(&hr_dev->ib_dev, - "failed to set sl, sl (%u) shouldn't be larger than %u.\n", - ah->av.sl, max_sl); - return -EINVAL; + ah->av.tclass = tclass; + + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); + if (ret == -EOPNOTSUPP) + ret = 0; + + if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + goto err_out; + + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + ah->av.sl = priority; + else + ah->av.sl = rdma_ah_get_sl(ah_attr); + + if (!check_sl_valid(hr_dev, ah->av.sl)) { + ret = -EINVAL; + goto err_out; } memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); @@ -99,6 +111,8 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, } if (udata) { + resp.priority = ah->av.sl; + resp.tc_mode = tc_mode; memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 11a78ceae568..6ee911f6885b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -153,8 +153,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev 
*hr_dev, dma_addr_t *bufs, return total; } -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, struct ib_umem *umem, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift) { struct ib_block_iter biter; @@ -176,8 +175,10 @@ void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC) ida_destroy(&hr_dev->xrcd_ida.ida); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { ida_destroy(&hr_dev->srq_table.srq_ida.ida); + xa_destroy(&hr_dev->srq_table.xa); + } hns_roce_cleanup_qp_table(hr_dev); hns_roce_cleanup_cq_table(hr_dev); ida_destroy(&hr_dev->mr_table.mtpt_ida.ida); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 7250d0643b5c..3a5c93c9fb3e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -149,7 +149,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) return ret; } - ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to xa_store CQ, ret = %d.\n", ret); goto err_put; @@ -163,7 +163,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) return 0; err_xa: - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); err_put: hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); @@ -179,10 +179,10 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC, hr_cq->cqn); if (ret) - dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, - hr_cq->cqn); + dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", + ret, hr_cq->cqn); - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); /* Waiting interrupt process procedure carried out */ synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); @@ -353,9 +353,10 @@ static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); + struct ib_udata *udata = &attrs->driver_udata; struct hns_roce_ib_create_cq_resp resp = {}; struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct ib_device *ibdev = &hr_dev->ib_dev; @@ -476,13 +477,6 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) struct ib_event event; struct ib_cq *ibcq; - hr_cq = xa_load(&hr_dev->cq_table.array, - cqn & (hr_dev->caps.num_cqs - 1)); - if (!hr_cq) { - dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); - return; - } - if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID && event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR && event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) { @@ -491,7 +485,16 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) return; } - refcount_inc(&hr_cq->refcount); + xa_lock(&hr_dev->cq_table.array); + hr_cq = xa_load(&hr_dev->cq_table.array, + cqn & (hr_dev->caps.num_cqs - 1)); + if (hr_cq) + refcount_inc(&hr_cq->refcount); + xa_unlock(&hr_dev->cq_table.array); + if (!hr_cq) { + dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); + return; + } ibcq = 
&hr_cq->ib_cq; if (ibcq->event_handler) { @@ -534,4 +537,6 @@ void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) ida_destroy(&hr_dev->cq_table.bank[i].ida); + xa_destroy(&hr_dev->cq_table.array); + mutex_destroy(&hr_dev->cq_table.bank_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c index e8febb40f645..b869cdc54118 100644 --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c @@ -5,6 +5,7 @@ #include <linux/debugfs.h> #include <linux/device.h> +#include <linux/pci.h> #include "hns_roce_device.h" @@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev) { struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs; - dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev), + dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev), hns_roce_dbgfs_root); create_sw_stat_debugfs(hr_dev, dbgfs->root); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index c3cbd0a494bf..1dcc9cbb4678 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -83,6 +83,7 @@ #define MR_TYPE_DMA 0x03 #define HNS_ROCE_FRMR_MAX_PA 512 +#define HNS_ROCE_FRMR_ALIGN_SIZE 128 #define PKEY_ID 0xffff #define NODE_DESC_SIZE 64 @@ -91,6 +92,8 @@ /* Configure to HW for PAGE_SIZE larger than 4KB */ #define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) +#define ATOMIC_WR_LEN 8 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230 @@ -100,6 +103,9 @@ #define CQ_BANKID_SHIFT 2 #define CQ_BANKID_MASK GENMASK(1, 0) +#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF +#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF + enum { SERV_TYPE_RC, SERV_TYPE_UC, @@ -184,6 +190,9 @@ enum { #define HNS_HW_PAGE_SHIFT 12 #define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT) +#define HNS_HW_MAX_PAGE_SHIFT 27 +#define HNS_HW_MAX_PAGE_SIZE (1 << HNS_HW_MAX_PAGE_SHIFT) + struct hns_roce_uar { u64 pfn; unsigned long index; @@ -480,12 +489,6 @@ struct hns_roce_bank { u32 next; /* Next ID to allocate. 
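The hns_roce_cq_event() hunk above closes a lookup/teardown race by taking the xarray lock across xa_load() and refcount_inc(), so the CQ cannot be erased and freed between the two steps. The same pattern in isolation, using an illustrative obj type:

#include <linux/xarray.h>
#include <linux/refcount.h>

struct obj {
	refcount_t refcount;
};

/* hold the xarray lock across lookup + refcount_inc so a concurrent
 * erase-and-free cannot slip in between the two operations */
static struct obj *obj_get(struct xarray *xa, unsigned long id)
{
	struct obj *o;

	xa_lock(xa);
	o = xa_load(xa, id);
	if (o)
		refcount_inc(&o->refcount);
	xa_unlock(xa);

	return o;
}
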
*/ }; -struct hns_roce_idx_table { - u32 *spare_idx; - u32 head; - u32 tail; -}; - struct hns_roce_qp_table { struct hns_roce_hem_table qp_table; struct hns_roce_hem_table irrl_table; @@ -494,7 +497,7 @@ struct hns_roce_qp_table { struct mutex scc_mutex; struct hns_roce_bank bank[HNS_ROCE_QP_BANK_NUM]; struct mutex bank_mutex; - struct hns_roce_idx_table idx_table; + struct xarray dip_xa; }; struct hns_roce_cq_table { @@ -584,6 +587,7 @@ struct hns_roce_dev; enum { HNS_ROCE_FLUSH_FLAG = 0, + HNS_ROCE_STOP_FLUSH_FLAG = 1, }; struct hns_roce_work { @@ -645,6 +649,10 @@ struct hns_roce_qp { struct hns_user_mmap_entry *dwqe_mmap_entry; u32 config; enum hns_roce_cong_type cong_type; + u8 tc_mode; + u8 priority; + spinlock_t flush_lock; + struct hns_roce_dip *dip; }; struct hns_roce_ib_iboe { @@ -710,6 +718,7 @@ struct hns_roce_eq { int shift; int event_type; int sub_type; + struct work_struct work; }; struct hns_roce_eq_table { @@ -923,8 +932,7 @@ struct hns_roce_hw { int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int flags, void *mb_buf); - int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, - struct hns_roce_mr *mr); + int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, @@ -950,6 +958,8 @@ struct hns_roce_hw { int (*query_sccc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer); int (*query_hw_counter)(struct hns_roce_dev *hr_dev, u64 *stats, u32 port, int *hw_counters); + int (*get_dscp)(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority); const struct ib_device_ops *hns_roce_dev_ops; const struct ib_device_ops *hns_roce_dev_srq_ops; }; @@ -969,8 +979,6 @@ struct hns_roce_dev { enum hns_roce_device_state state; struct list_head qp_list; /* list of all qps on this dev */ spinlock_t qp_list_lock; /* protect qp_list */ - struct list_head dip_list; /* list of all dest ips on this dev */ - spinlock_t dip_list_lock; /* protect dip_list */ struct list_head pgdir_list; struct mutex pgdir_mutex; @@ -1019,6 +1027,26 @@ struct hns_roce_dev { atomic64_t *dfx_cnt; }; +enum hns_roce_trace_type { + TRACE_SQ, + TRACE_RQ, + TRACE_SRQ, +}; + +static inline const char *trace_type_to_str(enum hns_roce_trace_type type) +{ + switch (type) { + case TRACE_SQ: + return "SQ"; + case TRACE_RQ: + return "RQ"; + case TRACE_SRQ: + return "SRQ"; + default: + return "UNKNOWN"; + } +} + static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) { return container_of(ib_dev, struct hns_roce_dev, ib_dev); @@ -1228,7 +1256,7 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, struct hns_roce_buf *buf, unsigned int page_shift); -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift); @@ -1261,7 +1289,7 @@ __be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, @@ -1276,6 +1304,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 
cqn); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp); void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); @@ -1292,4 +1321,6 @@ struct hns_user_mmap_entry * hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, size_t length, enum hns_roce_mmap_type mmap_type); +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl); + #endif /* _HNS_ROCE_DEVICE_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index a4b3f19161dc..ca0798224e56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -281,7 +281,7 @@ static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev, return hem; fail: - hns_roce_free_hem(hr_dev, hem); + kfree(hem); return NULL; } @@ -300,7 +300,7 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; unsigned long mhop_obj = obj; u32 l0_idx, l1_idx, l2_idx; u32 chunk_ba_num; @@ -331,14 +331,14 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, index->buf = l0_idx; break; default: - ibdev_err(ibdev, "table %u not support mhop.hop_num = %u!\n", - table->type, mhop->hop_num); + dev_err(dev, "table %u not support mhop.hop_num = %u!\n", + table->type, mhop->hop_num); return -EINVAL; } if (unlikely(index->buf >= table->num_hem)) { - ibdev_err(ibdev, "table %u exceed hem limt idx %llu, max %lu!\n", - table->type, index->buf, table->num_hem); + dev_err(dev, "table %u exceed hem limt idx %llu, max %lu!\n", + table->type, index->buf, table->num_hem); return -EINVAL; } @@ -448,14 +448,14 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 step_idx; int ret = 0; if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0); if (ret) { - ibdev_err(ibdev, "set HEM step 0 failed!\n"); + dev_err(dev, "set HEM step 0 failed!\n"); goto out; } } @@ -463,7 +463,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1); if (ret) { - ibdev_err(ibdev, "set HEM step 1 failed!\n"); + dev_err(dev, "set HEM step 1 failed!\n"); goto out; } } @@ -475,7 +475,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, step_idx = mhop->hop_num; ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_err(ibdev, "set HEM step last failed!\n"); + dev_err(dev, "set HEM step last failed!\n"); } out: return ret; @@ -485,14 +485,14 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, unsigned long obj) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return ret; 
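The hns_roce_alloc_hem() fix above (the fail: path now kfree()s the hem instead of calling hns_roce_free_hem()) illustrates a general rule for error paths: release only what this function has set up so far, since the full destructor may touch state that was never initialised. A generic sketch with made-up names:

#include <linux/slab.h>

struct hem_example {
	void *buf;
};

static struct hem_example *hem_example_alloc(size_t size, gfp_t gfp)
{
	struct hem_example *hem;

	hem = kzalloc(sizeof(*hem), gfp);
	if (!hem)
		return NULL;

	hem->buf = kmalloc(size, gfp);
	if (!hem->buf) {
		/* only the outer object exists; free it directly rather
		 * than calling a destructor that expects a complete object */
		kfree(hem);
		return NULL;
	}

	return hem;
}
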
} @@ -504,7 +504,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, ret = alloc_mhop_hem(hr_dev, table, &mhop, &index); if (ret) { - ibdev_err(ibdev, "alloc mhop hem failed!\n"); + dev_err(dev, "alloc mhop hem failed!\n"); goto out; } @@ -512,7 +512,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, if (table->type < HEM_TYPE_MTT) { ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "set HEM address to HW failed!\n"); + dev_err(dev, "set HEM address to HW failed!\n"); goto err_alloc; } } @@ -575,7 +575,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; @@ -605,21 +605,21 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", - hop_num, ret); + dev_warn(dev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 1, ret = %d.\n", + ret); } if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 0, ret = %d.\n", + ret); } } } @@ -629,14 +629,14 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, unsigned long obj, int check_refcount) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return; } @@ -672,8 +672,8 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); if (ret) - dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", - ret); + dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; @@ -877,6 +877,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); + mutex_destroy(&table->mutex); return; } @@ -891,6 +892,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, hns_roce_free_hem(hr_dev, table->hem[i]); } + mutex_destroy(&table->mutex); kfree(table->hem); } @@ -929,6 +931,7 @@ struct hns_roce_hem_item { size_t count; /* max ba numbers */ int start; /* start buf offset in this hem */ int end; /* end buf offset in this hem */ + bool exist_bt; }; /* All HEM items are linked in a tree structure */ @@ -957,6 +960,7 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } } + hem->exist_bt = exist_bt; hem->count = count; hem->start = start; hem->end = end; @@ -967,34 +971,32 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } static void hem_list_free_item(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, bool exist_bt) + struct hns_roce_hem_item *hem) { - if 
(exist_bt) + if (hem->exist_bt) dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, hem->addr, hem->dma_addr); kfree(hem); } static void hem_list_free_all(struct hns_roce_dev *hr_dev, - struct list_head *head, bool exist_bt) + struct list_head *head) { struct hns_roce_hem_item *hem, *temp_hem; list_for_each_entry_safe(hem, temp_hem, head, list) { list_del(&hem->list); - hem_list_free_item(hr_dev, hem, exist_bt); + hem_list_free_item(hr_dev, hem); } } -static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr, - u64 table_addr) +static void hem_list_link_bt(void *base_addr, u64 table_addr) { *(u64 *)(base_addr) = table_addr; } /* assign L0 table address to hem from root bt */ -static void hem_list_assign_bt(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, void *cpu_addr, +static void hem_list_assign_bt(struct hns_roce_hem_item *hem, void *cpu_addr, u64 phy_addr) { hem->addr = cpu_addr; @@ -1041,9 +1043,9 @@ static bool hem_list_is_bottom_bt(int hopnum, int bt_level) * @bt_level: base address table level * @unit: ba entries per bt page */ -static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) +static u64 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) { - u32 step; + u64 step; int max; int i; @@ -1079,11 +1081,15 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, { struct hns_roce_buf_region *r; int total = 0; - int step; + u64 step; int i; for (i = 0; i < region_cnt; i++) { r = (struct hns_roce_buf_region *)®ions[i]; + /* when r->hopnum = 0, the region should not occupy root_ba. */ + if (!r->hopnum) + continue; + if (r->hopnum > 1) { step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step > 0) @@ -1110,7 +1116,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, int ret = 0; int max_ofs; int level; - u32 step; + u64 step; int end; if (hopnum <= 1) @@ -1134,10 +1140,12 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, /* config L1 bt to last bt and link them to corresponding parent */ for (level = 1; level < hopnum; level++) { - cur = hem_list_search_item(&mid_bt[level], offset); - if (cur) { - hem_ptrs[level] = cur; - continue; + if (!hem_list_is_bottom_bt(hopnum, level)) { + cur = hem_list_search_item(&mid_bt[level], offset); + if (cur) { + hem_ptrs[level] = cur; + continue; + } } step = hem_list_calc_ba_range(hopnum, level, unit); @@ -1147,7 +1155,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, } start_aligned = (distance / step) * step + r->offset; - end = min_t(int, start_aligned + step - 1, max_ofs); + end = min_t(u64, start_aligned + step - 1, max_ofs); cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit, true); if (!cur) { @@ -1163,8 +1171,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, if (level > 1) { pre = hem_ptrs[level - 1]; step = (cur->start - pre->start) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, pre->addr + step, - cur->dma_addr); + hem_list_link_bt(pre->addr + step, cur->dma_addr); } } @@ -1176,7 +1183,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, err_exit: for (level = 1; level < hopnum; level++) - hem_list_free_all(hr_dev, &temp_list[level], true); + hem_list_free_all(hr_dev, &temp_list[level]); return ret; } @@ -1217,16 +1224,26 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, { struct hns_roce_hem_item *hem; + /* This is on the has_mtt branch, if r->hopnum + * is 0, there is no root_ba to reuse for the + * region's fake hem, so a dma_alloc request 
is + * necessary here. + */ hem = hem_list_alloc_item(hr_dev, r->offset, r->offset + r->count - 1, - r->count, false); + r->count, !r->hopnum); if (!hem) return -ENOMEM; - hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base); + /* The root_ba can be reused only when r->hopnum > 0. */ + if (r->hopnum) + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); - return r->count; + /* If r->hopnum == 0, 0 is returned, + * so that the root_bt entry is not occupied. + */ + return r->hopnum ? r->count : 0; } static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, @@ -1236,7 +1253,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, struct hns_roce_hem_item *hem, *temp_hem; int total = 0; int offset; - int step; + u64 step; step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step < 1) @@ -1245,7 +1262,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, /* if exist mid bt, link L1 to L0 */ list_for_each_entry_safe(hem, temp_hem, branch_head, list) { offset = (hem->start - r->offset) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, cpu_base + offset, hem->dma_addr); + hem_list_link_bt(cpu_base + offset, hem->dma_addr); total++; } @@ -1270,7 +1287,7 @@ setup_root_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, return -ENOMEM; total = 0; - for (i = 0; i < region_cnt && total < max_ba_num; i++) { + for (i = 0; i < region_cnt && total <= max_ba_num; i++) { r = ®ions[i]; if (!r->count) continue; @@ -1336,14 +1353,19 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, region_cnt); if (ret) { for (i = 0; i < region_cnt; i++) - hem_list_free_all(hr_dev, &head.branch[i], false); + hem_list_free_all(hr_dev, &head.branch[i]); - hem_list_free_all(hr_dev, &head.root, true); + hem_list_free_all(hr_dev, &head.root); } return ret; } +/* This is the bottom bt pages number of a 100G MR on 4K OS, assuming + * the bt page size is not expanded by cal_best_bt_pg_sz() + */ +#define RESCHED_LOOP_CNT_THRESHOLD_ON_4K 12800 + /* construct the base address table and link them by address hop config */ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, @@ -1352,6 +1374,7 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, { const struct hns_roce_buf_region *r; int ofs, end; + int loop; int unit; int ret; int i; @@ -1369,7 +1392,10 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, continue; end = r->offset + r->count; - for (ofs = r->offset; ofs < end; ofs += unit) { + for (ofs = r->offset, loop = 1; ofs < end; ofs += unit, loop++) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs, hem_list->mid_bt[i], &hem_list->btm_bt); @@ -1401,10 +1427,9 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) - hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], - j != 0); + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j]); - hem_list_free_all(hr_dev, &hem_list->root_bt, true); + hem_list_free_all(hr_dev, &hem_list->root_bt); INIT_LIST_HEAD(&hem_list->btm_bt); hem_list->root_ba = 0; } @@ -1427,9 +1452,14 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, struct list_head *head = &hem_list->btm_bt; struct hns_roce_hem_item *hem, *temp_hem; void *cpu_base = NULL; + int loop = 1; int nr = 0; list_for_each_entry_safe(hem, temp_hem, head, 
sibling) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + loop++; + if (hem_list_page_is_in_range(hem, offset)) { nr = offset - hem->start; cpu_base = hem->addr + nr * BA_BYTE_LEN; diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 6fb51db9682b..9c415b2541af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -57,16 +57,16 @@ enum { }; #define check_whether_bt_num_3(type, hop_num) \ - (type < HEM_TYPE_MTT && hop_num == 2) + ((type) < HEM_TYPE_MTT && (hop_num) == 2) #define check_whether_bt_num_2(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == 2)) + (((type) < HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 2)) #define check_whether_bt_num_1(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0) || \ - (type >= HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0)) + (((type) < HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0)) struct hns_roce_hem { void *buf; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba7ae792d279..fa8747656f25 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -36,19 +36,22 @@ #include <linux/iopoll.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/workqueue.h> #include <net/addrconf.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#define CREATE_TRACE_POINTS +#include "hns_roce_trace.h" + enum { CMD_RST_PRC_OTHERS, CMD_RST_PRC_SUCCESS, @@ -372,19 +375,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - struct ib_device *ibdev = &hr_dev->ib_dev; - if (unlikely(hr_qp->state == IB_QPS_RESET || hr_qp->state == IB_QPS_INIT || - hr_qp->state == IB_QPS_RTR)) { - ibdev_err(ibdev, "failed to post WQE, QP state %u!\n", - hr_qp->state); + hr_qp->state == IB_QPS_RTR)) return -EINVAL; - } else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) { - ibdev_err(ibdev, "failed to post WQE, dev state %d!\n", - hr_dev->state); + else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) return -EIO; - } return 0; } @@ -443,10 +439,6 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, hr_reg_write(ud_sq_wqe, UD_SEND_WQE_HOPLIMIT, ah->av.hop_limit); hr_reg_write(ud_sq_wqe, UD_SEND_WQE_TCLASS, ah->av.tclass); hr_reg_write(ud_sq_wqe, UD_SEND_WQE_FLOW_LABEL, ah->av.flowlabel); - - if (WARN_ON(ah->av.sl > MAX_SERVICE_LEVEL)) - return -EINVAL; - hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SL, ah->av.sl); ud_sq_wqe->sgid_index = ah->av.gid_index; @@ -478,7 +470,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, valid_num_sge = calc_wr_sge_num(wr, &msg_len); ret = set_ud_opcode(ud_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; ud_sq_wqe->msg_len = cpu_to_le32(msg_len); @@ -582,10 +574,10 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, rc_sq_wqe->msg_len = cpu_to_le32(msg_len); 
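The parentheses added to the check_whether_bt_num_*() macros in the hns_roce_hem.h hunk above guard against C precedence regrouping the expansion when a caller passes an expression rather than a plain variable. A self-contained demonstration with invented macro names:

#include <stdio.h>

#define HOP_OK_BAD(type, hop_num)   (type < 10 && hop_num == 2)
#define HOP_OK_GOOD(type, hop_num)  ((type) < 10 && (hop_num) == 2)

int main(void)
{
        unsigned int flags = 0x6;       /* low two bits encode a hop count of 2 */

        /* "flags & 0x3 == 2" parses as "flags & (0x3 == 2)" -> flags & 0 -> 0 */
        printf("bad:  %d\n", HOP_OK_BAD(1, flags & 0x3));   /* prints 0 */
        printf("good: %d\n", HOP_OK_GOOD(1, flags & 0x3));  /* prints 1 */
        return 0;
}

Because == binds tighter than &, the unparenthesized expansion evaluates the wrong comparison entirely.
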
ret = set_rc_opcode(hr_dev, rc_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; - hr_reg_write(rc_sq_wqe, RC_SEND_WQE_FENCE, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SE, @@ -595,11 +587,16 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + if (msg_len != ATOMIC_WR_LEN) + return -EINVAL; set_atomic_seg(wr, rc_sq_wqe, valid_num_sge); - else if (wr->opcode != IB_WR_REG_MR) + } else if (wr->opcode != IB_WR_REG_MR) { ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge); + if (ret) + return ret; + } /* * The pipeline can sequentially post all valid WQEs into WQ buffer, @@ -675,6 +672,10 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, #define HNS_ROCE_SL_SHIFT 2 struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; + if (unlikely(qp->state == IB_QPS_ERR)) { + flush_cqe(hr_dev, qp); + return; + } /* All kinds of DirectWQE have the same header field layout */ hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); @@ -739,6 +740,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, else ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit); + trace_hns_sq_wqe(qp->qpn, wqe_idx, wqe, 1 << qp->sq.wqe_shift, + wr->wr_id, TRACE_SQ); if (unlikely(ret)) { *bad_wr = wr; goto out; @@ -808,6 +811,9 @@ static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr, wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx); fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge); + + trace_hns_rq_wqe(hr_qp->qpn, wqe_idx, wqe, 1 << hr_qp->rq.wqe_shift, + wr->wr_id, TRACE_RQ); } static int hns_roce_v2_post_recv(struct ib_qp *ibqp, @@ -944,7 +950,7 @@ static void fill_wqe_idx(struct hns_roce_srq *srq, unsigned int wqe_idx) static void update_srq_db(struct hns_roce_srq *srq) { struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device); - struct hns_roce_v2_db db; + struct hns_roce_v2_db db = {}; hr_reg_write(&db, DB_TAG, srq->srqn); hr_reg_write(&db, DB_CMD, HNS_ROCE_V2_SRQ_DB); @@ -985,6 +991,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, fill_recv_sge_to_wqe(wr, wqe, max_sge, srq->rsv_sge); fill_wqe_idx(srq, wqe_idx); srq->wrid[wqe_idx] = wr->wr_id; + + trace_hns_srq_wqe(srq->srqn, wqe_idx, wqe, 1 << srq->wqe_shift, + wr->wr_id, TRACE_SRQ); } if (likely(nreq)) { @@ -1273,22 +1282,47 @@ static int hns_roce_cmd_err_convert_errno(u16 desc_ret) return -EIO; } -static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, - struct hns_roce_cmq_desc *desc, int num) +static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout) +{ + static const struct hns_roce_cmdq_tx_timeout_map cmdq_tx_timeout[] = { + {HNS_ROCE_OPC_POST_MB, HNS_ROCE_OPC_POST_MB_TIMEOUT}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(cmdq_tx_timeout); i++) + if (cmdq_tx_timeout[i].opcode == opcode) + return cmdq_tx_timeout[i].tx_timeout; + + return tx_timeout; +} + +static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u32 tx_timeout) +{ + u32 timeout = 0; + + do { + if (hns_roce_cmq_csq_done(hr_dev)) + break; + udelay(1); + } while (++timeout < tx_timeout); +} + +static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, + int num, u32 tx_timeout) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct 
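hns_roce_cmdq_tx_timeout() above resolves a per-opcode CSQ timeout from a small static table and falls back to the ring-wide default for everything else. The same lookup shape in isolation; the opcode and timeout values here are placeholders, not the driver's:

#include <stddef.h>

struct opcode_timeout {
        unsigned short opcode;
        unsigned int timeout_us;
};

/* only opcodes that need more than the ring default get an entry */
static const struct opcode_timeout slow_opcodes[] = {
        { 0x0801 /* placeholder opcode */, 35000 },
};

static unsigned int tx_timeout_for(unsigned short opcode, unsigned int dflt)
{
        size_t i;

        for (i = 0; i < sizeof(slow_opcodes) / sizeof(slow_opcodes[0]); i++)
                if (slow_opcodes[i].opcode == opcode)
                        return slow_opcodes[i].timeout_us;
        return dflt;    /* everything else keeps the default */
}
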
hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; - u32 timeout = 0; u16 desc_ret; u32 tail; int ret; int i; - spin_lock_bh(&csq->lock); - tail = csq->head; for (i = 0; i < num; i++) { + trace_hns_cmdq_req(hr_dev, &desc[i]); + csq->desc[csq->head++] = desc[i]; if (csq->head == csq->desc_num) csq->head = 0; @@ -1299,27 +1333,19 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]); - do { - if (hns_roce_cmq_csq_done(hr_dev)) - break; - udelay(1); - } while (++timeout < priv->cmq.tx_timeout); - + hns_roce_wait_csq_done(hr_dev, tx_timeout); if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { + trace_hns_cmdq_resp(hr_dev, &csq->desc[tail]); + /* check the result of hardware write back */ - desc[i] = csq->desc[tail++]; + desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) tail = 0; - - desc_ret = le16_to_cpu(desc[i].retval); if (likely(desc_ret == CMD_EXEC_SUCCESS)) continue; - dev_err_ratelimited(hr_dev->dev, - "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", - desc->opcode, desc_ret); ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { @@ -1334,14 +1360,54 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, ret = -EAGAIN; } - spin_unlock_bh(&csq->lock); - if (ret) atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_ERR_CNT]); return ret; } +static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + u16 opcode = le16_to_cpu(desc->opcode); + u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); + u8 try_cnt = HNS_ROCE_OPC_POST_MB_TRY_CNT; + u32 rsv_tail; + int ret; + int i; + + while (try_cnt) { + try_cnt--; + + spin_lock_bh(&csq->lock); + rsv_tail = csq->head; + ret = __hns_roce_cmq_send_one(hr_dev, desc, num, tx_timeout); + if (opcode == HNS_ROCE_OPC_POST_MB && ret == -ETIME && + try_cnt) { + spin_unlock_bh(&csq->lock); + mdelay(HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC); + continue; + } + + for (i = 0; i < num; i++) { + desc[i] = csq->desc[rsv_tail++]; + if (rsv_tail == csq->desc_num) + rsv_tail = 0; + } + spin_unlock_bh(&csq->lock); + break; + } + + if (ret) + dev_err_ratelimited(hr_dev->dev, + "Cmdq IO error, opcode = 0x%x, return = %d.\n", + opcode, ret); + + return ret; +} + static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { @@ -1658,8 +1724,8 @@ static int hns_roce_hw_v2_query_counter(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_HW_CNT_TOTAL && i < *num_counters; i++) { bd_idx = i / CNT_PER_DESC; - if (!(desc[bd_idx].flag & HNS_ROCE_CMD_FLAG_NEXT) && - bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC) + if (bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC && + !(desc[bd_idx].flag & cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT))) break; cnt_data = (__le64 *)&desc[bd_idx].data[0]; @@ -2105,7 +2171,7 @@ static void apply_func_caps(struct hns_roce_dev *hr_dev) caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz)); - caps->gmv_entry_num = caps->gmv_bt_num * (PAGE_SIZE / + caps->gmv_entry_num = caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz); } else { u32 func_num = max_t(u32, 1, hr_dev->func_num); @@ -2461,14 +2527,16 @@ static int set_llm_cfg_to_hw(struct hns_roce_dev *hr_dev, static struct hns_roce_link_table * alloc_link_table_buf(struct hns_roce_dev *hr_dev) { + u16 total_sl = hr_dev->caps.sl_num * hr_dev->func_num; struct hns_roce_v2_priv *priv = 
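The reworked __hns_roce_cmq_send() above retries only a timed-out mailbox post (-ETIME on HNS_ROCE_OPC_POST_MB), dropping the CSQ lock and delaying a few milliseconds between attempts, then copies the completed descriptors back out under the lock. A reduced sketch of the bounded-retry shape, with issue_once() standing in for the lock/post/poll/unlock step:

#include <linux/delay.h>
#include <linux/errno.h>

#define MAX_TRIES       8
#define RETRY_GAP_MS    5

static int send_with_retry(int (*issue_once)(void *ctx), void *ctx)
{
        int tries = MAX_TRIES;
        int ret = -ETIME;

        while (tries--) {
                ret = issue_once(ctx);          /* lock, post, poll, unlock */
                if (ret != -ETIME || !tries)
                        break;
                mdelay(RETRY_GAP_MS);           /* give the firmware time before retrying */
        }
        return ret;
}
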
hr_dev->priv; struct hns_roce_link_table *link_tbl; u32 pg_shift, size, min_size; link_tbl = &priv->ext_llm; pg_shift = hr_dev->caps.llm_buf_pg_sz + PAGE_SHIFT; - size = hr_dev->caps.num_qps * HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; - min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(hr_dev->caps.sl_num) << pg_shift; + size = hr_dev->caps.num_qps * hr_dev->func_num * + HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; + min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(total_sl) << pg_shift; /* Alloc data table */ size = max(size, min_size); @@ -2535,20 +2603,19 @@ static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev) free_link_table_buf(hr_dev, &priv->ext_llm); } -static void free_dip_list(struct hns_roce_dev *hr_dev) +static void free_dip_entry(struct hns_roce_dev *hr_dev) { struct hns_roce_dip *hr_dip; - struct hns_roce_dip *tmp; - unsigned long flags; + unsigned long idx; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + xa_lock(&hr_dev->qp_table.dip_xa); - list_for_each_entry_safe(hr_dip, tmp, &hr_dev->dip_list, node) { - list_del(&hr_dip->node); + xa_for_each(&hr_dev->qp_table.dip_xa, idx, hr_dip) { + __xa_erase(&hr_dev->qp_table.dip_xa, hr_dip->dip_idx); kfree(hr_dip); } - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(&hr_dev->qp_table.dip_xa); } static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) @@ -2671,6 +2738,8 @@ static void free_mr_exit(struct hns_roce_dev *hr_dev) kfree(free_mr->rsv_pd); free_mr->rsv_pd = NULL; } + + mutex_destroy(&free_mr->mutex); } static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) @@ -2748,8 +2817,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, IB_QPS_INIT, NULL); if (ret) { - ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n", + ret); return ret; } @@ -2821,8 +2890,10 @@ static int free_mr_init(struct hns_roce_dev *hr_dev) mutex_init(&free_mr->mutex); ret = free_mr_alloc_res(hr_dev); - if (ret) + if (ret) { + mutex_destroy(&free_mr->mutex); return ret; + } ret = free_mr_modify_qp(hr_dev); if (ret) @@ -2943,13 +3014,16 @@ err_llm_init_failed: static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev); if (!hr_dev->is_vf) hns_roce_free_link_table(hr_dev); if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09) - free_dip_list(hr_dev); + free_dip_entry(hr_dev); } static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, @@ -3208,13 +3282,14 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, /* Aligned to the hardware address access unit */ for (i = 0; i < ARRAY_SIZE(pages); i++) - pages[i] >>= 6; + pages[i] >>= MPT_PBL_BUF_ADDR_S; pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> MPT_PBL_BA_ADDR_S); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); hr_reg_write(mpt_entry, MPT_PA0_H, upper_32_bits(pages[0])); @@ -3307,8 +3382,7 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return ret; } -static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, - void *mb_buf, struct hns_roce_mr *mr) +static int hns_roce_v2_frmr_write_mtpt(void 
*mb_buf, struct hns_roce_mr *mr) { dma_addr_t pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); struct hns_roce_v2_mpt_entry *mpt_entry; @@ -3335,8 +3409,10 @@ static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3)); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> + MPT_PBL_BA_ADDR_S)); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); return 0; } @@ -3387,8 +3463,8 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp) ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr); if (ret) { - ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n", + ret); return ret; } @@ -3427,9 +3503,9 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { - ibdev_err(ibdev, - "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(ibdev, + "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", + hr_qp->qpn, ret); break; } @@ -3440,16 +3516,16 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) while (cqe_cnt) { npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { - ibdev_err(ibdev, - "failed to poll cqe for free mr, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr, remain %d cqe.\n", + cqe_cnt); goto out; } if (time_after(jiffies, end)) { - ibdev_err(ibdev, - "failed to poll cqe for free mr and timeout, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr and timeout, remain %d cqe.\n", + cqe_cnt); goto out; } cqe_cnt -= npolled; @@ -3582,14 +3658,14 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift)); hr_reg_write(cq_context, CQC_CQE_BUF_PG_SZ, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift)); - hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> 3); - hr_reg_write(cq_context, CQC_CQE_BA_H, (dma_handle >> (32 + 3))); + hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> CQC_CQE_BA_L_S); + hr_reg_write(cq_context, CQC_CQE_BA_H, dma_handle >> CQC_CQE_BA_H_S); hr_reg_write_bool(cq_context, CQC_DB_RECORD_EN, hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_L, ((u32)hr_cq->db.dma) >> 1); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_H, - hr_cq->db.dma >> 32); + hr_cq->db.dma >> CQC_CQE_DB_RECORD_ADDR_H_S); hr_reg_write(cq_context, CQC_CQ_MAX_CNT, HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); hr_reg_write(cq_context, CQC_CQ_PERIOD, @@ -3711,8 +3787,9 @@ static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, wc->status == IB_WC_WR_FLUSH_ERR)) return; - ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_NONE, 16, 4, cqe, + ibdev_err_ratelimited(&hr_dev->ib_dev, "error cqe status 0x%x:\n", + cqe_status); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 4, cqe, cq->cqe_size, false); wc->vendor_err = hr_reg_read(cqe, CQE_SUB_STATUS); @@ -4217,8 +4294,7 @@ static void set_access_flags(struct hns_roce_qp *hr_qp, } static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context 
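Replacing the bare ">> 3" / ">> (32 + 3)" shifts with named constants such as MPT_PBL_BA_ADDR_S documents why the shift exists: the hardware fields hold a base address in 8-byte units, split across a low word and a high field. A standalone sketch of that split (field widths are illustrative):

#include <stdint.h>

#define PBL_BA_ADDR_S   3       /* the field holds the address in 8-byte units */

/* split a (suitably aligned) DMA address across a 32-bit low word and a high field */
static void set_pbl_ba(uint32_t *lo, uint32_t *hi, uint64_t dma_addr)
{
        uint64_t ba = dma_addr >> PBL_BA_ADDR_S;

        *lo = (uint32_t)ba;             /* lower 32 bits of the shifted address */
        *hi = (uint32_t)(ba >> 32);     /* remaining upper bits */
}
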
*qpc_mask) + struct hns_roce_v2_qp_context *context) { hr_reg_write(context, QPC_SGE_SHIFT, to_hr_hem_entries_shift(hr_qp->sge.sge_cnt, @@ -4240,9 +4316,7 @@ static inline int get_pdn(struct ib_pd *ib_pd) } static void modify_qp_reset_to_init(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *context) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); @@ -4259,7 +4333,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQWS, ilog2(hr_qp->rq.max_gs)); - set_qpc_wqe_cnt(hr_qp, context, qpc_mask); + set_qpc_wqe_cnt(hr_qp, context); /* No VLAN need to set 0xFFF */ hr_reg_write(context, QPC_VLAN_ID, 0xfff); @@ -4300,7 +4374,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, } static void modify_qp_init_to_init(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4394,12 +4467,14 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, upper_32_bits(to_hr_hw_page_addr(mtts[0]))); hr_reg_clear(qpc_mask, QPC_RQ_CUR_BLK_ADDR_H); - context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); - qpc_mask->rq_nxt_blk_addr = 0; - - hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, - upper_32_bits(to_hr_hw_page_addr(mtts[1]))); - hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + context->rq_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts[1])); + qpc_mask->rq_nxt_blk_addr = 0; + hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); + hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + } return 0; } @@ -4521,16 +4596,16 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return -EINVAL; } - hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> 4); + hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> QPC_TRRL_BA_L_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_L); - context->trrl_ba = cpu_to_le32(trrl_ba >> (16 + 4)); + context->trrl_ba = cpu_to_le32(trrl_ba >> QPC_TRRL_BA_M_S); qpc_mask->trrl_ba = 0; - hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> (32 + 16 + 4)); + hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> QPC_TRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_H); - context->irrl_ba = cpu_to_le32(irrl_ba >> 6); + context->irrl_ba = cpu_to_le32(irrl_ba >> QPC_IRRL_BA_L_S); qpc_mask->irrl_ba = 0; - hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> (32 + 6)); + hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> QPC_IRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_IRRL_BA_H); hr_reg_enable(context, QPC_RMT_E2E); @@ -4592,8 +4667,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_TRRL_HEAD_MAX); hr_reg_clear(qpc_mask, QPC_TRRL_TAIL_MAX); +#define MAX_LP_SGEN 3 /* rocee send 2^lp_sgen_ini segs every time */ - hr_reg_write(context, QPC_LP_SGEN_INI, 3); + hr_reg_write(context, QPC_LP_SGEN_INI, MAX_LP_SGEN); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI); if (udata && ibqp->qp_type == IB_QPT_RC && @@ -4619,8 +4695,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return 0; } -static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, int attr_mask, +static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, int attr_mask, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4667,26 +4742,49 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, 
return 0; } +static int alloc_dip_entry(struct xarray *dip_xa, u32 qpn) +{ + struct hns_roce_dip *hr_dip; + int ret; + + hr_dip = xa_load(dip_xa, qpn); + if (hr_dip) + return 0; + + hr_dip = kzalloc(sizeof(*hr_dip), GFP_KERNEL); + if (!hr_dip) + return -ENOMEM; + + ret = xa_err(xa_store(dip_xa, qpn, hr_dip, GFP_KERNEL)); + if (ret) + kfree(hr_dip); + + return ret; +} + static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, u32 *dip_idx) { const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - u32 *spare_idx = hr_dev->qp_table.idx_table.spare_idx; - u32 *head = &hr_dev->qp_table.idx_table.head; - u32 *tail = &hr_dev->qp_table.idx_table.tail; + struct xarray *dip_xa = &hr_dev->qp_table.dip_xa; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_dip *hr_dip; - unsigned long flags; + unsigned long idx; int ret = 0; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + ret = alloc_dip_entry(dip_xa, ibqp->qp_num); + if (ret) + return ret; - spare_idx[*tail] = ibqp->qp_num; - *tail = (*tail == hr_dev->caps.num_qps - 1) ? 0 : (*tail + 1); + xa_lock(dip_xa); - list_for_each_entry(hr_dip, &hr_dev->dip_list, node) { - if (!memcmp(grh->dgid.raw, hr_dip->dgid, 16)) { + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt && + !memcmp(grh->dgid.raw, hr_dip->dgid, GID_LEN_V2)) { *dip_idx = hr_dip->dip_idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; goto out; } } @@ -4694,19 +4792,24 @@ static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, /* If no dgid is found, a new dip and a mapping between dgid and * dip_idx will be created. */ - hr_dip = kzalloc(sizeof(*hr_dip), GFP_ATOMIC); - if (!hr_dip) { - ret = -ENOMEM; - goto out; + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt) + continue; + + *dip_idx = idx; + memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); + hr_dip->dip_idx = idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; + break; } - memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); - hr_dip->dip_idx = *dip_idx = spare_idx[*head]; - *head = (*head == hr_dev->caps.num_qps - 1) ? 0 : (*head + 1); - list_add_tail(&hr_dip->node, &hr_dev->dip_list); + /* This should never happen. */ + if (WARN_ON_ONCE(!hr_qp->dip)) + ret = -ENOSPC; out: - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(dip_xa); return ret; } @@ -4828,6 +4931,69 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr, return 0; } +static int hns_roce_hw_v2_get_dscp(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!ops->get_dscp_prio) + return -EOPNOTSUPP; + + return ops->get_dscp_prio(handle, dscp, tc_mode, priority); +} + +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl) +{ + u32 max_sl; + + max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); + if (unlikely(sl > max_sl)) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to set SL(%u). 
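get_dip_ctx_idx() above now pre-reserves an xarray slot for the QP outside the lock, then under xa_lock() either joins a live entry with a matching DGID or recycles one whose qp_cnt has dropped to zero. A sketch of just the reservation half; the struct is simplified and callers are assumed to be serialized per QPN, as QP modify is:

#include <linux/xarray.h>
#include <linux/slab.h>
#include <linux/types.h>

struct dip_entry {
        u8 dgid[16];
        u32 dip_idx;
        u32 qp_cnt;
};

/* reserve one slot per QPN up front; a later pass reuses any entry with qp_cnt == 0 */
static int ensure_dip_entry(struct xarray *xa, u32 qpn)
{
        struct dip_entry *e;
        int ret;

        if (xa_load(xa, qpn))           /* already reserved for this QPN */
                return 0;

        e = kzalloc(sizeof(*e), GFP_KERNEL);
        if (!e)
                return -ENOMEM;

        ret = xa_err(xa_store(xa, qpn, e, GFP_KERNEL));
        if (ret)
                kfree(e);
        return ret;
}
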
Shouldn't be larger than %u.\n", + sl, max_sl); + return false; + } + + return true; +} + +static int hns_roce_set_sl(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct ib_device *ibdev = &hr_dev->ib_dev; + int ret; + + ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), + &hr_qp->tc_mode, &hr_qp->priority); + if (ret && ret != -EOPNOTSUPP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ibdev_err_ratelimited(ibdev, + "failed to get dscp, ret = %d.\n", ret); + return ret; + } + + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + hr_qp->sl = hr_qp->priority; + else + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + + if (!check_sl_valid(hr_dev, hr_qp->sl)) + return -EINVAL; + + hr_reg_write(context, QPC_SL, hr_qp->sl); + hr_reg_clear(qpc_mask, QPC_SL); + + return 0; +} + static int hns_roce_v2_set_path(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -4843,25 +5009,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, int is_roce_protocol; u16 vlan_id = 0xffff; bool is_udp = false; - u32 max_sl; u8 ib_port; u8 hr_port; int ret; - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(sl > max_sl)) { - ibdev_err_ratelimited(ibdev, - "failed to fill QPC, sl (%u) shouldn't be larger than %u.\n", - sl, max_sl); - return -EINVAL; - } - /* * If free_mr_en of qp is set, it means that this qp comes from * free mr. This qp will perform the loopback operation. * In the loopback scenario, only sl needs to be set. 
*/ if (hr_qp->free_mr_en) { + if (!check_sl_valid(hr_dev, sl)) + return -EINVAL; hr_reg_write(context, QPC_SL, sl); hr_reg_clear(qpc_mask, QPC_SL); hr_qp->sl = sl; @@ -4931,11 +5090,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); - hr_qp->sl = sl; - hr_reg_write(context, QPC_SL, hr_qp->sl); - hr_reg_clear(qpc_mask, QPC_SL); - - return 0; + return hns_roce_set_sl(ibqp, attr, context, qpc_mask); } static bool check_qp_state(enum ib_qp_state cur_state, @@ -4975,22 +5130,19 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; - if (!check_qp_state(cur_state, new_state)) { - ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n"); + if (!check_qp_state(cur_state, new_state)) return -EINVAL; - } if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, hr_dev->caps.qpc_sz); - modify_qp_reset_to_init(ibqp, attr, context, qpc_mask); + modify_qp_reset_to_init(ibqp, context); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { - modify_qp_init_to_init(ibqp, attr, context, qpc_mask); + modify_qp_init_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, qpc_mask, udata); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { - ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, - qpc_mask); + ret = modify_qp_rtr_to_rts(ibqp, attr_mask, context, qpc_mask); } return ret; @@ -5174,6 +5326,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->sq.lock, sq_flag); + trace_hns_sq_flush_cqe(hr_qp->qpn, hr_qp->sq.head, TRACE_SQ); hr_reg_write(context, QPC_SQ_PRODUCER_IDX, hr_qp->sq.head); hr_reg_clear(qpc_mask, QPC_SQ_PRODUCER_IDX); hr_qp->state = IB_QPS_ERR; @@ -5183,6 +5336,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, return; spin_lock_irqsave(&hr_qp->rq.lock, rq_flag); + trace_hns_rq_flush_cqe(hr_qp->qpn, hr_qp->rq.head, TRACE_RQ); hr_reg_write(context, QPC_RQ_PRODUCER_IDX, hr_qp->rq.head); hr_reg_clear(qpc_mask, QPC_RQ_PRODUCER_IDX); spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag); @@ -5240,7 +5394,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, /* SW pass context to HW */ ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp); if (ret) { - ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret); goto out; } @@ -5378,7 +5532,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context); if (ret) { - ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, + "failed to query QPC, ret = %d.\n", + ret); ret = -EINVAL; goto out; } @@ -5386,7 +5542,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, state = hr_reg_read(&context, QPC_QP_ST); tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); if (tmp_qp_state == -1) { - ibdev_err(ibdev, "Illegal ib_qp_state\n"); + ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n"); ret = -EINVAL; goto out; } @@ -5479,9 +5635,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET, udata); if (ret) - ibdev_err(ibdev, - "failed to modify 
QP to RST, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, + "failed to modify QP to RST, ret = %d.\n", + ret); } send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL; @@ -5509,17 +5665,44 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; } +static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dip *hr_dip = hr_qp->dip; + + if (!hr_dip) + return; + + xa_lock(&hr_dev->qp_table.dip_xa); + + hr_dip->qp_cnt--; + if (!hr_dip->qp_cnt) + memset(hr_dip->dgid, 0, GID_LEN_V2); + + xa_unlock(&hr_dev->qp_table.dip_xa); +} + int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + unsigned long flags; int ret; + /* Make sure flush_cqe() is completed */ + spin_lock_irqsave(&hr_qp->flush_lock, flags); + set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + flush_work(&hr_qp->flush_work.work); + + if (hr_qp->cong_type == CONG_TYPE_DIP) + put_dip_ctx_idx(hr_dev, hr_qp); + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", + hr_qp->qpn, ret); hns_roce_qp_destroy(hr_dev, hr_qp, udata); @@ -5802,7 +5985,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) dev_info(hr_dev->dev, "cq_period(%u) reached the upper limit, adjusted to 65.\n", cq_period); - cq_period = HNS_ROCE_MAX_CQ_PERIOD; + cq_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; } cq_period *= HNS_ROCE_CLOCK_ADJUST; } @@ -5813,9 +5996,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when modifying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when modifying CQ, ret = %d.\n", + ret); err_out: if (ret) @@ -5839,9 +6022,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn, ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_CQC, cqn); if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when querying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when querying CQ, ret = %d.\n", + ret); goto err_mailbox; } @@ -5882,11 +6065,10 @@ err_mailbox: return ret; } -static void hns_roce_irq_work_handle(struct work_struct *work) +static void dump_aeqe_log(struct hns_roce_work *irq_work) { - struct hns_roce_work *irq_work = - container_of(work, struct hns_roce_work, work); - struct ib_device *ibdev = &irq_work->hr_dev->ib_dev; + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + struct ib_device *ibdev = &hr_dev->ib_dev; switch (irq_work->event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: @@ -5930,6 +6112,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work) case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: ibdev_warn(ibdev, "DB overflow.\n"); break; + case HNS_ROCE_EVENT_TYPE_MB: + break; case HNS_ROCE_EVENT_TYPE_FLR: ibdev_warn(ibdev, "function level reset.\n"); break; @@ -5940,8 +6124,46 @@ static void hns_roce_irq_work_handle(struct work_struct *work) ibdev_err(ibdev, "invalid xrceth error.\n"); break; default: + ibdev_info(ibdev, 
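hns_roce_v2_destroy_qp() above sets HNS_ROCE_STOP_FLUSH_FLAG under flush_lock and then flush_work()s the flush handler, so no flush can be queued or still running once the QP is torn down; init_flush_work() in hns_roce_qp.c, later in this diff, checks the same flag under the same lock before queuing. A reduced sketch of that handshake with simplified types:

#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/types.h>

struct flush_ctx {
        spinlock_t lock;
        bool stopped;                   /* set once teardown has begun */
        struct work_struct work;        /* INIT_WORK()'d at create time */
};

/* producer: only queue the flush if teardown has not started */
static void maybe_queue_flush(struct flush_ctx *c, struct workqueue_struct *wq)
{
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);
        if (!c->stopped)
                queue_work(wq, &c->work);
        spin_unlock_irqrestore(&c->lock, flags);
}

/* teardown: forbid new submissions, then wait for anything already queued */
static void stop_flush(struct flush_ctx *c)
{
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);
        c->stopped = true;
        spin_unlock_irqrestore(&c->lock, flags);

        flush_work(&c->work);   /* harmless if the work was never queued */
}
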
"Undefined event %d.\n", + irq_work->event_type); break; } +} + +static void hns_roce_irq_work_handle(struct work_struct *work) +{ + struct hns_roce_work *irq_work = + container_of(work, struct hns_roce_work, work); + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + int event_type = irq_work->event_type; + u32 queue_num = irq_work->queue_num; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: + case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: + hns_roce_qp_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + hns_roce_srq_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_cq_event(hr_dev, queue_num, event_type); + break; + default: + break; + } + + dump_aeqe_log(irq_work); kfree(irq_work); } @@ -6002,14 +6224,14 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { - struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); irqreturn_t aeqe_found = IRQ_NONE; + int num_aeqes = 0; int event_type; u32 queue_num; int sub_type; - while (aeqe) { + while (aeqe && num_aeqes < HNS_AEQ_POLLING_BUDGET) { /* Make sure we read AEQ entry after we have checked the * ownership bit */ @@ -6020,25 +6242,12 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM); switch (event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - case HNS_ROCE_EVENT_TYPE_COMM_EST: - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: - hns_roce_qp_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - hns_roce_srq_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_cq_event(hr_dev, queue_num, event_type); + hns_roce_flush_cqe(hr_dev, queue_num); break; case HNS_ROCE_EVENT_TYPE_MB: hns_roce_cmd_event(hr_dev, @@ -6046,12 +6255,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, aeqe->event.cmd.status, le64_to_cpu(aeqe->event.cmd.out_param)); break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - case HNS_ROCE_EVENT_TYPE_FLR: - break; default: - dev_err(dev, "unhandled event %d on EQ %d at idx %u.\n", - event_type, eq->eqn, eq->cons_index); break; } @@ -6059,12 +6263,14 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->sub_type = sub_type; ++eq->cons_index; aeqe_found = IRQ_HANDLED; + trace_hns_ae_info(event_type, aeqe, eq->eqe_size); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_AEQE_CNT]); hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); aeqe = next_aeqe_sw_v2(eq); + ++num_aeqes; } update_eq_db(eq); @@ -6084,33 
+6290,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_eq *eq) { - struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - irqreturn_t ceqe_found = IRQ_NONE; - u32 cqn; + queue_work(system_bh_wq, &eq->work); - while (ceqe) { - /* Make sure we read CEQ entry after we have checked the - * ownership bit - */ - dma_rmb(); - - cqn = hr_reg_read(ceqe, CEQE_CQN); - - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqe_found = IRQ_HANDLED; - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); - - ceqe = next_ceqe_sw_v2(eq); - } - - update_eq_db(eq); - - return IRQ_RETVAL(ceqe_found); + return IRQ_HANDLED; } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) @@ -6121,7 +6305,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ - int_work = hns_roce_v2_ceq_int(hr_dev, eq); + int_work = hns_roce_v2_ceq_int(eq); else /* Asynchronous event interrupt */ int_work = hns_roce_v2_aeq_int(hr_dev, eq); @@ -6135,6 +6319,7 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, struct pci_dev *pdev = hr_dev->pci_dev; struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); const struct hnae3_ae_ops *ops = ae_dev->ops; + enum hnae3_reset_type reset_type; irqreturn_t int_work = IRQ_NONE; u32 int_en; @@ -6146,10 +6331,12 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); + reset_type = hr_dev->is_vf ? 
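hns_roce_v2_ceq_int() above no longer walks the CEQ in hard-IRQ context; it only queues work on system_bh_wq and lets the new hns_roce_ceq_work(), further down, dispatch completions in BH context. A minimal sketch of that split, assuming the work item was INIT_WORK()'d at EQ setup as the request-irq hunk below does:

#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include <linux/container_of.h>

struct ceq_demo {
        struct work_struct work;        /* initialized once at EQ setup */
        /* ring state ... */
};

/* hard-IRQ half: acknowledge and defer; CQE dispatch happens in BH context */
static irqreturn_t ceq_demo_irq(int irq, void *dev_id)
{
        struct ceq_demo *eq = dev_id;

        queue_work(system_bh_wq, &eq->work);
        return IRQ_HANDLED;
}

static void ceq_demo_work(struct work_struct *work)
{
        struct ceq_demo *eq = container_of(work, struct ceq_demo, work);

        /* walk the event queue, call the CQ completion handlers, update the doorbell */
        (void)eq;
}
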
+ HNAE3_VF_FUNC_RESET : HNAE3_FUNC_RESET; + /* Set reset level for reset_event() */ if (ops->set_default_reset_request) - ops->set_default_reset_request(ae_dev, - HNAE3_FUNC_RESET); + ops->set_default_reset_request(ae_dev, reset_type); if (ops->reset_event) ops->reset_event(pdev, NULL); @@ -6219,7 +6406,7 @@ static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) res_type == ECC_RESOURCE_SCCC) return le64_to_cpu(*data); - return le64_to_cpu(*data) << PAGE_SHIFT; + return le64_to_cpu(*data) << HNS_HW_PAGE_SHIFT; } static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, @@ -6333,9 +6520,16 @@ static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, enable_flag); } -static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) +static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) +{ + hns_roce_mtr_destroy(hr_dev, &eq->mtr); +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; + int eqn = eq->eqn; int ret; u8 cmd; @@ -6346,12 +6540,9 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) ret = hns_roce_destroy_hw_ctx(hr_dev, cmd, eqn & HNS_ROCE_V2_EQN_M); if (ret) - dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); -} + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); -static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - hns_roce_mtr_destroy(hr_dev, &eq->mtr); + free_eq_buf(hr_dev, eq); } static void init_eq_config(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) @@ -6489,6 +6680,34 @@ free_cmd_mbox: return ret; } +static void hns_roce_ceq_work(struct work_struct *work) +{ + struct hns_roce_eq *eq = from_work(eq, work, work); + struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); + struct hns_roce_dev *hr_dev = eq->hr_dev; + int ceqe_num = 0; + u32 cqn; + + while (ceqe && ceqe_num < hr_dev->caps.ceqe_depth) { + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = hr_reg_read(ceqe, CEQE_CQN); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ++ceqe_num; + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); + + ceqe = next_ceqe_sw_v2(eq); + } + + update_eq_db(eq); +} + static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, int comp_num, int aeq_num, int other_num) { @@ -6520,21 +6739,24 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, j - other_num - aeq_num); for (j = 0; j < irq_num; j++) { - if (j < other_num) + if (j < other_num) { ret = request_irq(hr_dev->irq[j], hns_roce_v2_msix_interrupt_abn, 0, hr_dev->irq_names[j], hr_dev); - - else if (j < (other_num + comp_num)) + } else if (j < (other_num + comp_num)) { + INIT_WORK(&eq_table->eq[j - other_num].work, + hns_roce_ceq_work); ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j + aeq_num], &eq_table->eq[j - other_num]); - else + } else { ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j - comp_num], &eq_table->eq[j - other_num]); + } + if (ret) { dev_err(hr_dev->dev, "request irq error!\n"); goto err_request_failed; @@ -6544,12 +6766,16 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, return 0; err_request_failed: - for (j -= 1; j >= 0; j--) - if (j < other_num) + for (j -= 1; j >= 0; j--) { + if (j < other_num) { 
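hns_roce_ceq_work() above bounds each pass by the CEQ depth, and the AEQ path earlier caps itself at HNS_AEQ_POLLING_BUDGET, so a single pass can never spin past a full queue's worth of entries before publishing the consumer index. The generic shape of that budgeted drain, with the queue accessors passed in as placeholders:

#include <stdbool.h>

#define EQ_BUDGET 64    /* must stay below the queue depth */

/*
 * Drain at most EQ_BUDGET pending events per pass; the consumer index is
 * published afterwards and any remainder is picked up on the next pass.
 */
static int drain_budgeted(bool (*pending)(void *q), void (*handle_one)(void *q),
                          void *q)
{
        int polled = 0;

        while (polled < EQ_BUDGET && pending(q)) {
                handle_one(q);
                polled++;
        }
        return polled;  /* non-zero means real work was done */
}
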
free_irq(hr_dev->irq[j], hr_dev); - else - free_irq(eq_table->eq[j - other_num].irq, - &eq_table->eq[j - other_num]); + continue; + } + free_irq(eq_table->eq[j - other_num].irq, + &eq_table->eq[j - other_num]); + if (j < other_num + comp_num) + cancel_work_sync(&eq_table->eq[j - other_num].work); + } err_kzalloc_failed: for (i -= 1; i >= 0; i--) @@ -6570,8 +6796,11 @@ static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev) for (i = 0; i < hr_dev->caps.num_other_vectors; i++) free_irq(hr_dev->irq[i], hr_dev); - for (i = 0; i < eq_num; i++) + for (i = 0; i < eq_num; i++) { free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]); + if (i < hr_dev->caps.num_comp_vectors) + cancel_work_sync(&hr_dev->eq_table.eq[i].work); + } for (i = 0; i < irq_num; i++) kfree(hr_dev->irq_names[i]); @@ -6591,6 +6820,9 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) int ret; int i; + if (hr_dev->caps.aeqe_depth < HNS_AEQ_POLLING_BUDGET) + return -EINVAL; + other_num = hr_dev->caps.num_other_vectors; comp_num = hr_dev->caps.num_comp_vectors; aeq_num = hr_dev->caps.num_aeq_vectors; @@ -6660,7 +6892,7 @@ err_request_irq_fail: err_create_eq_fail: for (i -= 1; i >= 0; i--) - free_eq_buf(hr_dev, &eq_table->eq[i]); + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); return ret; @@ -6680,11 +6912,8 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) __hns_roce_free_irq(hr_dev); destroy_workqueue(hr_dev->irq_workq); - for (i = 0; i < eq_num; i++) { - hns_roce_v2_destroy_eqc(hr_dev, i); - - free_eq_buf(hr_dev, &eq_table->eq[i]); - } + for (i = 0; i < eq_num; i++) + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); } @@ -6735,6 +6964,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .query_srqc = hns_roce_v2_query_srqc, .query_sccc = hns_roce_v2_query_sccc, .query_hw_counter = hns_roce_hw_v2_query_counter, + .get_dscp = hns_roce_hw_v2_get_dscp, .hns_roce_dev_ops = &hns_roce_v2_dev_ops, .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops, }; @@ -6851,9 +7081,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); @@ -6914,6 +7141,7 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } + static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; @@ -6932,6 +7160,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) hr_dev->active = false; hr_dev->dis_db = true; + + rdma_user_mmap_disassociate(&hr_dev->ib_dev); + hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN; return 0; @@ -7002,9 +7233,22 @@ static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, return ret; } +static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, + bool linkup) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct net_device *netdev = handle->rinfo.netdev; + + if (linkup || !hr_dev) + return; + + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .link_status_change = hns_roce_hw_v2_link_status_change, .reset_notify = 
hns_roce_hw_v2_reset_notify, }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index df04bc8ede57..bc7466830eaf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -34,6 +34,7 @@ #define _HNS_ROCE_HW_V2_H #include <linux/bitops.h> +#include "hnae3.h" #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32 #define HNS_ROCE_V2_MTT_ENTRY_SZ 64 @@ -85,6 +86,11 @@ #define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) +/* budget must be smaller than aeqe_depth to guarantee that we update + * the ci before we polled all the entries in the EQ. + */ +#define HNS_AEQ_POLLING_BUDGET 64 + enum { HNS_ROCE_CMD_FLAG_IN = BIT(0), HNS_ROCE_CMD_FLAG_OUT = BIT(1), @@ -224,6 +230,14 @@ enum hns_roce_opcode_type { HNS_SWITCH_PARAMETER_CFG = 0x1033, }; +#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 +#define HNS_ROCE_OPC_POST_MB_TRY_CNT 8 +#define HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC 5 +struct hns_roce_cmdq_tx_timeout_map { + u16 opcode; + u32 tx_timeout; +}; + enum { TYPE_CRQ, TYPE_CSQ, @@ -276,6 +290,10 @@ struct hns_roce_v2_cq_context { __le32 byte_64_se_cqe_idx; }; +#define CQC_CQE_BA_L_S 3 +#define CQC_CQE_BA_H_S (32 + CQC_CQE_BA_L_S) +#define CQC_CQE_DB_RECORD_ADDR_H_S 32 + #define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 @@ -447,6 +465,12 @@ struct hns_roce_v2_qp_context { struct hns_roce_v2_qp_context_ex ext; }; +#define QPC_TRRL_BA_L_S 4 +#define QPC_TRRL_BA_M_S (16 + QPC_TRRL_BA_L_S) +#define QPC_TRRL_BA_H_S (32 + QPC_TRRL_BA_M_S) +#define QPC_IRRL_BA_L_S 6 +#define QPC_IRRL_BA_H_S (32 + QPC_IRRL_BA_L_S) + #define QPC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_qp_context, h, l) #define QPC_TST QPC_FIELD_LOC(2, 0) @@ -716,6 +740,9 @@ struct hns_roce_v2_mpt_entry { __le32 byte_64_buf_pa1; }; +#define MPT_PBL_BUF_ADDR_S 6 +#define MPT_PBL_BA_ADDR_S 3 + #define MPT_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_mpt_entry, h, l) #define MPT_ST MPT_FIELD_LOC(1, 0) @@ -900,6 +927,7 @@ struct hns_roce_v2_rc_send_wqe { #define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7) #define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8) #define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9) +#define RC_SEND_WQE_SO RC_SEND_WQE_FIELD_LOC(10, 10) #define RC_SEND_WQE_SE RC_SEND_WQE_FIELD_LOC(11, 11) #define RC_SEND_WQE_INLINE RC_SEND_WQE_FIELD_LOC(12, 12) #define RC_SEND_WQE_WQE_INDEX RC_SEND_WQE_FIELD_LOC(30, 15) @@ -1323,7 +1351,7 @@ struct hns_roce_v2_priv { struct hns_roce_dip { u8 dgid[GID_LEN_V2]; u32 dip_idx; - struct list_head node; /* all dips are on a list */ + u32 qp_cnt; }; struct fmea_ram_ecc { @@ -1334,7 +1362,7 @@ struct fmea_ram_ecc { /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 -#define HNS_ROCE_MAX_CQ_PERIOD 65 +#define HNS_ROCE_MAX_CQ_PERIOD_HIP08 65 #define HNS_ROCE_MAX_EQ_PERIOD 65 #define HNS_ROCE_RNR_TIMER_10NS 1 #define HNS_ROCE_1US_CFG 999 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 1dc60c2b2b7a..e7a497cc125c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -40,6 +40,7 @@ #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -181,7 +182,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_DEVICE_RC_RNR_NAK_GEN; props->max_send_sge = hr_dev->caps.max_sq_sg; props->max_recv_sge = 
hr_dev->caps.max_rq_sg; - props->max_sge_rd = 1; + props->max_sge_rd = hr_dev->caps.max_sq_sg; props->max_cq = hr_dev->caps.num_cqs; props->max_cqe = hr_dev->caps.max_cqes; props->max_mr = hr_dev->caps.num_mtpts; @@ -192,6 +193,12 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = 1; props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; + props->max_ah = INT_MAX; + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD; + props->cq_caps.max_cq_moderation_count = HNS_ROCE_MAX_CQ_COUNT; + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { props->max_srq = hr_dev->caps.num_srqs; props->max_srq_wr = hr_dev->caps.max_srq_wrs; @@ -421,6 +428,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, return 0; error_fail_copy_to_udata: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); hns_roce_dealloc_uar_entry(context); error_fail_uar_entry: @@ -437,6 +447,10 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); + hns_roce_dealloc_uar_entry(context); ida_free(&hr_dev->uar_ida.ida, (int)context->uar.logic_idx); @@ -451,6 +465,11 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) pgprot_t prot; int ret; + if (hr_dev->dis_db) { + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); + return -EPERM; + } + rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff); if (!rdma_entry) { atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); @@ -743,7 +762,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) if (ret) return ret; } - dma_set_max_seg_size(dev, UINT_MAX); + dma_set_max_seg_size(dev, SZ_2G); ret = ib_register_device(ib_dev, "hns_%d", dev); if (ret) { dev_err(dev, "ib_register_device failed!\n"); @@ -925,6 +944,15 @@ err_unmap_dmpt: return ret; } +static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) +{ + hns_roce_cleanup_bitmap(hr_dev); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); +} + /** * hns_roce_setup_hca - setup host channel adapter * @hr_dev: pointer to hns roce device @@ -973,6 +1001,10 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) err_uar_table_free: ida_destroy(&hr_dev->uar_ida.ida); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); + return ret; } @@ -1102,8 +1134,6 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) INIT_LIST_HEAD(&hr_dev->qp_list); spin_lock_init(&hr_dev->qp_list_lock); - INIT_LIST_HEAD(&hr_dev->dip_list); - spin_lock_init(&hr_dev->dip_list_lock); ret = hns_roce_register_device(hr_dev); if (ret) @@ -1118,7 +1148,7 @@ error_failed_register_device: hr_dev->hw->hw_exit(hr_dev); error_failed_engine_init: - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); error_failed_setup_hca: hns_roce_cleanup_hem(hr_dev); @@ -1148,7 +1178,7 @@ void 
hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); hns_roce_cleanup_hem(hr_dev); if (hr_dev->cmd_mod) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 9e05b57a2d67..93a48b41955b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -38,6 +38,7 @@ #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" +#include "hns_roce_trace.h" static u32 hw_index_to_key(int ind) { @@ -138,8 +139,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) - ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n", - ret); + ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n", + ret); } free_mr_pbl(hr_dev, mr); @@ -159,10 +160,11 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); + trace_hns_mr(mr); if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); else - ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr); + ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); if (ret) { dev_err(dev, "failed to write mtpt, ret = %d.\n", ret); goto err_page; @@ -435,24 +437,30 @@ static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) } int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, - unsigned int *sg_offset) + unsigned int *sg_offset_p) { + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); struct hns_roce_mtr *mtr = &mr->pbl_mtr; - int ret = 0; + int ret, sg_num = 0; + + if (!IS_ALIGNED(sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || + ibmr->page_size < HNS_HW_PAGE_SIZE || + ibmr->page_size > HNS_HW_MAX_PAGE_SIZE) + return sg_num; mr->npages = 0; mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count, sizeof(dma_addr_t), GFP_KERNEL); if (!mr->page_list) - return ret; + return sg_num; - ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); - if (ret < 1) { + sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset_p, hns_roce_set_page); + if (sg_num < 1) { ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n", - mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, ret); + mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, sg_num); goto err_page_list; } @@ -463,17 +471,16 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, ret = hns_roce_mtr_map(hr_dev, mtr, mr->page_list, mr->npages); if (ret) { ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret); - ret = 0; + sg_num = 0; } else { mr->pbl_mtr.hem_cfg.buf_pg_shift = (u32)ilog2(ibmr->page_size); - ret = mr->npages; } err_page_list: kvfree(mr->page_list); mr->page_list = NULL; - return ret; + return sg_num; } static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, @@ -756,7 +763,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) return -ENOMEM; if (mtr->umem) - npage = hns_roce_get_umem_bufs(hr_dev, pages, page_count, + npage = hns_roce_get_umem_bufs(pages, page_count, mtr->umem, page_shift); else npage = hns_roce_get_kmem_bufs(hr_dev, pages, page_count, @@ -809,11 +816,6 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, for (i = 0, mapped_cnt = 0; i 
< mtr->hem_cfg.region_count && mapped_cnt < page_cnt; i++) { r = &mtr->hem_cfg.region[i]; - /* if hopnum is 0, no need to map pages in this region */ - if (!r->hopnum) { - mapped_cnt += r->count; - continue; - } if (r->offset + r->count > page_cnt) { ret = -EINVAL; @@ -998,7 +1000,7 @@ static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev, if (attr->region_count > ARRAY_SIZE(attr->region) || attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) { ibdev_err(ibdev, - "invalid buf attr, region count %d, page shift %u.\n", + "invalid buf attr, region count %u, page shift %u.\n", attr->region_count, attr->page_shift); return false; } @@ -1146,6 +1148,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, struct ib_device *ibdev = &hr_dev->ib_dev; int ret; + trace_hns_buf_attr(buf_attr); /* The caller has its own buffer list and invokes the hns_roce_mtr_map() * to finish the MTT configuration. */ diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f35a66325d9a..9f376a2232b0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -39,6 +39,25 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" +static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_qp *qp; + unsigned long flags; + + xa_lock_irqsave(&hr_dev->qp_table_xa, flags); + qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags); + + if (!qp) + dev_warn(dev, "async event for bogus QP %08x\n", qpn); + + return qp; +} + static void flush_work_handle(struct work_struct *work) { struct hns_roce_work *flush_work = container_of(work, @@ -71,11 +90,18 @@ static void flush_work_handle(struct work_struct *work) void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct hns_roce_work *flush_work = &hr_qp->flush_work; + unsigned long flags; + + spin_lock_irqsave(&hr_qp->flush_lock, flags); + /* Exit directly after destroy_qp() */ + if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) { + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + return; + } - flush_work->hr_dev = hr_dev; - INIT_WORK(&flush_work->work, flush_work_handle); refcount_inc(&hr_qp->refcount); queue_work(hr_dev->irq_workq, &flush_work->work); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); } void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) @@ -95,31 +121,28 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) { - struct device *dev = hr_dev->dev; struct hns_roce_qp *qp; - xa_lock(&hr_dev->qp_table_xa); - qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (qp) - refcount_inc(&qp->refcount); - xa_unlock(&hr_dev->qp_table_xa); - - if (!qp) { - dev_warn(dev, "async event for bogus QP %08x\n", qpn); + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) return; - } - if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION || - event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH) { - qp->state = IB_QPS_ERR; + qp->event(qp, (enum hns_roce_event)event_type); - flush_cqe(hr_dev, qp); - } + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} - qp->event(qp, (enum 
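The new hns_roce_qp_lookup() helper above takes the QP reference while still holding the table lock, so an async event cannot race with QP teardown between lookup and use. A reduced sketch of that lookup/put pairing with simplified types:

#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/completion.h>
#include <linux/types.h>

struct tracked_qp {
        refcount_t refcount;
        struct completion free;
};

/* take the reference under the table lock so the QP cannot be freed
 * between lookup and use
 */
static struct tracked_qp *qp_get(struct xarray *xa, u32 qpn)
{
        struct tracked_qp *qp;
        unsigned long flags;

        xa_lock_irqsave(xa, flags);
        qp = xa_load(xa, qpn);
        if (qp)
                refcount_inc(&qp->refcount);
        xa_unlock_irqrestore(xa, flags);

        return qp;
}

static void qp_put(struct tracked_qp *qp)
{
        if (refcount_dec_and_test(&qp->refcount))
                complete(&qp->free);    /* the destroy path waits on this */
}
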
hns_roce_event)event_type); +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn) +{ + struct hns_roce_qp *qp; + + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) + return; + + qp->state = IB_QPS_ERR; + flush_cqe(hr_dev, qp); if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); @@ -410,7 +433,8 @@ static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) bankid = get_qp_bankid(hr_qp->qpn); - ida_free(&hr_dev->qp_table.bank[bankid].ida, hr_qp->qpn >> 3); + ida_free(&hr_dev->qp_table.bank[bankid].ida, + hr_qp->qpn / HNS_ROCE_QP_BANK_NUM); mutex_lock(&hr_dev->qp_table.bank_mutex); hr_dev->qp_table.bank[bankid].inuse--; @@ -531,13 +555,15 @@ static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, { unsigned int inline_sge; - inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + if (!max_inline_data) + return 0; /* * if max_inline_data less than * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, * In addition to ud's mode, no need to extend sge. */ + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) inline_sge = 0; @@ -842,12 +868,14 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp *ucmd, struct hns_roce_ib_create_qp_resp *resp) { + bool has_sdb = user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd); struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, ibucontext); + bool has_rdb = user_qp_has_rdb(hr_dev, init_attr, udata, resp); struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) { + if (has_sdb) { ret = hns_roce_db_map_user(uctx, ucmd->sdb_addr, &hr_qp->sdb); if (ret) { ibdev_err(ibdev, @@ -858,7 +886,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; } - if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) { + if (has_rdb) { ret = hns_roce_db_map_user(uctx, ucmd->db_addr, &hr_qp->rdb); if (ret) { ibdev_err(ibdev, @@ -872,7 +900,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, return 0; err_sdb: - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) + if (has_sdb) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_out: return ret; @@ -1093,35 +1121,34 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, ibucontext); hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); - if (ret) + if (ret) { ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); + return ret; + } ret = set_congest_param(hr_dev, hr_qp, ucmd); - if (ret) - return ret; } else { if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) hr_qp->config = HNS_ROCE_EXSGE_FLAGS; + default_congest_type(hr_dev, hr_qp); ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, "failed to set kernel SQ size, ret = %d.\n", ret); - - default_congest_type(hr_dev, hr_qp); } return ret; } static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, - struct ib_pd *ib_pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct hns_roce_qp *hr_qp) { + struct hns_roce_work *flush_work = &hr_qp->flush_work; struct hns_roce_ib_create_qp_resp resp = {}; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_ib_create_qp ucmd = {}; @@ -1130,9 +1157,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, mutex_init(&hr_qp->mutex); spin_lock_init(&hr_qp->sq.lock); 
spin_lock_init(&hr_qp->rq.lock); + spin_lock_init(&hr_qp->flush_lock); hr_qp->state = IB_QPS_RESET; hr_qp->flush_flag = 0; + flush_work->hr_dev = hr_dev; + INIT_WORK(&flush_work->work, flush_work_handle); if (init_attr->create_flags) return -EOPNOTSUPP; @@ -1140,7 +1170,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd); if (ret) { ibdev_err(ibdev, "failed to set QP param, ret = %d.\n", ret); - return ret; + goto err_out; } if (!udata) { @@ -1148,7 +1178,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (ret) { ibdev_err(ibdev, "failed to alloc wrid, ret = %d.\n", ret); - return ret; + goto err_out; } } @@ -1190,7 +1220,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, min(udata->outlen, sizeof(resp))); if (ret) { ibdev_err(ibdev, "copy qp resp failed!\n"); - goto err_store; + goto err_flow_ctrl; } } @@ -1219,6 +1249,8 @@ err_qpn: free_qp_buf(hr_dev, hr_qp); err_buf: free_kernel_wrid(hr_qp); +err_out: + mutex_destroy(&hr_qp->mutex); return ret; } @@ -1234,6 +1266,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, free_qp_buf(hr_dev, hr_qp); free_kernel_wrid(hr_qp); free_qp_db(hr_dev, hr_qp, udata); + mutex_destroy(&hr_qp->mutex); } static int check_qp_type(struct hns_roce_dev *hr_dev, enum ib_qp_type type, @@ -1271,7 +1304,6 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, struct ib_device *ibdev = qp->device; struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); struct hns_roce_qp *hr_qp = to_hr_qp(qp); - struct ib_pd *pd = qp->pd; int ret; ret = check_qp_type(hr_dev, init_attr->qp_type, !!udata); @@ -1286,9 +1318,9 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; } - ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp); + ret = hns_roce_create_qp_common(hr_dev, init_attr, udata, hr_qp); if (ret) - ibdev_err(ibdev, "create QP type 0x%x failed(%d)\n", + ibdev_err(ibdev, "create QP type %d failed(%d)\n", init_attr->qp_type, ret); err_out: @@ -1386,6 +1418,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ib_modify_qp_resp resp = {}; struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); enum ib_qp_state cur_state, new_state; int ret = -EINVAL; @@ -1427,6 +1460,18 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, new_state, udata); + if (ret) + goto out; + + if (udata && udata->outlen) { + resp.tc_mode = hr_qp->tc_mode; + resp.priority = hr_qp->sl; + ret = ib_copy_to_udata(udata, &resp, + min(udata->outlen, sizeof(resp))); + if (ret) + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to copy modify qp resp.\n"); + } out: mutex_unlock(&hr_qp->mutex); @@ -1443,19 +1488,19 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) __acquire(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq != NULL && recv_cq == NULL)) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq == NULL && recv_cq != NULL)) { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); 
__acquire(&recv_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -1475,13 +1520,13 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, spin_unlock(&recv_cq->lock); } else if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } @@ -1529,14 +1574,10 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) unsigned int reserved_from_bot; unsigned int i; - qp_table->idx_table.spare_idx = kcalloc(hr_dev->caps.num_qps, - sizeof(u32), GFP_KERNEL); - if (!qp_table->idx_table.spare_idx) - return -ENOMEM; - mutex_init(&qp_table->scc_mutex); mutex_init(&qp_table->bank_mutex); xa_init(&hr_dev->qp_table_xa); + xa_init(&qp_table->dip_xa); reserved_from_bot = hr_dev->caps.reserved_qps; @@ -1561,5 +1602,8 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); - kfree(hr_dev->qp_table.idx_table.spare_idx); + xa_destroy(&hr_dev->qp_table.dip_xa); + xa_destroy(&hr_dev->qp_table_xa); + mutex_destroy(&hr_dev->qp_table.bank_mutex); + mutex_destroy(&hr_dev->qp_table.scc_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 356d98816949..f637b73b946e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -4,7 +4,6 @@ #include <rdma/rdma_cm.h> #include <rdma/restrack.h> #include <uapi/rdma/rdma_netlink.h> -#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 4abae9477854..1090051f493b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -51,7 +51,7 @@ static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, break; default: dev_err(hr_dev->dev, - "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n", + "hns_roce:Unexpected event type %d on SRQ %06lx\n", event_type, srq->srqn); return; } @@ -123,7 +123,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return ret; } - ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret); goto err_put; @@ -136,7 +136,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return 0; err_xa: - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); err_put: hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn); @@ -151,10 +151,10 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ, srq->srqn); if (ret) - dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", - ret, srq->srqn); + dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", + ret, 
srq->srqn); - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); @@ -250,7 +250,7 @@ static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev, hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr); } -static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +static int alloc_srq_wrid(struct hns_roce_srq *srq) { srq->wrid = kvmalloc_array(srq->wqe_cnt, sizeof(u64), GFP_KERNEL); if (!srq->wrid) @@ -297,7 +297,7 @@ static int set_srq_basic_param(struct hns_roce_srq *srq, max_sge = proc_srq_sge(hr_dev, srq, !!udata); if (attr->max_wr > hr_dev->caps.max_srq_wrs || - attr->max_sge > max_sge) { + attr->max_sge > max_sge || !attr->max_sge) { ibdev_err(&hr_dev->ib_dev, "invalid SRQ attr, depth = %u, sge = %u.\n", attr->max_wr, attr->max_sge); @@ -366,7 +366,7 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, goto err_idx; if (!udata) { - ret = alloc_srq_wrid(hr_dev, srq); + ret = alloc_srq_wrid(srq); if (ret) goto err_wqe_buf; } @@ -518,6 +518,7 @@ err_srq_db: err_srq_buf: free_srq_buf(hr_dev, srq); err_out: + mutex_destroy(&srq->mutex); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT]); return ret; @@ -532,6 +533,7 @@ int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) free_srqn(hr_dev, srq); free_srq_db(hr_dev, srq, udata); free_srq_buf(hr_dev, srq); + mutex_destroy(&srq->mutex); return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h new file mode 100644 index 000000000000..59ceb591b3a1 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_trace.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 Hisilicon Limited. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns_roce + +#if !defined(__HNS_ROCE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HNS_ROCE_TRACE_H + +#include <linux/tracepoint.h> +#include <linux/string_choices.h> +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" + +DECLARE_EVENT_CLASS(flush_head_template, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, pi) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->pi = pi; + __entry->type = type; + ), + + TP_printk("%s 0x%lx flush head 0x%x.", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->pi) +); + +DEFINE_EVENT(flush_head_template, hns_sq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); +DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe, + TP_PROTO(unsigned long qpn, u32 pi, + enum hns_roce_trace_type type), + TP_ARGS(qpn, pi, type)); + +#define MAX_SGE_PER_WQE 64 +#define MAX_WQE_SIZE (MAX_SGE_PER_WQE * HNS_ROCE_SGE_SIZE) +DECLARE_EVENT_CLASS(wqe_template, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, + u64 id, enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type), + + TP_STRUCT__entry(__field(unsigned long, qpn) + __field(u32, idx) + __array(u32, wqe, + MAX_WQE_SIZE / sizeof(__le32)) + __field(u32, len) + __field(u64, id) + __field(enum hns_roce_trace_type, type) + ), + + TP_fast_assign(__entry->qpn = qpn; + __entry->idx = idx; + __entry->id = id; + __entry->len = len / sizeof(__le32); + __entry->type = type; + for (int i = 0; i < __entry->len; i++) + __entry->wqe[i] = le32_to_cpu(((__le32 *)wqe)[i]); + ), + + TP_printk("%s 0x%lx wqe(0x%x/0x%llx): %s", + trace_type_to_str(__entry->type), + __entry->qpn, __entry->idx, __entry->id, + __print_array(__entry->wqe, __entry->len, + sizeof(__le32))) +); + +DEFINE_EVENT(wqe_template, hns_sq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_rq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); +DEFINE_EVENT(wqe_template, hns_srq_wqe, + TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id, + enum hns_roce_trace_type type), + TP_ARGS(qpn, idx, wqe, len, id, type)); + +TRACE_EVENT(hns_ae_info, + TP_PROTO(int event_type, void *aeqe, unsigned int len), + TP_ARGS(event_type, aeqe, len), + + TP_STRUCT__entry(__field(int, event_type) + __array(u32, aeqe, + HNS_ROCE_V3_EQE_SIZE / sizeof(__le32)) + __field(u32, len) + ), + + TP_fast_assign(__entry->event_type = event_type; + __entry->len = len / sizeof(__le32); + for (int i = 0; i < __entry->len; i++) + __entry->aeqe[i] = le32_to_cpu(((__le32 *)aeqe)[i]); + ), + + TP_printk("event %2d aeqe: %s", __entry->event_type, + __print_array(__entry->aeqe, __entry->len, sizeof(__le32))) +); + +TRACE_EVENT(hns_mr, + TP_PROTO(struct hns_roce_mr *mr), + TP_ARGS(mr), + + TP_STRUCT__entry(__field(u64, iova) + __field(u64, size) + __field(u32, key) + __field(u32, pd) + __field(u32, pbl_hop_num) + __field(u32, npages) + __field(int, type) + __field(int, enabled) + ), + + TP_fast_assign(__entry->iova = mr->iova; + __entry->size = mr->size; + __entry->key = mr->key; + __entry->pd = mr->pd; + __entry->pbl_hop_num = mr->pbl_hop_num; + __entry->npages = mr->npages; + 
__entry->type = mr->type; + __entry->enabled = mr->enabled; + ), + + TP_printk("iova:0x%llx, size:%llu, key:%u, pd:%u, pbl_hop:%u, npages:%u, type:%d, status:%d", + __entry->iova, __entry->size, __entry->key, + __entry->pd, __entry->pbl_hop_num, __entry->npages, + __entry->type, __entry->enabled) +); + +TRACE_EVENT(hns_buf_attr, + TP_PROTO(struct hns_roce_buf_attr *attr), + TP_ARGS(attr), + + TP_STRUCT__entry(__field(unsigned int, region_count) + __field(unsigned int, region0_size) + __field(int, region0_hopnum) + __field(unsigned int, region1_size) + __field(int, region1_hopnum) + __field(unsigned int, region2_size) + __field(int, region2_hopnum) + __field(unsigned int, page_shift) + __field(bool, mtt_only) + ), + + TP_fast_assign(__entry->region_count = attr->region_count; + __entry->region0_size = attr->region[0].size; + __entry->region0_hopnum = attr->region[0].hopnum; + __entry->region1_size = attr->region[1].size; + __entry->region1_hopnum = attr->region[1].hopnum; + __entry->region2_size = attr->region[2].size; + __entry->region2_hopnum = attr->region[2].hopnum; + __entry->page_shift = attr->page_shift; + __entry->mtt_only = attr->mtt_only; + ), + + TP_printk("rg cnt:%u, pg_sft:0x%x, mtt_only:%s, rg 0 (sz:%u, hop:%u), rg 1 (sz:%u, hop:%u), rg 2 (sz:%u, hop:%u)\n", + __entry->region_count, __entry->page_shift, + str_yes_no(__entry->mtt_only), + __entry->region0_size, __entry->region0_hopnum, + __entry->region1_size, __entry->region1_hopnum, + __entry->region2_size, __entry->region2_hopnum) +); + +DECLARE_EVENT_CLASS(cmdq, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc), + + TP_STRUCT__entry(__string(dev_name, dev_name(hr_dev->dev)) + __field(u16, opcode) + __field(u16, flag) + __field(u16, retval) + __array(u32, data, 6) + ), + + TP_fast_assign(__assign_str(dev_name); + __entry->opcode = le16_to_cpu(desc->opcode); + __entry->flag = le16_to_cpu(desc->flag); + __entry->retval = le16_to_cpu(desc->retval); + for (int i = 0; i < 6; i++) + __entry->data[i] = le32_to_cpu(desc->data[i]); + ), + + TP_printk("%s cmdq opcode:0x%x, flag:0x%x, retval:0x%x, data:%s\n", + __get_str(dev_name), __entry->opcode, + __entry->flag, __entry->retval, + __print_array(__entry->data, 6, sizeof(__le32))) +); + +DEFINE_EVENT(cmdq, hns_cmdq_req, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); +DEFINE_EVENT(cmdq, hns_cmdq_resp, + TP_PROTO(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc), + TP_ARGS(hr_dev, desc)); + +#endif /* __HNS_ROCE_TRACE_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hns_roce_trace +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig index b6f9c41bca51..5f49a58590ed 100644 --- a/drivers/infiniband/hw/irdma/Kconfig +++ b/drivers/infiniband/hw/irdma/Kconfig @@ -7,6 +7,7 @@ config INFINIBAND_IRDMA depends on ICE && I40E select GENERIC_ALLOCATOR select AUXILIARY_BUS + select CRC32 help This is an Intel(R) Ethernet Protocol Driver for RDMA driver that support E810 (iWARP/RoCE) and X722 (iWARP) network devices. 
diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 1ee7a4e0d8d8..23207f13ac1b 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1985,7 +1985,8 @@ static int irdma_addr_resolve_neigh(struct irdma_device *iwdev, u32 src_ip, __be32 dst_ipaddr = htonl(dst_ip); __be32 src_ipaddr = htonl(src_ip); - rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0); + rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0, + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) { ibdev_dbg(&iwdev->ibdev, "CM: ip_route_output fail\n"); return -EINVAL; @@ -3302,7 +3303,7 @@ void irdma_cleanup_cm_core(struct irdma_cm_core *cm_core) if (!cm_core) return; - del_timer_sync(&cm_core->tcp_timer); + timer_delete_sync(&cm_core->tcp_timer); destroy_workqueue(cm_core->event_wq); cm_core->dev->ws_reset(&cm_core->iwdev->vsi); @@ -3630,7 +3631,7 @@ void irdma_free_lsmm_rsrc(struct irdma_qp *iwqp) /** * irdma_accept - registered call for connection to be accepted * @cm_id: cm information for passive connection - * @conn_param: accpet parameters + * @conn_param: accept parameters */ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 6aed6169c07d..99a7f1a6c0b5 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -3131,7 +3131,7 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); ibdev_dbg(to_ibdev(cqp->dev), - "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%pK] cqp[%p] polarity[x%04x]\n", + "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n", cqp->sq_size, cqp->hw_sq_size, cqp->sq_base, (u64 *)(uintptr_t)cqp->sq_pa, cqp, cqp->polarity); return 0; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index ad50b77282f8..69ce1862eabe 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -498,8 +498,6 @@ static int irdma_save_msix_info(struct irdma_pci_f *rf) iw_qvlist->num_vectors = rf->msix_count; if (rf->msix_count <= num_online_cpus()) rf->msix_shared = true; - else if (rf->msix_count > num_online_cpus() + 1) - rf->msix_count = num_online_cpus() + 1; pmsix = rf->msix_entries; for (i = 0, ceq_idx = 0; i < rf->msix_count; i++, iw_qvinfo++) { diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 3f13200ff71b..1e840bbd619d 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,10 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" -#include "../../../net/ethernet/intel/ice/ice.h" MODULE_ALIAS("i40iw"); -MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>"); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); MODULE_LICENSE("Dual BSD/GPL"); @@ -61,7 +59,7 @@ static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) } static void irdma_fill_qos_info(struct irdma_l2params *l2params, - struct iidc_qos_params *qos_info) + struct iidc_rdma_qos_params *qos_info) { int i; @@ -85,12 +83,13 @@ static void irdma_fill_qos_info(struct irdma_l2params *l2params, } } -static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event) +static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) { - struct 
irdma_device *iwdev = dev_get_drvdata(&pf->adev->dev); + struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); struct irdma_l2params l2params = {}; - if (*event->type & BIT(IIDC_EVENT_AFTER_MTU_CHANGE)) { + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); if (iwdev->vsi.mtu != iwdev->netdev->mtu) { l2params.mtu = iwdev->netdev->mtu; @@ -98,25 +97,26 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } - } else if (*event->type & BIT(IIDC_EVENT_BEFORE_TC_CHANGE)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { if (iwdev->vsi.tc_change_pending) return; irdma_prep_tc_change(iwdev); - } else if (*event->type & BIT(IIDC_EVENT_AFTER_TC_CHANGE)) { - struct iidc_qos_params qos_info = {}; + } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; if (!iwdev->vsi.tc_change_pending) return; l2params.tc_changed = true; ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = qos_info.num_tc > 1 && !l2params.dscp_mode; + iwdev->dcb_vlan_mode = + l2params.num_tc > 1 && !l2params.dscp_mode; irdma_change_l2params(&iwdev->vsi, &l2params); - } else if (*event->type & BIT(IIDC_EVENT_CRIT_ERR)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", event->reg); if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { @@ -151,10 +151,8 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event */ static void irdma_request_reset(struct irdma_pci_f *rf) { - struct ice_pf *pf = rf->cdev; - ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); - ice_rdma_request_reset(pf, IIDC_PFR); + ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); } /** @@ -166,14 +164,15 @@ static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; int ret; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; - ret = ice_add_rdma_qset(pf, &qset); + ret = ice_add_rdma_qset(cdev_info, &qset); if (ret) { ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); return ret; @@ -194,57 +193,105 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; qset.teid = tc_node->l2_sched_node_id; - if (ice_del_rdma_qset(pf, &qset)) + if (ice_del_rdma_qset(cdev_info, &qset)) ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); } +static int irdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + rf->msix_count = num_online_cpus() + 
IRDMA_NUM_AEQ_MSIX; + rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), + GFP_KERNEL); + if (!rf->msix_entries) + return -ENOMEM; + + for (i = 0; i < rf->msix_count; i++) + if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) + break; + + if (i < IRDMA_MIN_MSIX) { + while (--i >= 0) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); + return -ENOMEM; + } + + rf->msix_count = i; + + return 0; +} + +static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + for (i = 0; i < rf->msix_count; i++) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); +} + static void irdma_remove(struct auxiliary_device *aux_dev) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); + irdma_deinit_interrupts(iwdev->rf, cdev_info); + + kfree(iwdev->rf); - pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn)); + pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(cdev_info->pdev->devfn)); } -static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf, - struct ice_vsi *vsi) +static void irdma_fill_device_info(struct irdma_device *iwdev, + struct iidc_rdma_core_dev_info *cdev_info) { + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; struct irdma_pci_f *rf = iwdev->rf; - rf->cdev = pf; + rf->sc_dev.hw = &rf->hw; + rf->iwdev = iwdev; + rf->cdev = cdev_info; + rf->hw.hw_addr = iidc_priv->hw_addr; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->pf_id = iidc_priv->pf_id; rf->gen_ops.register_qset = irdma_lan_register_qset; rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; - rf->hw.hw_addr = pf->hw.hw_addr; - rf->pcidev = pf->pdev; - rf->msix_count = pf->num_rdma_msix; - rf->pf_id = pf->hw.pf_id; - rf->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; - rf->default_vsi.vsi_idx = vsi->vsi_num; - rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? - IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; + + rf->default_vsi.vsi_idx = iidc_priv->vport_id; + rf->protocol_used = + cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? 
+ IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; rf->rdma_ver = IRDMA_GEN_2; rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; rf->rst_to = IRDMA_RST_TIMEOUT_HZ; rf->gen_ops.request_reset = irdma_request_reset; rf->limits_sel = 7; rf->iwdev = iwdev; + mutex_init(&iwdev->ah_tbl_lock); - iwdev->netdev = vsi->netdev; - iwdev->vsi_num = vsi->vsi_num; + + iwdev->netdev = iidc_priv->netdev; + iwdev->vsi_num = iidc_priv->vport_id; iwdev->init_state = INITIAL_STATE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; @@ -256,19 +303,18 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; - struct ice_vsi *vsi = ice_get_main_vsi(pf); - struct iidc_qos_params qos_info = {}; + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + struct iidc_rdma_priv_dev_info *iidc_priv; + struct irdma_l2params l2params = {}; struct irdma_device *iwdev; struct irdma_pci_f *rf; - struct irdma_l2params l2params = {}; int err; - if (!vsi) - return -EIO; + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + iidc_priv = cdev_info->iidc_priv; + iwdev = ib_alloc_device(irdma_device, ibdev); if (!iwdev) return -ENOMEM; @@ -278,16 +324,19 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ return -ENOMEM; } - irdma_fill_device_info(iwdev, pf, vsi); + irdma_fill_device_info(iwdev, cdev_info); rf = iwdev->rf; + err = irdma_init_interrupts(rf, cdev_info); + if (err) + goto err_init_interrupts; + err = irdma_ctrl_init_hw(rf); if (err) goto err_ctrl_init; l2params.mtu = iwdev->netdev->mtu; - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; @@ -299,7 +348,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ if (err) goto err_ibreg; - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, true); + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); auxiliary_set_drvdata(aux_dev, iwdev); @@ -311,6 +360,8 @@ err_ibreg: err_rt_init: irdma_ctrl_deinit_hw(rf); err_ctrl_init: + irdma_deinit_interrupts(rf, cdev_info); +err_init_interrupts: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); @@ -325,7 +376,7 @@ static const struct auxiliary_device_id irdma_auxiliary_id_table[] = { MODULE_DEVICE_TABLE(auxiliary, irdma_auxiliary_id_table); -static struct iidc_auxiliary_drv irdma_auxiliary_drv = { +static struct iidc_rdma_core_auxiliary_drv irdma_auxiliary_drv = { .adrv = { .id_table = irdma_auxiliary_id_table, .probe = irdma_probe, diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index b65bc2ea542f..674acc952168 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -29,8 +29,8 @@ #include <linux/io-64-nonatomic-lo-hi.h> #endif #include <linux/auxiliary_bus.h> -#include <linux/net/intel/iidc.h> -#include <crypto/hash.h> +#include <linux/net/intel/iidc_rdma.h> +#include 
<linux/net/intel/iidc_rdma_ice.h> #include <rdma/ib_smi.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> @@ -117,6 +117,9 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_IRQ_NAME_STR_LEN (64) +#define IRDMA_NUM_AEQ_MSIX 1 +#define IRDMA_MIN_MSIX 2 + enum init_completion_state { INVALID_STATE = 0, INITIAL_STATE, @@ -239,7 +242,7 @@ struct irdma_qv_info { struct irdma_qvlist_info { u32 num_vectors; - struct irdma_qv_info qv_info[]; + struct irdma_qv_info qv_info[] __counted_by(num_vectors); }; struct irdma_gen_ops { diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index e1e3d3ae72b7..3f73ceacccb6 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -5,9 +5,8 @@ #include <linux/pci.h> #include <linux/bitfield.h> -#include <linux/net/intel/iidc.h> -#include <crypto/hash.h> #include <rdma/ib_verbs.h> +#include <net/dscp.h> #define STATS_TIMER_DELAY 60000 @@ -43,15 +42,12 @@ enum irdma_status_code irdma_vf_wait_vchnl_resp(struct irdma_sc_dev *dev); bool irdma_vf_clear_to_send(struct irdma_sc_dev *dev); void irdma_add_dev_ref(struct irdma_sc_dev *dev); void irdma_put_dev_ref(struct irdma_sc_dev *dev); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); void irdma_send_ieq_ack(struct irdma_sc_qp *qp); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); -void irdma_free_hash_desc(struct shash_desc *hash_desc); -int irdma_init_hash_desc(struct shash_desc **hash_desc); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, @@ -59,10 +55,6 @@ int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, struct irdma_hmc_fcn_info *hmcfcninfo, u16 *pmf_idx); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); void *irdma_remove_cqp_head(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index e7ce6840755f..37ce35cb10e7 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -108,7 +108,7 @@ static int add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc, chunk->vaddr = sd_entry->u.bp.addr.va + offset; chunk->fpm_addr = pble_rsrc->next_fpm_addr; ibdev_dbg(to_ibdev(dev), - "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%pK fpm_addr = %llx\n", + "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%p fpm_addr = %llx\n", chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr); return 0; diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index d7c8ea948bcd..c0c9441885d3 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -85,10 +85,6 @@ int irdma_process_cqp_cmd(struct irdma_sc_dev *dev, int irdma_process_bh(struct irdma_sc_dev *dev); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, struct irdma_update_sds_info *info); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int 
irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 7e3f9bca2c23..694e5a9ed15d 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -923,8 +923,6 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, switch (rsrc->cmpl) { case PUDA_HASH_CRC_COMPLETE: - irdma_free_hash_desc(rsrc->hash_desc); - fallthrough; case PUDA_QP_CREATED: irdma_qp_rem_qos(&rsrc->qp); @@ -1095,15 +1093,12 @@ int irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, goto error; if (info->type == IRDMA_PUDA_RSRC_TYPE_IEQ) { - if (!irdma_init_hash_desc(&rsrc->hash_desc)) { - rsrc->check_crc = true; - rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; - ret = 0; - } + rsrc->check_crc = true; + rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; } irdma_sc_ccq_arm(&rsrc->cq); - return ret; + return 0; error: irdma_puda_dele_rsrc(vsi, info->type, false); @@ -1396,8 +1391,8 @@ static int irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, crcptr = txbuf->data + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) { - status = irdma_ieq_check_mpacrc(ieq->hash_desc, txbuf->data, - (fpdu_len - 4), mpacrc); + status = irdma_ieq_check_mpacrc(txbuf->data, fpdu_len - 4, + mpacrc); if (status) { ibdev_dbg(to_ibdev(ieq->dev), "IEQ: error bad crc\n"); goto error; @@ -1465,8 +1460,8 @@ static int irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, crcptr = datap + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) - ret = irdma_ieq_check_mpacrc(ieq->hash_desc, datap, - fpdu_len - 4, mpacrc); + ret = irdma_ieq_check_mpacrc(datap, fpdu_len - 4, + mpacrc); if (ret) { list_add(&buf->list, rxlist); ibdev_dbg(to_ibdev(ieq->dev), diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index bc6d9514c9c1..2fc638f2b143 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -119,7 +119,6 @@ struct irdma_puda_rsrc { u32 rx_wqe_idx; u32 rxq_invalid_cnt; u32 tx_wqe_avail_cnt; - struct shash_desc *hash_desc; struct list_head txpend; struct list_head bufpool; /* free buffers pool list for recv and xmit */ u32 alloc_buf_count; @@ -163,10 +162,8 @@ struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, u32 val); -int irdma_init_hash_desc(struct shash_desc **desc); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); -void irdma_free_hash_desc(struct shash_desc *desc); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); int irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq); diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 59b34afa867b..527c6da2c1ac 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -567,7 +567,7 @@ struct irdma_sc_vsi { u8 qos_rel_bw; u8 qos_prio_type; u8 stats_idx; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; struct irdma_qos 
qos[IRDMA_MAX_USER_PRIORITY]; u64 hw_stats_regs[IRDMA_HW_STAT_INDEX_MAX_GEN_1]; bool dscp_mode:1; @@ -695,7 +695,7 @@ struct irdma_l2params { u16 qs_handle_list[IRDMA_MAX_USER_PRIORITY]; u16 mtu; u8 up2tc[IRDMA_MAX_USER_PRIORITY]; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; u8 num_tc; u8 vsi_rel_bw; u8 vsi_prio_type; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0422787592d8..d66b4f7a84ec 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -320,9 +320,6 @@ int irdma_netdevice_event(struct notifier_block *notifier, unsigned long event, case NETDEV_DOWN: iwdev->iw_status = 0; fallthrough; - case NETDEV_UP: - irdma_port_ibevent(iwdev); - break; default: break; } @@ -966,80 +963,12 @@ void irdma_terminate_del_timer(struct irdma_sc_qp *qp) int ret; iwqp = qp->qp_uk.back_qp; - ret = del_timer(&iwqp->terminate_timer); + ret = timer_delete(&iwqp->terminate_timer); if (ret) irdma_qp_rem_ref(&iwqp->ibqp); } /** - * irdma_cqp_query_fpm_val_cmd - send cqp command for fpm - * @dev: function device struct - * @val_mem: buffer for fpm - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.query_fpm_val.cqp = dev->cqp; - cqp_info->in.u.query_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.query_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.query_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_QUERY_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.query_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - -/** - * irdma_cqp_commit_fpm_val_cmd - commit fpm values in hw - * @dev: hardware control device structure - * @val_mem: buffer with fpm values - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.commit_fpm_val.cqp = dev->cqp; - cqp_info->in.u.commit_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.commit_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.commit_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_COMMIT_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.commit_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - -/** * irdma_cqp_cq_create_cmd - create a cq for the cqp * @dev: device pointer * @cq: pointer to created cq @@ -1345,57 +1274,14 @@ void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) } /** - * irdma_init_hash_desc - initialize hash for crc calculation - * @desc: cryption type - */ -int irdma_init_hash_desc(struct shash_desc **desc) -{ - struct crypto_shash *tfm; - struct 
shash_desc *tdesc; - - tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return -EINVAL; - - tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!tdesc) { - crypto_free_shash(tfm); - return -EINVAL; - } - - tdesc->tfm = tfm; - *desc = tdesc; - - return 0; -} - -/** - * irdma_free_hash_desc - free hash desc - * @desc: to be freed - */ -void irdma_free_hash_desc(struct shash_desc *desc) -{ - if (desc) { - crypto_free_shash(desc->tfm); - kfree(desc); - } -} - -/** * irdma_ieq_check_mpacrc - check if mpa crc is OK - * @desc: desc for hash * @addr: address of buffer for crc * @len: length of buffer * @val: value to be compared */ -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val) +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val) { - u32 crc = 0; - - crypto_shash_digest(desc, addr, len, (u8 *)&crc); - if (crc != val) + if ((__force u32)cpu_to_le32(~crc32c(~0, addr, len)) != val) return -EINVAL; return 0; @@ -1684,7 +1570,7 @@ void irdma_hw_stats_stop_timer(struct irdma_sc_vsi *vsi) { struct irdma_vsi_pestat *devstat = vsi->pestat; - del_timer_sync(&devstat->stats_timer); + timer_delete_sync(&devstat->stats_timer); } /** diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 12704efb7b19..1e8c92826de2 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1347,7 +1347,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->max_dest_rd_atomic > dev->hw_attrs.max_hw_ird) { ibdev_err(&iwdev->ibdev, "rd_atomic = %d, above max_hw_ird=%d\n", - attr->max_rd_atomic, + attr->max_dest_rd_atomic, dev->hw_attrs.max_hw_ird); return -EINVAL; } @@ -2035,14 +2035,15 @@ static inline int cq_validate_flags(u32 flags, u8 hw_rev) * irdma_create_cq - create cq * @ibcq: CQ allocated * @attr: attributes for cq - * @udata: user data + * @attrs: uverbs attribute bundle */ static int irdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { #define IRDMA_CREATE_CQ_MIN_REQ_LEN offsetofend(struct irdma_create_cq_req, user_cq_buf) #define IRDMA_CREATE_CQ_MIN_RESP_LEN offsetofend(struct irdma_create_cq_resp, cq_size) + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct irdma_device *iwdev = to_iwdev(ibdev); struct irdma_pci_f *rf = iwdev->rf; @@ -3084,7 +3085,7 @@ error: static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); struct ib_umem_dmabuf *umem_dmabuf; @@ -4870,5 +4871,4 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) irdma_rt_deinit_hw(iwdev); irdma_ctrl_deinit_hw(iwdev->rf); - kfree(iwdev->rf); } diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile index 88655fe5e398..921c05e08b11 100644 --- a/drivers/infiniband/hw/mana/Makefile +++ b/drivers/infiniband/hw/mana/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o -mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o wr.o counters.o diff --git a/drivers/infiniband/hw/mana/ah.c b/drivers/infiniband/hw/mana/ah.c new file mode 100644 index 000000000000..f56952eebbaa --- /dev/null +++ b/drivers/infiniband/hw/mana/ah.c @@ -0,0 +1,58 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + struct rdma_ah_attr *ah_attr = attr->ah_attr; + const struct ib_global_route *grh; + enum rdma_network_type ntype; + + if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE || + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + if (udata) + return -EINVAL; + + ah->av = dma_pool_zalloc(mdev->av_pool, GFP_ATOMIC, &ah->dma_handle); + if (!ah->av) + return -ENOMEM; + + grh = rdma_ah_read_grh(ah_attr); + ntype = rdma_gid_attr_network_type(grh->sgid_attr); + + copy_in_reverse(ah->av->dest_mac, ah_attr->roce.dmac, ETH_ALEN); + ah->av->udp_src_port = rdma_flow_label_to_udp_sport(grh->flow_label); + ah->av->hop_limit = grh->hop_limit; + ah->av->dscp = (grh->traffic_class >> 2) & 0x3f; + ah->av->is_ipv6 = (ntype == RDMA_NETWORK_IPV6); + + if (ah->av->is_ipv6) { + copy_in_reverse(ah->av->dest_ip, grh->dgid.raw, 16); + copy_in_reverse(ah->av->src_ip, grh->sgid_attr->gid.raw, 16); + } else { + ah->av->dest_ip[10] = 0xFF; + ah->av->dest_ip[11] = 0xFF; + copy_in_reverse(&ah->av->dest_ip[12], &grh->dgid.raw[12], 4); + copy_in_reverse(&ah->av->src_ip[12], &grh->sgid_attr->gid.raw[12], 4); + } + + return 0; +} + +int mana_ib_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + + dma_pool_free(mdev->av_pool, ah->av, ah->dma_handle); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/counters.c b/drivers/infiniband/hw/mana/counters.c new file mode 100644 index 000000000000..e533ce21013d --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "counters.h" + +static const struct rdma_stat_desc mana_ib_port_stats_desc[] = { + [MANA_IB_REQUESTER_TIMEOUT].name = "requester_timeout", + [MANA_IB_REQUESTER_OOS_NAK].name = "requester_oos_nak", + [MANA_IB_REQUESTER_RNR_NAK].name = "requester_rnr_nak", + [MANA_IB_RESPONDER_RNR_NAK].name = "responder_rnr_nak", + [MANA_IB_RESPONDER_OOS].name = "responder_oos", + [MANA_IB_RESPONDER_DUP_REQUEST].name = "responder_dup_request", + [MANA_IB_REQUESTER_IMPLICIT_NAK].name = "requester_implicit_nak", + [MANA_IB_REQUESTER_READRESP_PSN_MISMATCH].name = "requester_readresp_psn_mismatch", + [MANA_IB_NAK_INV_REQ].name = "nak_inv_req", + [MANA_IB_NAK_ACCESS_ERR].name = "nak_access_error", + [MANA_IB_NAK_OPP_ERR].name = "nak_opp_error", + [MANA_IB_NAK_INV_READ].name = "nak_inv_read", + [MANA_IB_RESPONDER_LOCAL_LEN_ERR].name = "responder_local_len_error", + [MANA_IB_REQUESTOR_LOCAL_PROT_ERR].name = "requestor_local_prot_error", + [MANA_IB_RESPONDER_REM_ACCESS_ERR].name = "responder_rem_access_error", + [MANA_IB_RESPONDER_LOCAL_QP_ERR].name = "responder_local_qp_error", + [MANA_IB_RESPONDER_MALFORMED_WQE].name = "responder_malformed_wqe", + [MANA_IB_GENERAL_HW_ERR].name = "general_hw_error", + [MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED].name = "requester_rnr_nak_retries_exceeded", + [MANA_IB_REQUESTER_RETRIES_EXCEEDED].name = "requester_retries_exceeded", + [MANA_IB_TOTAL_FATAL_ERR].name = "total_fatal_error", + [MANA_IB_RECEIVED_CNPS].name = "received_cnps", + [MANA_IB_NUM_QPS_CONGESTED].name = "num_qps_congested", + [MANA_IB_RATE_INC_EVENTS].name = "rate_inc_events", + [MANA_IB_NUM_QPS_RECOVERED].name = "num_qps_recovered", + [MANA_IB_CURRENT_RATE].name = "current_rate", +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num) +{ + return rdma_alloc_hw_stats_struct(mana_ib_port_stats_desc, + ARRAY_SIZE(mana_ib_port_stats_desc), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, + ib_dev); + struct mana_rnic_query_vf_cntrs_resp resp = {}; + struct mana_rnic_query_vf_cntrs_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_VF_COUNTERS, + sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(mdev_to_gc(mdev), sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to query vf counters err %d", + err); + return err; + } + + stats->value[MANA_IB_REQUESTER_TIMEOUT] = resp.requester_timeout; + stats->value[MANA_IB_REQUESTER_OOS_NAK] = resp.requester_oos_nak; + stats->value[MANA_IB_REQUESTER_RNR_NAK] = resp.requester_rnr_nak; + stats->value[MANA_IB_RESPONDER_RNR_NAK] = resp.responder_rnr_nak; + stats->value[MANA_IB_RESPONDER_OOS] = resp.responder_oos; + stats->value[MANA_IB_RESPONDER_DUP_REQUEST] = resp.responder_dup_request; + stats->value[MANA_IB_REQUESTER_IMPLICIT_NAK] = + resp.requester_implicit_nak; + stats->value[MANA_IB_REQUESTER_READRESP_PSN_MISMATCH] = + resp.requester_readresp_psn_mismatch; + stats->value[MANA_IB_NAK_INV_REQ] = resp.nak_inv_req; + stats->value[MANA_IB_NAK_ACCESS_ERR] = resp.nak_access_err; + stats->value[MANA_IB_NAK_OPP_ERR] = resp.nak_opp_err; + stats->value[MANA_IB_NAK_INV_READ] = resp.nak_inv_read; + stats->value[MANA_IB_RESPONDER_LOCAL_LEN_ERR] = + resp.responder_local_len_err; + 
stats->value[MANA_IB_REQUESTOR_LOCAL_PROT_ERR] = + resp.requestor_local_prot_err; + stats->value[MANA_IB_RESPONDER_REM_ACCESS_ERR] = + resp.responder_rem_access_err; + stats->value[MANA_IB_RESPONDER_LOCAL_QP_ERR] = + resp.responder_local_qp_err; + stats->value[MANA_IB_RESPONDER_MALFORMED_WQE] = + resp.responder_malformed_wqe; + stats->value[MANA_IB_GENERAL_HW_ERR] = resp.general_hw_err; + stats->value[MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED] = + resp.requester_rnr_nak_retries_exceeded; + stats->value[MANA_IB_REQUESTER_RETRIES_EXCEEDED] = + resp.requester_retries_exceeded; + stats->value[MANA_IB_TOTAL_FATAL_ERR] = resp.total_fatal_err; + + stats->value[MANA_IB_RECEIVED_CNPS] = resp.received_cnps; + stats->value[MANA_IB_NUM_QPS_CONGESTED] = resp.num_qps_congested; + stats->value[MANA_IB_RATE_INC_EVENTS] = resp.rate_inc_events; + stats->value[MANA_IB_NUM_QPS_RECOVERED] = resp.num_qps_recovered; + stats->value[MANA_IB_CURRENT_RATE] = resp.current_rate; + + return ARRAY_SIZE(mana_ib_port_stats_desc); +} diff --git a/drivers/infiniband/hw/mana/counters.h b/drivers/infiniband/hw/mana/counters.h new file mode 100644 index 000000000000..7ff92d27f6c3 --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + */ + +#ifndef _COUNTERS_H_ +#define _COUNTERS_H_ + +#include "mana_ib.h" + +enum mana_ib_port_counters { + MANA_IB_REQUESTER_TIMEOUT, + MANA_IB_REQUESTER_OOS_NAK, + MANA_IB_REQUESTER_RNR_NAK, + MANA_IB_RESPONDER_RNR_NAK, + MANA_IB_RESPONDER_OOS, + MANA_IB_RESPONDER_DUP_REQUEST, + MANA_IB_REQUESTER_IMPLICIT_NAK, + MANA_IB_REQUESTER_READRESP_PSN_MISMATCH, + MANA_IB_NAK_INV_REQ, + MANA_IB_NAK_ACCESS_ERR, + MANA_IB_NAK_OPP_ERR, + MANA_IB_NAK_INV_READ, + MANA_IB_RESPONDER_LOCAL_LEN_ERR, + MANA_IB_REQUESTOR_LOCAL_PROT_ERR, + MANA_IB_RESPONDER_REM_ACCESS_ERR, + MANA_IB_RESPONDER_LOCAL_QP_ERR, + MANA_IB_RESPONDER_MALFORMED_WQE, + MANA_IB_GENERAL_HW_ERR, + MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED, + MANA_IB_REQUESTER_RETRIES_EXCEEDED, + MANA_IB_TOTAL_FATAL_ERR, + MANA_IB_RECEIVED_CNPS, + MANA_IB_NUM_QPS_CONGESTED, + MANA_IB_RATE_INC_EVENTS, + MANA_IB_NUM_QPS_RECOVERED, + MANA_IB_CURRENT_RATE, +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num); +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index); +#endif /* _COUNTERS_H_ */ diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 4a71e678d09c..28e154bbb50f 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -6,69 +6,102 @@ #include "mana_ib.h" int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_create_cq_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; - struct gdma_context *gc; + bool is_rnic_cq; + u32 doorbell; + u32 buf_size; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev_to_gc(mdev); - if (udata->inlen < sizeof(ucmd)) - return -EINVAL; + cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; + cq->cq_handle = INVALID_MANA_HANDLE; - if (attr->comp_vector > gc->max_num_queues) - return 
-EINVAL; + if (udata) { + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) + return -EINVAL; - cq->comp_vector = attr->comp_vector; + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Failed to copy from udata for create cq, %d\n", err); + return err; + } - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); - if (err) { - ibdev_dbg(ibdev, - "Failed to copy from udata for create cq, %d\n", err); - return err; - } + is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if (attr->cqe > mdev->adapter_caps.max_qp_wr) { - ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); - return -EINVAL; - } + if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || + attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } - cq->cqe = attr->cqe; - cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(cq->umem)) { - err = PTR_ERR(cq->umem); - ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", - err); - return err; + cq->cqe = attr->cqe; + err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); + return err; + } + + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + doorbell = mana_ucontext->doorbell; + } else { + is_rnic_cq = true; + buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE)); + cq->cqe = buf_size / COMP_ENTRY_SIZE; + err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err); + return err; + } + doorbell = mdev->gdma_dev->doorbell; } - err = mana_ib_create_zero_offset_dma_region(mdev, cq->umem, &cq->gdma_region); - if (err) { - ibdev_dbg(ibdev, - "Failed to create dma region for create cq, %d\n", - err); - goto err_release_umem; + if (is_rnic_cq) { + err = mana_ib_gd_create_cq(mdev, cq, doorbell); + if (err) { + ibdev_dbg(ibdev, "Failed to create RNIC cq, %d\n", err); + goto err_destroy_queue; + } + + err = mana_ib_install_cq_cb(mdev, cq); + if (err) { + ibdev_dbg(ibdev, "Failed to install cq callback, %d\n", err); + goto err_destroy_rnic_cq; + } } - ibdev_dbg(ibdev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, cq->gdma_region); + if (udata) { + resp.cqid = cq->queue.id; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto err_remove_cq_cb; + } + } - /* - * The CQ ID is not known at this time. 
The ID is generated at create_qp - */ - cq->id = INVALID_QUEUE_ID; + spin_lock_init(&cq->cq_lock); + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); return 0; -err_release_umem: - ib_umem_release(cq->umem); +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, cq); +err_destroy_rnic_cq: + mana_ib_gd_destroy_cq(mdev, cq); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &cq->queue); + return err; } @@ -77,25 +110,17 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); struct ib_device *ibdev = ibcq->device; struct mana_ib_dev *mdev; - struct gdma_context *gc; - int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev_to_gc(mdev); - err = mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); - if (err) { - ibdev_dbg(ibdev, - "Failed to destroy dma region, %d\n", err); - return err; - } + mana_ib_remove_cq_cb(mdev, cq); - if (cq->id != INVALID_QUEUE_ID) { - kfree(gc->cq_table[cq->id]); - gc->cq_table[cq->id] = NULL; - } + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_cq(mdev, cq); - ib_umem_release(cq->umem); + mana_ib_destroy_queue(mdev, &cq->queue); return 0; } @@ -113,16 +138,179 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue *gdma_cq; + if (cq->queue.id >= gc->max_num_cqs) + return -EINVAL; /* Create CQ table entry */ - WARN_ON(gc->cq_table[cq->id]); - gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); + WARN_ON(gc->cq_table[cq->queue.id]); + if (cq->queue.kmem) + gdma_cq = cq->queue.kmem; + else + gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); if (!gdma_cq) return -ENOMEM; gdma_cq->cq.context = cq; gdma_cq->type = GDMA_CQ; gdma_cq->cq.callback = mana_ib_cq_handler; - gdma_cq->id = cq->id; - gc->cq_table[cq->id] = gdma_cq; + gdma_cq->id = cq->queue.id; + gc->cq_table[cq->queue.id] = gdma_cq; return 0; } + +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + + if (cq->queue.id >= gc->max_num_cqs || cq->queue.id == INVALID_QUEUE_ID) + return; + + if (cq->queue.kmem) + /* Then it will be cleaned and removed by the mana */ + return; + + kfree(gc->cq_table[cq->queue.id]); + gc->cq_table[cq->queue.id] = NULL; +} + +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct gdma_queue *gdma_cq = cq->queue.kmem; + + if (!gdma_cq) + return -EINVAL; + + mana_gd_ring_cq(gdma_cq, SET_ARM_BIT); + return 0; +} + +static inline void handle_ud_sq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct gdma_queue *wq = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct ud_sq_shadow_wqe *shadow_wqe; + + shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq); + if (!shadow_wqe) + return; + + shadow_wqe->header.error_code = rdma_cqe->ud_send.vendor_error; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); +} + +static inline void handle_ud_rq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct gdma_queue *wq = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct ud_rq_shadow_wqe *shadow_wqe; + + shadow_wqe = 
shadow_queue_get_next_to_complete(&qp->shadow_rq); + if (!shadow_wqe) + return; + + shadow_wqe->byte_len = rdma_cqe->ud_recv.msg_len; + shadow_wqe->src_qpn = rdma_cqe->ud_recv.src_qpn; + shadow_wqe->header.error_code = IB_WC_SUCCESS; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_rq); +} + +static void mana_handle_cqe(struct mana_ib_dev *mdev, struct gdma_comp *cqe) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, cqe->wq_num, cqe->is_sq); + + if (!qp) + return; + + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) { + if (cqe->is_sq) + handle_ud_sq_cqe(qp, cqe); + else + handle_ud_rq_cqe(qp, cqe); + } + + mana_put_qp_ref(qp); +} + +static void fill_verbs_from_shadow_wqe(struct mana_ib_qp *qp, struct ib_wc *wc, + const struct shadow_wqe_header *shadow_wqe) +{ + const struct ud_rq_shadow_wqe *ud_wqe = (const struct ud_rq_shadow_wqe *)shadow_wqe; + + wc->wr_id = shadow_wqe->wr_id; + wc->status = shadow_wqe->error_code; + wc->opcode = shadow_wqe->opcode; + wc->vendor_err = shadow_wqe->error_code; + wc->wc_flags = 0; + wc->qp = &qp->ibqp; + wc->pkey_index = 0; + + if (shadow_wqe->opcode == IB_WC_RECV) { + wc->byte_len = ud_wqe->byte_len; + wc->src_qp = ud_wqe->src_qpn; + wc->wc_flags |= IB_WC_GRH; + } +} + +static int mana_process_completions(struct mana_ib_cq *cq, int nwc, struct ib_wc *wc) +{ + struct shadow_wqe_header *shadow_wqe; + struct mana_ib_qp *qp; + int wc_index = 0; + + /* process send shadow queue completions */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_sq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_sq); + wc_index++; + } + } + + /* process recv shadow queue completions */ + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_rq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_rq); + wc_index++; + } + } + +out: + return wc_index; +} + +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_dev *mdev = container_of(ibcq->device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = cq->queue.kmem; + struct gdma_comp gdma_cqe; + unsigned long flags; + int num_polled = 0; + int comp_read, i; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + comp_read = mana_gd_poll_cq(queue, &gdma_cqe, 1); + if (comp_read < 1) + break; + mana_handle_cqe(mdev, &gdma_cqe); + } + + num_polled = mana_process_completions(cq, num_entries, wc); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return num_polled; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 6fa902ee80a6..165c0a1e67d1 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -5,39 +5,53 @@ #include "mana_ib.h" #include <net/mana/mana_auxiliary.h> +#include <net/addrconf.h> MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); MODULE_LICENSE("GPL"); -MODULE_IMPORT_NS(NET_MANA); +MODULE_IMPORT_NS("NET_MANA"); static const struct ib_device_ops mana_ib_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MANA, .uverbs_abi_ver = 
MANA_IB_UVERBS_ABI_VERSION, + .add_gid = mana_ib_gd_add_gid, .alloc_pd = mana_ib_alloc_pd, .alloc_ucontext = mana_ib_alloc_ucontext, + .create_ah = mana_ib_create_ah, .create_cq = mana_ib_create_cq, .create_qp = mana_ib_create_qp, .create_rwq_ind_table = mana_ib_create_rwq_ind_table, .create_wq = mana_ib_create_wq, .dealloc_pd = mana_ib_dealloc_pd, .dealloc_ucontext = mana_ib_dealloc_ucontext, + .del_gid = mana_ib_gd_del_gid, .dereg_mr = mana_ib_dereg_mr, + .destroy_ah = mana_ib_destroy_ah, .destroy_cq = mana_ib_destroy_cq, .destroy_qp = mana_ib_destroy_qp, .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, .destroy_wq = mana_ib_destroy_wq, .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_dma_mr = mana_ib_get_dma_mr, + .get_link_layer = mana_ib_get_link_layer, .get_port_immutable = mana_ib_get_port_immutable, .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, + .poll_cq = mana_ib_poll_cq, + .post_recv = mana_ib_post_recv, + .post_send = mana_ib_post_send, .query_device = mana_ib_query_device, .query_gid = mana_ib_query_gid, + .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, + .reg_user_mr_dmabuf = mana_ib_reg_user_mr_dmabuf, + .req_notify_cq = mana_ib_arm_cq, + INIT_RDMA_OBJ_SIZE(ib_ah, mana_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), @@ -46,64 +60,156 @@ static const struct ib_device_ops mana_ib_dev_ops = { ib_ind_table), }; +static const struct ib_device_ops mana_ib_stats_ops = { + .alloc_hw_port_stats = mana_ib_alloc_hw_port_stats, + .get_hw_stats = mana_ib_get_hw_stats, +}; + +static int mana_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mana_ib_dev *dev = container_of(this, struct mana_ib_dev, nb); + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct gdma_context *gc = dev->gdma_dev->gdma_context; + struct mana_context *mc = gc->mana.driver_data; + struct net_device *ndev; + + /* Only process events from our parent device */ + if (event_dev != mc->ports[0]) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGEUPPER: + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); + /* + * RDMA core will setup GID based on updated netdev. + * It's not possible to race with the core as rtnl lock is being + * held. 
+ */ + ib_device_set_netdev(&dev->ib_dev, ndev, 1); + + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_context *gc = madev->mdev->gdma_context; + struct mana_context *mc = gc->mana.driver_data; struct gdma_dev *mdev = madev->mdev; - struct mana_context *mc; + struct net_device *ndev; struct mana_ib_dev *dev; + u8 mac_addr[ETH_ALEN]; int ret; - mc = mdev->driver_data; - dev = ib_alloc_device(mana_ib_dev, ib_dev); if (!dev) return -ENOMEM; ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); - - dev->ib_dev.phys_port_cnt = mc->num_ports; - - ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, - mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); - dev->ib_dev.node_type = RDMA_NODE_IB_CA; - - /* - * num_comp_vectors needs to set to the max MSIX index - * when interrupts and event queues are implemented - */ - dev->ib_dev.num_comp_vectors = 1; - dev->ib_dev.dev.parent = mdev->gdma_context->dev; - - ret = mana_gd_register_device(&mdev->gdma_context->mana_ib); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to register device, ret %d", - ret); - goto free_ib_device; + dev->ib_dev.num_comp_vectors = gc->max_num_queues; + dev->ib_dev.dev.parent = gc->dev; + dev->gdma_dev = mdev; + xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ); + + if (mana_ib_is_rnic(dev)) { + dev->ib_dev.phys_port_cnt = 1; + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); + if (!ndev) { + ret = -ENODEV; + ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); + goto free_ib_device; + } + ether_addr_copy(mac_addr, ndev->dev_addr); + addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); + ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); + goto free_ib_device; + } + + dev->nb.notifier_call = mana_ib_netdev_event; + ret = register_netdevice_notifier(&dev->nb); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", + ret); + goto free_ib_device; + } + + ret = mana_ib_gd_query_adapter_caps(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret); + goto deregister_net_notifier; + } + + ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); + + ret = mana_ib_create_eqs(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); + goto deregister_net_notifier; + } + + ret = mana_ib_gd_create_rnic_adapter(dev); + if (ret) + goto destroy_eqs; + + ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", ret); + goto destroy_rnic; + } + } else { + dev->ib_dev.phys_port_cnt = mc->num_ports; + ret = mana_eth_query_adapter_caps(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to query ETH device caps, ret %d", ret); + goto free_ib_device; + } } - dev->gdma_dev = &mdev->gdma_context->mana_ib; - ret = mana_ib_gd_query_adapter_caps(dev); - if (ret) { - ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", - ret); - goto deregister_device; + dev->av_pool = dma_pool_create("mana_ib_av", gc->dev, MANA_AV_BUFFER_SIZE, + MANA_AV_BUFFER_SIZE, 0); + if (!dev->av_pool) { + 
ret = -ENOMEM; + goto destroy_rnic; } - ret = ib_register_device(&dev->ib_dev, "mana_%d", - mdev->gdma_context->dev); + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + ret = ib_register_device(&dev->ib_dev, mana_ib_is_rnic(dev) ? "mana_%d" : "manae_%d", + gc->dev); if (ret) - goto deregister_device; + goto deallocate_pool; dev_set_drvdata(&adev->dev, dev); return 0; -deregister_device: - mana_gd_deregister_device(dev->gdma_dev); +deallocate_pool: + dma_pool_destroy(dev->av_pool); +destroy_rnic: + if (mana_ib_is_rnic(dev)) + mana_ib_gd_destroy_rnic_adapter(dev); +destroy_eqs: + if (mana_ib_is_rnic(dev)) + mana_ib_destroy_eqs(dev); +deregister_net_notifier: + if (mana_ib_is_rnic(dev)) + unregister_netdevice_notifier(&dev->nb); free_ib_device: + xa_destroy(&dev->qp_table_wq); ib_dealloc_device(&dev->ib_dev); return ret; } @@ -113,23 +219,25 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); - - mana_gd_deregister_device(dev->gdma_dev); - + dma_pool_destroy(dev->av_pool); + if (mana_ib_is_rnic(dev)) { + mana_ib_gd_destroy_rnic_adapter(dev); + mana_ib_destroy_eqs(dev); + unregister_netdevice_notifier(&dev->nb); + } + xa_destroy(&dev->qp_table_wq); ib_dealloc_device(&dev->ib_dev); } static const struct auxiliary_device_id mana_id_table[] = { - { - .name = "mana.rdma", - }, + { .name = "mana.rdma", }, + { .name = "mana.eth", }, {}, }; MODULE_DEVICE_TABLE(auxiliary, mana_id_table); static struct auxiliary_driver mana_driver = { - .name = "rdma", .probe = mana_ib_probe, .remove = mana_ib_remove, .id_table = mana_id_table, diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 71e33feee61b..41a24a186f9d 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -4,6 +4,7 @@ */ #include "mana_ib.h" +#include "linux/pci.h" void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port) @@ -82,6 +83,9 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), sizeof(resp)); + if (!udata) + flags |= GDMA_PD_FLAG_ALLOW_GPA_MR; + req.flags = flags; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); @@ -174,7 +178,7 @@ static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; req.num_resources = 1; - req.alignment = 1; + req.alignment = PAGE_SIZE / MANA_PAGE_SIZE; /* Have GDMA start searching from 0 */ req.allocated_resources = 0; @@ -237,6 +241,69 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); } +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue) +{ + struct gdma_queue_spec spec = {}; + int err; + + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + spec.type = type; + spec.monitor_avl_buf = false; + spec.queue_size = size; + err = mana_gd_create_mana_wq_cq(mdev->gdma_dev, &spec, &queue->kmem); + if (err) + return err; + /* take ownership into mana_ib from mana */ + queue->gdma_region = queue->kmem->mem_info.dma_region_handle; + queue->kmem->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; + return 0; +} + +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue) +{ + struct 
ib_umem *umem; + int err; + + queue->umem = NULL; + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + + umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %d\n", err); + return err; + } + + err = mana_ib_create_zero_offset_dma_region(mdev, umem, &queue->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to create dma region, %d\n", err); + goto free_umem; + } + queue->umem = umem; + + ibdev_dbg(&mdev->ib_dev, "created dma region 0x%llx\n", queue->gdma_region); + + return 0; +free_umem: + ib_umem_release(umem); + return err; +} + +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue) +{ + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_dma_region(mdev, queue->gdma_region); + ib_umem_release(queue->umem); + if (queue->kmem) + mana_gd_destroy_queue(mdev_to_gc(mdev), queue->kmem); +} + static int mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, @@ -317,7 +384,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem unsigned int tail = 0; u64 *page_addr_list; void *request_buf; - int err; + int err = 0; gc = mdev_to_gc(dev); hwc = gc->hwc.driver_data; @@ -342,7 +409,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem create_req->length = umem->length; create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", @@ -412,7 +479,7 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, { unsigned long page_sz; - page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt); + page_sz = ib_umem_find_best_pgsz(umem, dev->adapter_caps.page_size_cap, virt); if (!page_sz) { ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); return -EINVAL; @@ -427,7 +494,7 @@ int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_ume unsigned long page_sz; /* Hardware requires dma region to align to chosen page size */ - page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0); + page_sz = ib_umem_find_best_pgoff(umem, dev->adapter_caps.page_size_cap, 0); if (!page_sz) { ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); return -EINVAL; @@ -470,13 +537,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } @@ -484,11 +551,23 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { - /* - * This version only support RAW_PACKET - * other values need to be filled for other types - */ - immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + struct 
mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + if (mana_ib_is_rnic(dev)) { + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } else { + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + } return 0; } @@ -496,17 +575,34 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { - struct mana_ib_dev *dev = container_of(ibdev, - struct mana_ib_dev, ib_dev); + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct pci_dev *pdev = to_pci_dev(mdev_to_gc(dev)->dev); + memset(props, 0, sizeof(*props)); + props->vendor_id = pdev->vendor; + props->vendor_part_id = dev->gdma_dev->dev_id.type; + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->page_size_cap = dev->adapter_caps.page_size_cap; props->max_qp = dev->adapter_caps.max_qp_count; props->max_qp_wr = dev->adapter_caps.max_qp_wr; + props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN; + props->max_send_sge = dev->adapter_caps.max_send_sge_count; + props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_sge_rd = dev->adapter_caps.max_recv_sge_count; props->max_cq = dev->adapter_caps.max_cq_count; props->max_cqe = dev->adapter_caps.max_qp_wr; props->max_mr = dev->adapter_caps.max_mr_count; - props->max_mr_size = MANA_IB_MAX_MR_SIZE; - props->max_send_sge = dev->adapter_caps.max_send_sge_count; - props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_pd = dev->adapter_caps.max_pd_count; + props->max_qp_rd_atom = dev->adapter_caps.max_inbound_read_limit; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_qp_init_rd_atom = dev->adapter_caps.max_outbound_read_limit; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_ah = INT_MAX; + props->max_pkeys = 1; + props->local_ca_ack_delay = MANA_CA_ACK_DELAY; + if (!mana_ib_is_rnic(dev)) + props->raw_packet_caps = IB_RAW_PACKET_CAP_IP_CSUM; return 0; } @@ -514,7 +610,46 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, int mana_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { - /* This version doesn't return port properties */ + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct net_device *ndev = mana_ib_get_netdev(ibdev, port); + + if (!ndev) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + + if (netif_carrier_ok(ndev) && netif_running(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else { + props->state = IB_PORT_DOWN; + props->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_EDR; + props->pkey_tbl_len = 1; + if (mana_ib_is_rnic(dev)) { + props->gid_tbl_len = 16; + props->port_cap_flags = IB_PORT_CM_SUP; + props->ip_gids = true; + } + + return 0; +} + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + 
if (index != 0) + return -EINVAL; + *pkey = IB_DEFAULT_PKEY_FULL; return 0; } @@ -538,7 +673,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) mana_gd_init_req_hdr(&req.hdr, MANA_IB_GET_ADAPTER_CAP, sizeof(req), sizeof(resp)); - req.hdr.resp.msg_version = GDMA_MESSAGE_V3; + req.hdr.resp.msg_version = GDMA_MESSAGE_V4; req.hdr.dev_id = dev->gdma_dev->dev_id; err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), @@ -567,6 +702,427 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) caps->max_inline_data_size = resp.max_inline_data_size; caps->max_send_sge_count = resp.max_send_sge_count; caps->max_recv_sge_count = resp.max_recv_sge_count; + caps->feature_flags = resp.feature_flags; + + caps->page_size_cap = PAGE_SZ_BM; + if (mdev_to_gc(dev)->pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB) + caps->page_size_cap |= (SZ_4M | SZ_1G | SZ_2G); + + return 0; +} + +int mana_eth_query_adapter_caps(struct mana_ib_dev *dev) +{ + struct mana_ib_adapter_caps *caps = &dev->adapter_caps; + struct gdma_query_max_resources_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, + sizeof(req), sizeof(resp)); + + err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&dev->ib_dev, + "Failed to query adapter caps err %d", err); + return err; + } + + caps->max_qp_count = min_t(u32, resp.max_sq, resp.max_rq); + caps->max_cq_count = resp.max_cq; + caps->max_mr_count = resp.max_mst; + caps->max_pd_count = 0x6000; + caps->max_qp_wr = min_t(u32, + 0x100000 / GDMA_MAX_SQE_SIZE, + 0x100000 / GDMA_MAX_RQE_SIZE); + caps->max_send_sge_count = 30; + caps->max_recv_sge_count = 15; + caps->page_size_cap = PAGE_SZ_BM; + + return 0; +} + +static void +mana_ib_event_handler(void *ctx, struct gdma_queue *q, struct gdma_event *event) +{ + struct mana_ib_dev *mdev = (struct mana_ib_dev *)ctx; + struct mana_ib_qp *qp; + struct ib_event ev; + u32 qpn; + + switch (event->type) { + case GDMA_EQE_RNIC_QP_FATAL: + qpn = event->details[0]; + qp = mana_get_qp_ref(mdev, qpn, false); + if (!qp) + break; + if (qp->ibqp.event_handler) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + mana_put_qp_ref(qp); + break; + default: + break; + } +} + +int mana_ib_create_eqs(struct mana_ib_dev *mdev) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue_spec spec = {}; + int err, i; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = EQ_SIZE; + spec.eq.callback = mana_ib_event_handler; + spec.eq.context = mdev; + spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.msix_index = 0; + + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq); + if (err) + return err; + + mdev->eqs = kcalloc(mdev->ib_dev.num_comp_vectors, sizeof(struct gdma_queue *), + GFP_KERNEL); + if (!mdev->eqs) { + err = -ENOMEM; + goto destroy_fatal_eq; + } + spec.eq.callback = NULL; + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { + spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); + if (err) + goto destroy_eqs; + } + + return 0; + +destroy_eqs: + while (i-- > 0) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + kfree(mdev->eqs); +destroy_fatal_eq: + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + return err; +} + +void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) +{ + struct gdma_context 
*gc = mdev_to_gc(mdev); + int i; + + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + + kfree(mdev->eqs); +} + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_create_adapter_resp resp = {}; + struct mana_rnic_create_adapter_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.req.msg_version = GDMA_MESSAGE_V2; + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.notify_eq_id = mdev->fatal_err_eq->id; + + if (mdev->adapter_caps.feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT) + req.feature_flags |= MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create RNIC adapter err %d", err); + return err; + } + mdev->adapter_handle = resp.adapter; + + return 0; +} + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_destroy_adapter_resp resp = {}; + struct mana_rnic_destroy_adapter_req req = {}; + struct gdma_context *gc; + int err; + + gc = mdev_to_gc(mdev); + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy RNIC adapter err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.op = ADDR_OP_ADD; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.op = ADDR_OP_REMOVE; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? 
SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac) +{ + struct mana_rnic_config_mac_addr_resp resp = {}; + struct mana_rnic_config_mac_addr_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.op = op; + copy_in_reverse(req.mac_addr, mac, ETH_ALEN); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config Mac addr err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_cq_resp resp = {}; + struct mana_rnic_create_cq_req req = {}; + int err; + + if (!mdev->eqs) + return -EINVAL; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.gdma_region = cq->queue.gdma_region; + req.eq_id = mdev->eqs[cq->comp_vector]->id; + req.doorbell_page = doorbell; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create cq err %d", err); + return err; + } + + cq->queue.id = resp.cq_id; + cq->cq_handle = resp.cq_handle; + /* The GDMA region is now owned by the CQ handle */ + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; + + return 0; +} + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_destroy_cq_resp resp = {}; + struct mana_rnic_destroy_cq_req req = {}; + int err; + + if (cq->cq_handle == INVALID_MANA_HANDLE) + return 0; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.cq_handle = cq->cq_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy cq err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_qp_resp resp = {}; + struct mana_rnic_create_qp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->rc_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = 
attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.flags = flags; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp err %d", err); + return err; + } + qp->qp_handle = resp.rc_qp_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) { + qp->rc_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_rc_qp_resp resp = {0}; + struct mana_rnic_destroy_rc_qp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.rc_qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy rc qp err %d", err); + return err; + } + return 0; +} + +int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_udqp_resp resp = {}; + struct mana_rnic_create_udqp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->ud_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.qp_type = type; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create ud qp err %d", err); + return err; + } + qp->qp_handle = resp.qp_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) { + qp->ud_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->ud_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_udqp_resp resp = {0}; + struct mana_rnic_destroy_udqp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy ud qp err %d", err); + return err; + } return 0; } diff --git a/drivers/infiniband/hw/mana/mana_ib.h 
b/drivers/infiniband/hw/mana/mana_ib.h index f83390eebb7d..42bebd6cd4f7 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -11,8 +11,11 @@ #include <rdma/ib_umem.h> #include <rdma/mana-abi.h> #include <rdma/uverbs_ioctl.h> +#include <linux/dmapool.h> #include <net/mana/mana.h> +#include "shadow_queue.h" +#include "counters.h" #define PAGE_SZ_BM \ (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ @@ -21,12 +24,25 @@ /* MANA doesn't have any limit for MR size */ #define MANA_IB_MAX_MR_SIZE U64_MAX +/* Send queue ID mask */ +#define MANA_SENDQ_MASK BIT(31) + /* * The hardware limit of number of MRs is greater than maximum number of MRs * that can possibly represent in 24 bits */ #define MANA_IB_MAX_MR 0xFFFFFFu +/* + * The CA timeout is approx. 260ms (4us * 2^(DELAY)) + */ +#define MANA_CA_ACK_DELAY 16 + +/* + * The buffer used for writing AV + */ +#define MANA_AV_BUFFER_SIZE 64 + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -43,21 +59,35 @@ struct mana_ib_adapter_caps { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; + u64 page_size_cap; +}; + +struct mana_ib_queue { + struct ib_umem *umem; + struct gdma_queue *kmem; + u64 gdma_region; + u64 id; }; struct mana_ib_dev { struct ib_device ib_dev; struct gdma_dev *gdma_dev; + mana_handle_t adapter_handle; + struct gdma_queue *fatal_err_eq; + struct gdma_queue **eqs; + struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; + struct dma_pool *av_pool; + netdevice_tracker dev_tracker; + struct notifier_block nb; }; struct mana_ib_wq { struct ib_wq ibwq; - struct ib_umem *umem; + struct mana_ib_queue queue; int wqe; u32 wq_buf_size; - u64 gdma_region; - u64 id; mana_handle_t rx_object; }; @@ -74,6 +104,25 @@ struct mana_ib_pd { u32 tx_vp_offset; }; +struct mana_ib_av { + u8 dest_ip[16]; + u8 dest_mac[ETH_ALEN]; + u16 udp_src_port; + u8 src_ip[16]; + u32 hop_limit : 8; + u32 reserved1 : 12; + u32 dscp : 6; + u32 reserved2 : 5; + u32 is_ipv6 : 1; + u32 reserved3 : 32; +}; + +struct mana_ib_ah { + struct ib_ah ibah; + struct mana_ib_av *av; + dma_addr_t dma_handle; +}; + struct mana_ib_mr { struct ib_mr ibmr; struct ib_umem *umem; @@ -82,25 +131,60 @@ struct mana_ib_mr { struct mana_ib_cq { struct ib_cq ibcq; - struct ib_umem *umem; + struct mana_ib_queue queue; + /* protects CQ polling */ + spinlock_t cq_lock; + struct list_head list_send_qp; + struct list_head list_recv_qp; int cqe; - u64 gdma_region; - u64 id; u32 comp_vector; + mana_handle_t cq_handle; +}; + +enum mana_rc_queue_type { + MANA_RC_SEND_QUEUE_REQUESTER = 0, + MANA_RC_SEND_QUEUE_RESPONDER, + MANA_RC_SEND_QUEUE_FMR, + MANA_RC_RECV_QUEUE_REQUESTER, + MANA_RC_RECV_QUEUE_RESPONDER, + MANA_RC_QUEUE_TYPE_MAX, +}; + +struct mana_ib_rc_qp { + struct mana_ib_queue queues[MANA_RC_QUEUE_TYPE_MAX]; +}; + +enum mana_ud_queue_type { + MANA_UD_SEND_QUEUE = 0, + MANA_UD_RECV_QUEUE, + MANA_UD_QUEUE_TYPE_MAX, +}; + +struct mana_ib_ud_qp { + struct mana_ib_queue queues[MANA_UD_QUEUE_TYPE_MAX]; + u32 sq_psn; }; struct mana_ib_qp { struct ib_qp ibqp; - /* Work queue info */ - struct ib_umem *sq_umem; - int sqe; - u64 sq_gdma_region; - u64 sq_id; - mana_handle_t tx_object; + mana_handle_t qp_handle; + union { + struct mana_ib_queue raw_sq; + struct mana_ib_rc_qp rc_qp; + struct mana_ib_ud_qp ud_qp; + }; /* The port on the IB device, starting with 1 */ u32 port; + + struct list_head cq_send_list; + struct list_head cq_recv_list; + struct shadow_queue shadow_rq; + struct 
shadow_queue shadow_sq; + + refcount_t refcount; + struct completion free; }; struct mana_ib_ucontext { @@ -114,12 +198,28 @@ struct mana_ib_rwq_ind_table { enum mana_ib_command_code { MANA_IB_GET_ADAPTER_CAP = 0x30001, + MANA_IB_CREATE_ADAPTER = 0x30002, + MANA_IB_DESTROY_ADAPTER = 0x30003, + MANA_IB_CONFIG_IP_ADDR = 0x30004, + MANA_IB_CONFIG_MAC_ADDR = 0x30005, + MANA_IB_CREATE_UD_QP = 0x30006, + MANA_IB_DESTROY_UD_QP = 0x30007, + MANA_IB_CREATE_CQ = 0x30008, + MANA_IB_DESTROY_CQ = 0x30009, + MANA_IB_CREATE_RC_QP = 0x3000a, + MANA_IB_DESTROY_RC_QP = 0x3000b, + MANA_IB_SET_QP_STATE = 0x3000d, + MANA_IB_QUERY_VF_COUNTERS = 0x30022, }; struct mana_ib_query_adapter_caps_req { struct gdma_req_hdr hdr; }; /*HW Data */ +enum mana_ib_adapter_features { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT = BIT(4), +}; + struct mana_ib_query_adapter_caps_resp { struct gdma_resp_hdr hdr; u32 max_sq_id; @@ -140,6 +240,280 @@ struct mana_ib_query_adapter_caps_resp { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; +}; /* HW Data */ + +enum mana_ib_adapter_features_request { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST = BIT(1), +}; /*HW Data */ + +struct mana_rnic_create_adapter_req { + struct gdma_req_hdr hdr; + u32 notify_eq_id; + u32 reserved; + u64 feature_flags; +}; /*HW Data */ + +struct mana_rnic_create_adapter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_destroy_adapter_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /*HW Data */ + +struct mana_rnic_destroy_adapter_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +enum mana_ib_addr_op { + ADDR_OP_ADD = 1, + ADDR_OP_REMOVE = 2, +}; + +enum sgid_entry_type { + SGID_TYPE_IPV4 = 1, + SGID_TYPE_IPV6 = 2, +}; + +struct mana_rnic_config_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + enum sgid_entry_type sgid_type; + u8 ip_addr[16]; +}; /* HW Data */ + +struct mana_rnic_config_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_rnic_config_mac_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + u8 mac_addr[ETH_ALEN]; + u8 reserved[6]; +}; /* HW Data */ + +struct mana_rnic_config_mac_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_rnic_create_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + u64 gdma_region; + u32 eq_id; + u32 doorbell_page; +}; /* HW Data */ + +struct mana_rnic_create_cq_resp { + struct gdma_resp_hdr hdr; + mana_handle_t cq_handle; + u32 cq_id; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_destroy_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t cq_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_cq_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +enum mana_rnic_create_rc_flags { + MANA_RC_FLAG_NO_FMR = 2, +}; + +struct mana_rnic_create_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_RC_QUEUE_TYPE_MAX]; + u64 deprecated[2]; + u64 flags; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_create_qp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t rc_qp_handle; + u32 queue_ids[MANA_RC_QUEUE_TYPE_MAX]; + u32 reserved; +}; /* HW Data*/ + +struct mana_rnic_destroy_rc_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t 
adapter; + mana_handle_t rc_qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_rc_qp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_rnic_create_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_UD_QUEUE_TYPE_MAX]; + u32 qp_type; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; +}; /* HW Data */ + +struct mana_rnic_create_udqp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t qp_handle; + u32 queue_ids[MANA_UD_QUEUE_TYPE_MAX]; +}; /* HW Data*/ + +struct mana_rnic_destroy_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_udqp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_ib_ah_attr { + u8 src_addr[16]; + u8 dest_addr[16]; + u8 src_mac[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 src_addr_type; + u8 dest_addr_type; + u8 hop_limit; + u8 traffic_class; + u16 src_port; + u16 dest_port; + u32 reserved; +}; + +struct mana_rnic_set_qp_state_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; + u64 attr_mask; + u32 qp_state; + u32 path_mtu; + u32 rq_psn; + u32 sq_psn; + u32 dest_qpn; + u32 max_dest_rd_atomic; + u32 retry_cnt; + u32 rnr_retry; + u32 min_rnr_timer; + u32 reserved; + struct mana_ib_ah_attr ah_attr; +}; /* HW Data */ + +struct mana_rnic_set_qp_state_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +enum WQE_OPCODE_TYPES { + WQE_TYPE_UD_SEND = 0, + WQE_TYPE_UD_RECV = 8, +}; /* HW DATA */ + +struct rdma_send_oob { + u32 wqe_type : 5; + u32 fence : 1; + u32 signaled : 1; + u32 solicited : 1; + u32 psn : 24; + + u32 ssn_or_rqpn : 24; + u32 reserved1 : 8; + union { + struct { + u32 remote_qkey; + u32 immediate; + u32 reserved1; + u32 reserved2; + } ud_send; + }; +}; /* HW DATA */ + +struct mana_rdma_cqe { + union { + struct { + u8 cqe_type; + u8 data[GDMA_COMP_DATA_SIZE - 1]; + }; + struct { + u32 cqe_type : 8; + u32 vendor_error : 9; + u32 reserved1 : 15; + u32 sge_offset : 5; + u32 tx_wqe_offset : 27; + } ud_send; + struct { + u32 cqe_type : 8; + u32 reserved1 : 24; + u32 msg_len; + u32 src_qpn : 24; + u32 reserved2 : 8; + u32 imm_data; + u32 rx_wqe_offset; + } ud_recv; + }; +}; /* HW DATA */ + +struct mana_rnic_query_vf_cntrs_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_query_vf_cntrs_resp { + struct gdma_resp_hdr hdr; + u64 requester_timeout; + u64 requester_oos_nak; + u64 requester_rnr_nak; + u64 responder_rnr_nak; + u64 responder_oos; + u64 responder_dup_request; + u64 requester_implicit_nak; + u64 requester_readresp_psn_mismatch; + u64 nak_inv_req; + u64 nak_access_err; + u64 nak_opp_err; + u64 nak_inv_read; + u64 responder_local_len_err; + u64 requestor_local_prot_err; + u64 responder_rem_access_err; + u64 responder_local_qp_err; + u64 responder_malformed_wqe; + u64 general_hw_err; + u64 requester_rnr_nak_retries_exceeded; + u64 requester_retries_exceeded; + u64 total_fatal_err; + u64 received_cnps; + u64 num_qps_congested; + u64 rate_inc_events; + u64 num_qps_recovered; + u64 current_rate; }; /* HW Data */ static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) @@ -147,6 +521,34 @@ static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) return mdev->gdma_dev->gdma_context; } +static inline struct mana_ib_qp *mana_get_qp_ref(struct mana_ib_dev *mdev, + u32 qid, 
bool is_sq) +{ + struct mana_ib_qp *qp; + unsigned long flag; + + if (is_sq) + qid |= MANA_SENDQ_MASK; + + xa_lock_irqsave(&mdev->qp_table_wq, flag); + qp = xa_load(&mdev->qp_table_wq, qid); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&mdev->qp_table_wq, flag); + return qp; +} + +static inline void mana_put_qp_ref(struct mana_ib_qp *qp) +{ + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + +static inline bool mana_ib_is_rnic(struct mana_ib_dev *mdev) +{ + return mdev->gdma_dev->dev_id.type == GDMA_DEVICE_MANA_IB; +} + static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port) { struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); @@ -158,7 +560,16 @@ static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 return mc->ports[port - 1]; } +static inline void copy_in_reverse(u8 *dst, const u8 *src, u32 size) +{ + u32 i; + + for (i = 0; i < size; i++) + dst[size - 1 - i] = src[i]; +} + int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, mana_handle_t *gdma_region); @@ -169,6 +580,12 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, mana_handle_t gdma_region); +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue); +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue); +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue); + struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); @@ -206,7 +623,7 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port); int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); @@ -231,4 +648,51 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev); +int mana_eth_query_adapter_caps(struct mana_ib_dev *mdev); + +int mana_ib_create_eqs(struct mana_ib_dev *mdev); + +void mana_ib_destroy_eqs(struct mana_ib_dev *mdev); + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num); + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac); + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell); + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags); +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); + +int 
mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type); +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int mana_ib_destroy_ah(struct ib_ah *ah, u32 flags); + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); + +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int mr_access_flags, + struct uverbs_attr_bundle *attrs); #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index b70b13484f09..6d974d0a8400 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -5,8 +5,10 @@ #include "mana_ib.h" -#define VALID_MR_FLAGS \ - (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) +#define VALID_MR_FLAGS (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ |\ + IB_ACCESS_REMOTE_ATOMIC | IB_ZERO_BASED) + +#define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE) static enum gdma_mr_access_flags mana_ib_verbs_to_gdma_access_flags(int access_flags) @@ -22,6 +24,9 @@ mana_ib_verbs_to_gdma_access_flags(int access_flags) if (access_flags & IB_ACCESS_REMOTE_READ) flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + flags |= GDMA_ACCESS_FLAG_REMOTE_ATOMIC; + return flags; } @@ -39,12 +44,17 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.mr_type = mr_params->mr_type; switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GPA: + break; case GDMA_MR_TYPE_GVA: req.gva.dma_region_handle = mr_params->gva.dma_region_handle; req.gva.virtual_address = mr_params->gva.virtual_address; req.gva.access_flags = mr_params->gva.access_flags; break; - + case GDMA_MR_TYPE_ZBVA: + req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle; + req.zbva.access_flags = mr_params->zbva.access_flags; + break; default: ibdev_dbg(&dev->ib_dev, "invalid param (GDMA_MR_TYPE) passed, type %d\n", @@ -112,6 +122,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", start, iova, length, access_flags); + access_flags &= ~IB_ACCESS_OPTIONAL; if (access_flags & ~VALID_MR_FLAGS) return ERR_PTR(-EINVAL); @@ -135,10 +146,86 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, } ibdev_dbg(ibdev, - "create_dma_region ret %d gdma_region %llx\n", err, + "created dma region for user-mr 0x%llx\n", dma_region_handle); mr_params.pd_handle = pd->pd_handle; + if (access_flags & IB_ZERO_BASED) { + mr_params.mr_type = GDMA_MR_TYPE_ZBVA; + mr_params.zbva.dma_region_handle = dma_region_handle; + mr_params.zbva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + } else { + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + } + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * 
There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. + */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(mdev_to_gc(dev), dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct ib_umem_dmabuf *umem_dmabuf; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + u64 dma_region_handle; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + access_flags &= ~IB_ACCESS_OPTIONAL; + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, start, length, fd, access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(ibdev, "Failed to get dmabuf umem, %d\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + + err = mana_ib_create_dma_region(dev, mr->umem, &dma_region_handle, iova); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + mr_params.pd_handle = pd->pd_handle; mr_params.mr_type = GDMA_MR_TYPE_GVA; mr_params.gva.dma_region_handle = dma_region_handle; mr_params.gva.virtual_address = iova; @@ -168,6 +255,38 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (access_flags & ~VALID_DMA_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GPA; + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_free; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 6e7627745c95..14fd7d6c54a2 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -15,15 +15,13 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, struct mana_port_context *mpc = netdev_priv(ndev); struct mana_cfg_rx_steer_req_v2 *req; struct mana_cfg_rx_steer_resp resp = {}; - mana_handle_t *req_indir_tab; struct gdma_context *gc; u32 req_buf_size; int i, err; gc = mdev_to_gc(dev); - req_buf_size = - sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req_buf_size = struct_size(req, indir_tab, MANA_INDIRECT_TABLE_DEF_SIZE); req = kzalloc(req_buf_size, GFP_KERNEL); if (!req) return -ENOMEM; @@ -43,21 +41,21 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, if (log_ind_tbl_size) req->rss_enable = true; - req->num_indir_entries = 
MANA_INDIRECT_TABLE_SIZE; - req->indir_tab_offset = sizeof(*req); + req->num_indir_entries = MANA_INDIRECT_TABLE_DEF_SIZE; + req->indir_tab_offset = offsetof(struct mana_cfg_rx_steer_req_v2, + indir_tab); req->update_indir_tab = true; req->cqe_coalescing_enable = 1; - req_indir_tab = (mana_handle_t *)(req + 1); /* The ind table passed to the hardware must have - * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * MANA_INDIRECT_TABLE_DEF_SIZE entries. Adjust the verb * ind_table to MANA_INDIRECT_TABLE_SIZE if required */ ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); - for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { - req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + for (i = 0; i < MANA_INDIRECT_TABLE_DEF_SIZE; i++) { + req->indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, - req_indir_tab[i]); + req->indir_tab[i]); } req->update_hashkey = true; @@ -97,11 +95,9 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); struct mana_ib_dev *mdev = container_of(pd->device, struct mana_ib_dev, ib_dev); - struct gdma_context *gc = mdev_to_gc(mdev); struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; struct mana_ib_create_qp_rss_resp resp = {}; struct mana_ib_create_qp_rss ucmd = {}; - struct gdma_queue **gdma_cq_allocated; mana_handle_t *mana_ind_table; struct mana_port_context *mpc; unsigned int ind_tbl_size; @@ -141,7 +137,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, } ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; - if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + if (ind_tbl_size > MANA_INDIRECT_TABLE_DEF_SIZE) { ibdev_dbg(&mdev->ib_dev, "Indirect table size %d exceeding limit\n", ind_tbl_size); @@ -175,13 +171,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - gdma_cq_allocated = kcalloc(ind_tbl_size, sizeof(*gdma_cq_allocated), - GFP_KERNEL); - if (!gdma_cq_allocated) { - ret = -ENOMEM; - goto fail; - } - qp->port = port; for (i = 0; i < ind_tbl_size; i++) { @@ -194,13 +183,13 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ibcq = ibwq->cq; cq = container_of(ibcq, struct mana_ib_cq, ibcq); - wq_spec.gdma_region = wq->gdma_region; + wq_spec.gdma_region = wq->queue.gdma_region; wq_spec.queue_size = wq->wq_buf_size; - cq_spec.gdma_region = cq->gdma_region; + cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq = &mpc->ac->eqs[cq->comp_vector % gc->max_num_queues]; + eq = &mpc->ac->eqs[cq->comp_vector]; cq_spec.attached_eq = eq->eq->id; ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, @@ -212,18 +201,18 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, } /* The GDMA regions are now owned by the WQ object */ - wq->gdma_region = GDMA_INVALID_DMA_REGION; - cq->gdma_region = GDMA_INVALID_DMA_REGION; + wq->queue.gdma_region = GDMA_INVALID_DMA_REGION; + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - wq->id = wq_spec.queue_index; - cq->id = cq_spec.queue_index; + wq->queue.id = wq_spec.queue_index; + cq->queue.id = cq_spec.queue_index; ibdev_dbg(&mdev->ib_dev, - "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", - ret, wq->rx_object, wq->id, cq->id); + "rx_object 0x%llx wq id %llu cq id %llu\n", + wq->rx_object, wq->queue.id, cq->queue.id); - resp.entries[i].cqid = cq->id; - resp.entries[i].wqid = wq->id; + 
resp.entries[i].cqid = cq->queue.id; + resp.entries[i].wqid = wq->queue.id; mana_ind_table[i] = wq->rx_object; @@ -231,8 +220,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ret = mana_ib_install_cq_cb(mdev, cq); if (ret) goto fail; - - gdma_cq_allocated[i] = gc->cq_table[cq->id]; } resp.num_entries = i; @@ -252,7 +239,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return 0; @@ -264,13 +250,10 @@ fail: wq = container_of(ibwq, struct mana_ib_wq, ibwq); cq = container_of(ibcq, struct mana_ib_cq, ibcq); - gc->cq_table[cq->id] = NULL; - kfree(gdma_cq_allocated[i]); - + mana_ib_remove_cq_cb(mdev, cq); mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return ret; @@ -289,15 +272,12 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, struct mana_ib_ucontext *mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); - struct gdma_context *gc = mdev_to_gc(mdev); struct mana_ib_create_qp_resp resp = {}; struct mana_ib_create_qp ucmd = {}; - struct gdma_queue *gdma_cq = NULL; struct mana_obj_spec wq_spec = {}; struct mana_obj_spec cq_spec = {}; struct mana_port_context *mpc; struct net_device *ndev; - struct ib_umem *umem; struct mana_eq *eq; int eq_vec; u32 port; @@ -346,56 +326,39 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", ucmd.sq_buf_addr, ucmd.port); - umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); - ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create qp-raw, err %d\n", - err); - goto err_free_vport; - } - qp->sq_umem = umem; - - err = mana_ib_create_zero_offset_dma_region(mdev, qp->sq_umem, - &qp->sq_gdma_region); + err = mana_ib_create_queue(mdev, ucmd.sq_buf_addr, ucmd.sq_buf_size, &qp->raw_sq); if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create qp-raw, %d\n", - err); - goto err_release_umem; + "Failed to create queue for create qp-raw, err %d\n", err); + goto err_free_vport; } - ibdev_dbg(&mdev->ib_dev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, qp->sq_gdma_region); - /* Create a WQ on the same port handle used by the Ethernet */ - wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.gdma_region = qp->raw_sq.gdma_region; wq_spec.queue_size = ucmd.sq_buf_size; - cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.gdma_region = send_cq->queue.gdma_region; cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq_vec = send_cq->comp_vector % gc->max_num_queues; + eq_vec = send_cq->comp_vector; eq = &mpc->ac->eqs[eq_vec]; cq_spec.attached_eq = eq->eq->id; err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, - &cq_spec, &qp->tx_object); + &cq_spec, &qp->qp_handle); if (err) { ibdev_dbg(&mdev->ib_dev, "Failed to create wq for create raw-qp, err %d\n", err); - goto err_destroy_dma_region; + goto err_destroy_queue; } /* The GDMA regions are now owned by the WQ object */ - qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; - send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + qp->raw_sq.gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - qp->sq_id = wq_spec.queue_index; - send_cq->id = cq_spec.queue_index; + qp->raw_sq.id = wq_spec.queue_index; + send_cq->queue.id = 
cq_spec.queue_index; /* Create CQ table entry */ err = mana_ib_install_cq_cb(mdev, send_cq); @@ -403,11 +366,11 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, goto err_destroy_wq_obj; ibdev_dbg(&mdev->ib_dev, - "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, - qp->tx_object, qp->sq_id, send_cq->id); + "qp->qp_handle 0x%llx sq id %llu cq id %llu\n", + qp->qp_handle, qp->raw_sq.id, send_cq->queue.id); - resp.sqid = qp->sq_id; - resp.cqid = send_cq->id; + resp.sqid = qp->raw_sq.id; + resp.cqid = send_cq->queue.id; resp.tx_vp_offset = pd->tx_vp_offset; err = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -415,23 +378,19 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "Failed copy udata for create qp-raw, %d\n", err); - goto err_release_gdma_cq; + goto err_remove_cq_cb; } return 0; -err_release_gdma_cq: - kfree(gdma_cq); - gc->cq_table[send_cq->id] = NULL; +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, send_cq); err_destroy_wq_obj: - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); - -err_destroy_dma_region: - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); -err_release_umem: - ib_umem_release(umem); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &qp->raw_sq); err_free_vport: mana_ib_uncfg_vport(mdev, pd, port); @@ -439,6 +398,306 @@ err_free_vport: return err; } +static u32 mana_ib_wqe_size(u32 sge, u32 oob_size) +{ + u32 wqe_size = sge * sizeof(struct gdma_sge) + sizeof(struct gdma_wqe) + oob_size; + + return ALIGN(wqe_size, GDMA_WQE_BU_SIZE); +} + +static u32 mana_ib_queue_size(struct ib_qp_init_attr *attr, u32 queue_type) +{ + u32 queue_size; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + queue_size = attr->cap.max_send_wr * + mana_ib_wqe_size(attr->cap.max_send_sge, INLINE_OOB_LARGE_SIZE); + else + queue_size = attr->cap.max_recv_wr * + mana_ib_wqe_size(attr->cap.max_recv_sge, INLINE_OOB_SMALL_SIZE); + break; + default: + return 0; + } + + return MANA_PAGE_ALIGN(roundup_pow_of_two(queue_size)); +} + +static enum gdma_queue_type mana_ib_queue_type(struct ib_qp_init_attr *attr, u32 queue_type) +{ + enum gdma_queue_type type; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + type = GDMA_SQ; + else + type = GDMA_RQ; + break; + default: + type = GDMA_INVALID_QUEUE; + } + return type; +} + +static int mana_table_store_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, + GFP_KERNEL); +} + +static void mana_table_remove_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); +} + +static int mana_table_store_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + int err; + + err = xa_insert_irq(&mdev->qp_table_wq, qids, qp, GFP_KERNEL); + if (err) + return err; + + err = xa_insert_irq(&mdev->qp_table_wq, qidr, qp, GFP_KERNEL); + if (err) + goto remove_sq; + + return 0; + +remove_sq: + xa_erase_irq(&mdev->qp_table_wq, qids); + return err; +} + +static void mana_table_remove_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + + 
xa_erase_irq(&mdev->qp_table_wq, qids); + xa_erase_irq(&mdev->qp_table_wq, qidr); +} + +static int mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + refcount_set(&qp->refcount, 1); + init_completion(&qp->free); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + return mana_table_store_rc_qp(mdev, qp); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_table_store_ud_qp(mdev, qp); + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for storing in mana table, %d\n", + qp->ibqp.qp_type); + } + + return -EINVAL; +} + +static void mana_table_remove_qp(struct mana_ib_dev *mdev, + struct mana_ib_qp *qp) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + mana_table_remove_rc_qp(mdev, qp); + break; + case IB_QPT_UD: + case IB_QPT_GSI: + mana_table_remove_ud_qp(mdev, qp); + break; + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for removing from mana table, %d\n", + qp->ibqp.qp_type); + return; + } + mana_put_qp_ref(qp); + wait_for_completion(&qp->free); +} + +static int mana_ib_create_rc_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_create_rc_qp_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; + struct mana_ib_create_rc_qp ucmd = {}; + int i, err, j; + u64 flags = 0; + u32 doorbell; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); + doorbell = mana_ucontext->doorbell; + flags = MANA_RC_FLAG_NO_FMR; + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy from udata, %d\n", err); + return err; + } + + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + /* skip FMR for user-level RC QPs */ + if (i == MANA_RC_SEND_QUEUE_FMR) { + qp->rc_qp.queues[i].id = INVALID_QUEUE_ID; + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + continue; + } + err = mana_ib_create_queue(mdev, ucmd.queue_buf[j], ucmd.queue_size[j], + &qp->rc_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", i, err); + goto destroy_queues; + } + j++; + } + + err = mana_ib_gd_create_rc_qp(mdev, qp, attr, doorbell, flags); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp %d\n", err); + goto destroy_queues; + } + qp->ibqp.qp_num = qp->rc_qp.queues[MANA_RC_RECV_QUEUE_RESPONDER].id; + qp->port = attr->port_num; + + if (udata) { + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + if (i == MANA_RC_SEND_QUEUE_FMR) + continue; + resp.queue_id[j] = qp->rc_qp.queues[i].id; + j++; + } + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto destroy_qp; + } + } + + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + + return 0; + +destroy_qp: + mana_ib_gd_destroy_rc_qp(mdev, qp); +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); + return err; +} + +static void mana_add_qp_to_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + 
list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + +static void mana_remove_qp_from_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + list_del(&qp->cq_send_list); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_del(&qp->cq_recv_list); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + +static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + u32 doorbell, queue_size; + int i, err; + + if (udata) { + ibdev_dbg(&mdev->ib_dev, "User-level UD QPs are not supported\n"); + return -EOPNOTSUPP; + } + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) { + queue_size = mana_ib_queue_size(attr, i); + err = mana_ib_create_kernel_queue(mdev, queue_size, mana_ib_queue_type(attr, i), + &qp->ud_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", + i, err); + goto destroy_queues; + } + } + doorbell = mdev->gdma_dev->doorbell; + + err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr, + sizeof(struct ud_rq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow rq err %d\n", err); + goto destroy_queues; + } + err = create_shadow_queue(&qp->shadow_sq, attr->cap.max_send_wr, + sizeof(struct ud_sq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow sq err %d\n", err); + goto destroy_shadow_queues; + } + + err = mana_ib_gd_create_ud_qp(mdev, qp, attr, doorbell, attr->qp_type); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create ud qp %d\n", err); + goto destroy_shadow_queues; + } + qp->ibqp.qp_num = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + qp->port = attr->port_num; + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + qp->ud_qp.queues[i].kmem->id = qp->ud_qp.queues[i].id; + + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + + mana_add_qp_to_cqs(qp); + + return 0; + +destroy_qp: + mana_ib_gd_destroy_ud_qp(mdev, qp); +destroy_shadow_queues: + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + return err; +} + int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -450,8 +709,12 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, udata); return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + case IB_QPT_RC: + return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_create_ud_qp(ibqp, ibqp->pd, attr, udata); default: - /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", attr->qp_type); } @@ -459,11 +722,81 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, return -EINVAL; } +static int mana_ib_gd_modify_qp(struct 
ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibqp->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_rnic_set_qp_state_resp resp = {}; + struct mana_rnic_set_qp_state_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_port_context *mpc; + struct net_device *ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + req.qp_state = attr->qp_state; + req.attr_mask = attr_mask; + req.path_mtu = attr->path_mtu; + req.rq_psn = attr->rq_psn; + req.sq_psn = attr->sq_psn; + req.dest_qpn = attr->dest_qp_num; + req.max_dest_rd_atomic = attr->max_dest_rd_atomic; + req.retry_cnt = attr->retry_cnt; + req.rnr_retry = attr->rnr_retry; + req.min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IB_QP_AV) { + ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + ibqp->port, ibqp->qp_num); + return -EINVAL; + } + mpc = netdev_priv(ndev); + copy_in_reverse(req.ah_attr.src_mac, mpc->mac_addr, ETH_ALEN); + copy_in_reverse(req.ah_attr.dest_mac, attr->ah_attr.roce.dmac, ETH_ALEN); + copy_in_reverse(req.ah_attr.src_addr, attr->ah_attr.grh.sgid_attr->gid.raw, + sizeof(union ib_gid)); + copy_in_reverse(req.ah_attr.dest_addr, attr->ah_attr.grh.dgid.raw, + sizeof(union ib_gid)); + if (rdma_gid_attr_network_type(attr->ah_attr.grh.sgid_attr) == RDMA_NETWORK_IPV4) { + req.ah_attr.src_addr_type = SGID_TYPE_IPV4; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV4; + } else { + req.ah_attr.src_addr_type = SGID_TYPE_IPV6; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV6; + } + req.ah_attr.dest_port = ROCE_V2_UDP_DPORT; + req.ah_attr.src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + ibqp->qp_num, attr->dest_qp_num); + req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class; + req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed modify qp err %d", err); + return err; + } + + return 0; +} + int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - /* modify_qp is not supported by this version of the driver */ - return -EOPNOTSUPP; + switch (ibqp->qp_type) { + case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_gd_modify_qp(ibqp, attr, attr_mask, udata); + default: + ibdev_dbg(ibqp->device, "Modify QP type %u not supported", ibqp->qp_type); + return -EOPNOTSUPP; + } } static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, @@ -505,18 +838,55 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) mpc = netdev_priv(ndev); pd = container_of(ibpd, struct mana_ib_pd, ibpd); - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); - if (qp->sq_umem) { - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); - ib_umem_release(qp->sq_umem); - } + mana_ib_destroy_queue(mdev, &qp->raw_sq); mana_ib_uncfg_vport(mdev, pd, qp->port); return 0; } +static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + mana_table_remove_qp(mdev, qp); + 
+ /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_rc_qp(mdev, qp); + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); + + return 0; +} + +static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + mana_remove_qp_from_cqs(qp); + mana_table_remove_qp(mdev, qp); + + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_ud_qp(mdev, qp); + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + + return 0; +} + int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); @@ -528,7 +898,11 @@ int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) udata); return mana_ib_destroy_qp_raw(qp, udata); - + case IB_QPT_RC: + return mana_ib_destroy_rc_qp(qp, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_destroy_ud_qp(qp, udata); default: ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", ibqp->qp_type); diff --git a/drivers/infiniband/hw/mana/shadow_queue.h b/drivers/infiniband/hw/mana/shadow_queue.h new file mode 100644 index 000000000000..a4b3818f9c39 --- /dev/null +++ b/drivers/infiniband/hw/mana/shadow_queue.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#ifndef _MANA_SHADOW_QUEUE_H_ +#define _MANA_SHADOW_QUEUE_H_ + +struct shadow_wqe_header { + u16 opcode; + u16 error_code; + u32 posted_wqe_size; + u64 wr_id; +}; + +struct ud_rq_shadow_wqe { + struct shadow_wqe_header header; + u32 byte_len; + u32 src_qpn; +}; + +struct ud_sq_shadow_wqe { + struct shadow_wqe_header header; +}; + +struct shadow_queue { + /* Unmasked producer index, Incremented on wqe posting */ + u64 prod_idx; + /* Unmasked consumer index, Incremented on cq polling */ + u64 cons_idx; + /* Unmasked index of next-to-complete (from HW) shadow WQE */ + u64 next_to_complete_idx; + /* queue size in wqes */ + u32 length; + /* distance between elements in bytes */ + u32 stride; + /* ring buffer holding wqes */ + void *buffer; +}; + +static inline int create_shadow_queue(struct shadow_queue *queue, uint32_t length, uint32_t stride) +{ + queue->buffer = kvmalloc_array(length, stride, GFP_KERNEL); + if (!queue->buffer) + return -ENOMEM; + + queue->length = length; + queue->stride = stride; + + return 0; +} + +static inline void destroy_shadow_queue(struct shadow_queue *queue) +{ + kvfree(queue->buffer); +} + +static inline bool shadow_queue_full(struct shadow_queue *queue) +{ + return (queue->prod_idx - queue->cons_idx) >= queue->length; +} + +static inline bool shadow_queue_empty(struct shadow_queue *queue) +{ + return queue->prod_idx == queue->cons_idx; +} + +static inline void * +shadow_queue_get_element(const struct shadow_queue *queue, u64 unmasked_index) +{ + u32 index = unmasked_index % queue->length; + + return ((u8 *)queue->buffer + index * queue->stride); +} + +static inline void * +shadow_queue_producer_entry(struct shadow_queue *queue) +{ + return shadow_queue_get_element(queue, queue->prod_idx); +} + +static inline void * +shadow_queue_get_next_to_consume(const struct 
shadow_queue *queue) +{ + if (queue->cons_idx == queue->next_to_complete_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->cons_idx); +} + +static inline void * +shadow_queue_get_next_to_complete(struct shadow_queue *queue) +{ + if (queue->next_to_complete_idx == queue->prod_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->next_to_complete_idx); +} + +static inline void shadow_queue_advance_producer(struct shadow_queue *queue) +{ + queue->prod_idx++; +} + +static inline void shadow_queue_advance_consumer(struct shadow_queue *queue) +{ + queue->cons_idx++; +} + +static inline void shadow_queue_advance_next_to_complete(struct shadow_queue *queue) +{ + queue->next_to_complete_idx++; +} + +#endif diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c index 7c9c69962573..f959f4b9244f 100644 --- a/drivers/infiniband/hw/mana/wq.c +++ b/drivers/infiniband/hw/mana/wq.c @@ -13,7 +13,6 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, container_of(pd->device, struct mana_ib_dev, ib_dev); struct mana_ib_create_wq ucmd = {}; struct mana_ib_wq *wq; - struct ib_umem *umem; int err; if (udata->inlen < sizeof(ucmd)) @@ -32,39 +31,18 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); - umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); + err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue); + if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create wq, err %d\n", err); + "Failed to create queue for create wq, %d\n", err); goto err_free_wq; } - wq->umem = umem; wq->wqe = init_attr->max_wr; wq->wq_buf_size = ucmd.wq_buf_size; wq->rx_object = INVALID_MANA_HANDLE; - - err = mana_ib_create_zero_offset_dma_region(mdev, wq->umem, &wq->gdma_region); - if (err) { - ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create wq, %d\n", - err); - goto err_release_umem; - } - - ibdev_dbg(&mdev->ib_dev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, wq->gdma_region); - - /* WQ ID is returned at wq_create time, doesn't know the value yet */ - return &wq->ibwq; -err_release_umem: - ib_umem_release(umem); - err_free_wq: kfree(wq); @@ -86,8 +64,7 @@ int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); - mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); - ib_umem_release(wq->umem); + mana_ib_destroy_queue(mdev, &wq->queue); kfree(wq); diff --git a/drivers/infiniband/hw/mana/wr.c b/drivers/infiniband/hw/mana/wr.c new file mode 100644 index 000000000000..1813567d3b16 --- /dev/null +++ b/drivers/infiniband/hw/mana/wr.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +#define MAX_WR_SGL_NUM (2) + +static int mana_ib_post_recv_ud(struct mana_ib_qp *qp, const struct ib_recv_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM]; + struct gdma_wqe_request wqe_req = {0}; + struct ud_rq_shadow_wqe *shadow_wqe; + int err, i; + + if (shadow_queue_full(&qp->shadow_rq)) + return -EINVAL; + + if (wr->num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + for (i = 0; i < wr->num_sge; ++i) { + gdma_sgl[i].address = wr->sg_list[i].addr; + gdma_sgl[i].mem_key = wr->sg_list[i].lkey; + gdma_sgl[i].size = wr->sg_list[i].length; + } + wqe_req.num_sge = wr->num_sge; + wqe_req.sgl = gdma_sgl; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_rq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_RECV; + shadow_wqe->header.wr_id = wr->wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_rq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_recv_ud(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting recv wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} + +static int mana_ib_post_send_ud(struct mana_ib_qp *qp, const struct ib_ud_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(wr->ah, struct mana_ib_ah, ibah); + struct net_device *ndev = mana_ib_get_netdev(&mdev->ib_dev, qp->port); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM + 1]; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_wqe_request wqe_req = {0}; + struct rdma_send_oob send_oob = {0}; + struct ud_sq_shadow_wqe *shadow_wqe; + int err, i; + + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + qp->port, qp->ibqp.qp_num); + return -EINVAL; + } + + if (wr->wr.opcode != IB_WR_SEND) + return -EINVAL; + + if (shadow_queue_full(&qp->shadow_sq)) + return -EINVAL; + + if (wr->wr.num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + gdma_sgl[0].address = ah->dma_handle; + gdma_sgl[0].mem_key = qp->ibqp.pd->local_dma_lkey; + gdma_sgl[0].size = sizeof(struct mana_ib_av); + for (i = 0; i < wr->wr.num_sge; ++i) { + gdma_sgl[i + 1].address = wr->wr.sg_list[i].addr; + gdma_sgl[i + 1].mem_key = wr->wr.sg_list[i].lkey; + gdma_sgl[i + 1].size = wr->wr.sg_list[i].length; + } + + wqe_req.num_sge = wr->wr.num_sge + 1; + wqe_req.sgl = gdma_sgl; + wqe_req.inline_oob_size = sizeof(struct rdma_send_oob); + wqe_req.inline_oob_data = &send_oob; + wqe_req.flags = GDMA_WR_OOB_IN_SGL; + wqe_req.client_data_unit = ib_mtu_enum_to_int(ib_mtu_int_to_enum(ndev->mtu)); + + send_oob.wqe_type = WQE_TYPE_UD_SEND; + send_oob.fence = !!(wr->wr.send_flags & IB_SEND_FENCE); + send_oob.signaled = !!(wr->wr.send_flags & 
IB_SEND_SIGNALED); + send_oob.solicited = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + send_oob.psn = qp->ud_qp.sq_psn; + send_oob.ssn_or_rqpn = wr->remote_qpn; + send_oob.ud_send.remote_qkey = + qp->ibqp.qp_type == IB_QPT_GSI ? IB_QP1_QKEY : wr->remote_qkey; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + qp->ud_qp.sq_psn++; + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_sq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_SEND; + shadow_wqe->header.wr_id = wr->wr.wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_sq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + int err; + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_send_ud(qp, ud_wr(wr)); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting send wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 111fa88a3be4..d7327735b8d0 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -829,7 +829,6 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { - char alias_wq_name[15]; int ret = 0; int i, j; union ib_gid gid; @@ -875,9 +874,8 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = - alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + alloc_ordered_workqueue("alias_guid%d", WQ_MEM_RECLAIM, i); if (!dev->sriov.alias_guid.ports_guid[i].wq) { ret = -ENOMEM; goto err_thread; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 4cd738aae53c..c592374f4a58 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -150,8 +150,12 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, return PTR_ERR(*umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); if (err) goto err_buf; @@ -172,8 +176,9 @@ err_buf: #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index a37cfac5e23f..e6e132f10625 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -2158,7 +2158,6 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { - char name[12]; int ret = 0; int i; @@ -2194,24 
+2193,21 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, goto err_mcg; } - snprintf(name, sizeof(name), "mlx4_ibt%d", port); - ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wq = alloc_ordered_workqueue("mlx4_ibt%d", WQ_MEM_RECLAIM, port); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } - snprintf(name, sizeof(name), "mlx4_ibwi%d", port); - ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wi_wq = alloc_ordered_workqueue("mlx4_ibwi%d", WQ_MEM_RECLAIM, port); if (!ctx->wi_wq) { pr_err("Failed to create wire WQ for port %d\n", port); ret = -ENOMEM; goto err_wiwq; } - snprintf(name, sizeof(name), "mlx4_ibud%d", port); - ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->ud_wq = alloc_ordered_workqueue("mlx4_ibud%d", WQ_MEM_RECLAIM, port); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 529db874d67c..dd35e03402ab 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -351,7 +351,7 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) struct mlx4_port_gid_table *port_gid_table; int ret = 0; int hw_update = 0; - struct gid_entry *gids; + struct gid_entry *gids = NULL; if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) return -EINVAL; @@ -389,10 +389,10 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) } spin_unlock_bh(&iboe->lock); - if (!ret && hw_update) { + if (gids) ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); - kfree(gids); - } + + kfree(gids); return ret; } @@ -2341,37 +2341,38 @@ static void mlx4_ib_scan_netdev(struct mlx4_ib_dev *ibdev, iboe->netdevs[dev->dev_port] = event != NETDEV_UNREGISTER ? dev : NULL; - if (event == NETDEV_UP || event == NETDEV_DOWN) { - enum ib_port_state port_state; - struct ib_event ibev = { }; + spin_unlock_bh(&iboe->lock); - if (ib_get_cached_port_state(&ibdev->ib_dev, dev->dev_port + 1, - &port_state)) - goto iboe_out; + if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER) + mlx4_ib_update_qps(ibdev, dev, dev->dev_port + 1); +} - if (event == NETDEV_UP && - (port_state != IB_PORT_ACTIVE || - iboe->last_port_state[dev->dev_port] != IB_PORT_DOWN)) - goto iboe_out; - if (event == NETDEV_DOWN && - (port_state != IB_PORT_DOWN || - iboe->last_port_state[dev->dev_port] != IB_PORT_ACTIVE)) - goto iboe_out; - iboe->last_port_state[dev->dev_port] = port_state; +static void mlx4_ib_port_event(struct ib_device *ibdev, struct net_device *ndev, + unsigned long event) +{ + struct mlx4_ib_dev *mlx4_ibdev = + container_of(ibdev, struct mlx4_ib_dev, ib_dev); + struct mlx4_ib_iboe *iboe = &mlx4_ibdev->iboe; - ibev.device = &ibdev->ib_dev; - ibev.element.port_num = dev->dev_port + 1; - ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR; - ib_dispatch_event(&ibev); - } + if (!net_eq(dev_net(ndev), &init_net)) + return; + + ASSERT_RTNL(); + + if (ndev->dev.parent != mlx4_ibdev->ib_dev.dev.parent) + return; + + spin_lock_bh(&iboe->lock); + + iboe->netdevs[ndev->dev_port] = event != NETDEV_UNREGISTER ? 
ndev : NULL; + + if (event == NETDEV_UP || event == NETDEV_DOWN) + ib_dispatch_port_state_event(&mlx4_ibdev->ib_dev, ndev); -iboe_out: spin_unlock_bh(&iboe->lock); - if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || - event == NETDEV_UP || event == NETDEV_CHANGE) - mlx4_ib_update_qps(ibdev, dev, dev->dev_port + 1); + if (event == NETDEV_UP || event == NETDEV_CHANGE) + mlx4_ib_update_qps(mlx4_ibdev, ndev, ndev->dev_port + 1); } static int mlx4_ib_netdev_event(struct notifier_block *this, @@ -2569,6 +2570,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .req_notify_cq = mlx4_ib_arm_cq, .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, + .report_port_event = mlx4_ib_port_event, INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq), diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 33f525b744f2..e279e69b9a51 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -43,7 +43,7 @@ #define MAX_VFS 80 #define MAX_PEND_REQS_PER_FUNC 4 -#define MAD_TIMEOUT_MS 2000 +#define MAD_TIMEOUT_SEC 2 #define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) #define mcg_error(fmt, arg...) pr_err(fmt, ##arg) @@ -270,7 +270,7 @@ static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, - msecs_to_jiffies(MAD_TIMEOUT_MS)); + secs_to_jiffies(MAD_TIMEOUT_SEC)); } return ret; @@ -309,7 +309,7 @@ static int send_leave_to_wire(struct mcast_group *group, u8 join_state) if (!ret) { /* calls mlx4_ib_mcg_timeout_handler */ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, - msecs_to_jiffies(MAD_TIMEOUT_MS)); + secs_to_jiffies(MAD_TIMEOUT_SEC)); } return ret; @@ -1091,7 +1091,7 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy for (i = 0; i < MAX_VFS; ++i) clean_vf_mcast(ctx, i); - end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + end = jiffies + secs_to_jiffies(MAD_TIMEOUT_SEC + 3); do { count = 0; mutex_lock(&ctx->mcg_table_lock); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 41ca1114a995..f53b1846594c 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -667,6 +667,9 @@ struct mlx4_uverbs_ex_query_device { __u32 reserved; }; +/* 4k - 4G */ +#define MLX4_PAGE_SIZE_SUPPORTED ((unsigned long)GENMASK_ULL(31, 12)) + static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); @@ -767,7 +770,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); @@ -936,8 +939,19 @@ mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table) { return 0; } -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts); +static inline int mlx4_ib_umem_calc_optimal_mtt_size(struct 
ib_umem *umem, + u64 start, + int *num_of_mtts) +{ + unsigned long pg_sz; + + pg_sz = ib_umem_find_best_pgsz(umem, MLX4_PAGE_SIZE_SUPPORTED, start); + if (!pg_sz) + return -EOPNOTSUPP; + + *num_of_mtts = ib_umem_num_dma_blocks(umem, pg_sz); + return order_base_2(pg_sz); +} int mlx4_ib_cm_init(void); void mlx4_ib_cm_destroy(void); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index a40bf58bcdd3..e77645a673fb 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -87,286 +87,20 @@ err_free: return ERR_PTR(err); } -enum { - MLX4_MAX_MTT_SHIFT = 31 -}; - -static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, - struct mlx4_mtt *mtt, - u64 mtt_size, u64 mtt_shift, u64 len, - u64 cur_start_addr, u64 *pages, - int *start_index, int *npages) -{ - u64 cur_end_addr = cur_start_addr + len; - u64 cur_end_addr_aligned = 0; - u64 mtt_entries; - int err = 0; - int k; - - len += (cur_start_addr & (mtt_size - 1ULL)); - cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); - len += (cur_end_addr_aligned - cur_end_addr); - if (len & (mtt_size - 1ULL)) { - pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n", - len, mtt_size); - return -EINVAL; - } - - mtt_entries = (len >> mtt_shift); - - /* - * Align the MTT start address to the mtt_size. - * Required to handle cases when the MR starts in the middle of an MTT - * record. Was not required in old code since the physical addresses - * provided by the dma subsystem were page aligned, which was also the - * MTT size. - */ - cur_start_addr = round_down(cur_start_addr, mtt_size); - /* A new block is started ... */ - for (k = 0; k < mtt_entries; ++k) { - pages[*npages] = cur_start_addr + (mtt_size * k); - (*npages)++; - /* - * Be friendly to mlx4_write_mtt() and pass it chunks of - * appropriate size. - */ - if (*npages == PAGE_SIZE / sizeof(u64)) { - err = mlx4_write_mtt(dev->dev, mtt, *start_index, - *npages, pages); - if (err) - return err; - - (*start_index) += *npages; - *npages = 0; - } - } - - return 0; -} - -static inline u64 alignment_of(u64 ptr) -{ - return ilog2(ptr & (~(ptr - 1))); -} - -static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, - u64 current_block_end, - u64 block_shift) -{ - /* Check whether the alignment of the new block is aligned as well as - * the previous block. - * Block address must start with zeros till size of entity_size. - */ - if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the previous block-reduce the - * mtt size accordingly. Here we take the last right bit which - * is 1. - */ - block_shift = alignment_of(next_block_start); - - /* - * Check whether the alignment of the end of previous block - is it - * aligned as well as the start of the block - */ - if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the start of the block - - * reduce the mtt size accordingly. 
- */ - block_shift = alignment_of(current_block_end); - - return block_shift; -} - int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { - u64 *pages; - u64 len = 0; - int err = 0; - u64 mtt_size; - u64 cur_start_addr = 0; - u64 mtt_shift; - int start_index = 0; - int npages = 0; - struct scatterlist *sg; - int i; - - pages = (u64 *) __get_free_page(GFP_KERNEL); - if (!pages) - return -ENOMEM; - - mtt_shift = mtt->page_shift; - mtt_size = 1ULL << mtt_shift; + struct ib_block_iter biter; + int err, i = 0; + u64 addr; - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - if (cur_start_addr + len == sg_dma_address(sg)) { - /* still the same block */ - len += sg_dma_len(sg); - continue; - } - /* - * A new block is started ... - * If len is malaligned, write an extra mtt entry to cover the - * misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, - pages, &start_index, - &npages); - if (err) - goto out; - - cur_start_addr = sg_dma_address(sg); - len = sg_dma_len(sg); - } - - /* Handle the last block */ - if (len > 0) { - /* - * If len is malaligned, write an extra mtt entry to cover - * the misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, pages, - &start_index, &npages); + rdma_umem_for_each_dma_block(umem, &biter, BIT(mtt->page_shift)) { + addr = rdma_block_iter_dma_address(&biter); + err = mlx4_write_mtt(dev->dev, mtt, i++, 1, &addr); if (err) - goto out; - } - - if (npages) - err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); - -out: - free_page((unsigned long) pages); - return err; -} - -/* - * Calculate optimal mtt size based on contiguous pages. - * Function will return also the number of pages that are not aligned to the - * calculated mtt_size to be added to total number of pages. For that we should - * check the first chunk length & last chunk length and if not aligned to - * mtt_size we should increment the non_aligned_pages number. All chunks in the - * middle already handled as part of mtt shift calculation for both their start - * & end addresses. - */ -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts) -{ - u64 block_shift = MLX4_MAX_MTT_SHIFT; - u64 min_shift = PAGE_SHIFT; - u64 last_block_aligned_end = 0; - u64 current_block_start = 0; - u64 first_block_start = 0; - u64 current_block_len = 0; - u64 last_block_end = 0; - struct scatterlist *sg; - u64 current_block_end; - u64 misalignment_bits; - u64 next_block_start; - u64 total_len = 0; - int i; - - *num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE); - - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* - * Initialization - save the first chunk start as the - * current_block_start - block means contiguous pages. - */ - if (current_block_len == 0 && current_block_start == 0) { - current_block_start = sg_dma_address(sg); - first_block_start = current_block_start; - /* - * Find the bits that are different between the physical - * address and the virtual address for the start of the - * MR. - * umem_get aligned the start_va to a page boundary. - * Therefore, we need to align the start va to the same - * boundary. - * misalignment_bits is needed to handle the case of a - * single memory region. In this case, the rest of the - * logic will not reduce the block size. 
If we use a - * block size which is bigger than the alignment of the - * misalignment bits, we might use the virtual page - * number instead of the physical page number, resulting - * in access to the wrong data. - */ - misalignment_bits = - (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^ - current_block_start; - block_shift = min(alignment_of(misalignment_bits), - block_shift); - } - - /* - * Go over the scatter entries and check if they continue the - * previous scatter entry. - */ - next_block_start = sg_dma_address(sg); - current_block_end = current_block_start + current_block_len; - /* If we have a split (non-contig.) between two blocks */ - if (current_block_end != next_block_start) { - block_shift = mlx4_ib_umem_calc_block_mtt - (next_block_start, - current_block_end, - block_shift); - - /* - * If we reached the minimum shift for 4k page we stop - * the loop. - */ - if (block_shift <= min_shift) - goto end; - - /* - * If not saved yet we are in first block - we save the - * length of first block to calculate the - * non_aligned_pages number at the end. - */ - total_len += current_block_len; - - /* Start a new block */ - current_block_start = next_block_start; - current_block_len = sg_dma_len(sg); - continue; - } - /* The scatter entry is another part of the current block, - * increase the block size. - * An entry in the scatter can be larger than 4k (page) as of - * dma mapping which merge some blocks together. - */ - current_block_len += sg_dma_len(sg); + return err; } - - /* Account for the last block in the total len */ - total_len += current_block_len; - /* Add to the first block the misalignment that it suffers from. */ - total_len += (first_block_start & ((1ULL << block_shift) - 1ULL)); - last_block_end = current_block_start + current_block_len; - last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift); - total_len += (last_block_aligned_end - last_block_end); - - if (total_len & ((1ULL << block_shift) - 1ULL)) - pr_warn("misaligned total length detected (%llu, %llu)!", - total_len, block_shift); - - *num_of_mtts = total_len >> block_shift; -end: - if (block_shift < min_shift) { - /* - * If shift is less than the min we set a warning and return the - * min shift. 
- */ - pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift); - - block_shift = min_shift; - } - return block_shift; + return 0; } static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, @@ -424,6 +158,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); + if (shift < 0) { + err = shift; + goto err_umem; + } err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, convert_access(access_flags), n, shift, &mr->mmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 9d08aa99f3cb..50fd407103c7 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -925,8 +925,12 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; @@ -1108,8 +1112,12 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e..11878ddf7cc7 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,8 +6,10 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ + fs.o \ gsi.o \ ib_virt.o \ mad.o \ @@ -25,7 +27,6 @@ mlx5_ib-y := ah.o \ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ - fs.o \ qos.o \ std_types.o mlx5_ib-$(CONFIG_MLX5_MACSEC) += macsec.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 505bc47fd575..531a57f9ee7e 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -50,11 +50,12 @@ static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, return sport; } -static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, +static int create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_init_attr *init_attr) { struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; + int rate_val; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -67,7 +68,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + rate_val = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)); + if (rate_val < 0) + return rate_val; + ah->av.stat_rate_sl = rate_val << 4; if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) @@ -88,6 +92,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); } + + return 0; } int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, @@ -120,8 +126,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr 
*init_attr, return err; } - create_ib_ah(dev, ah, init_attr); - return 0; + return create_ib_ah(dev, ah, init_attr); } int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 1d0c8d5e745b..7c08e3008927 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -177,7 +177,7 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) return mlx5_cmd_exec_in(dev, dealloc_xrcd, in); } -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port) { int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); @@ -195,12 +195,18 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); MLX5_SET(mad_ifc_in, in, op_mod, opmod); - MLX5_SET(mad_ifc_in, in, port, port); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + MLX5_SET(mad_ifc_in, in, plane_index, port); + MLX5_SET(mad_ifc_in, in, port, + smi_to_native_portnum(dev, port)); + } else { + MLX5_SET(mad_ifc_in, in, port, port); + } data = MLX5_ADDR_OF(mad_ifc_in, in, mad); memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); - err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out); + err = mlx5_cmd_exec_inout(dev->mdev, mad_ifc, in, out); if (err) goto out; @@ -239,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 93a971a40d11..e6c88b6ebd0d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -54,8 +54,10 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 8300ce622835..b847084dcd99 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -83,6 +83,8 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { INIT_Q_COUNTER(resp_remote_access_errors), INIT_Q_COUNTER(resp_cqe_flush_error), 
INIT_Q_COUNTER(req_cqe_flush_error), + INIT_Q_COUNTER(req_transport_retries_exceeded), + INIT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter roce_accl_cnts[] = { @@ -102,6 +104,8 @@ static const struct mlx5_ib_counter vport_extended_err_cnts[] = { INIT_VPORT_Q_COUNTER(resp_remote_access_errors), INIT_VPORT_Q_COUNTER(resp_cqe_flush_error), INIT_VPORT_Q_COUNTER(req_cqe_flush_error), + INIT_VPORT_Q_COUNTER(req_transport_retries_exceeded), + INIT_VPORT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter vport_roce_accl_cnts[] = { @@ -136,6 +140,13 @@ static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = { INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS), }; +static const struct mlx5_ib_counter packets_op_cnts[] = { + INIT_OP_COUNTER(rdma_tx_packets, RDMA_TX_PACKETS), + INIT_OP_COUNTER(rdma_tx_bytes, RDMA_TX_BYTES), + INIT_OP_COUNTER(rdma_rx_packets, RDMA_RX_PACKETS), + INIT_OP_COUNTER(rdma_rx_bytes, RDMA_RX_BYTES), +}; + static int mlx5_ib_read_counters(struct ib_counters *counters, struct ib_counters_read_attr *read_attr, struct uverbs_attr_bundle *attrs) @@ -423,6 +434,52 @@ done: return num_counters; } +static bool is_rdma_bytes_counter(u32 type) +{ + if (type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP) + return true; + + return false; +} + +static int do_per_qp_get_op_stat(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + int i, ret, index, num_hw_counters; + u64 packets = 0, bytes = 0; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + ret = mlx5_fc_query(dev->mdev, mcounter->fc[i], + &packets, &bytes); + if (ret) + return ret; + + num_hw_counters = cnts->num_q_counters + + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + + index = i - MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP + + num_hw_counters; + + if (is_rdma_bytes_counter(i)) + counter->stats->value[index] = bytes; + else + counter->stats->value[index] = packets; + + clear_bit(index, counter->stats->is_disabled); + } + return 0; +} + static int do_get_op_stat(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port_num, int index) @@ -430,7 +487,7 @@ static int do_get_op_stat(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); const struct mlx5_ib_counters *cnts; const struct mlx5_ib_op_fc *opfcs; - u64 packets = 0, bytes; + u64 packets, bytes; u32 type; int ret; @@ -449,8 +506,11 @@ static int do_get_op_stat(struct ib_device *ibdev, if (ret) return ret; + if (is_rdma_bytes_counter(type)) + stats->value[index] = bytes; + else + stats->value[index] = packets; out: - stats->value[index] = packets; return index; } @@ -519,19 +579,30 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) { struct mlx5_ib_dev *dev = to_mdev(counter->device); const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + int ret; + + ret = mlx5_ib_query_q_counters(dev->mdev, cnts, counter->stats, + counter->id); + if (ret) + return ret; + + if (!counter->mode.bind_opcnt) + return 0; - return mlx5_ib_query_q_counters(dev->mdev, cnts, - counter->stats, counter->id); + return do_per_qp_get_op_stat(counter); } static int 
mlx5_ib_counter_dealloc(struct rdma_counter *counter) { + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(counter->device); u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; if (!counter->id) return 0; + WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa)); + mlx5r_fs_destroy_fcs(dev, counter); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); @@ -539,9 +610,10 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) } static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) + struct ib_qp *qp, u32 port) { struct mlx5_ib_dev *dev = to_mdev(qp->device); + bool new = false; int err; if (!counter->id) { @@ -556,24 +628,46 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return err; counter->id = MLX5_GET(alloc_q_counter_out, out, counter_set_id); + new = true; } err = mlx5_ib_qp_set_counter(qp, counter); if (err) goto fail_set_counter; + err = mlx5r_fs_bind_op_fc(qp, counter, port); + if (err) + goto fail_bind_op_fc; + return 0; +fail_bind_op_fc: + mlx5_ib_qp_set_counter(qp, NULL); fail_set_counter: - mlx5_ib_counter_dealloc(counter); - counter->id = 0; + if (new) { + mlx5_ib_counter_dealloc(counter); + counter->id = 0; + } return err; } -static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) { - return mlx5_ib_qp_set_counter(qp, NULL); + struct rdma_counter *counter = qp->counter; + int err; + + mlx5r_fs_unbind_op_fc(qp, counter); + + err = mlx5_ib_qp_set_counter(qp, NULL); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5r_fs_bind_op_fc(qp, counter, port); + return err; } static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, @@ -673,6 +767,12 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, descs[j].priv = &rdmatx_cnp_op_cnts[i].type; } } + + for (i = 0; i < ARRAY_SIZE(packets_op_cnts); i++, j++) { + descs[j].name = packets_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &packets_op_cnts[i].type; + } } @@ -723,6 +823,8 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, num_op_counters = ARRAY_SIZE(basic_op_cnts); + num_op_counters += ARRAY_SIZE(packets_op_cnts); + if (MLX5_CAP_FLOWTABLE(dev->mdev, ft_field_support_2_nic_receive_rdma.bth_opcode)) num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts); @@ -752,10 +854,58 @@ err: return -ENOMEM; } +/* + * Checks if the given flow counter type should be sharing the same flow counter + * with another type and if it should, checks if that other type flow counter + * was already created, if both conditions are met return true and the counter + * else return false. 
+ */ +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + return false; + } + + *opfc = &opfcs[shared_fc_type]; + if (!(*opfc)->fc) + return false; + + return true; +} + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; int num_cnt_ports = dev->num_ports; + struct mlx5_ib_op_fc *in_use_opfc; int i, j; if (is_mdev_switchdev_mode(dev->mdev)) @@ -777,11 +927,15 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) if (!dev->port[i].cnts.opfcs[j].fc) continue; - if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) - mlx5_ib_fs_remove_op_fc(dev, - &dev->port[i].cnts.opfcs[j], j); + if (mlx5r_is_opfc_shared_and_in_use( + dev->port[i].cnts.opfcs, j, &in_use_opfc)) + goto skip; + + mlx5_ib_fs_remove_op_fc(dev, + &dev->port[i].cnts.opfcs[j], j); mlx5_fc_destroy(dev->mdev, dev->port[i].cnts.opfcs[j].fc); +skip: dev->port[i].cnts.opfcs[j].fc = NULL; } } @@ -975,8 +1129,8 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, unsigned int index, bool enable) { struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_op_fc *opfc, *in_use_opfc; struct mlx5_ib_counters *cnts; - struct mlx5_ib_op_fc *opfc; u32 num_hw_counters, type; int ret; @@ -1000,6 +1154,13 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (opfc->fc) return -EEXIST; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + opfc->fc = mlx5_fc_create(dev->mdev, false); if (IS_ERR(opfc->fc)) return PTR_ERR(opfc->fc); @@ -1015,12 +1176,23 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (!opfc->fc) return -EINVAL; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, &in_use_opfc)) + goto out; + mlx5_ib_fs_remove_op_fc(dev, opfc, type); mlx5_fc_destroy(dev->mdev, opfc->fc); +out: opfc->fc = NULL; return 0; } +static void mlx5_ib_counter_init(struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + + xa_init(&mcounter->qpn_opfc_xa); +} + static const struct ib_device_ops hw_stats_ops = { .alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats, .get_hw_stats = mlx5_ib_get_hw_stats, @@ -1029,8 +1201,10 @@ static const struct ib_device_ops hw_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, - .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ? 
- mlx5_ib_modify_stat : NULL, + .modify_hw_stat = mlx5_ib_modify_stat, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops hw_switchdev_vport_op = { @@ -1045,6 +1219,9 @@ static const struct ib_device_ops hw_switchdev_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops counters_ops = { diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index 6bcaaa52e2b2..bd03cee42014 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,10 +8,25 @@ #include "mlx5_ib.h" +struct mlx5_rdma_counter { + struct rdma_counter rdma_counter; + + struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; + struct xarray qpn_opfc_xa; +}; + +static inline struct mlx5_rdma_counter * +to_mcounter(struct rdma_counter *counter) +{ + return container_of(counter, struct mlx5_rdma_counter, rdma_counter); +} + int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_counters_clear_description(struct ib_counters *counters); int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters, struct mlx5_ib_create_flow *ucmd); u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num); +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc); #endif /* _MLX5_IB_COUNTERS_H */ diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9773d2a3d97f..1aa5311b03e9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -38,6 +38,9 @@ #include "srq.h" #include "qp.h" +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; @@ -487,7 +490,7 @@ repoll: } qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; - if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + if (!*cur_qp || (qpn != (*cur_qp)->trans_qp.base.mqp.qpn)) { /* We do not have to take the QP table lock here, * because CQs will be locked while QPs are removed * from the table. 
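[Editor's note - not part of the diff above] The counters.h hunk introduces mlx5_rdma_counter, which embeds the core rdma_counter and recovers the driver wrapper with container_of(); the INIT_RDMA_OBJ_SIZE() entries added to the hw_stats ops let the RDMA core allocate the larger driver structure while handing out only the embedded core object. The short stand-alone sketch below illustrates that embed-and-container_of idiom with invented names (core_counter/drv_counter are placeholders for this example, not kernel types):

/* Stand-alone illustration of the embed-and-container_of idiom used by
 * mlx5_rdma_counter/to_mcounter(); all names below are made up for the example. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct core_counter {			/* stands in for rdma_counter */
	unsigned int id;
};

struct drv_counter {			/* stands in for mlx5_rdma_counter */
	struct core_counter core;	/* embedded, not a pointer */
	unsigned long long fc[4];	/* driver-private state rides along */
};

static struct drv_counter *to_drv(struct core_counter *c)
{
	return container_of(c, struct drv_counter, core);
}

int main(void)
{
	/* The core allocates the full wrapper but only ever sees the embedded
	 * core object - the role INIT_RDMA_OBJ_SIZE() plays in the patch. */
	struct drv_counter *d = calloc(1, sizeof(*d));
	struct core_counter *c = &d->core;

	c->id = 7;
	to_drv(c)->fc[0] = 42;
	printf("id=%u fc0=%llu\n", c->id, to_drv(c)->fc[0]);
	free(d);
	return 0;
}

The pointer-arithmetic recovery in to_drv() is the same trick to_mcounter() performs in the hunk above.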
@@ -714,7 +717,8 @@ static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct mlx5_ib_cq *cq, int entries, u32 **cqb, - int *cqe_size, int *index, int *inlen) + int *cqe_size, int *index, int *inlen, + struct uverbs_attr_bundle *attrs) { struct mlx5_ib_create_cq ucmd = {}; unsigned long page_size; @@ -788,7 +792,11 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(cqc, cqc, page_offset, page_offset_quantized); - if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX)) { + err = uverbs_copy_from(index, attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX); + if (err) + goto err_cqb; + } else if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { *index = ucmd.uar_page_index; } else if (context->bfregi.lib_uar_dyn) { err = -EINVAL; @@ -942,8 +950,9 @@ static void notify_soft_wc_handler(struct work_struct *work) } int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; @@ -980,7 +989,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (udata) { err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, - &index, &inlen); + &index, &inlen, attrs); if (err) return err; } else { @@ -1442,3 +1451,17 @@ int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) return 0; } + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_cq_create, + UVERBS_OBJECT_CQ, + UVERBS_METHOD_CQ_CREATE, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + +const struct uapi_definition mlx5_ib_create_cq_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_CQ, &mlx5_ib_cq_create), + {}, +}; diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 000000000000..b9ba84afaae2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 0 : -ENOMEM; + +end: + kfree(vpd_data); + return ret; +} + +static void mlx5_data_direct_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + +static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, SZ_2G); + return 0; +} + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid) +{ + struct mlx5_data_direct_registration *reg; + struct mlx5_data_direct_dev *dev; + + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) + return -ENOMEM; + + reg->ibdev = ibdev; + strcpy(reg->vuid, vuid); + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) { + if (strcmp(dev->vuid, vuid) == 0) { + mlx5_ib_data_direct_bind(ibdev, dev); + break; + } + } + + /* Add the registration to its global list, to be used upon bind/unbind + * of its affiliated data direct device + */ + list_add_tail(®->list, &mlx5_data_direct_reg_list); + mutex_unlock(&mlx5_data_direct_mutex); + return 0; +} + +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (reg->ibdev == ibdev) { + list_del(®->list); + kfree(reg); + goto end; + } + } + + WARN_ON(true); +end: + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_bind(reg->ibdev, dev); + } + + /* Add the data direct device to the global list, further IB devices may + * use it later as well + */ + list_add_tail(&dev->list, &mlx5_data_direct_dev_list); + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev) +{ + struct 
mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + /* Prevent any further affiliations */ + list_del(&dev->list); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_unbind(reg->ibdev); + } + mutex_unlock(&mlx5_data_direct_mutex); +} + +static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_data_direct_dev *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->device = &pdev->dev; + dev->pdev = pdev; + + pci_set_drvdata(dev->pdev, dev); + err = pci_enable_device(pdev); + if (err) { + dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err); + goto err; + } + + pci_set_master(pdev); + err = mlx5_data_direct_set_dma_caps(pdev); + if (err) + goto err_disable; + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + dev_dbg(dev->device, "Enabling pci atomics failed\n"); + + err = mlx5_data_direct_vpd_get_vuid(dev); + if (err) + goto err_disable; + + mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 000000000000..2fd2bdbe8f69 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 253fea374a72..2479da8620ca 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -13,6 +13,7 @@ #include <rdma/uverbs_std_types.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> +#include <rdma/ib_ucaps.h> #include "mlx5_ib.h" #include "devx.h" #include "qp.h" @@ -27,6 +28,19 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, DEVX_OBJ_FLAGS_CQ = 1 << 2, + DEVX_OBJ_FLAGS_HW_FREED = 1 << 3, +}; + +#define MAX_ASYNC_CMDS 8 + +struct mlx5_async_cmd { + struct ib_uobject *uobject; + void *in; + int in_size; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + struct mlx5_async_work cb_work; + struct completion comp; }; struct devx_async_data { @@ -109,7 +123,27 @@ devx_ufile2uctx(const struct uverbs_attr_bundle *attrs) return to_mucontext(ib_uverbs_get_ucontext(attrs)); } -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static int set_uctx_ucaps(struct mlx5_ib_dev *dev, u64 req_ucaps, u32 *cap) +{ + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_LOCAL)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL; + else + return -EOPNOTSUPP; + } + + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA; + else + return -EOPNOTSUPP; + } + + return 0; +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps) { u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {}; u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {}; @@ -123,14 +157,22 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) return -EINVAL; uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); - if (is_user && capable(CAP_NET_RAW) && - (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX)) + if (is_user && + (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) && + capable(CAP_NET_RAW)) cap |= MLX5_UCTX_CAP_RAW_TX; - if (is_user && capable(CAP_SYS_RAWIO) && + if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & - MLX5_UCTX_CAP_INTERNAL_DEV_RES)) + MLX5_UCTX_CAP_INTERNAL_DEV_RES) && + capable(CAP_SYS_RAWIO)) cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES; + if (req_ucaps) { + err = set_uctx_ucaps(dev, req_ucaps, &cap); + if (err) + return err; + } + MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); MLX5_SET(uctx, uctx, cap, cap); @@ -1405,7 +1447,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, */ mlx5r_deref_wait_odp_mkey(&obj->mkey); - if (obj->flags & DEVX_OBJ_FLAGS_DCT) + if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED) + ret = 0; + else if (obj->flags & DEVX_OBJ_FLAGS_DCT) ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); @@ -2558,7 +2602,7 @@ int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) struct mlx5_devx_event_table *table = &dev->devx_event_table; 
int uid; - uid = mlx5_ib_devx_create(dev, false); + uid = mlx5_ib_devx_create(dev, false, 0); if (uid > 0) { dev->devx_whitelist_uid = uid; xa_init(&table->event_xa); @@ -2595,6 +2639,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) } } +static void devx_async_destroy_cb(int status, struct mlx5_async_work *context) +{ + struct mlx5_async_cmd *devx_out = container_of(context, + struct mlx5_async_cmd, cb_work); + struct devx_obj *obj = devx_out->uobject->object; + + if (!status) + obj->flags |= DEVX_OBJ_FLAGS_HW_FREED; + + complete(&devx_out->comp); +} + +static void devx_async_destroy(struct mlx5_ib_dev *dev, + struct mlx5_async_cmd *cmd) +{ + init_completion(&cmd->comp); + cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size, + &cmd->out, sizeof(cmd->out), + devx_async_destroy_cb, &cmd->cb_work); +} + +static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd) +{ + if (!cmd->err) + wait_for_completion(&cmd->comp); + atomic_set(&cmd->uobject->usecnt, 0); +} + +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ + struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS]; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *device = ucontext->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_uobject *uobject; + struct devx_obj *obj; + int head = 0; + int tail = 0; + + list_for_each_entry(uobject, &ufile->uobjects, list) { + WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE)); + + /* + * Currently we only support QP destruction, if other objects + * are to be destroyed need to add type synchronization to the + * cleanup algorithm and handle pre/post FW cleanup for the + * new types if needed. + */ + if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ || + (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) != + MLX5_OBJ_TYPE_QP)) { + atomic_set(&uobject->usecnt, 0); + continue; + } + + obj = uobject->object; + + async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox; + async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen; + async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject; + + devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]); + tail++; + + if (tail - head == MAX_ASYNC_CMDS) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } + } + + while (head != tail) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { @@ -2673,7 +2793,6 @@ static const struct file_operations devx_async_cmd_event_fops = { .read = devx_async_cmd_event_read, .poll = devx_async_cmd_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static ssize_t devx_async_event_read(struct file *filp, char __user *buf, @@ -2788,7 +2907,6 @@ static const struct file_operations devx_async_event_fops = { .read = devx_async_event_read, .poll = devx_async_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj, diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index ee2213275fd6..ee9e7d3af93f 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -24,12 +24,14 @@ struct devx_obj { struct list_head event_sub; /* holds devx_event_subscription entries */ }; #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool 
is_user, u64 req_ucaps); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else -static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, + u64 req_ucaps) { return -EOPNOTSUPP; } @@ -41,5 +43,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) { } +static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ +} #endif #endif /* _MLX5_IB_DEVX_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 520034acf73a..680627f1de33 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -12,6 +12,7 @@ #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/ib_hdrs.h> #include <rdma/ib_umem.h> +#include <rdma/ib_ucaps.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> #include <linux/mlx5/fs_helpers.h> @@ -32,6 +33,11 @@ enum { MATCH_CRITERIA_ENABLE_MISC2_BIT }; + +struct mlx5_per_qp_opfc { + struct mlx5_ib_op_fc opfcs[MLX5_IB_OPCOUNTER_MAX]; +}; + #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ @@ -678,7 +684,7 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) -static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -690,7 +696,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, int priority, int num_entries, int num_groups, - u32 flags) + u32 flags, u16 vport) { struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; @@ -698,6 +704,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; + ft_attr.vport = vport; ft_attr.autogroup.max_num_groups = num_groups; ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ft)) @@ -792,18 +799,25 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags); + num_groups, flags, 0); return prio; } enum { + RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_RX_ECN_OPCOUNTER_PRIO, RDMA_RX_CNP_OPCOUNTER_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO, }; enum { + RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO, + RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_TX_CNP_OPCOUNTER_PRIO, + RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO, }; static int set_vhca_port_spec(struct mlx5_ib_dev *dev, u32 port_num, @@ -867,6 +881,344 @@ static int set_cnp_spec(struct mlx5_ib_dev *dev, u32 port_num, return 0; } +/* Returns the prio we should use for the given optional counter type, + * whereas for bytes type we use the packet type, since they share the same + * resources. 
+ */ +static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev, + u32 type) +{ + u32 prio_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + prio_type = type; + } + + return &dev->flow_db->opfcs[prio_type]; +} + +static void put_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_ib_flow_prio *prio; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return; + } + + prio = get_opfc_prio(dev, per_qp_type); + put_flow_table(dev, prio, true); +} + +static int get_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + enum mlx5_flow_namespace_type fn_type; + struct mlx5_flow_namespace *ns; + struct mlx5_ib_flow_prio *prio; + int priority; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return -EINVAL; + } + + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); + if (!ns) + 
return -EOPNOTSUPP; + + prio = get_opfc_prio(dev, per_qp_type); + if (prio->flow_table) + return 0; + + prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + if (IS_ERR(prio)) + return PTR_ERR(prio); + + prio->refcount = 1; + + return 0; +} + +static struct mlx5_per_qp_opfc * +get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) +{ + struct mlx5_per_qp_opfc *per_qp_opfc; + + *new = false; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num); + if (per_qp_opfc) + return per_qp_opfc; + per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL); + + if (!per_qp_opfc) + return NULL; + + *new = true; + return per_qp_opfc; +} + +static int add_op_fc_rules(struct mlx5_ib_dev *dev, + struct mlx5_rdma_counter *mcounter, + struct mlx5_per_qp_opfc *per_qp_opfc, + struct mlx5_ib_flow_prio *prio, + enum mlx5_ib_optional_counter_type type, + u32 qp_num, u32 port_num) +{ + struct mlx5_ib_op_fc *opfc = &per_qp_opfc->opfcs[type], *in_use_opfc; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_destination dst; + struct mlx5_flow_spec *spec; + int i, err, spec_num; + bool is_tx; + + if (opfc->fc) + return -EEXIST; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + + opfc->fc = mcounter->fc[type]; + + spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto null_fc; + } + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP: + if (set_ecn_ce_spec(dev, port_num, &spec[0], + MLX5_FS_IPV4_VERSION) || + set_ecn_ce_spec(dev, port_num, &spec[1], + MLX5_FS_IPV6_VERSION)) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 2; + is_tx = false; + + MLX5_SET_TO_ONES(fte_match_param, spec[1].match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec[1].match_value, + misc_parameters.bth_dst_qp, qp_num); + spec[1].match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = false; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + spec_num = 1; + is_tx = false; + break; + default: + err = -EINVAL; + goto free_spec; + } + + if (is_tx) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.source_sqn); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.source_sqn, qp_num); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.bth_dst_qp, qp_num); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dst.counter = opfc->fc; + + flow_act.action = + MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + for (i = 0; i < spec_num; i++) { + opfc->rule[i] 
= mlx5_add_flow_rules(prio->flow_table, &spec[i], + &flow_act, &dst, 1); + if (IS_ERR(opfc->rule[i])) { + err = PTR_ERR(opfc->rule[i]); + goto del_rules; + } + } + prio->refcount += spec_num; + + err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc, + GFP_KERNEL)); + if (err) + goto del_rules; + + kfree(spec); + + return 0; + +del_rules: + while (i--) + mlx5_del_flow_rules(opfc->rule[i]); + put_flow_table(dev, prio, false); +free_spec: + kfree(spec); +null_fc: + opfc->fc = NULL; + return err; +} + +static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, + u32 type, struct mlx5_fc **fc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + return false; + } + + *fc = mcounter->fc[shared_fc_type]; + if (!(*fc)) + return false; + + return true; +} + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_fc *in_use_fc; + int i; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) { + mcounter->fc[i] = NULL; + continue; + } + + mlx5_fc_destroy(dev->mdev, mcounter->fc[i]); + mcounter->fc[i] = NULL; + } +} + int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) @@ -921,6 +1273,20 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, priority = RDMA_TX_CNP_OPCOUNTER_PRIO; break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + default: err = -EOPNOTSUPP; goto free; @@ -932,18 +1298,22 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, goto free; } - prio = &dev->flow_db->opfcs[type]; + prio = get_opfc_prio(dev, type); if (!prio->flow_table) { + err = get_per_qp_prio(dev, type); + if (err) + goto free; + prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0); + dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); - goto free; + goto put_prio; } } dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst.counter_id = mlx5_fc_id(opfc->fc); + dst.counter = opfc->fc; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; @@ -965,6 +1335,8 @@ del_rules: for (i -= 1; i >= 0; i--) mlx5_del_flow_rules(opfc->rule[i]); put_flow_table(dev, prio, false); +put_prio: + put_per_qp_prio(dev, type); free: kfree(spec); return err; @@ -974,12 +1346,115 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_ib_flow_prio *prio; int i; + prio = 
get_opfc_prio(dev, type); + for (i = 0; i < MAX_OPFC_RULES && opfc->rule[i]; i++) { mlx5_del_flow_rules(opfc->rule[i]); - put_flow_table(dev, &dev->flow_db->opfcs[type], true); + put_flow_table(dev, prio, true); + } + + put_per_qp_prio(dev, type); +} + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_op_fc *in_use_opfc; + struct mlx5_ib_flow_prio *prio; + int i, j; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num); + if (!per_qp_opfc) + return; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!per_qp_opfc->opfcs[i].fc) + continue; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, i, + &in_use_opfc)) { + per_qp_opfc->opfcs[i].fc = NULL; + continue; + } + + for (j = 0; j < MAX_OPFC_RULES; j++) { + if (!per_qp_opfc->opfcs[i].rule[j]) + continue; + mlx5_del_flow_rules(per_qp_opfc->opfcs[i].rule[j]); + prio = get_opfc_prio(dev, i); + put_flow_table(dev, prio, true); + } + per_qp_opfc->opfcs[i].fc = NULL; + } + + kfree(per_qp_opfc); + xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num); +} + +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_flow_prio *prio; + struct mlx5_ib_counters *cnts; + struct mlx5_ib_op_fc *opfc; + struct mlx5_fc *in_use_fc; + int i, err, per_qp_type; + bool new; + + if (!counter->mode.bind_opcnt) + return 0; + + cnts = &dev->port[port - 1].cnts; + + for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) { + opfc = &cnts->opfcs[i]; + if (!opfc->fc) + continue; + + per_qp_type = i + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + prio = get_opfc_prio(dev, per_qp_type); + WARN_ON(!prio->flow_table); + + if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc)) + mcounter->fc[per_qp_type] = in_use_fc; + + if (!mcounter->fc[per_qp_type]) { + mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev, + false); + if (IS_ERR(mcounter->fc[per_qp_type])) + return PTR_ERR(mcounter->fc[per_qp_type]); + } + + per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new); + if (!per_qp_opfc) { + err = -ENOMEM; + goto free_fc; + } + err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio, + per_qp_type, qp->qp_num, port); + if (err) + goto del_rules; } + + return 0; + +del_rules: + mlx5r_fs_unbind_op_fc(qp, counter); + if (new) + kfree(per_qp_opfc); +free_fc: + if (xa_empty(&mcounter->qpn_opfc_xa)) + mlx5r_fs_destroy_fcs(dev, counter); + return err; } static void set_underlay_qp(struct mlx5_ib_dev *dev, @@ -1113,8 +1588,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, handler->ibcounters = flow_act.counters; dest_arr[dest_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest_arr[dest_num].counter_id = - mlx5_fc_id(mcounters->hw_cntrs_hndl); + dest_arr[dest_num].counter = + mcounters->hw_cntrs_hndl; dest_num++; } @@ -1170,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); } -enum { - LEFTOVERS_MC, - LEFTOVERS_UC, -}; - static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -1184,43 
+1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de struct mlx5_ib_flow_handler *handler = NULL; static struct { - struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; - } leftovers_specs[] = { - [LEFTOVERS_MC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {0x1} } - } - }, - [LEFTOVERS_UC] = { - .flow_attr = { - .num_of_specs = 1, - .size = sizeof(leftovers_specs[0]) - }, - .eth_flow = { - .type = IB_FLOW_SPEC_ETH, - .size = sizeof(struct ib_flow_spec_eth), - .mask = {.dst_mac = {0x1} }, - .val = {.dst_mac = {} } - } - } - }; + struct ib_flow_attr flow_attr; + } leftovers_wc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_wc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = { 0x1 } } } }; - handler = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_MC].flow_attr, - dst); + static struct { + struct ib_flow_spec_eth eth_flow; + struct ib_flow_attr flow_attr; + } leftovers_uc = { .flow_attr = { .num_of_specs = 1, + .size = sizeof(leftovers_uc) }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = { .dst_mac = { 0x1 } }, + .val = { .dst_mac = {} } } }; + + handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, - &leftovers_specs[LEFTOVERS_UC].flow_attr, - dst); + &leftovers_uc.flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rules(handler->rule); ft_prio->refcount--; @@ -1413,17 +1872,51 @@ free_ucmd: return ERR_PTR(err); } +static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, + enum mlx5_flow_namespace_type type, + u32 *flags, u16 *vport_idx, + u16 *vport, + struct mlx5_core_dev **ft_mdev, + u32 ib_port) +{ + struct mlx5_core_dev *esw_mdev; + + if (!is_mdev_switchdev_mode(dev->mdev)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(dev->mdev, rdma_transport_manager)) + return -EOPNOTSUPP; + + if (!dev->port[ib_port - 1].rep) + return -EINVAL; + + esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); + if (esw_mdev != dev->mdev) + return -EOPNOTSUPP; + + *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; + *ft_mdev = esw_mdev; + *vport = dev->port[ib_port - 1].rep->vport; + *vport_idx = dev->port[ib_port - 1].rep->vport_index; + + return 0; +} + static struct mlx5_ib_flow_prio * _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, enum mlx5_flow_namespace_type ns_type, - bool mcast) + bool mcast, u32 ib_port) { + struct mlx5_core_dev *ft_mdev = dev->mdev; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; + u16 vport_idx = 0; bool esw_encap; u32 flags = 0; + u16 vport = 0; int priority; + int ret; if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; @@ -1471,13 +1964,38 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); priority = user_priority; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO) + return ERR_PTR(-EINVAL); + ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, + &vport_idx, &vport, + 
&ft_mdev, ib_port); + if (ret) + return ERR_PTR(ret); + + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX( + ft_mdev, log_max_ft_size)); + else + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX( + ft_mdev, log_max_ft_size)); + priority = user_priority; + break; default: break; } max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); - ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX || + ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) + ns = mlx5_get_flow_vport_namespace(ft_mdev, ns_type, vport_idx); + else + ns = mlx5_get_flow_namespace(ft_mdev, ns_type); + if (!ns) return ERR_PTR(-EOPNOTSUPP); @@ -1497,6 +2015,12 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, case MLX5_FLOW_NAMESPACE_RDMA_TX: prio = &dev->flow_db->rdma_tx[priority]; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + prio = &dev->flow_db->rdma_transport_rx[ib_port - 1]; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + prio = &dev->flow_db->rdma_transport_tx[ib_port - 1]; + break; default: return ERR_PTR(-EINVAL); } @@ -1507,7 +2031,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return prio; return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags); + MLX5_FS_MAX_TYPES, flags, vport); } static struct mlx5_ib_flow_handler * @@ -1603,7 +2127,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, static struct mlx5_ib_flow_handler *raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, - u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type) + struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type) { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; @@ -1626,7 +2150,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( mutex_lock(&dev->flow_db->lock); ft_prio = _get_flow_table(dev, fs_matcher->priority, - fs_matcher->ns_type, mcast); + fs_matcher->ns_type, mcast, + fs_matcher->ib_port); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -1652,8 +2177,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( } if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + if (WARN_ON(!counter)) { + err = -EINVAL; + goto unlock; + } dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst[dst_num].counter_id = counter_id; + dst[dst_num].counter = counter; dst_num++; } @@ -1738,6 +2267,12 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX: *namespace = MLX5_FLOW_NAMESPACE_RDMA_TX; break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX; + break; default: return -EINVAL; } @@ -1827,7 +2362,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return -EINVAL; /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ - if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && ((!dest_devx && !dest_qp) || (dest_devx && dest_qp))) return -EINVAL; @@ -1844,7 +2380,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return 
-EINVAL; /* Allow only flow table as dest when inserting to FDB or RDMA_RX */ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) return -EINVAL; } else if (dest_qp) { @@ -1865,20 +2402,23 @@ static int get_dests(struct uverbs_attr_bundle *attrs, *dest_id = mqp->raw_packet_qp.rq.tirn; *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; } else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) && !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) { *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)) + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX)) return -EINVAL; return 0; } -static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) +static bool +is_flow_counter(void *obj, u32 offset, u32 *counter_id, u32 *fc_bulk_size) { struct devx_obj *devx_obj = obj; u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); @@ -1888,6 +2428,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) if (offset && offset >= devx_obj->flow_counter_bulk_size) return false; + *fc_bulk_size = devx_obj->flow_counter_bulk_size; *counter_id = MLX5_GET(dealloc_flow_counter_in, devx_obj->dinbox, flow_counter_id); @@ -1904,13 +2445,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( { struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - u32 *offset_attr, offset = 0, counter_id = 0; int dest_id, dest_type = -1, inlen, len, ret, i; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; struct ib_uobject **arr_flow_actions; struct ib_uflow_resources *uflow_res; struct mlx5_flow_act flow_act = {}; + struct mlx5_fc *counter = NULL; struct ib_qp *qp = NULL; void *devx_obj, *cmd_in; struct ib_uobject *uobj; @@ -1937,6 +2478,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { + u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0; devx_obj = arr_flow_actions[0]->object; if (uverbs_attr_is_valid(attrs, @@ -1956,8 +2498,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( offset = *offset_attr; } - if (!is_flow_counter(devx_obj, offset, &counter_id)) + if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size)) return -EINVAL; + counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size); + if (IS_ERR(counter)) + return PTR_ERR(counter); flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; } @@ -1968,8 +2513,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); - if (!uflow_res) - return -ENOMEM; + if (!uflow_res) { + ret = -ENOMEM; + goto destroy_counter; + } len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); @@ -1996,7 +2543,7 @@ static int 
UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( flow_handler = raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act, - counter_id, cmd_in, inlen, dest_id, dest_type); + counter, cmd_in, inlen, dest_id, dest_type); if (IS_ERR(flow_handler)) { ret = PTR_ERR(flow_handler); goto err_out; @@ -2007,6 +2554,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( return 0; err_out: ib_uverbs_flow_resources_free(uflow_res); +destroy_counter: + if (counter) + mlx5_fc_local_destroy(counter); return ret; } @@ -2338,6 +2888,15 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, return 0; } +static bool verify_context_caps(struct mlx5_ib_dev *dev, u64 enabled_caps) +{ + if (is_mdev_switchdev_mode(dev->mdev)) + return UCAP_ENABLED(enabled_caps, + RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL); +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( struct uverbs_attr_bundle *attrs) { @@ -2386,6 +2945,26 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( goto end; } + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT)) { + err = uverbs_copy_from(&obj->ib_port, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT); + if (err) + goto end; + if (!rdma_is_port_valid(&dev->ib_dev, obj->ib_port)) { + err = -EINVAL; + goto end; + } + if (obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX && + obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) { + err = -EINVAL; + goto end; + } + if (!verify_context_caps(dev, uobj->context->enabled_caps)) { + err = -EOPNOTSUPP; + goto end; + } + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -2433,7 +3012,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, priority, ns_type, 0); + ft_prio = _get_flow_table(dev, priority, ns_type, 0, 0); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto free_obj; @@ -2819,7 +3398,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_OPTIONAL), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, enum mlx5_ib_uapi_flow_table_type, - UA_OPTIONAL)); + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, @@ -2889,8 +3471,26 @@ int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) if (!dev->flow_db) return -ENOMEM; + dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_rx) + goto free_flow_db; + + dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_tx) + goto free_rdma_transport_rx; + mutex_init(&dev->flow_db->lock); ib_set_device_ops(&dev->ib_dev, &flow_ops); return 0; + +free_rdma_transport_rx: + kfree(dev->flow_db->rdma_transport_rx); +free_flow_db: + kfree(dev->flow_db); + return -ENOMEM; } diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index b9734904f5f0..2ebe86e5be10 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -8,23 +8,8 @@ #include "mlx5_ib.h" -#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); -#else -static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) -{ - dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); - - if (!dev->flow_db) - return 
-ENOMEM; - - mutex_init(&dev->flow_db->lock); - return 0; -} - -inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} -#endif static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { @@ -40,6 +25,8 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) * is a safe assumption that all references are gone. */ mlx5_ib_fs_cleanup_anchor(dev); + kfree(dev->flow_db->rdma_transport_tx); + kfree(dev->flow_db->rdma_transport_rx); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index c7a4ee896121..49af1cfbe6d1 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, int vport_index) { struct mlx5_ib_dev *ibdev; + struct net_device *ndev; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, ibdev->port[vport_index].rep = rep; rep->rep_data[REP_IB].priv = ibdev; - write_lock(&ibdev->port[vport_index].roce.netdev_lock); - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return 0; + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = num_ports; + ret = ib_device_set_netdev(&ibdev->ib_dev, + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, + rep->vport), + vport_index + 1); + if (ret) + goto fail_add; ret = __mlx5_ib_add(ibdev, profile); if (ret) @@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) } port = &dev->port[vport_index]; - write_lock(&port->roce.netdev_lock); - port->roce.netdev = NULL; - write_unlock(&port->roce.netdev_lock); + + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); rep->rep_data[REP_IB].priv = NULL; port->rep = NULL; diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 3e43687a7f6f..2453ae4384a7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -69,7 +69,7 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, if (ignore_bkey || !in_wc) op_modifier |= 0x2; - return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, + return mlx5_cmd_mad_ifc(dev, in_mad, response_mad, op_modifier, port); } @@ -147,8 +147,39 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, - size_t sz) +static void pma_cnt_ext_assign_ppcnt(struct ib_pma_portcounters_ext *cnt_ext, + void *out) +{ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_GET_EXT_CNTR(counter_name) \ + MLX5_GET64(ib_ext_port_cntrs_grp_data_layout, \ + out_pma, counter_name##_high) + + cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_data) >> 2); + cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_data) >> 2); 
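/*
 * Note: in ib_ext_port_cntrs_grp_data_layout every counter is carried as a
 * <name>_high/<name>_low dword pair, so MLX5_GET64() on the _high field
 * yields the full 64-bit value.  Only the two data counters are shifted
 * right by 2: PPCNT reports raw octets, while the PMA PortXmitData and
 * PortRcvData fields count octets divided by four.
 */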
+ + cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_pkts)); + cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_pkts)); + + cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_xmit_pkts)); + cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_rcv_pkts)); + + cnt_ext->port_multicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_xmit_pkts)); + cnt_ext->port_multicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_rcv_pkts)); +} + +static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, u8 plane_num, + void *out, size_t sz, bool ext) { u32 *in; int err; @@ -160,8 +191,14 @@ static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, } MLX5_SET(ppcnt_reg, in, local_port, port_num); - - MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + MLX5_SET(ppcnt_reg, in, plane_ind, plane_num); + + if (ext) + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP); + else + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP); err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); @@ -189,7 +226,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, mdev_port_num = 1; } if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 && - !mlx5_core_mp_enabled(mdev)) { + !mlx5_core_mp_enabled(mdev) && + dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) { /* set local port to one for Function-Per-Port HCA. */ mdev = dev->mdev; mdev_port_num = 1; @@ -208,7 +246,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); - int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + int sz = max(MLX5_ST_SZ_BYTES(query_vport_counter_out), + MLX5_ST_SZ_BYTES(ppcnt_reg)); out_cnt = kvzalloc(sz, GFP_KERNEL); if (!out_cnt) { @@ -216,10 +255,18 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, - out_cnt); - if (!err) - pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + err = query_ib_ppcnt(mdev, mdev_port_num, + port_num, out_cnt, sz, 1); + if (!err) + pma_cnt_ext_assign_ppcnt(pma_cnt_ext, out_cnt); + } else { + err = mlx5_core_query_vport_counter(mdev, 0, 0, + mdev_port_num, + out_cnt); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } } else { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); @@ -231,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) + err = query_ib_ppcnt(mdev, mdev_port_num, port_num, + out_cnt, sz, 0); + else + err = query_ib_ppcnt(mdev, mdev_port_num, 0, + out_cnt, sz, 0); + if (!err) pma_cnt_assign(pma_cnt, out_cnt); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2b557e64290..ce7610740412 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -47,7 +47,9 @@ #include <rdma/uverbs_ioctl.h> #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_ucaps.h> #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> 
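/*
 * For the plane-aware SMI device, the PMA path above stops using
 * query_vport_counter and reads the PPCNT access register instead,
 * selecting either the basic or the extended InfiniBand counter group and
 * addressing an individual plane.  A minimal sketch of that query, built
 * only from the register fields used in the hunk (the wrapper name and its
 * signature are illustrative, not part of the patch):
 */
static int query_ppcnt_group(struct mlx5_core_dev *mdev, u8 port, u8 plane,
			     bool ext, void *out, size_t sz)
{
	u32 *in;
	int err;

	in = kvzalloc(sz, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(ppcnt_reg, in, local_port, port);
	MLX5_SET(ppcnt_reg, in, plane_ind, plane);
	MLX5_SET(ppcnt_reg, in, grp,
		 ext ? MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP :
		       MLX5_INFINIBAND_PORT_COUNTERS_GROUP);

	/* PPCNT is queried through the access-register command, read mode */
	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
	kvfree(in);
	return err;
}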
@@ -146,16 +148,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, if (upper && port->rep->vport == MLX5_VPORT_UPLINK) continue; - - read_lock(&port->roce.netdev_lock); - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, - port->rep->vport); - if (rep_ndev == ndev) { - read_unlock(&port->roce.netdev_lock); + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); *port_num = i + 1; return &port->roce; } - read_unlock(&port->roce.netdev_lock); + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } } return NULL; @@ -167,6 +205,7 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -180,47 +219,67 @@ static int mlx5_netdev_event(struct notifier_block *this, /* Should already be registered during the load */ if (ibdev->is_rep) break; - write_lock(&roce->netdev_lock); + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + if (ndev->dev.parent == mdev->device) - roce->netdev = ndev; - write_unlock(&roce->netdev_lock); + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: /* In case of reps, ib device goes away before the netdevs */ - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) && + !mlx5_core_mp_enabled(mdev)) + return NOTIFY_DONE; + + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { + struct net_device *lag_ndev; + + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } if (ibdev->is_rep) roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); if (!roce) return NOTIFY_DONE; - if ((upper == ndev || - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && - 
ibdev->ib_active) { + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -229,7 +288,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -240,39 +299,13 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u32 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - return NULL; - - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) - goto out; - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); - ndev = ibdev->port[port_num - 1].roce.netdev; - if (ndev) - dev_hold(ndev); - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 ib_port_num, u32 *native_port_num) @@ -283,6 +316,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_port *port; + if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + if (native_port_num) + *native_port_num = smi_to_native_portnum(ibdev, + ib_port_num); + return ibdev->mdev; + + } + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) { if (native_port_num) @@ -444,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_HDR; @@ -452,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_XDR; + break; case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8): *active_width = IB_WIDTH_8X; *active_speed = IB_SPEED_NDR; break; + case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } @@ -504,10 +557,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, */ if (dev->is_rep) err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - 1); + 1, 0); else err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - mdev_port_num); + mdev_port_num, 0); if (err) goto out; ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); @@ -539,11 +592,11 @@ static int 
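/*
 * Note on the netdev handling above: the driver no longer keeps its own
 * rwlock-protected roce.netdev pointer.  The association is owned by the
 * IB core through ib_device_set_netdev(), and readers go through
 * ib_device_get_netdev(), which returns a held reference that must be
 * dropped with dev_put() -- hence the put_ndev label in the notifier and
 * the dev_put() calls in mlx5_get_rep_roce().
 */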
mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { @@ -1146,6 +1199,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + + if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) && + (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc))) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP; } if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { @@ -1334,11 +1395,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *rep; + u8 vl_hw_cap, plane_index = 0; u16 max_mtu; u16 oper_mtu; int err; u16 ib_link_width_oper; - u8 vl_hw_cap; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (!rep) { @@ -1348,6 +1409,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, /* props being zeroed by the caller, avoid zeroing it here */ + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) { + plane_index = port; + port = smi_to_native_portnum(dev, port); + } + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) goto out; @@ -1358,7 +1424,14 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->sm_sl = rep->sm_sl; props->state = rep->vport_state; props->phys_state = rep->port_physical_state; - props->port_cap_flags = rep->cap_mask1; + + props->port_cap_flags = rep->cap_mask1; + if (dev->num_plane) { + props->port_cap_flags |= IB_PORT_SM_DISABLED; + props->port_cap_flags &= ~IB_PORT_SM; + } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + props->port_cap_flags &= ~IB_PORT_CM_SUP; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); @@ -1371,7 +1444,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->port_cap_flags2 = rep->cap_mask2; err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, - &props->active_speed, port); + &props->active_speed, port, plane_index); if (err) goto out; @@ -1811,7 +1884,7 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, } resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); - if (dev->wc_support) + if (mlx5_wc_support_get(dev->mdev)) resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp->cache_line_size = cache_line_size(); @@ -1874,6 +1947,12 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, return 0; } +static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps) +{ + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) || + UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { @@ -1916,10 +1995,17 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, return -EINVAL; if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { - err = mlx5_ib_devx_create(dev, true); + err = 
mlx5_ib_devx_create(dev, true, uctx->enabled_caps); if (err < 0) goto out_ctx; context->devx_uid = err; + + if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) { + err = mlx5_cmd_add_privileged_uid(dev->mdev, + context->devx_uid); + if (err) + goto out_devx; + } } lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; @@ -1934,7 +2020,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) - goto out_devx; + goto out_ucap; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; @@ -1942,7 +2028,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; - goto out_devx; + goto out_ucap; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, @@ -2006,6 +2092,11 @@ out_sys_pages: out_count: kfree(bfregi->count); +out_ucap: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX && + uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid); + out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) mlx5_ib_devx_destroy(dev, context->devx_uid); @@ -2050,8 +2141,12 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) kfree(bfregi->sys_pages); kfree(bfregi->count); - if (context->devx_uid) + if (context->devx_uid) { + if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, + context->devx_uid); mlx5_ib_devx_destroy(dev, context->devx_uid); + } } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, @@ -2338,7 +2433,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: - if (!dev->wc_support) + if (!mlx5_wc_support_get(dev->mdev)) return -EPERM; fallthrough; case MLX5_IB_MMAP_NC_PAGE: @@ -2777,6 +2872,23 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + + *num_plane = 0; + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) + return 0; + + err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); + if (err) + return err; + + *num_plane = vport_ctx.num_plane; + return 0; +} + static int set_has_smi_cap(struct mlx5_ib_dev *dev) { struct mlx5_hca_vport_context vport_ctx; @@ -2787,10 +2899,15 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) return 0; for (port = 1; port <= dev->num_ports; port++) { - if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + if (dev->num_plane) { + dev->port_caps[port - 1].has_smi = false; + continue; + } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) || + dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { dev->port_caps[port - 1].has_smi = true; continue; } + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, &vport_ctx); if (err) { @@ -2824,37 +2941,72 @@ static u8 mlx5_get_umr_fence(u8 umr_fence_cap) } } -static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - struct ib_srq_init_attr attr; - struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; - int port; + struct ib_device *ibdev; + struct ib_pd *pd; + struct ib_cq *cq; int ret = 0; - ibdev = &dev->ib_dev; - if (!MLX5_CAP_GEN(dev->mdev, xrc)) - return -EOPNOTSUPP; + /* + * devr->c0 is set once, never changed until device unload. 
+ * Avoid taking the mutex if initialization is already done. + */ + if (devr->c0) + return 0; - devr->p0 = ib_alloc_pd(ibdev, 0); - if (IS_ERR(devr->p0)) - return PTR_ERR(devr->p0); + mutex_lock(&devr->cq_lock); + if (devr->c0) + goto unlock; - devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); - goto error1; + ibdev = &dev->ib_dev; + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + goto unlock; } - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); - if (ret) - goto error2; + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + ib_dealloc_pd(pd); + goto unlock; + } - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + devr->p0 = pd; + devr->c0 = cq; + +unlock: + mutex_unlock(&devr->cq_lock); + return ret; +} + +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_srq_init_attr attr; + struct ib_srq *s0, *s1; + int ret = 0; + + /* + * devr->s1 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->s1) + return 0; + + mutex_lock(&devr->srq_lock); + if (devr->s1) + goto unlock; + + ret = mlx5_ib_dev_res_cq_init(dev); if (ret) - goto error3; + goto unlock; memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; @@ -2862,10 +3014,11 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.srq_type = IB_SRQT_XRC; attr.ext.cq = devr->c0; - devr->s0 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s0)) { - ret = PTR_ERR(devr->s0); - goto err_create; + s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s0)) { + ret = PTR_ERR(s0); + mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + goto unlock; } memset(&attr, 0, sizeof(attr)); @@ -2873,51 +3026,116 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; - devr->s1 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s1)) { - ret = PTR_ERR(devr->s1); - goto error6; + s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s1)) { + ret = PTR_ERR(s1); + mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + ib_destroy_srq(s0); } - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - - return 0; + devr->s0 = s0; + devr->s1 = s1; -error6: - ib_destroy_srq(devr->s0); -err_create: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); -error3: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); -error2: - ib_destroy_cq(devr->c0); -error1: - ib_dealloc_pd(devr->p0); +unlock: + mutex_unlock(&devr->srq_lock); return ret; } -static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; + int ret; - /* - * Make sure no change P_Key work items are still executing. - * - * At this stage, the mlx5_ib_event should be unregistered - * and it ensures that no new works are added. 
- */ - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work); + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + return ret; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + if (ret) { + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + return ret; + } + + mutex_init(&devr->cq_lock); + mutex_init(&devr->srq_lock); - ib_destroy_srq(devr->s1); - ib_destroy_srq(devr->s0); + return 0; +} + +static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + + /* After s0/s1 init, they are not unset during the device lifetime. */ + if (devr->s1) { + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); + } mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); - ib_destroy_cq(devr->c0); - ib_dealloc_pd(devr->p0); + /* After p0/c0 init, they are not unset during the device lifetime. */ + if (devr->c0) { + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); + } + mutex_destroy(&devr->cq_lock); + mutex_destroy(&devr->srq_lock); +} + +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } static u32 get_core_cap_flags(struct ib_device *ibdev, @@ -2933,6 +3151,13 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, if (rep->grh_required) ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + if (dev->num_plane) + return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD | + RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA | + RDMA_CORE_CAP_AF_IB; + else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI; + if (ll == IB_LINK_LAYER_INFINIBAND) return ret | RDMA_CORE_PORT_IBA_IB; @@ -2968,6 +3193,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num, return err; if (ll == IB_LINK_LAYER_INFINIBAND) { + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + port_num = smi_to_native_portnum(dev, port_num); + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, &rep); if (err) @@ -3010,6 +3238,67 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct 
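/*
 * Note: mlx5_ib_dev_res_cq_init() and mlx5_ib_dev_res_srq_init() above
 * share one shape -- an unlocked fast-path check, then take the mutex and
 * re-check before creating the shared objects, which are set once and only
 * destroyed at device unload.  Reduced to its bones (illustrative):
 *
 *	if (devr->c0)
 *		return 0;		already initialised, skip the lock
 *	mutex_lock(&devr->cq_lock);
 *	if (devr->c0)
 *		goto unlock;		lost the race, another caller built it
 *	... allocate the PD and the CQ, then store them in devr ...
 * unlock:
 *	mutex_unlock(&devr->cq_lock);
 */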
ib_device *ibdev = &dev->ib_dev; + struct net_device *old_ndev = NULL; + struct mlx5_ib_port *port; + struct net_device *ndev; + u32 portnum = 0; + int ret = 0; + int i; + + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + old_ndev = ib_device_get_netdev(ibdev, portnum + 1); + ret = ib_device_set_netdev(ibdev, ndev, portnum + 1); + if (ret) + goto out; + + if (old_ndev) + roce_del_all_netdev_gids(ibdev, portnum + 1, + old_ndev); + rdma_roce_rescan_port(ibdev, portnum + 1); + } + break; + default: + return NOTIFY_DONE; + } + +out: + dev_put(old_ndev); + return notifier_from_errno(ret); +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -3031,6 +3320,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; @@ -3048,6 +3338,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -3306,6 +3597,41 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + INIT_LIST_HEAD(&dev->data_direct_mr_list); + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3352,7 +3678,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -3613,7 +3940,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) return -EOPNOTSUPP; - if (!to_mdev(c->ibucontext.device)->wc_support && + if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) && alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) return -EOPNOTSUPP; @@ -3682,14 +4009,24 
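/*
 * Note: lag_event() above reacts to an active-backup LAG lower-state change
 * by repointing the affected IB port at the newly active slave: it locates
 * the uplink port for switchdev/SR-IOV LAG, installs the new netdev with
 * ib_device_set_netdev(), removes the GIDs derived from the old netdev via
 * roce_del_all_netdev_gids(), and lets the core rebuild them with
 * rdma_roce_rescan_port().  Failures are reported back to the chain through
 * notifier_from_errno().
 */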
@@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), UAPI_DEF_CHAIN(mlx5_ib_qos_defs), UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), UAPI_DEF_CHAIN(mlx5_ib_dm_defs), + UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -3698,6 +4035,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3713,13 +4051,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.dev.parent = mdev->device; dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; for (i = 0; i < dev->num_ports; i++) { spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->port[i].roce.netdev_lock); dev->port[i].roce.dev = dev; dev->port[i].roce.native_port_num = i + 1; dev->port[i].roce.last_port_state = IB_PORT_DOWN; @@ -3751,6 +4087,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -3759,25 +4096,22 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; -err: - mlx5r_macsec_dealloc_gids(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); +err: + mlx5r_macsec_dealloc_gids(dev); return err; } -static int mlx5_ib_enable_driver(struct ib_device *dev) -{ - struct mlx5_ib_dev *mdev = to_mdev(dev); - int ret; - - ret = mlx5_ib_test_wc(mdev); - mlx5_ib_dbg(mdev, "Write-Combining %s", - mdev->wc_support ? 
"supported" : "not supported"); - - return ret; -} +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev); static const struct ib_device_ops mlx5_ib_dev_ops = { .owner = THIS_MODULE, @@ -3785,6 +4119,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, .add_gid = mlx5_ib_add_gid, + .add_sub_dev = mlx5_ib_add_sub_dev, .alloc_mr = mlx5_ib_alloc_mr, .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, @@ -3799,6 +4134,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .dealloc_pd = mlx5_ib_dealloc_pd, .dealloc_ucontext = mlx5_ib_dealloc_ucontext, .del_gid = mlx5_ib_del_gid, + .del_sub_dev = mlx5_ib_del_sub_dev, .dereg_mr = mlx5_ib_dereg_mr, .destroy_ah = mlx5_ib_destroy_ah, .destroy_cq = mlx5_ib_destroy_cq, @@ -3809,7 +4145,6 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .drain_rq = mlx5_ib_drain_rq, .drain_sq = mlx5_ib_drain_sq, .device_group = &mlx5_attr_group, - .enable_driver = mlx5_ib_enable_driver, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, @@ -3839,6 +4174,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), @@ -3900,8 +4236,47 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) return (var_table->bitmap) ? 0 : -ENOMEM; } +static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + +static int mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev) +{ + int ret; + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + if (ret) + goto remove_local; + } + + return 0; + +remove_local: + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + return ret; +} + static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev) { + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) + mlx5_ib_cleanup_ucaps(dev); + bitmap_free(dev->var_table.bitmap); } @@ -3952,6 +4327,13 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) return err; } + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { + err = mlx5_ib_init_ucaps(dev); + if (err) + return err; + } + dev->ib_dev.use_cq_dim = true; return 0; @@ -3985,7 +4367,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, @@ -4053,17 +4434,6 @@ static void 
mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) mlx5_core_native_port_num(dev->mdev) - 1); } -static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) -{ - dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); - return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); -} - -static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -} - static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) { int err; @@ -4089,7 +4459,10 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; - if (!mlx5_lag_is_active(dev->mdev)) + if (dev->sub_dev_name) { + name = dev->sub_dev_name; + ib_mark_name_assigned_by_user(&dev->ib_dev); + } else if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else name = "mlx5_bond_%d"; @@ -4100,6 +4473,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { mlx5_mkey_cache_cleanup(dev); mlx5r_umr_resource_cleanup(dev); + mlx5r_umr_cleanup(dev); } static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) @@ -4111,7 +4485,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) { int ret; - ret = mlx5r_umr_resource_init(dev); + ret = mlx5r_umr_init(dev); if (ret) return ret; @@ -4166,6 +4540,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events); @@ -4176,8 +4557,30 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); +} + +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); } void __mlx5_ib_remove(struct mlx5_ib_dev *dev, @@ -4251,9 +4654,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_odp_init_one, mlx5_ib_odp_cleanup_one), @@ -4263,9 +4663,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -4278,6 +4675,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + 
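/*
 * Note: the pkey-change work items are now owned by the device-notifier
 * stage -- INIT_WORK() in mlx5_ib_stage_dev_notifier_init(), and
 * cancel_work_sync() after the notifier is unregistered in its cleanup --
 * and, as the profile tables that follow show, that stage has moved to
 * after MLX5_IB_STAGE_IB_REG, so mdev events start flowing only once the
 * IB device is registered.
 */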
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4314,18 +4714,12 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_counters_init, mlx5_ib_counters_cleanup), STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, mlx5_ib_stage_cong_debugfs_init, mlx5_ib_stage_cong_debugfs_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_UAR, - mlx5_ib_stage_uar_init, - mlx5_ib_stage_uar_cleanup), STAGE_CREATE(MLX5_IB_STAGE_BFREG, mlx5_ib_stage_bfrag_init, mlx5_ib_stage_bfrag_cleanup), @@ -4338,6 +4732,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4349,6 +4746,90 @@ const struct mlx5_ib_profile raw_eth_profile = { NULL), }; +static const struct mlx5_ib_profile plane_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), +}; + +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane; + enum rdma_link_layer ll; + int ret; + + if (mparent->smi_dev) + return ERR_PTR(-EEXIST); + + ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev, + port_type)); + if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane || + ll != IB_LINK_LAYER_INFINIBAND || + !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) + return ERR_PTR(-EOPNOTSUPP); + + mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + if (!mplane) + return ERR_PTR(-ENOMEM); + + mplane->port = kcalloc(mparent->num_plane * mparent->num_ports, + sizeof(*mplane->port), GFP_KERNEL); + if (!mplane->port) { + ret = -ENOMEM; + goto fail_kcalloc; + } + + mplane->ib_dev.type = type; + mplane->mdev = mparent->mdev; + mplane->num_ports = mparent->num_plane; + mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; + + ret = __mlx5_ib_add(mplane, &plane_profile); + if (ret) + goto fail_ib_add; + + mparent->smi_dev = mplane; + return &mplane->ib_dev; + +fail_ib_add: + kfree(mplane->port); +fail_kcalloc: + ib_dealloc_device(&mplane->ib_dev); + return ERR_PTR(ret); +} + +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev) +{ + struct mlx5_ib_dev *mdev = to_mdev(sub_dev); + + to_mdev(sub_dev->parent)->smi_dev = NULL; + __mlx5_ib_remove(mdev, 
mdev->profile, MLX5_IB_STAGE_MAX); +} + static int mlx5r_mp_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -4373,7 +4854,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { @@ -4426,15 +4908,23 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!dev) return -ENOMEM; + + if (ll == IB_LINK_LAYER_INFINIBAND) { + ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane); + if (ret) + goto fail; + } + dev->port = kcalloc(num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) { - ib_dealloc_device(&dev->ib_dev); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dev->mdev = mdev; dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; @@ -4442,14 +4932,17 @@ static int mlx5r_probe(struct auxiliary_device *adev, profile = &pf_profile; ret = __mlx5_ib_add(dev, profile); - if (ret) { - kfree(dev->port); - ib_dealloc_device(&dev->ib_dev); - return ret; - } + if (ret) + goto fail_ib_add; auxiliary_set_drvdata(adev, dev); return 0; + +fail_ib_add: + kfree(dev->port); +fail: + ib_dealloc_device(&dev->ib_dev); + return ret; } static void mlx5r_remove(struct auxiliary_device *adev) @@ -4509,17 +5002,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4531,6 +5030,7 @@ qp_event_err: static void __exit mlx5_ib_cleanup(void) { + mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 96ffbbaf0a73..af321f6ef7f5 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -32,7 +32,6 @@ #include <rdma/ib_umem_odp.h> #include "mlx5_ib.h" -#include <linux/jiffies.h> /* * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be @@ -94,202 +93,3 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( return 0; return page_size; } - -#define WR_ID_BF 0xBF -#define WR_ID_END 0xBAD -#define TEST_WC_NUM_WQES 255 -#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) -static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, - bool signaled) -{ - struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_bf *bf = &qp->bf; - __be32 mmio_wqe[16] = {}; - unsigned long flags; - unsigned int idx; - int i; - - if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) - return -EIO; - - spin_lock_irqsave(&qp->sq.lock, flags); - - idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); - ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); - - memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg)); - ctrl->fm_ce_se = signaled ? 
MLX5_WQE_CTRL_CQ_UPDATE : 0; - ctrl->opmod_idx_opcode = - cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP); - ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) | - (qp->trans_qp.base.mqp.qpn << 8)); - - qp->sq.wrid[idx] = wr_id; - qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP; - qp->sq.wqe_head[idx] = qp->sq.head + 1; - qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg), - MLX5_SEND_WQE_BB); - qp->sq.w_list[idx].next = qp->sq.cur_post; - qp->sq.head++; - - memcpy(mmio_wqe, ctrl, sizeof(*ctrl)); - ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |= - MLX5_WQE_CTRL_CQ_UPDATE; - - /* Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - wmb(); - - qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); - - /* Make sure doorbell record is visible to the HCA before - * we hit doorbell - */ - wmb(); - for (i = 0; i < 8; i++) - mlx5_write64(&mmio_wqe[i * 2], - bf->bfreg->map + bf->offset + i * 8); - io_stop_wc(); - - bf->offset ^= bf->buf_size; - - spin_unlock_irqrestore(&qp->sq.lock, flags); - - return 0; -} - -static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq) -{ - int ret; - struct ib_wc wc = {}; - unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; - - do { - ret = ib_poll_cq(cq, 1, &wc); - if (ret < 0 || wc.status) - return ret < 0 ? ret : -EINVAL; - if (ret) - break; - } while (!time_after(jiffies, end)); - - if (!ret) - return -ETIMEDOUT; - - if (wc.wr_id != WR_ID_BF) - ret = 0; - - return ret; -} - -static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp) -{ - int err, i; - - for (i = 0; i < TEST_WC_NUM_WQES; i++) { - err = post_send_nop(dev, qp, WR_ID_BF, false); - if (err) - return err; - } - - return post_send_nop(dev, qp, WR_ID_END, true); -} - -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev) -{ - struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 }; - int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); - struct ib_qp_init_attr qp_init_attr = { - .cap = { .max_send_wr = TEST_WC_NUM_WQES }, - .qp_type = IB_QPT_UD, - .sq_sig_type = IB_SIGNAL_REQ_WR, - .create_flags = MLX5_IB_QP_CREATE_WC_TEST, - }; - struct ib_qp_attr qp_attr = { .port_num = 1 }; - struct ib_device *ibdev = &dev->ib_dev; - struct ib_qp *qp; - struct ib_cq *cq; - struct ib_pd *pd; - int ret; - - if (!MLX5_CAP_GEN(dev->mdev, bf)) - return 0; - - if (!dev->mdev->roce.roce_en && - port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { - if (mlx5_core_is_pf(dev->mdev)) - dev->wc_support = arch_can_pci_mmap_wc(); - return 0; - } - - ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); - if (ret) - goto print_err; - - if (!dev->wc_bfreg.wc) - goto out1; - - pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); - goto out1; - } - - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); - goto out2; - } - - qp_init_attr.recv_cq = cq; - qp_init_attr.send_cq = cq; - qp = ib_create_qp(pd, &qp_init_attr); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); - goto out3; - } - - qp_attr.qp_state = IB_QPS_INIT; - ret = ib_modify_qp(qp, &qp_attr, - IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX | - IB_QP_QKEY); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTR; - ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTS; - ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); - if (ret) - goto out4; - - ret = test_wc_do_send(dev, qp); - if (ret < 0) - goto out4; - - ret = 
test_wc_poll_cq_result(dev, cq); - if (ret > 0) { - dev->wc_support = true; - ret = 0; - } - -out4: - ib_destroy_qp(qp); -out3: - ib_destroy_cq(cq); -out2: - ib_dealloc_pd(pd); -out1: - mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); -print_err: - if (ret) - mlx5_ib_err( - dev, - "Error %d while trying to test write-combining support\n", - ret); - return ret; -} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a8de35c07c9e..fde859d207ae 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, return GENMASK(largest_pg_shift, pgsz_shift); } -/* - * For mkc users, instead of a page_offset the command has a start_iova which - * specifies both the page_offset and the on-the-wire IOVA - */ -#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ - ib_umem_find_best_pgsz(umem, \ - __mlx5_log_page_size_to_bitmap( \ - __mlx5_bit_sz(typ, log_pgsz_fld), \ - pgsz_shift), \ - iova) - static __always_inline unsigned long __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, unsigned int offset_shift) @@ -115,6 +104,19 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ page_offset_quantized) +static inline unsigned long +mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf) +{ + /* + * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able + * to hold any sgl after a move operation. Ideally the mkc page size + * could be changed at runtime to be optimal, but right now the driver + * cannot do that. + */ + return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE, + umem_dmabuf->umem.iova); +} + enum { MLX5_IB_MMAP_OFFSET_START = 9, MLX5_IB_MMAP_OFFSET_END = 255, @@ -274,6 +276,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_core_dev *mdev; atomic_t usecnt; u8 match_criteria_enable; + u32 ib_port; }; struct mlx5_ib_steering_anchor { @@ -291,6 +294,18 @@ enum mlx5_ib_optional_counter_type { MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS, MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS, MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES, + + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP, MLX5_IB_OPCOUNTER_MAX, }; @@ -305,6 +320,8 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; struct mlx5_flow_table *lag_demux_ft; + struct mlx5_ib_flow_prio *rdma_transport_rx; + struct mlx5_ib_flow_prio *rdma_transport_tx; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done @@ -334,6 +351,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_PD BIT(4) #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) +#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * @@ -341,7 +359,6 @@ struct mlx5_ib_flow_db { * rely on the range reserved for that use in the ib_qp_create_flags enum. 
*/ #define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START -#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) struct wr_list { u16 opcode; @@ -520,6 +537,7 @@ struct mlx5_ib_qp { struct mlx5_bf bf; u8 has_rq:1; u8 is_rss:1; + u8 is_ooo_rq:1; /* only for user space QPs. For kernel * we have it from the bf object @@ -628,6 +646,8 @@ enum mlx5_mkey_type { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, MLX5_MKEY_INDIRECT_DEVX, + MLX5_MKEY_NULL, + MLX5_MKEY_IMPLICIT_CHILD, }; struct mlx5r_cache_rb_key { @@ -643,9 +663,10 @@ struct mlx5_ib_mkey { unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; - /* User Mkey must hold either a rb_key or a cache_ent. */ + /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */ struct mlx5r_cache_rb_key rb_key; struct mlx5_cache_ent *cache_ent; + u8 cacheable : 1; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) @@ -664,11 +685,19 @@ struct mlx5_ib_mkey { #define mlx5_update_odp_stats(mr, counter_name, value) \ atomic64_add(value, &((mr)->odp_stats.counter_name)) +#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \ + do { \ + mlx5_update_odp_stats(mr, counter_name, value); \ + atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \ + } while (0) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -706,6 +735,11 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affilated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; + struct mlx5_ib_mkey null_mmkey; }; }; }; @@ -751,6 +785,8 @@ struct umr_common { */ struct mutex lock; unsigned int state; + /* Protects from repeat UMR QP creation */ + struct mutex init_lock; }; #define NUM_MKEYS_PER_PAGE \ @@ -781,6 +817,7 @@ struct mlx5_cache_ent { u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; + u8 tmp_cleanup_scheduled:1; /* * - limit is the low water mark for stored mkeys, 2* limit is the @@ -812,7 +849,6 @@ struct mlx5_mkey_cache { struct mutex rb_lock; struct dentry *fs_root; unsigned long last_add; - struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { @@ -820,13 +856,20 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; + struct mutex cq_lock; u32 xrcdn0; u32 xrcdn1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mutex srq_lock; struct mlx5_ib_port_resources ports[2]; }; @@ -856,6 +899,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type); +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port); + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter); + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter); + struct mlx5_ib_multiport_info; struct mlx5_ib_multiport { @@ -868,8 +919,6 @@ struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ - rwlock_t netdev_lock; - struct net_device *netdev; struct notifier_block nb; struct netdev_net_notifier nn; struct notifier_block mdev_nb; @@ -954,15 +1003,14 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, - 
MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, - MLX5_IB_STAGE_UAR, MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_RESTRACK, @@ -1114,7 +1162,11 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block lag_events; int num_ports; /* serialize update of capability mask */ @@ -1122,7 +1174,6 @@ struct mlx5_ib_dev { u8 ib_active:1; u8 is_rep:1; u8 lag_active:1; - u8 wc_support:1; u8 fill_delay; struct umr_common umrc; /* sync used page count stats @@ -1145,10 +1196,10 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; - struct mlx5_sq_bfreg wc_bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; const struct mlx5_ib_profile *profile; @@ -1170,10 +1221,15 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; #endif + + u8 num_plane; + struct mlx5_ib_dev *smi_dev; + const char *sub_dev_name; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1272,6 +1328,8 @@ to_mmmap(struct rdma_user_mmap_entry *rdma_entry) struct mlx5_user_mmap_entry, rdma_entry); } +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev); +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev); int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); @@ -1311,7 +1369,7 @@ int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, size_t buflen, size_t *bc); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); @@ -1324,7 +1382,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, @@ -1335,7 +1393,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -1404,6 +1461,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table 
*wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); @@ -1412,8 +1473,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev); -void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags); +int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags); int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, @@ -1434,8 +1495,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) { return 0; } -static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) {} +static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) +{ + return -EOPNOTSUPP; +} static inline int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, @@ -1511,6 +1575,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; extern const struct uapi_definition mlx5_ib_qos_defs[]; extern const struct uapi_definition mlx5_ib_std_types_defs[]; +extern const struct uapi_definition mlx5_ib_create_cq_defs[]; static inline int is_qp1(enum ib_qp_type qp_type) { @@ -1611,8 +1676,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); } -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); - static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) { /* @@ -1679,4 +1742,26 @@ static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev, int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr); + +static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) +{ + return (port - 1) / dev->num_ports + 1; +} + +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +static __always_inline unsigned long +mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, + u64 iova) +{ + int page_size_bits = + MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 
6 : 5; + unsigned long bitmap = + __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + + return ib_umem_find_best_pgsz(umem, bitmap, iova); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a8ee2ca1f4a1..6dd813bac5b2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -43,18 +43,22 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, }; +#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned long page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&ent->mkeys_queue.lock, flags); push_mkey_locked(ent, mkey_out->mkey); + ent->pending--; /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); kfree(mkey_out); } @@ -246,6 +250,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); MLX5_SET(mkc, mkc, access_mode_4_2, (ent->rb_key.access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->rb_key.access_mode, @@ -520,12 +525,27 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) ent->fill_to_high_water = false; if (ent->pending) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); else mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } } +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +{ + u32 mkey; + + spin_lock_irq(&ent->mkeys_queue.lock); + while (ent->mkeys_queue.ci) { + mkey = pop_mkey_locked(ent); + spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); + spin_lock_irq(&ent->mkeys_queue.lock); + } + ent->tmp_cleanup_scheduled = false; + spin_unlock_irq(&ent->mkeys_queue.lock); +} + static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; @@ -556,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) "add keys command failed, err %d\n", err); queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(1000)); + secs_to_jiffies(1)); } } } else if (ent->mkeys_queue.ci > 2 * ent->limit) { @@ -597,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work) struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); - __cache_work_func(ent); + /* temp entries are never filled, only cleaned */ + if (ent->is_tmp) + clean_keys(ent->dev, ent); + else + __cache_work_func(ent); } static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, @@ -641,10 +665,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, new = &((*new)->rb_left); if (cmp < 0) new = &((*new)->rb_right); - if (cmp == 0) { - mutex_unlock(&cache->rb_lock); + if (cmp == 0) return -EEXIST; - } } /* Add new node and rebalance tree. 
*/ @@ -660,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, { struct rb_node *node = dev->cache.rb_root.rb_node; struct mlx5_cache_ent *cur, *smallest = NULL; + u64 ndescs_limit; int cmp; /* @@ -678,17 +701,24 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, return cur; } + /* + * Limit the usage of mkeys larger than twice the required size while + * also allowing the usage of smallest cache entry for small MRs. + */ + ndescs_limit = max_t(u64, rb_key.ndescs * 2, + MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); + return (smallest && smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats) ? + smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.ndescs <= ndescs_limit) ? smallest : NULL; } static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent, - int access_flags) + struct mlx5_cache_ent *ent) { struct mlx5_ib_mr *mr; int err; @@ -719,6 +749,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, } mr->mmkey.cache_ent = ent; mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.rb_key = ent->rb_key; + mr->mmkey.cacheable = true; init_waitqueue_head(&mr->mmkey.wait); return mr; } @@ -761,22 +793,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, if (!ent) return ERR_PTR(-EOPNOTSUPP); - return _mlx5_mr_cache_alloc(dev, ent, access_flags); -} - -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - cancel_delayed_work(&ent->dwork); - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - spin_unlock_irq(&ent->mkeys_queue.lock); + return _mlx5_mr_cache_alloc(dev, ent); } static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) @@ -891,10 +908,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ent->limit = 0; mlx5_mkey_cache_debugfs_add_ent(dev, ent); - } else { - mod_delayed_work(ent->dev->cache.wq, - &ent->dev->cache.remove_ent_dwork, - msecs_to_jiffies(30 * 1000)); } return ent; @@ -905,33 +918,23 @@ mkeys_err: return ERR_PTR(ret); } -static void remove_ent_work_func(struct work_struct *work) +static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev) { - struct mlx5_mkey_cache *cache; + struct rb_root *root = &dev->cache.rb_root; struct mlx5_cache_ent *ent; - struct rb_node *cur; - - cache = container_of(work, struct mlx5_mkey_cache, - remove_ent_dwork.work); - mutex_lock(&cache->rb_lock); - cur = rb_last(&cache->rb_root); - while (cur) { - ent = rb_entry(cur, struct mlx5_cache_ent, node); - cur = rb_prev(cur); - mutex_unlock(&cache->rb_lock); - - spin_lock_irq(&ent->mkeys_queue.lock); - if (!ent->is_tmp) { - spin_unlock_irq(&ent->mkeys_queue.lock); - mutex_lock(&cache->rb_lock); - continue; - } - spin_unlock_irq(&ent->mkeys_queue.lock); + struct rb_node *node; - clean_keys(ent->dev, ent); - mutex_lock(&cache->rb_lock); + mutex_lock(&dev->cache.rb_lock); + node = rb_first(root); + while (node) { + ent = rb_entry(node, struct mlx5_cache_ent, node); + node = rb_next(node); + clean_keys(dev, ent); + rb_erase(&ent->node, root); + mlx5r_mkeys_uninit(ent); + kfree(ent); } - mutex_unlock(&cache->rb_lock); + mutex_unlock(&dev->cache.rb_lock); } int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) @@ -949,7 +952,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) 
mutex_init(&dev->slow_path_mutex); mutex_init(&dev->cache.rb_lock); dev->cache.rb_root = RB_ROOT; - INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -961,7 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_init(dev); mutex_lock(&cache->rb_lock); for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = 1 << (i + 2); + rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); if (IS_ERR(ent)) { ret = PTR_ERR(ent); @@ -986,6 +988,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) err: mutex_unlock(&cache->rb_lock); mlx5_mkey_cache_debugfs_cleanup(dev); + mlx5r_destroy_cache_entries(dev); + destroy_workqueue(cache->wq); mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); return ret; } @@ -1000,7 +1004,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) return; mutex_lock(&dev->cache.rb_lock); - cancel_delayed_work(&dev->cache.remove_ent_dwork); for (node = rb_first(root); node; node = rb_next(node)) { ent = rb_entry(node, struct mlx5_cache_ent, node); spin_lock_irq(&ent->mkeys_queue.lock); @@ -1020,20 +1023,10 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); /* At this point all entries are disabled and have no concurrent work. */ - mutex_lock(&dev->cache.rb_lock); - node = rb_first(root); - while (node) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - node = rb_next(node); - clean_keys(dev, ent); - rb_erase(&ent->node, root); - mlx5r_mkeys_uninit(ent); - kfree(ent); - } - mutex_unlock(&dev->cache.rb_lock); + mlx5r_destroy_cache_entries(dev); destroy_workqueue(dev->cache.wq); - del_timer_sync(&dev->delay_timer); + timer_delete_sync(&dev->delay_timer); } struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) @@ -1061,6 +1054,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) @@ -1125,24 +1119,22 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; - unsigned int page_size; + unsigned long page_size; if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, - 0, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1153,15 +1145,16 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = 
reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; mr->mmkey.rb_key = rb_key; + mr->mmkey.cacheable = true; return mr; } - mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); + mr = _mlx5_mr_cache_alloc(dev, ent); if (IS_ERR(mr)) return mr; @@ -1173,13 +1166,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned long page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1188,7 +1239,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1211,7 +1264,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1227,14 +1280,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? 
pd : dev->umrc.pd); + /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1371,13 +1432,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = mlx5_umem_find_best_pgsz( - umem, mkc, log_page_size, 0, iova); + unsigned long page_size = + mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1440,7 +1503,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1468,6 +1532,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem; + int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) return ERR_PTR(-EOPNOTSUPP); @@ -1475,6 +1540,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", start, iova, length, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); + if (access_flags & IB_ACCESS_ON_DEMAND) return create_user_odp_mr(pd, start, length, iova, access_flags, udata); @@ -1491,7 +1560,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); - if (!umem_dmabuf->sgt) + if (!umem_dmabuf->sgt || !mr) return; mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); @@ -1503,31 +1572,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; -struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, - u64 length, u64 virt_addr, - int fd, int access_flags, - struct ib_udata *udata) +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) { + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct 
mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - return ERR_PTR(-EOPNOTSUPP); - - mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); - /* dmabuf requires xlt update via umr to work. */ - if (!mlx5r_umr_can_load_pas(dev, length)) - return ERR_PTR(-EINVAL); + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", PTR_ERR(umem_dmabuf)); @@ -1535,7 +1604,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); + access_flags, access_mode); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1545,9 +1614,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } err = mlx5_ib_init_dmabuf_mr(mr); if (err) @@ -1555,10 +1628,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return &mr->ibmr; err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); + __mlx5_ib_dereg_mr(&mr->ibmr); return ERR_PTR(err); } +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. 
+ */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? ERR_PTR(ret) : crossing_mr; +} + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int mlx5_access_flags = 0; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); + + /* dmabuf requires xlt update via umr to work. */ + if (!mlx5r_umr_can_load_pas(dev, length)) + return ERR_PTR(-EINVAL); + + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); + + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); +} + /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. 
@@ -1570,7 +1734,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, unsigned int diffs = current_access_flags ^ target_access_flags; if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) + IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING | + IB_ACCESS_REMOTE_ATOMIC)) return false; return mlx5r_umr_can_reconfig(dev, current_access_flags, target_access_flags); @@ -1589,8 +1754,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = - mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1653,7 +1817,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1781,7 +1945,8 @@ err: static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && + mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1802,7 +1967,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, if (mr->mmkey.cache_ent) { spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - mr->mmkey.cache_ent->in_use--; goto end; } @@ -1835,7 +1999,89 @@ end: return ret; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + +static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; + bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && + !to_ib_umem_dmabuf(mr->umem)->pinned; + bool from_cache = !!ent; + int ret = 0; + + if (is_odp) + mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); + + if (is_odp_dma_buf) + dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL); + + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); + if (from_cache) + ent->in_use--; + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + secs_to_jiffies(30)); + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); + goto out; + } + + if (ent) { + spin_lock_irq(&ent->mkeys_queue.lock); + ent->in_use--; + mr->mmkey.cache_ent = NULL; + 
spin_unlock_irq(&ent->mkeys_queue.lock); + } + ret = destroy_mkey(dev, mr); +out: + if (is_odp) { + if (!ret) + to_ib_umem_odp(mr->umem)->private = NULL; + mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); + } + + if (is_odp_dma_buf) { + if (!ret) + to_ib_umem_dmabuf(mr->umem)->private = NULL; + dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); + } + + return ret; +} + +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1880,16 +2126,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) - if (mlx5r_umr_revoke_mr(mr) || - cache_ent_find_and_store(dev, mr)) - mr->mmkey.cache_ent = NULL; - - if (!mr->mmkey.cache_ent) { - rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); - if (rc) - return rc; - } + rc = mlx5_revoke_mr(mr); + if (rc) + return rc; if (mr->umem) { bool is_odp = is_odp_mr(mr); @@ -1909,9 +2148,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { + struct mlx5_ib_dev *dev = to_mdev(pd->device); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -1924,6 +2194,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, log_page_size, page_shift); + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); } static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a..eaa2f9f5f3a9 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,9 @@ #include <linux/kernel.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> +#include <linux/hmm.h> +#include <linux/hmm-dma.h> +#include <linux/pci-p2pdma.h> #include "mlx5_ib.h" #include "cmd.h" @@ -45,7 +48,7 @@ /* Contains the details of a pagefault. 
*/ struct mlx5_pagefault { u32 bytes_committed; - u32 token; + u64 token; u8 event_subtype; u8 type; union { @@ -74,6 +77,14 @@ struct mlx5_pagefault { u32 rdma_op_len; u64 rdma_va; } rdma; + struct { + u64 va; + u32 mkey; + u32 fault_byte_count; + u32 prefetch_before_byte_count; + u32 prefetch_after_byte_count; + u8 flags; + } memory; }; struct mlx5_ib_pf_eq *eq; @@ -99,13 +110,20 @@ static u64 mlx5_imr_ksm_entries; static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { + struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; struct mlx5_klm *end = pklm + nentries; + int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? + cpu_to_be32(imr->null_mmkey.key) : + mr_to_mdev(imr)->mkeys.null_mkey; + u64 va = + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } return; } @@ -129,7 +147,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -137,47 +155,56 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } } } -static u64 umem_dma_to_mtt(dma_addr_t umem_dma) -{ - u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; - - if (umem_dma & ODP_READ_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_READ; - if (umem_dma & ODP_WRITE_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_WRITE; - - return mtt_entry; -} - -static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) +static int populate_mtt(__be64 *pas, size_t start, size_t nentries, + struct mlx5_ib_mr *mr, int flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - dma_addr_t pa; + bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + struct pci_p2pdma_map_state p2pdma_state = {}; + struct ib_device *dev = odp->umem.ibdev; size_t i; if (flags & MLX5_IB_UPD_XLT_ZAP) - return; + return 0; for (i = 0; i < nentries; i++) { - pa = odp->dma_list[idx + i]; - pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + unsigned long pfn = odp->map.pfn_list[start + i]; + dma_addr_t dma_addr; + + pfn = odp->map.pfn_list[start + i]; + if (!(pfn & HMM_PFN_VALID)) + /* ODP initialization */ + continue; + + dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map, + start + i, &p2pdma_state); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EFAULT; + + dma_addr |= MLX5_IB_MTT_READ; + if ((pfn & HMM_PFN_WRITE) && !downgrade) + dma_addr |= MLX5_IB_MTT_WRITE; + + pas[i] = cpu_to_be64(dma_addr); + odp->npages++; } + return 0; } -void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, - struct mlx5_ib_mr *mr, int flags) +int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) { if (flags & MLX5_IB_UPD_XLT_INDIRECT) { populate_klm(xlt, idx, nentries, mr, 
flags); + return 0; } else { - populate_mtt(xlt, idx, nentries, mr, flags); + return populate_mtt(xlt, idx, nentries, mr, flags); } } @@ -213,10 +240,28 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *imr = mr->parent; + /* + * If userspace is racing freeing the parent implicit ODP MR then we can + * loose the race with parent destruction. In this case + * mlx5_ib_free_odp_mr() will free everything in the implicit_children + * xarray so NOP is fine. This child MR cannot be destroyed here because + * we are under its umem_mutex. + */ if (!refcount_inc_not_zero(&imr->mmkey.usecount)) return; - xa_erase(&imr->implicit_children, idx); + xa_lock(&imr->implicit_children); + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != + mr) { + xa_unlock(&imr->implicit_children); + mlx5r_deref_odp_mkey(&imr->mmkey); + return; + } + + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) + __xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); + xa_unlock(&imr->implicit_children); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -250,6 +295,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, if (!umem_odp->npages) goto out; mr = umem_odp->private; + if (!mr) + goto out; start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); @@ -268,15 +315,11 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem_odp->dma_list[idx] & - (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) { if (!in_block) { blk_start_idx = idx; in_block = 1; } - - /* Count page invalidations */ - invalidations += idx - blk_start_idx + 1; } else { u64 umr_offset = idx & umr_block_mask; @@ -286,16 +329,21 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; } } } - if (in_block) + if (in_block) { mlx5r_umr_update_xlt(mr, blk_start_idx, idx - blk_start_idx + 1, 0, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; + } - mlx5_update_odp_stats(mr, invalidations, invalidations); + mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); /* * We are now sure that the device will not access the @@ -332,46 +380,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) else dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, 
rc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && @@ -388,13 +436,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 
pfault->wqe.wq_num : pfault->token; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + void *info; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); - MLX5_SET(page_fault_resume_in, in, token, pfault->token); - MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); - MLX5_SET(page_fault_resume_in, in, error, !!error); + + if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.mem_page_fault_info); + MLX5_SET(mem_page_fault_info, info, fault_token_31_0, + pfault->token & 0xffffffff); + MLX5_SET(mem_page_fault_info, info, fault_token_47_32, + (pfault->token >> 32) & 0xffff); + MLX5_SET(mem_page_fault_info, info, error, !!error); + } else { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, + pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, + pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); + } err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) @@ -466,8 +530,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_inc(&ret->mmkey.usecount); goto out_lock; } - xa_unlock(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + __xa_erase(&imr->implicit_children, idx); + goto out_lock; + } + mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; + } + xa_unlock(&imr->implicit_children); mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -478,6 +552,57 @@ out_mr: return ret; } +/* + * When using memory scheme ODP, implicit MRs can't use the reserved null mkey + * and each implicit MR needs to assign a private null mkey to get the page + * faults on. + * The null mkey is created with the properties to enable getting the page + * fault for every time it is accessed and having all relevant access flags. 
+ */ +static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *imr, + struct mlx5_ib_pd *pd) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4); + MLX5_SET(create_mkey_in, in, pg_access, 1); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, translations_octword_size, 4); + MLX5_SET(mkc, mkc, log_page_size, 61); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, pd->pdn); + MLX5_SET64(mkc, mkc, start_addr, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen); + if (err) + goto free_in; + + imr->null_mmkey.type = MLX5_MKEY_NULL; + +free_in: + kfree(in); + return err; +} + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { @@ -510,6 +635,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->is_odp_implicit = true; xa_init(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + err = alloc_implicit_mr_null_mkey(dev, imr, pd); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey); + if (err) + goto out_mr; + } + err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, MLX5_KSM_PAGE_SHIFT, @@ -544,6 +679,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) xa_erase(&mr->implicit_children, idx); mlx5_ib_dereg_mr(&mtt->ibmr, NULL); } + + if (mr->null_mmkey.key) { + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->null_mmkey.key)); + + mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, + mr->null_mmkey.key); + } } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -555,7 +698,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, { int page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - u64 access_mask; + u64 access_mask = 0; u64 start_idx; bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; @@ -563,12 +706,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + if (flags & MLX5_PF_FLAGS_DOWNGRADE) + xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE; + page_shift = odp->page_shift; start_idx = (user_va - ib_umem_start(odp)) >> page_shift; - access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) - access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); if (np < 0) @@ -693,7 +838,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); u32 xlt_flags = 0; int err; - unsigned int page_size; + unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -705,14 +850,15 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, return err; } - page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc, - log_page_size, 0, - umem_dmabuf->umem.iova); - if (unlikely(page_size < PAGE_SIZE)) { + page_size = 
mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf); + if (!page_size) { ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); @@ -735,24 +881,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, * >0: Number of pages mapped */ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped, u32 flags, bool permissive_fault) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - if (unlikely(io_virt < mr->ibmr.iova)) + if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault) return -EFAULT; if (mr->umem->is_dmabuf) return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); if (!odp->is_implicit_odp) { + u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova; u64 user_va; - if (check_add_overflow(io_virt - mr->ibmr.iova, - (u64)odp->umem.address, &user_va)) + if (check_add_overflow(offset, (u64)odp->umem.address, + &user_va)) return -EFAULT; - if (unlikely(user_va >= ib_umem_end(odp) || - ib_umem_end(odp) - user_va < bcnt)) + + if (permissive_fault) { + if (user_va < ib_umem_start(odp)) + user_va = ib_umem_start(odp); + if ((user_va + bcnt) > ib_umem_end(odp)) + bcnt = ib_umem_end(odp) - user_va; + } else if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); @@ -799,11 +952,31 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) return mmkey->key == key; } +static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mmkey = ERR_PTR(-ENOENT); + goto out; + } + if (!mkey_is_eq(mmkey, key)) { + mmkey = ERR_PTR(-EFAULT); + goto out; + } + refcount_inc(&mmkey->usecount); +out: + xa_unlock(&dev->odp_mkeys); + + return mmkey; +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of OS pages retrieved on success. The caller may continue to - * the next data segment. + * Returns zero on success. The caller may continue to the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. The caller will abort handling the * page fault and resolve it. @@ -816,7 +989,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 *bytes_committed, u32 *bytes_mapped) { - int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0; + int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range; struct pf_frame *head = NULL, *frame; struct mlx5_ib_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -826,32 +999,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; - next_mr: - xa_lock(&dev->odp_mkeys); - mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); - if (!mmkey) { - xa_unlock(&dev->odp_mkeys); - mlx5_ib_dbg( - dev, - "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - /* - * The user could specify a SGL with multiple lkeys and only - * some of them are ODP. Treat the non-ODP ones as fully - * faulted. 
- */ - ret = 0; - goto end; - } - refcount_inc(&mmkey->usecount); - xa_unlock(&dev->odp_mkeys); - - if (!mkey_is_eq(mmkey, key)) { - mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); - ret = -EFAULT; + mmkey = find_odp_mkey(dev, key); + if (IS_ERR(mmkey)) { + ret = PTR_ERR(mmkey); + if (ret == -ENOENT) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and + * only some of them are ODP. Treat the non-ODP ones as + * fully faulted. + */ + ret = 0; + } goto end; } @@ -859,13 +1024,20 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) - + (io_virt & PAGE_MASK)) >> + PAGE_SHIFT; + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); + + if (ret < pages_in_range) { + ret = -EFAULT; + goto end; + } - npages += ret; ret = 0; break; @@ -946,7 +1118,7 @@ next_mr: } end: - if (mmkey) + if (!IS_ERR(mmkey)) mlx5r_deref_odp_mkey(mmkey); while (head) { frame = head; @@ -956,7 +1128,7 @@ end: kfree(out); *bytes_committed = 0; - return ret ? ret : npages; + return ret; } /* @@ -975,8 +1147,7 @@ end: * the committed bytes). * @receive_queue: receive WQE end of sg list * - * Returns the number of pages loaded if positive, zero for an empty WQE, or a - * negative error code. + * Returns zero for success or a negative error code. */ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, @@ -984,7 +1155,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, bool receive_queue) { - int ret = 0, npages = 0; + int ret = 0; u64 io_virt; __be32 key; u32 byte_count; @@ -1041,10 +1212,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, bytes_mapped); if (ret < 0) break; - npages += ret; } - return ret < 0 ? ret : npages; + return ret; } /* @@ -1268,7 +1438,7 @@ read_user: if (ret) mlx5_ib_err( dev, - "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n", ret, wqe_index, pfault->token); resolve_page_fault: @@ -1280,12 +1450,6 @@ resolve_page_fault: free_page((unsigned long)wqe_start); } -static int pages_in_range(u64 address, u32 length) -{ - return (ALIGN(address + length, PAGE_SIZE) - - (address & PAGE_MASK)) >> PAGE_SHIFT; -} - static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { @@ -1324,16 +1488,16 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; - } else if (ret < 0 || pages_in_range(address, length) > ret) { + } else if (ret < 0) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) - mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n", ret, pfault->token, pfault->type); return; } mlx5_ib_page_fault_resume(dev, pfault, 0); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + mlx5_ib_dbg(dev, "PAGE FAULT completed. 
QP 0x%llx, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1349,12 +1513,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, prefetch_len, &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { - mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); } } } +#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7) +static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 prefetch_va = + pfault->memory.va - pfault->memory.prefetch_before_byte_count; + size_t prefetch_size = pfault->memory.prefetch_before_byte_count + + pfault->memory.fault_byte_count + + pfault->memory.prefetch_after_byte_count; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr, *child_mr; + int ret = 0; + + mmkey = find_odp_mkey(dev, pfault->memory.mkey); + if (IS_ERR(mmkey)) + goto err; + + switch (mmkey->type) { + case MLX5_MKEY_IMPLICIT_CHILD: + child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + mr = child_mr->parent; + break; + case MLX5_MKEY_NULL: + mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey); + break; + default: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + break; + } + + /* If prefetch fails, handle only demanded page fault */ + ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); + if (ret < 0) { + ret = pagefault_mr(mr, pfault->memory.va, + pfault->memory.fault_byte_count, NULL, 0, + true); + if (ret < 0) + goto err; + } + + mlx5_update_odp_stats_with_handled(mr, faults, ret); + mlx5r_deref_odp_mkey(mmkey); + + if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) + mlx5_ib_page_fault_resume(dev, pfault, 0); + + mlx5_ib_dbg( + dev, + "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n", + pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ? + "" : + "without resume cmd", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count); + + return; + +err: + if (!IS_ERR(mmkey)) + mlx5r_deref_odp_mkey(mmkey); + mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_dbg( + dev, + "PAGE FAULT error. 
token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count, ret); +} + static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { u8 event_subtype = pfault->event_subtype; @@ -1366,6 +1598,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul case MLX5_PFAULT_SUBTYPE_RDMA: mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + mlx5_ib_mr_memory_pfault_handler(dev, pfault); + break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); @@ -1384,6 +1619,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work) mempool_free(pfault, eq->pool); } +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) { struct mlx5_eqe_page_fault *pf_eqe; @@ -1400,15 +1636,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pf_eqe = &eqe->data.page_fault; pfault->event_subtype = eqe->sub_type; - pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); - - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", - eqe->sub_type, pfault->bytes_committed); switch (eqe->sub_type) { case MLX5_PFAULT_SUBTYPE_RDMA: /* RDMA based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->rdma.bytes_committed); pfault->type = be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; pfault->token = @@ -1422,10 +1655,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be32_to_cpu(pf_eqe->rdma.rdma_op_len); pfault->rdma.rdma_va = be64_to_cpu(pf_eqe->rdma.rdma_va); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", - pfault->type, pfault->token, - pfault->rdma.r_key); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, + pfault->rdma.r_key); mlx5_ib_dbg(eq->dev, "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", pfault->rdma.rdma_op_len, @@ -1434,6 +1669,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->wqe.bytes_committed); pfault->type = (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = @@ -1445,11 +1682,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be16_to_cpu(pf_eqe->wqe.wqe_index); pfault->wqe.packet_size = be16_to_cpu(pf_eqe->wqe.packet_length); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", - pfault->type, pfault->token, - pfault->wqe.wq_num, - pfault->wqe.wqe_index); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + case MLX5_PFAULT_SUBTYPE_MEMORY: + /* Memory based event */ + pfault->bytes_committed = 0; + pfault->token = + be32_to_cpu(pf_eqe->memory.token31_0) | + ((u64)be16_to_cpu(pf_eqe->memory.token47_32) + << 32); + pfault->memory.va = be64_to_cpu(pf_eqe->memory.va); + pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey); + pfault->memory.fault_byte_count = (be32_to_cpu( + pf_eqe->memory.demand_fault_pages) >> 12) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + 
pfault->memory.prefetch_before_byte_count = + be16_to_cpu( + pf_eqe->memory.pre_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_after_byte_count = + be16_to_cpu( + pf_eqe->memory.post_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.flags = pf_eqe->memory.flags; + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n", + eqe->sub_type, pfault->token, + pfault->memory.mkey, + pfault->memory.fault_byte_count, + pfault->memory.va, pfault->memory.flags); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n", + pfault->memory.prefetch_before_byte_count, + pfault->memory.prefetch_after_byte_count); break; default: @@ -1712,7 +1985,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w) for (i = 0; i < work->num_sge; ++i) { ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, work->frags[i].length, &bytes_mapped, - work->pf_flags); + work->pf_flags, false); if (ret <= 0) continue; mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); @@ -1763,7 +2036,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, if (IS_ERR(mr)) return PTR_ERR(mr); ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, - &bytes_mapped, pf_flags); + &bytes_mapped, pf_flags, false); if (ret < 0) { mlx5r_deref_odp_mkey(&mr->mmkey); return ret; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8115ab107149..88724d15705d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1107,8 +1107,6 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev, if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) qp->bf.bfreg = &dev->fp_bfreg; - else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - qp->bf.bfreg = &dev->wc_bfreg; else qp->bf.bfreg = &dev->bfreg; @@ -1962,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask) } static int get_atomic_mode(struct mlx5_ib_dev *dev, - enum ib_qp_type qp_type) + struct mlx5_ib_qp *qp) { u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); @@ -1972,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, if (!atomic) return -EOPNOTSUPP; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); else atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); @@ -1986,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + /* OOO DP QPs do not support larger than 8-Bytes atomic operations */ + if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq) + atomic_mode = MLX5_ATOMIC_MODE_8B; + return atomic_mode; } @@ -2841,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type) +{ + if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force)) + return false; + + switch (qp_type) { + case IB_QPT_RC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc); + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc); + case IB_QPT_UC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc); + case IB_QPT_UD: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud); + case MLX5_IB_QPT_DCI: + case MLX5_IB_QPT_DCT: + return MLX5_CAP_GEN(mdev, 
dp_ordering_ooo_all_dc); + default: + return false; + } +} + static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, bool cond, struct mlx5_ib_qp *qp) { @@ -2959,14 +2984,6 @@ static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, return; } - if (flag == MLX5_IB_QP_CREATE_WC_TEST) { - /* - * Special case, if condition didn't meet, it won't be error, - * just different in-kernel flow. - */ - *flags &= ~MLX5_IB_QP_CREATE_WC_TEST; - return; - } mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag); } @@ -3027,8 +3044,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, IB_QP_CREATE_PCI_WRITE_END_PADDING, MLX5_CAP_GEN(mdev, end_pad), qp); - process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST, - qp_type != MLX5_IB_QPT_REG_UMR, qp); process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1, true, qp); @@ -3097,7 +3112,6 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, switch (qp->type) { case MLX5_IB_QPT_DCT: err = create_dct(dev, pd, qp, params); - rdma_restrack_no_track(&qp->ibqp.res); break; case MLX5_IB_QPT_DCI: err = create_dci(dev, pd, qp, params); @@ -3109,9 +3123,9 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = mlx5_ib_create_gsi(pd, qp, params->attr); break; case MLX5_IB_QPT_HW_GSI: - case MLX5_IB_QPT_REG_UMR: rdma_restrack_no_track(&qp->ibqp.res); fallthrough; + case MLX5_IB_QPT_REG_UMR: default: if (params->udata) err = create_user_qp(dev, pd, qp, params); @@ -3247,6 +3261,10 @@ int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, enum ib_qp_type type; int err; + err = mlx5_ib_dev_res_srq_init(dev); + if (err) + return err; + err = check_qp_type(dev, attr, &type); if (err) return err; @@ -3374,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, qp->type); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -3429,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate) return 0; } -static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) return 0; if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) @@ -3578,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, sizeof(grh->dgid.raw)); } - err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; MLX5_SET(ads, path, stat_rate, err); @@ -4226,7 +4244,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, /* todo implement counter_index functionality */ - if (is_sqp(qp->type)) + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI && is_qp0(qp->type)) { + MLX5_SET(ads, pri_path, vhca_port_num, + smi_to_native_portnum(dev, qp->port)); + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) + MLX5_SET(ads, pri_path, plane_index, qp->port); + } else if (is_sqp(qp->type)) MLX5_SET(ads, pri_path, vhca_port_num, qp->port); if (attr_mask & IB_QP_PORT) @@ -4272,14 +4295,14 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic) - MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic)); + MLX5_SET(qpc, qpc, log_sra_max, 
fls(attr->max_rd_atomic - 1)); if (attr_mask & IB_QP_SQ_PSN) MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic) MLX5_SET(qpc, qpc, log_rra_max, - ilog2(attr->max_dest_rd_atomic)); + fls(attr->max_dest_rd_atomic - 1)); if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc); @@ -4320,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) MLX5_SET(qpc, qpc, deth_sqpn, 1); + if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + MLX5_SET(qpc, qpc, dp_ordering_1, 1); + MLX5_SET(qpc, qpc, dp_ordering_force, 1); + } + mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -4535,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4551,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); + + qp->port = attr->port_num; } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; @@ -4577,6 +4607,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + if (qp->is_ooo_rq) { + MLX5_SET(dctc, dctc, dp_ordering_1, 1); + MLX5_SET(dctc, dctc, dp_ordering_force, 1); + } err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, @@ -4610,10 +4644,6 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR) return true; - /* Internal QP used for wc testing, with NOPs in wq */ - if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - return true; - return false; } @@ -4684,11 +4714,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, min(udata->inlen, sizeof(ucmd)))) return -EFAULT; - if (ucmd.comp_mask || + if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) { + if (!get_dp_ooo_cap(dev->mdev, qp->type)) + return -EOPNOTSUPP; + qp->is_ooo_rq = 1; + } } if (qp->type == IB_QPT_GSI) @@ -5041,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, } if (qp_attr_mask & IB_QP_PORT) - qp_attr->port_num = MLX5_GET(dctc, dctc, port); + qp_attr->port_num = mqp->port; if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); if (qp_attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index b6ee7c3ee1ca..2530e7730635 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); int mlx5_ib_qp_event_init(void); void mlx5_ib_qp_event_cleanup(void); +int 
mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate); #endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d9cf6982d645..146d03ae40bd 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) spin_lock_irqsave(&table->lock, flags); common = radix_tree_lookup(&table->tree, rsn); - if (common) + if (common && !common->invalid) refcount_inc(&common->refcount); + else + common = NULL; spin_unlock_irqrestore(&table->lock, flags); @@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev, return 0; } +static void modify_resource_common_state(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, + bool invalid) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + qp->common.invalid = invalid; + spin_unlock_irqrestore(&table->lock, flags); +} + static void destroy_resource_common(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { @@ -249,7 +263,8 @@ int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, if (err) goto err_cmd; - mlx5_debug_qp_add(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_add(dev->mdev, qp); return 0; @@ -307,7 +322,8 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; - mlx5_debug_qp_remove(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_remove(dev->mdev, qp); destroy_resource_common(dev, qp); @@ -504,7 +520,9 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev) spin_lock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); xa_init(&table->dct_xa); - mlx5_qp_debugfs_init(dev->mdev); + + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_init(dev->mdev); table->nb.notifier_call = rsc_event_notifier; mlx5_notifier_register(dev->mdev, &table->nb); @@ -517,7 +535,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) struct mlx5_qp_table *table = &dev->qp_table; mlx5_notifier_unregister(dev->mdev, &table->nb); - mlx5_qp_debugfs_cleanup(dev->mdev); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_cleanup(dev->mdev); } int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, @@ -604,8 +623,20 @@ err_destroy_rq: int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { + int ret; + + /* The rq destruction can be called again in case it fails, hence we + * mark the common resource as invalid and only once FW destruction + * is completed successfully we actually destroy the resources. 
+ */ + modify_resource_common_state(dev, rq, true); + ret = destroy_rq_tracked(dev, rq->qpn, rq->uid); + if (ret) { + modify_resource_common_state(dev, rq, false); + return ret; + } destroy_resource_common(dev, rq); - return destroy_rq_tracked(dev, rq->qpn, rq->uid); + return 0; } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c index 4ac429e72004..67841922c7b8 100644 --- a/drivers/infiniband/hw/mlx5/restrack.c +++ b/drivers/infiniband/hw/mlx5/restrack.c @@ -96,9 +96,18 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) atomic64_read(&mr->odp_stats.faults))) goto err_table; if (rdma_nl_stat_hwcounter_entry( + msg, "page_faults_handled", + atomic64_read(&mr->odp_stats.faults_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry( msg, "page_invalidations", atomic64_read(&mr->odp_stats.invalidations))) goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_invalidations_handled", + atomic64_read(&mr->odp_stats.invalidations_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch", atomic64_read(&mr->odp_stats.prefetch))) goto err_table; @@ -156,6 +165,34 @@ static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq) return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn); } +static int fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int ret; + + if (qp->type < IB_QPT_DRIVER) + return 0; + + switch (qp->type) { + case MLX5_IB_QPT_REG_UMR: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, + "REG_UMR"); + break; + case MLX5_IB_QPT_DCT: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCT"); + break; + case MLX5_IB_QPT_DCI: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCI"); + break; + default: + return 0; + } + if (ret) + return ret; + + return nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, IB_QPT_DRIVER); +} + static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -168,6 +205,7 @@ static const struct ib_device_ops restrack_ops = { .fill_res_cq_entry_raw = fill_res_cq_entry_raw, .fill_res_mr_entry = fill_res_mr_entry, .fill_res_mr_entry_raw = fill_res_mr_entry_raw, + .fill_res_qp_entry = fill_res_qp_entry, .fill_res_qp_entry_raw = fill_res_qp_entry_raw, .fill_stat_mr_entry = fill_stat_mr_entry, }; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a056ea835da5..bcb6b324af50 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -199,20 +199,27 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, int err; struct mlx5_srq_attr in = {}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + __u32 max_sge_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); if (init_attr->srq_type != IB_SRQT_BASIC && init_attr->srq_type != IB_SRQT_XRC && init_attr->srq_type != IB_SRQT_TM) return -EOPNOTSUPP; - /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= max_srq_wqes) { - mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", - init_attr->attr.max_wr, - max_srq_wqes); + /* Sanity check SRQ and sge size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes || + init_attr->attr.max_sge > max_sge_sz) { + mlx5_ib_dbg(dev, "max_wr %d,wr_cap %d,max_sge %d, sge_cap:%d\n", + init_attr->attr.max_wr, max_srq_wqes, + 
init_attr->attr.max_sge, max_sge_sz); return -EINVAL; } + err = mlx5_ib_dev_res_cq_init(dev); + if (err) + return err; + mutex_init(&srq->mutex); spin_lock_init(&srq->lock); srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84..bdb568411091 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include <linux/mlx5/eswitch.h> #include <linux/mlx5/vport.h> #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -111,6 +112,23 @@ out: return err; } +static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); + if (!mdev) + return -EINVAL; + + info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; + + mlx5_ib_put_native_port_mdev(dev, port_num); + + return 0; +} + static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_uapi_query_port *info) { @@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( ret = fill_switchdev_info(dev, port_num, &info); if (ret) return ret; + } else if (mlx5_core_mp_enabled(dev->mdev)) { + ret = fill_multiport_info(dev, port_num, &info); + if (ret) + return ret; } return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index e76142f6fa88..5be4426a2884 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -135,22 +135,28 @@ static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct 
ib_qp *qp) int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr init_attr = {}; - struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - int ret; + int ret = 0; - pd = ib_alloc_pd(&dev->ib_dev, 0); - if (IS_ERR(pd)) { - mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); - return PTR_ERR(pd); - } + + /* + * UMR qp is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (dev->umrc.qp) + return 0; + + mutex_lock(&dev->umrc.init_lock); + /* First user allocates the UMR resources. Skip if already allocated. */ + if (dev->umrc.qp) + goto unlock; cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); - goto destroy_pd; + goto unlock; } init_attr.send_cq = cq; @@ -160,7 +166,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) init_attr.cap.max_send_sge = 1; init_attr.qp_type = MLX5_IB_QPT_REG_UMR; init_attr.port_num = 1; - qp = ib_create_qp(pd, &init_attr); + qp = ib_create_qp(dev->umrc.pd, &init_attr); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); @@ -171,22 +177,22 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) if (ret) goto destroy_qp; - dev->umrc.qp = qp; dev->umrc.cq = cq; - dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); mutex_init(&dev->umrc.lock); dev->umrc.state = MLX5_UMR_STATE_ACTIVE; + dev->umrc.qp = qp; + mutex_unlock(&dev->umrc.init_lock); return 0; destroy_qp: ib_destroy_qp(qp); destroy_cq: ib_free_cq(cq); -destroy_pd: - ib_dealloc_pd(pd); +unlock: + mutex_unlock(&dev->umrc.init_lock); return ret; } @@ -194,36 +200,38 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) { if (dev->umrc.state == MLX5_UMR_STATE_UNINIT) return; + mutex_destroy(&dev->umrc.lock); + /* After device init, UMR cp/qp are not unset during the lifetime. 
*/ ib_destroy_qp(dev->umrc.qp); ib_free_cq(dev->umrc.cq); - ib_dealloc_pd(dev->umrc.pd); } -static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +int mlx5r_umr_init(struct mlx5_ib_dev *dev) { - struct umr_common *umrc = &dev->umrc; - struct ib_qp_attr attr; - int err; + struct ib_pd *pd; - attr.qp_state = IB_QPS_RESET; - err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); - if (err) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto err; + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + return PTR_ERR(pd); } + dev->umrc.pd = pd; - err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); - if (err) - goto err; + mutex_init(&dev->umrc.init_lock); - umrc->state = MLX5_UMR_STATE_ACTIVE; return 0; +} -err: - umrc->state = MLX5_UMR_STATE_ERR; - return err; +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) +{ + if (!dev->umrc.pd) + return; + + mutex_destroy(&dev->umrc.init_lock); + ib_dealloc_pd(dev->umrc.pd); } + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) { @@ -270,6 +278,61 @@ out: return err; } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, + struct mlx5r_umr_context *umr_context, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + mutex_lock(&umrc->lock); + /* Preventing any further WRs to be sent now */ + if (umrc->state != MLX5_UMR_STATE_RECOVER) { + mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", + umrc->state); + umrc->state = MLX5_UMR_STATE_RECOVER; + } + mutex_unlock(&umrc->lock); + + /* Sending a final/barrier WR (the failed one) and wait for its completion. + * This will ensure that all the previous WRs got a completion before + * we set the QP state to RESET. + */ + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, + with_data); + if (err) { + mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); + goto err; + } + + /* Since the QP is in an error state, it will only receive + * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier + * we don't care about its status. + */ + wait_for_completion(&umr_context->done); + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); + goto err; + } + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_umr_context *context = @@ -334,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5_ib_warn(dev, "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", umr_context.status, mkey); - mutex_lock(&umrc->lock); - err = mlx5r_umr_recover(dev); - mutex_unlock(&umrc->lock); + err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); if (err) mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", err); @@ -603,44 +664,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. 
- */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -652,23 +716,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -676,10 +748,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. 
+ */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); @@ -746,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, size_to_map = npages * desc_size; dma_sync_single_for_cpu(ddev, sg.addr, sg.length, DMA_TO_DEVICE); - mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + /* + * npages is the maximum number of pages to map, but we + * can't guarantee that all pages are actually mapped. + * + * For example, if page is p2p of type which is not supported + * for mapping, the number of pages mapped will be less than + * requested. + */ + err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + if (err) + return err; dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 3799bb758e49..4a02c9b5aad8 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -16,6 +16,9 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev); void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev); +int mlx5r_umr_init(struct mlx5_ib_dev *dev); +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev); + static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev, size_t length) { @@ -92,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c index ffb98eaaf1c2..6eabef9aa211 100644 --- a/drivers/infiniband/hw/mthca/mthca_catas.c +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -171,7 +171,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev) void mthca_stop_catas_poll(struct mthca_dev *dev) { - del_timer_sync(&dev->catas_err.timer); + timer_delete_sync(&dev->catas_err.timer); if (dev->catas_err.map) iounmap(dev->catas_err.map); diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 192f83fd7c8a..dacb8ceeebe0 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -144,7 +144,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) buddy->max_order = max_order; spin_lock_init(&buddy->lock); - buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), + buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits), GFP_KERNEL); buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, GFP_KERNEL); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index e1325f2927d6..6a1e2e79ddc3 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -574,8 +574,9 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) static int mthca_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct 
ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct mthca_create_cq ucmd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 5f831e3bdbad..0834416cb3f8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -46,7 +46,7 @@ static struct dentry *ocrdma_dbgfs_dir; -static int ocrdma_add_stat(char *start, char *pcur, +static noinline_for_stack int ocrdma_add_stat(char *start, char *pcur, char *name, u64 count) { char buff[128] = {0}; @@ -99,7 +99,7 @@ void ocrdma_release_stats_resources(struct ocrdma_dev *dev) kfree(mem->debugfs_mem); } -static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_resource_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -216,7 +216,7 @@ static char *ocrdma_resource_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -284,7 +284,7 @@ static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev) rx_stats->roce_frame_bytes_hi))/4; } -static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -358,7 +358,7 @@ static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev) tx_stats->read_rsp_bytes_hi))/4; } -static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_wqe_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -391,7 +391,7 @@ static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_db_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -412,7 +412,7 @@ static char *ocrdma_db_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -438,7 +438,7 @@ static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -462,7 +462,7 @@ static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -480,7 +480,7 @@ static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -498,7 +498,7 @@ static char *ocrdma_rx_dbg_stats(struct 
ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c849fdbd4c99..979de8f8df14 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -963,8 +963,9 @@ err: } int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index f860b7fcef33..0644346d8d98 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -70,7 +70,7 @@ int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index a51fc6854984..259303b9907c 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -447,7 +447,8 @@ qedr_addr4_resolve(struct qedr_dev *dev, struct rtable *rt = NULL; int rc = 0; - rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0); + rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0, + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) { DP_ERR(dev, "ip_route_output returned error\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f118ce0a9a61..568a5b18803f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -900,8 +900,9 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) } int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct qedr_ucontext *ctx = rdma_udata_to_drv_context( udata, struct qedr_ucontext, ibucontext); diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 081753df79ef..5731458abb06 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -52,7 +52,7 @@ int qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int qedr_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_dealloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int qedr_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 26c615772be3..8ee4edd7883c 100644 --- 
a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1359,7 +1359,6 @@ static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd) * sysfs interface. */ -extern const char ib_qib_version[]; extern const struct attribute_group qib_attr_group; extern const struct attribute_group *qib_attr_port_groups[]; diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index bf3fa12fe935..bdd724add147 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -44,12 +44,6 @@ #include "qib.h" -/* - * The size has to be longer than this string, so we can append - * board/chip information to it in the init code. - */ -const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; - DEFINE_MUTEX(qib_mutex); /* general driver use */ unsigned qib_ibmtu; @@ -774,7 +768,7 @@ int qib_reset_device(int unit) ppd = dd->pport + pidx; if (atomic_read(&ppd->led_override_timer_active)) { /* Need to stop LED timer, _then_ shut off LEDs */ - del_timer_sync(&ppd->led_override_timer); + timer_delete_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); } diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 455e966eeff3..2098de762bf5 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode = new_inode(dir->i_sb); if (!inode) { + dput(dentry); error = -EPERM; goto bail; } @@ -89,7 +90,7 @@ static int create_file(const char *name, umode_t mode, int error; inode_lock(d_inode(parent)); - *dentry = lookup_one_len(name, parent, strlen(name)); + *dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(*dentry)) error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); @@ -432,13 +433,14 @@ static int remove_device_files(struct super_block *sb, char unit[10]; snprintf(unit, sizeof(unit), "%u", dd->unit); - dir = lookup_one_len_unlocked(unit, sb->s_root, strlen(unit)); + dir = lookup_noperm_unlocked(&QSTR(unit), sb->s_root); if (IS_ERR(dir)) { pr_err("Lookup of %s failed\n", unit); return PTR_ERR(dir); } simple_recursive_removal(dir, NULL); + dput(dir); return 0; } diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 6af57067c32e..302c0d19f57d 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1656,7 +1656,7 @@ static void qib_7220_quiet_serdes(struct qib_pportdata *ppd) ppd->cpspec->chase_end = 0; if (ppd->cpspec->chase_timer.function) /* if initted */ - del_timer_sync(&ppd->cpspec->chase_timer); + timer_delete_sync(&ppd->cpspec->chase_timer); if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { @@ -2605,7 +2605,7 @@ static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) * wait forpending timer, but don't clear .data (ppd)! 
*/ if (ppd->cpspec->chase_timer.expires) { - del_timer_sync(&ppd->cpspec->chase_timer); + timer_delete_sync(&ppd->cpspec->chase_timer); ppd->cpspec->chase_timer.expires = 0; } break; @@ -3281,7 +3281,7 @@ static int qib_7220_intr_fallback(struct qib_devdata *dd) qib_free_irq(dd); dd->msi_lo = 0; - if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX) < 0) qib_dev_err(dd, "Failed to enable INTx\n"); qib_setup_7220_interrupt(dd); return 1; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index f93906d8fc09..7b4bf06c3b38 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2512,7 +2512,7 @@ static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd) ppd->cpspec->chase_end = 0; if (ppd->cpspec->chase_timer.function) /* if initted */ - del_timer_sync(&ppd->cpspec->chase_timer); + timer_delete_sync(&ppd->cpspec->chase_timer); /* * Despite the name, actually disables IBC as well. Do it when @@ -3471,8 +3471,7 @@ try_intx: pci_irq_vector(dd->pcidev, msixnum), ret); qib_7322_free_irq(dd); - pci_alloc_irq_vectors(dd->pcidev, 1, 1, - PCI_IRQ_LEGACY); + pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX); goto try_intx; } dd->cspec->msix_entries[msixnum].arg = arg; @@ -4240,7 +4239,7 @@ static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) * wait forpending timer, but don't clear .data (ppd)! */ if (ppd->cpspec->chase_timer.expires) { - del_timer_sync(&ppd->cpspec->chase_timer); + timer_delete_sync(&ppd->cpspec->chase_timer); ppd->cpspec->chase_timer.expires = 0; } break; @@ -5143,7 +5142,7 @@ static int qib_7322_intr_fallback(struct qib_devdata *dd) qib_devinfo(dd->pcidev, "MSIx interrupt not detected, trying INTx interrupts\n"); qib_7322_free_irq(dd); - if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX) < 0) qib_dev_err(dd, "Failed to enable INTx\n"); qib_setup_7322_interrupt(dd, 0); return 1; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 33667becd52b..33c23adec101 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -581,12 +581,9 @@ static int qib_create_workqueues(struct qib_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (!ppd->qib_wq) { - char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ - - snprintf(wq_name, sizeof(wq_name), "qib%d_%d", - dd->unit, pidx); - ppd->qib_wq = alloc_ordered_workqueue(wq_name, - WQ_MEM_RECLAIM); + ppd->qib_wq = alloc_ordered_workqueue("qib%d_%d", + WQ_MEM_RECLAIM, + dd->unit, pidx); if (!ppd->qib_wq) goto wq_error; } @@ -799,19 +796,19 @@ static void qib_stop_timers(struct qib_devdata *dd) int pidx; if (dd->stats_timer.function) - del_timer_sync(&dd->stats_timer); + timer_delete_sync(&dd->stats_timer); if (dd->intrchk_timer.function) - del_timer_sync(&dd->intrchk_timer); + timer_delete_sync(&dd->intrchk_timer); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (ppd->hol_timer.function) - del_timer_sync(&ppd->hol_timer); + timer_delete_sync(&ppd->hol_timer); if (ppd->led_override_timer.function) { - del_timer_sync(&ppd->led_override_timer); + timer_delete_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); } if (ppd->symerr_clear_timer.function) - del_timer_sync(&ppd->symerr_clear_timer); + 
timer_delete_sync(&ppd->symerr_clear_timer); } } diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index ef02f2bfddb2..568deb77ab4d 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -2441,7 +2441,7 @@ void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx) struct qib_devdata, verbs_dev); if (dd->pport[port_idx].cong_stats.timer.function) - del_timer_sync(&dd->pport[port_idx].cong_stats.timer); + timer_delete_sync(&dd->pport[port_idx].cong_stats.timer); if (dd->pport[port_idx].ibport_data.smi_ah) rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah, diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 47bf64ace05c..58c1d62d341b 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -210,7 +210,7 @@ int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent) } if (dd->flags & QIB_HAS_INTX) - flags |= PCI_IRQ_LEGACY; + flags |= PCI_IRQ_INTX; maxvec = (nent && *nent) ? *nent : 1; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, maxvec, flags); if (nvec < 0) diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 1dc3ccf0cf1f..c4ee120ac7fb 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -1375,7 +1375,7 @@ void toggle_7220_rclkrls(struct qib_devdata *dd) void shutdown_7220_relock_poll(struct qib_devdata *dd) { if (dd->cspec->relock_timer_active) - del_timer_sync(&dd->cspec->relock_timer); + timer_delete_sync(&dd->cspec->relock_timer); } static unsigned qib_relock_by_timer = 1; diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 41c272980f91..805e37dc7621 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -214,8 +214,8 @@ static const struct attribute_group port_linkcontrol_group = { * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -241,7 +241,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -249,8 +249,8 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. 
*/ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -274,16 +274,16 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_ccmgta_attributes[] = { +static const struct bin_attribute *const port_ccmgta_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL, }; static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj, - struct bin_attribute *attr, int n) + const struct bin_attribute *attr, int n) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); @@ -295,7 +295,7 @@ static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj, static const struct attribute_group port_ccmgta_attribute_group = { .name = "CCMgtA", .is_bin_visible = qib_ccmgta_is_bin_visible, - .bin_attrs = port_ccmgta_attributes, + .bin_attrs_new = port_ccmgta_attributes, }; /* Start sl2vl */ @@ -585,13 +585,7 @@ static ssize_t hca_type_show(struct device *device, static DEVICE_ATTR_RO(hca_type); static DEVICE_ATTR(board_id, 0444, hca_type_show, NULL); -static ssize_t version_show(struct device *device, - struct device_attribute *attr, char *buf) -{ - /* The string printed here is already newline-terminated. */ - return sysfs_emit(buf, "%s", (char *)ib_qib_version); -} -static DEVICE_ATTR_RO(version); +static DEVICE_STRING_ATTR_RO(version, 0444, QIB_DRIVER_VERSION); static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) @@ -721,7 +715,7 @@ static struct attribute *qib_attributes[] = { &dev_attr_hw_rev.attr, &dev_attr_hca_type.attr, &dev_attr_board_id.attr, - &dev_attr_version.attr, + &dev_attr_version.attr.attr, &dev_attr_nctxts.attr, &dev_attr_nfreectxts.attr, &dev_attr_serial.attr, diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 0080f0be72fe..9832567a8bb8 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1551,7 +1551,7 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->dev.parent = &dd->pcidev->dev; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), - "Intel Infiniband HCA %s", init_utsname()->nodename); + "Intel Infiniband HCA %.42s", init_utsname()->nodename); /* * Fill in rvt info object. 
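A side note on the %.42s change in the qib_register_ib_device() hunk above: the node_desc field of struct ib_device is a 64-byte array (IB_DEVICE_NODE_DESC_MAX in rdma/ib_verbs.h), and the fixed "Intel Infiniband HCA " prefix occupies 21 bytes, so capping the hostname at 42 bytes leaves exactly one byte for the terminating NUL and means snprintf() can never truncate against the buffer. The following user-space sketch only illustrates that arithmetic; the 64-byte size is taken from rdma/ib_verbs.h and the over-long hostname string is made up for the example.

#include <stdio.h>
#include <string.h>

#define IB_DEVICE_NODE_DESC_MAX 64	/* size of ib_device->node_desc in rdma/ib_verbs.h */

int main(void)
{
	char node_desc[IB_DEVICE_NODE_DESC_MAX];
	/* illustrative, deliberately over-long nodename */
	const char *nodename =
		"a-hostname-long-enough-to-overflow-a-sixty-four-byte-description-field";

	/* "Intel Infiniband HCA " is 21 bytes; 21 + 42 + 1 (NUL) == 64,
	 * so the %.42s precision bounds the copy before the buffer can. */
	int n = snprintf(node_desc, sizeof(node_desc),
			 "Intel Infiniband HCA %.42s", nodename);

	printf("stored %zu bytes, snprintf reported %d\n", strlen(node_desc), n);
	return 0;
}

Because the precision limits the argument itself, the formatted length (63 bytes here) always fits the 64-byte destination, which is why the bare %s could draw format-truncation warnings while %.42s cannot.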
@@ -1655,7 +1655,7 @@ void qib_unregister_ib_device(struct qib_devdata *dd) if (!list_empty(&dev->memwait)) qib_dev_err(dd, "memwait list not empty!\n"); - del_timer_sync(&dev->mem_timer); + timer_delete_sync(&dev->mem_timer); while (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; struct qib_verbs_txreq *tx; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 07548fac1d8e..408fe1ba74b9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,8 +303,6 @@ int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); -void qib_rc_rnr_retry(unsigned long arg); - void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); @@ -312,8 +310,6 @@ int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); -void mr_rcu_callback(struct rcu_head *list); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h index 7fe9502ce8d3..86a82a4da0aa 100644 --- a/drivers/infiniband/hw/usnic/usnic_abi.h +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -72,7 +72,7 @@ struct usnic_ib_create_qp_resp { u64 bar_bus_addr; u32 bar_len; /* - * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * WQ, RQ, CQ are explicitly specified bc exposing a generic resources inteface * expands the scope of ABI to many files. */ u32 wq_cnt; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 13b654ddd3cc..11eca39b73a9 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -151,34 +151,6 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, ib_event.element.port_num = 1; ib_dispatch_event(&ib_event); break; - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - if (!us_ibdev->ufdev->link_up && - netif_carrier_ok(netdev)) { - usnic_fwd_carrier_up(us_ibdev->ufdev); - usnic_info("Link UP on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - ib_event.event = IB_EVENT_PORT_ACTIVE; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else if (us_ibdev->ufdev->link_up && - !netif_carrier_ok(netdev)) { - usnic_fwd_carrier_down(us_ibdev->ufdev); - usnic_info("Link DOWN on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - usnic_ib_qp_grp_modify_active_to_err(us_ibdev); - ib_event.event = IB_EVENT_PORT_ERR; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else { - usnic_dbg("Ignoring %s on %s\n", - netdev_cmd_to_name(event), - dev_name(&us_ibdev->ib_dev.dev)); - } - break; case NETDEV_CHANGEADDR: if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, sizeof(us_ibdev->ufdev->mac))) { @@ -218,6 +190,50 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, mutex_unlock(&us_ibdev->usdev_lock); } +static void usnic_ib_handle_port_event(struct ib_device *ibdev, + struct net_device *netdev, + unsigned long event) +{ + struct usnic_ib_dev *us_ibdev = + container_of(ibdev, struct usnic_ib_dev, ib_dev); + struct ib_event ib_event; + + 
mutex_lock(&us_ibdev->usdev_lock); + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + netdev_cmd_to_name(event), + dev_name(&us_ibdev->ib_dev.dev)); + } + break; + default: + break; + } + mutex_unlock(&us_ibdev->usdev_lock); +} + static int usnic_ib_netdevice_event(struct notifier_block *notifier, unsigned long event, void *ptr) { @@ -358,6 +374,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_port = usnic_ib_query_port, .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, + .report_port_event = usnic_ib_handle_port_event, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_qp, usnic_ib_qp_grp, ibqp), @@ -380,7 +397,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) if (!us_ibdev) { usnic_err("Device %s context alloc failed\n", netdev_name(pci_get_drvdata(dev))); - return ERR_PTR(-EFAULT); + return NULL; } us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); @@ -500,8 +517,8 @@ static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) } us_ibdev = usnic_ib_device_add(parent_pci); - if (IS_ERR_OR_NULL(us_ibdev)) { - us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); + if (!us_ibdev) { + us_ibdev = ERR_PTR(-EFAULT); goto out; } @@ -569,10 +586,10 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, } pf = usnic_ib_discover_pf(vf->vnic); - if (IS_ERR_OR_NULL(pf)) { - usnic_err("Failed to discover pf of vnic %s with err%ld\n", - pci_name(pdev), PTR_ERR(pf)); - err = pf ? 
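usnic (above) and pvrdma (below) both move link-state handling out of their private netdev notifier paths and into the new ib_device_ops::report_port_event hook, which the RDMA core presumably invokes for netdev events on the bound port. A minimal sketch of the shape of such a callback, for a hypothetical single-port driver:

```c
#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>

static void foo_report_port_event(struct ib_device *ibdev,
				  struct net_device *ndev,
				  unsigned long event)
{
	struct ib_event ibev = {
		.device = ibdev,
		.element.port_num = 1,	/* single-port device assumed */
	};

	switch (event) {
	case NETDEV_UP:
		ibev.event = IB_EVENT_PORT_ACTIVE;
		ib_dispatch_event(&ibev);
		break;
	case NETDEV_DOWN:
		ibev.event = IB_EVENT_PORT_ERR;
		ib_dispatch_event(&ibev);
		break;
	default:
		break;
	}
}

static const struct ib_device_ops foo_dev_ops = {
	.report_port_event = foo_report_port_event,
};
```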
PTR_ERR(pf) : -EFAULT; + if (IS_ERR(pf)) { + err = PTR_ERR(pf); + usnic_err("Failed to discover pf of vnic %s with err%d\n", + pci_name(pdev), err); goto out_clean_vnic; } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6289238cc5af..217af34e82b3 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -577,7 +577,7 @@ out_unlock: } int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { if (attr->flags) return -EOPNOTSUPP; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 6ca9ee0dddbe..53f53f2d53be 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -56,7 +56,7 @@ int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 84e0f41e7dfa..3fbf99757b11 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -56,7 +56,7 @@ static int usnic_uiom_dma_fault(struct iommu_domain *domain, unsigned long iova, int flags, void *token) { - usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + usnic_err("Device %s iommu fault domain 0x%p va 0x%lx flags 0x%x\n", dev_name(dev), domain, iova, flags); return -ENOSYS; @@ -443,11 +443,11 @@ struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev) if (!pd) return ERR_PTR(-ENOMEM); - pd->domain = domain = iommu_domain_alloc(dev->bus); - if (!domain) { + pd->domain = domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) { usnic_err("Failed to allocate IOMMU domain"); kfree(pd); - return ERR_PTR(-ENOMEM); + return ERR_CAST(domain); } iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 6aa40bd2fd52..b3df6eb9b8ef 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -94,13 +94,14 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, * pvrdma_create_cq - create completion queue * @ibcq: Allocated CQ * @attr: completion queue attributes - * @udata: user data + * @attrs: bundle * * @return: 0 on success */ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct pvrdma_dev *dev = to_vdev(ibdev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5e88185171f..1664d1d7d969 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -143,6 +143,46 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u32 port_num, return 0; } +static void 
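The usnic_uiom.c hunk above switches from iommu_domain_alloc(dev->bus) to iommu_paging_domain_alloc(dev), which is keyed off the device and signals failure with ERR_PTR() rather than NULL. A minimal sketch of the allocate-and-attach flow under those semantics (foo_alloc_domain is a hypothetical helper):

```c
#include <linux/iommu.h>
#include <linux/err.h>

static struct iommu_domain *foo_alloc_domain(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_paging_domain_alloc(dev);
	if (IS_ERR(domain))
		return domain;	/* propagate the ERR_PTR */

	ret = iommu_attach_device(domain, dev);
	if (ret) {
		iommu_domain_free(domain);
		return ERR_PTR(ret);
	}

	return domain;
}
```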
pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, + enum ib_event_type event) +{ + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + ib_event.device = &dev->ib_dev; + ib_event.element.port_num = port; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +static void pvrdma_report_event_handle(struct ib_device *ibdev, + struct net_device *ndev, + unsigned long event) +{ + struct pvrdma_dev *dev = container_of(ibdev, struct pvrdma_dev, ib_dev); + + switch (event) { + case NETDEV_DOWN: + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); + break; + case NETDEV_UP: + pvrdma_write_reg(dev, PVRDMA_REG_CTL, + PVRDMA_DEVICE_CTL_UNQUIESCE); + + mb(); + + if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) + dev_err(&dev->pdev->dev, + "failed to activate device during link up\n"); + else + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); + break; + + default: + break; + } +} + static const struct ib_device_ops pvrdma_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_VMW_PVRDMA, @@ -181,6 +221,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .query_qp = pvrdma_query_qp, .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, + .report_port_event = pvrdma_report_event_handle, INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq), @@ -362,18 +403,6 @@ static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type) } } -static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, - enum ib_event_type event) -{ - struct ib_event ib_event; - - memset(&ib_event, 0, sizeof(ib_event)); - ib_event.device = &dev->ib_dev; - ib_event.element.port_num = port; - ib_event.event = event; - ib_dispatch_event(&ib_event); -} - static void pvrdma_dev_event(struct pvrdma_dev *dev, u8 port, int type) { if (port < 1 || port > dev->dsr->caps.phys_port_cnt) { @@ -531,7 +560,7 @@ static int pvrdma_alloc_intrs(struct pvrdma_dev *dev) PCI_IRQ_MSIX); if (ret < 0) { ret = pci_alloc_irq_vectors(pdev, 1, 1, - PCI_IRQ_MSI | PCI_IRQ_LEGACY); + PCI_IRQ_MSI | PCI_IRQ_INTX); if (ret < 0) return ret; } @@ -666,21 +695,8 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, switch (event) { case NETDEV_REBOOT: - case NETDEV_DOWN: pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); break; - case NETDEV_UP: - pvrdma_write_reg(dev, PVRDMA_REG_CTL, - PVRDMA_DEVICE_CTL_UNQUIESCE); - - mb(); - - if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) - dev_err(&dev->pdev->dev, - "failed to activate device during link up\n"); - else - pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); - break; case NETDEV_UNREGISTER: ib_device_set_netdev(&dev->ib_dev, NULL, 1); dev_put(dev->netdev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 9f54aa90a35a..bcd43dc30e21 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -237,34 +237,6 @@ enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props) -{ - unsigned long flags; - - if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | - IB_DEVICE_MODIFY_NODE_DESC)) { - dev_warn(&to_vdev(ibdev)->pdev->dev, - "unsupported device modify mask %#x\n", mask); - return -EOPNOTSUPP; - } - - if (mask & IB_DEVICE_MODIFY_NODE_DESC) { - spin_lock_irqsave(&to_vdev(ibdev)->desc_lock, flags); - memcpy(ibdev->node_desc, props->node_desc, 64); - 
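As in qib_pcie.c earlier, the pvrdma interrupt-allocation hunk above renames PCI_IRQ_LEGACY to PCI_IRQ_INTX, the current name for the INTx fallback flag. A minimal sketch of an MSI-X-first allocation with an INTx fallback, for a hypothetical probe routine:

```c
#include <linux/pci.h>

static int foo_alloc_irqs(struct pci_dev *pdev, unsigned int want)
{
	int nvec;

	nvec = pci_alloc_irq_vectors(pdev, 1, want, PCI_IRQ_MSIX);
	if (nvec < 0)
		nvec = pci_alloc_irq_vectors(pdev, 1, 1,
					     PCI_IRQ_MSI | PCI_IRQ_INTX);

	return nvec;	/* number of vectors granted, or a negative errno */
}
```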
spin_unlock_irqrestore(&to_vdev(ibdev)->desc_lock, flags); - } - - if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { - mutex_lock(&to_vdev(ibdev)->port_mutex); - to_vdev(ibdev)->sys_image_guid = - cpu_to_be64(props->sys_image_guid); - mutex_unlock(&to_vdev(ibdev)->port_mutex); - } - - return 0; -} - /** * pvrdma_modify_port - modify device port attributes * @ibdev: the device to modify diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 78807b23d831..fd47b0b1df5c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -356,8 +356,6 @@ int pvrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, u32 port); -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props); int pvrdma_modify_port(struct ib_device *ibdev, u32 port, int mask, struct ib_port_modify *props); int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); @@ -375,7 +373,7 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 82c3f5932249..0ca2743f1075 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -5,6 +5,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> +#include <rdma/uverbs_ioctl.h> #include "cq.h" #include "vt.h" #include "trace.h" @@ -149,15 +150,16 @@ static void send_complete(struct work_struct *work) * rvt_create_cq - create a completion queue * @ibcq: Allocated CQ * @attr: creation attributes - * @udata: user data for libibverbs.so + * @attrs: uverbs bundle * * Called by ib_create_cq() in the generic verbs code. 
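The create_cq verb now takes a struct uverbs_attr_bundle instead of a bare ib_udata; drivers that still want the udata pull it from attrs->driver_udata, as pvrdma_create_cq above and rvt_create_cq just below do. A minimal sketch of the converted entry point (foo_create_cq is hypothetical):

```c
#include <linux/errno.h>
#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>

static int foo_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
			 struct uverbs_attr_bundle *attrs)
{
	struct ib_udata *udata = &attrs->driver_udata;

	if (attr->flags)
		return -EOPNOTSUPP;

	/* Any driver-private request is still parsed from the udata;
	 * it is zero-sized when the caller passed nothing.
	 */
	if (udata->inlen) {
		/* ... ib_copy_from_udata() into the driver request ... */
	}

	return 0;
}
```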
* * Return: 0 on success */ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index d49b6d1a26cb..4028702a7b2f 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -10,7 +10,7 @@ #include <rdma/rdmavt_cq.h> int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 7a9afd5231d5..5ed5cfc2b280 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -348,13 +348,13 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, umem = ib_umem_get(pd->device, start, length, mr_access_flags); if (IS_ERR(umem)) - return (void *)umem; + return ERR_CAST(umem); n = ib_umem_num_pages(umem); mr = __rvt_alloc_mr(n, pd); if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; + ret = ERR_CAST(mr); goto bail_umem; } @@ -542,7 +542,7 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr = __rvt_alloc_mr(max_num_sg, pd); if (IS_ERR(mr)) - return (struct ib_mr *)mr; + return ERR_CAST(mr); return &mr->ibmr; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index e6203e26cc06..583debe4b9a2 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1107,9 +1107,8 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, } /* initialize timers needed for rc qp */ timer_setup(&qp->s_timer, rvt_rc_timeout, 0); - hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - qp->s_rnr_timer.function = rvt_rc_rnr_retry; + hrtimer_setup(&qp->s_rnr_timer, rvt_rc_rnr_retry, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Driver needs to set up it's private QP structure and do any @@ -1298,7 +1297,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); - del_timer(&qp->s_timer); + timer_delete(&qp->s_timer); } if (qp->s_flags & RVT_S_ANY_WAIT_SEND) @@ -2547,7 +2546,7 @@ void rvt_stop_rc_timers(struct rvt_qp *qp) /* Remove QP from all timers */ if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); - del_timer(&qp->s_timer); + timer_delete(&qp->s_timer); hrtimer_try_to_cancel(&qp->s_rnr_timer); } } @@ -2576,7 +2575,7 @@ static void rvt_stop_rnr_timer(struct rvt_qp *qp) */ void rvt_del_timers_sync(struct rvt_qp *qp) { - del_timer_sync(&qp->s_timer); + timer_delete_sync(&qp->s_timer); hrtimer_cancel(&qp->s_rnr_timer); } EXPORT_SYMBOL(rvt_del_timers_sync); @@ -2597,7 +2596,7 @@ static void rvt_rc_timeout(struct timer_list *t) qp->s_flags &= ~RVT_S_TIMER; rvp->n_rc_timeouts++; - del_timer(&qp->s_timer); + timer_delete(&qp->s_timer); trace_rvt_rc_timeout(qp, qp->s_last_psn + 1); if (rdi->driver_f.notify_restart_rc) rdi->driver_f.notify_restart_rc(qp, diff --git 
a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h index 4341965a5ea7..bdb6b9326b64 100644 --- a/drivers/infiniband/sw/rdmavt/trace.h +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -4,7 +4,7 @@ */ #define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi)) -#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev) #include "trace_rvt.h" #include "trace_qp.h" diff --git a/drivers/infiniband/sw/rdmavt/trace_rvt.h b/drivers/infiniband/sw/rdmavt/trace_rvt.h index df33c2ca9710..a00489e66ddf 100644 --- a/drivers/infiniband/sw/rdmavt/trace_rvt.h +++ b/drivers/infiniband/sw/rdmavt/trace_rvt.h @@ -24,7 +24,7 @@ TRACE_EVENT(rvt_dbg, ), TP_fast_assign( RDI_DEV_ASSIGN(rdi); - __assign_str(msg, msg); + __assign_str(msg); ), TP_printk("[%s]: %s", __get_str(dev), __get_str(msg)) ); diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index 06b8dc5093f7..1ed5b63f8afc 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,11 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" - depends on INET && PCI && INFINIBAND + depends on INET && PCI && INFINIBAND && 64BIT depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL - select CRYPTO - select CRYPTO_CRC32 + select CRC32 help This driver implements the InfiniBand RDMA transport over the Linux network stack. It enables a system with a diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile index 5395a581f4bb..93134f1d1d0c 100644 --- a/drivers/infiniband/sw/rxe/Makefile +++ b/drivers/infiniband/sw/rxe/Makefile @@ -23,3 +23,5 @@ rdma_rxe-y := \ rxe_task.o \ rxe_net.o \ rxe_hw_counters.o + +rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 255677bc12b2..3a77d6db1720 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -31,14 +31,11 @@ void rxe_dealloc(struct ib_device *ib_dev) WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); - if (rxe->tfm) - crypto_free_shash(rxe->tfm); - mutex_destroy(&rxe->usdev_lock); } /* initialize rxe device parameters */ -static void rxe_init_device_param(struct rxe_dev *rxe) +static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; @@ -71,10 +68,42 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + if (ndev->addr_len) { + memcpy(rxe->raw_gid, ndev->dev_addr, + min_t(unsigned int, ndev->addr_len, ETH_ALEN)); + } else { + /* + * This device does not have a HW address, but + * connection mangagement requires a unique gid. + */ + eth_random_addr(rxe->raw_gid); + } + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - rxe->ndev->dev_addr); + rxe->raw_gid); rxe->max_ucontext = RXE_MAX_UCONTEXT; + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; + + /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. 
*/ + rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; + + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; + } } /* initialize port attributes */ @@ -106,13 +135,13 @@ static void rxe_init_port_param(struct rxe_port *port) /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ -static void rxe_init_ports(struct rxe_dev *rxe) +static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; rxe_init_port_param(port); addrconf_addr_eui48((unsigned char *)&port->port_guid, - rxe->ndev->dev_addr); + rxe->raw_gid); spin_lock_init(&port->port_lock); } @@ -130,12 +159,12 @@ static void rxe_init_pools(struct rxe_dev *rxe) } /* initialize rxe device state */ -static void rxe_init(struct rxe_dev *rxe) +static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ - rxe_init_device_param(rxe); + rxe_init_device_param(rxe, ndev); - rxe_init_ports(rxe); + rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ @@ -167,12 +196,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) { - rxe_init(rxe); + rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); - return rxe_register_device(rxe, ibdev_name); + return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index d8fb2c7af30a..ff8cd53f5f28 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -21,7 +21,6 @@ #include <rdma/ib_umem.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> -#include <crypto/hash.h> #include "rxe_net.h" #include "rxe_opcode.h" @@ -100,46 +99,10 @@ #define rxe_info_mw(mw, fmt, ...) 
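The rxe_init_device_param()/rxe_init_ports() hunks above derive the GUIDs from a cached raw_gid instead of reaching through rxe->ndev: copy the netdev MAC if it has one, otherwise generate a random address, then expand it into a 64-bit EUI. A sketch of that derivation with a hypothetical helper:

```c
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <net/addrconf.h>

static void foo_init_port_guid(struct net_device *ndev, u8 raw_gid[ETH_ALEN],
			       __be64 *port_guid)
{
	if (ndev->addr_len)
		memcpy(raw_gid, ndev->dev_addr,
		       min_t(unsigned int, ndev->addr_len, ETH_ALEN));
	else
		/* No HW address: invent one so the GID is still unique. */
		eth_random_addr(raw_gid);

	/* Expand the 48-bit address into the 64-bit port GUID. */
	addrconf_addr_eui48((unsigned char *)port_guid, raw_gid);
}
```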
ibdev_info_ratelimited((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) -/* responder states */ -enum resp_states { - RESPST_NONE, - RESPST_GET_REQ, - RESPST_CHK_PSN, - RESPST_CHK_OP_SEQ, - RESPST_CHK_OP_VALID, - RESPST_CHK_RESOURCE, - RESPST_CHK_LENGTH, - RESPST_CHK_RKEY, - RESPST_EXECUTE, - RESPST_READ_REPLY, - RESPST_ATOMIC_REPLY, - RESPST_ATOMIC_WRITE_REPLY, - RESPST_PROCESS_FLUSH, - RESPST_COMPLETE, - RESPST_ACKNOWLEDGE, - RESPST_CLEANUP, - RESPST_DUPLICATE_REQUEST, - RESPST_ERR_MALFORMED_WQE, - RESPST_ERR_UNSUPPORTED_OPCODE, - RESPST_ERR_MISALIGNED_ATOMIC, - RESPST_ERR_PSN_OUT_OF_SEQ, - RESPST_ERR_MISSING_OPCODE_FIRST, - RESPST_ERR_MISSING_OPCODE_LAST_C, - RESPST_ERR_MISSING_OPCODE_LAST_D1E, - RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, - RESPST_ERR_RNR, - RESPST_ERR_RKEY_VIOLATION, - RESPST_ERR_INVALIDATE_RKEY, - RESPST_ERR_LENGTH, - RESPST_ERR_CQ_OVERFLOW, - RESPST_ERROR, - RESPST_DONE, - RESPST_EXIT, -}; - void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev); void rxe_rcv(struct sk_buff *skb); diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index b78b8c0856ab..d48af2180745 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -122,25 +122,16 @@ void retransmit_timer(struct timer_list *t) spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED); skb_queue_tail(&qp->resp_pkts, skb); - - must_sched = skb_queue_len(&qp->resp_pkts) > 1; - if (must_sched != 0) - rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); - - if (must_sched) - rxe_sched_task(&qp->comp.task); - else - rxe_run_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, @@ -325,7 +316,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } return COMPST_ERROR_RETRY; @@ -476,7 +467,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -515,7 +506,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -541,7 +532,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -654,6 +645,8 @@ int rxe_completer(struct rxe_qp *qp) int ret; unsigned long flags; + qp->req.again = 0; + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { @@ -737,7 +730,7 @@ int rxe_completer(struct rxe_qp *qp) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } state = COMPST_DONE; @@ -792,7 +785,7 @@ int rxe_completer(struct rxe_qp *qp) 
RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } goto done; @@ -843,8 +836,9 @@ done: ret = 0; goto out; exit: - ret = -EAGAIN; + ret = (qp->req.again) ? 0 : -EAGAIN; out: + qp->req.again = 0; if (pkt) free_pkt(pkt); return ret; diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index fec87c9030ab..fffd144d509e 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); - if (err) { - vfree(cq->queue->buf); - kfree(cq->queue); + if (err) return err; - } cq->is_user = uresp; diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 46f82b27fcd2..1f0322491d8c 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -234,7 +234,7 @@ static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; - bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); + bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c index a012522b577a..437917a7d8f2 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -14,7 +14,7 @@ static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", - [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred", + [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h index 71f4d4fa9dc8..051f9e1c3852 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -18,7 +18,7 @@ enum rxe_counters { RXE_CNT_RCV_RNR, RXE_CNT_SND_RNR, RXE_CNT_RCV_SEQ_ERR, - RXE_CNT_COMPLETER_SCHED, + RXE_CNT_SENDER_SCHED, RXE_CNT_RETRY_EXCEEDED, RXE_CNT_RNR_RETRY_EXCEEDED, RXE_CNT_COMP_RETRY, diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index fdf5f08cd8f1..76d760fbe7ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -10,28 +10,6 @@ #include "rxe_loc.h" /** - * rxe_icrc_init() - Initialize crypto function for computing crc32 - * @rxe: rdma_rxe device object - * - * Return: 0 on success else an error - */ -int rxe_icrc_init(struct rxe_dev *rxe) -{ - struct crypto_shash *tfm; - - tfm = crypto_alloc_shash("crc32", 0, 0); - if (IS_ERR(tfm)) { - rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - - rxe->tfm = tfm; - - return 0; -} - -/** * rxe_crc32() - Compute cumulative crc32 for a contiguous segment * @rxe: rdma_rxe device object * @crc: starting crc32 value from previous segments @@ -42,23 +20,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) */ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) { - __be32 icrc; - int err; - - SHASH_DESC_ON_STACK(shash, rxe->tfm); - - shash->tfm = rxe->tfm; - *(__be32 *)shash_desc_ctx(shash) = 
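The rxe_icrc.c hunk here drops the crypto_shash "crc32" transform and computes the ICRC with the plain library crc32_le(), matching the Kconfig switch from CRYPTO/CRYPTO_CRC32 to CRC32 earlier. The arithmetic reduces to a one-liner; a sketch of the per-segment fold (foo_crc32 is illustrative):

```c
#include <linux/types.h>
#include <linux/crc32.h>

/* Fold one contiguous segment into the running CRC.  The __force casts
 * mirror rxe's handling of the big-endian on-wire ICRC value.
 */
static __be32 foo_crc32(__be32 crc, const void *next, size_t len)
{
	return (__force __be32)crc32_le((__force u32)crc, next, len);
}
```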
crc; - err = crypto_shash_update(shash, next, len); - if (unlikely(err)) { - rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err); - return (__force __be32)crc32_le((__force u32)crc, next, len); - } - - icrc = *(__be32 *)shash_desc_ctx(shash); - barrier_data(shash_desc_ctx(shash)); - - return icrc; + return (__force __be32)crc32_le((__force u32)crc, next, len); } /** diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 746110898a0e..876702058c84 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -58,6 +58,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); /* rxe_mr.c */ u8 rxe_get_next_key(u32 last_key); +void rxe_mr_init(int access, struct rxe_mr *mr); void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, int access, struct rxe_mr *mr); @@ -69,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); @@ -80,6 +81,9 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); void rxe_mr_cleanup(struct rxe_pool_elem *elem); +/* defined in rxe_mr.c; used in rxe_mr.c and rxe_odp.c */ +extern spinlock_t atomic_ops_lock; + /* rxe_mw.c */ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); int rxe_dealloc_mw(struct ib_mw *ibmw); @@ -136,6 +140,12 @@ static inline int qp_mtu(struct rxe_qp *qp) return IB_MTU_4096; } +static inline bool is_odp_mr(struct rxe_mr *mr) +{ + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && + mr->umem->is_odp; +} + void free_rd_atomic_resource(struct resp_res *res); static inline void rxe_advance_resp_resource(struct rxe_qp *qp) @@ -164,10 +174,10 @@ void rxe_dealloc(struct ib_device *ib_dev); int rxe_completer(struct rxe_qp *qp); int rxe_requester(struct rxe_qp *qp); -int rxe_responder(struct rxe_qp *qp); +int rxe_sender(struct rxe_qp *qp); +int rxe_receiver(struct rxe_qp *qp); /* rxe_icrc.c */ -int rxe_icrc_init(struct rxe_dev *rxe); int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt); void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt); @@ -180,4 +190,47 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; } +/* rxe_odp.c */ +extern const struct mmu_interval_notifier_ops rxe_mn_ops; + +#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr); +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir); +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int 
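The rxe_loc.h block here uses the usual compile-out pattern for the new ODP support: real prototypes when CONFIG_INFINIBAND_ON_DEMAND_PAGING is set, static inline stubs otherwise, plus an IS_ENABLED() check in is_odp_mr() so callers can branch unconditionally and the compiler can drop the dead path. A generic sketch of that pattern with hypothetical names (CONFIG_FOO_FEATURE, foo_ctx):

```c
#include <linux/kconfig.h>
#include <linux/errno.h>
#include <linux/types.h>

struct foo_ctx {
	bool feature_requested;
};

#ifdef CONFIG_FOO_FEATURE
int foo_feature_op(struct foo_ctx *ctx);
#else
static inline int foo_feature_op(struct foo_ctx *ctx)
{
	return -EOPNOTSUPP;	/* stub when the feature is not built */
}
#endif

static inline bool foo_uses_feature(struct foo_ctx *ctx)
{
	/* IS_ENABLED() lets the compiler discard the feature branch
	 * entirely when the option is off.
	 */
	return IS_ENABLED(CONFIG_FOO_FEATURE) && ctx->feature_requested;
}
```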
rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length); +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int +rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, + int access_flags, struct rxe_mr *mr) +{ + return -EOPNOTSUPP; +} +static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states +rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, + u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fd..07ff47bae31d 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -31,10 +31,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_add(rxe->ndev, ll_addr); + ret = dev_mc_add(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** @@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_del(rxe->ndev, ll_addr); + ret = dev_mc_del(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index da3dee520876..bcb97b3ea58a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -45,7 +45,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) } } -static void rxe_mr_init(int access, struct rxe_mr *mr) +void rxe_mr_init(int access, struct rxe_mr *mr) { u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); @@ -323,7 +323,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, return err; } - return rxe_mr_copy_xarray(mr, iova, addr, length, dir); + if (is_odp_mr(mr)) + return rxe_odp_mr_copy(mr, iova, addr, length, dir); + else + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. 
sg list @@ -421,7 +424,7 @@ err1: return err; } -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { unsigned int page_offset; unsigned long index; @@ -430,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) int err; u8 *va; - /* mr must be valid even if length is zero */ - if (WARN_ON(!mr)) - return -EINVAL; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; - err = mr_check_range(mr, iova, length); if (err) return err; @@ -451,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) if (!page) return -EFAULT; bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + mr_page_size(mr) - page_offset); va = kmap_local_page(page); arch_wb_cache_pmem(va + page_offset, bytes); @@ -465,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) return 0; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) +{ + int err; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + if (is_odp_mr(mr)) + err = rxe_odp_flush_pmem_iova(mr, start, length); + else + err = rxe_mr_flush_pmem_iova(mr, start, length); + + return err; +} + /* Guarantee atomicity of atomic operations at the machine level. */ -static DEFINE_SPINLOCK(atomic_ops_lock); +DEFINE_SPINLOCK(atomic_ops_lock); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { unsigned int page_offset; struct page *page; @@ -521,23 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -#if defined CONFIG_64BIT -/* only implemented or called for 64 bit architectures */ -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; struct page *page; u64 *va; - /* See IBA oA19-28 */ - if (unlikely(mr->state != RXE_MR_STATE_VALID)) { - rxe_dbg_mr(mr, "mr not in valid state\n"); - return RESPST_ERR_RKEY_VIOLATION; - } - if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); @@ -565,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } va = kmap_local_page(page); - /* Do atomic write after all prior operations have completed */ smp_store_release(&va[page_offset >> 3], value); - kunmap_local(va); - return 0; -} -#else -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; + return RESPST_NONE; } -#endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index cd59666158b1..132a87e52d5c 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -345,46 +345,52 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, static void rxe_skb_tx_dtor(struct sk_buff *skb) { - struct sock *sk = skb->sk; - struct rxe_qp *qp = sk->sk_user_data; - int skb_out = atomic_dec_return(&qp->skb_out); + struct net_device *ndev = 
skb->dev; + struct rxe_dev *rxe; + unsigned int qp_index; + struct rxe_qp *qp; + int skb_out; + + rxe = rxe_get_dev_from_net(ndev); + if (!rxe && is_vlan_dev(ndev)) + rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); + if (WARN_ON(!rxe)) + return; - if (unlikely(qp->need_req_skb && - skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) - rxe_sched_task(&qp->req.task); + qp_index = (int)(uintptr_t)skb->sk->sk_user_data; + if (!qp_index) + return; + + qp = rxe_pool_get_index(&rxe->qp_pool, qp_index); + if (!qp) + goto put_dev; + + skb_out = atomic_dec_return(&qp->skb_out); + if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW) + rxe_sched_task(&qp->send_task); rxe_put(qp); +put_dev: + ib_device_put(&rxe->ib_dev); + sock_put(skb->sk); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; + struct sock *sk = pkt->qp->sk->sk; + sock_hold(sk); + skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; - skb->sk = pkt->qp->sk->sk; - - rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else if (skb->protocol == htons(ETH_P_IPV6)) { + else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else { - rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", - skb->protocol); - atomic_dec(&pkt->qp->skb_out); - rxe_put(pkt->qp); - kfree_skb(skb); - return -EINVAL; - } - - if (unlikely(net_xmit_eval(err))) { - rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); - return -EAGAIN; - } - return 0; + return err; } /* fix up a send packet to match the packets @@ -392,8 +398,15 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { + struct sock *sk = pkt->qp->sk->sk; + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + sock_hold(sk); + skb->sk = sk; + skb->destructor = rxe_skb_tx_dtor; + atomic_inc(&pkt->qp->skb_out); + if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else @@ -440,12 +453,6 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, return err; } - if ((qp_type(qp) != IB_QPT_RC) && - (pkt->mask & RXE_END_MASK)) { - pkt->wqe->state = wqe_state_done; - rxe_sched_task(&qp->comp.task); - } - rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; @@ -517,7 +524,16 @@ out: */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { - return rxe->ndev->name; + struct net_device *ndev; + char *ndev_name; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return NULL; + ndev_name = ndev->name; + dev_put(ndev); + + return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) @@ -529,9 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) if (!rxe) return -ENOMEM; - rxe->ndev = ndev; + ib_mark_name_assigned_by_user(&rxe->ib_dev); - err = rxe_add(rxe, ndev->mtu, ibdev_name); + err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; @@ -555,11 +571,6 @@ static void rxe_port_event(struct rxe_dev *rxe, /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { - struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_ACTIVE; - rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } @@ -567,11 +578,6 @@ void rxe_port_up(struct rxe_dev *rxe) /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { - 
struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_DOWN; - rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); @@ -579,10 +585,18 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + if (ib_get_curr_port_state(ndev) == IB_PORT_ACTIVE) rxe_port_up(rxe); else rxe_port_down(rxe); + + dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, @@ -599,18 +613,14 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; - case NETDEV_UP: - rxe_port_up(rxe); - break; - case NETDEV_DOWN: - rxe_port_down(rxe); - break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; + case NETDEV_DOWN: case NETDEV_CHANGE: - rxe_set_port_state(rxe); + if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN) + rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c new file mode 100644 index 000000000000..dbc5a5600eb7 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022-2023 Fujitsu Ltd. All rights reserved. + */ + +#include <linux/hmm.h> +#include <linux/libnvdimm.h> + +#include <rdma/ib_umem_odp.h> + +#include "rxe.h" + +static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); + unsigned long start, end; + + if (!mmu_notifier_range_blockable(range)) + return false; + + mutex_lock(&umem_odp->umem_mutex); + mmu_interval_set_seq(mni, cur_seq); + + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); + + /* update umem_odp->map.pfn_list */ + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); + + mutex_unlock(&umem_odp->umem_mutex); + return true; +} + +const struct mmu_interval_notifier_ops rxe_mn_ops = { + .invalidate = rxe_ib_invalidate_range, +}; + +#define RXE_PAGEFAULT_DEFAULT 0 +#define RXE_PAGEFAULT_RDONLY BIT(0) +#define RXE_PAGEFAULT_SNAPSHOT BIT(1) +static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); + u64 access_mask = 0; + int np; + + if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) + access_mask |= HMM_PFN_WRITE; + + /* + * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success. + * Callers must release the lock later to let invalidation handler + * do its work again. + */ + np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt, + access_mask, fault); + return np; +} + +static int rxe_odp_init_pages(struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int ret; + + ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address, + mr->umem->length, + RXE_PAGEFAULT_SNAPSHOT); + + if (ret >= 0) + mutex_unlock(&umem_odp->umem_mutex); + + return ret >= 0 ? 
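ib_umem_odp_map_dma_and_lock() returns with umem_mutex held on success, so callers that only want a snapshot (as rxe_odp_init_pages() here does at registration time) must drop the lock themselves. A hedged sketch of that prefault step, assuming an already-created umem_odp and mirroring the rxe flow rather than defining any new API:

```c
#include <rdma/ib_umem_odp.h>

static int foo_odp_prefault(struct ib_umem_odp *umem_odp)
{
	int np;

	/* access_mask 0 and fault=false: read-only snapshot of whatever
	 * pages are already present, no faulting.
	 */
	np = ib_umem_odp_map_dma_and_lock(umem_odp, umem_odp->umem.address,
					  umem_odp->umem.length, 0, false);
	if (np < 0)
		return np;	/* mutex is not held on failure */

	mutex_unlock(&umem_odp->umem_mutex);
	return 0;
}
```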
0 : ret; +} + +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return -EOPNOTSUPP; + + rxe_mr_init(access_flags, mr); + + if (!start && length == U64_MAX) { + if (iova != 0) + return -EINVAL; + if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return -EINVAL; + + /* Never reach here, for implicit ODP is not implemented. */ + } + + umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags, + &rxe_mn_ops); + if (IS_ERR(umem_odp)) { + rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n", + (int)PTR_ERR(umem_odp)); + return PTR_ERR(umem_odp); + } + + umem_odp->private = mr; + + mr->umem = &umem_odp->umem; + mr->access = access_flags; + mr->ibmr.length = length; + mr->ibmr.iova = iova; + mr->page_offset = ib_umem_offset(&umem_odp->umem); + + err = rxe_odp_init_pages(mr); + if (err) { + ib_umem_odp_release(umem_odp); + return err; + } + + mr->state = RXE_MR_STATE_VALID; + mr->ibmr.type = IB_MR_TYPE_USER; + + return err; +} + +static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova, + int length) +{ + bool need_fault = false; + u64 addr; + int idx; + + addr = iova & (~(BIT(umem_odp->page_shift) - 1)); + + /* Skim through all pages that are to be accessed. */ + while (addr < iova + length) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + + if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) { + need_fault = true; + break; + } + + addr += BIT(umem_odp->page_shift); + } + return need_fault; +} + +static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova) +{ + return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; +} + +static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova) +{ + return iova & (BIT(umem_odp->page_shift) - 1); +} + +static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool need_fault; + int err; + + if (unlikely(length < 1)) + return -EINVAL; + + mutex_lock(&umem_odp->umem_mutex); + + need_fault = rxe_check_pagefault(umem_odp, iova, length); + if (need_fault) { + mutex_unlock(&umem_odp->umem_mutex); + + /* umem_mutex is locked on success. */ + err = rxe_odp_do_pagefault_and_lock(mr, iova, length, + flags); + if (err < 0) + return err; + + need_fault = rxe_check_pagefault(umem_odp, iova, length); + if (need_fault) + return -EFAULT; + } + + return 0; +} + +static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + struct page *page; + int idx, bytes; + size_t offset; + u8 *user_va; + + idx = rxe_odp_iova_to_index(umem_odp, iova); + offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + while (length > 0) { + u8 *src, *dest; + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); + user_va = kmap_local_page(page); + if (!user_va) + return -EFAULT; + + src = (dir == RXE_TO_MR_OBJ) ? addr : user_va; + dest = (dir == RXE_TO_MR_OBJ) ? 
user_va : addr; + + bytes = BIT(umem_odp->page_shift) - offset; + if (bytes > length) + bytes = length; + + memcpy(dest, src, bytes); + kunmap_local(user_va); + + length -= bytes; + idx++; + offset = 0; + } + + return 0; +} + +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + u32 flags = RXE_PAGEFAULT_DEFAULT; + int err; + + if (length == 0) + return 0; + + if (unlikely(!mr->umem->is_odp)) + return -EOPNOTSUPP; + + switch (dir) { + case RXE_TO_MR_OBJ: + break; + + case RXE_FROM_MR_OBJ: + flags |= RXE_PAGEFAULT_RDONLY; + break; + + default: + return -EINVAL; + } + + err = rxe_odp_map_range_and_lock(mr, iova, length, flags); + if (err) + return err; + + err = __rxe_odp_mr_copy(mr, iova, addr, length, dir); + + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} + +static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, + int opcode, u64 compare, + u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + struct page *page; + unsigned int idx; + u64 value; + u64 *va; + int err; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + idx = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return RESPST_NONE; +} + +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int err; + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), + RXE_PAGEFAULT_DEFAULT); + if (err < 0) + return RESPST_ERR_RKEY_VIOLATION; + + err = rxe_odp_do_atomic_op(mr, iova, opcode, compare, swap_add, + orig_val); + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} + +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + err = rxe_odp_map_range_and_lock(mr, iova, length, + RXE_PAGEFAULT_DEFAULT); + if (err) + return err; + + while (length > 0) { + index = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return -EFAULT; + } + + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + 
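Both the MR and ODP atomic-write paths shown in this series use the same map-store-unmap shape: temporarily map the target page, release-store the 64-bit value, unmap. A minimal sketch of that shape (foo_write_qword is illustrative; the offset is assumed 8-byte aligned, as the IBA requires):

```c
#include <linux/types.h>
#include <linux/highmem.h>

static void foo_write_qword(struct page *page, unsigned int page_offset,
			    u64 value)
{
	u64 *va = kmap_local_page(page);

	/* Release semantics: the write becomes visible only after all
	 * prior operations have completed.
	 */
	smp_store_release(&va[page_offset >> 3], value);
	kunmap_local(va);
}
```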
mutex_unlock(&umem_odp->umem_mutex); + + return 0; +} + +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + int err; + u64 *va; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), + RXE_PAGEFAULT_DEFAULT); + if (err) + return RESPST_ERR_RKEY_VIOLATION; + + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + index = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return RESPST_ERR_RKEY_VIOLATION; + } + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + mutex_unlock(&umem_odp->umem_mutex); + rxe_dbg_mr(mr, "misaligned address\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + kunmap_local(va); + + mutex_unlock(&umem_odp->umem_mutex); + + return RESPST_NONE; +} diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index d2f57ead78ad..767870568372 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -53,12 +53,9 @@ enum rxe_device_param { | IB_DEVICE_MEM_WINDOW | IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT -#ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -#else - | IB_DEVICE_MEM_WINDOW_TYPE_2B, -#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, @@ -129,7 +126,7 @@ enum rxe_device_param { enum rxe_port_param { RXE_PORT_GID_TBL_LEN = 1024, RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, - RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_MAX_MSG_SZ = (1UL << 31), RXE_PORT_BAD_PKEY_CNTR = 0, RXE_PORT_QKEY_VIOL_CNTR = 0, RXE_PORT_LID = 0, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 6215c6de3a84..d9cb682fd71f 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -119,7 +119,7 @@ void rxe_pool_cleanup(struct rxe_pool *pool) int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable) { - int err; + int err = -EINVAL; gfp_t gfp_flags; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) @@ -147,7 +147,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, err_cnt: atomic_dec(&pool->num_elem); - return -EINVAL; + return err; } void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) @@ -178,7 +178,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) { struct rxe_pool *pool = elem->pool; struct xarray *xa = &pool->xa; - static int timeout = RXE_POOL_TIMEOUT; int ret, err = 0; void *xa_ret; @@ -202,19 +201,19 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) * return to rdma-core */ if (sleepable) { - if (!completion_done(&elem->complete) && timeout) { + if (!completion_done(&elem->complete)) { ret = wait_for_completion_timeout(&elem->complete, - timeout); + msecs_to_jiffies(50000)); /* Shouldn't happen. There are still references to * the object but, rather than deadlock, free the * object or pass back to rdma-core. 
*/ if (WARN_ON(!ret)) - err = -EINVAL; + err = -ETIMEDOUT; } } else { - unsigned long until = jiffies + timeout; + unsigned long until = jiffies + RXE_POOL_TIMEOUT; /* AH objects are unique in that the destroy_ah verb * can be called in atomic context. This delay @@ -226,7 +225,7 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) mdelay(1); if (WARN_ON(!completion_done(&elem->complete))) - err = -EINVAL; + err = -ETIMEDOUT; } if (pool->cleanup) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index e3589c02013e..f2af3e0aef35 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -244,7 +244,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; - qp->sk->sk->sk_user_data = qp; + qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index; /* pick a source UDP port number for this QP based on * the source QPN. this spreads traffic for different QPs @@ -265,8 +265,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - rxe_init_task(&qp->req.task, qp, rxe_requester); - rxe_init_task(&qp->comp.task, qp, rxe_completer); + rxe_init_task(&qp->send_task, qp, rxe_sender); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { @@ -337,7 +336,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, return err; } - rxe_init_task(&qp->resp.task, qp, rxe_responder); + rxe_init_task(&qp->recv_task, qp, rxe_receiver); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; @@ -514,14 +513,12 @@ err1: static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ - rxe_disable_task(&qp->resp.task); - rxe_disable_task(&qp->comp.task); - rxe_disable_task(&qp->req.task); + rxe_disable_task(&qp->recv_task); + rxe_disable_task(&qp->send_task); /* drain work and packet queuesc */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->rq.queue) rxe_queue_reset(qp->rq.queue); @@ -548,9 +545,8 @@ static void rxe_qp_reset(struct rxe_qp *qp) cleanup_rd_atomic_resources(qp); /* reenable tasks */ - rxe_enable_task(&qp->resp.task); - rxe_enable_task(&qp->comp.task); - rxe_enable_task(&qp->req.task); + rxe_enable_task(&qp->recv_task); + rxe_enable_task(&qp->send_task); } /* move the qp to the error state */ @@ -562,9 +558,8 @@ void rxe_qp_error(struct rxe_qp *qp) qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ - rxe_sched_task(&qp->resp.task); - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->recv_task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -575,8 +570,7 @@ static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -781,6 +775,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) * Yield the processor */ spin_lock_irqsave(&qp->state_lock, flags); + attr->cur_qp_state = qp_state(qp); if (qp->attr.sq_draining) { spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); @@ -816,24 +811,25 @@ static void rxe_qp_do_cleanup(struct work_struct *work) 
spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; - if (qp_type(qp) == IB_QPT_RC) { - del_timer_sync(&qp->retrans_timer); - del_timer_sync(&qp->rnr_nak_timer); + /* In the function timer_setup, .function is initialized. If .function + * is NULL, it indicates the function timer_setup is not called, the + * timer is not initialized. Or else, the timer is initialized. + */ + if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && + qp->rnr_nak_timer.function) { + timer_delete_sync(&qp->retrans_timer); + timer_delete_sync(&qp->rnr_nak_timer); } - if (qp->resp.task.func) - rxe_cleanup_task(&qp->resp.task); - - if (qp->req.task.func) - rxe_cleanup_task(&qp->req.task); + if (qp->recv_task.func) + rxe_cleanup_task(&qp->recv_task); - if (qp->comp.task.func) - rxe_cleanup_task(&qp->comp.task); + if (qp->send_task.func) + rxe_cleanup_task(&qp->send_task); /* flush out any receive wr's or pending requests */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index d8c41fd626a9..9d0392df8a92 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -5,7 +5,6 @@ */ #include <linux/skbuff.h> -#include <crypto/hash.h> #include "rxe.h" #include "rxe_loc.h" @@ -108,7 +107,7 @@ void rnr_nak_timer(struct timer_list *t) /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -424,7 +423,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, int paylen; int solicited; u32 qp_num; - int ack_req; + int ack_req = 0; /* length from start of bth to end of icrc */ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; @@ -445,8 +444,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, qp_num = (pkt->mask & RXE_DETH_MASK) ? 
ibwr->wr.ud.remote_qpn : qp->attr.dest_qp_num; - ack_req = ((pkt->mask & RXE_END_MASK) || - (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_UC) + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); if (ack_req) qp->req.noack_pkts = 0; @@ -545,6 +545,8 @@ static void update_wqe_state(struct rxe_qp *qp, if (pkt->mask & RXE_END_MASK) { if (qp_type(qp) == IB_QPT_RC) wqe->state = wqe_state_pending; + else + wqe->state = wqe_state_done; } else { wqe->state = wqe_state_processing; } @@ -573,30 +575,6 @@ static void update_wqe_psn(struct rxe_qp *qp, qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; } -static void save_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 *rollback_psn) -{ - rollback_wqe->state = wqe->state; - rollback_wqe->first_psn = wqe->first_psn; - rollback_wqe->last_psn = wqe->last_psn; - rollback_wqe->dma = wqe->dma; - *rollback_psn = qp->req.psn; -} - -static void rollback_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 rollback_psn) -{ - wqe->state = rollback_wqe->state; - wqe->first_psn = rollback_wqe->first_psn; - wqe->last_psn = rollback_wqe->last_psn; - wqe->dma = rollback_wqe->dma; - qp->req.psn = rollback_psn; -} - static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { qp->req.opcode = pkt->opcode; @@ -655,12 +633,6 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) wqe->status = IB_WC_SUCCESS; qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); - /* There is no ack coming for local work requests - * which can lead to a deadlock. So go ahead and complete - * it now. - */ - rxe_sched_task(&qp->comp.task); - return 0; } @@ -676,8 +648,6 @@ int rxe_requester(struct rxe_qp *qp) int opcode; int err; int ret; - struct rxe_send_wqe rollback_wqe; - u32 rollback_psn; struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; @@ -692,10 +662,12 @@ int rxe_requester(struct rxe_qp *qp) if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); spin_unlock_irqrestore(&qp->state_lock, flags); - if (wqe) + if (wqe) { + wqe->status = IB_WC_WR_FLUSH_ERR; goto err; - else + } else { goto exit; + } } if (unlikely(qp_state(qp) == IB_QPS_RESET)) { @@ -786,7 +758,6 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_sched_task(&qp->comp.task); goto done; } payload = mtu; @@ -799,9 +770,6 @@ int rxe_requester(struct rxe_qp *qp) pkt.mask = rxe_opcode[opcode].mask; pkt.wqe = wqe; - /* save wqe state before we build and send packet */ - save_state(wqe, qp, &rollback_wqe, &rollback_psn); - av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { rxe_dbg_qp(qp, "Failed no address vector\n"); @@ -834,31 +802,14 @@ int rxe_requester(struct rxe_qp *qp) if (ah) rxe_put(ah); - /* update wqe state as though we had sent it */ - update_wqe_state(qp, wqe, &pkt); - update_wqe_psn(qp, wqe, &pkt, payload); - err = rxe_xmit_packet(qp, &pkt, skb); if (err) { - if (err != -EAGAIN) { - wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; - } - - /* the packet was dropped so reset wqe to the state - * before we sent it so we can try to resend - */ - rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - - /* force a delay until the dropped packet is freed and - * the send queue is drained below the low water mark - */ - qp->need_req_skb = 1; - - rxe_sched_task(&qp->req.task); - goto exit; + 
wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; } + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); update_state(qp, &pkt); /* A non-zero return value will cause rxe_do_task to @@ -878,3 +829,20 @@ exit: out: return ret; } + +int rxe_sender(struct rxe_qp *qp) +{ + int req_ret; + int comp_ret; + + /* process the send queue */ + req_ret = rxe_requester(qp); + + /* process the response queue */ + comp_ret = rxe_completer(qp); + + /* exit the task loop if both requester and completer + * are ready + */ + return (req_ret && comp_ret) ? -EAGAIN : 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 963382f625d7..711f73e0bbb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -49,18 +49,8 @@ static char *resp_state_name[] = { /* rxe_recv calls here to add a request packet to the input queue */ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); - skb_queue_tail(&qp->req_pkts, skb); - - must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || - (skb_queue_len(&qp->req_pkts) > 1); - - if (must_sched) - rxe_sched_task(&qp->resp.task); - else - rxe_run_task(&qp->resp.task); + rxe_sched_task(&qp->recv_task); } static inline enum resp_states get_req(struct rxe_qp *qp, @@ -351,9 +341,22 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, /* * See IBA C9-92 * For UD QPs we only check if the packet will fit in the - * receive buffer later. For rmda operations additional + * receive buffer later. For RDMA operations additional * length checks are performed in check_rkey. */ + if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { + unsigned int payload = payload_size(pkt); + unsigned int recv_buffer_len = 0; + int i; + + for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) + recv_buffer_len += qp->resp.wqe->dma.sge[i].length; + if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) { + rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); + return RESPST_ERR_LENGTH; + } + } + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))) { unsigned int mtu = qp->mtu; @@ -699,10 +702,16 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, if (!res->replay) { u64 iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, - atmeth_comp(pkt), - atmeth_swap_add(pkt), - &res->atomic.orig_val); + if (is_odp_mr(mr)) + err = rxe_odp_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); + else + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); if (err) return err; @@ -740,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_write(mr, iova, value); + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (is_odp_mr(mr)) + err = rxe_odp_do_atomic_write(mr, iova, value); + else + err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; @@ -1485,7 +1503,7 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) qp->resp.wqe = NULL; } -int rxe_responder(struct rxe_qp *qp) +int rxe_receiver(struct rxe_qp *qp) { struct rxe_dev *rxe = 
to_rdev(qp->ibqp.device); enum resp_states state; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 80332638d9e3..6f8f353e9583 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task) /* do_task is a wrapper for the three tasks (requester, * completer, responder) and calls them in a loop until - * they return a non-zero value. It is called either - * directly by rxe_run_task or indirectly if rxe_sched_task - * schedules the task. They must call __reserve_if_idle to - * move the task to busy before calling or scheduling. - * The task can also be moved to drained or invalid - * by calls to rxe_cleanup_task or rxe_disable_task. - * In that case tasks which get here are not executed but - * just flushed. The tasks are designed to look to see if - * there is work to do and then do part of it before returning - * here with a return value of zero until all the work - * has been consumed then it returns a non-zero value. + * they return a non-zero value. It is called indirectly + * when rxe_sched_task schedules the task. They must + * call __reserve_if_idle to move the task to busy before + * calling or scheduling. The task can also be moved to + * drained or invalid by calls to rxe_cleanup_task or + * rxe_disable_task. In that case tasks which get here + * are not executed but just flushed. The tasks are + * designed to look to see if there is work to do and + * then do part of it before returning here with a return + * value of zero until all the work has been consumed then + * it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. * If the limit is hit and work remains the task is rescheduled. @@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); } -/* run the task inline if it is currently idle - * cannot call do_task holding the lock - */ -void rxe_run_task(struct rxe_task *task) -{ - unsigned long flags; - bool run; - - WARN_ON(rxe_read(task->qp) <= 0); - - spin_lock_irqsave(&task->lock, flags); - run = __reserve_if_idle(task); - spin_unlock_irqrestore(&task->lock, flags); - - if (run) - do_task(task); -} - /* schedule the task to run later as a work queue entry. * the queue_work call can be called holding * the lock. 
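The reworked do_task comment above spells out the contract of the rxe task machinery: a handler is invoked in a loop and returns zero for as long as it consumed some work, non-zero once everything is drained, and a per-invocation iteration cap forces a reschedule so one task cannot hold the CPU. A minimal, self-contained userspace sketch of that contract follows; all names and numbers here are illustrative stand-ins, not the kernel's rxe_task API.

/*
 * Illustrative model of the task-loop semantics described above: the
 * handler returns 0 while it made progress and non-zero when idle.
 * An iteration cap per scheduled run forces a "reschedule" (here just
 * another pass of the outer loop) instead of monopolizing the CPU.
 */
#include <stdio.h>

#define MAX_ITERATIONS 8		/* illustrative cap, not the kernel's value */

static int pending = 20;		/* pretend work items */

/* returns 0 if it made progress, non-zero when there is nothing left */
static int demo_handler(void)
{
	if (pending == 0)
		return -1;		/* no work: task goes idle */
	pending--;
	return 0;
}

/* one scheduled invocation of the task, loosely modelled on do_task() */
static int run_task_once(void)
{
	int iterations = 0;

	while (demo_handler() == 0) {
		if (++iterations >= MAX_ITERATIONS)
			return 1;	/* cap hit with work remaining: reschedule */
	}
	return 0;			/* all work consumed: go idle */
}

int main(void)
{
	int passes = 0;

	while (run_task_once())		/* keep rescheduling until idle */
		passes++;
	printf("drained in %d scheduled passes, %d items left\n",
	       passes + 1, pending);
	return 0;
}

With rxe_run_task() removed by this patch, the real driver always defers through rxe_sched_task(), so the handler only ever runs from workqueue context rather than inline in the caller.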
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index a63e258b3d66..a8c9a77b6027 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -void rxe_run_task(struct rxe_task *task); - void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 614581989b38..2331e698a65b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); + struct net_device *ndev; int err, ret; if (port_num != 1) { @@ -49,21 +50,29 @@ static int rxe_query_port(struct ib_device *ibdev, goto err_out; } + ndev = rxe_ib_device_get_netdev(ibdev); + if (!ndev) { + err = -ENODEV; + goto err_out; + } + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); + attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(rxe->ndev) & IFF_UP) + else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); + dev_put(ndev); return ret; err_out: @@ -71,6 +80,18 @@ err_out: return err; } +static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(gid->raw, rxe->raw_gid, ETH_ALEN); + + return 0; +} + static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { @@ -688,7 +709,7 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; - if (length > (1UL << 31)) { + if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } @@ -812,7 +833,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { - memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); + memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } @@ -888,6 +909,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, { int err = 0; unsigned long flags; + int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { @@ -895,18 +917,16 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (err) { *bad_wr = ibwr; break; + } else { + good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); - if (!err) - rxe_sched_task(&qp->req.task); - - spin_lock_irqsave(&qp->state_lock, flags); - if (qp_state(qp) == IB_QPS_ERR) - rxe_sched_task(&qp->comp.task); - spin_unlock_irqrestore(&qp->state_lock, flags); + /* kickoff processing of any posted wqes */ + if (good) + rxe_sched_task(&qp->send_task); return err; } @@ -936,7 +956,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } else { err = 
rxe_post_send_kernel(qp, wr, bad_wr); if (err) @@ -973,8 +993,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; - /* IBA max message size is 2^31 */ - if (length >= (1UL<<31)) { + if (length > RXE_PORT_MAX_MSG_SZ) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; @@ -1046,7 +1065,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) - rxe_sched_task(&qp->resp.task); + rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; @@ -1054,8 +1073,9 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); @@ -1278,7 +1298,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; - err = rxe_mr_init_user(rxe, start, length, access, mr); + if (access & IB_ACCESS_ON_DEMAND) + err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr); + else + err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; @@ -1425,9 +1448,16 @@ static const struct attribute_group rxe_attr_group = { static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(ib_dev); + if (!ndev) + return -ENODEV; rxe_set_port_state(rxe); - dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); + + dev_put(ndev); return 0; } @@ -1478,6 +1508,7 @@ static const struct ib_device_ops rxe_dev_ops = { .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, + .query_gid = rxe_query_gid, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, @@ -1495,7 +1526,8 @@ static const struct ib_device_ops rxe_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; @@ -1507,17 +1539,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, - rxe->ndev->dev_addr); + rxe->raw_gid); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); - err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); - if (err) - return err; - - err = rxe_icrc_init(rxe); + err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index ccb9d19ffe8a..fd48075810dd 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -113,7 +113,7 @@ struct rxe_req_info { int need_retry; int wait_for_rnr_timer; int noack_pkts; - struct 
rxe_task task; + int again; }; struct rxe_comp_info { @@ -124,7 +124,43 @@ struct rxe_comp_info { int started_retry; u32 retry_cnt; u32 rnr_retry; - struct rxe_task task; +}; + +/* responder states */ +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_INVALIDATE_RKEY, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_DONE, + RESPST_EXIT, }; enum rdatm_res_state { @@ -196,7 +232,6 @@ struct rxe_resp_info { unsigned int res_head; unsigned int res_tail; struct resp_res *res; - struct rxe_task task; }; struct rxe_qp { @@ -229,6 +264,9 @@ struct rxe_qp { struct sk_buff_head req_pkts; struct sk_buff_head resp_pkts; + struct rxe_task send_task; + struct rxe_task recv_task; + struct rxe_req_info req; struct rxe_comp_info comp; struct rxe_resp_info resp; @@ -369,14 +407,15 @@ struct rxe_port { u32 qp_gsi_index; }; +#define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; int max_ucontext; int max_inline_data; - struct mutex usdev_lock; + struct mutex usdev_lock; - struct net_device *ndev; + char raw_gid[ETH_ALEN]; struct rxe_pool uc_pool; struct rxe_pool pd_pool; @@ -402,9 +441,13 @@ struct rxe_dev { atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; struct rxe_port port; - struct crypto_shash *tfm; }; +static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) +{ + return ib_device_get_netdev(dev, RXE_PORT); +} + static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); @@ -470,6 +513,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) return to_rpd(mw->ibmw.pd); } -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev); #endif /* RXE_VERBS_H */ diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index 81b70a3eeb87..186f182b80e7 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -2,9 +2,8 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" depends on INET && INFINIBAND depends on INFINIBAND_VIRT_DMA - select LIBCRC32C - select CRYPTO - select CRYPTO_CRC32C + select CRC32 + select NET_CRC32C help This driver implements the iWARP RDMA transport over the Linux TCP/IP network stack. 
It enables a system with a diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 75253f2b3e3d..f5fd71717b80 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -10,9 +10,9 @@ #include <rdma/restrack.h> #include <linux/socket.h> #include <linux/skbuff.h> -#include <crypto/hash.h> #include <linux/crc32.h> #include <linux/crc32c.h> +#include <linux/unaligned.h> #include <rdma/siw-abi.h> #include "iwarp.h" @@ -46,6 +46,9 @@ */ #define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 +/* There is always only a port 1 per siw device */ +#define SIW_PORT 1 + struct siw_dev_cap { int max_qp; int max_qp_wr; @@ -69,16 +72,12 @@ struct siw_pd { struct siw_device { struct ib_device base_dev; - struct net_device *netdev; struct siw_dev_cap attrs; u32 vendor_part_id; int numa_node; char raw_gid[ETH_ALEN]; - /* physical port state (only one port per device) */ - enum ib_port_state state; - spinlock_t lock; struct xarray qp_xa; @@ -94,8 +93,6 @@ struct siw_device { atomic_t num_mr; atomic_t num_srq; atomic_t num_ctx; - - struct work_struct netdev_down; }; struct siw_ucontext { @@ -292,7 +289,8 @@ struct siw_rx_stream { union iwarp_hdr hdr; struct mpa_trailer trailer; - struct shash_desc *mpa_crc_hd; + u32 mpa_crc; + bool mpa_crc_enabled; /* * For each FPDU, main RX loop runs through 3 stages: @@ -393,7 +391,8 @@ struct siw_iwarp_tx { int burst; int bytes_unsent; /* ddp payload bytes */ - struct shash_desc *mpa_crc_hd; + u32 mpa_crc; + bool mpa_crc_enabled; u8 do_crc : 1; /* do crc for segment */ u8 use_sendpage : 1; /* send w/o copy */ @@ -499,7 +498,6 @@ extern u_char mpa_version; extern const bool peer_to_peer; extern struct task_struct *siw_tx_thread[]; -extern struct crypto_shash *siw_crypto_shash; extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1]; /* QP general functions */ @@ -671,29 +669,33 @@ static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp) return NULL; } -static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum) +static inline void siw_crc_init(u32 *crc) { - return (__force __wsum)crc32c((__force __u32)sum, buff, len); + *crc = ~0; } -static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset, - int len) +static inline void siw_crc_update(u32 *crc, const void *data, size_t len) { - return (__force __wsum)__crc32c_le_combine((__force __u32)csum, - (__force __u32)csum2, len); + *crc = crc32c(*crc, data, len); } -static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) +static inline void siw_crc_final(u32 *crc, u8 out[4]) { - const struct skb_checksum_ops siw_cs_ops = { - .update = siw_csum_update, - .combine = siw_csum_combine, - }; - __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd); + put_unaligned_le32(~*crc, out); +} - crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc, - &siw_cs_ops); - *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc; +static inline void siw_crc_oneshot(const void *data, size_t len, u8 out[4]) +{ + u32 crc; + + siw_crc_init(&crc); + siw_crc_update(&crc, data, len); + return siw_crc_final(&crc, out); +} + +static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) +{ + srx->mpa_crc = skb_crc32c(srx->skb, srx->skb_offset, len, srx->mpa_crc); } #define siw_dbg(ibdev, fmt, ...) \ @@ -716,7 +718,7 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) #define siw_dbg_cep(cep, fmt, ...) 
\ - ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ cep, __func__, ##__VA_ARGS__) void siw_cq_flush(struct siw_cq *cq); diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 86323918a570..708b13993fdf 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1759,6 +1759,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; + struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; @@ -1779,9 +1780,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ - if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { @@ -1797,9 +1804,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } /* For wildcard addr, limit binding to current device only */ - if (ipv6_addr_any(&laddr->sin6_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv6_addr_any(&laddr->sin6_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } @@ -1860,6 +1873,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; + dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); @@ -1879,6 +1893,7 @@ error: siw_cep_set_free_and_put(cep); } sock_release(s); + dev_put(ndev); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index f3c2226aff94..25b3c741b66b 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -72,7 +72,7 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) wc->opcode = map_wc_opcode[cqe->opcode]; wc->status = map_cqe_status[cqe->status].ib; siw_dbg_cq(cq, - "idx %u, type %d, flags %2x, id 0x%pK\n", + "idx %u, type %d, flags %2x, id 0x%p\n", cq->cq_get % cq->num_cqe, cqe->opcode, cqe->flags, (void *)(uintptr_t)cqe->id); } else { diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 61ad8ca3d1a2..5168307229a9 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -59,7 +59,6 @@ u_char mpa_version = MPA_REVISION_2; const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; -struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { @@ -287,7 +286,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) return NULL; base_dev = &sdev->base_dev; - sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, @@ -364,39 +362,6 @@ error: return NULL; } -/* - * Network link becomes unavailable. Mark all - * affected QP's accordingly. 
- */ -static void siw_netdev_down(struct work_struct *work) -{ - struct siw_device *sdev = - container_of(work, struct siw_device, netdev_down); - - struct siw_qp_attrs qp_attrs; - struct list_head *pos, *tmp; - - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.state = SIW_QP_STATE_ERROR; - - list_for_each_safe(pos, tmp, &sdev->qp_list) { - struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); - - down_write(&qp->state_lock); - WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); - up_write(&qp->state_lock); - } - ib_device_put(&sdev->base_dev); -} - -static void siw_device_goes_down(struct siw_device *sdev) -{ - if (ib_device_try_get(&sdev->base_dev)) { - INIT_WORK(&sdev->netdev_down, siw_netdev_down); - schedule_work(&sdev->netdev_down); - } -} - static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { @@ -413,20 +378,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, sdev = to_siw_dev(base_dev); switch (event) { - case NETDEV_UP: - sdev->state = IB_PORT_ACTIVE; - siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); - break; - - case NETDEV_GOING_DOWN: - siw_device_goes_down(sdev); - break; - - case NETDEV_DOWN: - sdev->state = IB_PORT_DOWN; - siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); - break; - case NETDEV_REGISTER: /* * Device registration now handled only by @@ -444,12 +395,8 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* - * Todo: Below netdev events are currently not handled. + * All other events are not handled */ - case NETDEV_CHANGEMTU: - case NETDEV_CHANGE: - break; - default: break; } @@ -479,12 +426,7 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); - - if (netif_running(netdev) && netif_carrier_ok(netdev)) - sdev->state = IB_PORT_ACTIVE; - else - sdev->state = IB_PORT_DOWN; - + ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); @@ -524,20 +466,7 @@ static __init int siw_init_module(void) rv = -ENOMEM; goto out_error; } - /* - * Locate CRC32 algorithm. If unsuccessful, fail - * loading siw only, if CRC is required. - */ - siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(siw_crypto_shash)) { - pr_info("siw: Loading CRC32c failed: %ld\n", - PTR_ERR(siw_crypto_shash)); - siw_crypto_shash = NULL; - if (mpa_crc_required) { - rv = -EOPNOTSUPP; - goto out_error; - } - } + rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; @@ -550,9 +479,6 @@ static __init int siw_init_module(void) out_error: siw_stop_tx_threads(); - if (siw_crypto_shash) - crypto_free_shash(siw_crypto_shash); - pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); @@ -573,9 +499,6 @@ static void __exit siw_exit_module(void) siw_destroy_cpulist(siw_cpu_info.num_nodes); - if (siw_crypto_shash) - crypto_free_shash(siw_crypto_shash); - pr_info("SoftiWARP detached\n"); } diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index dcb963607c8b..d5ddeb17bd22 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -18,30 +18,6 @@ #define SIW_STAG_MAX_INDEX 0x00ffffff /* - * The code avoids special Stag of zero and tries to randomize - * STag values between 1 and SIW_STAG_MAX_INDEX. 
- */ -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) -{ - struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX); - u32 id, next; - - get_random_bytes(&next, 4); - next &= SIW_STAG_MAX_INDEX; - - if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, - GFP_KERNEL) < 0) - return -ENOMEM; - - /* Set the STag index part */ - m->stag = id << 8; - - siw_dbg_mem(m, "new MEM object\n"); - - return 0; -} - -/* * siw_mem_id2obj() * * resolves memory from stag given by id. might be called from: @@ -181,10 +157,10 @@ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, */ if (addr < mem->va || addr + len > mem->va + mem->len) { siw_dbg_pd(pd, "MEM interval len %d\n", len); - siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n", + siw_dbg_pd(pd, "[0x%p, 0x%p] out of bounds\n", (void *)(uintptr_t)addr, (void *)(uintptr_t)(addr + len)); - siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n", + siw_dbg_pd(pd, "[0x%p, 0x%p] STag=0x%08x\n", (void *)(uintptr_t)mem->va, (void *)(uintptr_t)(mem->va + mem->len), mem->stag); diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h index e74cfcd6dbc1..8e769d30e2ac 100644 --- a/drivers/infiniband/sw/siw/siw_mem.h +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -12,7 +12,6 @@ void siw_umem_release(struct siw_umem *umem); struct siw_pbl *siw_pbl_alloc(u32 num_buf); dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); -int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); int siw_invalidate_stag(struct ib_pd *pd, u32 stag); int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, enum ib_access_flags perms, int len); diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index da92cfa2073d..c1e6e7d6e32f 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -226,33 +226,6 @@ static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size) return 0; } -static int siw_qp_enable_crc(struct siw_qp *qp) -{ - struct siw_rx_stream *c_rx = &qp->rx_stream; - struct siw_iwarp_tx *c_tx = &qp->tx_ctx; - int size; - - if (siw_crypto_shash == NULL) - return -ENOENT; - - size = crypto_shash_descsize(siw_crypto_shash) + - sizeof(struct shash_desc); - - c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); - c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); - if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) { - kfree(c_tx->mpa_crc_hd); - kfree(c_rx->mpa_crc_hd); - c_tx->mpa_crc_hd = NULL; - c_rx->mpa_crc_hd = NULL; - return -ENOMEM; - } - c_tx->mpa_crc_hd->tfm = siw_crypto_shash; - c_rx->mpa_crc_hd->tfm = siw_crypto_shash; - - return 0; -} - /* * Send a non signalled READ or WRITE to peer side as negotiated * with MPAv2 P2P setup protocol. 
The work request is only created @@ -583,20 +556,15 @@ void siw_send_terminate(struct siw_qp *qp) term->ctrl.mpa_len = cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE)); - if (qp->tx_ctx.mpa_crc_hd) { - crypto_shash_init(qp->tx_ctx.mpa_crc_hd); - if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, - (u8 *)iov[0].iov_base, - iov[0].iov_len)) - goto out; - + if (qp->tx_ctx.mpa_crc_enabled) { + siw_crc_init(&qp->tx_ctx.mpa_crc); + siw_crc_update(&qp->tx_ctx.mpa_crc, + iov[0].iov_base, iov[0].iov_len); if (num_frags == 3) { - if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, - (u8 *)iov[1].iov_base, - iov[1].iov_len)) - goto out; + siw_crc_update(&qp->tx_ctx.mpa_crc, + iov[1].iov_base, iov[1].iov_len); } - crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc); + siw_crc_final(&qp->tx_ctx.mpa_crc, (u8 *)&crc); } rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate); @@ -604,7 +572,6 @@ void siw_send_terminate(struct siw_qp *qp) rv == len_terminate ? "success" : "failure", __rdmap_term_layer(term), __rdmap_term_etype(term), __rdmap_term_ecode(term), rv); -out: kfree(term); kfree(err_hdr); } @@ -643,9 +610,10 @@ static int siw_qp_nextstate_from_idle(struct siw_qp *qp, switch (attrs->state) { case SIW_QP_STATE_RTS: if (attrs->flags & SIW_MPA_CRC) { - rv = siw_qp_enable_crc(qp); - if (rv) - break; + siw_crc_init(&qp->tx_ctx.mpa_crc); + qp->tx_ctx.mpa_crc_enabled = true; + siw_crc_init(&qp->rx_stream.mpa_crc); + qp->rx_stream.mpa_crc_enabled = true; } if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) { siw_dbg_qp(qp, "no socket\n"); diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index ed4fc39718b4..a10820e33887 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -38,7 +38,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, p = siw_get_upage(umem, dest_addr); if (unlikely(!p)) { - pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n", + pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", __func__, qp_id(rx_qp(srx)), (void *)(uintptr_t)dest_addr, (void *)(uintptr_t)umem->fp_addr); @@ -51,7 +51,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, pg_off = dest_addr & ~PAGE_MASK; bytes = min(len, (int)PAGE_SIZE - pg_off); - siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes); + siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); dest = kmap_atomic(p); rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, @@ -67,10 +67,10 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, return -EFAULT; } - if (srx->mpa_crc_hd) { + if (srx->mpa_crc_enabled) { if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) { - crypto_shash_update(srx->mpa_crc_hd, - (u8 *)(dest + pg_off), bytes); + siw_crc_update(&srx->mpa_crc, dest + pg_off, + bytes); kunmap_atomic(dest); } else { kunmap_atomic(dest); @@ -105,17 +105,17 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) { int rv; - siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len); + siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); if (unlikely(rv)) { - pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n", + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", qp_id(rx_qp(srx)), __func__, len, kva, rv); return rv; } - if (srx->mpa_crc_hd) - crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); + if (srx->mpa_crc_enabled) + siw_crc_update(&srx->mpa_crc, kva, len); srx->skb_offset += len; srx->skb_copied += len; @@ 
-966,16 +966,16 @@ static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) if (srx->fpdu_part_rem) return -EAGAIN; - if (!srx->mpa_crc_hd) + if (!srx->mpa_crc_enabled) return 0; if (srx->pad) - crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); + siw_crc_update(&srx->mpa_crc, tbuf, srx->pad); /* * CRC32 is computed, transmitted and received directly in NBO, * so there's never a reason to convert byte order. */ - crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); + siw_crc_final(&srx->mpa_crc, (u8 *)&crc_own); crc_in = (__force __wsum)srx->trailer.crc; if (unlikely(crc_in != crc_own)) { @@ -1093,13 +1093,12 @@ static int siw_get_hdr(struct siw_rx_stream *srx) * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, * but not by a READ RESPONSE etc. */ - if (srx->mpa_crc_hd) { + if (srx->mpa_crc_enabled) { /* * Restart CRC computation */ - crypto_shash_init(srx->mpa_crc_hd); - crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, - srx->fpdu_part_rcvd); + siw_crc_init(&srx->mpa_crc); + siw_crc_update(&srx->mpa_crc, c_hdr, srx->fpdu_part_rcvd); } if (frx->more_ddp_segs) { frx->first_ddp_seg = 0; diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 64ad9e0895bd..6432bce7d083 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -248,10 +248,8 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) /* * Do complete CRC if enabled and short packet */ - if (c_tx->mpa_crc_hd && - crypto_shash_digest(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, - c_tx->ctrl_len, (u8 *)crc) != 0) - return -EINVAL; + if (c_tx->mpa_crc_enabled) + siw_crc_oneshot(&c_tx->pkt, c_tx->ctrl_len, (u8 *)crc); c_tx->ctrl_len += MPA_CRC_SIZE; return PKT_COMPLETE; @@ -331,6 +329,8 @@ static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, msg.msg_flags &= ~MSG_MORE; tcp_rate_check_app_limited(sk); + if (!sendpage_ok(page[i])) + msg.msg_flags &= ~MSG_SPLICE_PAGES; bvec_set_page(&bvec, page[i], bytes, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); @@ -480,9 +480,8 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) iov[seg].iov_len = sge_len; if (do_crc) - crypto_shash_update(c_tx->mpa_crc_hd, - iov[seg].iov_base, - sge_len); + siw_crc_update(&c_tx->mpa_crc, + iov[seg].iov_base, sge_len); sge_off += sge_len; data_len -= sge_len; seg++; @@ -514,15 +513,14 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) iov[seg].iov_len = plen; if (do_crc) - crypto_shash_update( - c_tx->mpa_crc_hd, + siw_crc_update( + &c_tx->mpa_crc, iov[seg].iov_base, plen); } else if (do_crc) { kaddr = kmap_local_page(p); - crypto_shash_update(c_tx->mpa_crc_hd, - kaddr + fp_off, - plen); + siw_crc_update(&c_tx->mpa_crc, + kaddr + fp_off, plen); kunmap_local(kaddr); } } else { @@ -534,10 +532,9 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) page_array[seg] = ib_virt_dma_to_page(va); if (do_crc) - crypto_shash_update( - c_tx->mpa_crc_hd, - ib_virt_dma_to_ptr(va), - plen); + siw_crc_update(&c_tx->mpa_crc, + ib_virt_dma_to_ptr(va), + plen); } sge_len -= plen; @@ -574,14 +571,14 @@ sge_done: if (c_tx->pad) { *(u32 *)c_tx->trailer.pad = 0; if (do_crc) - crypto_shash_update(c_tx->mpa_crc_hd, - (u8 *)&c_tx->trailer.crc - c_tx->pad, - c_tx->pad); + siw_crc_update(&c_tx->mpa_crc, + (u8 *)&c_tx->trailer.crc - c_tx->pad, + c_tx->pad); } - if (!c_tx->mpa_crc_hd) + if (!c_tx->mpa_crc_enabled) c_tx->trailer.crc = 0; else if (do_crc) - 
crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); + siw_crc_final(&c_tx->mpa_crc, (u8 *)&c_tx->trailer.crc); data_len = c_tx->bytes_unsent; @@ -734,10 +731,9 @@ static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) /* * Init MPA CRC computation */ - if (c_tx->mpa_crc_hd) { - crypto_shash_init(c_tx->mpa_crc_hd); - crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, - c_tx->ctrl_len); + if (c_tx->mpa_crc_enabled) { + siw_crc_init(&c_tx->mpa_crc); + siw_crc_update(&c_tx->mpa_crc, &c_tx->pkt, c_tx->ctrl_len); c_tx->do_crc = 1; } } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index ecf0444666b4..2b2a7b8e93b0 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -171,21 +171,28 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr) { - struct siw_device *sdev = to_siw_dev(base_dev); + struct net_device *ndev; int rv; memset(attr, 0, sizeof(*attr)); rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, &attr->active_width); + if (rv) + return rv; + + ndev = ib_device_get_netdev(base_dev, SIW_PORT); + if (!ndev) + return -ENODEV; + attr->gid_tbl_len = 1; attr->max_msg_sz = -1; - attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->phys_state = sdev->state == IB_PORT_ACTIVE ? + attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu); + attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); + attr->state = ib_get_curr_port_state(ndev); + attr->phys_state = attr->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; - attr->state = sdev->state; /* * All zero * @@ -199,6 +206,7 @@ int siw_query_port(struct ib_device *base_dev, u32 port, * attr->subnet_timeout = 0; * attr->init_type_repy = 0; */ + dev_put(ndev); return rv; } @@ -505,21 +513,24 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct siw_qp *qp; - struct siw_device *sdev; + struct net_device *ndev; - if (base_qp && qp_attr && qp_init_attr) { + if (base_qp && qp_attr && qp_init_attr) qp = to_siw_qp(base_qp); - sdev = to_siw_dev(base_qp->device); - } else { + else return -EINVAL; - } + + ndev = ib_device_get_netdev(base_qp->device, SIW_PORT); + if (!ndev) + return -ENODEV; + qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; qp_attr->cap.max_inline_data = SIW_MAX_INLINE; qp_attr->cap.max_send_wr = qp->attrs.sq_size; qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; qp_attr->cap.max_recv_wr = qp->attrs.rq_size; qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; - qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); qp_attr->max_rd_atomic = qp->attrs.irq_size; qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; @@ -534,6 +545,7 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + dev_put(ndev); return 0; } @@ -619,9 +631,6 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) } up_write(&qp->state_lock); - kfree(qp->tx_ctx.mpa_crc_hd); - kfree(qp->rx_stream.mpa_crc_hd); - qp->scq = qp->rcq = NULL; siw_qp_put(qp); @@ -927,7 +936,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, rv = -EINVAL; 
break; } - siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", sqe->opcode, sqe->flags, (void *)(uintptr_t)sqe->id); @@ -1093,7 +1102,7 @@ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, siw_dbg_qp(qp, "error %d\n", rv); *bad_wr = wr; } - return rv > 0 ? 0 : rv; + return rv; } int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) @@ -1124,12 +1133,13 @@ int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) * * @base_cq: CQ as allocated by RDMA midlayer * @attr: Initial CQ attributes - * @udata: relates to user context + * @attrs: uverbs bundle */ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct siw_device *sdev = to_siw_dev(base_cq->device); struct siw_cq *cq = to_siw_cq(base_cq); int rv, size = attr->cqe; @@ -1322,7 +1332,7 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, struct siw_device *sdev = to_siw_dev(pd->device); int rv; - siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", + siw_dbg_pd(pd, "start: 0x%p, va: 0x%p, len: %llu\n", (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, (unsigned long long)len); @@ -1515,7 +1525,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, mem->len = base_mr->length; mem->va = base_mr->iova; siw_dbg_mem(mem, - "%llu bytes, start 0x%pK, %u SLE to %u entries\n", + "%llu bytes, start 0x%p, %u SLE to %u entries\n", mem->len, (void *)(uintptr_t)mem->va, num_sle, pbl->num_buf); } diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h index 4b57a4fb7237..1f1a305540af 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.h +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -43,7 +43,7 @@ int siw_get_port_immutable(struct ib_device *base_dev, u32 port, int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, struct ib_udata *udata); int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr); int siw_query_gid(struct ib_device *base_dev, u32 port, int idx, diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 963e936da5e3..91f866e3fb8b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -329,14 +329,6 @@ struct ipoib_dev_priv { unsigned long flags; - /* - * This protects access to the child_intfs list. - * To READ from child_intfs the RTNL or vlan_rwsem read side must be - * held. To WRITE RTNL and the vlan_rwsem write side must be held (in - * that order) This lock exists because we have a few contexts where - * we need the child_intfs, but do not want to grab the RTNL. - */ - struct rw_semaphore vlan_rwsem; struct mutex mcast_mutex; struct rb_root path_tree; @@ -399,6 +391,9 @@ struct ipoib_dev_priv { struct ib_event_handler event_handler; struct net_device *parent; + /* 'child_intfs' and 'list' membership of all child devices are + * protected by the netdev instance lock of 'dev'. 
+ */ struct list_head child_intfs; struct list_head list; int child_type; @@ -509,12 +504,12 @@ struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *format); int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format, struct net_device *dev); -void ipoib_ib_tx_timer_func(struct timer_list *t); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_queue_work(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level); void ipoib_ib_tx_timeout_work(struct work_struct *work); -void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); int ipoib_ib_dev_open_default(struct net_device *dev); @@ -533,7 +528,6 @@ void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_start_thread(struct net_device *dev); void ipoib_mcast_stop_thread(struct net_device *dev); -void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); @@ -610,7 +604,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf); void ipoib_setup_common(struct net_device *dev); -void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7da94fb8d7fa..4feb7170535c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -128,16 +128,13 @@ static void ipoib_get_ethtool_stats(struct net_device *dev, static void ipoib_get_strings(struct net_device __always_unused *dev, u32 stringset, u8 *data) { - u8 *p = data; int i; switch (stringset) { case ETH_SS_STATS: - for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { - memcpy(p, ipoib_gstrings_stats[i].stat_string, - ETH_GSTRING_LEN); - p += ETH_GSTRING_LEN; - } + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + ethtool_puts(&data, + ipoib_gstrings_stats[i].stat_string); break; default: break; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 5cde275daa94..10b0dbda6cd5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -40,6 +40,7 @@ #include <linux/ip.h> #include <linux/tcp.h> +#include <net/netdev_lock.h> #include <rdma/ib_cache.h> #include "ipoib.h" @@ -781,16 +782,20 @@ static void ipoib_napi_enable(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - napi_enable(&priv->recv_napi); - napi_enable(&priv->send_napi); + netdev_lock_ops_to_full(dev); + napi_enable_locked(&priv->recv_napi); + napi_enable_locked(&priv->send_napi); + netdev_unlock_full_to_ops(dev); } static void ipoib_napi_disable(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - napi_disable(&priv->recv_napi); - napi_disable(&priv->send_napi); + netdev_lock_ops_to_full(dev); + napi_disable_locked(&priv->recv_napi); + napi_disable_locked(&priv->send_napi); + netdev_unlock_full_to_ops(dev); } int ipoib_ib_dev_stop_default(struct net_device *dev) @@ -1172,24 +1177,11 @@ out: } static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, - enum ipoib_flush_level level, - int nesting) + enum ipoib_flush_level level) { - struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; int result; - 
down_read_nested(&priv->vlan_rwsem, nesting); - - /* - * Flush any child interfaces too -- they might be up even if - * the parent is down. - */ - list_for_each_entry(cpriv, &priv->child_intfs, list) - __ipoib_ib_dev_flush(cpriv, level, nesting + 1); - - up_read(&priv->vlan_rwsem); - if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && level != IPOIB_FLUSH_HEAVY) { /* Make sure the dev_addr is set even if not flushing */ @@ -1253,10 +1245,14 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, ipoib_ib_dev_down(dev); if (level == IPOIB_FLUSH_HEAVY) { + netdev_lock_ops(dev); if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) ipoib_ib_dev_stop(dev); - if (ipoib_ib_dev_open(dev)) + result = ipoib_ib_dev_open(dev); + netdev_unlock_ops(dev); + + if (result) return; if (netif_queue_stopped(dev)) @@ -1280,7 +1276,7 @@ void ipoib_ib_dev_flush_light(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_light); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); } void ipoib_ib_dev_flush_normal(struct work_struct *work) @@ -1288,7 +1284,7 @@ void ipoib_ib_dev_flush_normal(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_normal); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); } void ipoib_ib_dev_flush_heavy(struct work_struct *work) @@ -1297,10 +1293,35 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work) container_of(work, struct ipoib_dev_priv, flush_heavy); rtnl_lock(); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); rtnl_unlock(); } +void ipoib_queue_work(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) +{ + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + netdev_lock(priv->dev); + list_for_each_entry(cpriv, &priv->child_intfs, list) + ipoib_queue_work(cpriv, level); + netdev_unlock(priv->dev); + } + + switch (level) { + case IPOIB_FLUSH_LIGHT: + queue_work(ipoib_workqueue, &priv->flush_light); + break; + case IPOIB_FLUSH_NORMAL: + queue_work(ipoib_workqueue, &priv->flush_normal); + break; + case IPOIB_FLUSH_HEAVY: + queue_work(ipoib_workqueue, &priv->flush_heavy); + break; + } +} + void ipoib_ib_dev_cleanup(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6f2a688fccbf..f2f5465f2a90 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,6 +49,8 @@ #include <linux/jhash.h> #include <net/arp.h> #include <net/addrconf.h> +#include <net/netdev_lock.h> +#include <net/pkt_sched.h> #include <linux/inetdevice.h> #include <rdma/ib_cache.h> @@ -131,6 +133,52 @@ static int ipoib_netdev_event(struct notifier_block *this, } #endif +struct ipoib_ifupdown_work { + struct work_struct work; + struct net_device *dev; + netdevice_tracker dev_tracker; + bool up; +}; + +static void ipoib_ifupdown_task(struct work_struct *work) +{ + struct ipoib_ifupdown_work *pwork = + container_of(work, struct ipoib_ifupdown_work, work); + struct net_device *dev = pwork->dev; + unsigned int flags; + + rtnl_lock(); + flags = dev->flags; + if (pwork->up) + flags |= IFF_UP; + else + flags &= ~IFF_UP; + + if (dev->flags != flags) + dev_change_flags(dev, flags, NULL); + rtnl_unlock(); + netdev_put(dev, &pwork->dev_tracker); 
+ kfree(pwork); +} + +static void ipoib_schedule_ifupdown_task(struct net_device *dev, bool up) +{ + struct ipoib_ifupdown_work *work; + + if ((up && (dev->flags & IFF_UP)) || + (!up && !(dev->flags & IFF_UP))) + return; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return; + work->dev = dev; + netdev_hold(dev, &work->dev_tracker, GFP_KERNEL); + work->up = up; + INIT_WORK(&work->work, ipoib_ifupdown_task); + queue_work(ipoib_workqueue, &work->work); +} + int ipoib_open(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -153,17 +201,10 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - down_read(&priv->vlan_rwsem); - list_for_each_entry(cpriv, &priv->child_intfs, list) { - int flags; - - flags = cpriv->dev->flags; - if (flags & IFF_UP) - continue; - - dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); - } - up_read(&priv->vlan_rwsem); + netdev_lock_ops_to_full(dev); + list_for_each_entry(cpriv, &priv->child_intfs, list) + ipoib_schedule_ifupdown_task(cpriv->dev, true); + netdev_unlock_full_to_ops(dev); } else if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); @@ -198,17 +239,10 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - down_read(&priv->vlan_rwsem); - list_for_each_entry(cpriv, &priv->child_intfs, list) { - int flags; - - flags = cpriv->dev->flags; - if (!(flags & IFF_UP)) - continue; - - dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); - } - up_read(&priv->vlan_rwsem); + netdev_lock_ops_to_full(dev); + list_for_each_entry(cpriv, &priv->child_intfs, list) + ipoib_schedule_ifupdown_task(cpriv->dev, false); + netdev_unlock_full_to_ops(dev); } return 0; @@ -238,7 +272,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } @@ -265,7 +299,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) if (carrier_status) netif_carrier_on(dev); } else { - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); } return ret; @@ -329,8 +363,7 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); - if (master) - dev_hold(master); + dev_hold(master); rcu_read_unlock(); if (master) @@ -426,17 +459,20 @@ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, } } + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + return matches; + /* Check child interfaces */ - down_read_nested(&priv->vlan_rwsem, nesting); + netdev_lock(priv->dev); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, - pkey_index, addr, - nesting + 1, - found_net_dev); + pkey_index, addr, + nesting + 1, + found_net_dev); if (matches > 1) break; } - up_read(&priv->vlan_rwsem); + netdev_unlock(priv->dev); return matches; } @@ -531,9 +567,11 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); + netdev_lock_ops(dev); netdev_update_features(dev); - dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + netif_set_mtu(dev, ipoib_cm_max_mtu(dev)); netif_set_real_num_tx_queues(dev, 1); + netdev_unlock_ops(dev); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; @@ -543,9 
+581,11 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_lock_ops(dev); netdev_update_features(dev); - dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + netif_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); netif_set_real_num_tx_queues(dev, dev->num_tx_queues); + netdev_unlock_ops(dev); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; @@ -1212,6 +1252,7 @@ void ipoib_ib_tx_timeout_work(struct work_struct *work) int err; rtnl_lock(); + netdev_lock_ops(priv->dev); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) goto unlock; @@ -1226,6 +1267,7 @@ void ipoib_ib_tx_timeout_work(struct work_struct *work) netif_tx_wake_all_queues(priv->dev); unlock: + netdev_unlock_ops(priv->dev); rtnl_unlock(); } @@ -1992,9 +2034,9 @@ static int ipoib_ndo_init(struct net_device *ndev) dev_hold(priv->parent); - down_write(&ppriv->vlan_rwsem); + netdev_lock(priv->parent); list_add_tail(&priv->list, &ppriv->child_intfs); - up_write(&ppriv->vlan_rwsem); + netdev_unlock(priv->parent); } return 0; @@ -2004,8 +2046,6 @@ static void ipoib_ndo_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - ASSERT_RTNL(); - /* * ipoib_remove_one guarantees the children are removed before the * parent, and that is the only place where a parent can be removed. @@ -2015,9 +2055,9 @@ static void ipoib_ndo_uninit(struct net_device *dev) if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); - down_write(&ppriv->vlan_rwsem); + netdev_lock(ppriv->dev); list_del(&priv->list); - up_write(&ppriv->vlan_rwsem); + netdev_unlock(ppriv->dev); } ipoib_neigh_hash_uninit(dev); @@ -2146,7 +2186,7 @@ void ipoib_setup_common(struct net_device *dev) dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; - dev->tx_queue_len = ipoib_sendq_size * 2; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); netif_keep_dst(dev); @@ -2167,7 +2207,6 @@ static void ipoib_build_priv(struct net_device *dev) priv->dev = dev; spin_lock_init(&priv->lock); - init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); @@ -2372,10 +2411,10 @@ static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) netif_addr_unlock_bh(netdev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { - down_read(&priv->vlan_rwsem); + netdev_lock_ops_to_full(priv->dev); list_for_each_entry(child_priv, &priv->child_intfs, list) set_base_guid(child_priv, gid); - up_read(&priv->vlan_rwsem); + netdev_unlock_full_to_ops(priv->dev); } } @@ -2415,6 +2454,14 @@ static int ipoib_set_mac(struct net_device *dev, void *addr) set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + netdev_lock_ops_to_full(dev); + list_for_each_entry(cpriv, &priv->child_intfs, list) + queue_work(ipoib_workqueue, &cpriv->flush_light); + netdev_unlock_full_to_ops(dev); + } queue_work(ipoib_workqueue, &priv->flush_light); return 0; @@ -2526,7 +2573,7 @@ static struct net_device *ipoib_add_port(const char *format, ib_register_event_handler(&priv->event_handler); /* call event handler to ensure pkey in sync */ - queue_work(ipoib_workqueue, &priv->flush_heavy); + ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY); ndev->rtnl_link_ops = ipoib_get_link_ops(); @@ -2624,9 +2671,11 @@ static void ipoib_remove_one(struct 
ib_device *device, void *client_data) rtnl_lock(); + netdev_lock(priv->dev); list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) unregister_netdevice_queue(cpriv->dev, &head); + netdev_unlock(priv->dev); unregister_netdevice_queue(priv->dev, &head); unregister_netdevice_many(&head); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 9ad8d9856275..53db7c8191e3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -97,10 +97,13 @@ out_err: return ret; } -static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipoib_new_child_link(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct net_device *pdev; struct ipoib_dev_priv *ppriv; u16 child_pkey; @@ -109,7 +112,7 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + pdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 368e5d77416d..86983080d28b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -280,15 +280,15 @@ void ipoib_event(struct ib_event_handler *handler, dev_name(&record->device->dev), record->element.port_num); if (record->event == IB_EVENT_CLIENT_REREGISTER) { - queue_work(ipoib_workqueue, &priv->flush_light); + ipoib_queue_work(priv, IPOIB_FLUSH_LIGHT); } else if (record->event == IB_EVENT_PORT_ERR || record->event == IB_EVENT_PORT_ACTIVE || record->event == IB_EVENT_LID_CHANGE) { - queue_work(ipoib_workqueue, &priv->flush_normal); + ipoib_queue_work(priv, IPOIB_FLUSH_NORMAL); } else if (record->event == IB_EVENT_PKEY_CHANGE) { - queue_work(ipoib_workqueue, &priv->flush_heavy); + ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY); } else if (record->event == IB_EVENT_GID_CHANGE && !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { - queue_work(ipoib_workqueue, &priv->flush_light); + ipoib_queue_work(priv, IPOIB_FLUSH_LIGHT); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 4bd161e86f8d..243e8f555eca 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -53,8 +53,7 @@ static bool is_child_unique(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *tpriv; - - ASSERT_RTNL(); + bool result = true; /* * Since the legacy sysfs interface uses pkey for deletion it cannot @@ -73,13 +72,17 @@ static bool is_child_unique(struct ipoib_dev_priv *ppriv, if (ppriv->pkey == priv->pkey) return false; + netdev_lock(ppriv->dev); list_for_each_entry(tpriv, &ppriv->child_intfs, list) { if (tpriv->pkey == priv->pkey && - tpriv->child_type == IPOIB_LEGACY_CHILD) - return false; + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = false; + break; + } } + netdev_unlock(ppriv->dev); - return true; + return result; } /* @@ -98,8 +101,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, int result; struct rdma_netdev *rn = netdev_priv(ndev); - 
ASSERT_RTNL(); - /* * We do not need to touch priv if register_netdevice fails, so just * always use this flow. @@ -184,8 +185,12 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ppriv = ipoib_priv(pdev); - snprintf(intf_name, sizeof(intf_name), "%s.%04x", - ppriv->dev->name, pkey); + /* If you increase IFNAMSIZ, update snprintf below + * to allow longer names. + */ + BUILD_BUG_ON(IFNAMSIZ != 16); + snprintf(intf_name, sizeof(intf_name), "%.10s.%04x", ppriv->dev->name, + pkey); ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); if (IS_ERR(ndev)) { @@ -263,6 +268,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) ppriv = ipoib_priv(pdev); rc = -ENODEV; + netdev_lock(ppriv->dev); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { @@ -274,9 +280,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) goto out; } - down_write(&ppriv->vlan_rwsem); list_del_init(&priv->list); - up_write(&ppriv->vlan_rwsem); work->dev = priv->dev; INIT_WORK(&work->work, ipoib_vlan_delete_task); queue_work(ipoib_workqueue, &work->work); @@ -287,6 +291,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } out: + netdev_unlock(ppriv->dev); rtnl_unlock(); return rc; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index bb9aaff92ca3..a5be6f1ba12b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -393,10 +393,10 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) * @task: iscsi task * @sector: error sector if exsists (output) * - * Return: zero if no data-integrity errors have occured - * 0x1: data-integrity error occured in the guard-block - * 0x2: data-integrity error occured in the reference tag - * 0x3: data-integrity error occured in the application tag + * Return: zero if no data-integrity errors have occurred + * 0x1: data-integrity error occurred in the guard-block + * 0x2: data-integrity error occurred in the reference tag + * 0x3: data-integrity error occurred in the application tag * * In addition the error sector is marked. 
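The ipoib_vlan hunks above repeat a pattern used throughout this series: the parent's child_intfs list is now walked under netdev_lock() instead of vlan_rwsem, the verdict is kept in a local variable, and the lock is dropped on a single exit path. A minimal userspace sketch of that shape, with a pthread mutex standing in for the netdev instance lock and a hand-rolled list standing in for child_intfs (all names below are illustrative, not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct child {
	unsigned short pkey;
	struct child *next;
};

struct parent {
	pthread_mutex_t lock;        /* stands in for netdev_lock(ppriv->dev) */
	struct child *children;      /* stands in for ppriv->child_intfs */
};

/* Mirrors the is_child_unique() rewrite: decide under the lock, return after unlocking. */
static bool pkey_is_unique(struct parent *p, unsigned short pkey)
{
	struct child *c;
	bool result = true;

	pthread_mutex_lock(&p->lock);
	for (c = p->children; c; c = c->next) {
		if (c->pkey == pkey) {
			result = false;
			break;
		}
	}
	pthread_mutex_unlock(&p->lock);

	return result;
}

int main(void)
{
	struct child c1 = { .pkey = 0x8001, .next = NULL };
	struct parent p = { .lock = PTHREAD_MUTEX_INITIALIZER, .children = &c1 };

	printf("0x8001 unique: %d\n", pkey_is_unique(&p, 0x8001)); /* 0 */
	printf("0x8002 unique: %d\n", pkey_is_unique(&p, 0x8002)); /* 1 */
	return 0;
}

The local result exists because the early return inside the loop, which the old rwsem-based code could use, is no longer possible once every exit has to pass through the unlock.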
*/ diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 68429a5f796d..1d7ac24c4c00 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -507,10 +507,6 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task); void iser_free_rx_descriptors(struct iser_conn *iser_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - enum iser_data_dir cmd_dir); - int iser_reg_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir dir, bool all_imm); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 00a7303c8cc6..42977a5326ee 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -91,9 +91,6 @@ isert_qp_event_callback(struct ib_event *e, void *context) case IB_EVENT_COMM_EST: rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST); break; - case IB_EVENT_QP_LAST_WQE_REACHED: - isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n"); - break; default: break; } diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c index 29b3d8fce3f5..316959940d2f 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c @@ -164,9 +164,7 @@ static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data) return; for (i = 0; i < VNIC_STATS_LEN; i++) - memcpy(data + i * ETH_GSTRING_LEN, - vnic_gstrings_stats[i].stat_string, - ETH_GSTRING_LEN); + ethtool_puts(&data, vnic_gstrings_stats[i].stat_string); } /* ethtool ops */ diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 88106cf5ce55..71387811b281 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -331,7 +331,7 @@ static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_REG_MR: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } @@ -351,11 +351,11 @@ static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } - req->need_inv = false; + req->mr->need_inval = false; if (req->need_inv_comp) complete(&req->inv_comp); else @@ -391,12 +391,13 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, clt_path = to_clt_path(con->c.path); if (req->sg_cnt) { - if (req->dir == DMA_FROM_DEVICE && req->need_inv) { + if (req->mr->need_inval) { /* - * We are here to invalidate read requests + * We are here to invalidate read/write requests * ourselves. In normal scenario server should - * send INV for all read requests, but - * we are here, thus two things could happen: + * send INV for all read requests, we do local + * invalidate for write requests ourselves, but + * we are here, thus three things could happen: * * 1. this is failover, when errno != 0 * and can_wait == 1, @@ -404,6 +405,9 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, * 2. 
something totally bad happened and * server forgot to send INV, so we * should do that ourselves. + * + * 3. write request finishes, we need to do local + * invalidate */ if (can_wait) { @@ -418,18 +422,10 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, refcount_inc(&req->ref); err = rtrs_inv_rkey(req); if (err) { - rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n", + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", req->mr->rkey, err); } else if (can_wait) { wait_for_completion(&req->inv_comp); - } else { - /* - * Something went wrong, so request will be - * completed from INV callback. - */ - WARN_ON_ONCE(1); - - return; } if (!refcount_dec_and_test(&req->ref)) return; @@ -446,8 +442,10 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, req->con = NULL; if (errno) { - rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n", - errno, kobject_name(&clt_path->kobj), clt_path->hca_name, + rtrs_err_rl(con->c.path, + "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", + req->dir == DMA_TO_DEVICE ? "write" : "read", errno, + kobject_name(&clt_path->kobj), clt_path->hca_name, clt_path->hca_port, notify); } @@ -501,7 +499,7 @@ static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id, req = &clt_path->reqs[msg_id]; /* Drop need_inv if server responded with send with invalidation */ - req->need_inv &= !w_inval; + req->mr->need_inval &= !w_inval; complete_rdma_req(req, errno, true, false); } @@ -626,6 +624,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done)) return; + clt_path->s.hb_missed_cnt = 0; rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload); if (imm_type == RTRS_IO_RSP_IMM || @@ -643,7 +642,6 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) return rtrs_clt_recv_done(con, wc); } else if (imm_type == RTRS_HB_ACK_IMM) { WARN_ON(con->c.cid); - clt_path->s.hb_missed_cnt = 0; clt_path->s.hb_cur_latency = ktime_sub(ktime_get(), clt_path->s.hb_last_sent); if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) @@ -670,6 +668,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) /* * Key invalidations from server side */ + clt_path->s.hb_missed_cnt = 0; WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE || wc->wc_flags & IB_WC_WITH_IMM)); WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done); @@ -967,7 +966,7 @@ static void rtrs_clt_init_req(struct rtrs_clt_io_req *req, req->dir = dir; req->con = rtrs_permit_to_clt_con(clt_path, permit); req->conf = conf; - req->need_inv = false; + req->mr->need_inval = false; req->need_inv_comp = false; req->inv_errno = 0; refcount_set(&req->ref, 1); @@ -1089,7 +1088,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) int ret, count = 0; u32 imm, buf_id; struct ib_reg_wr rwr; - struct ib_send_wr inv_wr; struct ib_send_wr *wr = NULL; bool fr_en = false; @@ -1130,13 +1128,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) req->sg_cnt, req->dir); return ret; } - inv_wr = (struct ib_send_wr) { - .opcode = IB_WR_LOCAL_INV, - .wr_cqe = &req->inv_cqe, - .send_flags = IB_SEND_SIGNALED, - .ex.invalidate_rkey = req->mr->rkey, - }; - req->inv_cqe.done = rtrs_clt_inv_rkey_done; rwr = (struct ib_reg_wr) { .wr.opcode = IB_WR_REG_MR, .wr.wr_cqe = &fast_reg_cqe, @@ -1146,7 +1137,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) }; wr = &rwr.wr; fr_en = true; - refcount_inc(&req->ref); + req->mr->need_inval = true; } /* * Update stats 
now, after request is successfully sent it is not @@ -1156,7 +1147,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count, req->usr_len + sizeof(*msg), - imm, wr, &inv_wr); + imm, wr, NULL); if (ret) { rtrs_err_rl(s, "Write request failed: error=%d path=%s [%s:%u]\n", @@ -1164,6 +1155,10 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); + if (req->mr->need_inval) { + req->mr->need_inval = false; + refcount_dec(&req->ref); + } if (req->sg_cnt) ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1213,7 +1208,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Read request failed, failed to map fast reg. data, err: %d\n", + "Read request failed, failed to map fast reg. data, err: %d\n", ret); ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1237,7 +1232,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) msg->desc[0].len = cpu_to_le32(req->mr->length); /* Further invalidation is required */ - req->need_inv = !!RTRS_MSG_NEED_INVAL_F; + req->mr->need_inval = !!RTRS_MSG_NEED_INVAL_F; } else { msg->sg_cnt = 0; @@ -1270,7 +1265,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); - req->need_inv = false; + req->mr->need_inval = false; if (req->sg_cnt) ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1494,7 +1489,9 @@ static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path, static void rtrs_clt_hb_err_handler(struct rtrs_con *c) { struct rtrs_clt_con *con = container_of(c, typeof(*con), c); + struct rtrs_clt_path *clt_path = to_clt_path(con->c.path); + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&clt_path->kobj)); rtrs_rdma_error_recovery(con); } @@ -2346,6 +2343,12 @@ static int init_conns(struct rtrs_clt_path *clt_path) if (err) goto destroy; } + + /* + * Set the cid to con_num - 1, since if we fail later, we want to stay in bounds. 
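The comment being added here concerns the error path of init_conns(): connections are created by index cid, and a failure after the creation loop jumps to a shared destroy label that unwinds by index, so cid must still point at a valid slot. A generic userspace sketch of that partial-initialization unwind (the resource names and the destroy loop are assumptions for illustration, not the rtrs code):

#include <stdbool.h>
#include <stdio.h>

#define NUM_CONNS 4

static bool conn_up[NUM_CONNS];

static int create_conn(int cid)
{
	conn_up[cid] = true;
	return 0;                      /* return -1 here to exercise the unwind */
}

static void destroy_conn(int cid)
{
	conn_up[cid] = false;
}

static int extra_setup(void)
{
	return -1;                     /* pretend a later step fails */
}

static int init_conns(void)
{
	int cid, err;

	for (cid = 0; cid < NUM_CONNS; cid++) {
		err = create_conn(cid);
		if (err)
			goto destroy;
	}

	/* Keep cid at the last valid index before later steps that share the error path. */
	cid = NUM_CONNS - 1;

	err = extra_setup();
	if (err)
		goto destroy;

	return 0;

destroy:
	for (; cid >= 0; cid--)
		if (conn_up[cid])
			destroy_conn(cid);
	return err;
}

int main(void)
{
	printf("init_conns: %d\n", init_conns());
	return 0;
}

Without the reset, a failure in the later step would enter the unwind with cid equal to NUM_CONNS and index one slot past the array, which is the out-of-bounds access the patch guards against.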
+ */ + cid = clt_path->s.con_num - 1; + err = alloc_path_reqs(clt_path); if (err) goto destroy; @@ -3140,8 +3143,20 @@ close_path: return err; } +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + + static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) { + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_clt_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + if (!(dev->ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { pr_err("Memory registrations not supported.\n"); @@ -3151,8 +3166,15 @@ static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) return 0; } +static void rtrs_clt_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { - .init = rtrs_clt_ib_dev_init + .init = rtrs_clt_ib_dev_init, + .deinit = rtrs_clt_ib_dev_deinit }; static int __init rtrs_client_init(void) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index f848c0392d98..0f57759b3080 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -115,7 +115,6 @@ struct rtrs_clt_io_req { struct completion inv_comp; int inv_errno; bool need_inv_comp; - bool need_inv; refcount_t ref; }; @@ -213,6 +212,8 @@ int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *path, void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value); int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt); void free_path(struct rtrs_clt_path *clt_path); +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); /* rtrs-clt-stats.c */ diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index ab25619261d2..ef29bd483b5a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -69,6 +69,7 @@ struct rtrs_ib_dev; struct rtrs_rdma_dev_pd_ops { int (*init)(struct rtrs_ib_dev *dev); + void (*deinit)(struct rtrs_ib_dev *dev); }; struct rtrs_rdma_dev_pd { @@ -84,6 +85,7 @@ struct rtrs_ib_dev { struct kref ref; struct list_head entry; struct rtrs_rdma_dev_pd *pool; + struct ib_event_handler event_handler; }; struct rtrs_con { diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 1d33efb8fb03..ef4abdea3c2d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -26,7 +26,10 @@ MODULE_LICENSE("GPL"); #define DEFAULT_SESS_QUEUE_DEPTH 512 #define MAX_HDR_SIZE PAGE_SIZE -static struct rtrs_rdma_dev_pd dev_pd; +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops; +static struct rtrs_rdma_dev_pd dev_pd = { + .ops = &dev_pd_ops +}; const struct class rtrs_dev_class = { .name = "rtrs-server", }; @@ -346,6 +349,7 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, struct rtrs_srv_mr *srv_mr; bool need_inval = false; enum ib_send_flags flags; + struct ib_sge list; u32 imm; int err; @@ -398,7 +402,6 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval); imm_wr.wr.next = NULL; if (always_invalidate) { - struct ib_sge list; struct rtrs_msg_rkey_rsp *msg; srv_mr = &srv_path->mrs[id->msg_id]; @@ -672,6 +675,10 @@ err: static void 
rtrs_srv_hb_err_handler(struct rtrs_con *c) { + struct rtrs_srv_con *con = container_of(c, typeof(*con), c); + struct rtrs_srv_path *srv_path = to_srv_path(con->c.path); + + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&srv_path->kobj)); close_path(to_srv_path(c->path)); } @@ -931,12 +938,11 @@ static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc) if (err) goto close; -out: rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); return; close: + rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); close_path(srv_path); - goto out; } static int post_recv_info_req(struct rtrs_srv_con *con) @@ -987,6 +993,16 @@ static int post_recv_path(struct rtrs_srv_path *srv_path) q_size = SERVICE_CON_QUEUE_DEPTH; else q_size = srv->queue_depth; + if (srv_path->state != RTRS_SRV_CONNECTING) { + rtrs_err(s, "Path state invalid. state %s\n", + rtrs_srv_state_str(srv_path->state)); + return -EIO; + } + + if (!srv_path->s.con[cid]) { + rtrs_err(s, "Conn not set for %d\n", cid); + return -EIO; + } err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); if (err) { @@ -1229,6 +1245,7 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe != &io_comp_cqe)) return; + srv_path->s.hb_missed_cnt = 0; err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); @@ -2255,6 +2272,34 @@ static int check_module_params(void) return 0; } +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + +static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) +{ + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_srv_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + + return 0; +} + +static void rtrs_srv_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { + .init = rtrs_srv_ib_dev_init, + .deinit = rtrs_srv_ib_dev_deinit +}; + + static int __init rtrs_server_init(void) { int err; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 5e325b82ff33..014f85681f37 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -132,6 +132,8 @@ struct rtrs_srv_ib_ctx { extern const struct class rtrs_dev_class; void close_path(struct rtrs_srv_path *srv_path); +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, size_t size, int d) diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4e17d546d4cc..bf38ac6f87c4 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -584,6 +584,9 @@ static void dev_free(struct kref *ref) list_del(&dev->entry); mutex_unlock(&pool->mutex); + if (pool->ops && pool->ops->deinit) + pool->ops->deinit(dev); + ib_dealloc_pd(dev->ib_pd); kfree(dev); } diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 2916e77f589b..1378651735f6 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2844,7 +2844,8 @@ static int srp_target_alloc(struct scsi_target *starget) return 0; } -static int srp_slave_configure(struct scsi_device *sdev) +static int srp_sdev_configure(struct scsi_device *sdev, + 
struct queue_limits *lim) { struct Scsi_Host *shost = sdev->host; struct srp_target_port *target = host_to_target(shost); @@ -3067,7 +3068,7 @@ static const struct scsi_host_template srp_template = { .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, .target_alloc = srp_target_alloc, - .slave_configure = srp_slave_configure, + .sdev_configure = srp_sdev_configure, .info = srp_target_info, .init_cmd_priv = srp_init_cmd_priv, .exit_cmd_priv = srp_exit_cmd_priv, @@ -3978,7 +3979,6 @@ static struct srp_host *srp_add_port(struct srp_device *device, u32 port) return host; put_host: - device_del(&host->dev); put_device(&host->dev); return NULL; } diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 9632afbd727b..5dfb4644446b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -68,6 +68,8 @@ MODULE_LICENSE("Dual BSD/GPL"); static u64 srpt_service_guid; static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */ static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */ +static DEFINE_MUTEX(srpt_mc_mutex); /* Protects srpt_memory_caches. */ +static DEFINE_XARRAY(srpt_memory_caches); /* See also srpt_memory_cache_entry */ static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; module_param(srp_max_req_size, int, 0444); @@ -105,6 +107,63 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_process_wait_list(struct srpt_rdma_ch *ch); +/* Type of the entries in srpt_memory_caches. */ +struct srpt_memory_cache_entry { + refcount_t ref; + struct kmem_cache *c; +}; + +static struct kmem_cache *srpt_cache_get(unsigned int object_size) +{ + struct srpt_memory_cache_entry *e; + char name[32]; + void *res; + + guard(mutex)(&srpt_mc_mutex); + e = xa_load(&srpt_memory_caches, object_size); + if (e) { + refcount_inc(&e->ref); + return e->c; + } + snprintf(name, sizeof(name), "srpt-%u", object_size); + e = kmalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return NULL; + refcount_set(&e->ref, 1); + e->c = kmem_cache_create(name, object_size, /*align=*/512, 0, NULL); + if (!e->c) + goto free_entry; + res = xa_store(&srpt_memory_caches, object_size, e, GFP_KERNEL); + if (xa_is_err(res)) + goto destroy_cache; + return e->c; + +destroy_cache: + kmem_cache_destroy(e->c); + +free_entry: + kfree(e); + return NULL; +} + +static void srpt_cache_put(struct kmem_cache *c) +{ + struct srpt_memory_cache_entry *e = NULL; + unsigned long object_size; + + guard(mutex)(&srpt_mc_mutex); + xa_for_each(&srpt_memory_caches, object_size, e) + if (e->c == c) + break; + if (WARN_ON_ONCE(!e)) + return; + if (!refcount_dec_and_test(&e->ref)) + return; + WARN_ON_ONCE(xa_erase(&srpt_memory_caches, object_size) != e); + kmem_cache_destroy(e->c); + kfree(e); +} + /* * The only allowed channel state changes are those that change the channel * state into a state with a higher numerical value. Hence the new > prev test. 
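The srpt comment above states the channel state machine's invariant: transitions are only allowed toward numerically higher states, enforced by a new > prev check. The hunks shown here do not include the helper that applies that check, so the sketch below is only an illustration of the rule, not the driver's implementation; the state names are illustrative and the lock-free CAS loop is an assumption.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum ch_state {                    /* ordered so "later in the lifetime" means "greater" */
	CH_CONNECTING,
	CH_LIVE,
	CH_DISCONNECTING,
	CH_DRAINING,
	CH_DISCONNECTED,
};

/* Allow a transition only if it moves the state strictly forward (new > prev). */
static bool ch_change_state(_Atomic int *state, enum ch_state next)
{
	int prev = atomic_load(state);

	do {
		if ((int)next <= prev)
			return false;
	} while (!atomic_compare_exchange_weak(state, &prev, (int)next));

	return true;
}

int main(void)
{
	_Atomic int state = CH_CONNECTING;

	printf("to LIVE: %d\n", ch_change_state(&state, CH_LIVE));                   /* 1 */
	printf("back to CONNECTING: %d\n", ch_change_state(&state, CH_CONNECTING));  /* 0 */
	printf("to DISCONNECTED: %d\n", ch_change_state(&state, CH_DISCONNECTED));   /* 1 */
	return 0;
}

The driver may well serialize this differently, for example under a lock; the atomics here are only a way to make the invariant executable in isolation.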
@@ -2119,13 +2178,13 @@ static void srpt_release_channel_work(struct work_struct *w) ch->sport->sdev, ch->rq_size, ch->rsp_buf_cache, DMA_TO_DEVICE); - kmem_cache_destroy(ch->rsp_buf_cache); + srpt_cache_put(ch->rsp_buf_cache); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, sdev, ch->rq_size, ch->req_buf_cache, DMA_FROM_DEVICE); - kmem_cache_destroy(ch->req_buf_cache); + srpt_cache_put(ch->req_buf_cache); kref_put(&ch->kref, srpt_free_ch); } @@ -2245,8 +2304,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, INIT_LIST_HEAD(&ch->cmd_wait_list); ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size; - ch->rsp_buf_cache = kmem_cache_create("srpt-rsp-buf", ch->max_rsp_size, - 512, 0, NULL); + ch->rsp_buf_cache = srpt_cache_get(ch->max_rsp_size); if (!ch->rsp_buf_cache) goto free_ch; @@ -2280,8 +2338,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, alignment_offset = round_up(imm_data_offset, 512) - imm_data_offset; req_sz = alignment_offset + imm_data_offset + srp_max_req_size; - ch->req_buf_cache = kmem_cache_create("srpt-req-buf", req_sz, - 512, 0, NULL); + ch->req_buf_cache = srpt_cache_get(req_sz); if (!ch->req_buf_cache) goto free_rsp_ring; @@ -2478,7 +2535,7 @@ free_recv_ring: ch->req_buf_cache, DMA_FROM_DEVICE); free_recv_cache: - kmem_cache_destroy(ch->req_buf_cache); + srpt_cache_put(ch->req_buf_cache); free_rsp_ring: srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, @@ -2486,7 +2543,7 @@ free_rsp_ring: ch->rsp_buf_cache, DMA_TO_DEVICE); free_rsp_cache: - kmem_cache_destroy(ch->rsp_buf_cache); + srpt_cache_put(ch->rsp_buf_cache); free_ch: if (rdma_cm_id) @@ -3055,7 +3112,7 @@ static void srpt_free_srq(struct srpt_device *sdev) srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, sdev->srq_size, sdev->req_buf_cache, DMA_FROM_DEVICE); - kmem_cache_destroy(sdev->req_buf_cache); + srpt_cache_put(sdev->req_buf_cache); sdev->srq = NULL; } @@ -3082,8 +3139,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, sdev->device->attrs.max_srq_wr, dev_name(&device->dev)); - sdev->req_buf_cache = kmem_cache_create("srpt-srq-req-buf", - srp_max_req_size, 0, 0, NULL); + sdev->req_buf_cache = srpt_cache_get(srp_max_req_size); if (!sdev->req_buf_cache) goto free_srq; @@ -3105,7 +3161,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) return 0; free_cache: - kmem_cache_destroy(sdev->req_buf_cache); + srpt_cache_put(sdev->req_buf_cache); free_srq: ib_destroy_srq(srq); |
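The srpt_cache_get()/srpt_cache_put() pair introduced above deduplicates kmem_cache instances by object size: an XArray keyed by the size maps to a refcounted entry, so channels requesting the same buffer size share one cache instead of each creating its own "srpt-rsp-buf"/"srpt-req-buf" cache. A userspace analogue of that registry, assuming a mutex-protected linked list in place of the XArray and plain malloc() in place of kmem_cache_create() (all names are illustrative):

#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>

struct cache_entry {
	size_t object_size;
	unsigned int ref;
	void *cache;                 /* stand-in for struct kmem_cache * */
	struct cache_entry *next;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cache_entry *registry;

/* Find or create the cache for object_size and take a reference on it. */
static void *cache_get(size_t object_size)
{
	struct cache_entry *e;
	void *c = NULL;

	pthread_mutex_lock(&registry_lock);
	for (e = registry; e; e = e->next) {
		if (e->object_size == object_size) {
			e->ref++;
			c = e->cache;
			goto out;
		}
	}

	e = calloc(1, sizeof(*e));
	if (!e)
		goto out;
	e->cache = malloc(object_size);      /* placeholder for kmem_cache_create() */
	if (!e->cache) {
		free(e);
		goto out;
	}
	e->object_size = object_size;
	e->ref = 1;
	e->next = registry;
	registry = e;
	c = e->cache;
out:
	pthread_mutex_unlock(&registry_lock);
	return c;
}

/* Drop a reference; tear the cache down when the last user is gone. */
static void cache_put(void *c)
{
	struct cache_entry *e, **pp;

	pthread_mutex_lock(&registry_lock);
	for (pp = &registry; (e = *pp) != NULL; pp = &e->next) {
		if (e->cache != c)
			continue;
		if (--e->ref == 0) {
			*pp = e->next;
			free(e->cache);              /* placeholder for kmem_cache_destroy() */
			free(e);
		}
		break;
	}
	pthread_mutex_unlock(&registry_lock);
}

Every cache_get() is paired with a cache_put(), which matches how the patch swaps kmem_cache_create()/kmem_cache_destroy() call sites for srpt_cache_get()/srpt_cache_put() in srpt_cm_req_recv(), srpt_release_channel_work() and the SRQ setup and teardown paths.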