author    | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 12:05:10 -0800
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 12:05:10 -0800
commit    | 7b1cd95d65eb3b1e13f8a90eb757e0ea232c7899 (patch)
tree      | cbc3ec5d45b04666c24f7c0b1df04a85d29c7d0f /drivers/infiniband/core
parent    | 2155e69a9d9acd42488ef994a4e1ff535438c128 (diff)
parent    | e7996a9a77fc669387da43ff4823b91cc4872bd0 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull RDMA subsystem updates from Jason Gunthorpe:
"Overall this cycle did not have any major excitement, and did not
require any shared branch with netdev.
Lots of driver updates, particularly of the scale-up and performance
variety. The largest body of core work was Parav's patches fixing and
restructuring some of the core code to make way for future RDMA
containerization.
Summary:
- misc small driver fixups to
bnxt_re/hfi1/qib/hns/ocrdma/rdmavt/vmw_pvrdma/nes
- several major feature adds to bnxt_re driver: SRIOV VF RoCE
support, HugePages support, extended hardware stats support, and
SRQ support
- a notable number of fixes to the i40iw driver from debugging scale
up testing
- more work to enable the new hip08 chip in the hns driver
- misc small ULP fixups to srp/srpt/ipoib
- preparation for srp initiator and target to support the RDMA-CM
protocol for connections
- add RDMA-CM support to srp initiator, srp target is still a WIP
- fixes for a couple of places where ipoib could spam the dmesg log
- fix encode/decode of FDR/EDR data rates in the core
- many patches from Parav with ongoing work to clean up
inconsistencies and bugs in RoCE support around the rdma_cm
- mlx5 driver support for the userspace features 'thread domain',
'wallclock timestamps' and 'DV Direct Connected transport'. Support
for the firmware dual port RoCE capability
- core support for more than 32 rdma devices in the char dev
allocation
- kernel doc updates from Randy Dunlap
- new netlink uAPI for inspecting RDMA objects, similar in spirit to 'ss' (see the sketch after this summary)
- one minor change to the kobject code acked by Greg KH"
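The resource-inspection netlink interface mentioned in the summary is reachable from user space over a NETLINK_RDMA socket. Below is a minimal, hedged sketch (not part of this merge) that dumps the per-device object summary with RDMA_NLDEV_CMD_RES_GET, the command added by the nldev patches in this series. It assumes libnl-3 and a 4.16-era uapi <rdma/rdma_netlink.h>; the file name res_summary.c is illustrative only. The iproute2 'rdma' utility grew a similar "resource" view around the same time.

```c
/* res_summary.c: dump the per-device resource summary exposed by the new
 * nldev interface (RDMA_NLDEV_CMD_RES_GET).  Sketch only: build against
 * libnl-3 and the new uapi header; error handling is trimmed.
 */
#include <stdio.h>
#include <linux/netlink.h>		/* NETLINK_RDMA */
#include <netlink/netlink.h>
#include <netlink/msg.h>
#include <netlink/attr.h>
#include <rdma/rdma_netlink.h>		/* RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET, attrs */

static int res_cb(struct nl_msg *msg, void *arg)
{
	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
	struct nlmsghdr *nlh = nlmsg_hdr(msg);
	struct nlattr *entry;
	int rem;

	nla_parse(tb, RDMA_NLDEV_ATTR_MAX - 1,
		  nlmsg_attrdata(nlh, 0), nlmsg_attrlen(nlh, 0), NULL);
	if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_RES_SUMMARY])
		return NL_OK;

	printf("%s:\n", nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME]));

	/* RES_SUMMARY nests one RES_SUMMARY_ENTRY per tracked object type,
	 * each carrying a NAME string ("pd"/"cq"/"qp") and a CURR counter. */
	nla_for_each_nested(entry, tb[RDMA_NLDEV_ATTR_RES_SUMMARY], rem) {
		struct nlattr *e[RDMA_NLDEV_ATTR_MAX] = {};

		nla_parse_nested(e, RDMA_NLDEV_ATTR_MAX - 1, entry, NULL);
		if (e[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] &&
		    e[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR])
			printf("  %-4s %llu\n",
			       nla_get_string(e[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]),
			       (unsigned long long)
			       nla_get_u64(e[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]));
	}
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();

	nl_connect(sk, NETLINK_RDMA);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, res_cb, NULL);
	/* A dump request (NLM_F_DUMP) walks every registered ib_device. */
	nl_send_simple(sk,
		       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
		       NLM_F_REQUEST | NLM_F_DUMP, NULL, 0);
	nl_recvmsgs_default(sk);
	nl_socket_free(sk);
	return 0;
}
```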
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (259 commits)
RDMA/nldev: Provide detailed QP information
RDMA/nldev: Provide global resource utilization
RDMA/core: Add resource tracking for create and destroy PDs
RDMA/core: Add resource tracking for create and destroy CQs
RDMA/core: Add resource tracking for create and destroy QPs
RDMA/restrack: Add general infrastructure to track RDMA resources
RDMA/core: Save kernel caller name when creating PD and CQ objects
RDMA/core: Use the MODNAME instead of the function name for pd callers
RDMA: Move enum ib_cq_creation_flags to uapi headers
IB/rxe: Change RDMA_RXE kconfig to use select
IB/qib: remove qib_keys.c
IB/mthca: remove mthca_user.h
RDMA/cm: Fix access to uninitialized variable
RDMA/cma: Use existing netif_is_bond_master function
IB/core: Avoid SGID attributes query while converting GID from OPA to IB
RDMA/mlx5: Avoid memory leak in case of XRCD dealloc failure
IB/umad: Fix use of unprotected device pointer
IB/iser: Combine substrings for three messages
IB/iser: Delete an unnecessary variable initialisation in iser_send_data_out()
IB/iser: Delete an error message for a failed memory allocation in iser_send_data_out()
...
Diffstat (limited to 'drivers/infiniband/core')
28 files changed, 1327 insertions, 661 deletions
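Orientation for the hunks that follow: the new restrack infrastructure stamps every PD, CQ and QP with a struct rdma_restrack_entry so nldev can report who owns it (kernel resources by module name, user resources by PID). The sketch below, with hypothetical demo_* helpers, mirrors the add/del pattern these patches wire into __ib_alloc_cq()/ib_free_cq(); it is an illustration of the pattern, not code from the merge.

```c
/* Sketch of the restrack registration pattern added below: a tracked
 * object embeds a struct rdma_restrack_entry, sets its type (and, for
 * kernel callers, the owning module name), then calls rdma_restrack_add();
 * teardown calls rdma_restrack_del() before the object is destroyed.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>

static struct ib_cq *demo_alloc_tracked_cq(struct ib_device *dev,
					   struct ib_cq_init_attr *attr)
{
	struct ib_cq *cq;

	cq = dev->create_cq(dev, attr, NULL, NULL);
	if (IS_ERR(cq))
		return cq;

	cq->device = dev;
	/* Record type and kernel owner; nldev later reports kern_name for
	 * kernel resources and the task PID for user-space ones. */
	cq->res.type = RDMA_RESTRACK_CQ;
	cq->res.kern_name = KBUILD_MODNAME;	/* what __ib_alloc_cq()'s caller arg carries */
	rdma_restrack_add(&cq->res);
	return cq;
}

static void demo_free_tracked_cq(struct ib_cq *cq)
{
	rdma_restrack_del(&cq->res);		/* unhook before destroy_cq() */
	cq->device->destroy_cq(cq);
}
```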
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 504b926552c6..f69833db0a32 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o nldev.o + security.o nldev.o restrack.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f4e8185bccd3..a5b4cf030c11 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -243,8 +243,7 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr, - u16 *vlan_id) + struct rdma_dev_addr *dev_addr) { struct net_device *dev; @@ -266,9 +265,6 @@ int rdma_translate_ip(const struct sockaddr *addr, return -EADDRNOTAVAIL; rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; #if IS_ENABLED(CONFIG_IPV6) @@ -279,9 +275,6 @@ int rdma_translate_ip(const struct sockaddr *addr, &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); break; } } @@ -481,7 +474,7 @@ static int addr_resolve_neigh(struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) { int ret; - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); @@ -558,7 +551,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ndev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); /* * Put the loopback device and get the translated * device instead. 
@@ -744,7 +737,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { - struct rdma_dev_addr *addr; struct completion comp; int status; }; @@ -752,39 +744,31 @@ struct resolve_cb_context { static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { - if (!status) - memcpy(((struct resolve_cb_context *)context)->addr, - addr, sizeof(struct rdma_dev_addr)); ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int *if_index, + u8 *dmac, const struct net_device *ndev, int *hoplimit) { - int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; - struct net_device *dev; - union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; - + int ret; rdma_gid2ip(&sgid_addr._sockaddr, sgid); rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - if (if_index) - dev_addr.bound_dev_if = *if_index; + dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; - ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); @@ -798,42 +782,9 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); - dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); - if (!dev) - return -ENODEV; - if (if_index) - *if_index = dev_addr.bound_dev_if; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - if (hoplimit) - *hoplimit = dev_addr.hoplimit; - dev_put(dev); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); - -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } gid_addr; - - rdma_gid2ip(&gid_addr._sockaddr, sgid); - - memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.net = &init_net; - ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); - if (ret) - return ret; - - memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); - return ret; + *hoplimit = dev_addr.hoplimit; + return 0; } -EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 77515638c55c..e9a409d7f4e2 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -573,27 +573,24 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_attr attr; if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + continue; if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + continue; memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); - if (filter(gid, &attr, context)) + if (filter(gid, &attr, context)) { found = true; - -next: - if (found) + if (index) + *index = i; break; + } } read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; - - if (index) - *index = i; return 0; } @@ -824,12 +821,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev) if (err) return err; - err = roce_rescan_device(ib_dev); - - if (err) { - 
gid_table_cleanup_one(ib_dev); - gid_table_release_one(ib_dev); - } + rdma_roce_rescan_device(ib_dev); return err; } @@ -883,7 +875,6 @@ int ib_find_gid_by_filter(struct ib_device *device, port_num, filter, context, index); } -EXPORT_SYMBOL(ib_find_gid_by_filter); int ib_get_cached_pkey(struct ib_device *device, u8 port_num, diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f6b159d79977..e6749157fd86 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -452,13 +452,14 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) +static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; - ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, - grh, &av->ah_attr); + return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &av->ah_attr); } static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, @@ -494,8 +495,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, return ret; av->port = port; - ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); + if (ret) + return ret; + av->timeout = path->packet_life_time + 1; spin_lock_irqsave(&cm.lock, flags); @@ -1560,6 +1564,35 @@ static u16 cm_get_bth_pkey(struct cm_work *work) return pkey; } +/** + * Convert OPA SGID to IB SGID + * ULPs (such as IPoIB) do not understand OPA GIDs and will + * reject them as the local_gid will not match the sgid. Therefore, + * change the pathrec's SGID to an IB SGID. 
+ * + * @work: Work completion + * @path: Path record + */ +static void cm_opa_to_ib_sgid(struct cm_work *work, + struct sa_path_rec *path) +{ + struct ib_device *dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + + if (rdma_cap_opa_ah(dev, port_num) && + (ib_is_opa_gid(&path->sgid))) { + union ib_gid sgid; + + if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) { + dev_warn(&dev->dev, + "Error updating sgid in CM request\n"); + return; + } + + path->sgid = sgid; + } +} + static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) @@ -1573,10 +1606,13 @@ static void cm_format_req_event(struct cm_work *work, param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; - if (cm_req_has_alt_path(req_msg)) + cm_opa_to_ib_sgid(work, param->primary_path); + if (cm_req_has_alt_path(req_msg)) { param->alternate_path = &work->path[1]; - else + cm_opa_to_ib_sgid(work, param->alternate_path); + } else { param->alternate_path = NULL; + } param->remote_ca_guid = req_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(req_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); @@ -1826,9 +1862,11 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { @@ -1841,9 +1879,10 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { + pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, + be32_to_cpu(cm_id->local_id)); ret = -EINVAL; - kfree(cm_id_priv->timewait_info); - goto destroy; + goto free_timeinfo; } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; @@ -1861,56 +1900,50 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, grh->sgid_index, &gid, &gid_attr); - if (!ret) { - if (gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - cm_format_paths_from_req(req_msg, &work->path[0], - &work->path[1]); - if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) - sa_path_set_dmac(&work->path[0], - cm_id_priv->av.ah_attr.roce.dmac); - work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, - cm_id_priv); + if (ret) { + ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); + goto rejected; + } + + if (gid_attr.ndev) { + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); + dev_put(gid_attr.ndev); + } else { + 
cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } + if (cm_req_has_alt_path(req_msg)) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1]); + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); if (ret) { - int err = ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, - &work->path[0].sgid, - &gid_attr); - if (!err && gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, - &work->path[0].sgid, sizeof work->path[0].sgid, - NULL, 0); + int err; + + err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + NULL); + if (err) + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + NULL, 0, NULL, 0); + else + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), + NULL, 0); goto rejected; } if (cm_req_has_alt_path(req_msg)) { @@ -1919,7 +1952,7 @@ static int cm_req_handler(struct cm_work *work) if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, - sizeof work->path[0].sgid, NULL, 0); + sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } @@ -1945,6 +1978,8 @@ static int cm_req_handler(struct cm_work *work) rejected: atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); +free_timeinfo: + kfree(cm_id_priv->timewait_info); destroy: ib_destroy_cm_id(cm_id); return ret; @@ -1997,6 +2032,8 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { + pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2063,6 +2100,8 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { + pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto error; } @@ -2170,6 +2209,8 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); + pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); return -EINVAL; } @@ -2183,6 +2224,10 @@ static int cm_rep_handler(struct cm_work *work) default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", + __func__, cm_id_priv->id.state, + be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } @@ -2196,6 +2241,8 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); 
spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: Failed to insert remote id %d\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } /* Check for a stale connection. */ @@ -2213,6 +2260,10 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; + pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); + if (cur_cm_id_priv) { cm_id = &cur_cm_id_priv->id; ib_send_cm_dreq(cm_id, NULL, 0); @@ -2359,6 +2410,8 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2428,6 +2481,8 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id, if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); + pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", + __func__, be32_to_cpu(cm_id->local_id), cm_id->state); return -EINVAL; } @@ -2493,6 +2548,9 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); + pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(dreq_msg->local_comm_id), + be32_to_cpu(dreq_msg->remote_comm_id)); return -EINVAL; } @@ -2535,6 +2593,9 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2638,6 +2699,8 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, cm_enter_timewait(cm_id_priv); break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2748,6 +2811,9 @@ static int cm_rej_handler(struct cm_work *work) /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2811,6 +2877,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, } /* fall through */ default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2912,6 +2981,9 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); /* fall through */ default: + pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto out; } @@ -3085,6 +3157,12 @@ static int cm_lap_handler(struct cm_work *work) if (!cm_id_priv) return -EINVAL; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto deref; + param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, @@ -3131,9 +3209,6 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = 
lap_msg->hdr.tid; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); @@ -3386,6 +3461,7 @@ static int cm_sidr_req_handler(struct cm_work *work) struct cm_id_private *cm_id_priv, *cur_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; + int ret; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) @@ -3398,9 +3474,12 @@ static int cm_sidr_req_handler(struct cm_work *work) wc = work->mad_recv_wc->wc; cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto out; + cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->tid = sidr_req_msg->hdr.tid; atomic_inc(&cm_id_priv->work_count); @@ -3692,6 +3771,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: + pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3727,6 +3807,8 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; break; } @@ -3924,6 +4006,9 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3971,6 +4056,9 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -4030,6 +4118,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 6294a7001d33..e66963ca58bd 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -601,7 +601,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a int ret; if (addr->sa_family != AF_IB) { - ret = rdma_translate_ip(addr, dev_addr, NULL); + ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; @@ -612,11 +612,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a static inline int cma_validate_port(struct ib_device *device, u8 port, enum ib_gid_type gid_type, - union ib_gid *gid, int dev_type, - int bound_if_index) + union ib_gid *gid, + struct rdma_id_private *id_priv) { - int ret = -ENODEV; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int bound_if_index = dev_addr->bound_dev_if; + int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + int ret = -ENODEV; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) return ret; @@ -624,11 +627,13 @@ static 
inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) - ndev = dev_get_by_index(&init_net, bound_if_index); - else + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + return ret; + } else { gid_type = IB_GID_TYPE_IB; - + } ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); @@ -669,8 +674,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : listen_id_priv->gid_type, gidp, - dev_addr->dev_type, - dev_addr->bound_dev_if); + id_priv); if (!ret) { id_priv->id.port_num = port; goto out; @@ -691,8 +695,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : cma_dev->default_gid_type[port - 1], - gidp, dev_addr->dev_type, - dev_addr->bound_dev_if); + gidp, id_priv); if (!ret) { id_priv->id.port_num = port; goto out; @@ -2036,6 +2039,33 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_get_service_id); +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; @@ -2132,7 +2162,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -2414,6 +2444,26 @@ out: kfree(work); } +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void cma_init_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; +} + static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; @@ -2424,11 +2474,7 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = 
RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { @@ -2449,10 +2495,63 @@ err1: return ret; } -int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths) +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. + * It returns the netdev of the bound interface for this path record entry. + */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + sa_path_set_ndev(route->path_rec, addr->dev_addr.net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; + struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); @@ -2460,20 +2559,33 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } - id->route.num_paths = num_paths; + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + + id->route.num_paths = 1; return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } -EXPORT_SYMBOL(rdma_set_ib_paths); +EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { @@ -2483,11 +2595,7 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); 
return 0; } @@ -2510,26 +2618,14 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } -static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, - unsigned long supported_gids, - enum ib_gid_type default_gid) -{ - if ((network_type == RDMA_NETWORK_IPV4 || - network_type == RDMA_NETWORK_IPV6) && - test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) - return IB_GID_TYPE_ROCE_UDP_ENCAP; - - return default_gid; -} - static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct net_device *ndev = NULL; - enum ib_gid_type gid_type = IB_GID_TYPE_IB; + struct net_device *ndev; + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; @@ -2539,9 +2635,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; @@ -2550,42 +2643,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) { - unsigned long supported_gids; - - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - - supported_gids = roce_gid_type_mask_support(id_priv->id.device, - id_priv->id.port_num); - gid_type = cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); - route->path_rec->rec_type = - sa_conv_gid_to_pathrec_type(gid_type); - sa_path_set_ndev(route->path_rec, &init_net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); - } + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } - sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - /* Use the hint from IP Stack to select GID Type */ - if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - gid_type = ib_network_to_gid_type(addr->dev_addr.network); - route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; @@ -2607,11 +2675,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; - work->event.status = 0; - + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; @@ -2791,11 +2855,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: @@ -2821,11 
+2881,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: @@ -3404,9 +3460,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ret; break; } - ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -3873,7 +3930,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = - dev_get_by_index(&init_net, dev_addr->bound_dev_if); + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); enum ib_gid_type gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; @@ -4010,8 +4067,10 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; - mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 
0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4061,7 +4120,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; @@ -4179,7 +4238,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev) { cma_igmp_send(ndev, @@ -4235,7 +4294,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); @@ -4432,7 +4491,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) goto out; if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), + rdma_addr_size(cma_dst_addr(id_priv)), cma_dst_addr(id_priv), RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) goto out; @@ -4444,6 +4503,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats->qp_type = id->qp_type; i_id++; + nlmsg_end(skb, nlh); } cb->args[1] = 0; diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 31dfee0c8295..eee38b40be99 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -295,7 +295,7 @@ static struct config_group *make_cma_dev(struct config_group *group, goto fail; } - strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); config_group_init_type_name(&cma_dev_group->ports_group, "ports", &cma_ports_group_type); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 66f0268f37a6..c4560d84dfae 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -40,8 +40,12 @@ #include <rdma/ib_verbs.h> #include <rdma/opa_addr.h> #include <rdma/ib_mad.h> +#include <rdma/restrack.h> #include "mad_priv.h" +/* Total number of ports combined across all struct ib_devices's */ +#define RDMA_MAX_PORTS 1024 + struct pkey_index_qp_list { struct list_head pkey_index_list; u16 pkey_index; @@ -137,7 +141,6 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -int roce_rescan_device(struct ib_device *ib_dev); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); @@ -191,13 +194,6 @@ void ib_sa_cleanup(void); int rdma_nl_init(void); void rdma_nl_exit(void); -/** - * Check if there are any listeners to the netlink group - * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. 
- */ -int ibnl_chk_listeners(unsigned int group); - int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); @@ -213,11 +209,6 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec); - void ib_security_destroy_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, @@ -240,14 +231,6 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); #else -static inline int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) -{ - return 0; -} - static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) { } @@ -318,4 +301,31 @@ struct ib_device *ib_device_get_by_index(u32 ifindex); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); + +static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, + struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_qp *qp; + + qp = dev->create_qp(pd, attr, udata); + if (IS_ERR(qp)) + return qp; + + qp->device = dev; + qp->pd = pd; + /* + * We don't track XRC QPs for now, because they don't have PD + * and more importantly they are created internaly by driver, + * see mlx5 create_dev_resources() as an example. + */ + if (attr->qp_type < IB_QPT_XRC_INI) { + qp->res.type = RDMA_RESTRACK_QP; + rdma_restrack_add(&qp->res); + } else + qp->res.valid = false; + + return qp; +} #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index f2ae75fa3128..bc79ca8215d7 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -25,9 +25,10 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) -static int __ib_process_cq(struct ib_cq *cq, int budget) +static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *poll_wc) { int i, n, completed = 0; + struct ib_wc *wcs = poll_wc ? : cq->wc; /* * budget might be (-1) if the caller does not @@ -35,9 +36,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * minimum here. */ while ((n = ib_poll_cq(cq, min_t(u32, IB_POLL_BATCH, - budget - completed), cq->wc)) > 0) { + budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { - struct ib_wc *wc = &cq->wc[i]; + struct ib_wc *wc = &wcs[i]; if (wc->wr_cqe) wc->wr_cqe->done(cq, wc); @@ -60,18 +61,20 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * @cq: CQ to process * @budget: number of CQEs to poll for * - * This function is used to process all outstanding CQ entries on a - * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different - * context and does not ask for completion interrupts from the HCA. + * This function is used to process all outstanding CQ entries. + * It does not offload CQ processing to a different context and does + * not ask for completion interrupts from the HCA. + * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger + * concurrent processing. * * Note: do not pass -1 as %budget unless it is guaranteed that the number * of completions that will be processed is small. 
*/ int ib_process_cq_direct(struct ib_cq *cq, int budget) { - WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + struct ib_wc wcs[IB_POLL_BATCH]; - return __ib_process_cq(cq, budget); + return __ib_process_cq(cq, budget, wcs); } EXPORT_SYMBOL(ib_process_cq_direct); @@ -85,7 +88,7 @@ static int ib_poll_handler(struct irq_poll *iop, int budget) struct ib_cq *cq = container_of(iop, struct ib_cq, iop); int completed; - completed = __ib_process_cq(cq, budget); + completed = __ib_process_cq(cq, budget, NULL); if (completed < budget) { irq_poll_complete(&cq->iop); if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) @@ -105,7 +108,7 @@ static void ib_cq_poll_work(struct work_struct *work) struct ib_cq *cq = container_of(work, struct ib_cq, work); int completed; - completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, NULL); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(ib_comp_wq, &cq->work); @@ -117,20 +120,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * ib_alloc_cq - allocate a completion queue + * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. + * @caller: module owner name. * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ -struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -154,6 +159,10 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, if (!cq->wc) goto out_destroy_cq; + cq->res.type = RDMA_RESTRACK_CQ; + cq->res.kern_name = caller; + rdma_restrack_add(&cq->res); + switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; @@ -178,11 +187,12 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, out_free_wc: kfree(cq->wc); + rdma_restrack_del(&cq->res); out_destroy_cq: cq->device->destroy_cq(cq); return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_cq); +EXPORT_SYMBOL(__ib_alloc_cq); /** * ib_free_cq - free a completion queue @@ -209,6 +219,7 @@ void ib_free_cq(struct ib_cq *cq) } kfree(cq->wc); + rdma_restrack_del(&cq->res); ret = cq->device->destroy_cq(cq); WARN_ON_ONCE(ret); } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 465520627e4b..e8010e73a1cf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -263,6 +263,8 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; + rdma_restrack_init(&device->res); + device->dev.class = &ib_class; device_initialize(&device->dev); @@ -288,7 +290,7 @@ void ib_dealloc_device(struct ib_device *device) { WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); - kobject_put(&device->dev.kobj); + put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); @@ -462,7 +464,6 @@ int ib_register_device(struct ib_device *device, struct 
ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; - WARN_ON_ONCE(!parent); WARN_ON_ONCE(device->dma_device); if (device->dev.dma_ops) { /* @@ -471,16 +472,25 @@ int ib_register_device(struct ib_device *device, * into device->dev. */ device->dma_device = &device->dev; - if (!device->dev.dma_mask) - device->dev.dma_mask = parent->dma_mask; - if (!device->dev.coherent_dma_mask) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; + if (!device->dev.dma_mask) { + if (parent) + device->dev.dma_mask = parent->dma_mask; + else + WARN_ON_ONCE(true); + } + if (!device->dev.coherent_dma_mask) { + if (parent) + device->dev.coherent_dma_mask = + parent->coherent_dma_mask; + else + WARN_ON_ONCE(true); + } } else { /* * The caller did not provide custom DMA operations. Use the * DMA mapping operations of the parent device. */ + WARN_ON_ONCE(!parent); device->dma_device = parent; } @@ -588,6 +598,8 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); + rdma_restrack_clean(&device->res); + ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); @@ -1033,32 +1045,22 @@ EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where - * a specified GID value occurs. + * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. - * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, - u8 *port_num, u16 *index) + struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { - if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, - ndev, index)) { - *port_num = port; - return 0; - } - } - - if (gid_type != IB_GID_TYPE_IB) + if (rdma_cap_roce_gid_table(device, port)) continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 84d2615b5d4b..a0a9ed719031 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -388,13 +388,11 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) EXPORT_SYMBOL(ib_flush_fmr_pool); /** - * ib_fmr_pool_map_phys - - * @pool:FMR pool to allocate FMR from - * @page_list:List of pages to map - * @list_len:Number of pages in @page_list - * @io_virtual_address:I/O virtual address for new FMR - * - * Map an FMR from an FMR pool. + * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. 
+ * @pool_handle: FMR pool to allocate FMR from + * @page_list: List of pages to map + * @list_len: Number of pages in @page_list + * @io_virtual_address: I/O virtual address for new FMR */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 3c4faadb8cdd..81528f64061a 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -654,6 +654,7 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index cb91245e9163..c50596f7f98a 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -49,7 +49,6 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" -#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1fb72c356e36..3ccaae18ad75 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -41,8 +41,6 @@ #include <linux/module.h> #include "core_priv.h" -#include "core_priv.h" - static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; static struct { @@ -83,15 +81,13 @@ static bool is_nl_valid(unsigned int type, unsigned int op) if (!is_nl_msg_valid(type, op)) return false; - cb_table = rdma_nl_types[type].cb_table; -#ifdef CONFIG_MODULES - if (!cb_table) { + if (!rdma_nl_types[type].cb_table) { mutex_unlock(&rdma_nl_mutex); request_module("rdma-netlink-subsys-%d", type); mutex_lock(&rdma_nl_mutex); - cb_table = rdma_nl_types[type].cb_table; } -#endif + + cb_table = rdma_nl_types[type].cb_table; if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0dcd1aa6f683..fa8655e3b3ed 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -31,6 +31,8 @@ */ #include <linux/module.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> #include <net/netlink.h> #include <rdma/rdma_netlink.h> @@ -52,16 +54,42 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, + .len = 16 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = TASK_COMM_LEN }, }; -static 
int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { - char fw[IB_FW_VERSION_NAME_MAX]; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) return -EMSGSIZE; + + return 0; +} + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; @@ -92,10 +120,9 @@ static int fill_port_info(struct sk_buff *msg, struct ib_port_attr attr; int ret; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) - return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (fill_nldev_handle(msg, device)) return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; @@ -126,6 +153,137 @@ static int fill_port_info(struct sk_buff *msg, return 0; } +static int fill_res_info_entry(struct sk_buff *msg, + const char *name, u64 curr) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) + goto err; + if (nla_put_u64_64bit(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "pd", + [RDMA_RESTRACK_CQ] = "cq", + [RDMA_RESTRACK_QP] = "qp", + }; + + struct rdma_restrack_root *res = &device->res; + struct nlattr *table_attr; + int ret, i, curr; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) { + if (!names[i]) + continue; + curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + ret = fill_res_info_entry(msg, names[i], curr); + if (ret) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_res_qp_entry(struct sk_buff *msg, + struct ib_qp *qp, uint32_t port) +{ + struct rdma_restrack_entry *res = &qp->res; + struct ib_qp_init_attr qp_init_attr; + struct nlattr *entry_attr; + struct ib_qp_attr qp_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); + if (ret) + return ret; + + if (port && port != qp_attr.port_num) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + goto out; + + /* In create_qp() port is not set yet */ + if (qp_attr.port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) + goto err; + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, + qp_attr.dest_qp_num)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, + qp_attr.rq_psn)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) + goto err; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_XRC_INI || 
qp->qp_type == IB_QPT_XRC_TGT) { + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, + qp_attr.path_mig_state)) + goto err; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) + goto err; + + /* + * Existence of task means that it is user QP and netlink + * user is invited to go and read /proc/PID/comm to get name + * of the task file and res->task_com should be NULL. + */ + if (rdma_is_kernel_res(res)) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, task_pid_vnr(res->task))) + goto err; + } + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -321,6 +479,213 @@ out: return skb->len; } +static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto err; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, 0); + + ret = fill_res_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + put_device(&device->dev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return ret; +} + +static int _nldev_res_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, NLM_F_MULTI); + + if (fill_res_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); +} + +static int nldev_res_get_qp_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct ib_device *device; + int start = cb->args[0]; + struct ib_qp *qp = NULL; + struct nlmsghdr *nlh; + u32 index, port = 0; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get QP information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * if it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. 
+ */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all QPs from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + down_read(&device->res.rwsem); + hash_for_each_possible(device->res.hash, res, node, RDMA_RESTRACK_QP) { + if (idx < start) + goto next; + + if ((rdma_is_kernel_res(res) && + task_active_pid_ns(current) != &init_pid_ns) || + (!rdma_is_kernel_res(res) && + task_active_pid_ns(current) != task_active_pid_ns(res->task))) + /* + * 1. Kernel QPs should be visible in init namespace only + * 2. Present only QPs visible in the current namespace + */ + goto next; + + if (!rdma_restrack_get(res)) + /* + * Resource is under release now, but we are not + * releasing the lock now, so it will be released in + * our next pass, once we will get ->next pointer. + */ + goto next; + + qp = container_of(res, struct ib_qp, res); + + up_read(&device->res.rwsem); + ret = fill_res_qp_entry(skb, qp, port); + down_read(&device->res.rwsem); + /* + * Return resource back, but it won't be released till + * the &device->res.rwsem will be released for write. + */ + rdma_restrack_put(res); + + if (ret == -EMSGSIZE) + /* + * There is a chance to optimize here. + * It can be done by using list_prepare_entry + * and list_for_each_entry_continue afterwards. + */ + break; + if (ret) + goto res_err; +next: idx++; + } + up_read(&device->res.rwsem); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more QPs to fill, cancel the message and + * return 0 to mark end of dumpit. + */ + if (!qp) + goto err; + + put_device(&device->dev); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + up_read(&device->res.rwsem); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + put_device(&device->dev); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -330,6 +695,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, + [RDMA_NLDEV_CMD_RES_GET] = { + .doit = nldev_res_get_doit, + .dump = nldev_res_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_QP_GET] = { + .dump = nldev_res_get_qp_dumpit, + /* + * .doit is not implemented yet for two reasons: + * 1. It is not needed yet. + * 2. There is a need to provide identifier, while it is easy + * for the QPs (device index + port index + LQPN), it is not + * the case for the rest of resources (PD and CQ). Because it + * is better to provide similar interface for all resources, + * let's wait till we will have other resources implemented + * too. 
+ */ + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c new file mode 100644 index 000000000000..857637bf46da --- /dev/null +++ b/drivers/infiniband/core/restrack.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#include <rdma/ib_verbs.h> +#include <rdma/restrack.h> +#include <linux/mutex.h> +#include <linux/sched/task.h> +#include <linux/uaccess.h> +#include <linux/pid_namespace.h> + +void rdma_restrack_init(struct rdma_restrack_root *res) +{ + init_rwsem(&res->rwsem); +} + +void rdma_restrack_clean(struct rdma_restrack_root *res) +{ + WARN_ON_ONCE(!hash_empty(res->hash)); +} + +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns) +{ + struct rdma_restrack_entry *e; + u32 cnt = 0; + + down_read(&res->rwsem); + hash_for_each_possible(res->hash, e, node, type) { + if (ns == &init_pid_ns || + (!rdma_is_kernel_res(e) && + ns == task_active_pid_ns(e->task))) + cnt++; + } + up_read(&res->rwsem); + return cnt; +} +EXPORT_SYMBOL(rdma_restrack_count); + +static void set_kern_name(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_qp *qp; + + if (type != RDMA_RESTRACK_QP) + /* PD and CQ types already have this name embedded in */ + return; + + qp = container_of(res, struct ib_qp, res); + if (!qp->pd) { + WARN_ONCE(true, "XRC QPs are not supported\n"); + /* Survive, despite the programmer's error */ + res->kern_name = " "; + return; + } + + res->kern_name = qp->pd->res.kern_name; +} + +static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_device *dev; + struct ib_xrcd *xrcd; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + + switch (type) { + case RDMA_RESTRACK_PD: + pd = container_of(res, struct ib_pd, res); + dev = pd->device; + break; + case RDMA_RESTRACK_CQ: + cq = container_of(res, struct ib_cq, res); + dev = cq->device; + break; + case RDMA_RESTRACK_QP: + qp = container_of(res, struct ib_qp, res); + dev = qp->device; + break; + case RDMA_RESTRACK_XRCD: + xrcd = container_of(res, struct ib_xrcd, res); + dev = xrcd->device; + break; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", type); + return NULL; + } + + return dev; +} + +void rdma_restrack_add(struct rdma_restrack_entry *res) +{ + struct ib_device *dev = res_to_dev(res); + + if (!dev) + return; + + if (!uaccess_kernel()) { + get_task_struct(current); + res->task = current; + res->kern_name = NULL; + } else { + set_kern_name(res); + res->task = NULL; + } + + kref_init(&res->kref); + init_completion(&res->comp); + res->valid = true; + + down_write(&dev->res.rwsem); + hash_add(dev->res.hash, &res->node, res->type); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_add); + +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) +{ + return kref_get_unless_zero(&res->kref); +} +EXPORT_SYMBOL(rdma_restrack_get); + +static void restrack_release(struct kref *kref) +{ + struct rdma_restrack_entry *res; + + res = container_of(kref, struct rdma_restrack_entry, kref); + complete(&res->comp); +} + +int rdma_restrack_put(struct rdma_restrack_entry *res) +{ + return kref_put(&res->kref, restrack_release); +} +EXPORT_SYMBOL(rdma_restrack_put); + +void rdma_restrack_del(struct rdma_restrack_entry *res) +{ + struct ib_device *dev; + + if 
(!res->valid) + return; + + dev = res_to_dev(res); + if (!dev) + return; + + rdma_restrack_put(res); + + wait_for_completion(&res->comp); + + down_write(&dev->res.rwsem); + hash_del(&res->node); + res->valid = false; + if (res->task) + put_task_struct(res->task); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 90e3889b7fbe..5a52ec77940a 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -410,15 +410,18 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, rtnl_unlock(); } -/* This function will rescan all of the network devices in the system - * and add their gids, as needed, to the relevant RoCE devices. */ -int roce_rescan_device(struct ib_device *ib_dev) +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. + * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); - - return 0; } +EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u8 port, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index ab5e1024fea9..8cf15d4a8ac4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,9 +1227,9 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr) +int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) { int ret; u16 gid_index; @@ -1341,10 +1341,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, return 0; } -EXPORT_SYMBOL(ib_init_ah_from_path); +EXPORT_SYMBOL(ib_init_ah_attr_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { + struct rdma_ah_attr ah_attr; unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); @@ -1356,6 +1357,15 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); + /* + * Always check if sm_ah has valid dlid assigned, + * before querying for class port info + */ + if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || + !rdma_is_valid_unicast_lid(&ah_attr)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -EAGAIN; + } query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 59b2f96d986a..b61dda6b04fc 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -653,12 +653,11 @@ int ib_security_modify_qp(struct ib_qp *qp, } return ret; } -EXPORT_SYMBOL(ib_security_modify_qp); -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) +static int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) { u64 subnet_prefix; u16 pkey; @@ -678,7 +677,6 @@ int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } 
-EXPORT_SYMBOL(ib_security_pkey_access); static int ib_mad_agent_security_change(struct notifier_block *nb, unsigned long event, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index e30d86fa1855..8ae1308eecc7 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1276,7 +1276,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - WARN_ON_ONCE(!device->dev.parent); ret = dev_set_name(class_dev, "%s", device->name); if (ret) return ret; diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index f7adae0adc19..8ae636bb09e5 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -53,6 +53,8 @@ #include <rdma/ib_user_cm.h> #include <rdma/ib_marshall.h> +#include "core_priv.h" + MODULE_AUTHOR("Libor Michalek"); MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); MODULE_LICENSE("Dual BSD/GPL"); @@ -104,10 +106,13 @@ struct ib_ucm_event { enum { IB_UCM_MAJOR = 231, IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = 32 + IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UCM_NUM_FIXED_MINOR = 32, + IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, }; #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) +static dev_t dynamic_ucm_dev; static void ib_ucm_add_one(struct ib_device *device); static void ib_ucm_remove_one(struct ib_device *device, void *client_data); @@ -1199,7 +1204,6 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1210,10 +1214,7 @@ static void ib_ucm_release_dev(struct device *dev) static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) { - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); - else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); + clear_bit(ucm_dev->devnum, dev_map); } static const struct file_operations ucm_fops = { @@ -1235,27 +1236,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static dev_t overflow_maj; -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); - if (ret >= IB_UCM_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_ucm_add_one(struct ib_device *device) { int devnum; @@ -1274,19 +1254,14 @@ static void ib_ucm_add_one(struct ib_device *device) ucm_dev->dev.release = ib_ucm_release_dev; devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) { - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - ucm_dev->devnum = devnum; - base = devnum + IB_UCM_BASE_DEV; - set_bit(devnum, dev_map); - } + if (devnum >= IB_UCM_MAX_DEVICES) + goto err; + ucm_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UCM_NUM_FIXED_MINOR) + base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; + else + base = IB_UCM_BASE_DEV + devnum; cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; @@ -1334,13 +1309,20 @@ 
static int __init ib_ucm_init(void) { int ret; - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, + ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, "infiniband_cm"); if (ret) { pr_err("ucm: couldn't register device number\n"); goto error1; } + ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, + "infiniband_cm"); + if (ret) { + pr_err("ucm: couldn't register dynamic device number\n"); + goto err_alloc; + } + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { pr_err("ucm: couldn't create abi_version attribute\n"); @@ -1357,7 +1339,9 @@ static int __init ib_ucm_init(void) error3: class_remove_file(&cm_class, &class_attr_abi_version.attr); error2: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); +err_alloc: + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); error1: return ret; } @@ -1366,9 +1350,8 @@ static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); idr_destroy(&ctx_id_table); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index e4be89d1f3d8..6ba4231f2b07 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -904,13 +904,14 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; - if (rec->rec_type == SA_PATH_REC_TYPE_IB) { - ib_sa_pack_path(rec, &resp->path_data[i].path_rec); - } else { + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + + } else { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } @@ -943,8 +944,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, + NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } @@ -956,8 +957,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, NULL, + (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } @@ -1231,9 +1232,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); - ret = rdma_set_ib_paths(ctx->cm_id, &opa, 1); + ret = rdma_set_ib_path(ctx->cm_id, &opa); } else { - ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + ret = rdma_set_ib_path(ctx->cm_id, &sa_path); } if (ret) return ret; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 130606c3b07c..9a4e899d94b3 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c 
@@ -352,7 +352,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 0c32d10f23ff..78c77962422e 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -55,16 +55,21 @@ #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> +#include "core_priv.h" + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); enum { - IB_UMAD_MAX_PORTS = 64, + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, - IB_UMAD_MINOR_BASE = 0 + IB_UMAD_MINOR_BASE = 0, + IB_UMAD_NUM_FIXED_MINOR = 64, + IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, + IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, }; /* @@ -127,9 +132,12 @@ struct ib_umad_packet { static struct class *umad_class; -static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + + IB_UMAD_NUM_FIXED_MINOR; +static dev_t dynamic_umad_dev; +static dev_t dynamic_issm_dev; -static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); @@ -233,8 +241,7 @@ static void recv_handler(struct ib_mad_agent *agent, * On OPA devices it is okay to lose the upper 16 bits of LID as this * information is obtained elsewhere. Mask off the upper 16 bits. 
*/ - if (agent->device->port_immutable[agent->port_num].core_cap_flags & - RDMA_CORE_PORT_INTEL_OPA) + if (rdma_cap_opa_mad(agent->device, agent->port_num)) packet->mad.hdr.lid = ib_lid_be16(0xFFFF & mad_recv_wc->wc->slid); else @@ -246,10 +253,14 @@ static void recv_handler(struct ib_mad_agent *agent, if (packet->mad.hdr.grh_present) { struct rdma_ah_attr ah_attr; const struct ib_global_route *grh; + int ret; - ib_init_ah_from_wc(agent->device, agent->port_num, - mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, - &ah_attr); + ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto err2; grh = rdma_ah_read_grh(&ah_attr); packet->mad.hdr.gid_index = grh->sgid_index; @@ -500,7 +511,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.type = rdma_ah_find_type(file->port->ib_dev, + ah_attr.type = rdma_ah_find_type(agent->device, file->port->port_num); rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); @@ -1139,54 +1150,26 @@ static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_MAD_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(struct ib_device *device) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, - "infiniband_mad"); - if (ret) { - dev_err(&device->dev, - "couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); - if (ret >= IB_UMAD_MAX_PORTS) - return -1; - - return ret; -} - static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; - dev_t base; + dev_t base_umad; + dev_t base_issm; - spin_lock(&port_lock); devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) { - spin_unlock(&port_lock); - devnum = find_overflow_devnum(device); - if (devnum < 0) - return -1; - - spin_lock(&port_lock); - port->dev_num = devnum + IB_UMAD_MAX_PORTS; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); + if (devnum >= IB_UMAD_MAX_PORTS) + return -1; + port->dev_num = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; } else { - port->dev_num = devnum; - base = devnum + base_dev; - set_bit(devnum, dev_map); + base_umad = devnum + base_umad_dev; + base_issm = devnum + base_issm_dev; } - spin_unlock(&port_lock); port->ib_dev = device; port->port_num = port_num; @@ -1198,7 +1181,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->cdev.owner = THIS_MODULE; cdev_set_parent(&port->cdev, &umad_dev->kobj); kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); - if (cdev_add(&port->cdev, base, 1)) + if (cdev_add(&port->cdev, base_umad, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dev.parent, @@ -1212,12 +1195,11 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; - base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; cdev_set_parent(&port->sm_cdev, 
&umad_dev->kobj); kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); - if (cdev_add(&port->sm_cdev, base, 1)) + if (cdev_add(&port->sm_cdev, base_issm, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dev.parent, @@ -1244,10 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); return -1; } @@ -1281,11 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(port->dev_num, dev_map); - else - clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); + clear_bit(port->dev_num, dev_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1361,13 +1336,23 @@ static int __init ib_umad_init(void) { int ret; - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, + ret = register_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2, "infiniband_mad"); if (ret) { pr_err("couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_umad_dev, 0, + IB_UMAD_NUM_DYNAMIC_MINOR * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; + umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); @@ -1395,7 +1380,12 @@ out_class: class_destroy(umad_class); out_chrdev: - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); + +out_alloc: + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); out: return ret; @@ -1405,9 +1395,10 @@ static void __exit ib_umad_cleanup(void) { ib_unregister_client(&umad_client); class_destroy(umad_class); - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); } module_init(ib_umad_init); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 840b24096690..256934d1f64f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -340,6 +340,8 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_add(&pd->res); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { ret = -EFAULT; @@ -1033,6 +1035,8 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, goto err_cb; uobj_alloc_commit(&obj->uobject); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); return obj; @@ -1145,10 +1149,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, min(ucore->inlen, sizeof(cmd)), ib_uverbs_ex_create_cq_cb, NULL); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - return 0; + return PTR_ERR_OR_ZERO(obj); } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, @@ -1199,7 +1200,7 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, tmp.opcode = wc->opcode; tmp.vendor_err = wc->vendor_err; tmp.byte_len = wc->byte_len; - tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; + tmp.ex.imm_data = 
wc->ex.imm_data; tmp.qp_num = wc->qp->qp_num; tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; @@ -1517,7 +1518,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, uhw); + qp = _ib_create_qp(device, pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); @@ -1530,7 +1531,6 @@ static int create_qp(struct ib_uverbs_file *file, goto err_cb; qp->real_qp = qp; - qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 71ff2644e053..d96dc1d17be1 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -243,16 +243,13 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, size_t ctx_size; uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; - if (hdr->reserved) - return -EINVAL; - object_spec = uverbs_get_object(ib_dev, hdr->object_id); if (!object_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; method_spec = uverbs_get_method(object_spec, hdr->method_id); if (!method_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) return -EINVAL; @@ -305,6 +302,16 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, file, method_spec, ctx->uverbs_attr_bundle); + + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. + */ + if (unlikely(err == -EPROTONOSUPPORT)) { + WARN_ON_ONCE(err == -EPROTONOSUPPORT); + err = -EINVAL; + } out: if (ctx != (void *)data) kfree(ctx); @@ -341,7 +348,7 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } if (hdr.reserved) { - err = -EOPNOTSUPP; + err = -EPROTONOSUPPORT; goto out; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5f216ffb465a..5b811bf574d6 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -62,14 +62,16 @@ MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, - IB_UVERBS_MAX_DEVICES = 32 + IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UVERBS_NUM_FIXED_MINOR = 32, + IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) +static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; -static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, @@ -1005,34 +1007,6 @@ static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); - -/* - * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by - * requesting a new major number and doubling the number of max devices we - * support. It's stupid, but simple. 
- */ -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, - "infiniband_verbs"); - if (ret) { - pr_err("user_verbs: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); - if (ret >= IB_UVERBS_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_uverbs_add_one(struct ib_device *device) { int devnum; @@ -1062,24 +1036,15 @@ static void ib_uverbs_add_one(struct ib_device *device) INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); - spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) { - spin_unlock(&map_lock); - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - spin_lock(&map_lock); - uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - uverbs_dev->devnum = devnum; - base = devnum + IB_UVERBS_BASE_DEV; - set_bit(devnum, dev_map); - } - spin_unlock(&map_lock); + if (devnum >= IB_UVERBS_MAX_DEVICES) + goto err; + uverbs_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else + base = IB_UVERBS_BASE_DEV + devnum; rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; @@ -1124,10 +1089,7 @@ err_class: err_cdev: cdev_del(&uverbs_dev->cdev); - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) @@ -1219,11 +1181,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev.dev); cdev_del(&uverbs_dev->cdev); - - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); - else - clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + clear_bit(uverbs_dev->devnum, dev_map); if (device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. 
@@ -1265,13 +1223,22 @@ static int __init ib_uverbs_init(void) { int ret; - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR, "infiniband_verbs"); if (ret) { pr_err("user_verbs: couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, + IB_UVERBS_NUM_DYNAMIC_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); @@ -1299,7 +1266,12 @@ out_class: class_destroy(uverbs_class); out_chrdev: - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + +out_alloc: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); out: return ret; @@ -1309,9 +1281,10 @@ static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); class_destroy(uverbs_class); - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index c3ee5d9b336d..b571176babbe 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -35,6 +35,7 @@ #include <rdma/ib_verbs.h> #include <linux/bug.h> #include <linux/file.h> +#include <rdma/restrack.h> #include "rdma_core.h" #include "uverbs.h" @@ -319,6 +320,8 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, obj->uobject.object = cq; obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); if (ret) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e36d27ed4daa..16ebc6372c31 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -124,16 +124,24 @@ EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { - case IB_RATE_2_5_GBPS: return 1; - case IB_RATE_5_GBPS: return 2; - case IB_RATE_10_GBPS: return 4; - case IB_RATE_20_GBPS: return 8; - case IB_RATE_30_GBPS: return 12; - case IB_RATE_40_GBPS: return 16; - case IB_RATE_60_GBPS: return 24; - case IB_RATE_80_GBPS: return 32; - case IB_RATE_120_GBPS: return 48; - default: return -1; + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + case IB_RATE_14_GBPS: return 6; + case IB_RATE_56_GBPS: return 22; + case IB_RATE_112_GBPS: return 45; + case IB_RATE_168_GBPS: return 67; + case IB_RATE_25_GBPS: return 10; + case IB_RATE_100_GBPS: return 40; + case IB_RATE_200_GBPS: return 80; + case IB_RATE_300_GBPS: return 120; + default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); @@ -141,16 +149,24 @@ EXPORT_SYMBOL(ib_rate_to_mult); 
__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { - case 1: return IB_RATE_2_5_GBPS; - case 2: return IB_RATE_5_GBPS; - case 4: return IB_RATE_10_GBPS; - case 8: return IB_RATE_20_GBPS; - case 12: return IB_RATE_30_GBPS; - case 16: return IB_RATE_40_GBPS; - case 24: return IB_RATE_60_GBPS; - case 32: return IB_RATE_80_GBPS; - case 48: return IB_RATE_120_GBPS; - default: return IB_RATE_PORT_CURRENT; + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + case 6: return IB_RATE_14_GBPS; + case 22: return IB_RATE_56_GBPS; + case 45: return IB_RATE_112_GBPS; + case 67: return IB_RATE_168_GBPS; + case 10: return IB_RATE_25_GBPS; + case 40: return IB_RATE_100_GBPS; + case 80: return IB_RATE_200_GBPS; + case 120: return IB_RATE_300_GBPS; + default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); @@ -247,6 +263,10 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } + pd->res.type = RDMA_RESTRACK_PD; + pd->res.kern_name = caller; + rdma_restrack_add(&pd->res); + if (mr_access_flags) { struct ib_mr *mr; @@ -296,6 +316,7 @@ void ib_dealloc_pd(struct ib_pd *pd) requires the caller to guarantee we can't race here. */ WARN_ON(atomic_read(&pd->usecnt)); + rdma_restrack_del(&pd->res); /* Making delalloc_pd a void return is a WIP, no driver should return an error here. */ ret = pd->device->dealloc_pd(pd); @@ -421,8 +442,7 @@ static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { - struct find_gid_index_context *ctx = - (struct find_gid_index_context *)context; + struct find_gid_index_context *ctx = context; if (ctx->gid_type != gid_attr->gid_type) return false; @@ -481,8 +501,53 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); +/* Resolve destination mac address and hop limit for unicast destination + * GID entry, considering the source GID entry as well. + * ah_attribute must have have valid port_num, sgid_index. + */ +static int ib_resolve_unicast_gid_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + struct ib_gid_attr sgid_attr; + struct ib_global_route *grh; + int hop_limit = 0xff; + union ib_gid sgid; + int ret; + + grh = rdma_ah_retrieve_grh(ah_attr); + + ret = ib_query_gid(device, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index, + &sgid, &sgid_attr); + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + return ret; + } + + /* If destination is link local and source GID is RoCEv1, + * IP stack is not used. + */ + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && + sgid_attr.gid_type == IB_GID_TYPE_ROCE) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); + goto done; + } + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ah_attr->roce.dmac, + sgid_attr.ndev, &hop_limit); +done: + dev_put(sgid_attr.ndev); + + grh->hop_limit = hop_limit; + return ret; +} + /* - * This function creates ah from the incoming packet. + * This function initializes address handle attributes from the incoming packet. * Incoming packet has dgid of the receiver node on which this code is * getting executed and, sgid contains the GID of the sender. 
* @@ -490,13 +555,10 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * - * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the - * position of arguments dgid and sgid do not match the order of the - * parameters. */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, - const struct ib_wc *wc, const struct ib_grh *grh, - struct rdma_ah_attr *ah_attr) +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr) { u32 flow_class; u16 gid_index; @@ -523,57 +585,33 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (ret) return ret; + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_port_num(ah_attr, port_num); + if (rdma_protocol_roce(device, port_num)) { - int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; - struct net_device *idev; - struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!device->get_netdev) - return -EOPNOTSUPP; - - idev = device->get_netdev(device, port_num); - if (!idev) - return -ENODEV; - - ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, - ah_attr->roce.dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - &if_index, &hoplimit); - if (ret) { - dev_put(idev); - return ret; - } - - resolved_dev = dev_get_by_index(&init_net, if_index); - rcu_read_lock(); - if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, - resolved_dev)) - ret = -EHOSTUNREACH; - rcu_read_unlock(); - dev_put(idev); - dev_put(resolved_dev); + ret = get_sgid_index_from_eth(device, port_num, + vlan_id, &dgid, + gid_type, &gid_index); if (ret) return ret; - ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &dgid, gid_type, &gid_index); - if (ret) - return ret; - } - - rdma_ah_set_dlid(ah_attr, wc->slid); - rdma_ah_set_sl(ah_attr, wc->sl); - rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - rdma_ah_set_port_num(ah_attr, port_num); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + return ib_resolve_unicast_gid_dmac(device, ah_attr); + } else { + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - if (wc->wc_flags & IB_WC_GRH) { - if (!rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_GRH) { if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { ret = ib_find_cached_gid_by_port(device, &dgid, IB_GID_TYPE_IB, @@ -584,18 +622,17 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } else { gid_index = 0; } - } - - flow_class = be32_to_cpu(grh->version_tclass_flow); - rdma_ah_set_grh(ah_attr, &sgid, - flow_class & 0xFFFFF, - (u8)gid_index, hoplimit, - (flow_class >> 20) & 0xFF); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + } + return 0; } - return 0; } -EXPORT_SYMBOL(ib_init_ah_from_wc); +EXPORT_SYMBOL(ib_init_ah_attr_from_wc); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) @@ -603,7 +640,7 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, struct rdma_ah_attr ah_attr; int ret; - ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + ret = 
ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); @@ -850,7 +887,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); - qp = device->create_qp(pd, qp_init_attr, NULL); + qp = _ib_create_qp(device, pd, qp_init_attr, NULL); if (IS_ERR(qp)) return qp; @@ -860,7 +897,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, return ERR_PTR(ret); } - qp->device = device; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; @@ -890,7 +926,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, atomic_inc(&qp_init_attr->srq->usecnt); } - qp->pd = pd; qp->send_cq = qp_init_attr->send_cq; qp->xrcd = NULL; @@ -1269,16 +1304,8 @@ static int ib_resolve_eth_dmac(struct ib_device *device, if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) return -EINVAL; - if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE) - return 0; - grh = rdma_ah_retrieve_grh(ah_attr); - if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { - rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, - ah_attr->roce.dmac); - return 0; - } if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; @@ -1290,40 +1317,52 @@ static int ib_resolve_eth_dmac(struct ib_device *device, (char *)ah_attr->roce.dmac); } } else { - union ib_gid sgid; - struct ib_gid_attr sgid_attr; - int ifindex; - int hop_limit; - - ret = ib_query_gid(device, - rdma_ah_get_port_num(ah_attr), - grh->sgid_index, - &sgid, &sgid_attr); - - if (ret || !sgid_attr.ndev) { - if (!ret) - ret = -ENXIO; - goto out; - } - - ifindex = sgid_attr.ndev->ifindex; + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); + } + return ret; +} - ret = - rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, - ah_attr->roce.dmac, - NULL, &ifindex, &hop_limit); +/** + * IB core internal function to perform QP attributes modification. + */ +static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + int ret; - dev_put(sgid_attr.ndev); + if (rdma_ib_or_roce(qp->device, port)) { + if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { + pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->rq_psn &= 0xffffff; + } - grh->hop_limit = hop_limit; + if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { + pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->sq_psn &= 0xffffff; + } } -out: + + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + return ret; } +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. - * @qp: The QP to modify. + * @ib_qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP @@ -1332,21 +1371,20 @@ out: * are being modified. * It returns 0 on success and returns appropriate error code on error. 
*/ -int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { + struct ib_qp *qp = ib_qp->real_qp; int ret; - if (attr_mask & IB_QP_AV) { + if (attr_mask & IB_QP_AV && + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; } - ret = ib_security_modify_qp(qp, attr, attr_mask, udata); - if (!ret && (attr_mask & IB_QP_PORT)) - qp->port = attr->port_num; - - return ret; + return _ib_modify_qp(qp, attr, attr_mask, udata); } EXPORT_SYMBOL(ib_modify_qp_with_udata); @@ -1409,7 +1447,7 @@ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { - return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); + return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -1503,6 +1541,7 @@ int ib_destroy_qp(struct ib_qp *qp) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); + rdma_restrack_del(&qp->res); ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1545,6 +1584,8 @@ struct ib_cq *ib_create_cq(struct ib_device *device, cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); } return cq; @@ -1563,6 +1604,7 @@ int ib_destroy_cq(struct ib_cq *cq) if (atomic_read(&cq->usecnt)) return -EBUSY; + rdma_restrack_del(&cq->res); return cq->device->destroy_cq(cq); } EXPORT_SYMBOL(ib_destroy_cq); @@ -1747,7 +1789,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) } EXPORT_SYMBOL(ib_detach_mcast); -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) { struct ib_xrcd *xrcd; @@ -1765,7 +1807,7 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) return xrcd; } -EXPORT_SYMBOL(ib_alloc_xrcd); +EXPORT_SYMBOL(__ib_alloc_xrcd); int ib_dealloc_xrcd(struct ib_xrcd *xrcd) { @@ -1790,11 +1832,11 @@ EXPORT_SYMBOL(ib_dealloc_xrcd); * ib_create_wq - Creates a WQ associated with the specified protection * domain. * @pd: The protection domain associated with the WQ. - * @wq_init_attr: A list of initial attributes required to create the + * @wq_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * - * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * wq_attr->max_wr and wq_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. 
* If ib_create_wq() succeeds, then max_wr and max_sge will always be @@ -2156,16 +2198,16 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_send_wr swr = {}, *bad_swr; int ret; - swr.wr_cqe = &sdrain.cqe; - sdrain.cqe.done = ib_drain_qp_done; - init_completion(&sdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + ret = ib_post_send(qp, &swr, &bad_swr); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); @@ -2190,16 +2232,16 @@ static void __ib_drain_rq(struct ib_qp *qp) struct ib_recv_wr rwr = {}, *bad_rwr; int ret; - rwr.wr_cqe = &rdrain.cqe; - rdrain.cqe.done = ib_drain_qp_done; - init_completion(&rdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + ret = ib_post_recv(qp, &rwr, &bad_rwr); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
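
Editor's note: the resource-tracking entry points introduced in restrack.c above are consumed by the object create/destroy paths patched later in this diff (PD and CQ in uverbs_cmd.c, uverbs_std_types.c and verbs.c, QP in verbs.c). A minimal sketch of that usage pattern, modeled on the PD hunks; the example_* wrapper functions are illustrative and not part of the patch:

#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>

/* Illustrative wrapper: mirrors how __ib_alloc_pd() registers a PD with the
 * new per-device restrack hash so nldev can report it.
 */
static struct ib_pd *example_alloc_pd(struct ib_device *device,
                                      const char *caller)
{
        struct ib_pd *pd = device->alloc_pd(device, NULL, NULL);

        if (IS_ERR(pd))
                return pd;

        pd->device = device;
        atomic_set(&pd->usecnt, 0);

        /* Tag the entry type; kernel callers also record a name that
         * nldev exposes as RDMA_NLDEV_ATTR_RES_KERN_NAME.
         */
        pd->res.type = RDMA_RESTRACK_PD;
        pd->res.kern_name = caller;
        rdma_restrack_add(&pd->res);
        return pd;
}

/* Illustrative wrapper: mirrors ib_dealloc_pd(). rdma_restrack_del() waits,
 * via the entry's completion, for any dumper that took a reference with
 * rdma_restrack_get() to drop it with rdma_restrack_put() before the object
 * is freed.
 */
static void example_dealloc_pd(struct ib_pd *pd)
{
        rdma_restrack_del(&pd->res);
        pd->device->dealloc_pd(pd);
}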
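
Likewise, the ucm, umad and uverbs hunks all replace the small fixed minor range plus ad-hoc "overflow" major with one registered fixed region (preserving the historical device numbers) and one dynamically allocated region sized by RDMA_MAX_PORTS. A condensed sketch of that scheme; the EX_* constants reuse the uverbs values as placeholders (EX_MAX_DEVICES stands in for RDMA_MAX_PORTS, whose value is not shown in this diff), and the example_* names are illustrative:

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>

enum {
        EX_MAJOR             = 231,
        EX_BASE_MINOR        = 192,
        EX_MAX_DEVICES       = 1024,    /* placeholder for RDMA_MAX_PORTS */
        EX_NUM_FIXED_MINOR   = 32,
        EX_NUM_DYNAMIC_MINOR = EX_MAX_DEVICES - EX_NUM_FIXED_MINOR,
};

static const dev_t fixed_base = MKDEV(EX_MAJOR, EX_BASE_MINOR);
static dev_t dynamic_base;

static int __init example_chrdev_init(void)
{
        int ret;

        /* Keep the well-known major/minor range for the first 32 devices
         * so existing userspace setups are unaffected...
         */
        ret = register_chrdev_region(fixed_base, EX_NUM_FIXED_MINOR,
                                     "example");
        if (ret)
                return ret;

        /* ...and let the kernel hand out a second region for the rest. */
        ret = alloc_chrdev_region(&dynamic_base, 0, EX_NUM_DYNAMIC_MINOR,
                                  "example");
        if (ret)
                unregister_chrdev_region(fixed_base, EX_NUM_FIXED_MINOR);
        return ret;
}

/* Map a device slot index (from the driver's dev_map bitmap) to its dev_t. */
static dev_t example_devt(unsigned int devnum)
{
        if (devnum < EX_NUM_FIXED_MINOR)
                return fixed_base + devnum;
        return dynamic_base + devnum - EX_NUM_FIXED_MINOR;
}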